# CA Gubernatorial Race

This is a new twitter scrape notebook for the California Gubernatorial race.

The current list of announced and possible candidates that the project will be tracking are the ones listed on _La Times_ article [_California's next governor: Who's running, who's on the fence?_](http://www.latimes.com/politics/la-pol-ca-california-governor-list-2018-htmlstory.html) 

This is a list of the candidates listed in the article as of September 6th, 2017:

**Announced**
* [Gavin Newsom - D](https://twitter.com/GavinNewsom) [02/10/2015](http://www.latimes.com/local/politics/la-me-pol-gavin-newsom-20150212-story.html)
* [John Chiang - D](https://twitter.com/JohnChiangCA) [05/17/2016](http://www.latimes.com/politics/la-pol-sac-essential-poli-john-chiang-jumps-into-californias-2018-governor-1463506797-htmlstory.html)
* [Antonio Villaraigosa - D](https://twitter.com/antonio4ca) [11/10/2016](http://www.dailynews.com/2016/11/10/former-la-mayor-antonio-villaraigosa-launches-bid-for-california-governor/)
* [Delaine Eastin - D](https://twitter.com/DelaineEastin) [11/01/2016](https://ballotpedia.org/Delaine_Eastin)
* [John Cox - R](https://twitter.com/TheRealJohnHCox) [03/07/2017](https://en.wikipedia.org/wiki/John_H._Cox#2018_California_gubernatorial_election)
* [Travis Allen- R](https://twitter.com/JoinTravisAllen) [06/22/2017](https://ballotpedia.org/Travis_Allen)
* [Zoltan Istvan - L](https://twitter.com/zoltan_istvan) [02/12/2017](http://www.newsweek.com/zoltan-istvan-california-governor-libertarian-555088)

**Speculated**
* [Kevin Faulconer](https://twitter.com/Kevin_Faulconer)
* [Eric Garcetti](https://twitter.com/ericgarcetti)
* [Tom Steyer](https://twitter.com/TomSteyer)
* [Ashley Sweargin](https://twitter.com/ashleycvcf)
* [Steve Westly](https://twitter.com/SteveWestly)

In [3]:
# coding: utf-8

In [13]:
# import necessary python packages
import sys
#sys.path.append("/usr/local/lib/python2.7/site-packages")
import tweepy #https://github.com/tweepy/tweepy
import dropbox #https://www.dropbox.com/developers-v1/core/docs/python
import csv
import time
import os
from datetime import datetime
from collections import defaultdict
import logging
import gspread
import pandas as pd
import numpy as np
from openpyxl import load_workbook
from unidecode import unidecode

#Twitter and Dropbox API credentials
import api_cred as ac

In [5]:
# setup debug logging
logging.basicConfig(level=logging.WARN)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

In [6]:
# modify print precison for easier debugging
np.set_printoptions(precision=20)

# Helper Functions

In [7]:
def authenticate_twitter():
  auth = tweepy.OAuthHandler(ac.consumer_key, ac.consumer_secret)
  auth.set_access_token(ac.access_key, ac.access_secret)
  api = tweepy.API(auth)
  return api

In [8]:
def get_new_tweets(tweet_name, since_id):
  api = authenticate_twitter()
  tweets = []
  new_tweets = api.user_timeline(screen_name = tweet_name, since_id = since_id, count = 200)
  tweets.extend(new_tweets)
  if len(tweets) > 0:
    max_id = tweets[-1].id - 1
  while (len(new_tweets) > 0):
    new_tweets = api.user_timeline(screen_name = tweet_name, since_id = since_id, count = 200, max_id = max_id)
    tweets.extend(new_tweets)
    max_id = tweets[-1].id - 1
  
  tweets = [[tweet.id_str, tweet.created_at, tweet.text, "", "", "",tweet.retweet_count, tweet.favorite_count] for tweet in tweets]
  logger.info("Downloading %d tweets from %s" % (len(tweets), tweet_name))
  return tweets[::-1]

In [9]:
def get_lists(df):
  # put twitter handles, last acquired tweet ID, tweet count and store them in respective lists
  names = filter(lambda x: x > 0, df.iloc[:, 1])
  max_ids = df.iloc[:, 2]
  counts = df.iloc[:, 3]
  
  # save the number of entries
  indices = range(1,len(names)+1)
  
  lists = zip(names, max_ids, counts, indices)
  del lists[0] # the first one is column title
  return lists

In [10]:
def load_sheets(path):
  sheet_book = load_workbook(path)
  sheet_writer = pd.ExcelWriter(path, engine='openpyxl')
  sheet_writer.book = sheet_book
  sheet_writer.sheets = dict((ws.title, ws) for ws in sheet_book.worksheets)
  logger.info("Downloaded %s" % path)
  return sheet_writer

# Write to Sheets ↓

The functions below write data to the currently local sheets.

In [14]:
def collect_data():
  # start timer
  start = time.time()
  logger.info("Start...")
  # dp_client = authenticate_dropbox()
  
  # set file pathway variables an expand to HOME
  path = '~/Dropbox/Summer_of_Tweets/ca_working_sheets/'
  tweet_list = "Tweet_List.xlsx"
  cand_tweets = "cand_tweets.xlsx"
  sheetname = 'cand'
  # pac_tweets = "PAC_Tweets.xlsx"
  path = os.path.expanduser(path)
  
  # load and prepare list of twitter accounts    
  list_writer = load_sheets(path + tweet_list)
  list_df = pd.read_excel(path + tweet_list, sheetname=sheetname)
  list_df = list_df.dropna(thresh=4)
  # list_df['Last_Pulled'] = pd.to_datetime(list_df['Last_Pulled'], errors='coerce') 
  # properly load spreadsheet to append new data
  tweet_writer = load_sheets(path + cand_tweets)
  
  # loop through the list of Cand/PACs and updates each tweet sheet appropriately
  for index, row in list_df.iterrows():       
    name, since_id, count = row[1], row[2],row[3]
    
    new_tweets = get_new_tweets(name, since_id)
    # if there are no new tweets continue to the next account
    if (len(new_tweets) > 0):
      # turn the new tweets into a dataframe and write them to the corresponding excel sheet
      df = pd.DataFrame(new_tweets)
      # df = df.applymap(lambda x: x.encode('unicode_escape').
      #            decode('utf-8') if isinstance(x, str) else x) # this line removes illegal characters, implemented due
      #                                                          # to issues with Antonio V. tweets
      df = df.applymap(FormatString) 
      df.to_excel(tweet_writer, sheet_name=name, startrow=count+1, header=False, index=False)
  
      # update since_id, count, and last_pull date in tweet list
      list_df.iat[index,2] = new_tweets[len(new_tweets)-1][0] # since_id
      list_df.iat[index,3] = count + len(new_tweets) # last_pull
      list_df.iat[index,4] = pd.to_datetime(time.strftime("%m/%d/%Y %H:%M:%S"), errors='coerce') # last_pull date
      
      logger.info("Updated new tweets on spreadsheet for %s" % name)
      time.sleep(100)
  
  # write the updated list and save the changes to the excel sheets
  list_df.to_excel(list_writer, sheet_name=sheetname, index=False)
  tweet_writer.save()
  list_writer.save()
  
  logger.info("Done appending new tweets")
  # stop timer and print time elapsed for the current data pull
  end = time.time()
  logger.info("Time Elapsed: %d", float((end-start))/60)

In [15]:
collect_data()

INFO:__main__:Start...
INFO:__main__:Downloaded /Users/SoloMune/Dropbox/Summer_of_Tweets/ca_working_sheets/Tweet_List.xlsx
INFO:__main__:Downloaded /Users/SoloMune/Dropbox/Summer_of_Tweets/ca_working_sheets/cand_tweets.xlsx
INFO:__main__:Downloading 3223 tweets from GavinNewsom
  return _unidecode(string)
  return _unidecode(string)
  return _unidecode(string)
  return _unidecode(string)
  return _unidecode(string)
  return _unidecode(string)
  return _unidecode(string)
  return _unidecode(string)
  return _unidecode(string)
  return _unidecode(string)
  return _unidecode(string)
  return _unidecode(string)
  return _unidecode(string)
  return _unidecode(string)
  return _unidecode(string)
  return _unidecode(string)
  return _unidecode(string)
  return _unidecode(string)
  return _unidecode(string)
  return _unidecode(string)
  return _unidecode(string)
  return _unidecode(string)
  return _unidecode(string)
  return _unidecode(string)
  return _unidecode(string)
  return _unidecode(s

INFO:__main__:Updated new tweets on spreadsheet for GavinNewsom
INFO:__main__:Downloading 1413 tweets from JohnChiangCA
  return _unidecode(string)
  return _unidecode(string)
  return _unidecode(string)
  return _unidecode(string)
  return _unidecode(string)
  return _unidecode(string)
  return _unidecode(string)
  return _unidecode(string)
  return _unidecode(string)
INFO:__main__:Updated new tweets on spreadsheet for JohnChiangCA
INFO:__main__:Downloading 1734 tweets from antonio4ca
  return _unidecode(string)
  return _unidecode(string)
  return _unidecode(string)
  return _unidecode(string)
  return _unidecode(string)
  return _unidecode(string)
  return _unidecode(string)
  return _unidecode(string)
  return _unidecode(string)


IllegalCharacterError: 

In [2]:
def FormatString(s):
  if isinstance(s, unicode):
    try:
      s.encode('ascii')
      return s
    except:
      return unidecode(s)
  else:
    return s
  