# FISP Presidential Project Scraper

In [1]:
# coding: utf-8

In [2]:
# import necessary python packages
#import tweepy #https://github.com/tweepy/tweepy
import csv
import time
import os
import re
import tweepy
from datetime import datetime
from collections import defaultdict
import logging
import gspread
import pandas as pd
import numpy as np
import xlrd
from openpyxl import load_workbook
from bs4 import BeautifulSoup

#Twitter API credentials
import api_cred as ac

In [3]:
# setup debug logging
logging.basicConfig(level=logging.WARN)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

In [4]:
# modify print precison for easier debugging
np.set_printoptions(precision=20)

# Helper Functions and Variables

In [5]:
# depreceated function from when data was saved to a Google sheet
from oauth2client.service_account import ServiceAccountCredentials
def authenticate_gspread():
  # scopes that your application should be granted access
  scope = ['https://spreadsheets.google.com/feeds'] 
  # Create a Credentials object from the service account's credentials and the scopes
  credentials = ServiceAccountCredentials.from_json_keyfile_name('auth.json', scope)
  gc = gspread.authorize(credentials)
  return gc
  
# gets the list of cand or pac and returns it in a list
def gspread_get_lists(worksheet, is_cand):
  names = filter(lambda x: len(x) > 0, worksheet.col_values(2))
  max_ids = worksheet.col_values(3)[:len(names)]
  counts = worksheet.col_values(4)[:len(names)]
  indices = range(1,len(names)+1)
  lists = zip(names, max_ids, counts, indices)
  del lists[0] # the first one is column title
  return lists

In [6]:
# from dropbox import DropboxOAuth2FlowNoRedirect
def authenticate_dropbox():
  auth_flow = DropboxOAuth2FlowNoRedirect(ac.APP_KEY, ac.APP_SECRET)
  
  authorize_url = auth_flow.start()
  print "1. Go to: " + authorize_url
  print "2. Click \"Allow\" (you might have to log in first)."
  print "3. Copy the authorization code."
  auth_code = raw_input("Enter the authorization code here: ").strip()
  
  try:
    oauth_result = auth_flow.finish(auth_code)
  except Exception, e:
    print ('Error: %s' % (e,))
    return
  
  dbx = dropbox.Dropbox(oauth_result.access_token)
  return dbx

In [7]:
def authenticate_twitter():
  auth = tweepy.OAuthHandler(ac.consumer_key, ac.consumer_secret)
  auth.set_access_token(ac.access_key, ac.access_secret)
  api = tweepy.API(auth)
  return api

In [8]:
def get_new_tweets(tweet_name, since_id):
  api = authenticate_twitter()
  tweets = []
  new_tweets = api.user_timeline(screen_name = tweet_name, since_id = since_id, count = 200)
  tweets.extend(new_tweets)
  if len(tweets) > 0:
    max_id = tweets[-1].id - 1
  while (len(new_tweets) > 0):
    new_tweets = api.user_timeline(screen_name = tweet_name, since_id = since_id, count = 200, max_id = max_id)
    tweets.extend(new_tweets)
    max_id = tweets[-1].id - 1
  
  tweets = [[tweet.id_str, tweet.created_at, tweet.text, "", "", "",tweet.retweet_count, tweet.favorite_count] for tweet in tweets]
  logger.info("Downloading %d tweets from %s" % (len(tweets), tweet_name))
  return tweets[::-1]

In [9]:
def get_lists(df):
  # put twitter handles, last acquired tweet ID, tweet count and store them in respective lists
  names = filter(lambda x: x > 0, df.iloc[:, 1])
  max_ids = df.iloc[:, 2]
  counts = df.iloc[:, 3]
  
  # save the number of entries
  indices = range(1,len(names)+1)
  
  lists = zip(names, max_ids, counts, indices)
  del lists[0] # the first one is column title
  return lists

In [10]:
# take the rows with multiple tweets checked and make an individual row for each tweets
def expand_lists(df):
  # create a list for each columns and a dict to later convert into an df
  id_ = []
  ratings = []
  sources = []
  tweets = {'id': id_, 'rating': ratings, 'source': sources}
  
  # loop thru each row and if tweet id is stored in a list then create df 
  # with each id in a separate row with its fact check data
  for index, row in df.iterrows():
    if (type(row[0]) == list):
      for i in row[0]:
        id_.append(i)
        ratings.append(row[1])
        sources.append(row[2])
      # drop the row containing multiple tweets
      df.drop(index, inplace=True)
  # create new df with tweets in their own row, then append them to the original dataframe
  new_df = pd.DataFrame(tweets)
  df = df.append(new_df)
      
  return df

In [11]:
def dupe_check(tweets_df, cand_name):
  # check for duplicates
  dupe_df = tweets_df[tweets_df.id.duplicated()]
  
  # if there exists a duplicate save it to a csv
  if (len(dupe_df) > 0):
    # make sure that the dupe_folder dir exists
    dupe_df.to_csv('./dupe_folder/' + cand_name + '.csv', encoding='utf-8')

In [12]:
# temp function to properly sort the tweets by date in ascending order
def sort_sheet (tweet_sheet, sheetname, tweet_list):
  start = time.time()
  logger.info("Start...")
        
  # load and prepare list of twitter accounts    
  list_book = load_workbook(tweet_list)
  list_writer = pd.ExcelWriter(tweet_list, engine='openpyxl')
  list_writer.book = list_book
  list_writer.sheets = dict((ws.title, ws) for ws in list_book.worksheets)
  list_df = pd.read_excel(tweet_list, sheetname=sheetname)
  list_df = list_df.dropna(thresh=4)
  # properly load spreadsheet to append new data
  work_book = load_workbook(tweet_sheet)
  tweet_writer = pd.ExcelWriter(tweet_sheet, engine='openpyxl')
  tweet_writer.book = work_book
  tweet_writer.sheets = dict((ws.title, ws) for ws in work_book.worksheets)    
  logger.info("Downloaded tweets list")
       
  # loop through the list of Cand/PACs and updates each tweet sheet appropriately
  for index, row in list_df.iterrows():       
    name, since_id, count = row[1], row[2],row[3]
    tweets_df = pd.read_excel(tweet_sheet, sheetname=name)
    
    tweets_df = tweets_df.sort_values('id')

In [13]:
def load_sheets(path):
  sheet_book = load_workbook(path)
  sheet_writer = pd.ExcelWriter(path, engine='openpyxl')
  sheet_writer.book = sheet_book
  sheet_writer.sheets = dict((ws.title, ws) for ws in sheet_book.worksheets)
  logger.info("Downloaded %s" % path)
  return sheet_writer

In [96]:
def handle_to_lastname(handle_df, handle_col, lastname_dict):
  # make a parallel list of each cand's last name to be appeneded to the original dataframe
  lastname_col = []
  for handle in handle_df[handle_col]:
    lastname_col.append(lastname_dict[handle])
    
  handle_df['lastname'] = lastname_col
  return handle_df

In [135]:
# set file pathway variables an expand to HOME
path = '~/Dropbox/Summer_of_Tweets/working_sheets--THIS_IS_ACTUAL_DATA/'
tweet_list = "Tweet_List.xlsx"
cand_tweets = "Presidential_Tweets.xlsx"
pac_tweets = "PAC_Tweets.xlsx"
path = os.path.expanduser(path)

# sheetnames
cand_sheet = 'candidate'
pac_sheet = 'pac'

# Write to Sheets ↓

All the following functions write to excel or csv sheets.

## Data Pull Func

The following functin gets the most up to date tweets and writes them to the master excel sheet.

In [None]:
def collect_data(tweet_sheet, sheetname, tweet_list):
  # start timer
  start = time.time()
  logger.info("Start...")
  # dp_client = authenticate_dropbox()
    
  # load and prepare list of twitter accounts    
  list_writer = load_sheets(tweet_list)
  list_df = pd.read_excel(tweet_list, sheet_name=sheetname)
  list_df = list_df.dropna(thresh=4)
  # list_df['Last_Pulled'] = pd.to_datetime(list_df['Last_Pulled'], errors='coerce') 
  
  # properly load spreadsheet to append new data
  tweet_writer = load_sheets(tweet_sheet)
   
  # loop through the list of Cand/PACs and updates each tweet sheet appropriately
  for index, row in list_df.iterrows():       
    name, since_id, count = row[1], row[2],row[3]
    
    # Lessign has deleted this account, so skip it while updating tweets
    if (name == 'Lessig2016'):
      continue
    
    # grab new tweets since last id and save it to a dataframe
    new_tweets = get_new_tweets(name, since_id)
    
    # if there are no new tweets continue to the next account
    if (len(new_tweets) > 0):
      # turn the new tweets into a dataframe and write them to the corresponding excel sheet
      df = pd.DataFrame(new_tweets)
      df.to_excel(tweet_writer, sheet_name=name, startrow=count+1, header=False, index=False)
  
      # update since_id, count, and last_pull date in tweet list
      list_df.iat[index,2] = new_tweets[len(new_tweets)-1][0] # since_id
      list_df.iat[index,3] = count + len(new_tweets) # last_pull
      list_df.iat[index,4] = pd.to_datetime(time.strftime("%m/%d/%Y %H:%M:%S"), errors='coerce') # last_pull date
      
      logger.info("Updated new tweets on spreadsheet for %s" % name)
      time.sleep(100)
  
  # write the updated list and save the changes to the excel sheets
  list_df.to_excel(list_writer, sheet_name=sheetname, index=False)
  tweet_writer.save()
  list_writer.save()
  
  logger.info("Done appending new tweets")
  # stop timer and print time elapsed for the current data pull
  end = time.time()
  logger.info("Time Elapsed: %d", float((end-start))/60)

In [138]:
collect_data(path + cand_tweets, cand_sheet, path + tweet_list)
collect_data(path + pac_tweets, pac_sheet, path + tweet_list)

INFO:__main__:Start...
INFO:__main__:Downloaded /Users/SoloMune/Dropbox/Summer_of_Tweets/working_sheets--THIS_IS_ACTUAL_DATA/Tweet_List.xlsx
INFO:__main__:Downloaded /Users/SoloMune/Dropbox/Summer_of_Tweets/working_sheets--THIS_IS_ACTUAL_DATA/Presidential_Tweets.xlsx
INFO:__main__:Downloading 132 tweets from BernieSanders
INFO:__main__:Updated new tweets on spreadsheet for BernieSanders
INFO:__main__:Downloading 24 tweets from BobbyJindal
INFO:__main__:Updated new tweets on spreadsheet for BobbyJindal
INFO:__main__:Downloading 24 tweets from CarlyFiorina
INFO:__main__:Updated new tweets on spreadsheet for CarlyFiorina
INFO:__main__:Downloading 0 tweets from ChrisChristie
INFO:__main__:Downloading 155 tweets from gov_gilmore
INFO:__main__:Updated new tweets on spreadsheet for gov_gilmore
INFO:__main__:Downloading 59 tweets from GovernorPataki
INFO:__main__:Updated new tweets on spreadsheet for GovernorPataki
INFO:__main__:Downloading 27 tweets from GovernorPerry
INFO:__main__:Updated ne

## Update of metadata

A tweets ability to stay in the public discouse is dependent on the number of retweets and favorites. The initial pull of a tweet will not complete picture of the tweets effectiveness. This script allows us to continously update a tweet's metadata counts. 

In [None]:
# Params: is_cand - determines whether to pull candidates tweets or PAC tweets
# Purpose: Updates like and retweet totals
def collect_addition_data(tweet_sheet, sheetname, tweet_list):
  # start the timer
  start = time.time()
  logger.info("Start...")
  # dp_client = authenticate_dropbox()
  
  # load and prepare list of twitter accounts
  list_writer = load_sheets(tweet_list)
  list_df = pd.read_excel(tweet_list, sheetname=sheetname)
  list_df = list_df.dropna(thresh=4)
  # properly load spreadsheet to append new data
  tweet_writer = load_sheets(tweet_sheet)
  logger.info("Downloaded tweets list")
  
  # loop through the list of Cand/PACs and updates each tweet sheet appropriately
  for row in list_df.itertuples():       
    name, since_id, count = row[2], row[3],row[4]
    
    # Lessig has deleted this account, so skip it while updating tweets
    if (name == 'Lessig2016'):
      continue
    
    # read cand tweet sheet
    tweets_df = pd.read_excel(tweet_sheet, sheetname=name)
    logger.info("Retrived data from spreadsheet for %s" % name)
    
    # retreive updated tweets
    tweets = get_new_tweets(name, 1)
    updates_df = pd.DataFrame(tweets)
    
    # clean dataframe to only include id, retweets, and favorites
    updates_df = updates_df[[0, 6, 7]]
    updates_df.columns = ['id', 'retweets', 'favorites']
    
    # call helper fuction to match updated metadata with correct tweets
    tweets_df = update_metadata(tweets_df, updates_df, name)
    
    # write the updated data to the twitter profile's sheet to be saved
    tweets_df.to_excel(tweet_writer, sheet_name=name, index=False, startcol=1)
    logger.info("Updated data on spreadsheet for %s" % name)
    # 100 second pause between data pulls to avoid token exceptions
    time.sleep(20)
  
  tweet_writer.save()
  
  logger.info("Done collecting additional data")
  # stop timer and print time elapsed for the current data pull
  end = time.time()
  logger.info("Time Elapsed: %d", float((end-start))/60)

In [None]:
# This function takes the up to date metadata and matches it to their respective tweet using a tweet's unique id
def update_metadata(tweets_df, updates_df, cand_name): 
  # convert tweet id to the same type as the updates sheet
  tweets_df['id'] = tweets_df['id'].astype(str)
  tweets_df.set_index('id', inplace=True)
  
  ## loop through the updates metadata and updates the tweet sheet
  for row in updates_df.itertuples():
    tweets_df.set_value(row[1], 'retweets', row[2])
    tweets_df.set_value(row[1], 'favorites', row[3])

  # drop null rows that could not match with a tweet
  tweets_df.dropna(subset=['created_at'], inplace=True)
  
  return tweets_df

In [None]:
collect_addition_data(path + cand_tweets, cand_sheet, path + tweet_list)
collect_addition_data(path + pac_tweets, pac_sheet, path + tweet_list)

In [None]:
import requests
def get_full_url(short_urls, full_urls):
for i, us in enumerate(short_urls):
full = []
  if not us.startswith("http"):
    continue
  for url in us.split(" "):
    if not url.startswith("http"):
      continue
    try:
      r = requests.head(url, allow_redirects=True)
      full.append(r.url)
    except:
      logger.info("Error occurred for URL - %s" % url)
      continue
  if i % 500 == 0:
      logger.info("Extracting URL %d/%d" % (i, len(short_urls)))
      time.sleep(60)
  full_urls[i] = " ".join(full)q

In [None]:
def update_full_url(tweets_df, updates_df, cand_name):
  start = time.time()
  logger.info("Start...")
  # dp_client = authenticate_dropbox()
  
  # load and prepare list of twitter accounts
  list_writer = load_sheets(tweet_list)
  list_df = pd.read_excel(tweet_list, sheetname=sheetname)
  list_df = list_df.dropna(thresh=4)
  # properly load spreadsheet to append new data
  tweet_writer = load_sheets(tweet_sheet)
  logger.info("Downloaded tweets list")
    
  logger.info("Successfully download the list...")
  for e, entry in enumerate(list_df):
    if e < 15:
      continue

    name, since_id, count, index = entry[0], entry[1],entry[2], entry[3]

    short_urls = worksheet.col_values(6)
    logger.info("Downloaded %s URL", name)
    url_datas = ['' for i in xrange(len(short_urls))]
    url_datas[0] = 'full URL'

    get_full_url(short_urls, url_datas) # transfer short url to full urls and store in url_datas

    count = 1

    while count < len(short_urls):
      amount = min(100, len(short_urls) - count)
      cells = worksheet.range('I'+str(count)+':'+'I'+str(count+amount-1))
      assert(len(cells) == amount)
      for i in range(amount):
        cells[i].value = url_datas[count-1]
        count += 1
      worksheet.update_cells(cells)
      logger.info("Update cells %d/%d for %s" %(count, len(short_urls), name))

In [None]:
update_full_url(path + cand_tweets, cand_sheet, path + tweet_list)
update_full_url(path + pac_tweets, pac_sheet, path + tweet_list)

# Convert into one large csv

In [None]:
def convert_xlsx_csv (tweet_sheet, sheetname, tweet_list):
  # start timer
  start = time.time()
  logger.info("Start...")
  # dp_client = authenticate_dropbox()
    
  # load and prepare list of twitter accounts    
  list_writer = load_sheets(tweet_list)
  list_df = pd.read_excel(tweet_list, sheetname=sheetname)
  list_df = list_df.dropna(thresh=4) 
  
  #merged_corpus = pd.DataFrame(columns=['id', 'created_at', 'text', 'hashtag#', 'at@', 'link', 'retweets', 'favorites', 'full URL'])
  merged_df = pd.DataFrame()

  initial_loop = True
  
  # loop through the list of Cand/PACs and updates each tweet sheet appropriately
  for index, row in list_df.iterrows():
    name, since_id, count = row[1], row[2],row[3]
    
    if(name == 'POTUS'):
      continue
    
    if (initial_loop):
      merged_df = pd.read_excel(tweet_sheet, sheetname=name)
      merged_df['Name'] = name
      logger.info("Retrived data from spreadsheet for %s" % name)
      initial_loop = False 
    
    else:
      # read current cand tweet sheet
      curr_df = pd.read_excel(tweet_sheet, sheetname=name)
      curr_df['Name'] = name
      #print (curr_df)
      logger.info("Retrived data from spreadsheet for %s" % name)
      
      merged_df = merged_df.append(curr_df)
      if(name =='ChrisChristie'):
        break
  
  # write the updated list and save the changes to the excel sheets
  merged_df.to_csv('merged_corpus.csv', encoding='utf-8')
  
  logger.info("done")

In [None]:
convert_xlsx_csv(path + cand_tweets, cand_sheet, path + tweet_list)

# WaPo Fact Checking
The cell below collects fact checks from the Washington Post's '2016 Election Fact Checker' and 'RealDonaldContext' chrome extension. The election fact checker data was hand collected and is stored in a json file while the extension data is pulled directly from the online hosted json file from the extension's developer blog.
They are collected into an single dataframe consisting of the tweet id, rating, and source. They are then merged with a master sheet using tweet id.

['2016 Election Fact Checker'](https://www.washingtonpost.com/graphics/politics/2016-election/fact-checker/)

['RealDonaldContext'](https://chrome.google.com/webstore/detail/realdonaldcontext/ddbkmnomngnlcdglabflidgmhmcafogn?hl=en-US)

['RealDonaldContext json file'](https://www.pbump.net/files/post/extension/core/data.php)

['Rating System Scale'](https://www.washingtonpost.com/news/fact-checker/about-the-fact-checker/)

In [None]:
# this code is from the fact checking portion of this project. It grabs the fact checked tweets from
# the WaPo Trump tweet fact checking extension and adds the ratings to correspoding tweets in the spreadsheet

# sheetnames
trump_sheet = 'realDonaldTrump'
potus_sheet = 'POTUS'

logger.info("Start...")

# read in WaPo fact checks of Donald Trump from the WaPo Trump tweet chrome extension
trump_check = pd.read_json('https://www.pbump.net/files/post/extension/core/data.php')
# rename columns and remove text columns
trump_check.columns = ['id', 'rating', 'tweet', 'source']
trump_check = trump_check[['id', 'rating', 'source']]
# call expand lists to turn fact checks of multiple tweets into multiple columns
trump_check = expand_lists(trump_check)

# load pre-election fact checks and filter for just id, rating, and source
election_checks = pd.read_json('preelection_wapo.json')
election_checks = election_checks[['id', 'rating', 'source']]

# append the hand collected data with the data collected from the extension
trump_check = trump_check.append(election_checks, ignore_index=True)
trump_check.columns = ['id', 'WAPO_RATING', 'WAPO_SOURCE']
logger.info("read in fact checks")

# set file pathway variables an expand to HOME
in_path = '~/Dropbox/Summer_of_Tweets/fact_checking/Presidential_Fact_Checking.xlsx'
in_path = os.path.expanduser(in_path)

# properly load spreadsheet to append new data
work_book = load_workbook(in_path)
tweet_writer = pd.ExcelWriter(in_path, engine='openpyxl')
tweet_writer.book = work_book
tweet_writer.sheets = dict((ws.title, ws) for ws in work_book.worksheets)
tweets_df = pd.read_excel(in_path, sheetname=trump_sheet, dtype={'id': str})
logger.info("Downloaded excel sheets list")

# change data type to match excel sheet's
trump_check['id'] = trump_check['id'].astype(str)
#merge the fact check data set with the tweets set using tweet id
merged_df = tweets_df.merge(trump_check, on='id', how='left')

logger.info(merged_df.shape) # used for debugging
# write merged data to the excel sheet
merged_df.to_excel(tweet_writer, sheet_name=trump_sheet, index=False)
tweet_writer.save()

# merged_df.to_csv('WaPo.csv', encoding='utf-8') # used for viewing test results

logger.info("done")

# Follower Growth
The follower growth for Hillary Clinton and Donald Trump is being collected for the project. After some research on which sites are best for follower growth data, [Trackalytics](http://www.trackalytics.com) is the best free resource for tracking follower growth. However it does not have comprehensive follower growth data for the rest of the candidates, the others either are not present on the site or their data starts to get collected well into the election cycle.

The data is scraped using the IMPORTHTML function in google sheets. Information on the function and how to use it can be found [here](http://lenagroeger.s3.amazonaws.com/talks/orlando/gettingdata.html)  while the sheet itself can be found [here](https://docs.google.com/spreadsheets/d/1rahomcsDJFf_za0S_Tbzi1kv79bdNM2ZqNZ_H7XcMIM/edit?usp=sharing). 

The following function runs to clean the data sheet to move daily delta in followers into its own column and then downloading and moving the sheet onto the FISP dropbox.

Implemented using the df2gspread module, documentation for the module can be found [here](https://github.com/maybelinot/df2gspread)

[Trump Follower Tracker](http://www.trackalytics.com/twitter/profile/RealDonaldTrump/)

[Clinton Follower Tracker](http://www.trackalytics.com/twitter/profile/HillaryClinton/)

*The Site is missing data for July 3-5th 2016

In [99]:
# the path to the follower growth sheeet in my drive
#sheet_path = "archive/fisp_twitter/sheets/follower_growth_test"
sheet_path = "./follower_growth.xlsx"

# currently it contains Clinton and Trump follower growth
clinton_sheet = "HillaryClinton"
trump_sheet = "realDonaldTrump"

In [100]:
# passed into the apply func to remove daily change in metadata for twitter accounts
def split_func (string):
  return string.split()[0]

In [101]:
# import module for downloading the sheets from the drive 
# from df2gspread import gspread2df as g2d 

def gen_cand_followers (file_path, sheet_name):
  # download the follower growth sheet from gdrive
  #df = g2d.download(file_path, sheet_name, col_names = True) # use this line if directly importing from gSheets
  df = pd.read_excel(file_path, sheet_name) # use this line if importing locally aftering downloading it from Sheets

  # take the top row and convert it to the column header, this code is only needed if data is pulled from gdrive
  #df.columns = df.iloc[0]
  #df = df.reindex(df.index.drop(0))
  
  # trim out the change data that is appended at the end ot the daily value for the metadata
  df['Followers'] = df.Followers_change.apply(split_func)
  df['Following'] = df.Following_change.apply(split_func)
  df['Tweets'] = df.Tweets_change.apply(split_func)
  
  # drop the change rows
  df = df.drop(['Followers_change', 'Following_change', 'Tweets_change', 
                'Lists_change', 'Favourites_change', 'Tweets', 'id', 'Following'], 1)
  
  # convert date to MM/DD/YYYY format and rename the column
  df['Date'] = pd.to_datetime(df.Date)
  df['Date'] = df['Date'].dt.strftime('%m/%d/%Y')
  df.columns = ['date', 'follower_count']
  
  # add col with candidate name
  df['handle'] = sheet_name
  
  return df

In [102]:
clinton_follower_df = gen_cand_followers (sheet_path, clinton_sheet)
trump_follower_df = gen_cand_followers (sheet_path, trump_sheet)

In [103]:
trump_follower_df.head()

Unnamed: 0,date,follower_count,handle
0,04/25/2018,51164021,realDonaldTrump
1,04/24/2018,51122142,realDonaldTrump
2,04/23/2018,51078478,realDonaldTrump
3,04/22/2018,51042524,realDonaldTrump
4,04/21/2018,51005809,realDonaldTrump


In [104]:
# append row with each candidate's last name
clinton_follower_df['lastname'] = 'Clinton'
trump_follower_df['lastname'] = 'Trump'

In [107]:
gen_cand_followers_df = clinton_follower_df.append(trump_follower_df, ignore_index=True)

In [106]:
follower_growth_df = pd.read_csv('follower_count_growth.csv')
follower_growth_df = handle_to_lastname(follower_growth_df, 'handle', cand_to_lastname)

In [108]:
follower_growth_df = follower_growth_df.append(gen_cand_followers_df, ignore_index=True)

In [113]:
follower_growth_df.date = pd.to_datetime(follower_growth_df['date'].str.replace("-", "/"))

In [123]:
# save csv's path and load it into a panda's dataframe
path = '~/Dropbox/Summer_of_Tweets/Deduped_Tweets/polling_merged.csv'
df = pd.read_csv(path, low_memory=False)

# duplicated the tweet created at columns to made a modify it to a MM/DD/YY format
df['date'] = df['created_at']
df['date'] = pd.to_datetime(df.date)
df['date'] = pd.to_datetime(df['date'].dt.strftime('%m/%d/%Y'))

In [125]:
# merge value needs to be sorted to go through merge_asof 
df = df.sort_values(['date'])
follower_growth_df = follower_growth_df.sort_values(['date'])

In [132]:
follower_growth_df.to_csv('test.csv', index=False)

In [131]:
# merge the full data set with the general cadidate's follower growth using the date and lastname fields
merged_df = pd.merge_asof(df, follower_growth_df, on='date', by='lastname')
follower_df = merged_df.to_csv('followers_polling_merged.csv', index=False)

## Wayback Machine Follower Pull

In [None]:
# path variables 
twitter_pages = '~/fisp_testing/new_pull/' # replace with path to wayback machine archive directories 

In [7]:
def compile_follower_growth(pages_dir):
  # start timer
  start = time.time()
  logger.info("Start...")
  
  # intialize dataframe
  follower_count_df = pd.DataFrame(columns=['date', 'follower_count', 'handle'])
  
  # Get the handles from dir present in dir of twitter pages pulled
  handles = [name for name in os.listdir(os.path.expanduser(pages_dir)) if not name.startswith('.')]
  
  # loop through each handle present and extract date and corresponding follower count
  for handle in handles:
    # dict with key:value equaling date:follower_count
    follower_count_dict = {}
    
    # keep a list of dates failed for debugging purposes
    failed_dates = []
    passed_dates = []
    
    logger.info("Current Handle: %s", handle)
    
    # make a list of the dates present in the wayback archive
    dates = [date for date in os.listdir(os.path.expanduser(pages_dir + handle)) if not date.startswith('.')]
    
    # for each archive date find 'follower_count' element and extract total
    for date in dates:
      # intialize len of number and bool that keeps track of whether number has ended
      num_len = 0
      num = False
      
      # convert string to datetime object
      count_date = datetime.strptime(date, "%Y%m%d%H%M%S") # currently not being utilized and converted after the fact
      
      # get the path of the specific date's archived html
      page = pages_dir + handle + '/' + date + '/twitter.com' + '/' + handle
      # open the file as a BeuatifulSoup object then convert it to an str for easier search indexing
      soup = BeautifulSoup(open(os.path.expanduser(page)), 'html.parser')
      soup = str(soup)
      
      # get the index of where the follower_count element is
      init = soup.find('followers_count')
      
      # for debugging purposes keep track of failed follower_count search
      if (init == -1):
        failed_dates.append(date[0:4])
        continue
      
      # iterate thru the string until a digit is reach
      while (not num):
        init+=1 # keep track of where the number's initial index
        num = soup[init].isdigit() # will return whether current char is a digit
      
      # iterate thru the number until you reach a char that is not a digit
      while (num):
        num_len+=1 # keep track of the len of the number
        num = soup[init:init+num_len].isdigit() # will return whether current char is a digit
      
      # slice the number out of the html string and convert to an int
      follower_count = int(soup[init:init+num_len-1])
      # store the follower count in a dict with a key:value of date:follower_count
      follower_count_dict[date] = follower_count
    
    # initialize a temporary dataframe to store the current handle's follower_count growth
    temp_df = pd.DataFrame.from_dict(follower_count_dict, orient='index')
    temp_df.reset_index(level=0, inplace=True)
    temp_df['handle'] = handle # add a col indicating corresponding handle
    temp_df.columns = ['date', 'follower_count', 'handle'] # rename columns
    
    # append the new data to the comprehensive dataframe with the growth for all present candidates
    follower_count_df = follower_count_df.append(temp_df)
  logger.info('Number of failed follower_count element searches %d', len(failed_dates))
  logger.info('Done!')
  return follower_count_df

In [18]:
follower_count_df = compile_follower_growth(twitter_pages)

INFO:__main__:Start...
INFO:__main__:Current Handle: BobbyJindal
INFO:__main__:Current Handle: gov_gilmore


nope!


INFO:__main__:Current Handle: ChrisChristie
INFO:__main__:Current Handle: Lessig2016
INFO:__main__:Current Handle: GovMikeHuckabee
INFO:__main__:Current Handle: LindseyGrahamSC
INFO:__main__:Current Handle: RandPaul
INFO:__main__:Current Handle: GovernorPataki
INFO:__main__:Current Handle: ScottWalker
INFO:__main__:Current Handle: MartinOMalley
INFO:__main__:Current Handle: RickSantorum
INFO:__main__:Current Handle: JohnKasich
INFO:__main__:Current Handle: marcorubio
INFO:__main__:Current Handle: RealBenCarson
INFO:__main__:Current Handle: JimWebbUSA
INFO:__main__:Current Handle: tedcruz
INFO:__main__:Current Handle: GovernorPerry
INFO:__main__:Current Handle: BernieSanders


nope!


INFO:__main__:Current Handle: CarlyFiorina
INFO:__main__:Done!


In [21]:
# uncomment this to save resulting dataframe
#follower_count_df.to_csv('follower_count_df.csv', index=False)

### Cleaning Follower Count Data

In [68]:
follower_count_df = pd.read_csv('follower_count_df.csv')

In [69]:
# properly sort the data
follower_count_df = follower_count_df.sort_values(['handle','date'])
follower_count_df = follower_count_df.reset_index()
follower_count_df = follower_count_df[['date', 'follower_count', 'handle']]

In [70]:
follower_count_df.head()

Unnamed: 0,date,follower_count,handle
0,20150505022756,35215,BernieSanders
1,20150518030446,35215,BernieSanders
2,20150519234044,37016,BernieSanders
3,20150519234046,37017,BernieSanders
4,20150519234110,37016,BernieSanders


In [8]:
def clean_duplicate_dates(follower_count_df):
  # keep a list of each row to be dropped from the dataframe
  entries_to_drop = []
  # loop through each row
  for index, row in follower_count_df.iterrows():
    # take the current date and modify it to remove time 
    curr_date = int((row['date']) / 1000000)
    #curr_date = row['date'][0:8] # use this line if date is saved as a string
    
    # the first row cannot be compared
    if (index != 0):
      # if prev MMDDYYYY matches the current add it to be removed
      if (curr_date == prev_date[1]):
        entries_to_drop.append(prev_date[0])
    # set prev_date to current_date for next iteration
    prev_date = (index, curr_date)
  
  logger.info("expected size of dataframe: %d", len(follower_count_df) - len(entries_to_drop))
  entries_to_keep = set(range(len(follower_count_df))) - set(entries_to_drop)
  follower_count_df = follower_count_df.take(list(entries_to_keep))
  logger.info("actual size of dataframe %d", len(follower_count_df))
  return follower_count_df

In [73]:
# convert from YYYYMMDDHHMMSS format to YYYYMMDD 
temp_series = pd.Series((follower_count_df.date / 1000000)).astype(int)
follower_count_df.date = pd.to_datetime(temp_series, format="%Y%m%d")

In [75]:
# sort the data by handles and date
follower_count_df.sort_values(['handle','date']).to_csv('follower_count_growth.csv', index=False)

## Polling Data Match

This set of code will take the polling data from 538 and match them to their corresponding day's tweets.

In [73]:
##############################################
# load in the data
prim_df = pd.read_csv('./data/national_primary_poll_average_2016.csv')
# drop unncessary columns
prim_df = prim_df[['lastname', 'poll_avg', 'forecastdate']]

# reformat dates into a mm/dd/YY format and then rename columns for consistency
prim_df['forecastdate'] = pd.to_datetime(prim_df.forecastdate)
prim_df['forecastdate'] = prim_df['forecastdate'].dt.strftime('%m/%d/%Y')
prim_df.columns = ['lastname', 'forecast', 'date']

In [74]:
prim_df.lastname.unique()

array(['Sanders', 'Clinton', 'Trump', 'Kasich', 'Cruz', 'Rubio', 'Carson',
       'Bush', 'Fiorina', 'Christie', 'Santorum', 'Paul', "O'Malley",
       'Huckabee', 'Pataki', 'Graham', 'Jindal', 'Lessig', 'Chafee',
       'Webb', 'Walker', 'Perry'], dtype=object)

In [75]:
# load in the nat data and filter out for polls-only data
nat_df = pd.read_csv('./data/national_topline.csv')
nat_df = nat_df[nat_df.type == 'polls-only']

# reformat dates into a mm/dd/YY format
nat_df['forecastdate'] = pd.to_datetime(nat_df.forecastdate)
nat_df['forecastdate'] = nat_df['forecastdate'].dt.strftime('%m/%d/%Y')

# remove all but date and prediction score for trump and clinton
nat_df = nat_df[['forecastdate', 'ecwin_clinton', 'ecwin_trump']]
nat_df.head()

# create separate dataframes for each candidate to make it easier to manipulate and combine with primary data
clinton_df = nat_df[['forecastdate', 'ecwin_clinton']]
trump_df = nat_df[['forecastdate', 'ecwin_trump']]

# add a corresponding column for lastname to match primary prediction data format
clinton_df['lastname'] = 'Clinton'
trump_df['lastname'] = 'Trump'

# rename and rearrange columns for consistency with primary data
clinton_df.columns = ['date', 'forecast', 'lastname']
trump_df.columns = ['date', 'forecast', 'lastname']
clinton_df = clinton_df[['lastname', 'forecast', 'date']]
trump_df = trump_df[['lastname', 'forecast', 'date']]

In [76]:
# append general election forecast data with primary election forecast data
forecast_df = prim_df.append(clinton_df, ignore_index=True)
forecast_df = forecast_df.append(trump_df, ignore_index=True)
#forecast_df.to_csv('538_polling.csv') # export forecast to a csv

In [77]:
forecast_df.lastname.unique()

array(['Sanders', 'Clinton', 'Trump', 'Kasich', 'Cruz', 'Rubio', 'Carson',
       'Bush', 'Fiorina', 'Christie', 'Santorum', 'Paul', "O'Malley",
       'Huckabee', 'Pataki', 'Graham', 'Jindal', 'Lessig', 'Chafee',
       'Webb', 'Walker', 'Perry'], dtype=object)

In [78]:
# set csv's path and load it into a panda's dataframe
path = '~/Dropbox/Summer_of_Tweets/Deduped_Tweets/deduped_tweets.csv'
df = pd.read_csv(path, low_memory=False)

# duplicated the tweet created at columns to made a modify it to a MM/DD/YY format
df['date'] = df['created_at']
df['date'] = pd.to_datetime(df.date)
df['date'] = df['date'].dt.strftime('%m/%d/%Y')

In [79]:
# grab the cand names from the original sheet and make a parallel array of each cand's last name to match polling data
cand = df.Candidate.unique()
cand_lastname = ['Carson', 'Sanders', 'Jindal', 'Jindal', 'Fiorina', 'Fiorina', 'Christie', 'Christie', 'Sanders',
                 'Pataki', 'Perry', 'Gilmore', 'Huckabee', 'Trump', 'Clinton', 'Clinton', 'Bush', 'Bush', 'Webb',
                 'Kasich', 'Kasich', 'Lessig', 'Chaffee', 'Graham', 'Graham', 'Rubio', 'Rubio', "O'Malley", "O'Malley", 
                 'Huckabee', 'Sanders', 'Paul', 'Paul', 'Carson', 'Trump', 'Trump', 'Perry',
                 'Santorum', 'Santorum', 'Walker', 'Walker', 'Cruz', 'Cruz']

# take the parallel arrays and make a dict with the orig name as key and the lastname as value
cand_to_lastname = {}
for (cand, lastname) in zip(cand, cand_lastname):
    cand_to_lastname[cand] = lastname

# take the parallel arrays and make a dict with the orig name as key and the lastname as value
lastname_to_cand = {}
for (cand, lastname) in zip(cand_lastname, cand):
    lastname_to_cand[cand] = lastname

    

# take the parallel arrays and make a dict with the orig name as key and the lastname as value
lastname_to_cand = {}
for (cand, lastname) in zip(cand_lastname, cand):
    lastname_to_cand[cand] = lastname


In [80]:
df.Candidate.unique()

array(['Ben_Carson', 'BernieSanders', 'BobbyJindal', 'Bobby_Jindal',
       'CarlyFiorina', 'Carly_Fiorina', 'ChrisChristie', 'Chris_Christie',
       'feelthebernorg', 'GovernorPataki', 'GovernorPerry', 'gov_gilmore',
       'GovMikeHuckabee', 'greatamericapac', 'HillaryClinton',
       'Hillary_Clinton', 'JebBush', 'Jeb_Bush', 'JimWebbUSA',
       'JohnKasich', 'John_Kasich', 'Lessig2016', 'LincolnChafee',
       'Lindsey_Graham', 'LindseyGrahamSC', 'marcorubio', 'Marco_Rubio',
       'MartinOMalley', "Martin_O'Malley", 'Mike_Huckabee',
       'progressivekick', 'RandPaul', 'Rand_Paul', 'RealBenCarson',
       'realDonaldTrump', 'RebuildingAmNow', 'Rick_Perry', 'RickSantorum',
       'Rick_Santorum', 'ScottWalker', 'Scott_Walker', 'tedcruz',
       'Ted_Cruz'], dtype=object)

In [81]:
# grab the cand names from the original sheet and make a parallel array of each cand's last name to match polling data
cand = df.Candidate.unique()

cand_lastname = ['Carson', 'Sanders', 'Jindal', 'Jindal', 'Fiorina', 'Fiorina', 'Christie', 'Christie', 'Sanders',
                 'Pataki', 'Perry', 'Gilmore', 'Huckabee', 'Trump', 'Clinton', 'Clinton', 'Bush', 'Bush', 'Webb',
                 'Kasich', 'Kasich', 'Lessig', 'Chafee', 'Graham', 'Graham', 'Rubio', 'Rubio', "O'Malley", "O'Malley", 
                 'Huckabee', 'Sanders', 'Paul', 'Paul', 'Carson', 'Trump', 'Trump', 'Perry',
                 'Santorum', 'Santorum', 'Walker', 'Walker', 'Cruz', 'Cruz']

# take the parallel arrays and make a dict with the orig name as key and the lastname as value
lastname_to_cand = {}
for (cand, lastname) in zip(cand_lastname, cand):
    lastname_to_cand[cand] = lastname

In [82]:
lastname_to_cand = {'Carson': 'Ben_Carson', 'Sanders': 'BernieSanders', 'Jindal': 'BobbyJindal', 
                    'Fiorina': 'CarlyFiorina', 'Christie':'ChrisChristie', 'Pataki': 'GovernorPataki',
                    'Perry': 'GovernorPerry', 'Gilmore': 'gov_gilmore', 'Huckabee': 'GovMikeHuckabee',
                    'Trump': 'realDonaldTrump', 'Clinton': 'Hillary_Clinton', 'Bush': 'JebBush', 'Webb':'JimWebbUSA', 
                    'Kasich': 'John_Kasich', 'Lessig': 'Lessig2016', 'Chafee': 'LincolnChafee',
                    'Graham': 'Lindsey_Graham', 'Rubio': 'marcorubio', "OMalley": 'MartinOMalley', 'Paul': 'RandPaul',
                    'Perry': 'Rick_Perry','Santorum': 'RickSantorum', 'Walker': 'ScottWalker', 'Cruz': 'tedcruz'}

In [72]:
# make a parallel list of each cand's last name to be appeneded to the original dataframe
# lastname_col = []
# for cand in forecast_df.lastname:
#   lastname_col.append(lastname_to_cand[cand])
# 
# forecast_df['lastname'] = lastname_col

#forecast_df.to_csv('538_polling.csv')

KeyError: 'BernieSanders'

In [83]:
# make a parallel list of each cand's last name to be appeneded to the original dataframe
lastname_col = []
for cand in df.Candidate:
  lastname_col.append(cand_to_lastname[cand])
  
df['lastname'] = lastname_col

In [None]:
forecast_df.to_csv('538_polling.csv')

In [84]:
print forecast_df.lastname.unique()
logger.info("Dataframe size: %d", len(forecast_df))

INFO:__main__:Dataframe size: 5596


['Sanders' 'Clinton' 'Trump' 'Kasich' 'Cruz' 'Rubio' 'Carson' 'Bush'
 'Fiorina' 'Christie' 'Santorum' 'Paul' "O'Malley" 'Huckabee' 'Pataki'
 'Graham' 'Jindal' 'Lessig' 'Chafee' 'Webb' 'Walker' 'Perry']


In [88]:
merged_df = df.merge(forecast_df, on=['date', 'lastname'])
polling_merged = merged_df.to_csv('polling_merged.csv', index=False)

In [86]:
merged_df

Unnamed: 0,Candidate,from,id,created_at,text,hashtag.,at.,link,retweets,favorites,...,dur_camp,pre_2016,post_ge,pre_prim,post_prim,clean_text,rt_dummy,date,lastname,forecast
0,Ben_Carson,DraftRunBenRun,6.77E+17,2015-12-16 02:03:00,RT @OneNation4Ben: I support #BenCarson2016 ht...,#BenCarson2016 #RunBenRun #PJET #tcot,@OneNation4Ben,https://t.co/cO9Z9h2Bfg,8.0,0.0,...,1.0,0.0,0.0,1.0,0.0,rt @onenation4ben: i support #bencarson2016,1,12/16/2015,Carson,13.491012
1,Ben_Carson,DraftRunBenRun,6.77E+17,2015-12-16 02:01:00,RT @ThePatriot143: Classy act by Ben Carson #G...,#GOPDebate #MomentOfSilence #SanBernardino,@ThePatriot143,,29.0,0.0,...,1.0,0.0,0.0,1.0,0.0,rt @thepatriot143: classy act by ben carson #g...,1,12/16/2015,Carson,13.491012
2,Ben_Carson,DraftRunBenRun,6.77E+17,2015-12-16 02:01:00,RT @nprpolitics: Extended medical metaphor fro...,,@nprpolitics,,18.0,0.0,...,1.0,0.0,0.0,1.0,0.0,rt @nprpolitics: extended medical metaphor fro...,1,12/16/2015,Carson,13.491012
3,Ben_Carson,DraftRunBenRun,6.77E+17,2015-12-16 01:43:00,Carson takes the stage #WinBenWin #BC2DC16,#WinBenWin #BC2DC16,,,8.0,8.0,...,1.0,0.0,0.0,1.0,0.0,carson takes the stage #winbenwin #bc2dc16,0,12/16/2015,Carson,13.491012
4,Ben_Carson,DraftRunBenRun,6.77E+17,2015-12-16 01:41:00,RT @SandyMPool: BEN CARSON WILL BRING LEADERSH...,,@SandyMPool,,7.0,0.0,...,1.0,0.0,0.0,1.0,0.0,rt @sandympool: ben carson will bring leadersh...,1,12/16/2015,Carson,13.491012
5,Ben_Carson,DraftRunBenRun,6.77E+17,2015-12-16 02:09:00,RT @BreitbartNews: Ben Carson begins with a mo...,#GOPDebate,@BreitbartNews,http,45.0,0.0,...,1.0,0.0,0.0,1.0,0.0,rt @breitbartnews: ben carson begins with a mo...,1,12/16/2015,Carson,13.491012
6,Ben_Carson,DraftRunBenRun,6.77E+17,2015-12-16 01:45:00,"RT @irgrannyg: Go, Ben! https://t.co/7oevzZnFFR",,@irgrannyg,https://t.co/7oevzZnFFR,7.0,0.0,...,1.0,0.0,0.0,1.0,0.0,"rt @irgrannyg: go, ben!",1,12/16/2015,Carson,13.491012
7,Ben_Carson,DraftRunBenRun,6.77E+17,2015-12-16 02:03:00,RT @SusanCucinotta: @RealBenCarson #Carsonator...,#Carsonators #Students4Carson,@SusanCucinotta @RealBenCarson @YouthForCarson...,,13.0,0.0,...,1.0,0.0,0.0,1.0,0.0,rt @susancucinotta: #carsonators #students4car...,1,12/16/2015,Carson,13.491012
8,Ben_Carson,DraftRunBenRun,6.77E+17,2015-12-16 02:00:00,RT @Cry2MarvinsRoom: Ben Carson with the FIRE ...,,@Cry2MarvinsRoom,,7.0,0.0,...,1.0,0.0,0.0,1.0,0.0,rt @cry2marvinsroom: ben carson with the fire ...,1,12/16/2015,Carson,13.491012
9,Ben_Carson,DraftRunBenRun,6.77E+17,2015-12-16 02:04:00,"RT @FoxBusiness: .@RealBenCarson: Right now, t...",#GOPDeb,@FoxBusiness,,28.0,0.0,...,1.0,0.0,0.0,1.0,0.0,"rt @foxbusiness: .@realbencarson: right now, t...",1,12/16/2015,Carson,13.491012


In [87]:
print merged_df.shape

(99032, 48)


# Execute Lines to Pull and Update Tweet Data

In [None]:
collect_data(path + cand_tweets, cand_sheet, path + tweet_list)
#collect_data(path + pac_tweets, pac_sheet, path + tweet_list)

collect_addition_data(path + cand_tweets, cand_sheet, path + tweet_list)
#collect_addition_data(path + pac_tweets, pac_sheet, path + tweet_list)



# Graveyard 

In [None]:
#curr_size = len(follower_count_df)
#prev_size = 0
#while (curr_size != prev_size):
#  prev_size = curr_size
#  entries_to_drop = []
#  for index, row in follower_count_df.iterrows():
#    curr_date = int((row['date']) / 1000000)
#    #curr_date = (row['date'][0:8])
#    if (index != 0):
#      if (curr_date == prev_date[1]):
#        print 'here'
#        entries_to_drop.append(prev_date[0])
#    prev_date = (index, curr_date)
#
#  print  "to drop: " + str(len(entries_to_drop))
#  entries_to_keep = set(range(len(follower_count_df))) - set(entries_to_drop)
#  #test_df = follower_count_df.drop(entries_to_drop, axis=0)
#  follower_count_df = follower_count_df.take(list(entries_to_keep))
#  curr_size = len(follower_count_df)