In [1]:
!python3 -m pip install transformers wordcloud tweepy

Collecting transformers
  Downloading transformers-4.0.0-py3-none-any.whl (1.4 MB)
[K     |████████████████████████████████| 1.4 MB 1.7 MB/s eta 0:00:01
[?25hCollecting wordcloud
  Downloading wordcloud-1.8.1-cp37-cp37m-manylinux1_x86_64.whl (366 kB)
[K     |████████████████████████████████| 366 kB 1.7 MB/s eta 0:00:01
[?25hCollecting tweepy
  Downloading tweepy-3.9.0-py2.py3-none-any.whl (30 kB)
Collecting sacremoses
  Downloading sacremoses-0.0.43.tar.gz (883 kB)
[K     |████████████████████████████████| 883 kB 1.4 MB/s eta 0:00:01
[?25hCollecting regex!=2019.12.17
  Downloading regex-2020.11.13-cp37-cp37m-manylinux2014_x86_64.whl (719 kB)
[K     |████████████████████████████████| 719 kB 394 kB/s eta 0:00:01
Collecting tokenizers==0.9.4
  Downloading tokenizers-0.9.4-cp37-cp37m-manylinux2010_x86_64.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 3.3 MB/s eta 0:00:01
Collecting filelock
  Using cached filelock-3.0.12-py3-none-any.whl (7.6 kB)
Collecting requests-

## Imports & Setup

In [3]:
import os
import re
import zipfile
from datetime import datetime

import numpy as np
import pandas as pd

In [22]:
import tweepy
# Twitter API credentials are taken from the environment so that real secrets
# never have to be typed into (and saved with) the notebook.
# Both fall back to an empty string when the variable is not set.
TWITTER_KEY = os.environ.get('TWITTER_KEY', '')
TWITTER_SECRET = os.environ.get('TWITTER_SECRET', '')

In [2]:
# Read the election tweets CSV (semicolon-separated) directly out of the zip
# archive. The context managers close both the archive and the member stream
# as soon as the frame has been loaded.
with zipfile.ZipFile('/content/drive/MyDrive/electiondata.zip') as zf:
  with zf.open('uselection_tweets_1jul_11nov.csv') as csv_member:
    df = pd.read_csv(csv_member, sep=';')

## Trim down dataset to be feasible with Twitter API rate limits

- First, limit it to tweets solely about the Democratic or Republican party (no independents or mixed subject tweets)

- Pick out tweets with high-magnitude scores ($|score| > 1$, for now)
- English language only
- *then* sample 20k tweets from this subset

In [35]:
# Number of tweets to draw, and a fixed seed so that re-running the notebook
# selects the same subsample.
SAMPLE_SIZE = 20000
SAMPLE_SEED = 42

df["Created-At"] = pd.to_datetime(df["Created-At"])

# Keep only rows that are (a) about exactly one of the two major parties,
# (b) scored with |Score| > 1, and (c) tagged as English.
is_major_party = df["PartyName"].isin(["Democrats", "Republicans"])
is_strongly_scored = df["Score"].abs() > 1
is_english = df["Language"] == 'en'
trimmed_tweets = df.loc[is_major_party & is_strongly_scored & is_english]

# Raises ValueError if fewer than SAMPLE_SIZE rows survive the filter.
tweets_dataset_20k = trimmed_tweets.sample(SAMPLE_SIZE, random_state=SAMPLE_SEED)

In [36]:
# Renumber the sampled rows 0..n-1 (the old index is discarded, not kept as a
# column), then show the frame.
tweets_dataset_20k = tweets_dataset_20k.reset_index(drop=True)
tweets_dataset_20k

Unnamed: 0,Created-At,From-User-Id,To-User-Id,Language,Retweet-Count,PartyName,Id,Score,Scoring String,Negativity,Positivity,Uncovered Tokens,Total Tokens
0,2020-10-09 11:27:00,826540340349919232,-1,en,127.0,Republicans,1314482702922510336,1.205128,support (0.44) please (0.33) help (0.44),0.000000,1.205128,26,29
1,2020-09-24 10:38:00,1281052459507617796,-1,en,7852.0,Democrats,1309034405457997827,-1.102564,dangerous (-0.54) enemies (-0.56),1.102564,0.000000,24,26
2,2020-10-05 13:48:00,267405095,-1,en,1264.0,Democrats,1313068582805270529,1.487179,actively (0.33) win (0.72) party (0.44),0.000000,1.487179,26,29
3,2020-07-08 04:54:00,825892758246268928,-1,en,588.0,Republicans,1280681559587033089,1.282051,important (0.21) intellectual (0.59) vitally...,0.000000,1.282051,22,26
4,2020-09-10 04:25:00,29928823,-1,en,105.0,Republicans,1303867078458445826,1.307692,smart (0.44) smart (0.44) smart (0.44),0.000000,1.307692,24,27
...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,2020-10-16 09:15:00,745691064212807681,1092037021432897537,en,0.0,Republicans,1316986053262233600,1.025641,yeah (0.31) like (0.38) original (0.33),0.000000,1.025641,42,45
19996,2020-09-01 06:41:00,1222954637881376769,-1,en,1021.0,Democrats,1300639789574688770,1.205128,best (0.82) like (0.38),0.000000,1.205128,31,33
19997,2020-08-29 21:05:00,122998049,1275982191361736704,en,0.0,Republicans,1299770268840271873,-1.333333,agreed (0.28) dirty (-0.49) no (-0.31) disa...,3.846154,2.512821,49,62
19998,2020-10-25 17:35:00,812549513541992448,1267679754431475713,en,0.0,Democrats,1320373430714200072,-2.051282,complacent (-0.08) fighting (-0.38) battles ...,2.051282,0.000000,60,65


## Add a 'state' column and new labels


In [21]:
# Two-letter postal codes (50 states plus DC) recognised in a location string.
STATES_ABBREVIATIONS = ["AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL", "GA",
          "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD",
          "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ",
          "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC",
          "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"]

# Full state names recognised in a location string.
STATE_NAMES = ["Alaska", "Alabama", "Arkansas", "Arizona",
               "California", "Colorado", "Connecticut", "Delaware",
               "Florida", "Georgia", "Hawaii", "Iowa", "Idaho",
               "Illinois", "Indiana", "Kansas", "Kentucky", "Louisiana",
               "Massachusetts", "Maryland", "Maine", "Michigan", "Minnesota",
               "Missouri", "Mississippi", "Montana", "North Carolina",
               "North Dakota", "Nebraska", "New Hampshire", "New Jersey",
               "New Mexico", "Nevada", "New York", "Ohio", "Oklahoma", "Oregon",
               "Pennsylvania", "Rhode Island", "South Carolina",
               "South Dakota", "Tennessee", "Texas", "Utah", "Virginia",
               "Vermont", "Washington", "Wisconsin", "West Virginia", "Wyoming"]

# Postal code -> full name. Every code in STATES_ABBREVIATIONS has an entry
# here (including DC), so a matched code always expands to a non-empty name.
STATES_FULL = {"AL":"Alabama","AK":"Alaska","AZ":"Arizona","AR":"Arkansas",
                        "CA":"California","CO":"Colorado","CT":"Connecticut",
                        "DC":"District of Columbia",
                        "DE":"Delaware", "FL":"Florida","GA":"Georgia","HI":"Hawaii",
                        "ID":"Idaho","IL":"Illinois","IN":"Indiana","IA":"Iowa",
                        "KS":"Kansas","KY":"Kentucky","LA":"Louisiana","ME":"Maine",
                        "MD":"Maryland","MA":"Massachusetts","MI":"Michigan",
                        "MN":"Minnesota","MS":"Mississippi","MO":"Missouri",
                        "MT":"Montana","NE":"Nebraska","NV":"Nevada",
                        "NH":"New Hampshire","NJ":"New Jersey","NM":"New Mexico",
                        "NY":"New York","NC":"North Carolina","ND":"North Dakota",
                        "OH":"Ohio","OK":"Oklahoma","OR":"Oregon","PA":"Pennsylvania",
                        "RI":"Rhode Island","SC":"South Carolina","SD":"South Dakota",
                        "TN":"Tennessee","TX":"Texas","UT":"Utah","VT":"Vermont",
                        "VA":"Virginia","WA":"Washington","WV":"West Virginia",
                        "WI":"Wisconsin","WY":"Wyoming"}

# Case-sensitive pattern matching any postal code or full state name as a
# whole word. The \b anchors stop two-letter codes from firing inside longer
# upper-case words (e.g. "IN" inside "INDIA", "OR" inside "ORLANDO").
# Note: a location such as "Washington, DC" still yields "Washington", because
# the leftmost match wins.
state_regex = re.compile(
    r'\b(?:' + '|'.join(re.escape(term) for term in STATES_ABBREVIATIONS + STATE_NAMES) + r')\b'
)

# From here on the abbreviations are only used for membership tests.
STATES_ABBREVIATIONS = set(STATES_ABBREVIATIONS)

In [None]:
# Application-only authentication against the Twitter API. The two
# wait_on_rate_limit* flags ask tweepy to wait (and announce it) when the rate
# limit is hit instead of failing.
auth = tweepy.AppAuthHandler(TWITTER_KEY, TWITTER_SECRET)
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

# Columns filled in by the loop below; rows whose lookup fails keep "".
tweets_dataset_20k["State"] = ""
tweets_dataset_20k["Text"] = ""

failed_lookups = 0
for idx, tweet_id in tweets_dataset_20k["Id"].items():
  if idx % 100 == 0:
    print(f"Currently at: {idx}")

  # Only the API call is guarded, so unrelated errors are not hidden.
  try:
    tweet = api.get_status(int(tweet_id))
  except tweepy.TweepError:
    # Best effort: skip tweets that cannot be fetched, but keep count.
    failed_lookups += 1
    continue

  # The profile location may be missing; treat that as an empty string so the
  # regex search cannot raise on None.
  state_match = state_regex.search(tweet.user.location or "")
  if state_match:
    tweets_dataset_20k.at[idx, "State"] = state_match.group(0)
  tweets_dataset_20k.at[idx, "Text"] = tweet.text

print(f"Lookups failed for {failed_lookups} of {len(tweets_dataset_20k)} tweets")

In [47]:
# Expand postal abbreviations to full state names first. An abbreviation with
# no entry in STATES_FULL becomes "" and is removed by the filter below,
# instead of lingering as a row with an empty state.
expanded_state = tweets_dataset_20k["State"].map(
    lambda state: STATES_FULL.get(state, "") if state in STATES_ABBREVIATIONS else state
)
tweets_dataset_20k = tweets_dataset_20k.assign(State=expanded_state)

# Keep only rows for which a state was found, and renumber them 0..n-1.
tweets_dataset_20k = (
    tweets_dataset_20k
    .loc[tweets_dataset_20k["State"] != ""]
    .reset_index(drop=True)
)

In [50]:
# Inspect the resulting frame, including the added State and Text columns.
tweets_dataset_20k

Unnamed: 0,Created-At,From-User-Id,To-User-Id,Language,Retweet-Count,PartyName,Id,Score,Scoring String,Negativity,Positivity,Uncovered Tokens,Total Tokens,State,Text
0,2020-10-05 13:48:00,267405095,-1,en,1264.0,Democrats,1313068582805270529,1.487179,actively (0.33) win (0.72) party (0.44),0.000000,1.487179,26,29,Arizona,"RT @IngrahamAngle: Reminder: China, still acti..."
1,2020-09-10 04:25:00,29928823,-1,en,105.0,Republicans,1303867078458445826,1.307692,smart (0.44) smart (0.44) smart (0.44),0.000000,1.307692,24,27,North Carolina,RT @MsAvaArmstrong: Smart suburban women WILL ...
2,2020-07-13 07:50:00,13165482,-1,en,538.0,Republicans,1282537894121484288,1.179487,trust (0.59) trust (0.59),0.000000,1.179487,30,32,Texas,RT @AynRandPaulRyan: 26% of voters trust Trump...
3,2020-11-05 06:48:00,479964312,-1,en,70136.0,Democrats,1324196851034345472,1.564103,supporters (0.49) like (0.38) won (0.69),0.000000,1.564103,19,22,Iowa,RT @Vic_Goes: Tr*mp supporters really abt to b...
4,2020-08-20 04:11:00,987817704072495104,-1,en,16267.0,Democrats,1296253496061120514,1.076923,proud (0.54) proud (0.54),0.000000,1.076923,29,31,Utah,RT @NaomiBiden: I am proud to be your granddau...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2961,2020-10-11 02:25:00,1237571449025482752,-1,en,6644.0,Democrats,1315070909368004608,-1.615385,sick (-0.59) struggling (-0.46) worried (-0....,1.615385,0.000000,32,36,Oregon,"RT @JoeBiden: If you're sick, struggling, or w..."
2962,2020-09-19 14:05:00,112556325,-1,en,25888.0,Republicans,1307274512761790466,1.282051,supreme (0.67) justice (0.62),0.000000,1.282051,23,25,Ohio,RT @realDonaldTrump: Statement from the Presid...
2963,2020-09-25 16:12:00,877860473470955520,25073877,en,0.0,Republicans,1309480995284975623,1.820513,wonderful (0.69) promise (0.33) fortunate (0...,0.000000,1.820513,48,52,Texas,@realDonaldTrump We've been waiting for nearly...
2964,2020-11-09 08:56:00,3073561579,-1,en,2865.0,Republicans,1325678648792215552,1.076923,outgoing (0.31) courtesy (0.38) courtesy (0.38),0.000000,1.076923,25,28,Nevada,RT @SteveSchmidtSES: There has been a traditio...


In [49]:
# Persist the frame to Drive as CSV (default settings, so the index is written too).
output_csv_path = '/content/drive/MyDrive/tweets_3k.csv'
tweets_dataset_20k.to_csv(output_csv_path)