In [4]:
!pip install transformers tweet-preprocessor 

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ed/db/98c3ea1a78190dac41c0127a063abf92bd01b4b0b6970a6db1c2f5b66fa0/transformers-4.0.1-py3-none-any.whl (1.4MB)
[K     |▎                               | 10kB 21.0MB/s eta 0:00:01[K     |▌                               | 20kB 15.6MB/s eta 0:00:01[K     |▊                               | 30kB 15.0MB/s eta 0:00:01[K     |█                               | 40kB 14.8MB/s eta 0:00:01[K     |█▏                              | 51kB 11.5MB/s eta 0:00:01[K     |█▌                              | 61kB 11.6MB/s eta 0:00:01[K     |█▊                              | 71kB 11.2MB/s eta 0:00:01[K     |██                              | 81kB 12.3MB/s eta 0:00:01[K     |██▏                             | 92kB 13.1MB/s eta 0:00:01[K     |██▍                             | 102kB 12.3MB/s eta 0:00:01[K     |██▋                             | 112kB 12.3MB/s eta 0:00:01[K     |███                             | 

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import preprocessor as p # tweet-preprocessor
import nltk
import re
import seaborn as sns
import torch

from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [6]:
# Give colab access to drive to load csv file
filename = "/content/drive/MyDrive/Junior Year/DSP Final Project/election_tweets_1M.csv"
chunksize = 10 ** 6
df_main_analysis = pd.DataFrame()
for chunk in pd.read_csv(filename, chunksize=chunksize, usecols=['created_at','user_location', 'text']):
    df = chunk
    df_main_analysis = pd.concat([df_main_analysis,df])
    
    
df_main_analysis.rename(columns={"user_location": "location", "text": "full_text"}, inplace=True)
df_main_analysis.drop_duplicates(inplace=True)
df_main_analysis.reset_index(inplace=True)
df_main_analysis['location'] = df_main_analysis['location'].astype(str)
df_main_analysis['full_text'] = df_main_analysis['full_text'].astype(str)
df_main_analysis['created_at'] = df_main_analysis['created_at'].astype(str)
display(df_main_analysis)
display(df_main_analysis.dtypes)

Unnamed: 0,index,created_at,full_text,location
0,0,Sat Oct 24 05:43:05 +0000 2020,RT @MeidasTouch: I already voted for Joe Biden...,
1,1,Sat Oct 24 06:16:50 +0000 2020,RT @poutydobrik: THIS IS A REMINDER THAT KANYE...,"Tucson, AZ"
2,2,Thu Oct 22 10:24:17 +0000 2020,@Thorsha07820326 @Baddiel @realDonaldTrump Are...,"South West, England"
3,3,Tue Oct 13 04:00:04 +0000 2020,RT @RVAT2020: NEW AD: Former Director of the C...,Third rock from the sun
4,4,Sat Oct 31 08:44:41 +0000 2020,RT @tribelaw: “America has to be a functioning...,"Georgia, USA"
...,...,...,...,...
882953,948788,Sat Oct 17 19:15:14 +0000 2020,RT @WeHave2BeBetter: This dude every day is in...,
882954,948789,Tue Oct 27 04:09:33 +0000 2020,RT @SenRubioPress: El sen Rubio le envió una c...,"Pembroke Pines, FL"
882955,948790,Fri Oct 16 15:02:18 +0000 2020,RT @JoeBiden: President Obama and I left Donal...,
882956,948791,Tue Nov 03 14:22:26 +0000 2020,RT @JTHVerhovek: .@JoeBiden is stopping by Car...,Illinois


index          int64
created_at    object
full_text     object
location      object
dtype: object

In [7]:
STATES_ABBREVIATIONS = ["AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL", "GA", 
          "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", 
          "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", 
          "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC", 
          "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"]

STATE_NAMES = ["Alaska", "Alabama", "Arkansas", "Arizona", 
               "California", "Colorado", "Connecticut", "Delaware", 
               "Florida", "Georgia", "Hawaii", "Iowa", "Idaho", 
               "Illinois", "Indiana", "Kansas", "Kentucky", "Louisiana", 
               "Massachusetts", "Maryland", "Maine", "Michigan", "Minnesota", 
               "Missouri", "Mississippi", "Montana", "North Carolina", 
               "North Dakota", "Nebraska", "New Hampshire", "New Jersey", 
               "New Mexico", "Nevada", "New York", "Ohio", "Oklahoma", "Oregon", 
               "Pennsylvania", "Rhode Island", "South Carolina", 
               "South Dakota", "Tennessee", "Texas", "Utah", "Virginia", 
               "Vermont", "Washington", "Wisconsin", "West Virginia", "Wyoming"]

STATES_FULL = {"AL":"Alabama","AK":"Alaska","AZ":"Arizona","AR":"Arkansas",
                        "CA":"California","CO":"Colorado","CT":"Connecticut",
                        "DE":"Delaware", "FL":"Florida","GA":"Georgia","HI":"Hawaii",
                        "ID":"Idaho","IL":"Illinois","IN":"Indiana","IA":"Iowa",
                        "KS":"Kansas","KY":"Kentucky","LA":"Louisiana","ME":"Maine",
                        "MD":"Maryland","MA":"Massachusetts","MI":"Michigan",
                        "MN":"Minnesota","MS":"Mississippi","MO":"Missouri",
                        "MT":"Montana","NE":"Nebraska","NV":"Nevada",
                        "NH":"New Hampshire","NJ":"New Jersey","NM":"New Mexico",
                        "NY":"New York","NC":"North Carolina","ND":"North Dakota",
                        "OH":"Ohio","OK":"Oklahoma","OR":"Oregon","PA":"Pennsylvania",
                        "RI":"Rhode Island","SC":"South Carolina","SD":"South Dakota",
                        "TN":"Tennessee","TX":"Texas","UT":"Utah","VT":"Vermont",
                        "VA":"Virginia","WA":"Washington","WV":"West Virginia",
                        "WI":"Wisconsin","WY":"Wyoming"}

state_regex = re.compile('|'.join(STATES_ABBREVIATIONS + STATE_NAMES))
STATES_ABBREVIATIONS = set(STATES_ABBREVIATIONS)

In [8]:
for idx, row in df_main_analysis.iterrows():
    state_match = state_regex.search(row["location"])
    if state_match:
        df_main_analysis.at[idx, "state"] = state_match.group(0)
#     else:
#         df_main_analysis.at[idx, "state"] = state_match.group(0)

        
df_main_analysis.drop(df_main_analysis.index[df_main_analysis["state"] == None], inplace=True)
df_main_analysis.reset_index(drop=True, inplace=True)

for idx, row in df_main_analysis.iterrows():
    if row["state"] in STATES_ABBREVIATIONS:
        df_main_analysis.at[idx, "state"] = STATES_FULL.get(row["state"], "")

In [9]:
df_main_analysis.dropna(inplace=True)
df_main_analysis.reset_index(drop=True, inplace=True)

df_main_analysis.drop(columns=["location"], inplace=True)
display(df_main_analysis)

Unnamed: 0,index,created_at,full_text,state
0,1,Sat Oct 24 06:16:50 +0000 2020,RT @poutydobrik: THIS IS A REMINDER THAT KANYE...,Arizona
1,4,Sat Oct 31 08:44:41 +0000 2020,RT @tribelaw: “America has to be a functioning...,Georgia
2,6,Tue Oct 13 21:38:18 +0000 2020,@Cockofthewalk00 @ericsslater @ChuckGrassley @...,Indiana
3,7,Sun Nov 01 16:37:38 +0000 2020,RT @444findinghope: According to public record...,Maryland
4,9,Mon Oct 26 20:10:39 +0000 2020,RT @ProjectLincoln: This is Joe Biden’s moment...,North Carolina
...,...,...,...,...
281741,948777,Tue Oct 13 14:52:54 +0000 2020,RT @Jord_45: @JoeBiden China sure hopes so.,West Virginia
281742,948779,Sat Oct 17 14:54:19 +0000 2020,The man is delusional and should be removed fr...,Washington
281743,948784,Wed Oct 14 17:51:39 +0000 2020,"@Adam_Puzio @ErrolWebber Riiiight, yet you’re ...",California
281744,948789,Tue Oct 27 04:09:33 +0000 2020,RT @SenRubioPress: El sen Rubio le envió una c...,Florida


In [10]:
def cleanTweet(row):
    tweet = row["full_text"]
    tweet = str(p.clean(tweet))
    tweet = re.sub(r'[^\w\s]', '', tweet) # punctuation
    tweet = re.sub("^\d+\s|\s\d+\s|\s\d+$", " ", tweet) # numbers
    return tweet


df_main_analysis["clean_text"] = df_main_analysis.apply(lambda row: cleanTweet(row), axis=1)
df_main_analysis['data_type'] = 'test'
df_main_analysis['label'] = 'Democrat'
display(df_main_analysis["clean_text"])

0          THIS IS A REMINDER THAT KANYE WEST ADMITTED T...
1          America has to be a functioning democracy bef...
2                                                      True
3          According to public record the big black truc...
4                                 This is Joe Bidens moment
                                ...                        
281741                                  China sure hopes so
281742    The man is delusional and should be removed fr...
281743    Riiiight yet youre not citing your sources eit...
281744     El sen Rubio le envi una carta a instndolo a ...
281745      is stopping by Carpenters Local Union Hall i...
Name: clean_text, Length: 281746, dtype: object

In [11]:
LABEL_MAP = {
    "Democrat": 0,
    "Republican": 1
}

def buildLabels(row):
    return LABEL_MAP.get(row["label"])

df_main_analysis["label"] = df_main_analysis.apply(lambda row: buildLabels(row), axis=1)

In [14]:
def get_dataloaders(data, batch_size):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', 
                                            do_lower_case=True)
    # tokenize train and test data so BERT can understand it

    encoded_data_test = tokenizer.batch_encode_plus(
      data[data.data_type=='test'].clean_text.values, 
      add_special_tokens=True, 
      return_attention_mask=True, 
      padding=True, 
      max_length=64, 
      return_tensors='pt'
    )



    input_ids_test = encoded_data_test['input_ids']
    attention_masks_test = encoded_data_test['attention_mask']
    labels_test = torch.tensor(data[data.data_type=='test'].label.values)

    test_data = TensorDataset(input_ids_test, attention_masks_test, labels_test)

    test_dataloader = DataLoader(test_data,
                              sampler=SequentialSampler(test_data),
                              batch_size=batch_size)

    return test_dataloader

In [24]:
def evaluate(model, dataloader, device):
    model.eval()

    loss_val_total = 0
    predictions, true_vals = [], []

    for batch in dataloader:

      # convert data to CUDA
        batch = tuple(b.to(device) for b in batch)

        inputs = {
          'input_ids':      batch[0],
          'attention_mask': batch[1],
          'labels':         batch[2],
        }
        
        with torch.no_grad():        
            outputs = model(**inputs) # get predictions

        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)

    loss_val_avg = loss_val_total/len(dataloader) 

    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)

    return predictions, true_vals

In [25]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased',
                                                        num_labels=2,
                                                        output_attentions=False,
                                                        output_hidden_states=False)   
model.load_state_dict(torch.load("/content/drive/MyDrive/Junior Year/DSP Final Project/finetuned_BERT_epoch_2.model", map_location=torch.device('cuda')))
model.to(device)
BATCH_SIZE = 16
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
display(device)
test_dataloader = get_dataloaders(df_main_analysis, BATCH_SIZE)

preds, labels = evaluate(model, test_dataloader, device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

device(type='cuda')

In [28]:
df_main_analysis["label"] = np.argmax(preds,axis=1)

LABEL_MAP_REVERSED = {
    0: "Democrat",
    1: "Republican"
}

def buildLabelsReversed(row):
    return LABEL_MAP_REVERSED.get(row["label"])

df_results = pd.DataFrame()
df_results["clean_text"] = df_main_analysis["clean_text"]
df_results["label"] = df_main_analysis.apply(lambda row: buildLabelsReversed(row), axis=1)
df_results["state"] = df_main_analysis["state"]
display(df_results)

Unnamed: 0,clean_text,label,state
0,THIS IS A REMINDER THAT KANYE WEST ADMITTED T...,Democrat,Arizona
1,America has to be a functioning democracy bef...,Democrat,Georgia
2,True,Republican,Indiana
3,According to public record the big black truc...,Republican,Maryland
4,This is Joe Bidens moment,Republican,North Carolina
...,...,...,...
281741,China sure hopes so,Democrat,West Virginia
281742,The man is delusional and should be removed fr...,Democrat,Washington
281743,Riiiight yet youre not citing your sources eit...,Democrat,California
281744,El sen Rubio le envi una carta a instndolo a ...,Democrat,Florida


In [29]:
#create unique list of names
uniqueStates = df_results.state.unique()

#create a data frame dictionary to store your data frames
DataFrameDict = {elem : pd.DataFrame for elem in uniqueStates}

for key in DataFrameDict.keys():
    DataFrameDict[key] = df_results[:][df_results.state == key]
    
for state in STATE_NAMES:
    value_counts = DataFrameDict[state]["label"].value_counts(normalize=True)
#     if value_counts["Republican"] > value_counts["Democrat"]:
    display(state, value_counts)

'Alaska'

Democrat      0.713789
Republican    0.286211
Name: label, dtype: float64

'Alabama'

Democrat      0.677578
Republican    0.322422
Name: label, dtype: float64

'Arkansas'

Democrat      0.711111
Republican    0.288889
Name: label, dtype: float64

'Arizona'

Democrat      0.715541
Republican    0.284459
Name: label, dtype: float64

'California'

Democrat      0.739798
Republican    0.260202
Name: label, dtype: float64

'Colorado'

Democrat      0.742547
Republican    0.257453
Name: label, dtype: float64

'Connecticut'

Democrat      0.730568
Republican    0.269432
Name: label, dtype: float64

'Delaware'

Democrat      0.720365
Republican    0.279635
Name: label, dtype: float64

'Florida'

Democrat      0.686455
Republican    0.313545
Name: label, dtype: float64

'Georgia'

Democrat      0.700945
Republican    0.299055
Name: label, dtype: float64

'Hawaii'

Democrat      0.730842
Republican    0.269158
Name: label, dtype: float64

'Iowa'

Democrat      0.731407
Republican    0.268593
Name: label, dtype: float64

'Idaho'

Democrat      0.683636
Republican    0.316364
Name: label, dtype: float64

'Illinois'

Democrat      0.750462
Republican    0.249538
Name: label, dtype: float64

'Indiana'

Democrat      0.696624
Republican    0.303376
Name: label, dtype: float64

'Kansas'

Democrat      0.7163
Republican    0.2837
Name: label, dtype: float64

'Kentucky'

Democrat      0.708645
Republican    0.291355
Name: label, dtype: float64

'Louisiana'

Democrat      0.691371
Republican    0.308629
Name: label, dtype: float64

'Massachusetts'

Democrat      0.722197
Republican    0.277803
Name: label, dtype: float64

'Maryland'

Democrat      0.743407
Republican    0.256593
Name: label, dtype: float64

'Maine'

Democrat      0.697291
Republican    0.302709
Name: label, dtype: float64

'Michigan'

Democrat      0.730953
Republican    0.269047
Name: label, dtype: float64

'Minnesota'

Democrat      0.751118
Republican    0.248882
Name: label, dtype: float64

'Missouri'

Democrat      0.695815
Republican    0.304185
Name: label, dtype: float64

'Mississippi'

Democrat      0.683447
Republican    0.316553
Name: label, dtype: float64

'Montana'

Democrat      0.737452
Republican    0.262548
Name: label, dtype: float64

'North Carolina'

Democrat      0.712325
Republican    0.287675
Name: label, dtype: float64

'North Dakota'

Democrat      0.659794
Republican    0.340206
Name: label, dtype: float64

'Nebraska'

Democrat      0.709273
Republican    0.290727
Name: label, dtype: float64

'New Hampshire'

Democrat      0.700227
Republican    0.299773
Name: label, dtype: float64

'New Jersey'

Democrat      0.719866
Republican    0.280134
Name: label, dtype: float64

'New Mexico'

Democrat      0.754624
Republican    0.245376
Name: label, dtype: float64

'Nevada'

Democrat      0.725992
Republican    0.274008
Name: label, dtype: float64

'New York'

Democrat      0.730888
Republican    0.269112
Name: label, dtype: float64

'Ohio'

Democrat      0.722615
Republican    0.277385
Name: label, dtype: float64

'Oklahoma'

Democrat      0.704718
Republican    0.295282
Name: label, dtype: float64

'Oregon'

Democrat      0.754355
Republican    0.245645
Name: label, dtype: float64

'Pennsylvania'

Democrat      0.723204
Republican    0.276796
Name: label, dtype: float64

'Rhode Island'

Democrat      0.733068
Republican    0.266932
Name: label, dtype: float64

'South Carolina'

Democrat      0.682802
Republican    0.317198
Name: label, dtype: float64

'South Dakota'

Democrat      0.707657
Republican    0.292343
Name: label, dtype: float64

'Tennessee'

Democrat      0.699734
Republican    0.300266
Name: label, dtype: float64

'Texas'

Democrat      0.701458
Republican    0.298542
Name: label, dtype: float64

'Utah'

Democrat      0.732649
Republican    0.267351
Name: label, dtype: float64

'Virginia'

Democrat      0.72842
Republican    0.27158
Name: label, dtype: float64

'Vermont'

Democrat      0.758974
Republican    0.241026
Name: label, dtype: float64

'Washington'

Democrat      0.741186
Republican    0.258814
Name: label, dtype: float64

'Wisconsin'

Democrat      0.726912
Republican    0.273088
Name: label, dtype: float64

'West Virginia'

Democrat      0.721103
Republican    0.278897
Name: label, dtype: float64

'Wyoming'

Democrat      0.699708
Republican    0.300292
Name: label, dtype: float64