## Importing Libraries & Data

In [2]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import plotly.express as px
import html
import re
import string
import spacy
import math

import wordcloud
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import nltk
from nltk import word_tokenize
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import os

# import random undersampling and other necessary libraries 
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

[nltk_data] Downloading package stopwords to /Users/mac/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
df = pd.read_pickle("../data/full_data.pkl")

In [6]:
df.shape

(927487, 8)

In [8]:
df = df.reset_index(drop=True)

In [9]:
df

Unnamed: 0,hashtags,favorite_count,id,lang,place,retweet_count,text,user_location
0,climatechange climateaction sustainability fb,0,930123883520413697,en,,0,We are not going to get too many more warnings...,"Holmdel, NJ"
1,juice reuse reduce actonclimate,1,965587085993365505,en,,1,I'll be juicing my left over carrots tonight t...,"Miami, FL"
2,Hack4Climate Blockchain ClimateAction ClimateC...,4,928605716159574016,en,,1,In less than 3 days we will #Hack4Climate at @...,"Zürich, Schweiz"
3,climatechange climateaction,1,954288658361802752,en,,1,"""The best way to predict the future is to crea...","Tampere, Finland"
4,protectwhatyoulove climatechangeisreal AnimalR...,0,1012691887440814086,en,,0,Accepting the world for the way it is just mig...,
...,...,...,...,...,...,...,...,...
927482,GlobalWarmingScam,0,948541187224305666,en,,0,@David19531501 @GlobalEcoGuy @alexpiletska bec...,"Longmont, CO"
927483,GlobalWarmingSCAM,1,945668218496192512,en,,0,"New study finds that cosmic rays, solar activi...","Acapulco, Mexico"
927484,GlobalWarming ClimateChange ClimateScam Global...,0,953134860922744832,en,,0,"For all you ""Global Warming experts"" who think...",
927485,blizzard2018 noreaster GlobalWarmingScam,0,948942488751886336,en,,0,"#blizzard2018 #noreaster - oh, the days of Al ...",USA


## Text Cleaning
- remove special characters, hashtags
- turn everything to lowercase
- replace contradictions
- remove mentions, newlines, emojis, URL's, encoding characters, numbers, punctuation
- remove retweets
- remove extra spaces
- remove rows with 3 or less words
- remove duplicates
- lemmatize & tokenize text

In [10]:
df['text']

1         I'll be juicing my left over carrots tonight t...
2         In less than 3 days we will #Hack4Climate at @...
3         "The best way to predict the future is to crea...
4         Accepting the world for the way it is just mig...
                                ...                        
927482    @David19531501 @GlobalEcoGuy @alexpiletska bec...
927483    New study finds that cosmic rays, solar activi...
927484    For all you "Global Warming experts" who think...
927485    #blizzard2018 #noreaster - oh, the days of Al ...
927486    @1776Stonewall Please don't clowd the issue wi...
Name: text, Length: 927487, dtype: object

In [11]:
#remove special characters
df['text'] = df['text'].str.replace(r'[\",]*', '',regex=True)

#remove hashtags
df['text'] = df['text'].replace('([#])','', regex=True)
#df['text'] = df['text'].replace('([A-Z][a-z]+)', r' \1', regex=True) 

#lowercase
df['text'] = df['text'].str.lower()

In [12]:
#replace contradictions

contractions_dict = { "ain't": "are not","'s":" is","aren't": "are not",
                     "can't": "cannot","can't've": "cannot have",
                     "'cause": "because","could've": "could have","couldn't": "could not",
                     "couldn't've": "could not have", "didn't": "did not","doesn't": "does not",
                     "don't": "do not","hadn't": "had not","hadn't've": "had not have",
                     "hasn't": "has not","haven't": "have not","he'd": "he would",
                     "he'd've": "he would have","he'll": "he will", "he'll've": "he will have",
                     "how'd": "how did","how'd'y": "how do you","how'll": "how will",
                     "I'd": "I would", "I'd've": "I would have","I'll": "I will",
                     "I'll've": "I will have","I'm": "I am","I've": "I have", "isn't": "is not",
                     "it'd": "it would","it'd've": "it would have","it'll": "it will",
                     "it'll've": "it will have", "let's": "let us","ma'am": "madam",
                     "mayn't": "may not","might've": "might have","mightn't": "might not", 
                     "mightn't've": "might not have","must've": "must have","mustn't": "must not",
                     "mustn't've": "must not have", "needn't": "need not",
                     "needn't've": "need not have","o'clock": "of the clock","oughtn't": "ought not",
                     "oughtn't've": "ought not have","shan't": "shall not","sha'n't": "shall not",
                     "shan't've": "shall not have","she'd": "she would","she'd've": "she would have",
                     "she'll": "she will", "she'll've": "she will have","should've": "should have",
                     "shouldn't": "should not", "shouldn't've": "should not have","so've": "so have",
                     "that'd": "that would","that'd've": "that would have", "there'd": "there would",
                     "there'd've": "there would have", "they'd": "they would",
                     "they'd've": "they would have","they'll": "they will",
                     "they'll've": "they will have", "they're": "they are","they've": "they have",
                     "to've": "to have","wasn't": "was not","we'd": "we would",
                     "we'd've": "we would have","we'll": "we will","we'll've": "we will have",
                     "we're": "we are","we've": "we have", "weren't": "were not","what'll": "what will",
                     "what'll've": "what will have","what're": "what are", "what've": "what have",
                     "when've": "when have","where'd": "where did", "where've": "where have",
                     "who'll": "who will","who'll've": "who will have","who've": "who have",
                     "why've": "why have","will've": "will have","won't": "will not",
                     "won't've": "will not have", "would've": "would have","wouldn't": "would not",
                     "wouldn't've": "would not have","y'all": "you all", "y'all'd": "you all would",
                     "y'all'd've": "you all would have","y'all're": "you all are",
                     "y'all've": "you all have", "you'd": "you would","you'd've": "you would have",
                     "you'll": "you will","you'll've": "you will have", "you're": "you are",
                     "you've": "you have"}

# Regular expression for finding contractions
contractions_re=re.compile('(%s)' % '|'.join(contractions_dict.keys()))

# Function for expanding contractions
def expand_contractions(text,contractions_dict=contractions_dict):
    def replace(match):
        return contractions_dict[match.group(0)]
    return contractions_re.sub(replace, text)

# Expanding Contractions in the reviews
df['text']=df['text'].apply(lambda x:expand_contractions(x))

In [13]:
#remove mentions
df['text'] = df['text'].replace("@[A-Za-z0-9_]+", "", regex=True)

#remove newlines
df['text'] = df['text'].replace("(\r\n|\r|\n)", "", regex=True)

#remove encoding characters
df['text'] = df['text'].replace(r'\b[a-zA-Z]\b', '', regex=True)
df['text'] = df['text'].str.replace(r'[\'\",]*', '',regex=True)

#remove URL's
df['text'] = df['text'].replace(r"(?:\@|http?\://|https?\://|www)\S+", "", regex=True)

#remove emojis
df['text'] = df['text'].replace("[(\U0001F600-\U0001F92F|\U0001F300-\U0001F5FF|\U0001F680-\U0001F6FF|\U0001F190-\U0001F1FF|\U00002702-\U000027B0|\U0001F926-\U0001FA9F|\u200d|\u2640-\u2642|\u2600-\u2B55|\u23cf|\u23e9|\u231a|\ufe0f)]", "", regex=True)

#remove numbers
df['text'] = df['text'].replace("[0-9]", "", regex=True)

# remove punctuation
df['text'] = df['text'].str.replace(r'[^\w\s]',r'',regex=True)

#remove Retweets
df = df[~df['text'].str.startswith('RT')]

#remove spaces at the front and back
df['text'] = df['text'].str.strip()

# remove extra spaces
df['text'] = df['text'].str.replace(r'\s\s+',r' ',regex=True)

In [14]:
df['text'][0:30]

1     ll be juicing my left over carrots tonight to ...
2     in less than days we will hackclimate at looki...
3     the best way to predict the future is to creat...
4     accepting the world for the way it is just mig...
5     landfills are significant sources of methane e...
6     sea level expected to rise by one metre by due...
7     be kind to everything that lives climatechange...
8     extreme storms to multiply intensify across ne...
9     good to your words of wisdom on thedrum tonigh...
10    microgrids allow houses to share energy they h...
11    who was biking firstnight firstnightmonterey m...
12    thank you the endorsement together we can make...
13    arctic wellbeing essential for earth climatech...
14    why is the sun the only really safe nuclear re...
15    scotland yard investigates officers shown danc...
16    the alarm has been sounded is anyone listening...
17    what are some of the many reasons to protect f...
18    often wonder whether we as the people coul

In [15]:
df.shape

(927487, 8)

In [16]:
#https://xiangyutang2.github.io/tweet-classification/
#drop rows with shorter than 3 words tweets

df['tweet_proc_length'] = [len(text.split(' ')) for text in df['text']]
df = df[df['tweet_proc_length']>3]
df.shape

(907889, 9)

In [17]:
df.isna().sum()

hashtags             196365
favorite_count            0
id                        0
lang                      0
place                626040
retweet_count             0
text                      0
user_location        112981
tweet_proc_length         0
dtype: int64

In [18]:
#drop duplicates
df = df.drop_duplicates(subset=['text']).reset_index(drop=True)
df.shape

(610671, 9)

In [19]:
# Lemmatization & Tokenization

import spacy

tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

# Loading model
nlp = spacy.load('en_core_web_sm',disable=['parser', 'ner'])

def lemmatization(text):
    return [lemmatizer.lemmatize(x) for x in tokenizer.tokenize(text)]
    
df['text'] = df['text'].apply(lambda x: ' '.join([token.lemma_ for token in list(nlp(x)) if (token.is_stop == False)]))

In [20]:
df['text']

1         ll juice left carrot tonight fresh juice morni...
2         day hackclimate look forward exciting event bl...
3         good way predict future create abraham lincoln...
4         accept world way power stand change want prote...
                                ...                        
610666                   fit agenda david globalwarmingscam
610667    new study find cosmic ray solar activity great...
610668    global warming expert think melt ice cause sea...
610669    blizzard noreaster oh day al gore democrats gl...
610670             clowd issue bunch fact globalwarmingscam
Name: text, Length: 610671, dtype: object

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 610671 entries, 0 to 610670
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   hashtags           423600 non-null  object
 1   favorite_count     610671 non-null  int64 
 2   id                 610671 non-null  int64 
 3   lang               610671 non-null  object
 4   place              256273 non-null  object
 5   retweet_count      610671 non-null  int64 
 6   text               610671 non-null  object
 7   user_location      511930 non-null  object
 8   tweet_proc_length  610671 non-null  int64 
dtypes: int64(4), object(5)
memory usage: 41.9+ MB


In [22]:
df.to_pickle('../data/cleaned_and_lemmatized.pkl')

## Create Prediction Dataset (USA tweets)

In [23]:
# Process 'user_location' variable to separate it into city and state

data1 = df.dropna(how='any', subset=['user_location'])
data1 = data1[data1["user_location"].str.contains(",")]
location = data1['user_location'].str.split(',', expand = True).iloc[:,[0,1]]
location.columns = ['city','state']
location

Unnamed: 0,city,state
0,Holmdel,NJ
1,Miami,FL
2,Zürich,Schweiz
3,Tampere,Finland
9,Cairns,Queensland
...,...,...
610660,Overland Park,KS
610663,Longmont,CO
610664,Texas,U.S.A.
610666,Longmont,CO


In [24]:
# Add the two new columns back into the dataframe

data1 = pd.concat([data1, location],axis = 1, join = 'outer', 
                 ignore_index=False, sort=False)

# Strip the new columns of any spaces

data1.state = data1.state.str.strip()
data1.city = data1.city.str.strip()

In [25]:
# Filter dataframe to only rows that have a US state

states = ["AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL", "GA", 
          "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", 
          "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", 
          "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC", 
          "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY", 
          "Alabama","Alaska","Arizona","Arkansas","California","Colorado",
          "Connecticut","Delaware","Florida","Georgia","Hawaii","Idaho","Illinois",
          "Indiana","Iowa","Kansas","Kentucky","Louisiana","Maine","Maryland",
          "Massachusetts","Michigan","Minnesota","Mississippi","Missouri","Montana",
          "Nebraska","Nevada","New Hampshire","New Jersey","New Mexico","New York",
          "North Carolina","North Dakota","Ohio","Oklahoma","Oregon","Pennsylvania",
          "Rhode Island","South Carolina","South Dakota","Tennessee","Texas","Utah",
          "Vermont","Virginia","Washington","West Virginia","Wisconsin","Wyoming"]

data1 = data1[data1['state'].isin(states)]

data1.shape

(108233, 11)

In [26]:
data1[['user_location', 'city', 'state']]

Unnamed: 0,user_location,city,state
0,"Holmdel, NJ",Holmdel,NJ
1,"Miami, FL",Miami,FL
11,"Monterey County, CA, USA",Monterey County,CA
26,"Spokane Valley, WA",Spokane Valley,WA
39,"Miami, FL",Miami,FL
...,...,...,...
610641,"Greenbrier, TN",Greenbrier,TN
610651,"Longmont, CO",Longmont,CO
610660,"Overland Park, KS",Overland Park,KS
610663,"Longmont, CO",Longmont,CO


In [27]:
# Process 'place' variable to separate it into city and state

data2 = df.dropna(how='any', subset=['place'])
data2 = data2[data2["place"].str.contains(",")]
place = data2['place'].str.split(',', expand = True).iloc[:,[0,1]]
place.columns = ['city','state']
place

Unnamed: 0,city,state
105,Morogoro,Tanzania
167,Twentynine Palms,CA
211,Aiton,France
220,Salt Lake City,UT
281,New Orleans,LA
...,...,...
610356,Fort Myers,FL
610458,Lehigh Acres,FL
610490,Winnipeg,Manitoba
610527,Charlotte,NC


In [28]:
# Add the two new columns back into the dataframe

data2 = pd.concat([data2, place],axis = 1, join = 'outer', 
                 ignore_index=False, sort=False)

# Strip the new columns of any spaces

data2.state = data2.state.str.strip()
data2.city = data2.city.str.strip()

In [29]:
# Filter dataframe to only rows that have a US state

data2 = data2[data2['state'].isin(states)]

data2.shape

(110797, 11)

In [30]:
# Concatenate the two datasets

datas = [data1, data2]
data_final = pd.concat(datas).drop_duplicates().reset_index(drop=True)
data_final.shape

(197301, 11)

In [31]:
# USA State abbreviations to make everything in state variable 2-letter

us_state_to_abbrev = {
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
    "District of Columbia": "DC",
    "American Samoa": "AS",
    "Guam": "GU",
    "Northern Mariana Islands": "MP",
    "Puerto Rico": "PR",
    "United States Minor Outlying Islands": "UM",
    "U.S. Virgin Islands": "VI",
}


In [32]:
# map a dictionary of abbreviations to the dataframe

data_final['state'] = data_final['state'].map(us_state_to_abbrev).fillna(data_final['state'])
data_final.shape

(197301, 11)

In [33]:
data_final = data_final.reset_index(drop=True)

In [34]:
data_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 197301 entries, 0 to 197300
Data columns (total 11 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   hashtags           94197 non-null   object
 1   favorite_count     197301 non-null  int64 
 2   id                 197301 non-null  int64 
 3   lang               197301 non-null  object
 4   place              148295 non-null  object
 5   retweet_count      197301 non-null  int64 
 6   text               197301 non-null  object
 7   user_location      183484 non-null  object
 8   tweet_proc_length  197301 non-null  int64 
 9   city               197301 non-null  object
 10  state              197301 non-null  object
dtypes: int64(4), object(7)
memory usage: 16.6+ MB


In [None]:
cols = []
df.drop(df.columns[cols], axis=1, inplace=True)

In [35]:
# save prediction dataset

data_final.to_pickle("usa_tweets.pkl")

## Create Training Dataset (non-USA tweets)

In [36]:
# Drop NA's in hashtags column

df = df.dropna(how='any', subset=['hashtags'])
df.shape

(423600, 9)

In [37]:
# Drop any tweets that appear in USA tweets dataframe

condition = df['id'].isin(data_final['id'])
df.drop(df[condition].index, inplace = True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 338642 entries, 2 to 610670
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   hashtags           338642 non-null  object
 1   favorite_count     338642 non-null  int64 
 2   id                 338642 non-null  int64 
 3   lang               338642 non-null  object
 4   place              59673 non-null   object
 5   retweet_count      338642 non-null  int64 
 6   text               338642 non-null  object
 7   user_location      273348 non-null  object
 8   tweet_proc_length  338642 non-null  int64 
dtypes: int64(4), object(5)
memory usage: 25.8+ MB


In [38]:
# Turn hashtags into lists with all lowercase

df['hashtags'] = df['hashtags'].str.lower()
df['hashtags'] = df.apply(lambda row:  row['hashtags'].replace(' ', ',').split(','), axis=1)

In [39]:
# Believer Hashtags
believer_tags = ['climatechangeisreal', 'actonclimate', 'extinctionrebellion', 'climateemergency', 
                 'climateactionnow', 'capitalism', 'public_health', 'climateaction', 'humanityextinction',
                 'activism', 'noplanetb', 'savetheplanet', 'climateaction']

# Denier Hashtags
denier_tags = ['climatechangeisfalse', 'climatechangenotreal', 'climatechangehoax', 
               'globalwarminghoax', 'tcot', 'ccot', 'tlot', 'pjnet', 'rednationrising', 'votered', 
               'libtard', 'libtards', 'maga', 'climatedeniers', 'climatehoax', 'globalcooling',
              'climatechangescam', 'climatehysteria', 'globalwarmingisahoax', 'globalwarmingscam', 'globalcooling']

In [40]:
# assign believer status to each row in the dataset

believe_series = []

for idx, row in df['hashtags'].iteritems():
    # set a count for matching tags in row
    believe = 0
    deny = 0 
    for tag in row:
        if tag.lower() in denier_tags:
            deny += 1
        elif tag.lower() in believer_tags:
            believe += 1
    
    # check that only one type of tag appears
    if (believe > 0) and (deny == 0):
        believe_series.append(1)
    elif (believe == 0) and (deny > 0):
        believe_series.append(0)
    else: # <-- if no matching tag OR if both appear, label as unknown using nonetype 
        believe_series.append(None)

In [41]:
# create target column in dataframe for the believer_series

df = df.assign(target = believe_series)

# Remove NA's

df_believer_status = df.dropna(subset=['target'])

# subset dataframe to only include positive and negative cases

df = df.dropna(subset = ['target'])

# Turn target column into target labels of 0 and 1

df['target'] = df['target'].astype(int)

df['target'].value_counts()

1    201950
0     74462
Name: target, dtype: int64

In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 276412 entries, 2 to 610670
Data columns (total 10 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   hashtags           276412 non-null  object
 1   favorite_count     276412 non-null  int64 
 2   id                 276412 non-null  int64 
 3   lang               276412 non-null  object
 4   place              7777 non-null    object
 5   retweet_count      276412 non-null  int64 
 6   text               276412 non-null  object
 7   user_location      218636 non-null  object
 8   tweet_proc_length  276412 non-null  int64 
 9   target             276412 non-null  int64 
dtypes: int64(5), object(5)
memory usage: 23.2+ MB


In [45]:
# Save file

df.to_pickle("../data/cleaned_lemmatized.pkl")

## Add Demographic Data to USA Tweets Dataset

In [23]:
# import data

usa_tweets = pd.read_pickle("../data/usa_tweets.pkl")
demo = pd.read_csv("../datasets/county_statistics.csv")
us_data = pd.read_csv('../datasets/uscities.csv')

In [24]:
us_data.head()

Unnamed: 0,city,city_ascii,state_id,state_name,county_fips,county_name,lat,lng,population,density,source,military,incorporated,timezone,ranking,zips,id
0,New York,New York,NY,New York,36061,New York,40.6943,-73.9249,18713220,10715,polygon,False,True,America/New_York,1,11229 11226 11225 11224 11222 11221 11220 1138...,1840034016
1,Los Angeles,Los Angeles,CA,California,6037,Los Angeles,34.1139,-118.4068,12750807,3276,polygon,False,True,America/Los_Angeles,1,90291 90293 90292 91316 91311 90037 90031 9000...,1840020491
2,Chicago,Chicago,IL,Illinois,17031,Cook,41.8373,-87.6862,8604203,4574,polygon,False,True,America/Chicago,1,60018 60649 60641 60640 60643 60642 60645 6064...,1840000494
3,Miami,Miami,FL,Florida,12086,Miami-Dade,25.7839,-80.2102,6445545,5019,polygon,False,True,America/New_York,1,33129 33125 33126 33127 33128 33149 33144 3314...,1840015149
4,Dallas,Dallas,TX,Texas,48113,Dallas,32.7936,-96.7662,5743938,1526,polygon,False,True,America/Chicago,1,75287 75098 75233 75254 75251 75252 75253 7503...,1840019440


In [25]:
# remove unnecessary columns from US data dataset

us_data.rename(columns={'state_name':'state'}, inplace=True)
col_list = ['city', 'state', 'county_name']
us_data = us_data.loc[:, col_list]

In [26]:
us_data.head()

Unnamed: 0,city,state,county_name
0,New York,New York,New York
1,Los Angeles,California,Los Angeles
2,Chicago,Illinois,Cook
3,Miami,Florida,Miami-Dade
4,Dallas,Texas,Dallas


In [27]:
# map a dictionary of us abbreviations to the dataframe

us_state_to_abbrev = {
    "Alabama": "AL", "Alaska": "AK", "Arizona": "AZ", "Arkansas": "AR", "California": "CA",
    "Colorado": "CO", "Connecticut": "CT", "Delaware": "DE", "Florida": "FL", "Georgia": "GA",
    "Hawaii": "HI", "Idaho": "ID", "Illinois": "IL", "Indiana": "IN", "Iowa": "IA",
    "Kansas": "KS", "Kentucky": "KY", "Louisiana": "LA", "Maine": "ME", "Maryland": "MD",
    "Massachusetts": "MA", "Michigan": "MI", "Minnesota": "MN", "Mississippi": "MS",
    "Missouri": "MO", "Montana": "MT", "Nebraska": "NE", "Nevada": "NV", "New Hampshire": "NH",
    "New Jersey": "NJ", "New Mexico": "NM", "New York": "NY", "North Carolina": "NC",
    "North Dakota": "ND", "Ohio": "OH", "Oklahoma": "OK", "Oregon": "OR", "Pennsylvania": "PA",
    "Rhode Island": "RI", "South Carolina": "SC", "South Dakota": "SD", "Tennessee": "TN",
    "Texas": "TX", "Utah": "UT", "Vermont": "VT", "Virginia": "VA", "Washington": "WA",
    "West Virginia": "WV", "Wisconsin": "WI", "Wyoming": "WY", "District of Columbia": "DC",
    "American Samoa": "AS", "Guam": "GU", "Northern Mariana Islands": "MP", "Puerto Rico": "PR",
    "United States Minor Outlying Islands": "UM", "U.S. Virgin Islands": "VI",
}

us_data['state'] = us_data['state'].map(us_state_to_abbrev).fillna(us_data['state'])

In [28]:
# merge us tweets and us data datasets to get county name in us tweets

# make sure merge columns are the same type

us_data['city'] = us_data['city'].astype(str)
usa_tweets['city'] = usa_tweets['city'].astype(str)

us_data['state'] = us_data['state'].astype(str)
usa_tweets['state'] = usa_tweets['state'].astype(str)

# merge datasets

df = pd.merge(usa_tweets, us_data, how='left', on=['city','state'])

# replace non-matches with city name

df.county_name.fillna(df.city, inplace=True)
df = df.drop_duplicates(subset='id').reset_index(drop=True)

In [29]:
df

Unnamed: 0,id,text,city,state,county_name
0,930123883520413697,go warning people climatechange climateaction ...,Holmdel,NJ,Holmdel
1,965587085993365505,ll juice left carrot tonight fresh juice morni...,Miami,FL,Miami-Dade
2,947957104576315392,bike firstnight firstnightmonterey mayor clyde...,Monterey County,CA,Monterey County
3,930124070238040066,climate fact course warm year concern learn cl...,Spokane Valley,WA,Spokane
4,942159912121102336,planet great winner world move ahead actonclim...,Washington,DC,District of Columbia
...,...,...,...,...,...
169455,1210060038380740608,savage energy partner record break fiscal quar...,Houston,TX,Harris
169456,996746444211011586,hard tell snakeoil chemtrail globalwarmingisah...,Eugene,OR,Lane
169457,948537051418214400,standard winter hurricane warn part florida yi...,Aspen,CO,Pitkin
169458,947614408108335104,happy new yeaя fan globalwarmingisahoax hoax f...,Manhattan,NY,New York


In [30]:
# feature engineering on demographics dataset

demo['democrats'] = demo[['percentage16_Hillary_Clinton', 'percentage20_Joe_Biden']].mean(axis=1)
demo['republicans'] = demo[['percentage16_Donald_Trump', 'percentage20_Donald_Trump']].mean(axis=1)
demo['democrats'] = round(demo['democrats']*100,1)
demo['republicans'] = round(demo['republicans']*100,1)
demo["women"] = round(demo["Women"] * 100 / demo["TotalPop"],1)
demo['voting_age_citizens'] = round(demo['VotingAgeCitizen']*100/demo['TotalPop'],1)

In [31]:
demo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4867 entries, 0 to 4866
Data columns (total 55 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Unnamed: 0                    4867 non-null   int64  
 1   county                        4867 non-null   object 
 2   state                         4867 non-null   object 
 3   percentage16_Donald_Trump     3111 non-null   float64
 4   percentage16_Hillary_Clinton  3111 non-null   float64
 5   total_votes16                 3111 non-null   float64
 6   votes16_Donald_Trump          3111 non-null   float64
 7   votes16_Hillary_Clinton       3111 non-null   float64
 8   percentage20_Donald_Trump     4490 non-null   float64
 9   percentage20_Joe_Biden        4490 non-null   float64
 10  total_votes20                 4633 non-null   float64
 11  votes20_Donald_Trump          4633 non-null   float64
 12  votes20_Joe_Biden             4633 non-null   float64
 13  lat

In [32]:
# drop unneccessary columns in demographics dataset

cols = [0,18,19,26,27,28,30]
demo.drop(demo.columns[cols], axis=1, inplace=True)


demo = demo.drop(demo.loc[:,'percentage16_Donald_Trump':'deaths'].columns, axis = 1)
demo = demo.drop(demo.loc[:,'ChildPoverty':'FamilyWork'].columns, axis = 1)

In [33]:
demo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4867 entries, 0 to 4866
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   county               4867 non-null   object 
 1   state                4867 non-null   object 
 2   TotalPop             3142 non-null   float64
 3   Hispanic             3142 non-null   float64
 4   White                3142 non-null   float64
 5   Black                3142 non-null   float64
 6   Native               3142 non-null   float64
 7   Asian                3142 non-null   float64
 8   Pacific              3142 non-null   float64
 9   IncomePerCap         3142 non-null   float64
 10  Poverty              3142 non-null   float64
 11  Unemployment         3142 non-null   float64
 12  democrats            4515 non-null   float64
 13  republicans          4515 non-null   float64
 14  women                3142 non-null   float64
 15  voting_age_citizens  3142 non-null   f

In [34]:
# make a location variable with county and state in it

demo["location"] = demo["county"] + ', ' + demo["state"]
df["location"] = df["county_name"] + ', ' + df["state"]

In [45]:
# merge datasets

data = pd.merge(df, demo, on="location",how='left')

In [46]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 169460 entries, 0 to 169459
Data columns (total 22 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   id                   169460 non-null  int64  
 1   text                 169460 non-null  object 
 2   city                 169460 non-null  object 
 3   state_x              169460 non-null  object 
 4   county_name          169460 non-null  object 
 5   location             169460 non-null  object 
 6   county               155701 non-null  object 
 7   state_y              155701 non-null  object 
 8   TotalPop             150861 non-null  float64
 9   Hispanic             150861 non-null  float64
 10  White                150861 non-null  float64
 11  Black                150861 non-null  float64
 12  Native               150861 non-null  float64
 13  Asian                150861 non-null  float64
 14  Pacific              150861 non-null  float64
 15  IncomePerCap     

In [47]:
cols = [2,3,4,6,7]
data.drop(data.columns[cols], axis=1, inplace=True)

In [48]:
data.head()

Unnamed: 0,id,text,location,TotalPop,Hispanic,White,Black,Native,Asian,Pacific,IncomePerCap,Poverty,Unemployment,democrats,republicans,women,voting_age_citizens
0,930123883520413697,go warning people climatechange climateaction ...,"Holmdel, NJ",,,,,,,,,,,,,,
1,965587085993365505,ll juice left carrot tonight fresh juice morni...,"Miami-Dade, FL",2702602.0,67.5,13.7,16.3,0.1,1.5,0.0,25481.0,19.0,7.4,58.5,40.1,51.5,58.4
2,947957104576315392,bike firstnight firstnightmonterey mayor clyde...,"Monterey County, CA",,,,,,,,,,,,,,
3,930124070238040066,climate fact course warm year concern learn cl...,"Spokane, WA",490764.0,5.4,85.3,1.6,1.2,2.2,0.5,28325.0,15.2,6.3,44.0,50.3,50.4,75.2
4,942159912121102336,planet great winner world move ahead actonclim...,"District of Columbia, DC",672391.0,10.7,36.0,46.9,0.2,3.7,0.0,50832.0,17.4,8.0,93.4,4.0,52.6,74.8


In [49]:
data.isnull().sum()

id                         0
text                       0
location                   0
TotalPop               18599
Hispanic               18599
White                  18599
Black                  18599
Native                 18599
Asian                  18599
Pacific                18599
IncomePerCap           18599
Poverty                18599
Unemployment           18599
democrats              16342
republicans            16342
women                  18599
voting_age_citizens    18599
dtype: int64

In [50]:
# remove rows with NA's

data = data.dropna().reset_index(drop=True)

In [51]:
data

Unnamed: 0,id,text,location,TotalPop,Hispanic,White,Black,Native,Asian,Pacific,IncomePerCap,Poverty,Unemployment,democrats,republicans,women,voting_age_citizens
0,965587085993365505,ll juice left carrot tonight fresh juice morni...,"Miami-Dade, FL",2702602.0,67.5,13.7,16.3,0.1,1.5,0.0,25481.0,19.0,7.4,58.5,40.1,51.5,58.4
1,930124070238040066,climate fact course warm year concern learn cl...,"Spokane, WA",490764.0,5.4,85.3,1.6,1.2,2.2,0.5,28325.0,15.2,6.3,44.0,50.3,50.4,75.2
2,942159912121102336,planet great winner world move ahead actonclim...,"District of Columbia, DC",672391.0,10.7,36.0,46.9,0.2,3.7,0.0,50832.0,17.4,8.0,93.4,4.0,52.6,74.8
3,1019986232720416769,teen activist meet staff ve lose faith humanit...,"Fairfax, VA",1142004.0,16.2,51.7,9.3,0.1,18.8,0.0,52976.0,6.1,4.5,67.8,28.6,50.5,63.5
4,945772043617812480,rescue refugee land sea fleeing conflict need ...,"San Diego, CA",3283665.0,33.4,46.2,4.7,0.4,11.5,0.4,34350.0,13.3,7.1,58.0,38.2,49.7,67.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150793,1210060038380740608,savage energy partner record break fiscal quar...,"Harris, TX",4525519.0,42.2,30.6,18.5,0.2,6.8,0.1,30856.0,16.8,6.4,55.0,42.3,50.3,57.6
150794,996746444211011586,hard tell snakeoil chemtrail globalwarmingisah...,"Lane, OR",363471.0,8.4,82.6,1.0,0.8,2.5,0.2,27032.0,18.8,7.6,58.0,36.4,50.7,77.7
150795,948537051418214400,standard winter hurricane warn part florida yi...,"Pitkin, CO",17747.0,9.8,85.7,1.2,0.2,1.4,0.0,65800.0,7.1,4.3,72.5,23.8,47.2,76.3
150796,947614408108335104,happy new yeaя fan globalwarmingisahoax hoax f...,"New York, NY",1653877.0,26.1,46.8,12.5,0.1,11.9,0.0,69529.0,17.3,6.2,85.9,12.2,52.7,71.6


In [52]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150798 entries, 0 to 150797
Data columns (total 17 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   id                   150798 non-null  int64  
 1   text                 150798 non-null  object 
 2   location             150798 non-null  object 
 3   TotalPop             150798 non-null  float64
 4   Hispanic             150798 non-null  float64
 5   White                150798 non-null  float64
 6   Black                150798 non-null  float64
 7   Native               150798 non-null  float64
 8   Asian                150798 non-null  float64
 9   Pacific              150798 non-null  float64
 10  IncomePerCap         150798 non-null  float64
 11  Poverty              150798 non-null  float64
 12  Unemployment         150798 non-null  float64
 13  democrats            150798 non-null  float64
 14  republicans          150798 non-null  float64
 15  women            

In [53]:
# rename columns to unify

data.rename(columns={'TotalPop': 'total_pop','Hispanic':'hispanic','White':'white', 
                     'Black':'black','Native':'native','Asian':'asian','Pacific':'pacific',
                     'IncomePerCap':'income_per_cap','Poverty':'poverty',
                     'Unemployment':'unemployment', }, inplace=True)

In [54]:
# reorder columns

data = data.reindex(columns = ['id','text','location','total_pop','women','hispanic','white',
                               'black','native','asian','pacific','income_per_cap',
                               'poverty','unemployment','democrats','republicans',
                               'voting_age_citizens'])

In [55]:
data.head()

Unnamed: 0,id,text,location,total_pop,women,hispanic,white,black,native,asian,pacific,income_per_cap,poverty,unemployment,democrats,republicans,voting_age_citizens
0,965587085993365505,ll juice left carrot tonight fresh juice morni...,"Miami-Dade, FL",2702602.0,51.5,67.5,13.7,16.3,0.1,1.5,0.0,25481.0,19.0,7.4,58.5,40.1,58.4
1,930124070238040066,climate fact course warm year concern learn cl...,"Spokane, WA",490764.0,50.4,5.4,85.3,1.6,1.2,2.2,0.5,28325.0,15.2,6.3,44.0,50.3,75.2
2,942159912121102336,planet great winner world move ahead actonclim...,"District of Columbia, DC",672391.0,52.6,10.7,36.0,46.9,0.2,3.7,0.0,50832.0,17.4,8.0,93.4,4.0,74.8
3,1019986232720416769,teen activist meet staff ve lose faith humanit...,"Fairfax, VA",1142004.0,50.5,16.2,51.7,9.3,0.1,18.8,0.0,52976.0,6.1,4.5,67.8,28.6,63.5
4,945772043617812480,rescue refugee land sea fleeing conflict need ...,"San Diego, CA",3283665.0,49.7,33.4,46.2,4.7,0.4,11.5,0.4,34350.0,13.3,7.1,58.0,38.2,67.2


In [56]:
data['location'].value_counts() # 1,788 unique locations

District of Columbia, DC    9268
Los Angeles, CA             7849
New York, NY                6857
King, WA                    3625
Cook, IL                    3618
                            ... 
Franklin, VT                   1
Hamilton, NY                   1
Louisa, IA                     1
Knox, NE                       1
Bedford, TN                    1
Name: location, Length: 1788, dtype: int64

In [58]:
data['location'].value_counts()[0:50] # top 50 locations

District of Columbia, DC    9268
Los Angeles, CA             7849
New York, NY                6857
King, WA                    3625
Cook, IL                    3618
San Francisco, CA           3165
Multnomah, OR               3077
San Diego, CA               2891
Suffolk, MA                 2789
Maricopa, AZ                2614
Harris, TX                  2586
Kings, NY                   2408
Alameda, CA                 2344
Travis, TX                  2204
Santa Clara, CA             2164
Miami-Dade, FL              1943
Orange, CA                  1853
Denver, CO                  1852
Philadelphia, PA            1708
Fulton, GA                  1698
Clark, NV                   1583
Hennepin, MN                1453
Dallas, TX                  1417
Middlesex, MA               1366
Allegheny, PA               1318
Orleans, LA                 1134
Franklin, OH                 933
Boulder, CO                  917
Montgomery, MD               915
Sacramento, CA               904
Polk, IA  

In [59]:
# Save file

data.to_pickle("../data/usa_tweets_demo.pkl")