Data Source:

https://www.kaggle.com/manchunhui/us-election-2020-tweets

In [84]:
import pandas as pd
import numpy as np
import reverse_geocoder
from tqdm import tqdm
import itertools
import jellyfish

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:97.5% !important; }</style>"))

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Import and Clean Data

In [3]:
biden_tweets = pd.read_csv(r"C:/Users/mikha/OneDrive/Desktop/Dropbox/MIKHAEL NEW/mikhael school/Grad School/Master's/594/Data/US Election Tweets 2020/hashtag_joebiden.csv")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [5]:
biden_tweets = biden_tweets[ biden_tweets['country']=='United States of America' ]

biden_tweets = biden_tweets[['user_id', 'lat', 'long']]

biden_tweets.dropna(inplace=True)

In [6]:
trump_tweets = pd.read_csv(r"C:/Users/mikha/OneDrive/Desktop/Dropbox/MIKHAEL NEW/mikhael school/Grad School/Master's/594/Data/US Election Tweets 2020/hashtag_donaldtrump.csv", lineterminator='\n')

In [7]:
trump_tweets = trump_tweets[ trump_tweets['country']=='United States of America' ]

trump_tweets = trump_tweets[['user_id', 'lat', 'long']]

trump_tweets.dropna(inplace=True)

In [8]:
all_tweets_df = pd.concat([biden_tweets, trump_tweets])

In [9]:
all_tweets_df['lat'] = pd.to_numeric(all_tweets_df['lat'])
all_tweets_df['long'] = pd.to_numeric(all_tweets_df['long'])

In [10]:
all_tweets_df = all_tweets_df.groupby('user_id').agg(['unique'])

In [11]:
# flatten column indices
all_tweets_df.columns = all_tweets_df.columns.get_level_values(0)

## Clean up users with multiple locations

In [12]:
def tuple_is_close(tup, threshold):
    '''
    input = (x,y) = tuple of two elements
    '''
    if abs(tup[0] - tup[1]) < threshold:
        
        return True
    
    else:
        
        return False

In [15]:
THRESHOLD = 0.3

lat_good = []
lon_good = []

for i,row in all_tweets_df.iterrows():
    
    if len(row['lat']) > 1: # if multiple latitudes (and by extension multiple longitudes)
        if all([tuple_is_close(x, THRESHOLD) for x in itertools.combinations(row['lat'], 2)]): #if all pairwise permutations of LATITUDES are within THRESHOLD degrees of eachother
            lat_good.append(1)
        else:
            lat_good.append(0)

        if all([tuple_is_close(x, THRESHOLD) for x in itertools.combinations(row['long'], 2)]): #if all pairwise permutations of LONGITUDES are within THRESHOLD degrees of eachother
            lon_good.append(1)
        else:
            lon_good.append(0)
    else: # if there's only one location for this user
        lat_good.append(1)
        lon_good.append(1)
            
                        

all_tweets_df['Coordinates Dont Change'] = (np.array(lat_good)==1) & (np.array(lon_good)==1)

all_tweets_df = all_tweets_df[ all_tweets_df['Coordinates Dont Change']==True ]

del all_tweets_df['Coordinates Dont Change'], lat_good, lon_good

In [16]:
all_tweets_df.to_csv(r"C:/Users/mikha/OneDrive/Desktop/Dropbox/MIKHAEL NEW/mikhael school/Grad School/Master's/594/Data/US Election Tweets 2020/US Election Geolocated Users.csv")

# Reverse GeoCode from Coordinates to Counties/FIPS

In [18]:
'''
Here we're just taking the first set of coordinates (even if there are multiple for a user).
This shouldn't make a difference since we already filtered out all users with multiple coordinates that are far away from eachother.
'''

coords_list = []

for i,row in all_tweets_df.iterrows():
    coords_list.append((row['lat'][0], row['long'][0]))

all_tweets_df['Coordinates'] = coords_list

In [19]:
del all_tweets_df['lat'], all_tweets_df['long'] 

# Filter to Only Look Up CT Users

In [20]:
all_CT_users = pd.read_csv(r"C:\Users\mikha\Dropbox\mikhael_misc\Projects\594\Twitter-Conspiracies\All CT Link and Hashtag Users.csv")

In [21]:
all_tweets_df.reset_index(inplace=True)

In [22]:
all_tweets_df['user_id'] = pd.to_numeric(all_tweets_df['user_id'])

In [23]:
geolocated_users = np.intersect1d(all_tweets_df['user_id'], all_CT_users['ID'])

In [24]:
all_tweets_df = all_tweets_df.set_index('user_id').loc[geolocated_users].reset_index().drop_duplicates()

In [29]:
def reverse_geo_lookup(coordinates):
    location = reverse_geocoder.search(coordinates)[0]#['admin2']
    return location['admin1'], location['admin2']

In [30]:
reverse_geo_lookup(all_tweets_df['Coordinates'].iloc[0])

('Massachusetts', 'Worcester County')

In [32]:
# def reverse_geo_lookup(coordinates):
#     try:
#         return reverse_geocoder.search(coordinates)#[0]['admin2']
#     except:
#         return np.nan

In [34]:
tqdm.pandas()

all_tweets_df['Location'] = all_tweets_df['Coordinates'].progress_apply(reverse_geo_lookup)

100%|██████████████████████████████████████████████████████████████████████████| 14874/14874 [7:23:52<00:00,  1.79s/it]


In [41]:
split_location = pd.DataFrame(all_tweets_df['Location'].tolist(), index=all_tweets_df.index)

In [45]:
all_tweets_df = all_tweets_df.merge(split_location, how='inner', left_index=True, right_index=True)

In [52]:
all_tweets_df.rename(columns={0:'State', 1:'County'}, inplace=True)

In [54]:
del all_tweets_df['Location']

In [60]:
all_tweets_df['State'] = all_tweets_df['State'].str.upper()
all_tweets_df['County'] = all_tweets_df['County'].str.upper()

In [62]:
all_tweets_df.to_csv(r"C:/Users/mikha/Dropbox/mikhael_misc/Projects/594/Twitter-Conspiracies/Geo Cross Referencing/Users Geolocated From US Election 2020.csv", index=False)

# NEXT STEP:

1. Convert County names to FIPS
2. Merge with user master list (with suffix to indicate geodata source)

## FIPS Import and Processing

In [63]:
def make_dict_uppercase(d):
    '''
    input = dictionary
    output = uppercase dictionary
    '''
    
    return {key.upper(): value.upper() for key,value in d.items()}

##################################################################################


def make_list_of_dicts_uppercase(list_of_dicts):
    '''
    input = list of dictionaries
    output = uppercase list of dictionaries
    '''
    
    return [make_dict_uppercase(d) for d in list_of_dicts]

## Get (State & County) --> FIPS dictionary

In [64]:
# import data from wikipedia
fips = pd.read_html("https://en.wikipedia.org/wiki/List_of_United_States_FIPS_codes_by_county")[1]

# remove all hyperlinks (these look like "... County [h]", etc.)
fips['County or equivalent'] = fips['County or equivalent'].str.replace(r"\[.*\]","")

# convert to uppercase
fips['County or equivalent'] = fips['County or equivalent'].apply(lambda x: x.upper())
fips['State or equivalent'] = fips['State or equivalent'].apply(lambda x: x.upper())

# replace "St." with "Saint"
fips['County or equivalent'] = [x.replace('ST.','SAINT') for x in fips['County or equivalent']]

# remove everything after a comma in a county name (e.g. "ANCHORAGE, MUNICIPALITY OF")
fips['County or equivalent'] = [x.split(',')[0] for x in fips['County or equivalent']]

# replace DC info to correspond to GeoCov19 format
dc_loc = fips[fips['County or equivalent']=='DISTRICT OF COLUMBIA'].index.tolist()[0]
fips['State or equivalent'].loc[dc_loc] = 'WASHINGTON, D.C.'
fips['County or equivalent'].loc[dc_loc] = 'WASHINGTON'

########################################################################

# create dictionary
state_fips_dict = {k: f.groupby('County or equivalent')['FIPS'].apply(list).to_dict()
     for k, f in fips.groupby('State or equivalent')}

# clean dictionary
for state in state_fips_dict:
    for county in state_fips_dict[state]:
        state_fips_dict[state][county] = state_fips_dict[state][county][0]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


# Map county names to FIPS (using FIPS dict)

### Get FIPS from a single dictionary

In [115]:
def fuzzy_match_counties(county, counties_dict):
    '''
    input = county we're searching for; dict of counties we're trying to match it to and their correspodning FIPS
    output = name of the county that it's closest to
    '''
    # create list of jaro-winkler similarities between the misspelled county and all counties in the state
    jaro_distances = np.array([jellyfish.jaro_winkler_similarity(county, county_from_list) for county_from_list in list(counties_dict.keys())])
        
    # return county with smallest jaro-winkler distance
    return counties_dict[list(counties_dict.keys())[np.argmin(jaro_distances)]]

In [113]:
def get_fips_from_loc(loc):
    '''
    input = tweet_location from US Election data: (STATE, COUNTY) tuple
    output = FIPS code corresponding to county 
    '''
    
    try:
        return state_fips_dict[ loc[0] ][ loc[1] ]
    except:
        try:
            return fuzzy_match_counties(loc[1], state_fips_dict[loc[0]])
        except Exception as e:
            return np.nan
        


In [75]:
all_tweets_df['State+County'] = list(zip(all_tweets_df['State'], all_tweets_df['County']))

In [None]:
all_tweets_df['FIPS'] = all_tweets_df['State+County'].apply(get_fips_from_loc)

In [124]:
all_tweets_df = all_tweets_df[~all_tweets_df['FIPS'].isnull()]

In [130]:
all_tweets_df.to_csv(r"C:/Users/mikha/Dropbox/mikhael_misc/Projects/594/Twitter-Conspiracies/Geo Cross Referencing/Users Geolocated From US Election 2020.csv", index=False)

# Merge with Master User List

In [144]:
master_user_list_path = r"C:/Users/mikha/Dropbox/mikhael_misc/Projects/594/Twitter-Conspiracies/All CT Link and Hashtag Users.csv"

MASTER_USER_LIST = pd.read_csv(master_user_list_path)

del MASTER_USER_LIST['Unnamed: 0']

In [133]:
MASTER_USER_LIST = MASTER_USER_LIST.merge(all_tweets_df, left_on='ID', right_on='user_id', how='left', suffixes=('', ' - ELECTION 2020'))

In [135]:
MASTER_USER_LIST[~MASTER_USER_LIST['FIPS'].isnull()]

Unnamed: 0,ID,Num CT Tweets - HT,Num CT Tweets - LINK,user_id,Coordinates,State,County,State+County,FIPS
39,3632641633,310.0,,3.632642e+09,"(40.0149856, -105.2705456)",COLORADO,BOULDER COUNTY,"(COLORADO, BOULDER COUNTY)",8013.0
49,94374562,306.0,,9.437456e+07,"(31.8160381, -99.5120986)",TEXAS,COLEMAN COUNTY,"(TEXAS, COLEMAN COUNTY)",48083.0
96,183301058,904.0,215.0,1.833011e+08,"(27.9477595, -82.458444)",FLORIDA,HILLSBOROUGH COUNTY,"(FLORIDA, HILLSBOROUGH COUNTY)",12057.0
130,1194604029365456896,90.0,,1.194604e+18,"(34.2331373, -102.4107493)",TEXAS,LAMB COUNTY,"(TEXAS, LAMB COUNTY)",48279.0
147,14709515,297.0,,1.470952e+07,"(33.687438799999995, -80.4363743)",SOUTH CAROLINA,SUMTER COUNTY,"(SOUTH CAROLINA, SUMTER COUNTY)",45085.0
...,...,...,...,...,...,...,...,...,...
1027927,179236473,,1.0,1.792365e+08,"(27.7567667, -81.4639835)",FLORIDA,POLK COUNTY,"(FLORIDA, POLK COUNTY)",12105.0
1028396,1270528891774525440,,1.0,1.270529e+18,"(42.2681569, -83.7312291)",MICHIGAN,WASHTENAW COUNTY,"(MICHIGAN, WASHTENAW COUNTY)",26161.0
1028662,56342154,,1.0,5.634215e+07,"(29.4246002, -98.4951405)",TEXAS,BEXAR COUNTY,"(TEXAS, BEXAR COUNTY)",48029.0
1028745,1213901815827165184,,1.0,1.213902e+18,"(40.8517224, -73.0992188)",NEW YORK,SUFFOLK COUNTY,"(NEW YORK, SUFFOLK COUNTY)",36103.0


In [138]:
MASTER_USER_LIST.rename(columns={'Coordinates':'Coordinates - ELECTION 2020', 'State+County':'State+County - ELECTION 2020', 'FIPS': 'FIPS - ELECTION 2020'}, inplace=True)

In [143]:
MASTER_USER_LIST.drop(['user_id','State','County'], axis='columns', inplace=True)

In [146]:
MASTER_USER_LIST.to_csv(r'C:/Users/mikha/Dropbox/mikhael_misc/Projects/594/Twitter-Conspiracies/Geo Cross Referencing/Master User List - GEOLOCATED.csv', index=False)

In [147]:
MASTER_USER_LIST

Unnamed: 0,ID,Num CT Tweets - HT,Num CT Tweets - LINK,Coordinates - ELECTION 2020,State+County - ELECTION 2020,FIPS - ELECTION 2020
0,948955244,9162.0,,,,
1,1041624996265701378,2095.0,2.0,,,
2,25420415,4030.0,,,,
3,71777998,4684.0,,,,
4,1721052956,6051.0,,,,
...,...,...,...,...,...,...
1028956,1304140679573106689,,1.0,,,
1028957,78607938,,1.0,,,
1028958,345261628,,1.0,,,
1028959,1298989066076057600,,1.0,,,
