In [1]:
import pickle
import pandas as pd
import json
import os

# compile user geodata

In [2]:
# Directory containing the pickled accounts file ('us_counties_accounts.pkl') loaded in the next cell.
path = '/data_volume/live/intermediate_files/data-processing/accounts_data/'

In [3]:
# Load the pickled mapping whose keys are location strings and whose values are
# iterables of user ids (that is how it is consumed when the geo table is built).
# The context manager guarantees the file handle is closed after loading.
# NOTE: pickle.load can execute arbitrary code -- only load files from a trusted source.
with open(path+'us_counties_accounts.pkl','rb') as accounts_file:
    accounts_by_county = pickle.load(accounts_file)

# FIPS lookup table; 'geo_level' distinguishes state rows from the rest.
fips = pd.read_csv('/data_volume/home/vaccine_hesitancy_paper/quick_output/data/misc/fip_code_lookup.csv')

# State rows only, with 'area_name' exposed as 'state' (used to map state name -> state_code_fips).
state_fips = fips[fips['geo_level']=='State'].drop(columns = ['geo_level','fips_code','county_code_fips']).rename(columns = {'area_name':'state'})

# All rows, with 'area_name' exposed as 'county' (used to map state code + county name -> fips_code).
# NOTE(review): state-level rows are not filtered out here -- presumably harmless because
# their names do not match county names; confirm.
county_fips = fips.drop(columns = ['geo_level','county_code_fips']).rename(columns = {'area_name':'county'})

# Nullable integer dtype so rows with a missing code keep <NA> instead of forcing floats.
county_fips['fips_code'] = county_fips['fips_code'].astype('Int64')


In [4]:
#flatten the {location: users} mapping into one (user_id, location) row per account
user_location_rows = [
    (user, location)
    for location, users in accounts_by_county.items()
    for user in users
]
geo = pd.DataFrame(user_location_rows, columns=['user_id', 'location'])


def _split_location(raw_location):
    """Turn a location key into a list of its comma-separated parts.

    Surrounding spaces/brackets are stripped, every single quote is removed and
    ", " is collapsed to "," before splitting. Part 0 is used as the county and
    part 1 as the state.

    NOTE(review): removing every single quote also removes apostrophes inside
    county names -- confirm the FIPS lookup spells such counties the same way,
    otherwise those users are dropped by the inner merges.
    """
    return raw_location.strip(' []').replace("'", "").replace(", ", ",").split(',')


#get clean county and state columns
geo['location'] = geo['location'].map(_split_location)
geo['state'] = geo['location'].map(lambda parts: parts[1])
geo['county'] = geo['location'].map(lambda parts: parts[0].strip('"'))

#get FIPS code: state name -> state code, then (state code, county name) -> county FIPS
geo = pd.merge(geo, state_fips, how='inner', on='state')
geo = pd.merge(geo, county_fips, how='inner', on=['state_code_fips', 'county'])

#clean up
geo = geo.drop(columns=['state_code_fips', 'location'])
geo = geo.rename(columns={'fips_code': 'FIPS'})
geo = geo.drop_duplicates('user_id')  #keep only the first matched county per user
geo = geo.dropna().reset_index(drop=True)
geo['FIPS'] = geo['FIPS'].astype('int64')  #no missing values remain, so plain int64 is safe

In [5]:
# Display the assembled user -> county table (pandas truncates the middle rows of a long frame).
geo

Unnamed: 0,user_id,state,county,FIPS
0,816100811407233024,Ohio,Hamilton County,39061
1,484708822,Ohio,Hamilton County,39061
2,1333135267646091269,Ohio,Hamilton County,39061
3,1071089369921044480,Ohio,Hamilton County,39061
4,3004620045,Ohio,Hamilton County,39061
...,...,...,...,...
2399904,1520223043842420738,Wyoming,Campbell County,56005
2399905,263449923,Wyoming,Campbell County,56005
2399906,783215322,Wyoming,Campbell County,56005
2399907,240937722,Wyoming,Campbell County,56005


In [6]:
# Persist the user geodata table (user_id, state, county, FIPS) for the later sections.
geo.to_parquet('/data_volume/pub/antivax/raw_geo_and_vax_data/users_geodata.parquet')

# compile all geotagged tweets

In [7]:
# Reload the user geodata table written at the end of the previous section.
geo = pd.read_parquet('/data_volume/pub/antivax/raw_geo_and_vax_data/users_geodata.parquet')

In [8]:
# user_ids that have a county assignment; used below to filter the raw tweets.
geotagged_users = geo['user_id']

In [9]:
# Directory with the raw tweet parquet files; `path` is reused when the files are read.
path = '/data_volume/pub/antivax/raw_tweets_parquets/'

# List the directory once and keep only names that END in '.parquet', so side
# files such as 'x.parquet.tmp' are never handed to pd.read_parquet.
# Sorted to make the read order deterministic.
files = sorted(file for file in os.listdir(path) if file.endswith('.parquet'))

In [10]:
%%time

dfs = list()
n_total = 0
n_geolocated = 0
for file in files:
    df = pd.read_parquet(path+file)
    n_total += len(df)
    df = df[df['user_id'].isin(geotagged_users)]
    n_geolocated += len(df)
    dfs.append(df)
    
df = pd.concat(dfs)
del dfs

CPU times: user 4min 14s, sys: 1min 45s, total: 5min 59s
Wall time: 5min 11s


In [11]:
# Report how many tweets were read and what share came from geotagged users.
frac_geolocated = n_geolocated/n_total
print('total tweets:', n_total,
      '\ngeolocated tweets:', n_geolocated,
      '\nfrac geolocated:', frac_geolocated)

total tweets: 229543946 
geolocated tweets: 25926048 
frac geolocated: 0.11294590187100818


In [12]:
# Attach each tweet's county FIPS through its author (inner join on user_id).
# NOTE(review): this rebinds `df`; running the cell a second time would merge onto
# the already-merged frame -- re-run the loading cell first.
user_fips = geo[['user_id', 'FIPS']]
df = pd.merge(df, user_fips, how='inner', on='user_id')

In [13]:
# Peek at one row to check that the FIPS column was attached.
# NOTE(review): the rendered output shows a real tweet text and user_id -- consider
# clearing it before sharing the notebook.
df.head(1)

Unnamed: 0,tweet_id,text,tweet_time,user_id,lang,retweeted_tweet_id,FIPS
0,1356029474035421189,Just another reason why these high carb foods ...,2021-01-31 23:59:55,411821925,en,1355897942184439810,28033


In [14]:
# Persist all tweets from geotagged users, including their FIPS column.
df.to_parquet('/data_volume/pub/antivax/all_geotagged_tweets.parquet')

# clean vax and population data


In [15]:
import pandas as pd
import numpy as np

In [16]:
#load vax data
path = '/data_volume/pub/antivax/'
vax = pd.read_csv(path+'/raw_geo_and_vax_data/COVID-19_Vaccinations_in_the_United_States_County.csv', 
                  low_memory=False)
vax = vax[['Date','FIPS','Series_Complete_Pop_Pct','Series_Complete_Yes'] #select columns
         ].rename(columns={'Series_Complete_Pop_Pct':'fraction_vaccinated', 
                           'Series_Complete_Yes': 'n_vaccinated',
                           'Date':'date'}) #rename

#drop unknown fips; .copy() so the column assignments below act on an independent
#frame rather than on a filtered slice (avoids SettingWithCopyWarning)
vax = vax[vax['FIPS']!='UNK'].copy()
vax['FIPS'] = vax['FIPS'].astype('int') #convert to int

#parse dates BEFORE sorting: as 'MM/DD/YYYY' strings they would sort by month
#first, not chronologically
vax['date'] = pd.to_datetime(vax['date'], format = '%m/%d/%Y')

vax['fraction_vaccinated'] = vax['fraction_vaccinated'] / 100 #convert percent to fraction
vax['state'] = vax['FIPS'] // 1000 #get state FIPS (county FIPS without its last three digits)

#drop Texas. no valid data present, only zeros. 
vax = vax[vax['state']!=48]

#sort on the typed columns: numeric FIPS, then chronological date
vax = vax.sort_values(['FIPS','date'])

In [17]:
# Persist the cleaned county-level vaccination table.
vax.to_parquet(path+'vaccination_stats.parquet')

In [18]:
#get county populations
#read every column as a string so the STATE and COUNTY codes can be concatenated
#as text into a single FIPS code before converting to int
pop = pd.read_csv(path+'/raw_geo_and_vax_data/county_populations_2020.csv', dtype='str')
pop = pop[['STATE','COUNTY','POPESTIMATE2020']].rename(columns = {'POPESTIMATE2020':'population'})
pop['FIPS'] = (pop['STATE']+pop['COUNTY']).astype(int)
pop['population'] = pop['population'].astype(int)
pop = pop[['FIPS','population']]

#manually add some AK populations, which are missing for some reason
#NOTE(review): values are hard-coded; presumably taken from an external source -- confirm
missing_ak = pd.DataFrame([(2261,9243), (2270,8250)],columns=['FIPS','population'])
#pd.concat instead of DataFrame.append (deprecated in pandas 1.4, removed in 2.0)
pop = pd.concat([pop, missing_ak])

pop = pop.sort_values('FIPS').reset_index(drop=True)
pop.head()

Unnamed: 0,FIPS,population
0,1000,4921532
1,1001,56145
2,1003,229287
3,1005,24589
4,1007,22136


In [19]:
# Persist the FIPS -> population table; the confounders section reads it back.
pop.to_parquet(path+'FIPS_populations.parquet')

# confounders

In [20]:
import pandas as pd

In [21]:
# Directory of the FIPS_populations.parquet file that this section reads and overwrites.
poppath = '/data_volume/pub/antivax/'
# Directory of the county-level CSVs merged onto the population table below.
confounders_path = '/data_volume/home/vaccine_hesitancy_paper/quick_output/intermediate_files/socio-econ/county_level/'


In [22]:
# Start from the two base columns only, so re-running this section after the file
# has been overwritten with confounder columns does not carry those columns in twice.
pop = pd.read_parquet(poppath+'FIPS_populations.parquet')[['FIPS','population']]

In [23]:
# County-level 2020 election file; per_dem / per_gop are presumably the party vote shares.
election_columns = ['county_fips', 'per_dem', 'per_gop']
election = pd.read_csv(confounders_path + 'county_2020_elections.csv')[election_columns]

# Left join keeps every population row; the duplicate key column is dropped afterwards.
pop = pd.merge(pop, election, how='left', left_on='FIPS', right_on='county_fips')
pop = pop.drop(columns=['county_fips'])

In [24]:
# Semicolon-delimited rural/urban code file; only the FIPS key and the RUCC_2013 code are kept.
rural_urban_all = pd.read_csv(confounders_path + 'ruralurbancodes2013.csv', delimiter=';')
rural_urban = rural_urban_all[['FIPS', 'RUCC_2013']]

# Left join so population rows without a code are kept (RUCC_2013 becomes NaN).
pop = pd.merge(pop, rural_urban, how='left', on='FIPS')

In [25]:
# Demographic columns taken from People.csv.
cols = [
    'WhiteNonHispanicPct2010',
    'BlackNonHispanicPct2010',
    'AsianNonHispanicPct2010',
    'HispanicPct2010',
    'Under18Pct2010',
    'Age65AndOlderPct2010',
    'Ed5CollegePlusPct',
    'PopDensity2010',
]
people = pd.read_csv(confounders_path + 'People.csv')
demographics = people[cols + ['FIPS']]

# Left join keeps every population row.
pop = pd.merge(pop, demographics, how='left', on='FIPS')

In [26]:
# MedHHInc column from Income.csv (presumably median household income), keyed by FIPS.
income_all = pd.read_csv(confounders_path + 'Income.csv')
income = income_all[['FIPS', 'MedHHInc']]

# Left join keeps every population row.
pop = pd.merge(pop, income, how='left', on='FIPS')

In [27]:
# Gini estimate per county. Row 0 is dropped before parsing -- presumably a
# descriptive label row in the export (TODO confirm).
gini_raw = pd.read_csv(confounders_path+'Gini.csv').drop(0)

# Build the two-column frame directly instead of assigning into a column-sliced
# frame, which triggers SettingWithCopyWarning.
gini = pd.DataFrame({
    'FIPS': gini_raw['GEO_ID'].str[-5:].astype(int),    # last five characters of GEO_ID
    'gini_est': gini_raw['B19083_001E'].astype(float),  # B19083_001E exposed as gini_est
})

# Left join keeps every population row.
pop = pop.merge(gini, on='FIPS', how='left')

In [28]:
# Overwrites the same FIPS_populations.parquet that was read at the start of this
# section, now with the confounder columns added.
pop.to_parquet(poppath+'FIPS_populations.parquet')

In [29]:
# Show dtypes and non-null counts to see how many rows each left join failed to match.
pop.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3196 entries, 0 to 3195
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   FIPS                     3196 non-null   int64  
 1   population               3196 non-null   int64  
 2   per_dem                  3112 non-null   float64
 3   per_gop                  3112 non-null   float64
 4   RUCC_2013                3141 non-null   float64
 5   WhiteNonHispanicPct2010  3193 non-null   float64
 6   BlackNonHispanicPct2010  3193 non-null   float64
 7   AsianNonHispanicPct2010  3193 non-null   float64
 8   HispanicPct2010          3193 non-null   float64
 9   Under18Pct2010           3193 non-null   float64
 10  Age65AndOlderPct2010     3193 non-null   float64
 11  Ed5CollegePlusPct        3193 non-null   float64
 12  PopDensity2010           3193 non-null   float64
 13  MedHHInc                 3192 non-null   float64
 14  gini_est                