In [None]:
!jupyter nbextension enable codefolding/main


In [None]:
import pandas as pd
import numpy as np
from ast import literal_eval

In [None]:
# Notebook display configuration: allow up to 1000 columns/rows and 199-character
# cells before truncation, and render floats with two decimal places.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_colwidth', 199)
pd.options.display.float_format = '{:.2f}'.format

# Import from the public `IPython.display` module; importing `display` from
# `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Widen the notebook container to 95% of the browser window.
display(HTML("<style>.container { width:95% !important; }</style>"))

# Overview
1. Identify US users 
2. Identify US Tweets 
3. Delete all observations that have neither a US user location nor a US tweet location
4. Match to list of users we collected

## Import GeoCov19 Data

In [None]:
# Load the GeoCoV19 file; lines=True parses it as one JSON object per line.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
df = pd.read_json(
    path_or_buf=r"C:/Users/crackcocaine69xxx/Python Stuff/594/GeoCoV19 Data/geo_feb_01_10/geo_2020-02-01/geo_2020-02-01.json",
    lines=True,
)

# TODO: delete this later (temporary row subset)

In [None]:
# Keep only the first 5000 rows.
df = df.head(5000)

### Make Uppercase

In [None]:
def make_dict_uppercase(d):
    '''
    Upper-case every key and every value of a dictionary.

    input = dictionary whose keys and values all support .upper()
    output = new dictionary with upper-cased keys and values
             (the input dictionary is not modified)
    '''

    uppercased = {}
    for name, text in d.items():
        upper_name = name.upper()
        uppercased[upper_name] = text.upper()
    return uppercased

# Upper-case the keys and values of every user's location dictionary.
df['user_location'] = df['user_location'].map(make_dict_uppercase)


##################################################################################


def make_list_of_dicts_uppercase(list_of_dicts):
    '''
    Run make_dict_uppercase over each dictionary in a list.

    input = list of dictionaries
    output = new list with the upper-cased dictionaries, in the same order
    '''

    uppercased_dicts = []
    for entry in list_of_dicts:
        uppercased_dicts.append(make_dict_uppercase(entry))
    return uppercased_dicts


# Upper-case every location dictionary inside each tweet's list of locations.
df['tweet_locations'] = df['tweet_locations'].map(make_list_of_dicts_uppercase)

## Get (State & County) --> FIPS dictionary

In [None]:
# import data from wikipedia
# NOTE(review): relies on the county table being the second table ([1]) on the page
# and on its current column names — re-check if the page layout changes.
# NOTE(review): read_html may parse FIPS as integers, which would drop leading
# zeros (e.g. "01001") — confirm the dtype matches what downstream matching expects.
fips = pd.read_html("https://en.wikipedia.org/wiki/List_of_United_States_FIPS_codes_by_county")[1]

county_col = 'County or equivalent'
state_col = 'State or equivalent'

# remove all bracketed footnote markers (these look like "... County [h]", etc.)
# regex=True must be explicit: pandas >= 2.0 treats the pattern as a literal string by default
fips[county_col] = fips[county_col].str.replace(r"\[.*\]", "", regex=True)

# convert to uppercase
fips[county_col] = fips[county_col].str.upper()
fips[state_col] = fips[state_col].str.upper()

# replace "St." with "Saint" (literal match, so "." is not treated as a wildcard)
fips[county_col] = fips[county_col].str.replace('ST.', 'SAINT', regex=False)

# remove everything after a comma in a county name (e.g. "ANCHORAGE, MUNICIPALITY OF"),
# then strip whitespace left behind by the removals above so lookups match exactly
fips[county_col] = fips[county_col].str.split(',').str[0].str.strip()

# replace DC info to correspond to GeoCov19 format
# (single .loc[row, column] assignment; chained `fips[col].loc[row] = ...` may write to a copy)
dc_loc = fips[fips[county_col] == 'DISTRICT OF COLUMBIA'].index.tolist()[0]
fips.loc[dc_loc, state_col] = 'WASHINGTON, D.C.'
fips.loc[dc_loc, county_col] = 'WASHINGTON'

########################################################################

# create dictionary {STATE: {COUNTY: FIPS}}, keeping the first FIPS code seen
# for each (state, county) pair; rows with a missing state or county are skipped
fips_rows = fips.dropna(subset=[state_col, county_col])
state_fips_dict = {}
for state, county, code in zip(fips_rows[state_col], fips_rows[county_col], fips_rows['FIPS']):
    state_fips_dict.setdefault(state, {}).setdefault(county, code)

# Map county names to FIPS (using FIPS dict)

In [None]:
def get_fips_from_loc(loc):
    '''
    input = tweet_location from GeoCov19 data (single dictionary with
            upper-cased keys 'COUNTRY_CODE', 'STATE' and 'COUNTY')
    output = FIPS code corresponding to the county mentioned, or None when
             the location is not in the US, has no usable state/county, or
             has no match in state_fips_dict
    '''

    # Empty or malformed locations (e.g. {} or a non-dict value) have no FIPS code.
    if not isinstance(loc, dict) or loc.get('COUNTRY_CODE') != 'US':
        return None

    state = loc.get('STATE')
    county = loc.get('COUNTY')
    if not isinstance(state, str) or not isinstance(county, str):
        return None

    # Names already ending in COUNTY/PARISH are looked up as-is; bare names
    # (e.g. "TRAVIS") are looked up with " COUNTY" appended.
    if county.split(" ")[-1] in ('COUNTY', 'PARISH'):
        county_key = county
    else:
        county_key = county + " " + "COUNTY"

    # Unknown states or counties yield None instead of raising.
    return state_fips_dict.get(state, {}).get(county_key)

In [None]:
def get_fips_from_list_of_locs(tweet_locs):
    '''
    input = tweet_locations from GeoCov19 data (list of dicts)
    output = list of FIPS codes corresponding to counties mentioned
             (locations for which get_fips_from_loc returns None are skipped)
    '''

    # get_fips_from_loc accepts a single location dict; passing the whole list
    # as a second argument raised a TypeError for every non-empty list.
    fips_codes = (get_fips_from_loc(loc) for loc in tweet_locs)

    return [code for code in fips_codes if code is not None]

In [None]:
# def OG_get_fips_from_list_of_locs(tweet_locs):
#     '''
#     input = tweet_locations from GeoCov19 data (list of dicts)
#     output = list of FIPS codes corresponding to counties mentioned
#     '''
#     tweet_FIPS = []
    
#     for loc in tweet_locs: #there are no longer any empty locs so we don't have an if-else for them
        
#         if loc['COUNTRY_CODE']=='US':

#             try:
                
#                 if loc['COUNTY'].split(" ")[-1] in ('COUNTY', 'PARISH'):
                    
#                     tweet_FIPS.append(state_fips_dict[ loc['STATE'] ] [loc['COUNTY'] ])
                                
#                 elif 'COUNTY' in loc:
                    
#                     try:
                        
#                         tweet_FIPS.append(state_fips_dict[ loc['STATE'] ] [loc['COUNTY'] + " " + "COUNTY"])
                        
#                     except Exception as e:
                        
#                         print('0 ', e, '\n')
#                         print(loc, '\n')
#                         print(tweet_locs, '\n \n')
                    
#             except Exception as e:
                
#                 try:
                    
#                     print((loc['STATE'], loc['COUNTY']),',', '\n')
#                     print('1 ', e, '\n')
#                     print(loc, '\n')
#                     print(tweet_locs, '\n')
#                     print(state_fips_dict[loc['STATE']], '\n')
#                     print(state_fips_dict[loc['STATE']][loc['COUNTY']], '\n')
#                     print('\n \n')
                    
#                 except:
                    
#                     pass
#                     print('2 ', e, '\n')
#                     print(loc, '\n')
#                     print(tweet_locs, '\n \n')

#     return tweet_FIPS

## TODO: inspect the county mappings to ensure county-name strings are matched correctly


# ALL REPLACEMENTS MUST BE DONE IN "fips"
* ~"Pointe Coupee Parish County" should be "Pointe Coupee Parish"~
* ~Need to replace "St." with "Saint" in "fips"~
* Fix counties that aren't cross-referenced correctly:
    * "D.C."
    * "SAN FRANCISCO"

In [None]:
# One list of FIPS codes per tweet, derived from its list of location dictionaries.
df['Tweet FIPS'] = df['tweet_locations'].map(get_fips_from_list_of_locs)

In [None]:
# One FIPS code (or None) per user, derived from the user's location dictionary.
df['User FIPS'] = df['user_location'].map(get_fips_from_loc)

In [None]:
# Display the first row of df (as a Series) to inspect its fields.
df.iloc[0]