In [2]:
!jupyter nbextension enable codefolding/main


Enabling notebook extension codefolding/main...
      - Validating: problems found:
        - require?  X codefolding/main


In [3]:
import pandas as pd
import numpy as np
from ast import literal_eval

In [65]:
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_colwidth', 199)
pd.options.display.float_format = '{:.4f}'.format

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

# Overview
1. Identify US users 
2. Identify US Tweets 
3. Delete all obs. without US User or Tweet 
4. Match to list of users we collected

## Import GeoCov19 Data

In [5]:
df = pd.read_json(r"C:/Users/crackcocaine69xxx/Python Stuff/594/GeoCoV19 Data/geo_feb_01_10/geo_2020-02-01/geo_2020-02-01.json", lines=True)

### Make Uppercase

In [6]:
def make_dict_uppercase(d):
    '''
    input = dictionary
    output = uppercase dictionary
    '''
    
    return {key.upper(): value.upper() for key,value in d.items()}

df['user_location'] = df['user_location'].apply(make_dict_uppercase)


##################################################################################


def make_list_of_dicts_uppercase(list_of_dicts):
    '''
    input = list of dictionaries
    output = uppercase list of dictionaries
    '''
    
    return [make_dict_uppercase(d) for d in list_of_dicts]


df['tweet_locations'] = df['tweet_locations'].apply(make_list_of_dicts_uppercase)

## Get (State & County) --> FIPS dictionary

In [7]:
# import data from wikipedia
fips = pd.read_html("https://en.wikipedia.org/wiki/List_of_United_States_FIPS_codes_by_county")[1]

# remove all hyperlinks (these look like "... County [h]", etc.)
fips['County or equivalent'] = fips['County or equivalent'].str.replace(r"\[.*\]","")

# convert to uppercase
fips['County or equivalent'] = fips['County or equivalent'].apply(lambda x: x.upper())
fips['State or equivalent'] = fips['State or equivalent'].apply(lambda x: x.upper())

# replace "St." with "Saint"
fips['County or equivalent'] = [x.replace('ST.','SAINT') for x in fips['County or equivalent']]

# remove everything after a comma in a county name (e.g. "ANCHORAGE, MUNICIPALITY OF")
fips['County or equivalent'] = [x.split(',')[0] for x in fips['County or equivalent']]

# replace DC info to correspond to GeoCov19 format
dc_loc = fips[fips['County or equivalent']=='DISTRICT OF COLUMBIA'].index.tolist()[0]
fips['State or equivalent'].loc[dc_loc] = 'WASHINGTON, D.C.'
fips['County or equivalent'].loc[dc_loc] = 'WASHINGTON'

########################################################################

# create dictionary
state_fips_dict = {k: f.groupby('County or equivalent')['FIPS'].apply(list).to_dict()
     for k, f in fips.groupby('State or equivalent')}

# clean dictionary
for state in state_fips_dict:
    for county in state_fips_dict[state]:
        state_fips_dict[state][county] = state_fips_dict[state][county][0]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


# Map county names to FIPS (using FIPS dict)

### Get FIPS from a single dictionary

In [8]:
def get_fips_from_loc(loc):
    '''
    input = tweet_location from GeoCov19 data (single dictionary)
    output = FIPS code corresponding to counties mentioned
    '''
    try:
       
        if loc['COUNTRY_CODE']=='US':

            try:

                if loc['COUNTY'].split(" ")[-1] in ('COUNTY', 'PARISH'):

                    return state_fips_dict[ loc['STATE'] ] [loc['COUNTY'] ]

                elif 'COUNTY' in loc:

                    try:

                        return state_fips_dict[ loc['STATE'] ] [loc['COUNTY'] + " " + "COUNTY"]

                    except Exception as e:

                        pass

            except Exception as e:

                pass
    except:
        
        pass

### Get FIPS from a list of dictionaries

In [9]:
def get_fips_from_list_of_locs(tweet_locs):
    '''
    input = tweet_locations from GeoCov19 data (list of dicts)
    output = list of FIPS codes corresponding to counties mentioned
    '''

    temp_list = [get_fips_from_loc(loc) for loc in tweet_locs]
    
    return [x for x in temp_list if x is not None]

## NEED TO INSPECT COUNTY MAPPINGS TO ENSURE THAT IT PICKS UP ON STRINGS WELL
May have to implement fuzzy-matching if errors continue? Probably easiest/best to just find errors, since they won't change

# ALL REPLACEMENTS MUST BE DONE IN "fips"
* ~"Pointe Coupee Parish County" should be "Pointe Coupee Parish"~
* ~Need to replace "St." with "Saint" in "fips"~
* Fix counties that aren't cross-referenced correctly:
    * "D.C."
    * "SAN FRANCISCO"
    * There are probably more

In [10]:
df['Tweet FIPS'] = df['tweet_locations'].apply(get_fips_from_list_of_locs)

df['User FIPS'] = df['user_location'].apply(get_fips_from_loc)

## This looks unimportant but we need it

In [82]:
def remove_nan_from_list(my_list):

    return [x for x in my_list if not pd.isnull(x)]

## Percentage of GeoCov19 Tweets with county-level user data

In [72]:
df[~df['User FIPS'].isnull()]['geo_source'].value_counts() / len(df) * 100

user_location   6.6260
place           0.0708
coordinates     0.0027
Name: geo_source, dtype: float64

## Keep this

In [130]:
def aggregate_by_user(df):
    '''
    Input = GeoCov19 dataframe
    Output = pd.DatFrame with four columns: 'user_id'; 'User FIPS - user_location', 'User FIPS - place', 'User FIPS - coordinates'
    '''
    
    loc_type_dict = {}
    
    for loc_source in ['user_location', 'place', 'coordinates']:

        loc_type_dict[loc_source] = df[ df['geo_source']==loc_source ].groupby(by='user_id').agg({'User FIPS': set})        
    
        loc_type_dict[loc_source]['User FIPS'] = loc_type_dict[loc_source]['User FIPS'].apply(list)
    # merging
    
    grouped_df = loc_type_dict['user_location']
    
    grouped_df.columns = grouped_df.columns + ' - user_location'
    
    grouped_df = grouped_df.merge(loc_type_dict['place'], on='user_id', suffixes=(None, f" - place"), how='outer')
    
    grouped_df = grouped_df.merge(loc_type_dict['coordinates'], on='user_id', suffixes=(' - place', f" - coordinates"), how='outer')
    
    grouped_df.columns = str(df['Date'].unique()[0]) + ' - ' + grouped_df.columns
    
    return grouped_df.reset_index()

In [117]:
df['Date'] = pd.to_datetime(df['created_at'], format='%Y-%m-%d').dt.date

In [131]:
aggregate_by_user(df[~df['User FIPS'].isnull()].head(20))

Unnamed: 0,user_id,2020-02-01 - User FIPS - user_location,2020-02-01 - User FIPS - place,2020-02-01 - User FIPS - coordinates
0,13604612,[13121.0],,
1,23328623,[42003.0],,
2,60676618,[6037.0],,
3,90670459,[48279.0],,
4,171348971,[36029.0],,
5,228938650,[20091.0],,
6,322784463,[37119.0],,
7,359211782,[25025.0],,
8,593195091,[12099.0],,
9,893190151,[17097.0],,


# FINAL FUNCTION


* make FIPS df and dict
* import master list of CT HT and Link users

In [34]:
def final_func(IMPORT_PATH, EXPORT_PATH):
    
    df = pd.read_json(fr"{IMPORT_PATH}", lines=True)
    
    # drop if not in master list of users
    df = df.set_index('user_id').loc[MASTER_USER_LIST]
    
    # convert location strings to FIPS
#     df['Tweet FIPS'] = df['tweet_locations'].apply(get_fips_from_list_of_locs)
    df['User FIPS'] = df['user_location'].apply(get_fips_from_loc)
    
    # only keep obs with user_location
    df = df[~df['User FIPS'].isnull()]
    
    # convert date to better format
    df['Date'] = pd.to_datetime(df['created_at'], format='%Y-%m-%d').dt.date
    del df['created_at']
    
    # aggregate by user
    df = aggregate_by_user(df)
    
    df.to_csv(fr"{EXPORT_PATH_ROOT}.csv")

# Implementing Final Function (Final Loop)

In [133]:
month_length_dict = {'feb':29, 'march':31, 'april':30}
month_number_dict = {'feb':2, 'march':3, 'april':4}

IMPORT_PATH_ROOT = "C:/Users/crackcocaine69xxx/Python Stuff/594/GeoCoV19 Data/"

EXPORT_PATH_ROOT = "C:/Users/crackcocaine69xxx/Python Stuff/594/GeoCoV19 Data/Intermediate Cleaned Data/"

for month in ['feb', 'march', 'april']:
    
    days = month_length_dict[month]
    month_number = month_number_dict[month]
    
    for i in range(1, days + 1):
        
        if i < 10:
            
            IMP_PATH = IMPORT_PATH_ROOT + fr"geo_{month}_0{days}_10/geo_2020-{month_number}-0{days}/geo_2020-{month_number}-0{days}.json"
            EXP_PATH = EXPORT_PATH_ROOT + f"{month_number}-{days}.csv"
            
            final_func(IMP_PATH, EXP_PATH)

        elif i < 20:
            
            IMP_PATH = IMPORT_PATH_ROOT + fr"geo_{month}_{days}_20/geo_2020-{month_number}-{days}/geo_2020-{month_number}-{days}.json"
            EXP_PATH = EXPORT_PATH_ROOT + f"{month_number}-{days}.csv"
            
            final_func(IMP_PATH, EXP_PATH)            
        else:
            
            IMP_PATH = IMPORT_PATH_ROOT + fr"geo_{month}_{days}_30/geo_2020-{month_number}-{days}/geo_2020-{month_number}-{days}.json"
            EXP_PATH = EXPORT_PATH_ROOT + f"{month_number}-{days}.csv"
            
            final_func(IMP_PATH, EXP_PATH)            

ValueError: Expected object or value

Want to end up with each row being a user, and each col being the time period (week). 

(i,j)=[list of FIPS they tweeted from in this week]