In [2]:
# Shell command: enable the "codefolding" notebook extension for this Jupyter install.
# Nothing else in this notebook references the extension.
!jupyter nbextension enable codefolding/main


Enabling notebook extension codefolding/main...
      - Validating: problems found:
        - require?  X codefolding/main


In [3]:
import pandas as pd
import numpy as np
from ast import literal_eval

In [65]:
# Notebook display configuration: raise the column/row/width limits so wide
# frames are not truncated, and print floats with 4 decimals.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_colwidth', 199)
pd.options.display.float_format = '{:.4f}'.format

# Import from the public IPython.display module; importing `display` from
# IPython.core.display is deprecated.
from IPython.display import display, HTML

# Widen the notebook container so wide DataFrames are readable.
display(HTML("<style>.container { width:95% !important; }</style>"))

# Overview
1. Identify US users 
2. Identify US Tweets 
3. Drop all observations that have neither a US user location nor a US tweet location
4. Match to list of users we collected

## Import GeoCov19 Data

In [5]:
# Load one day of GeoCoV19 records; lines=True reads the file as one JSON object per line.
# NOTE(review): absolute, machine-specific path - this cell will not run on another machine without editing it.
df = pd.read_json(r"C:/Users/crackcocaine69xxx/Python Stuff/594/GeoCoV19 Data/geo_feb_01_10/geo_2020-02-01/geo_2020-02-01.json", lines=True)

### Make Uppercase

In [6]:
def make_dict_uppercase(d):
    '''
    Return a copy of a dictionary with its keys and values upper-cased.

    input = dictionary (e.g. {'country_code': 'us', 'state': 'Texas'})
    output = uppercase dictionary (e.g. {'COUNTRY_CODE': 'US', 'STATE': 'TEXAS'})

    Keys or values that are not strings (None, numbers, ...) are kept unchanged
    instead of raising AttributeError, so one odd record cannot abort a
    whole-column .apply().
    '''

    def _upper(item):
        # Only strings have a meaningful upper-case form.
        return item.upper() if isinstance(item, str) else item

    return {_upper(key): _upper(value) for key, value in d.items()}

# Upper-case every user_location dict so its keys/values match the upper-case names used by the FIPS lookup.
df['user_location'] = df['user_location'].apply(make_dict_uppercase)


##################################################################################


def make_list_of_dicts_uppercase(list_of_dicts):
    '''
    Upper-case every dictionary in a list.

    input = list of dictionaries
    output = new list, same order, with each dictionary passed through
             make_dict_uppercase
    '''

    uppercased = []
    for location in list_of_dicts:
        uppercased.append(make_dict_uppercase(location))
    return uppercased


# Upper-case every dict inside each tweet's list of mentioned locations.
df['tweet_locations'] = df['tweet_locations'].apply(make_list_of_dicts_uppercase)

## Get (State & County) --> FIPS dictionary

In [7]:
# import data from wikipedia ([1] selects the second table parsed from the page)
fips = pd.read_html("https://en.wikipedia.org/wiki/List_of_United_States_FIPS_codes_by_county")[1]

# remove all bracketed footnote markers (these look like "... County [h]", etc.)
# regex=True is explicit: newer pandas versions default to a literal match, which
# would leave the markers in place. The character class keeps each match inside
# one pair of brackets, and strip() drops the space left behind by " [h]".
fips['County or equivalent'] = (
    fips['County or equivalent']
    .str.replace(r"\[[^\]]*\]", "", regex=True)
    .str.strip()
)

# convert to uppercase
fips['County or equivalent'] = fips['County or equivalent'].str.upper()
fips['State or equivalent'] = fips['State or equivalent'].str.upper()

# replace "St." with "Saint" (literal match: the dot is not a wildcard)
fips['County or equivalent'] = fips['County or equivalent'].str.replace('ST.', 'SAINT', regex=False)

# remove everything after a comma in a county name (e.g. "ANCHORAGE, MUNICIPALITY OF")
fips['County or equivalent'] = fips['County or equivalent'].str.split(',').str[0]

# replace DC info to correspond to GeoCov19 format
# A single .loc[row, column] assignment writes into `fips` itself; the chained
# form fips[col].loc[row] = ... assigns through an intermediate Series and
# triggers SettingWithCopyWarning.
dc_loc = fips[fips['County or equivalent']=='DISTRICT OF COLUMBIA'].index.tolist()[0]
fips.loc[dc_loc, 'State or equivalent'] = 'WASHINGTON, D.C.'
fips.loc[dc_loc, 'County or equivalent'] = 'WASHINGTON'

########################################################################

# Build the nested lookup {state: {county: FIPS}} in one pass.
# If a county name occurs more than once within a state, the first FIPS listed is kept.
state_fips_dict = {
    state_name: {
        county_name: codes.tolist()[0]
        for county_name, codes in state_rows.groupby('County or equivalent')['FIPS']
    }
    for state_name, state_rows in fips.groupby('State or equivalent')
}

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


# Map county names to FIPS (using FIPS dict)

### Get FIPS from a single dictionary

In [8]:
def get_fips_from_loc(loc, fips_lookup=None):
    '''
    input = tweet_location from GeoCov19 data (single dictionary with
            upper-case keys and values, e.g. {'COUNTRY_CODE': 'US',
            'STATE': 'TEXAS', 'COUNTY': 'DALLAS COUNTY'})
            fips_lookup = optional {state: {county: FIPS}} mapping; defaults
            to the notebook-level state_fips_dict
    output = FIPS code of the county named in loc, or None when loc is not a
             US location, has no usable state/county, or the county is not in
             the lookup
    '''
    if fips_lookup is None:
        # Resolved at call time. If the lookup has not been built yet this
        # raises NameError instead of silently mapping every location to None.
        fips_lookup = state_fips_dict

    if not isinstance(loc, dict) or loc.get('COUNTRY_CODE') != 'US':
        return None

    state = loc.get('STATE')
    county = loc.get('COUNTY')
    if not isinstance(state, str) or not isinstance(county, str):
        return None

    # Names that do not already end in COUNTY / PARISH are looked up with a
    # " COUNTY" suffix appended.
    if county.split(" ")[-1] not in ('COUNTY', 'PARISH'):
        county = county + " " + "COUNTY"

    # Unknown state or county -> None (no exception).
    return fips_lookup.get(state, {}).get(county)

### Get FIPS from a list of dictionaries

In [9]:
def get_fips_from_list_of_locs(tweet_locs):
    '''
    Map every location dictionary of one tweet to a FIPS code.

    input = tweet_locations from GeoCov19 data (list of dicts)
    output = list of the FIPS codes returned by get_fips_from_loc, in input
             order, with the None results left out
    '''

    fips_codes = []
    for loc in tweet_locs:
        code = get_fips_from_loc(loc)
        if code is not None:
            fips_codes.append(code)

    return fips_codes

## TODO: inspect the county mappings to make sure county-name strings are matched correctly
We may have to implement fuzzy matching if mismatches keep appearing. It is probably easiest/best to find the mismatches and fix them one by one, since they won't change.

# ALL REPLACEMENTS MUST BE DONE IN "fips"
* ~"Pointe Coupee Parish County" should be "Pointe Coupee Parish"~
* ~Need to replace "St." with "Saint" in "fips"~
* Fix counties that aren't cross-referenced correctly:
    * "D.C."
    * "SAN FRANCISCO"
    * There are probably more

In [10]:
# 'Tweet FIPS': list of FIPS codes for the locations listed in tweet_locations.
df['Tweet FIPS'] = df['tweet_locations'].apply(get_fips_from_list_of_locs)

# 'User FIPS': single FIPS code for the user's location (missing when it cannot be mapped).
df['User FIPS'] = df['user_location'].apply(get_fips_from_loc)

In [11]:
# Preview the first rows, including the new 'Tweet FIPS' and 'User FIPS' columns.
df.head()

Unnamed: 0,tweet_id,created_at,user_id,geo_source,user_location,geo,place,tweet_locations,Tweet FIPS,User FIPS
0,1223489703456006144,2020-02-01 06:14:11+00:00,30746211,user_location,{'COUNTRY_CODE': 'CA'},{},{},"[{'COUNTRY_CODE': 'UA', 'STATE': 'ZHYTOMYR OBLAST', 'COUNTY': 'SLOVECHNE AMALGAMATED TERRITORIAL COMMUNITY'}, {'COUNTRY_CODE': 'ES', 'STATE': 'ANDALUSIA', 'COUNTY': 'JAÉN', 'CITY': 'JAÉN'}, {'COU...","[40139, 24035]",
1,1223489798142230528,2020-02-01 06:14:33+00:00,2411706499,tweet_text,{},{},{},"[{'COUNTRY_CODE': 'RU', 'STATE': 'TATARSTAN', 'COUNTY': 'VYSOKOGORSKY DISTRICT'}, {'COUNTRY_CODE': 'US', 'STATE': 'NEW YORK', 'COUNTY': 'NEW YORK COUNTY', 'CITY': 'NEW YORK'}]",[36061],
2,1223489799044001792,2020-02-01 06:14:34+00:00,884746422364418048,user_location,"{'COUNTRY_CODE': 'CN', 'STATE': 'HONG KONG'}",{},{},"[{'COUNTRY_CODE': 'SG'}, {'COUNTRY_CODE': 'GB', 'STATE': 'ENGLAND', 'COUNTY': 'EAST SUSSEX', 'CITY': 'ROTHER'}, {'COUNTRY_CODE': 'YE', 'STATE': 'DHAMAR GOVERNORATE', 'COUNTY': 'ANSS DISTRICT'}, {...","[51051, 27071]",
3,1223489811459108864,2020-02-01 06:14:37+00:00,29512878,user_location,"{'COUNTRY_CODE': 'PH', 'STATE': 'CAVITE', 'CITY': 'DASMARINAS'}",{},{},[],[],
4,1223489829817577472,2020-02-01 06:14:41+00:00,3244922072,user_location,"{'COUNTRY_CODE': 'TH', 'STATE': 'SARABURI PROVINCE'}",{},{},[],[],


In [12]:
# Fraction of rows that have a 'User FIPS' value.
(~df['User FIPS'].isnull()).sum() / len(df)

0.06699491278524633

In [13]:
# Fraction of rows whose geo_source is 'coordinates'.
(df['geo_source']=='coordinates').sum() / len(df)

0.00020402866602757687

In [14]:
# Fraction of rows whose 'geo' field is not an empty dict.
(df['geo'] != {}).sum() / len(df)

0.00020402866602757687

In [15]:
# Fraction of rows that have a 'User FIPS' value, computed by filtering the rows and counting them.
len(df[~df['User FIPS'].isnull()]) / len(df)

0.06699491278524633

In [44]:
# Show the rows that have a 'User FIPS' value.
df[~df['User FIPS'].isnull()]

Unnamed: 0,tweet_id,created_at,user_id,geo_source,user_location,geo,place,tweet_locations,Tweet FIPS,User FIPS
21,1223489918350983168,2020-02-01 06:15:02+00:00,1123938337130201088,user_location,"{'COUNTRY_CODE': 'US', 'STATE': 'NEW YORK', 'COUNTY': 'ONONDAGA COUNTY'}",{},{},"[{'COUNTRY_CODE': 'FR', 'STATE': 'NEW AQUITAINE', 'COUNTY': 'PAU'}, {'COUNTRY_CODE': 'IE'}, {'COUNTRY_CODE': 'US', 'STATE': 'ALABAMA', 'COUNTY': 'GENEVA COUNTY'}]",[1061],36067.00
33,1223729031679553536,2020-02-01 22:05:11+00:00,2674880963,user_location,"{'COUNTRY_CODE': 'US', 'STATE': 'TEXAS', 'COUNTY': 'DALLAS COUNTY', 'CITY': 'DALLAS'}",{},{},"[{'COUNTRY_CODE': 'US', 'STATE': 'MISSISSIPPI', 'COUNTY': 'DESOTO COUNTY'}, {'COUNTRY_CODE': 'US', 'STATE': 'MASSACHUSETTS', 'COUNTY': 'PLYMOUTH COUNTY'}]","[28033, 25023]",48113.00
72,1223491029304823808,2020-02-01 06:19:27+00:00,359211782,user_location,"{'COUNTRY_CODE': 'US', 'STATE': 'MASSACHUSETTS', 'COUNTY': 'SUFFOLK COUNTY', 'CITY': 'BOSTON'}",{},{},"[{'COUNTRY_CODE': 'RU', 'STATE': 'TATARSTAN', 'COUNTY': 'VYSOKOGORSKY DISTRICT'}, {'COUNTRY_CODE': 'US', 'STATE': 'NEW YORK', 'COUNTY': 'NEW YORK COUNTY', 'CITY': 'NEW YORK'}]",[36061],25025.00
73,1223490979329478656,2020-02-01 06:19:15+00:00,987844879,user_location,"{'COUNTRY_CODE': 'US', 'STATE': 'ALABAMA', 'COUNTY': 'BIBB COUNTY'}",{},{},"[{'COUNTRY_CODE': 'FR', 'STATE': 'AUVERGNE-RHÔNE-ALPES', 'COUNTY': 'THONON-LES-BAINS', 'CITY': 'NOVEL'}]",[],1007.00
104,1223731425289101312,2020-02-01 22:14:42+00:00,60365817,place,"{'COUNTRY_CODE': 'US', 'STATE': 'TEXAS', 'COUNTY': 'DALLAS COUNTY', 'CITY': 'DALLAS'}",{},"{'country_code': 'us', 'state': 'Texas', 'county': 'Dallas County', 'city': 'Dallas'}","[{'COUNTRY_CODE': 'IR', 'STATE': 'KERMANSHAH PROVINCE', 'COUNTY': 'ESLAMABAD-E GHARB COUNTY'}]",[],48113.00
...,...,...,...,...,...,...,...,...,...,...
666501,1223553492792627200,2020-02-01 10:27:39+00:00,915572521667321856,user_location,"{'COUNTRY_CODE': 'US', 'STATE': 'OKLAHOMA', 'COUNTY': 'OKLAHOMA COUNTY', 'CITY': 'OKLAHOMA CITY'}",{},{},"[{'COUNTRY_CODE': 'CN', 'STATE': 'HUBEI', 'COUNTY': 'HUANGZHOU', 'CITY': 'HUANGGANG'}, {'COUNTRY_CODE': 'CN', 'STATE': 'HUBEI', 'COUNTY': 'JIANG'AN DISTRICT', 'CITY': 'WUHAN'}, {'COUNTRY_CODE': '...",[32023],40109.00
666512,1223554215622168576,2020-02-01 10:30:32+00:00,1021118073187831808,user_location,"{'COUNTRY_CODE': 'US', 'STATE': 'OKLAHOMA', 'COUNTY': 'MARSHALL COUNTY'}",{},{},"[{'COUNTRY_CODE': 'US', 'STATE': 'LOUISIANA'}, {'COUNTRY_CODE': 'GT', 'STATE': 'ESCUINTLA', 'COUNTY': 'LA GOMERA'}, {'COUNTRY_CODE': 'ES'}, {'COUNTRY_CODE': 'US', 'STATE': 'TEXAS'}, {'COUNTRY_COD...",[40095],40095.00
666514,1223554790371684352,2020-02-01 10:32:49+00:00,168831316,user_location,"{'COUNTRY_CODE': 'US', 'STATE': 'MISSOURI', 'COUNTY': 'NEW MADRID COUNTY'}",{},{},"[{'COUNTRY_CODE': 'CG', 'STATE': 'KOUILOU', 'COUNTY': 'POINTE-NOIRE'}, {'COUNTRY_CODE': 'FR', 'STATE': 'AUVERGNE-RHÔNE-ALPES', 'COUNTY': 'THONON-LES-BAINS', 'CITY': 'NOVEL'}, {'COUNTRY_CODE': 'GB...",[],29143.00
666528,1223555329503436800,2020-02-01 10:34:57+00:00,24435494,user_location,"{'COUNTRY_CODE': 'US', 'STATE': 'CALIFORNIA', 'COUNTY': 'CONTRA COSTA COUNTY', 'CITY': 'BRENTWOOD'}",{},{},"[{'COUNTRY_CODE': 'ID', 'STATE': 'EAST NUSA TENGGARA', 'COUNTY': 'KABUPATEN MANGGARAI'}, {'COUNTRY_CODE': 'US', 'STATE': 'TENNESSEE', 'COUNTY': 'WHITE COUNTY'}, {'COUNTRY_CODE': 'US', 'STATE': 'W...","[47185, 54079]",6013.00


In [48]:
# Inspect every row of one example user (user_id 1497).
df[df['user_id']==1497]

Unnamed: 0,tweet_id,created_at,user_id,geo_source,user_location,geo,place,tweet_locations,Tweet FIPS,User FIPS
84210,1223413231408091136,2020-02-01 01:10:18+00:00,1497,user_location,"{'COUNTRY_CODE': 'FR', 'STATE': 'OCCITANIA', 'COUNTY': 'TARBES', 'CITY': 'HORGUES'}",{},{},"[{'COUNTRY_CODE': 'FR', 'STATE': 'NEW AQUITAINE', 'COUNTY': 'MONT-DE-MARSAN', 'CITY': 'BATS'}, {'COUNTRY_CODE': 'AT', 'STATE': 'UPPER AUSTRIA', 'COUNTY': 'BRAUNAU AM INN', 'CITY': 'MINING'}, {'CO...",[],
166048,1223675564919283712,2020-02-01 18:32:44+00:00,1497,user_location,"{'COUNTRY_CODE': 'FR', 'STATE': 'OCCITANIA', 'COUNTY': 'TARBES', 'CITY': 'HORGUES'}",{},{},[],[],
249764,1223647584327258112,2020-02-01 16:41:33+00:00,1497,user_location,"{'COUNTRY_CODE': 'FR', 'STATE': 'OCCITANIA', 'COUNTY': 'TARBES', 'CITY': 'HORGUES'}",{},{},"[{'COUNTRY_CODE': 'RO', 'COUNTY': 'SĂLAJ'}, {'COUNTRY_CODE': 'CN', 'STATE': 'HUBEI', 'COUNTY': 'JIANG'AN DISTRICT', 'CITY': 'WUHAN'}]",[],
305224,1223614373555593216,2020-02-01 14:29:34+00:00,1497,user_location,"{'COUNTRY_CODE': 'FR', 'STATE': 'OCCITANIA', 'COUNTY': 'TARBES', 'CITY': 'HORGUES'}",{},{},"[{'COUNTRY_CODE': 'CN', 'STATE': 'HUBEI', 'COUNTY': 'JIANG'AN DISTRICT', 'CITY': 'WUHAN'}]",[],
366539,1223614102351773696,2020-02-01 14:28:30+00:00,1497,user_location,"{'COUNTRY_CODE': 'FR', 'STATE': 'OCCITANIA', 'COUNTY': 'TARBES', 'CITY': 'HORGUES'}",{},{},"[{'COUNTRY_CODE': 'US', 'STATE': 'CALIFORNIA', 'COUNTY': 'RIVERSIDE COUNTY', 'CITY': 'BANNING'}, {'COUNTRY_CODE': 'IR', 'STATE': 'KERMANSHAH PROVINCE', 'COUNTY': 'ESLAMABAD-E GHARB COUNTY'}]",[6065],
373146,1223602783288549376,2020-02-01 13:43:31+00:00,1497,user_location,"{'COUNTRY_CODE': 'FR', 'STATE': 'OCCITANIA', 'COUNTY': 'TARBES', 'CITY': 'HORGUES'}",{},{},"[{'COUNTRY_CODE': 'FR', 'STATE': 'NEW AQUITAINE', 'COUNTY': 'BAYONNE'}, {'COUNTRY_CODE': 'CN', 'COUNTY': 'TAOYUAN COUNTY'}, {'COUNTRY_CODE': 'CN', 'STATE': 'FUJIAN', 'COUNTY': 'SHA COUNTY', 'CITY...",[],
416350,1223697469265018880,2020-02-01 19:59:46+00:00,1497,user_location,"{'COUNTRY_CODE': 'FR', 'STATE': 'OCCITANIA', 'COUNTY': 'TARBES', 'CITY': 'HORGUES'}",{},{},"[{'COUNTRY_CODE': 'NP', 'STATE': 'WESTERN DEVELOPMENT REGION', 'COUNTY': 'MODI'}, {'COUNTRY_CODE': 'IN'}, {'COUNTRY_CODE': 'US', 'STATE': 'CALIFORNIA', 'COUNTY': 'ORANGE COUNTY', 'CITY': 'IRVINE'...","[6059, 18179]",
515835,1223622205214425088,2020-02-01 15:00:42+00:00,1497,user_location,"{'COUNTRY_CODE': 'FR', 'STATE': 'OCCITANIA', 'COUNTY': 'TARBES', 'CITY': 'HORGUES'}",{},{},"[{'COUNTRY_CODE': 'BE', 'STATE': 'WALLONIA', 'COUNTY': 'WALLOON BRABANT'}, {'COUNTRY_CODE': 'IT'}, {'COUNTRY_CODE': 'BE', 'STATE': 'WALLONIA', 'COUNTY': 'WALLOON BRABANT'}, {'COUNTRY_CODE': 'AT',...",[],
550990,1223620401797922816,2020-02-01 14:53:32+00:00,1497,user_location,"{'COUNTRY_CODE': 'FR', 'STATE': 'OCCITANIA', 'COUNTY': 'TARBES', 'CITY': 'HORGUES'}",{},{},"[{'COUNTRY_CODE': 'US', 'STATE': 'ILLINOIS', 'COUNTY': 'COOK COUNTY', 'CITY': 'JUSTICE'}, {'COUNTRY_CODE': 'US', 'STATE': 'PENNSYLVANIA', 'COUNTY': 'LYCOMING COUNTY'}]","[17031, 42081]",
618013,1223718994064048128,2020-02-01 21:25:18+00:00,1497,user_location,"{'COUNTRY_CODE': 'FR', 'STATE': 'OCCITANIA', 'COUNTY': 'TARBES', 'CITY': 'HORGUES'}",{},{},"[{'COUNTRY_CODE': 'CN'}, {'COUNTRY_CODE': 'US', 'STATE': 'WEST VIRGINIA', 'COUNTY': 'CLAY COUNTY'}]",[54015],


In [None]:
# For each geo_source, aggregate every user's FIPS codes:
#   'Tweet FIPS' -> one concatenated list ('sum' joins the per-tweet lists)
#   'User FIPS'  -> list of the per-tweet user codes, with missing values removed
loc_type_dict = {}

for loc_source in ['user_location', 'place', 'coordinates']:

    per_user = (
        df[df['geo_source'] == loc_source]
        .groupby(by='user_id')
        .agg({'Tweet FIPS': 'sum', 'User FIPS': list})
    )

    # Clean only the 'User FIPS' column: DataFrame.apply on the whole frame
    # would pass entire columns (Series of lists) to the NaN filter.
    # The filter is written inline with pd.notna so this cell does not depend
    # on a helper that is defined further down the notebook.
    per_user['User FIPS'] = per_user['User FIPS'].apply(
        lambda codes: [code for code in codes if pd.notna(code)]
    )

    loc_type_dict[loc_source] = per_user

In [73]:
# Trial run of the per-user aggregation for one user (1497), restricted to geo_source == 'user_location'.
test = df[(df['user_id']==1497) & (df['geo_source']=='user_location')].groupby(by='user_id').agg({'Tweet FIPS': 'sum', 'User FIPS': list})

# NOTE: remove_nan_from_list is defined in a later cell of this notebook; that cell must be run first.
test['User FIPS'] = test['User FIPS'].apply(remove_nan_from_list)

test

Unnamed: 0_level_0,Tweet FIPS,User FIPS
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1497,"[6065, 6059, 18179, 17031, 42081, 54015, 17011, 1061, 54015]",[]


In [38]:
# Group by user and aggregate each column with set().
df.groupby(by='user_id').agg(set)

Unnamed: 0_level_0,tweet_id,created_at,geo_source,User FIPS
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
364,{1223686944292581376},{2020-02-01 19:17:57+00:00},{user_location},{nan}
765,{1223756817718661120},{2020-02-01 23:55:36+00:00},{user_location},{nan}
1183,"{1223520438166290432, 1223416861427036160}","{2020-02-01 08:16:19+00:00, 2020-02-01 01:24:44+00:00}",{user_location},"{nan, nan}"
1378,{1223635851609022464},{2020-02-01 15:54:55+00:00},{tweet_text},{nan}
1497,"{1223413231408091136, 1223675564919283712, 1223647584327258112, 1223614102351773696, 1223614373555593216, 1223602783288549376, 1223697469265018880, 1223622205214425088, 1223620401797922816, 12237...","{2020-02-01 19:59:46+00:00, 2020-02-01 14:53:32+00:00, 2020-02-01 16:41:33+00:00, 2020-02-01 18:32:44+00:00, 2020-02-01 01:10:18+00:00, 2020-02-01 14:29:34+00:00, 2020-02-01 21:25:18+00:00, 2020-...",{user_location},"{nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan}"
...,...,...,...,...
1223751383360376832,{1223755447863316480},{2020-02-01 23:50:09+00:00},{user_location},{36067.0}
1223752292966420480,{1223752842772533248},{2020-02-01 23:39:48+00:00},{tweet_text},{nan}
1223754087713427456,{1223755941402771456},{2020-02-01 23:52:07+00:00},{tweet_text},{nan}
1223754434435600384,"{1223755420944207872, 1223756573765447680}","{2020-02-01 23:50:03+00:00, 2020-02-01 23:54:38+00:00}",{tweet_text},"{nan, nan}"


## Helper: remove NaN values from a list (looks unimportant, but the per-user aggregation cells need it)

In [51]:
def remove_nan_from_list(my_list):
    '''
    input = list (or other iterable) of values that may contain missing entries
    output = new list with every missing entry (NaN or None) removed, order preserved
    '''
    # pd.isna also recognises None; np.isnan raises TypeError on None.
    return [x for x in my_list if not pd.isna(x)]

## Percentage of GeoCov19 Tweets with county-level user data

In [72]:
# Rows that have a 'User FIPS', counted per geo_source and expressed as a percentage of all rows.
df[~df['User FIPS'].isnull()]['geo_source'].value_counts() / len(df) * 100

user_location   6.6260
place           0.0708
coordinates     0.0027
Name: geo_source, dtype: float64

# FINAL FUNCTION


* make FIPS df and dict
* import master list of CT HT and Link users

In [34]:
def final_func(IMPORT_PATH, EXPORT_PATH, date_str):
    '''
    Load one GeoCoV19 JSON-lines file and reduce it to tweets from users in
    MASTER_USER_LIST whose user_location maps to a FIPS code.

    input  = IMPORT_PATH: path of the JSON-lines file to read
             EXPORT_PATH, date_str: accepted but not used yet (the export step
             has not been written)
    output = DataFrame indexed by user_id, with 'Tweet FIPS', 'User FIPS' and
             'Date' columns added and 'created_at' removed

    Relies on notebook-level names: MASTER_USER_LIST, make_dict_uppercase,
    make_list_of_dicts_uppercase, get_fips_from_list_of_locs, get_fips_from_loc.
    '''

    df = pd.read_json(IMPORT_PATH, lines=True)

    # drop if not in master list of users
    # (isin tolerates listed users that have no tweet in this file, whereas
    #  .loc[list] raises KeyError for labels missing from the index)
    df = df.set_index('user_id')
    df = df[df.index.isin(MASTER_USER_LIST)].copy()

    # upper-case the location fields first: the FIPS helpers read upper-case
    # keys such as 'COUNTRY_CODE', so un-normalised records would never match
    df['user_location'] = df['user_location'].apply(make_dict_uppercase)
    df['tweet_locations'] = df['tweet_locations'].apply(make_list_of_dicts_uppercase)

    # convert location strings to FIPS
    df['Tweet FIPS'] = df['tweet_locations'].apply(get_fips_from_list_of_locs)
    df['User FIPS'] = df['user_location'].apply(get_fips_from_loc)

    # only keep obs with user_location
    # (the filtered frame must be assigned back; a bare expression has no effect)
    df = df[df['User FIPS'].notnull()].copy()

    # convert date to better format
    # (no explicit format: a date-only format would reject full timestamps)
    df['Date'] = pd.to_datetime(df['created_at']).dt.date
    del df['created_at']

    return df
    
    
    


Goal: end up with a table in which each row is a user and each column is a time period (week).

Entry (i, j) = the list of FIPS codes user i tweeted from in week j.