In [2]:
%matplotlib inline

from importlib.machinery import SourceFileLoader
import algosdk
import numpy as np
import pandas as pd
import datetime
import matplotlib.pyplot as plt
import os
import re
import json
import joblib

def _load_module_from_path(module_name, file_path):
    """Import a Python source file by path and return the module object.

    Replaces the deprecated ``SourceFileLoader(...).load_module()`` call with
    the spec / ``exec_module`` recipe from the importlib documentation.

    Parameters
    ----------
    module_name : str
        Name the module is registered under in ``sys.modules``.
    file_path : str
        Path to the ``.py`` source file.

    Returns
    -------
    module
        The executed module.

    Raises
    ------
    Whatever executing the source file raises (e.g. FileNotFoundError when
    the path does not exist).
    """
    import importlib.util
    import sys

    spec = importlib.util.spec_from_file_location(module_name, file_path)
    module = importlib.util.module_from_spec(spec)
    # Register before executing, as load_module() did, so the module can be
    # found under its own name while it runs.
    sys.modules[module_name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Do not leave a half-initialised module behind on failure.
        sys.modules.pop(module_name, None)
        raise
    return module


# Load custom modules from path.
covid19_WebScrapes = _load_module_from_path("covid19_WebScrapes", "./scripts/covid19_WebScrapes.py")
# NOTE: registered as "merge_data" (not "merge_prep_data"); the name is kept
# unchanged so anything that refers to the module by that name keeps working.
merge_prep_data = _load_module_from_path("merge_data", "./scripts/merge_prep_data.py")

# Show every column, and up to 78 rows, when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 78)

# Read in Cases Data on County Level

In [2]:
# Instantiate the county-level cases scraper from the custom scraping module.
covid19_county_level = covid19_WebScrapes.TestingData_Scraper()

# Build the final cases frame. Impute = True is passed to the scraper; the
# cell output reports "Imputing values where errors in cumulative stats."
Testing_DF = covid19_county_level.Get_Final_DF(Impute = True)

Imputing values where errors in cumulative stats.


In [3]:
# Preview the last rows of the cases frame as a quick sanity check.
Testing_DF.tail()

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,Combined_Key,Date,Positive,Deaths
384793,84070016,US,USA,840,,Central Utah,Utah,US,39.372319,-111.575868,"Central Utah, Utah, US",5/18/20,30,0
384794,84070017,US,USA,840,,Southeast Utah,Utah,US,38.996171,-110.701396,"Southeast Utah, Utah, US",5/18/20,15,0
384795,84070018,US,USA,840,,Southwest Utah,Utah,US,37.854472,-111.441876,"Southwest Utah, Utah, US",5/18/20,236,3
384796,84070019,US,USA,840,,TriCounty,Utah,US,40.124915,-109.517442,"TriCounty, Utah, US",5/18/20,22,0
384797,84070020,US,USA,840,,Weber-Morgan,Utah,US,41.27116,-111.914512,"Weber-Morgan, Utah, US",5/18/20,234,3


# Read in Algorand Blockchain Survey Data

In [4]:
# Read the API key from a local text file rather than hardcoding it here.
# The file content is loaded as a string and its first 8 characters are
# dropped -- presumably a fixed prefix such as "API_KEY=" (TODO confirm
# against local_var.txt).
# NOTE(review): np.loadtxt is used as a plain file reader; this looks like it
# only works when the file holds a single whitespace-free token -- confirm.
API_KEY=str(np.loadtxt('local_var.txt',dtype=str))[8:]

# Create the Algorand scraper with that key (the cell output shows node
# status and a block header being printed).
alg_tx = covid19_WebScrapes.Algorand_Scrape(API_KEY)

algod last round: 6761408
algod time since last round: 385096693
algod catchup: 0
algod latest version: https://github.com/algorandfoundation/specs/tree/8096e2df2da75c3339986317f9abe69d4fa86b4b
####################
{'hash': 'H7USWABILCOKRZB3TDLUCTZGG23H2FBGKQ3VVXRW4GZDFFTVHZ3Q', 'previousBlockHash': 'WQJUIOVJ6KNSSO22LXDSTYBS6G6BINXDAJBSBZQGDHMXJAPSLNSA', 'seed': 'R5WJROFK7OOJUT4DIKGQYJXECXHS7I7E6753CVGBIDP4UWOCGBWQ', 'proposer': '4ZODDDUGJIOXZF6GMV42OG3I3ZDPUTAZIM5UBQWZUHPKIYCX7RKEOHXSTY', 'round': 6761408, 'period': 0, 'txnRoot': 'WRS2VL2OQ5LPWBYLNBCZV3MEQ4DACSRDES6IUKHGOWYQERJRWC5A', 'reward': 122387, 'rate': 26000001, 'frac': 863288133, 'txns': {}, 'timestamp': 1589937828, 'currentProtocol': 'https://github.com/algorandfoundation/specs/tree/8096e2df2da75c3339986317f9abe69d4fa86b4b', 'nextProtocol': 'https://github.com/algorandfoundation/specs/tree/e5f565421d720c6f75cdd186f7098495caf9101f', 'nextProtocolApprovals': 10000, 'nextProtocolVoteBefore': 6708531, 'nextProtocolSwitchOn': 684

In [5]:
# Convert what the Algorand scraper collected into a DataFrame of survey fields.
Survey_DF = alg_tx.Convert_to_DF()

In [6]:
# Short survey field codes paired with the descriptive column names used
# downstream. Order here is the column order of the trimmed frame.
survey_field_names = {
    'gc': 'Country',
    'gr': 'Region',
    'gzp': 'Zip',
    'ga': 'AgeGroup',
    'gs': 'Gender',
    'sz': 'Symptomatic',
    'tz': 'Tested',
    'tt': 'Tested_Attempt',
    'tr': 'Test_Result',
    'mz': 'Received_Care',
    'qz': 'Quarantined',
    'q1': 'Q_Symptoms',
    'q2': 'Q_Voluntary',
    'q3': 'Q_Personal',
    'q4': 'Q_General',
    'ql': 'Left_Quarantine',
}

# Keep only the listed fields and give them their descriptive names.
Survey_DF_trim = Survey_DF[list(survey_field_names)].rename(columns=survey_field_names)
Survey_DF_trim.tail()

Unnamed: 0,Country,Region,Zip,AgeGroup,Gender,Symptomatic,Tested,Tested_Attempt,Test_Result,Received_Care,Quarantined,Q_Symptoms,Q_Voluntary,Q_Personal,Q_General,Left_Quarantine
20064,BR,SP,,50,m,-1,-1,,,-1,1,,,,True,1.0
20065,CN,37,,30,m,-1,-1,,,-1,-1,,,,,
20066,BR,CE,,20,f,-1,-1,,,-1,1,,,,True,1.0
20067,CN,50,,30,f,-1,-1,,,-1,-1,,,,,
20068,CN,51,,40,f,-1,-1,,,-1,-1,,,,,


# Read in Wikipedia Area Data

In [7]:
# Instantiate the Wikipedia scraper from the custom scraping module.
wiki_scraper = covid19_WebScrapes.Wiki_Scrape()

# Scrape per-county records (the cell output lists each state name as it runs).
county_areas = wiki_scraper.Scrape_Counties()

Alabama,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,Delaware,District of Columbia,Florida,Georgia_(U.S._state),Hawaii,Idaho,Illinois,Indiana,Iowa,Kansas,Kentucky,Louisiana,Maine,Maryland,Massachusetts,Michigan,Minnesota,Mississippi,Missouri,Montana,Nebraska,Nevada,New_Hampshire,New_Jersey,New_Mexico,New_York,North_Carolina,North_Dakota,Ohio,Oklahoma,Oregon,Pennsylvania,Rhode_Island,South_Carolina,South_Dakota,Tennessee,Texas,Utah,Vermont,Virginia,Washington,West_Virginia,Wisconsin,Wyoming,

In [8]:
# Wrap the scraped county records in a DataFrame with named columns.
County_Areas = pd.DataFrame(county_areas,columns=['State','County_FIPS','County','Area (sqmi)'])

In [9]:
# Area values may carry thousands separators (e.g. "1,589.78"): stringify each
# value, strip the commas, then store the column as floats.
area_as_text = County_Areas['Area (sqmi)'].map(str)
County_Areas['Area (sqmi)'] = area_as_text.str.replace(',', '', regex=False).astype(float)

In [13]:
# Preview the first rows to confirm the area column was converted.
County_Areas.head()

Unnamed: 0,State,County_FIPS,County,Area (sqmi)
0,Alabama,1,Autauga County,594.44
1,Alabama,3,Baldwin County,1589.78
2,Alabama,5,Barbour County,884.88
3,Alabama,7,Bibb County,622.58
4,Alabama,9,Blount County,644.78


# Read in Alphabet Location Data

In [11]:
# Instantiate the Alphabet (Google) location-data scraper from the custom module.
google = covid19_WebScrapes.Alphabet_Scrape_V2()

# With both country_only and state_only False, only county-level rows are returned.
google_df = google.get_Data(country='United States',country_only=False,state_only=False) # pulls county info only

# Preview the first rows of the location data.
google_df.head()

Unnamed: 0,country_region_code,country_region,sub_region_1,sub_region_2,date,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline
108360,US,United States,Alabama,Autauga County,02/15/20,5.0,7.0,,,-4.0,
108361,US,United States,Alabama,Autauga County,02/16/20,0.0,1.0,-23.0,,-4.0,
108362,US,United States,Alabama,Autauga County,02/17/20,8.0,0.0,,,-27.0,5.0
108363,US,United States,Alabama,Autauga County,02/18/20,-2.0,0.0,,,2.0,0.0
108364,US,United States,Alabama,Autauga County,02/19/20,-2.0,0.0,,,2.0,0.0


# Clean DataFrames, Add FIPS IDs

In [14]:
# Build the cleaner from two manually pulled reference files: a USDA FIPS code
# table and a state-name mapping file.
data_cleaner = covid19_WebScrapes.Clean_Data('manually_pulled/FIPS_Codes_USDA.csv',
                                             'manually_pulled/new_state_mapping.txt')

# Clean each raw frame with its dedicated method (area, cases, location data).
area_data_cleaned = data_cleaner.Clean_Area_Data(County_Areas)
test_data_cleaned = data_cleaner.Clean_Cases_Data(Testing_DF)
google_data_cleaned = data_cleaner.Clean_Loc_Data(google_df)

# Write out dataframes

In [17]:
# Write each frame into a folder named after today's date (DDMonYY),
# e.g. Processed_Data/18May20/.
folder_name = datetime.datetime.today().strftime('%d%b%y')
processed_dir = 'Processed_Data/' + folder_name

# makedirs(exist_ok=True) lets this cell be re-run on the same day (a bare
# os.mkdir raises FileExistsError) and also creates Processed_Data/ if missing.
os.makedirs(processed_dir, exist_ok=True)

# index=False: the row index is not written to the CSV files.
test_data_cleaned.to_csv(processed_dir + '/CountyLevel_Cases_Cleaned.csv', index=False)
Survey_DF_trim.to_csv(processed_dir + '/Survey_Data.csv', index=False)
area_data_cleaned.to_csv(processed_dir + '/CountyLevel_Areas_Cleaned.csv', index=False)
google_data_cleaned.to_csv(processed_dir + '/CountyLevel_Google_LocData_Cleaned.csv', index=False)

# Merge Data and Engineer Features

In [3]:
# Instantiate the merger from the custom merge/prep module.
data_merger = merge_prep_data.Merge_Data()

# Produce the two intermediate merges: census data and scraped data.
merged_census_data = data_merger.Merge_Census_Data()
merged_scraped_data = data_merger.Merge_Scraped_Data()

# Combine the two intermediate merges into a single frame.
final_merge = data_merger.MERGE_ALL(merged_scraped_data,merged_census_data)

In [4]:
# Columns to discard from the merged frame.
cols_remove = [
    'State_x', 'County_FIPS', 'County', 'Name',
    'UID', 'iso2', 'iso3', 'code3',
    'Province_State', 'Country_Region', 'Combined_Key',
    'country_region_code', 'country_region',
    'sub_region_1', 'sub_region_2', 'date',
    'unique_id', 'State_y', 'STATE_FIPS', 'Unnamed: 0',
    'county', 'state_FIPS', 'county_fips',
]

# Dropped in place, so final_merge itself is modified. Re-running this cell
# without re-running the merge raises KeyError because the columns are gone.
final_merge.drop(columns=cols_remove, inplace=True)

In [5]:
# Create the feature-engineering helper; datatype_write_loc points at the JSON
# file under Merged_Data/ that it is configured to use for data types.
engineer_feats = merge_prep_data.Engineer_Feats(datatype_write_loc='Merged_Data/data_types.json')

# Apply the feature logic to the merged frame. The cell output reports
# proximity logic, normalising/lagging stats and interpolation.
# NOTE(review): the pandas SettingWithCopy warnings in the output come from
# code inside the helper module, not from this cell.
DF_out = engineer_feats.Apply_Logic(final_merge)

Adding Proximity Logic...
Normalizing Stats , lagging features...
Interpolating Data...
retail_and_recreation_percent_change_from_baseline


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DF[col] = DF.groupby('state')[col].apply(lambda group: group.interpolate(method=stat_fill_method,axis=0).ffill().bfill())


grocery_and_pharmacy_percent_change_from_baseline
parks_percent_change_from_baseline
transit_stations_percent_change_from_baseline
workplaces_percent_change_from_baseline
residential_percent_change_from_baseline


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DF[col] = DF.groupby('state')[col].apply(lambda group: group.interpolate(method=stat_fill_method,axis=0,).ffill().bfill())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DF[col] = DF[col].astype(float)


Couldn't convert Admin2 to float
Couldn't convert Date to float
Couldn't convert state to float
Couldn't convert Closest_Big_City to float


In [6]:
# Columns retained in the written dataset, in output order. The groups below
# are only a reading aid; the result is one flat list.
cols_keep = (
    # Location and date identifiers
    [
        'FIPS', 'state', 'Admin2', 'Lat', 'Long_', 'Date',
    ]
    # Case / death counts and their normalised variants
    + [
        'Positive',
        'Deaths',
        'Positive_Cases_PopNormed',
        'Deaths_PopNormed',
        'New_Positive_Cases_PopNormed_Lagged',
        'New_Positive_Cases_PopNormed',
    ]
    # Area, population and density figures
    + [
        'Area (sqmi)',
        'population',
        'Proximity',
        'Population_Density',
        'Percent_in_Workforce',
        'Households_per_SqMile',
    ]
    # Percent-change-from-baseline mobility columns
    + [
        'retail_and_recreation_percent_change_from_baseline',
        'grocery_and_pharmacy_percent_change_from_baseline',
        'parks_percent_change_from_baseline',
        'transit_stations_percent_change_from_baseline',
        'workplaces_percent_change_from_baseline',
        'residential_percent_change_from_baseline',
    ]
    # Restriction / closure columns
    + [
        'Mass gathering restrictions',
        'Initial business closure',
        'Educational facilities closed',
        'Non-essential services closed',
        'Stay at home order',
        'Travel severely limited',
    ]
    # Household composition
    + [
        '%_family_households',
        '%_single_male_households',
        '%_single_female_households',
        '%_living_alone',
    ]
    # Household income, earnings and poverty
    + [
        'total_household_income',
        'household_income_less_than_25_years',
        'household_income_25_to_45_years',
        'household_income_45_to_65_years',
        'household_income_65_and_older',
        '%_households_with_earnings_last12',
        '%_households_with_no_earnings_last12',
        '%_in_poverty',
        '%_in_poverty_18_to_59',
        '%_in_poverty_60_to_74',
        '%_in_poverty_75_to_85',
    ]
    # Race, sex and age shares
    # NOTE(review): '$_other_race' looks like a typo for '%_other_race', but it
    # must match the column name in the merged data exactly -- do not change
    # it here without renaming the column upstream.
    + [
        '%_white',
        '%_black',
        '$_other_race',
        '%_male',
        '%_female',
        '%_male_pop_greater_than_60',
        '%_female_pop_greater_than_60',
    ]
    # Commute columns
    + [
        '%_workers_less_than_15_to_work',
        '%_workers_15_to_45_to_work',
        '%_workers_greater_than_45_to_work',
        '%_drive_alone_to_work',
        '%_carpool_to_work',
        '%_public_transit_to_work',
        '%_bus_trolley_to_work',
        '%_walked_to_work',
        '%_cab_other_means_of_transportation_to_work',
    ]
    # Incarceration rate
    + [
        'jail_incarceration_rate_per_100k',
    ]
)

In [7]:
# Select only the columns listed in cols_keep, in that order, for writing out.
DF_write = DF_out[cols_keep]

In [8]:
# Hand the selected frame to the helper's JSON writer -- presumably writes
# column data types to the datatype_write_loc given at construction (TODO confirm).
engineer_feats.write_out_json(DF_write)

In [42]:
# Size limits used to keep each written CSV small enough to push to GitHub.
# Both are measured against the frame's in-memory size, which is only a proxy
# for the size of the CSV on disk.
SPLIT_THRESHOLD_BYTES = 1e8     # split the output when the frame exceeds this
TARGET_CHUNK_BYTES = 45000000   # approximate in-memory size of each part

# Output goes to a folder named after today's date (DDMonYY).
folder_name = datetime.datetime.today().strftime('%d%b%y')
merged_dir = 'Merged_Data/' + folder_name
# exist_ok=True makes the cell safe to re-run; parents are created if missing.
os.makedirs(merged_dir, exist_ok=True)

memory_use = DF_write.memory_usage(deep=True, index=False).sum()

if memory_use > SPLIT_THRESHOLD_BYTES:
    print('splitting dataframes for Github push purposes')
    n_rows = DF_write.shape[0]
    # Rows per part, scaled so each part is roughly TARGET_CHUNK_BYTES.
    # max(1, ...) guards against a zero step when single rows are very large,
    # which would otherwise raise ZeroDivisionError.
    break_val = max(1, int((n_rows * TARGET_CHUNK_BYTES) / memory_use))
    # Parts are numbered from 1: Final_Merged_Pt1.csv, Final_Merged_Pt2.csv, ...
    for part_number, start in enumerate(range(0, n_rows, break_val), start=1):
        name_write = merged_dir + '/Final_Merged_Pt{}.csv'.format(part_number)
        DF_write.iloc[start:start + break_val].to_csv(name_write, index=False)
else:
    # Small enough to write as a single CSV file.
    DF_write.to_csv(merged_dir + '/Final_Merged.csv', index=False)

splitting dataframes for Github push purposes
