In [35]:
%matplotlib inline

from importlib.machinery import SourceFileLoader
import algosdk
import numpy as np
import pandas as pd
import datetime
import matplotlib.pyplot as plt
import os
import re
import json
import joblib
import warnings
# SettingWithCopyWarning lives in pandas.errors on pandas >= 1.5; older releases only
# expose it from pandas.core.common, so fall back to that location when needed.
try:
    from pandas.errors import SettingWithCopyWarning
except ImportError:
    from pandas.core.common import SettingWithCopyWarning


# Silence pandas' chained-assignment warning for the whole notebook session.
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)


def _load_script_module(module_name, path):
    """Load a Python source file from ``path`` and return it as a module.

    Replaces the deprecated ``SourceFileLoader(...).load_module()`` call while
    keeping its visible effects: the module is executed and registered in
    ``sys.modules`` under ``module_name``.

    Parameters
    ----------
    module_name : str
        Name the module is registered under (becomes its ``__name__``).
    path : str
        Path of the ``.py`` file to execute.

    Returns
    -------
    module
        The freshly executed module object.

    Raises
    ------
    Exception
        Whatever the file raises while executing (including ``FileNotFoundError``
        for a missing path); the partial module is removed from ``sys.modules``.
    """
    import importlib.util
    import sys

    loader = SourceFileLoader(module_name, path)
    spec = importlib.util.spec_from_loader(module_name, loader)
    module = importlib.util.module_from_spec(spec)
    # Register before executing so code inside the module can resolve itself by name.
    sys.modules[module_name] = module
    try:
        loader.exec_module(module)
    except BaseException:
        sys.modules.pop(module_name, None)
        raise
    return module


# Load custom modules from their file paths.
covid19_WebScrapes = _load_script_module("covid19_WebScrapes", "./scripts/covid19_WebScrapes.py")
# The registered module name ("merge_data") intentionally stays as in the original call.
merge_prep_data = _load_script_module("merge_data", "./scripts/merge_prep_data.py")

# Show every column, and up to 78 rows, when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 78)

# Read in Cases Data on County Level

In [2]:
# Build the county-level scraper provided by the custom scraping module.
covid19_county_level = covid19_WebScrapes.TestingData_Scraper()

# Pull the final cases frame; imputation is explicitly switched off.
Testing_DF = covid19_county_level.Get_Final_DF(Impute=False)

In [3]:
# Preview the last five rows of the cases frame.
Testing_DF.tail(n=5)

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,Combined_Key,Date,Positive,Deaths
505450,84070016,US,USA,840,,Central Utah,Utah,US,39.372319,-111.575868,"Central Utah, Utah, US",06/24/20,117,0
505451,84070017,US,USA,840,,Southeast Utah,Utah,US,38.996171,-110.701396,"Southeast Utah, Utah, US",06/24/20,32,0
505452,84070018,US,USA,840,,Southwest Utah,Utah,US,37.854472,-111.441876,"Southwest Utah, Utah, US",06/24/20,1200,10
505453,84070019,US,USA,840,,TriCounty,Utah,US,40.124915,-109.517442,"TriCounty, Utah, US",06/24/20,40,0
505454,84070020,US,USA,840,,Weber-Morgan,Utah,US,41.27116,-111.914512,"Weber-Morgan, Utah, US",06/24/20,734,10


# Read in Algorand Blockchain Survey Data

In [5]:
# NOTE: The Algorand survey scrape below is deliberately disabled. It is currently broken and
# was not used anyway, because too little survey data existed at the time of training.
# The code is wrapped in a string literal, so none of it executes; the cell only echoes
# the string back as its output.
'''
API_KEY=str(np.loadtxt('local_var.txt',dtype=str))[8:]

alg_tx = covid19_WebScrapes.Algorand_Scrape(API_KEY)

Survey_DF = alg_tx.Convert_to_DF()

Survey_DF_trim = Survey_DF[['gc','gr','gzp','ga','gs','sz','tz','tt','tr','mz','qz','q1','q2','q3','q4','ql']]
Survey_DF_trim.columns = ['Country','Region','Zip','AgeGroup','Gender','Symptomatic',
                          'Tested','Tested_Attempt','Test_Result','Received_Care','Quarantined',
                          'Q_Symptoms','Q_Voluntary','Q_Personal','Q_General','Left_Quarantine']
Survey_DF_trim.tail()
'''

"\nAPI_KEY=str(np.loadtxt('local_var.txt',dtype=str))[8:]\n\nalg_tx = covid19_WebScrapes.Algorand_Scrape(API_KEY)\n\nSurvey_DF = alg_tx.Convert_to_DF()\n\nSurvey_DF_trim = Survey_DF[['gc','gr','gzp','ga','gs','sz','tz','tt','tr','mz','qz','q1','q2','q3','q4','ql']]\nSurvey_DF_trim.columns = ['Country','Region','Zip','AgeGroup','Gender','Symptomatic',\n                          'Tested','Tested_Attempt','Test_Result','Received_Care','Quarantined',\n                          'Q_Symptoms','Q_Voluntary','Q_Personal','Q_General','Left_Quarantine']\nSurvey_DF_trim.tail()\n"

# Read in Wikipedia Area Data

In [6]:
# Instantiate the Wikipedia scraper from the custom scraping module.
wiki_scraper = covid19_WebScrapes.Wiki_Scrape()

# Collect the per-county records (the cell output lists the states that were visited).
county_areas = wiki_scraper.Scrape_Counties()

Alabama,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,Delaware,District of Columbia,Florida,Georgia_(U.S._state),Hawaii,Idaho,Illinois,Indiana,Iowa,Kansas,Kentucky,Louisiana,Maine,Maryland,Massachusetts,Michigan,Minnesota,Mississippi,Missouri,Montana,Nebraska,Nevada,New_Hampshire,New_Jersey,New_Mexico,New_York,North_Carolina,North_Dakota,Ohio,Oklahoma,Oregon,Pennsylvania,Rhode_Island,South_Carolina,South_Dakota,Tennessee,Texas,Utah,Vermont,Virginia,Washington,West_Virginia,Wisconsin,Wyoming,

In [7]:
# Wrap the scraped county records in a DataFrame with explicit column labels.
County_Areas = pd.DataFrame(
    data=county_areas,
    columns=['State', 'County_FIPS', 'County', 'Area (sqmi)'],
)

In [8]:
# Strip thousands separators (e.g. "1,589.78") from the area text, then cast to float.
County_Areas['Area (sqmi)'] = (
    County_Areas['Area (sqmi)']
    .astype(str)
    .str.replace(',', '', regex=False)
    .astype(float)
)

In [9]:
# Preview the first five rows of the county area table.
County_Areas.head(n=5)

Unnamed: 0,State,County_FIPS,County,Area (sqmi)
0,Alabama,1,Autauga County,594.44
1,Alabama,3,Baldwin County,1589.78
2,Alabama,5,Barbour County,884.88
3,Alabama,7,Bibb County,622.58
4,Alabama,9,Blount County,644.78


# Read in Alphabet Location Data

In [10]:
# Scraper for the Google (Alphabet) mobility data, from the custom scraping module.
google = covid19_WebScrapes.Alphabet_Scrape_V2()

# With both *_only flags off, this pulls county info only.
google_df = google.get_Data(
    country='United States',
    country_only=False,
    state_only=False,
)

# Preview the first rows of the mobility frame.
google_df.head()

Unnamed: 0,country_region_code,country_region,sub_region_1,sub_region_2,iso_3166_2_code,census_fips_code,date,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline
226353,US,United States,Alabama,Autauga County,,1001.0,02/15/20,5.0,7.0,,,-4.0,
226354,US,United States,Alabama,Autauga County,,1001.0,02/16/20,0.0,1.0,-23.0,,-4.0,
226355,US,United States,Alabama,Autauga County,,1001.0,02/17/20,8.0,0.0,,,-27.0,5.0
226356,US,United States,Alabama,Autauga County,,1001.0,02/18/20,-2.0,0.0,,,2.0,0.0
226357,US,United States,Alabama,Autauga County,,1001.0,02/19/20,-2.0,0.0,,,2.0,0.0


# Read in Orders Data (stay at home, social distancing, etc.)

In [11]:
# Scraper for the orders data (stay-at-home, closures, ...), from the custom module.
orders = covid19_WebScrapes.OrdersScrape()

# Retrieve the orders table via the scraper's getzip() method.
orders_df = orders.getzip()

# Preview the first rows of the orders frame.
orders_df.head()

Unnamed: 0,location_name,peak_bed_day_mean,peak_bed_day_lower,peak_bed_day_upper,peak_icu_bed_day_mean,peak_icu_bed_day_lower,peak_icu_bed_day_upper,peak_vent_day_mean,peak_vent_day_lower,peak_vent_day_upper,all_bed_capacity,icu_bed_capacity,all_bed_usage,icu_bed_usage,available_all_nbr,available_icu_nbr,travel_limit_start_date,travel_limit_end_date,stay_home_start_date,stay_home_end_date,educational_fac_start_date,educational_fac_end_date,any_gathering_restrict_start_date,any_gathering_restrict_end_date,any_business_start_date,any_business_end_date,all_non-ess_business_start_date,all_non-ess_business_end_date
0,Abruzzo,2020-03-29,2020-03-28,2020-08-27,2020-04-01,2020-03-30,2020-08-27,2020-03-31,2020-03-29,2020-08-27,4270.0,153.0,3322.0,112.0,948.0,41.0,2020-03-22,2020-05-04,2020-03-11,2020-06-03,2020-03-05,,2020-03-11,,2020-03-11,,2020-03-11,2020-04-14
1,Acre,2020-08-04,2020-07-12,2020-08-27,2020-08-04,2020-07-15,2020-08-27,2020-08-04,2020-07-14,2020-08-27,1286.0,46.0,1007.0,39.0,279.0,7.0,,,,,2020-03-16,,2020-03-20,,2020-03-17,,2020-03-17,
2,Aguascalientes,2020-08-04,2020-05-22,2020-08-27,2020-08-04,2020-05-23,2020-08-27,2020-08-04,2020-05-20,2020-08-27,1276.0,80.0,896.0,67.0,380.0,13.0,,,,,2020-03-19,,2020-03-13,,2020-03-23,,2020-03-30,
3,Alabama,2020-09-21,2020-06-20,2020-12-27,2020-09-24,2020-06-21,2020-12-27,2020-09-21,2020-06-21,2020-12-27,17537.0,1525.0,11793.0,1050.0,5744.0,475.0,,,2020-04-04,2020-04-30,2020-03-19,,2020-03-19,,2020-03-19,2020-06-15,2020-03-28,2020-04-30
4,Alagoas,2020-08-04,2020-07-25,2020-08-26,2020-08-04,2020-07-27,2020-08-27,2020-08-04,2020-07-26,2020-08-27,5227.0,309.0,4088.0,261.0,1139.0,48.0,,,,,2020-03-17,,2020-03-16,,2020-04-07,,2020-04-07,2020-05-06


# Clean Dataframes, Add FIPS IDs

In [12]:
# Cleaner configured with two manually pulled reference files:
# the USDA FIPS code table and the state mapping file.
data_cleaner = covid19_WebScrapes.Clean_Data(
    'manually_pulled/FIPS_Codes_USDA.csv',
    'manually_pulled/new_state_mapping.txt',
)

# Clean each raw frame with its dedicated method (call order kept as in the original).
area_data_cleaned = data_cleaner.Clean_Area_Data(County_Areas)
test_data_cleaned = data_cleaner.Clean_Cases_Data(Testing_DF)
google_data_cleaned = data_cleaner.Clean_Loc_Data(google_df)
orders_data_cleaned = data_cleaner.Clean_Orders_Data(orders_df)

# Write out dataframes

In [30]:
# Date-stamped output folder name, e.g. '24Jun20'.
folder_name = datetime.datetime.strftime(datetime.datetime.today(),'%d%b%y')
processed_dir = 'Processed_Data/'+folder_name
# exist_ok=True keeps the cell re-runnable on the same day (a bare os.mkdir raises
# FileExistsError); makedirs also creates a missing 'Processed_Data' parent.
os.makedirs(processed_dir, exist_ok=True)

# Write each cleaned frame as CSV without the index column.
test_data_cleaned.to_csv(processed_dir+'/CountyLevel_Cases_Cleaned.csv',index=False)
#Survey_DF_trim.to_csv('Processed_Data/'+folder_name+'/Survey_Data.csv',index=False)
area_data_cleaned.to_csv(processed_dir+'/CountyLevel_Areas_Cleaned.csv',index=False)
google_data_cleaned.to_csv(processed_dir+'/CountyLevel_Google_LocData_Cleaned.csv',index=False)
orders_data_cleaned.to_csv(processed_dir+'/StateLevel_Orders_Cleaned.csv',index=False)

# Now Merge Data and Engineer some features

In [40]:
# Merger object from the custom merge/prep module.
data_merger = merge_prep_data.Merge_Data()

# Build the two intermediate frames: census data and scraped data.
merged_census_data = data_merger.Merge_Census_Data()
merged_scraped_data = data_merger.Merge_Scraped_Data()

# Combine both into one frame (scraped data is passed first, census data second).
final_merge = data_merger.MERGE_ALL(
    merged_scraped_data,
    merged_census_data,
)

In [43]:
# Columns to discard from the merged frame before feature engineering.
cols_remove = [
    'County_FIPS', 'County', 'UID', 'iso2', 'iso3', 'code3',
    'Province_State', 'Country_Region', 'Combined_Key',
    'country_region_code', 'country_region',
    'sub_region_1', 'sub_region_2', 'date',
    'State_fip', 'Unnamed: 0', 'county', 'state_FIPS',
    'county_fips', 'census_fips_code',
]

# Dropped in place: re-running this cell without rebuilding final_merge raises KeyError,
# because the columns are already gone.
final_merge.drop(columns=cols_remove, inplace=True)

In [44]:
# Feature engineer from the custom module, configured with the data-types JSON path.
engineer_feats = merge_prep_data.Engineer_Feats(
    datatype_write_loc='Merged_Data/data_types.json',
)

# Run the feature-engineering logic on the merged frame (progress is printed below).
DF_out = engineer_feats.Apply_Logic(final_merge)

Adding Proximity Logic...
Normalizing Stats , lagging features...
Interpolating Data...
retail_and_recreation_percent_change_from_baseline
grocery_and_pharmacy_percent_change_from_baseline
parks_percent_change_from_baseline
transit_stations_percent_change_from_baseline
workplaces_percent_change_from_baseline
residential_percent_change_from_baseline
Couldn't convert State to float
Couldn't convert Admin2 to float
Couldn't convert Date to float
Couldn't convert state to float
Couldn't convert Closest_Big_City to float


In [45]:
# Columns retained in the final written dataset, in output order.
cols_keep = [
    # Identifiers, location and date
    'FIPS',
    'state',
    'Admin2',
    'Lat',
    'Long_',
    'Date',
    # Case and death counts, raw and population-normalised
    'Positive',
    'Deaths',
    'Positive_Cases_PopNormed',
    'Deaths_PopNormed',
    'New_Positive_Cases_PopNormed_Lagged',
    'New_Positive_Cases_PopNormed',
    # Area, population and density
    'Area (sqmi)',
    'population',
    'Proximity',
    'Population_Density',
    'Percent_in_Workforce',
    'Households_per_SqMile',
    # Google mobility: percent change from baseline
    'retail_and_recreation_percent_change_from_baseline',
    'grocery_and_pharmacy_percent_change_from_baseline',
    'parks_percent_change_from_baseline',
    'transit_stations_percent_change_from_baseline',
    'workplaces_percent_change_from_baseline',
    'residential_percent_change_from_baseline',
    # Restriction / order columns
    'Mass gathering restrictions',
    'Initial business closure',
    'Educational facilities closed',
    'Non-essential services closed',
    'Stay at home order',
    'Travel severely limited',
    # Household composition
    '%_family_households',
    '%_single_male_households',
    '%_single_female_households',
    '%_living_alone',
    # Household income and earnings
    'total_household_income',
    'household_income_less_than_25_years',
    'household_income_25_to_45_years',
    'household_income_45_to_65_years',
    'household_income_65_and_older',
    '%_households_with_earnings_last12',
    '%_households_with_no_earnings_last12',
    # Poverty
    '%_in_poverty',
    '%_in_poverty_18_to_59',
    '%_in_poverty_60_to_74',
    '%_in_poverty_75_to_85',
    # Race and sex
    '%_white',
    '%_black',
    # NOTE(review): '$' prefix (not '%') is kept exactly as written; presumably it matches
    # the column name produced upstream — confirm before renaming.
    '$_other_race',
    '%_male',
    '%_female',
    '%_male_pop_greater_than_60',
    '%_female_pop_greater_than_60',
    # Commute time and mode
    '%_workers_less_than_15_to_work',
    '%_workers_15_to_45_to_work',
    '%_workers_greater_than_45_to_work',
    '%_drive_alone_to_work',
    '%_carpool_to_work',
    '%_public_transit_to_work',
    '%_bus_trolley_to_work',
    '%_walked_to_work',
    '%_cab_other_means_of_transportation_to_work',
    # Incarceration
    'jail_incarceration_rate_per_100k',
]

In [46]:
# Keep only the output columns. The explicit .copy() makes DF_write an independent frame,
# so any later assignment into it is unambiguous and does not depend on the notebook-wide
# suppression of SettingWithCopyWarning.
DF_write = DF_out[cols_keep].copy()

In [47]:
# Hand the final column subset to the feature engineer's JSON writer.
# NOTE(review): presumably records column dtypes at the `datatype_write_loc` path given
# when `engineer_feats` was constructed — confirm in merge_prep_data.Engineer_Feats.
engineer_feats.write_out_json(DF_write)

In [48]:
# Date-stamped output folder name, e.g. '24Jun20'.
folder_name = datetime.datetime.strftime(datetime.datetime.today(),'%d%b%y')
merged_dir = 'Merged_Data/'+folder_name
# makedirs(exist_ok=True) is safe on re-runs and also creates a missing 'Merged_Data' parent.
os.makedirs(merged_dir, exist_ok=True)

# In-memory footprint in bytes (deep=True inspects object columns; the index is excluded).
memory_use = DF_write.memory_usage(deep=True,index=False).sum()

if memory_use > 1e8 :
    print('splitting dataframes for Github push purposes')
    n_rows = DF_write.shape[0]
    # Rows per part, sized so each part holds roughly 45,000,000 bytes of in-memory data.
    # NOTE(review): in-memory size is only a proxy for the CSV size on disk.
    # Clamped to at least one row so a zero chunk size can never cause ZeroDivisionError.
    break_val = max(1, int((n_rows * 45000000)/memory_use))
    # Stepping through the row offsets yields ceil(n_rows / break_val) parts, numbered from 1.
    for i, start in enumerate(range(0, n_rows, break_val)) :
        name_write = merged_dir+'/Final_Merged_Pt{}.csv'.format(i+1)
        chunk_write = DF_write.iloc[start : start+break_val]
        chunk_write.to_csv(name_write,index=False)
else :
    DF_write.to_csv(merged_dir+'/Final_Merged.csv',index=False) #write out csv file

splitting dataframes for Github push purposes
