In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)

import numpy as np

# import geopandas as gpd
# import fiona

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
pwd

'/Users/michaelmainzer/Desktop/projects/covid-recovery/data'

In [3]:
# Geography lookup tables. Identifier columns are passed through `str` while
# loading so that codes are kept as text rather than parsed as numbers.
msa = pd.read_excel(
    '../../data/geographies/cbsa.xlsx',
    sheet_name='cbsa',
    usecols="A,D,M",
    converters={'area_code': str, 'CBSA Code': str},
)
rc = pd.read_excel(
    '../../data/geographies/regional-commissions.xlsx',
    sheet_name='county-rc',
    usecols="A,C,D",
    converters={"area_code": str},
)

In [4]:
# Order both lookup tables by area_code, ascending.
# The previous code passed ascending='False' (a string, not a bool). Older
# pandas treats any non-empty string as truthy, so that call really sorted
# ascending; recent pandas validates the argument and raises a ValueError.
# Relying on the default (ascending) keeps the ordering the notebook actually
# produced while working on every pandas version.
msa = msa.sort_values('area_code')
rc = rc.sort_values('area_code')

In [5]:
# Load the previously cleaned covid table, keeping only the columns used below.
covid = (
    pd.read_csv('../application/app-data/covid-clean.csv')
    [['date', 'area_name', 'area_code', 'cases', 'deaths']]
    .assign(
        # parse the date column into datetimes
        date=lambda frame: pd.to_datetime(frame['date']),
        # int -> str round trip so the code is a plain digit string
        area_code=lambda frame: frame['area_code'].astype(int).astype(str),
    )
)

In [None]:
# Rebuild `covid` from the NYT repository: Georgia counties plus the Georgia
# state-level series, using the same column names as the cleaned file.

# get new data from nyt (county level), Georgia rows only
covidCounty = pd.read_csv('https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv')
covidCounty = (
    covidCounty
    .loc[covidCounty['state'] == 'Georgia',
         ['date', 'county', 'fips', 'cases', 'deaths']]
    # rename on the chained result instead of in place on a .loc slice,
    # which avoids pandas' SettingWithCopy ambiguity
    .rename(columns={'county': 'area_name', 'fips': 'area_code'})
)

# get state level data
covidState = pd.read_csv('https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv')
covidState = (
    covidState
    .loc[covidState['state'] == 'Georgia']
    .rename(columns={'state': 'area_name', 'fips': 'area_code'})
)

# the covid data is reported as a cumulative sum
# we want the daily counts as well for different charting options
# (the daily columns are derived in a later cell; this cell only prepares the
# frame in area_name/date order)
covid = (
    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames
    # the same way
    pd.concat([covidState, covidCounty])
    # rows without a code cannot be matched on area_code later, so drop them
    .dropna(subset=['area_code'])
    .assign(
        date=lambda frame: pd.to_datetime(frame['date']),
        # int -> str round trip so the code is a plain digit string
        area_code=lambda frame: frame['area_code'].astype(int).astype(str),
    )
    .sort_values(['area_name', 'date'], ascending=[True, True])
    .reset_index(drop=True)
)

# TODO: filter only CURRENT DAY'S DATA
# TODO: append to cleaned sheet

In [6]:

# `cases` and `deaths` are cumulative; derive per-day counts as the change from
# the previous row of the same area.
#
# Sort explicitly first: the difference is only meaningful when each area's
# rows are in date order, and nothing earlier in the executed cells guarantees
# that for the frame read from disk.
covid = covid.sort_values(['area_code', 'date']).reset_index(drop=True)

# groupby().diff() never subtracts across two different areas. It yields NaN
# on each area's first row, where the daily count is the cumulative value
# itself. Assigning the filled result (rather than calling
# fillna(inplace=True) on a column selection) also stays correct under
# pandas copy-on-write.
covid['daily_cases'] = covid.groupby('area_code')['cases'].diff().fillna(covid['cases'])
covid['daily_deaths'] = covid.groupby('area_code')['deaths'].diff().fillna(covid['deaths'])

In [7]:
# Attach the CBSA lookup columns to every covid row; rows whose area_code has
# no match in `msa` are kept with NaN in the lookup columns.
covidMsa = covid.merge(msa, on='area_code', how='left')

In [8]:
# Keep only the rows that were matched to a CBSA.
covidMsa = covidMsa.loc[covidMsa['CBSA Code'].notna()]

In [9]:
# Collapse to one row per date and CBSA by summing every metric column.
msa_metrics = ['cases', 'deaths', 'daily_cases', 'daily_deaths']
covidMsa = (
    covidMsa
    .groupby(['date', 'CBSA Title', 'CBSA Code'])
    .agg(dict.fromkeys(msa_metrics, 'sum'))
    .reset_index()
)

In [10]:
# Attach the regional-commission lookup columns to every covid row; rows whose
# area_code has no match in `rc` are kept with NaN in the lookup columns.
covidRc = covid.merge(rc, on='area_code', how='left')

In [11]:
# Collapse to one row per date and regional commission by summing every
# metric column.
rc_metrics = ['cases', 'deaths', 'daily_cases', 'daily_deaths']
covidRc = (
    covidRc
    .groupby(['date', 'Regional Commission', 'RC_ID'])
    .agg(dict.fromkeys(rc_metrics, 'sum'))
    .reset_index()
)

In [12]:
# Give both aggregated frames the same column names as `covid` so that all
# three can be stacked into one table.
#
# Target layout of the final table:
#   date, area_code (state fips, county fips, cbsa, lwfda_no), area_name, cases,
#   deaths, daily_cases, daily_deaths, rolling_7_case_avg, rolling_7_death_avg
# (the rolling_7_* columns are not created in this cell)
rc_column_names = {'Regional Commission': 'area_name', 'RC_ID': 'area_code'}
msa_column_names = {'CBSA Title': 'area_name', 'CBSA Code': 'area_code'}

covidRc = covidRc.rename(columns=rc_column_names)
covidMsa = covidMsa.rename(columns=msa_column_names)

In [25]:
# Order every frame by area_code, then date (both ascending).
# The previous code passed ascending=['False','False'] -- strings, not bools.
# Older pandas treats non-empty strings as truthy, so the frames were really
# sorted ascending; recent pandas validates the argument and raises a
# ValueError. The default ascending sort reproduces what actually ran.
covidRc = covidRc.sort_values(['area_code', 'date'])
covidMsa = covidMsa.sort_values(['area_code', 'date'])
covid = covid.sort_values(['area_code', 'date'])

In [26]:
# Stack the base, regional-commission and CBSA frames into one table.
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames in
# the same order and keeps each frame's original index labels.
df = pd.concat([covid, covidRc, covidMsa])

In [27]:
# Show the last 100 rows of the stacked table.
df.iloc[-100:]

Unnamed: 0,date,area_name,area_code,cases,deaths,daily_cases,daily_deaths
1152,2020-04-18,"Valdosta, GA",46660,122,9,6.0,0.0
1191,2020-04-19,"Valdosta, GA",46660,129,9,7.0,0.0
1230,2020-04-20,"Valdosta, GA",46660,135,10,6.0,1.0
1269,2020-04-21,"Valdosta, GA",46660,158,10,23.0,0.0
1308,2020-04-22,"Valdosta, GA",46660,170,11,12.0,1.0
...,...,...,...,...,...,...,...
1155,2020-04-18,"Waycross, GA",48180,132,7,6.0,0.0
1194,2020-04-19,"Waycross, GA",48180,137,8,5.0,1.0
1233,2020-04-20,"Waycross, GA",48180,139,8,2.0,0.0
1272,2020-04-21,"Waycross, GA",48180,144,9,5.0,1.0


In [14]:
# Order the combined table by area_code, then date (both ascending).
# ascending=['False','False'] passed strings instead of bools: truthy on older
# pandas (so the sort was ascending anyway) and a ValueError on recent pandas.
# The default ascending sort matches the output the notebook produced.
df = df.sort_values(['area_code', 'date'])

In [15]:
# Replace the index labels left over from stacking and sorting with a fresh
# 0..n-1 range; the old labels are discarded.
df = df.reset_index(drop=True)

In [19]:
# Show the first five rows of the combined table.
df.head(n=5)

Unnamed: 0,date,area_name,area_code,cases,deaths,daily_cases,daily_deaths
0,2020-03-11,"Albany, GA",10500,1,0,1.0,0.0
1,2020-03-12,"Albany, GA",10500,1,0,0.0,0.0
2,2020-03-13,"Albany, GA",10500,1,0,0.0,0.0
3,2020-03-14,"Albany, GA",10500,1,0,0.0,0.0
4,2020-03-15,"Albany, GA",10500,8,0,7.0,0.0


In [17]:
# Write the combined table to the application's data folder, without the
# index column.
output_csv = '../application/app-data/covid.csv'
df.to_csv(output_csv, index=False)

In [None]:
# TODO: derived metrics still to be added
# - 7-day running average
# - total new cases over the last two weeks
# - total new cases per 1000
# - daily growth rate over the last 14 days
# - new cases over the last 7 days
# - new cases over the previous 7-day period
# - total change from the previous week to the current week
# - week-to-week change per 100k

In [None]:
# Show the first five rows of the base covid frame.
covid.head(n=5)