In [98]:
import pandas as pd
pd.set_option('display.max_columns', None)

import numpy as np

# import geopandas as gpd
# import fiona

import matplotlib.pyplot as plt
import seaborn as sns

In [99]:
# Show the kernel's current working directory (IPython automagic). The relative
# paths used to read and write data in the cells below resolve against it.
pwd

'/Users/michaelmainzer/OneDrive - Southern Company/projects/covid-recovery/data'

In [100]:
# Load the two county lookup tables. Identifier columns are forced to text via
# `converters` so they line up with the string-typed `fips` key used when these
# tables are merged onto the covid frame.
msa = pd.read_excel(
    io='../../data/geographies/cbsa.xlsx',
    sheet_name='cbsa',
    usecols="A,D,M",
    converters={'fips': str, 'CBSA Code': str},
)

# County -> regional commission lookup (only the county FIPS needs converting).
rc = pd.read_excel(
    io='../../data/geographies/regional-commissions.xlsx',
    sheet_name='county-rc',
    usecols="A,C,D",
    converters={"fips": str},
)

In [101]:
# Order both lookup tables by county FIPS, highest first.
# `ascending` has to be a real boolean: the string 'False' is truthy, so older
# pandas silently sorted ascending, and newer pandas rejects a str outright
# with a ValueError. Re-binding instead of `inplace=True` keeps the cell
# idempotent on re-run.
msa = msa.sort_values('fips', ascending=False)
rc = rc.sort_values('fips', ascending=False)

In [102]:
# covid = pd.read_excel('covid.xlsx')

# Download the NYT county-level series and keep only the Georgia rows.
# `.copy()` detaches the subset from the full download so the column
# assignments below operate on an independent frame instead of a slice
# (which is what raises SettingWithCopyWarning).
covid = pd.read_csv('https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv')
covid = covid.loc[covid['state'] == 'Georgia'].copy()

# the covid data is reported as a cumulative sum
# we want the daily counts as well for different charting options
covid['date'] = pd.to_datetime(covid['date'])
covid = covid.sort_values(['county', 'date'], ascending=[True, True])

# Rows with no county FIPS are given 13 (presumably Georgia's state FIPS --
# confirm), then the column goes float -> int -> str so it matches the
# string-typed `fips` keys of the lookup tables.
covid['fips'] = covid['fips'].fillna(13).astype(int).astype(str)

covid = covid.reset_index(drop=True)

In [103]:
# Order the rows by the (string) county FIPS code.
covid = covid.sort_values(by='fips')

In [104]:
# Relabel rows whose county is 'Unknown' as 'Georgia'; a later cell selects
# the rows named 'Georgia' as the state-level frame.
covid['county'] = covid['county'].replace(to_replace='Unknown', value='Georgia')

In [105]:
# Derive per-day counts from the cumulative series.
# Rows are ordered by county then date so each row's predecessor is the
# previous report for the same area.
covid = covid.sort_values(['county', 'date'], ascending=[True, True])

# True where the previous row belongs to the same area (same FIPS).
same_area = covid['fips'] == covid['fips'].shift()

for total_col, daily_col in (('cases', 'daily_cases'), ('deaths', 'daily_deaths')):
    # Day-over-day change, kept only inside an area; the first row of each
    # area has no predecessor and falls back to the cumulative value itself.
    # Values can be negative when a cumulative count is revised downward.
    day_change = covid[total_col].diff().where(same_area)
    # Assign the filled result back explicitly: calling
    # `covid[col].fillna(..., inplace=True)` is chained assignment, which is
    # deprecated and does not update the frame under pandas Copy-on-Write.
    covid[daily_col] = day_change.fillna(covid[total_col])

In [106]:
# Plan: build three separate frames and concatenate them into a single CSV:
#   1. counties plus the state (as produced so far)
#   2. MSAs
#   3. regional commissions (RCs)

# Independent copy, so the county frame can be reshaped without touching `covid`.
covidCounty = covid.copy(deep=True)

In [107]:
# Attach the CBSA columns to every county row; counties with no match in the
# lookup keep NaN in the CBSA columns (left join).
covidMsa = covidCounty.merge(msa, on='fips', how='left')

In [108]:
# Keep only rows that matched a CBSA in the lookup.
covidMsa = covidMsa.dropna(subset=['CBSA Code'])

In [109]:
# Roll the county rows up to one row per date and CBSA by summing every count column.
msa_group_keys = ['date', 'CBSA Title', 'CBSA Code']
msa_sum_spec = dict.fromkeys(['cases', 'deaths', 'daily_cases', 'daily_deaths'], 'sum')

covidMsa = covidMsa.groupby(msa_group_keys).agg(msa_sum_spec).reset_index()

In [110]:
# Attach the regional-commission columns to every county row (left join on FIPS).
covidRc = covidCounty.merge(rc, on='fips', how='left')

In [111]:
# Roll the county rows up to one row per date and regional commission by
# summing every count column.
rc_group_keys = ['date', 'Regional Commission', 'RC_ID']
rc_sum_spec = dict.fromkeys(['cases', 'deaths', 'daily_cases', 'daily_deaths'], 'sum')

covidRc = covidRc.groupby(rc_group_keys).agg(rc_sum_spec).reset_index()

In [112]:
# Give all frames the same column names and order so they can be stacked.
#
# Target layout of the final file:
#   date, area_code (state fips, county fips, cbsa, lwfda_no), area_name, cases,
#   deaths, daily_cases, daily_deaths, rolling_7_case_avg, rolling_7_death_avg
#
# NOTE(review): the frames built here carry area_name before area_code and do
# not have the rolling_7_* columns yet -- confirm the consumer reads by name.

covidRc = covidRc.rename(columns={'Regional Commission': 'area_name',
                                  'RC_ID': 'area_code'})

covidMsa = covidMsa.rename(columns={'CBSA Title': 'area_name',
                                    'CBSA Code': 'area_code'})

covidCounty = covidCounty.rename(columns={'county': 'area_name',
                                          'fips': 'area_code'})

# Drop the columns the other frames do not have and fix the column order.
shared_columns = ['date', 'area_name', 'area_code', 'cases', 'deaths', 'daily_cases', 'daily_deaths']
covidCounty = covidCounty[shared_columns]

In [113]:
# Split the rows labelled 'Georgia' (the state-level frame) from the real counties.
is_state_row = covidCounty['area_name'] == 'Georgia'

covidState = covidCounty.loc[is_state_row]
covidCounty = covidCounty.loc[~is_state_row]

In [117]:
# Stack the state, regional-commission, MSA and county frames into one table.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the supported
# equivalent and yields the same rows in the same order. Each frame's original
# index is kept here, exactly as `append` did.
df = pd.concat([covidState, covidRc, covidMsa, covidCounty])

In [119]:
# Replace the stacked (non-unique) index with a fresh 0..n-1 range; the old
# index values are moved into a column named 'index'.
df = df.reset_index(drop=False)

In [121]:
# Discard the 'index' column so it does not end up in the exported file.
df = df.drop(columns='index')

In [123]:
# Write the combined table to the application's data folder, without the row index.
df.to_csv(path_or_buf='../application/app-data/covid.csv', index=False)

In [None]:
# TODO: derived metrics still to be added
# - add 7-day running average
# - add total new cases over the last two weeks
# - add total new cases per 1,000
# - add daily growth rate over the last 14 days
# - new cases over the last 7 days
# - new cases over the previous 7-day period
# - total change, current week vs. previous week
# - week-to-week change per 100k

In [9]:
# Preview the first five rows of the Georgia frame.
# NOTE(review): the saved output below has an 'Unnamed: 0' column and an
# out-of-sequence execution count, so it looks like it came from a different
# run -- re-run this cell to refresh it.
covid.head(n=5)

Unnamed: 0,date,county,state,fips,cases,deaths,daily_cases,daily_deaths
0,2020-03-30,Appling,Georgia,13001,2,0,2.0,0.0
1,2020-03-31,Appling,Georgia,13001,1,0,-1.0,0.0
2,2020-04-01,Appling,Georgia,13001,1,0,0.0,0.0
3,2020-04-02,Appling,Georgia,13001,3,0,2.0,0.0
4,2020-04-03,Appling,Georgia,13001,5,0,2.0,0.0
