In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)

import numpy as np

import geopandas as gpd
import fiona

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
pwd

In [2]:
# all codes need to be brought in as strings
msa  =  pd.read_excel('../../data/geographies/cbsa.xlsx',
                      sheet_name='cbsa',
                      usecols="A,D,M",
                      converters={'area_code':str,'CBSA Code':str})
rc = pd.read_excel('../../data/geographies/regional-commissions.xlsx',
                   sheet_name='county-rc',
                   usecols="A,C,D",
                   converters={"area_code":str})

In [3]:
msa.sort_values('area_code', ascending='False', inplace=True)
rc.sort_values('area_code', ascending='False', inplace=True)

In [4]:
# get clean covid data
covid = pd.read_csv('../application/app-data/covid-county-clean.csv')
covid = covid[['date','area_name','area_code','cases','deaths']]

covid.date = pd.to_datetime(covid.date)

covid['area_code'] = covid['area_code'].astype(int)
covid['area_code'] = covid['area_code'].astype(str)

In [5]:
# make a single dataset with the right unique identifiers for each
covidCodes = covid[['area_name','area_code']]
covidCodes.drop_duplicates(inplace=True)
msaCodes = msa[['CBSA Title','CBSA Code']]
msaCodes.drop_duplicates(inplace=True)
msaCodes.rename(columns={'CBSA Title' : 'area_name',
                         'CBSA Code'  : 'area_code'}, inplace=True)
rcCodes = rc[['Regional Commission','RC_ID']]
rcCodes.drop_duplicates(inplace=True)
rcCodes.rename(columns={'Regional Commission' : 'area_name',
                        'RC_ID'               : 'area_code'}, inplace=True)

codes = covidCodes.append([rcCodes, msaCodes])

codes.reset_index(drop=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [6]:
# get new data from nyt
covidCountyNew = pd.read_csv('https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv')
covidCountyNew = covidCountyNew.loc[covidCountyNew['state'] == 'Georgia']

covidCountyNew = covidCountyNew[['date','county','fips','cases','deaths']]

# get state level data
covidStateNew = pd.read_csv('https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv')
covidStateNew = covidStateNew.loc[covidStateNew['state'] == 'Georgia']

covidCountyNew.rename(columns={'county' : 'area_name',
                               'fips'   : 'area_code'}, inplace=True)

covidStateNew.rename(columns={'state' : 'area_name',
                              'fips'  : 'area_code'}, inplace=True)

covidNew = covidStateNew.append(covidCountyNew)

# the covid data is reported as a cumulative sum
# we want the daily counts as well for different charting options
covidNew.date = pd.to_datetime(covidNew.date)
covidNew.sort_values(['area_name', 'date'], ascending=[True, True], inplace=True)



covidNew.dropna(subset=['area_code'], inplace=True)

covidNew['area_code'] = covidNew['area_code'].astype(int)
covidNew['area_code'] = covidNew['area_code'].astype(str)

covidNew.reset_index(inplace=True, drop=True)

covidNew.sort_values(['area_name', 'date'], ascending=[True, True], inplace=True)

# filter only dates AFTER 4/22
covidNew = covidNew.loc[covidNew['date'] >= '2020-04-23']
# append to cleaned sheet
covid = covid.append(covidNew)
covid.sort_values(['area_code','date'], inplace=True)
covid.reset_index(drop=True,inplace=True)

In [7]:
covid['daily_cases'] = np.where(covid['area_code'] == covid['area_code'].shift(), covid.cases - covid.cases.shift(), np.nan)

covid['daily_cases'].fillna(covid['cases'], inplace=True)

covid['daily_deaths'] = np.where(covid['area_code'] == covid['area_code'].shift(), covid.deaths - covid.deaths.shift(), np.nan)
covid['daily_deaths'].fillna(covid['deaths'], inplace=True)

In [8]:
covidMsa = pd.merge(covid,msa, how='left', on='area_code')

In [9]:
covidMsa.dropna(subset=['CBSA Code'], inplace=True)

In [10]:
covidMsa = covidMsa.groupby(['date','CBSA Title','CBSA Code']).agg({'cases'       :'sum',
                                                                    'deaths'      :'sum',
                                                                    'daily_cases' :'sum',
                                                                    'daily_deaths':'sum'}).reset_index()

In [11]:
covidRc = pd.merge(covid,rc, how='left', on='area_code')

In [12]:
covidRc = covidRc.groupby(['date','Regional Commission','RC_ID']).agg({'cases'       :'sum',
                                                                       'deaths'      :'sum',
                                                                       'daily_cases' :'sum',
                                                                       'daily_deaths':'sum'}).reset_index()

In [13]:
# match the column names and order of all frames

# final has to be the following
# date,area_code (state fips, county fips, cbsa, lwfda_no),area_name,cases,
# deaths,daily_cases,daily_deaths,rolling_7_case_avg,rolling_7_death_avg

covidRc.rename(columns={'Regional Commission' : 'area_name',
                        'RC_ID'               : 'area_code'}, inplace=True)

covidMsa.rename(columns={'CBSA Title' : 'area_name',
                         'CBSA Code'  : 'area_code'}, inplace=True)

In [14]:
covidRc = covidRc.set_index(['date','area_name']).unstack(fill_value=0).asfreq('D', fill_value=0).stack().sort_index(level=1).reset_index()
covidMsa = covidMsa.set_index(['date','area_name']).unstack(fill_value=0).asfreq('D', fill_value=0).stack().sort_index(level=1).reset_index()
covid = covid.set_index(['date','area_name']).unstack(fill_value=0).asfreq('D', fill_value=0).stack().sort_index(level=1).reset_index()

In [15]:
# each dataframe now has 0 in place of some of their area_codes because of dates added
# merge with correct area codes
covidRc = pd.merge(covidRc, codes, how='left', on='area_name')
covidRc.rename(columns={'area_code_y' : 'area_code'}, inplace=True)
covidRc = covidRc[['date','area_name','area_code','cases','daily_cases','deaths','daily_deaths']]
covidMsa = pd.merge(covidMsa, codes, how='left', on='area_name')
covidMsa.rename(columns={'area_code_y' : 'area_code'}, inplace=True)
covidMsa = covidMsa[['date','area_name','area_code','cases','daily_cases','deaths','daily_deaths']]
covid = pd.merge(covid, codes, how='left', on='area_name')
covid.rename(columns={'area_code_y' : 'area_code'}, inplace=True)
covid = covid[['date','area_name','area_code','cases','daily_cases','deaths','daily_deaths']]

In [16]:
covidRc.sort_values(['area_code','date'], ascending=['False','False'], inplace=True)
covidMsa.sort_values(['area_code','date'], ascending=['False','False'], inplace=True)
covid.sort_values(['area_code','date'], ascending=['False','False'], inplace=True)

In [17]:
df = covid.append([covidRc, covidMsa])

In [18]:
df.reset_index(drop=True,inplace=True)

In [19]:
# get 7 day rolling average for cases
groupedCases = df.groupby(['area_name']).apply(lambda x: x.set_index('date').resample('1D').first())

groupedCases = groupedCases.groupby(level=0)['daily_cases'] \
                            .apply(lambda x: x.shift().rolling(min_periods=6,window=7).mean()) \
                            .reset_index(name='avg_cases_7')

# get 7 day rolling average for deaths
groupedDeaths = df.groupby(['area_name']).apply(lambda x: x.set_index('date').resample('1D').first())

groupedDeaths = groupedDeaths.groupby(level=0)['daily_deaths'] \
                             .apply(lambda x: x.shift().rolling(min_periods=6,window=7).mean()) \
                             .reset_index(name='avg_deaths_7')

In [20]:
df = pd.merge(df, groupedCases, how='left', on=['date', 'area_name'])
df = pd.merge(df, groupedDeaths, how='left', on=['date', 'area_name'])

In [21]:
df["dateTwo"] = pd.to_datetime(df["date"]).dt.strftime('%b %d')

In [22]:
df["date"] = pd.to_datetime(df["date"]).dt.strftime('%m-%d-%Y')

In [23]:
df = df[['date','dateTwo','area_name','area_code','daily_cases','avg_cases_7','daily_deaths','avg_deaths_7']]

In [28]:
df.head(112)

Unnamed: 0,date,dateTwo,area_name,area_code,daily_cases,avg_cases_7,daily_deaths,avg_deaths_7
0,03-02-2020,Mar 02,Georgia,13,2.0,,0.0,
1,03-03-2020,Mar 03,Georgia,13,0.0,,0.0,
2,03-04-2020,Mar 04,Georgia,13,0.0,,0.0,
3,03-05-2020,Mar 05,Georgia,13,0.0,,0.0,
4,03-06-2020,Mar 06,Georgia,13,1.0,,0.0,
...,...,...,...,...,...,...,...,...
107,04-23-2020,Apr 23,Appling County,13001,0.0,4.714286,0.0,0.285714
108,04-24-2020,Apr 24,Appling County,13001,0.0,4.285714,0.0,0.285714
109,04-25-2020,Apr 25,Appling County,13001,0.0,4.142857,0.0,0.285714
110,03-02-2020,Mar 02,Atkinson County,13003,0.0,,0.0,


In [29]:
df.to_csv('../application/app-data/covid.csv',index=False)

In [None]:
# add total new cases last two weeks
# add total new cases per 1000
# add daily growth rate over last 14 days
# new cases over the last 7 days
# new cases over the previous 7-day period
# total change current week to previous week
# week to week change per 100k

In [None]:
covid.head()