In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)

import numpy as np

import geopandas as gpd
import fiona

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
counties = pd.read_excel('covid-recovery-data.xlsx', sheet_name="county-population")
jobs = pd.read_excel('covid-recovery-data.xlsx', sheet_name="county-jobs-summary", usecols="B,C,D,F,G,I")
ccvi = pd.read_excel('covid-recovery-data.xlsx', sheet_name="ccvi-county", usecols="D:K")
countyShape = gpd.read_file("spatial/ga-counties.geojson")
countyClaims = pd.read_excel('covid-recovery-data.xlsx', sheet_name="unemployment-claims-monthly")
hospitals = gpd.read_file("spatial/hospitals.geojson")

In [3]:
hospitals = gpd.read_file("spatial/hospitals.geojson")

In [4]:
hospitals.sort_values('BEDS', inplace=True)

In [5]:
hospitals.BEDS = np.where(hospitals.BEDS < 0, 82,hospitals.BEDS)

In [6]:
counties = pd.merge(counties,jobs,how='left',on='FIPS')

In [7]:
counties = pd.merge(counties,ccvi,how='left',on='FIPS')

In [8]:
counties['FIPS'] = counties['FIPS'].astype(str)

In [9]:
counties = pd.merge(counties,countyShape, how='left', left_on='FIPS', right_on='GEOID')

In [10]:
del counties['GEOID']

In [11]:
# groupby county fips to get number of hospitals and number of beds per county
countyHospitals = hospitals.groupby('COUNTYFIPS') \
                           .agg({'NAME':'count', 'BEDS': 'sum'}) \
                           .reset_index() \
                           .rename(columns={'NAME':'Hospitals',
                                            'BEDS':'Beds',
                                            'COUNTYFIPS':'FIPS'})

In [12]:
counties = pd.merge(counties,countyHospitals, how='left', on='FIPS')

In [13]:
counties['Hospitals'].fillna(0, inplace=True)
counties['Beds'].fillna(0, inplace=True)

In [14]:
counties['Beds_per_1000'] = ( counties['Beds'] / counties['Total_Population'] ) * 1000
counties['Beds_per_1000_Elderly'] = ( counties['Beds'] / counties['Population_Over_65'] ) * 1000

In [16]:
# get clean covid data
covid = pd.read_csv('../application/app-data/covid-county-clean.csv')
covid = covid[['date','area_name','area_code','cases','deaths']]

covid.date = pd.to_datetime(covid.date)

covid['area_code'] = covid['area_code'].astype(int)
covid['area_code'] = covid['area_code'].astype(str)

# get new data from nyt
covidCountyNew = pd.read_csv('https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv')
covidCountyNew = covidCountyNew.loc[covidCountyNew['state'] == 'Georgia']

covidCountyNew = covidCountyNew[['date','county','fips','cases','deaths']]

# get state level data
covidStateNew = pd.read_csv('https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv')
covidStateNew = covidStateNew.loc[covidStateNew['state'] == 'Georgia']

covidCountyNew.rename(columns={'county' : 'area_name',
                               'fips'   : 'area_code'}, inplace=True)

covidStateNew.rename(columns={'state' : 'area_name',
                              'fips'  : 'area_code'}, inplace=True)

covidNew = covidStateNew.append(covidCountyNew)

# the covid data is reported as a cumulative sum
# we want the daily counts as well for different charting options
covidNew.date = pd.to_datetime(covidNew.date)
covidNew.sort_values(['area_name', 'date'], ascending=[True, True], inplace=True)



covidNew.dropna(subset=['area_code'], inplace=True)

covidNew['area_code'] = covidNew['area_code'].astype(int)
covidNew['area_code'] = covidNew['area_code'].astype(str)

covidNew.reset_index(inplace=True, drop=True)

covidNew.sort_values(['area_name', 'date'], ascending=[True, True], inplace=True)

# filter only dates AFTER 4/22
covidNew = covidNew.loc[covidNew['date'] >= '2020-04-23']
# append to cleaned sheet
covid = covid.append(covidNew)
covid.sort_values(['area_code','date'], inplace=True)
covid.reset_index(drop=True,inplace=True)

In [17]:
covid['daily_cases'] = np.where(covid['area_code'] == covid['area_code'].shift(), covid.cases - covid.cases.shift(), np.nan)

covid['daily_cases'].fillna(covid['cases'], inplace=True)

covid['daily_deaths'] = np.where(covid['area_code'] == covid['area_code'].shift(), covid.deaths - covid.deaths.shift(), np.nan)
covid['daily_deaths'].fillna(covid['deaths'], inplace=True)

In [18]:
# we dont' ned the state here as we'r just trying to mak a fram
# at the county level
covid = covid.loc[covid['area_name'] != 'Georgia']

In [19]:
# groupby county fips to get number of hospitals and number of beds per county
covidTotals = covid.groupby('area_code') \
                   .agg({'daily_cases':'sum', 'daily_deaths': 'sum'}) \
                   .reset_index() \
                   .rename(columns={'daily_cases':'Cases',
                                    'daily_deaths':'Deaths',
                                    'area_code':'FIPS'})

# add total new cases last two weeks
cutOff = covid['date'].max() - pd.Timedelta(days=13)
cutOffOne = covid['date'].max() - pd.Timedelta(days=6)
covidLastTwo = covid.loc[covid['date'] >= cutOff]

covidTwo = covidLastTwo.groupby('area_code') \
                                .agg({'daily_cases':'sum','daily_deaths':'sum'}) \
                                .reset_index() \
                                .rename(columns={'daily_cases':'Recent_Cases',
                                            'daily_deaths':'Recent_Deaths',
                                            'area_code':'FIPS'})

# add daily growth rate over last 14 days
covidLastTwo['Pct_Chng_Case'] = covidLastTwo.groupby('area_code')['cases'].pct_change()
covidLastTwo.Pct_Chng_Case = covidLastTwo.Pct_Chng_Case * 100
covidLastTwo['Pct_Chng_Case'].fillna(0, inplace=True)

covidTwoAvgChng = covidLastTwo.groupby('area_code') \
                                .agg({'Pct_Chng_Case':'mean'}) \
                                .reset_index() \
                                .rename(columns={'Pct_Chng_Case':'avg_daily_change',
                                                 'area_code':'FIPS'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


In [20]:
# new cases over the last 7 days
covidThisWk = covid.loc[covid['date'] >= cutOffOne]
covidThisWk = covidThisWk[['date','area_code','daily_cases','daily_deaths']]
covidThisWk.rename(columns={'daily_cases' : 'daily_cases_this_wk',
                            'daily_deaths': 'daily_deaths_this_wk'}, inplace=True)
# new cases over the previous 7-day period
covidPrevWk = covid[(covid['date'] >= cutOff) & (covid['date'] < cutOffOne)]
covidPrevWk = covidPrevWk[['date','area_code','daily_cases','daily_deaths']]
covidPrevWk.rename(columns={'daily_cases' : 'daily_cases_prev_wk',
                            'daily_deaths': 'daily_deaths_prev_wk'}, inplace=True)
# total change current week to previous week
covidThisWk = covidThisWk.groupby('area_code') \
                           .agg({'daily_cases_this_wk':'sum', 'daily_deaths_this_wk': 'sum'}) \
                           .reset_index() \
                           .rename(columns={'area_code':'FIPS',
                                            'daily_cases_this_wk' : 'cases_this_wk',
                                            'daily_deaths_this_wk': 'deaths_this_wk'})

covidPrevWk = covidPrevWk.groupby('area_code') \
                           .agg({'daily_cases_prev_wk':'sum', 'daily_deaths_prev_wk': 'sum'}) \
                           .reset_index() \
                           .rename(columns={'area_code':'FIPS',
                                            'daily_cases_prev_wk' : 'cases_prev_wk',
                                            'daily_deaths_prev_wk': 'deaths_prev_wk'})

covidWeek = pd.merge(covidThisWk,covidPrevWk,how='left',on='FIPS')

covidWeek['cases_diff'] = covidWeek['cases_this_wk'] - covidWeek['cases_prev_wk']
covidWeek['deaths_diff'] = covidWeek['deaths_this_wk'] - covidWeek['deaths_prev_wk']

In [61]:
# merge all of these covid totaled files at the county level
# covidTwoAvgChng,covidWeek,covidTwo,covidTotals
covid = pd.merge(covidTotals,covidTwo,how='left',on='FIPS')
covid = pd.merge(covid,covidWeek,how='left',on='FIPS')
covid = pd.merge(covid,covidTwoAvgChng,how='left',on='FIPS')

In [62]:
covid['FIPS'] = covid['FIPS'].astype(str)

In [63]:
countyList = counties[['FIPS','County','Total_Population']]

In [32]:
# merge counties with covid
counties = pd.merge(counties,covid, how='left', on='FIPS')

In [34]:
counties['Cases'].fillna(0, inplace=True)
counties['Deaths'].fillna(0, inplace=True)

In [35]:
# add custom columns on rates, etc.
counties['COVID_Death_Rate'] = ( counties['Deaths'] / counties['Cases'] ) * 1000
counties['COVID_Cases_Per_1000'] = ( counties['Cases'] / counties['Total_Population'] ) * 1000
# add total new cases per 1000
counties['Cases_Wk_Chng_100k'] = ( counties['cases_diff'] / counties['Total_Population'] ) * 100000
# add total new cases per 1000
counties['Deaths_Wk_Chng_100k'] = ( counties['deaths_diff'] / counties['Total_Population'] ) * 100000

In [36]:
countyClaims = countyClaims.melt(id_vars=["FIPS", "County","Year"], var_name="Month", value_name="Claims")

In [37]:
countyClaimsNew = countyClaims.loc[countyClaims['Year'] == 2020]
countyClaimsNew = countyClaimsNew.loc[countyClaimsNew['Month'] == "March"]
countyClaimsLast = countyClaims.loc[countyClaims['Year'] == 2020]
countyClaimsLast = countyClaimsLast.loc[countyClaimsLast['Month'] == "February"]

claims = pd.merge(countyClaimsNew, countyClaimsLast, how='left', on='FIPS')

In [38]:
claims.rename(columns={'Claims_x' : 'March_Claims',
                       'Claims_y' : 'February_Claims'}, inplace=True)

claims['Claims_Pct_Change'] = ( ( claims['March_Claims'] - claims['February_Claims'] ) / claims['February_Claims'] ) * 100

In [39]:
claims['Claims_Pct_Change'] = claims['Claims_Pct_Change'].round(1)

In [40]:
claims['FIPS'] = claims['FIPS'].astype(str)

In [41]:
claims = claims[['FIPS','March_Claims','February_Claims','Claims_Pct_Change']]

In [42]:
claims.sort_values('February_Claims', inplace=True)

In [43]:
counties = pd.merge(counties, claims, how='left', on='FIPS')

In [44]:
counties.sort_values('February_Claims', inplace=True)

In [45]:
counties['Cases_1000_Norm'] = (counties.iloc[ : , 29 ]-counties.iloc[ : , 29 ].min())/(counties.iloc[ : , 29 ].max()-counties.iloc[ : , 29 ].min()) * 100

In [46]:
counties = counties[['FIPS','County','Total_Population','Population_Under_18','Population_Over_18','Population_Over_65',
                     'Population_Pct_Over_65','Jobs','Jobs_Frequent_Disease_Exposure','Jobs_Pct_Disease_Exposure',
                     'Jobs_Frequent_Physical_Proximity','Jobs_Pct_Prox','Socioeconomic_Status',
                     'Household_Comp_Disability','Minority_Status_Language','Housing_Transportation',
                     'Epidemiology','Healthcare_System','CCVI_Score','Hospitals','Beds','Beds_per_1000','Pct_Uninsured',
                     'Beds_per_1000_Elderly','Cases','Deaths','COVID_Death_Rate','COVID_Cases_Per_1000','Cases_1000_Norm',
                     'Deaths_Wk_Chng_100k','Cases_Wk_Chng_100k','avg_daily_change',
                     'March_Claims','February_Claims','Claims_Pct_Change','geometry']]

counties.rename(columns={'avg_daily_change' : 'Cases_Avg_Pct_Chng_Daily'}, inplace=True)

In [47]:
counties['Cases_1000_Norm'].fillna(0,inplace=True)
counties['Deaths_Wk_Chng_100k'].fillna(0,inplace=True)
counties['Cases_Wk_Chng_100k'].fillna(0,inplace=True)
counties['Cases_Avg_Pct_Chng_Daily'].fillna(0,inplace=True)
counties['COVID_Death_Rate'].fillna(0,inplace=True)

In [48]:
counties = gpd.GeoDataFrame(counties, geometry='geometry')

In [49]:
countyPoints = counties.copy()

In [50]:
countyPoints['geometry'] = counties['geometry'].centroid

In [51]:
countyPoints.sort_values('Cases_1000_Norm', inplace=True)
countyPoints['Cases_1000_Norm'].round(1)
counties['COVID_Death_Rate'].fillna(0, inplace=True)
countyPoints['COVID_Death_Rate'].fillna(0, inplace=True)

In [52]:
countyData = counties[['FIPS','County','Cases','Deaths','Total_Population','Population_Under_18','Population_Over_18','Population_Over_65',
                     'Population_Pct_Over_65','Jobs','Jobs_Frequent_Disease_Exposure','Jobs_Pct_Disease_Exposure',
                     'Jobs_Frequent_Physical_Proximity','Jobs_Pct_Prox','Socioeconomic_Status',
                     'Household_Comp_Disability','Minority_Status_Language','Housing_Transportation',
                     'Epidemiology','Healthcare_System','CCVI_Score','Hospitals','Beds','Beds_per_1000','Pct_Uninsured',
                     'Beds_per_1000_Elderly','COVID_Death_Rate','COVID_Cases_Per_1000','Cases_1000_Norm',
                     'Deaths_Wk_Chng_100k','Cases_Wk_Chng_100k','Cases_Avg_Pct_Chng_Daily',
                     'March_Claims','February_Claims','Claims_Pct_Change']]

In [53]:
hospitals.to_excel('hospitals.xlsx',index=False)

In [54]:
countyHospitals.to_excel('hospitals.xlsx',index=False)

In [55]:
hospitals = hospitals[['ID','NAME','ADDRESS','CITY','STATE','ZIP','TYPE','STATUS','COUNTY','COUNTYFIPS',
                       'LATITUDE','LONGITUDE','NAICS_CODE','NAICS_DESC','BEDS','TRAUMA',
                       'HELIPAD','geometry']]

In [56]:
hospitals.to_file("spatial/mapbox/hospitals.geojson", driver='GeoJSON')

In [57]:
counties.to_file("spatial/mapbox/countyData.geojson", driver='GeoJSON')
countyPoints.to_file("spatial/mapbox/countyPoints.geojson", driver='GeoJSON')
countyData.to_excel('countyData.xlsx', index=False)
covid.to_excel('covid.xlsx', index=False)

In [58]:
countyData.sort_values('Cases',inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [69]:
# original covid file needs to be written out as separate 
# file for additional visualizations and tables
covidDW = pd.merge(countyList,covid,how='left',on='FIPS')
covidDW.fillna(0, inplace=True)

covidDW['Cases Change per 100k'] = ( covidDW['cases_diff'] / covidDW['Total_Population'] ) *100000
#rename all columns for datawrapper visuals
covidDW.rename(columns={'Cases'             : 'Total Cases',
                         'Deaths'           : 'Total Deaths',
                         'Recent_Cases'     : 'Recent Cases',
                         'Recent_Deaths'    : 'Recent Deaths',
                         'cases_this_wk'    : 'Cases This Week',
                         'cases_prev_wk'    : 'Cases Last Week',
                         'deaths_this_wk'   : 'Deaths This Week',
                         'deaths_prev_wk'   : 'Deaths Last Week',
                         'cases_diff'       : 'Change in Cases',
                         'deaths_diff'       : 'Change in Deaths',
                         'avg_daily_change' : 'Avg. Daily Change'}, inplace=True)

In [71]:
# repeat all of that covid manipulation with MSAs and Regional Commissions
# stack on top of each other and writ to single excel file in format(s) for datawrapper
covidDW.to_excel('../application/app-data/covidDataWrapper.xlsx', index=False)

In [None]:
# for correlations
dataPlot = counties[['County','Population_Over_65',
                     'Population_Pct_Over_65','Jobs','Jobs_Frequent_Disease_Exposure','Jobs_Pct_Disease_Exposure',
                     'Jobs_Frequent_Physical_Proximity','Jobs_Pct_Prox','Socioeconomic_Status',
                     'Epidemiology','Healthcare_System','CCVI_Score','Hospitals','Beds','Beds_per_1000','Pct_Uninsured',
                     'Beds_per_1000_Elderly','COVID_Death_Rate','COVID_Cases_Per_1000','Cases_1000_Norm',
                     'Claims_Pct_Change']]

In [None]:
# Compute the correlation matrix
corr = dataPlot.corr()

# Generate a mask for the upper triangle
mask = np.triu(np.ones_like(corr, dtype=np.bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

In [None]:
plt.show()