In [38]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import altair as alt

print(pd.__version__)


1.1.3


In [45]:
COUNTY_DATA = 'data/nyc-us-counties.csv'
STATE_DATA = 'data/nyc-us-states.csv'
US_DATA = 'data/nyc-us.csv'

In [40]:
df = pd.read_csv(COUNTY_DATA)
df.dtypes

date       object
county     object
state      object
fips      float64
cases       int64
deaths      int64
dtype: object

In [41]:
xdf = df[df.date == df.date.max()]
xdf.head()

Unnamed: 0,date,county,state,fips,cases,deaths
647924,2020-10-20,Autauga,Alabama,1001.0,2010,29
647925,2020-10-20,Baldwin,Alabama,1003.0,6405,69
647926,2020-10-20,Barbour,Alabama,1005.0,988,9
647927,2020-10-20,Bibb,Alabama,1007.0,791,14
647928,2020-10-20,Blount,Alabama,1009.0,1848,24


In [42]:
xdf = df.groupby(by='county').agg({'cases': 'sum',
                            'deaths': 'sum',
                            'state': lambda x: x.iloc[1],
                            'fips': lambda x: x.iloc[1]})


In [43]:
xdf.deaths.max()

4222405

In [44]:
from vega_datasets import data

counties = alt.topo_feature(data.us_10m.url, 'counties')
#source = data.unemployment.url

url = 'county_data.json'
xdf.to_json(url, orient='records')

#alt.data_transformers.disable_max_rows()
alt.data_transformers.enable('json')

alt.Chart(counties).mark_geoshape().encode(
    color='deaths:Q',
    tooltip='id:Q'
).transform_lookup(
    lookup='id',
    from_=alt.LookupData(url, 'fips', ['deaths'])
).project(
    type='albersUsa'
).properties(
    width=500,
    height=300
)


### Analyzing States Data

In [46]:
df = pd.read_csv(STATE_DATA)
df.dtypes

date      object
state     object
fips       int64
cases      int64
deaths     int64
dtype: object

### Analyzing US Data

In [118]:
u_df = pd.read_csv(US_DATA)
u_df.dtypes

date      object
cases      int64
deaths     int64
dtype: object

In [119]:
u_df.tail()

Unnamed: 0,date,cases,deaths
269,2020-10-16,8090568,218494
270,2020-10-17,8143329,219173
271,2020-10-18,8191211,219541
272,2020-10-19,8255429,220058
273,2020-10-20,8316027,220987


In [120]:
# Calculate daily changes in confirmed cases with a given EMA in weeks
def getChangeConfCases(u_df, ema):
    d = u_df.cases.diff()
    tmp_df = pd.DataFrame(u_df)
    tmp_df['CASES_CHANGE'] = d
    tmp_df['CASES_EMA'] = d.ewm(span=7, adjust=False).mean()
    tmp_df.dropna(inplace=True)
    tmp_df = tmp_df.astype({'CASES_CHANGE': 'int64'})

    return tmp_df

# Calculate daily changes in deaths with a given EMA in weeks
def getChangeDeaths(u_df, ema):
    d = u_df.deaths.diff()
    tmp_df = pd.DataFrame(u_df)
    tmp_df['DEATHS_CHANGE'] = d
    tmp_df['DEATHS_EMA'] = d.ewm(span=7, adjust=False).mean()
    tmp_df.dropna(inplace=True)
    tmp_df = tmp_df.astype({'DEATHS_CHANGE': 'int64'})
    
    return tmp_df


In [121]:
p_df = getChangeConfCases(u_df,1)
p_df.tail()

Unnamed: 0,date,cases,deaths,CASES_CHANGE,CASES_EMA
269,2020-10-16,8090568,218494,70464,59394.694551
270,2020-10-17,8143329,219173,52761,57736.270914
271,2020-10-18,8191211,219541,47882,55272.703185
272,2020-10-19,8255429,220058,64218,57509.027389
273,2020-10-20,8316027,220987,60598,58281.270542


In [122]:
d_df = getChangeDeaths(u_df, 1)
d_df.tail(10)

Unnamed: 0,date,cases,deaths,CASES_CHANGE,CASES_EMA,DEATHS_CHANGE,DEATHS_EMA
264,2020-10-11,7794625,214606,44766.0,49492.420661,419,671.695327
265,2020-10-12,7840546,214957,45921.0,48599.565496,351,591.521495
266,2020-10-13,7894905,215783,54359.0,50039.424122,826,650.141121
267,2020-10-14,7954777,216792,59872.0,52497.568091,1009,739.855841
268,2020-10-15,8020104,217585,65327.0,55704.926069,793,753.141881
269,2020-10-16,8090568,218494,70464.0,59394.694551,909,792.10641
270,2020-10-17,8143329,219173,52761.0,57736.270914,679,763.829808
271,2020-10-18,8191211,219541,47882.0,55272.703185,368,664.872356
272,2020-10-19,8255429,220058,64218.0,57509.027389,517,627.904267
273,2020-10-20,8316027,220987,60598.0,58281.270542,929,703.1782


In [123]:
# Create an Altair bar chart to show daily changes over a weekly period
# df - DataFrame to be used for the plot
# field - variable to plot on the Y axis
def createIncrementalBarChart(df, field):
    if (field == 'CASES_CHANGE'):
        chartTitle = "Confirmed Cases"
        emaCol = "CASES_EMA"
    else:
        chartTitle = "Reported Deaths"
        emaCol = "DEATHS_EMA"

    axisTitle = "Daily Change"

    bar = alt.Chart(df).mark_bar(opacity=0.6).encode(
                x = alt.X("date:T", axis=alt.Axis(title='Date')),
                y = alt.Y(field + ":Q", stack=None, axis=alt.Axis(title=axisTitle))
            ).properties(
                width=640,
                height=480
            )
    line = alt.Chart(df).mark_line().encode(
            alt.X("date:T"),
            alt.Y(emaCol + ":Q"),
            color=alt.value('red')
    )
    layer = alt.layer(bar, line
            ).properties(
                title=chartTitle
            ).configure_title(
                fontSize=20,
                align='center',
                color='gray'  
            ).configure_axis(
                grid=False,
                titleFontSize=14
            )
    chart = bar + line
    # TODO - the configuration below does not seem to have any effect - fix
    chart.configure_title(
        fontSize=20,
        anchor='start',
        color='gray'  
    )
    return (layer)


In [124]:
p_df = getChangeConfCases(u_df,1)
chart = createIncrementalBarChart(p_df, "CASES_CHANGE")
chart

In [125]:
d_df = getChangeDeaths(u_df, 1)
dchart = createIncrementalBarChart(d_df, "DEATHS_CHANGE")
dchart