In [322]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [323]:
url = 'https://github.com/nytimes/covid-19-data/blob/master/us-counties-2020.csv?raw=true'
df = pd.read_csv(url)

In [324]:
df['date'] = pd.to_datetime(df['date'])

In [325]:
df_first_wave = df[df['date'] < '06/30/2020']

In [326]:
treatment_states = ['Minnesota', 'Montana','Nevada','North Carolina','Rhode Island', 'Alabama','Arizona','Florida','Georgia','Kansas','Maine','Maryland','Mississippi','Missouri','New Hampshire','South Carolina','Tennesse','Texas','Utah','Virgina']

In [327]:
control_states = ['Arkansas','Iowa','Nebraska','North Dakota','Oklahoma', 'South Dakota', 'Wyoming']

In [328]:
treatment_df = df_first_wave[df_first_wave['state'].isin(treatment_states)]

In [329]:
control_df = df_first_wave[df_first_wave['state'].isin(control_states)]

In [330]:
control_df['Treatment'] = False

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  control_df['Treatment'] = False


In [331]:
treatment_df['Treatment'] = True

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  treatment_df['Treatment'] = True


In [332]:
combined_df = pd.concat([control_df, treatment_df])

## Dropping values where 'county' is 'Unknown' as there is no way to impute. Losing 1157 records of 150050 initial total (0.77%)

In [333]:
#drop unknown county
combined_df = combined_df[combined_df['county'] != 'Unknown']

## Drop records where the 'county' is the same as the 'state.' These are considered to be state-wide counts of deaths and cases. Dropping 440 of 148893 current total of records (0.2%)

In [334]:
combined_df[combined_df['county'] == combined_df['state']]

Unnamed: 0,date,county,state,fips,cases,deaths,Treatment
1834,2020-03-13,Oklahoma,Oklahoma,40109.0,1,0.0,False
2195,2020-03-14,Oklahoma,Oklahoma,40109.0,1,0.0,False
2613,2020-03-15,Oklahoma,Oklahoma,40109.0,1,0.0,False
3077,2020-03-16,Oklahoma,Oklahoma,40109.0,2,0.0,False
3604,2020-03-17,Oklahoma,Oklahoma,40109.0,6,0.0,False
4219,2020-03-18,Oklahoma,Oklahoma,40109.0,14,0.0,False
4957,2020-03-19,Oklahoma,Oklahoma,40109.0,18,0.0,False
5824,2020-03-20,Oklahoma,Oklahoma,40109.0,19,0.0,False
6825,2020-03-21,Oklahoma,Oklahoma,40109.0,20,0.0,False
7940,2020-03-22,Oklahoma,Oklahoma,40109.0,26,0.0,False


In [335]:
#drop where state == county
combined_df = combined_df[combined_df['county'] != combined_df['state']]
combined_df


KeyboardInterrupt: 

In [None]:
#plot sum of cases per state by treatment in a bar graph
combined_df.groupby(['state', 'Treatment'])['cases'].sum().unstack().plot(kind='bar', figsize=(20,10))
plt.title('Total Cases by State in the First Wave')
plt.xlabel('State')
plt.ylabel('Total Cases')
plt.show()


In [None]:
#plot sum of cases per state by treatment in a bar graph
combined_df.groupby(['state', 'Treatment'])['deaths'].sum().unstack().plot(kind='bar', figsize=(20,10))
plt.title('Total Deaths by State in the First Wave')
plt.xlabel('State')
plt.ylabel('Total Deaths')
plt.show()

In [None]:
import pandas as pd


df_new = combined_df.groupby(['state', 'county', 'Treatment']).apply(
    lambda x: pd.DataFrame({
        'date': pd.date_range(start='2020-01-01', end=x['date'].max(), freq='D'),
        'state': x['state'].iloc[0],
        'county': x['county'].iloc[0],
        'Treatment': x['Treatment'].iloc[0]
    })
).reset_index(drop=True)

combined_df = pd.merge(df_new, combined_df, on=['date', 'state', 'county', 'Treatment'], how='outer').fillna(0)



In [None]:
combined_df

In [None]:
# assuming your dataframe is called df and you have a cumulative sum column called 'cum_deaths'
combined_df['deaths_per_day'] = df.groupby(['state', 'county'])['deaths'].diff()


In [None]:
#print entire dataframe
pd.set_option('display.max_rows', None)
combined_df[combined_df['county'] == 'La Paz']


In [None]:
combined_df[combined_df['deaths_per_day'] < 0]

In [None]:
# Group the data by county and calculate the deaths per day for each county, skipping the first day
county_data = combined_df.groupby(["state", "county"]).apply(lambda x: x.assign(cases_per_day=x["cases"].diff().fillna(x["cases"])))
county_data = county_data.reset_index(drop=True)
county_data = county_data.groupby(["state", "county"]).apply(lambda x: x.assign(cases_per_day=x["cases_per_day"].apply(lambda y: y if y == y else x["cases"].iloc[0] - x["cases"].iloc[0])))
county_data = county_data.reset_index(drop=True)

# Print the result
kansas = county_data[county_data['state'] == 'Kansas']
kansas['cases_per_day'].sum()

In [None]:
# Group the data by county and calculate the deaths per day for each county, skipping the first day
county_data_final = county_data.groupby(["state", "county"]).apply(lambda x: x.assign(deaths_per_day=x["deaths"].diff().fillna(x["deaths"])))
county_data_final = county_data_final.reset_index(drop=True)
county_data_final = county_data_final.groupby(["state", "county"]).apply(lambda x: x.assign(deaths_per_day=x["deaths_per_day"].apply(lambda y: y if y == y else x["deaths"].iloc[0] - x["deaths"].iloc[0])))
county_data_final = county_data_final.reset_index(drop=True)

# Print the result
kansas = county_data_final[county_data_final['state'] == 'Kansas']
kansas['deaths_per_day'].sum()

In [None]:
county_data_final[county_data_final['deaths_per_day'] < 0]

In [None]:
county_data_final['deaths_per_day'].describe()

In [None]:
county_data_final

In [None]:
anderson = kansas[kansas['county'] == 'Anderson']
anderson[anderson['date'] == '2020-06-16']

In [None]:
kansas = county_data[county_data['state'] == 'Kansas']
kansas[kansas['deaths_per_day'] <0 ]

In [None]:
# Group the data by county and calculate the deaths per day for each county, skipping the first day
county_data = combined_df.groupby("county").apply(lambda x: x.assign(deaths_per_day=x["deaths"].diff().fillna(x["deaths"])))
county_data["deaths_per_day"] = county_data["deaths_per_day"].apply(lambda x: x if type(x) != float or x != county_data["deaths_per_day"].iloc[0] else 0)

county_data = county_data.reset_index(drop=True)

# Print the result
county_data


In [None]:
kansas = county_data[county_data['state'] == 'Kansas']
kansas['deaths_per_day'].sum()

In [None]:
# Group the data by county and calculate the deaths per day for each county
county_data_final = county_data.groupby("county").apply(lambda x: x.assign(cases_per_day=x["cases"].diff().fillna(x["cases"])))
county_data_final = county_data_final.reset_index(drop=True)
# Print the result
county_data_final

In [None]:
county_data_final[county_data_final['county'] == 'Mecklenburg']

In [None]:
county_data_final.groupby(['state', 'Treatment'])['deaths_per_day'].sum().unstack().plot(kind='bar', figsize=(20,10))
plt.title('Total Deaths per Day by State in the First Wave')
plt.xlabel('State')
plt.ylabel('Total Deaths per Day')
plt.show()


In [None]:
kansas = county_data_final[county_data_final['state'] == 'Kansas']
kansas['deaths_per_day'].sum()

In [None]:
kansas['deaths_per_day'].describe()

In [None]:
kansas['cases_per_day'].describe()

In [None]:
kansas[kansas['county'] == 'Anderson']

In [None]:
kansas[kansas['cases_per_day'] < 0]