In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
from states import states

In [None]:
#read and format county diversity data
#(original author's note on the Diversity-Index scale: 0 = all white, 1 = all nonwhite)
div_df = pd.read_csv('datasets/diversity_index_counties.csv')
#convert state abbreviation to state name so the join keys match the other datasets.
#a plain dict lookup is kept on purpose: an unknown abbreviation raises KeyError
#instead of silently producing a missing state
div_df['State'] = [states[abbr] for abbr in div_df['State Abbr']]
#clean: drop the " County" suffix from the county names
div_df['County'] = [name.replace(' County', '') for name in div_df['County']]
div_df = div_df.sort_values(by = 'County').reset_index(drop=True)
#rename the per-group percentage columns to short names.
#'State' and 'County' already exist at this point, so the former
#'state'/'county' entries were dropped: they could only be a no-op or
#create duplicate column labels
div_df = div_df.rename(columns = {
    'Black or African American alone, percent, 2013':'Black-Index',
    'American Indian and Alaska Native alone, percent, 2013':'Native-Index',
    'Asian alone, percent, 2013':'Asian-Index',
    'Hispanic or Latino, percent, 2013':'Latin-Index',
    'White alone, not Hispanic or Latino, percent, 2013':'White-Index',
})
#narrow to the join keys plus the columns used in the analysis
div_df = div_df[['State', 'County', 'Diversity-Index',
                 'Black-Index',
                 'Native-Index',
                 'Asian-Index',
                 'Latin-Index',
                 'White-Index']]
#display
div_df

In [None]:
#load the 2020 election results, keep only the rows flagged as won,
#and reduce them to the join keys plus the winning party
el_df = (
    pd.read_csv('datasets/ElectionResults2020.csv')
    .loc[lambda rows: rows['won'] == True, ['state', 'county', 'party']]
    .rename(columns = {'state': 'State',
                       'county': 'County',
                       'party': '2020 Election Result'})
)
#drop the " County" suffix so county names line up with the other datasets
el_df['County'] = [name.replace(' County', '') for name in el_df['County']]
#order alphabetically by county and renumber the rows
el_df = el_df.sort_values(by = 'County').reset_index(drop = True)
el_df

In [None]:
#join diversity and election data on the (County, State) pair;
#merge defaults to an inner join, so only counties present in both survive
div_el_df = div_df.merge(el_df, on=['County', 'State'])
div_el_df

In [None]:
#load the 2020 census population estimates and keep state, county and population
pop_df = (
    pd.read_csv('datasets/uscensuspop2020.csv')
    .loc[:, ['STNAME', 'CTYNAME', 'ESTIMATESBASE2020']]
    .rename(columns = {'STNAME': 'State',
                       'CTYNAME': 'County',
                       'ESTIMATESBASE2020': 'Population'})
)
#drop the " County" suffix so county names line up with the other datasets
pop_df['County'] = [name.replace(' County', '') for name in pop_df['County']]
#order alphabetically by county and renumber the rows
pop_df = pop_df.sort_values(by = 'County').reset_index(drop = True)
pop_df

In [None]:
#add population to the diversity/election frame (inner join on County + State)
div_el_pop_df = div_el_df.merge(pop_df, on=['County', 'State'])
div_el_pop_df

In [None]:
#load county poverty estimates and give the columns of interest friendlier names
poverty_df = pd.read_csv('datasets/PovertyEstimates.csv').rename(
    columns = {'Stabr': 'State Abbv',
               'Area_name': 'County',
               'PCTPOVALL_2019': 'poverty rate'}
)
#full state name from the abbreviation (dict lookup: unknown abbreviations raise KeyError)
poverty_df['State'] = [states[abbr] for abbr in poverty_df['State Abbv']]
#drop the " County" suffix so county names line up with the other datasets
poverty_df['County'] = [name.replace(' County', '') for name in poverty_df['County']]
#keep the join keys and the poverty rate, ordered by county with a fresh index
poverty_df = (
    poverty_df[['State', 'County', 'poverty rate']]
    .sort_values(by = 'County')
    .reset_index(drop = True)
)
#display
poverty_df

In [None]:
#attach the poverty rate to the diversity/election/population frame
#(inner join on County + State; poverty columns come first in the result)
county_df = poverty_df.merge(div_el_pop_df, on=['County', 'State'])
county_df

In [None]:
#read covid-19 dataset (fetched from data.world each time this cell runs)
covid_df = pd.read_csv('https://query.data.world/s/254wzyu352gsbcze7tu3pckdjuuhm3')
#narrow dataset to only the United States
us_df = covid_df[covid_df['COUNTRY_SHORT_NAME'] == 'United States']
#original author's note: the counts are cumulative, so only the most recent
#report date is needed.
#Series.max() replaces the builtin max(): it is vectorised and skips missing values
#NOTE(review): assumes REPORT_DATE values order chronologically as stored
#(e.g. ISO yyyy-mm-dd text) — confirm against the dataset
latest_report = us_df['REPORT_DATE'].max()
today_df = us_df[us_df['REPORT_DATE'] == latest_report]
#narrow to only the columns of interest
today_df = today_df[['PROVINCE_STATE_NAME',
                     'COUNTY_NAME',
                     'PEOPLE_POSITIVE_CASES_COUNT',
                     'PEOPLE_DEATH_COUNT']]
#rename columns so the join keys match the county datasets
today_df = today_df.rename(columns = {'PROVINCE_STATE_NAME': 'State',
                                      'COUNTY_NAME': 'County',
                                      'PEOPLE_POSITIVE_CASES_COUNT': 'Cases',
                                      'PEOPLE_DEATH_COUNT': 'Deaths'})
#order alphabetically by county and renumber the rows
today_df = today_df.sort_values(by = 'County').reset_index(drop=True)
today_df

In [None]:
#combine the COVID totals with the county attributes; the outer join keeps
#rows that appear in only one of the two frames (their missing side is NaN)
df = today_df.merge(county_df, how = 'outer', on = ['State', 'County'])
#cases/deaths expressed as a percentage of the county population
df['% Cases'] = df['Cases'].div(df['Population']).mul(100)
df['% Deaths'] = df['Deaths'].div(df['Population']).mul(100)
#remove outliers: keep counties under 50% cases and under 1.1% deaths.
#a NaN percentage compares as False, so rows lacking cases, deaths or
#population are dropped by this filter as well
within_limits = (df['% Cases'] < 50) & (df['% Deaths'] < 1.1)
df = df[within_limits]
#display
df

In [None]:
#roll the county rows up to state level by summing cases, deaths and population
state_df = df.groupby('State')[['Cases', 'Deaths', 'Population']].sum()
#recompute the percentages from the state totals (not an average of county percentages)
state_df['% Cases'] = state_df['Cases'].div(state_df['Population']).mul(100)
state_df['% Deaths'] = state_df['Deaths'].div(state_df['Population']).mul(100)
#turn the State index back into an ordinary column
state_df = state_df.reset_index()
#display
state_df.head()

In [None]:
#bar chart: COVID-19 cases per state as a percentage of state population
#the figure size is set through rcParams, so it stays in effect for later
#figures until another cell changes it
plt.rcParams["figure.figsize"] = (15,5)
plt.bar(state_df['State'], state_df['% Cases'])
axes = plt.gca()
axes.set_title("US States: COVID-19 Cases from 2020 to today")
axes.set_ylabel("COVID-19 Cases as % of State Population")
#state names are long, so rotate them to run vertically
plt.xticks(rotation = 90)
plt.yticks(np.arange(0, 40, 5))
plt.savefig('graphs/all_states_cases.png')
plt.show()

In [None]:
#bar chart: COVID-19 deaths per state as a percentage of state population
plt.bar(state_df['State'], state_df['% Deaths'])
axes = plt.gca()
axes.set_title("US States: COVID-19 Deaths from 2020 to today")
axes.set_ylabel("COVID-19 Deaths as % of State Population")
#state names are long, so rotate them to run vertically
plt.xticks(rotation = 90)
plt.yticks(np.arange(0,0.5, 0.05))
plt.savefig('graphs/all_states_deaths.png')
plt.show()

In [None]:
#average every numeric column per 2020 election result (one row per winning party).
#numeric_only=True is required because df also holds text columns (State, County);
#without it recent pandas versions raise a TypeError instead of skipping them
grouped_el_df = df.groupby('2020 Election Result').mean(numeric_only=True)
#keep just the two rates and expose the party as a regular column
group_df = grouped_el_df[['% Cases', '% Deaths']].reset_index()
group_df

In [None]:
#two-sample t-tests: do county case/death rates differ by winning party?
#split the counties by the party that won them in 2020
result_col = df['2020 Election Result']
dem_df = df[result_col == 'DEM']
rep_df = df[result_col == 'REP']
#NOTE(review): ttest_ind is called with its defaults, i.e. the equal-variance
#form of the test — confirm that is the intended test for these two groups
#cases
stat_c, p_c = st.ttest_ind(dem_df['% Cases'], rep_df['% Cases'])
#deaths
stat_d, p_d = st.ttest_ind(dem_df['% Deaths'], rep_df['% Deaths'])
#p-values: cases first, then deaths
print(p_c, p_d, sep='\n')

In [None]:
#Cases: mean county case rate by the party that won the county in 2020
#look the two parties up by name instead of relying on the row order of
#group_df, so an extra or missing party cannot silently mislabel the bars
party_means = group_df.set_index('2020 Election Result')
plt.rcParams["figure.figsize"] = (5,5)
plt.bar(['Democratic Counties', 'Republican Counties'],
        party_means.loc[['DEM', 'REP'], '% Cases'])
#the y axis is cropped to 20-26, so bar heights are not proportional to the values
plt.yticks(np.arange(20,28,1))
plt.ylim(20,26)
plt.title("COVID-19 Cases in Counties by 2020 Election Results")
plt.ylabel("COVID-19 Cases as a % of Population")
#p_c is the t-test p-value for cases computed in the t-test cell
plt.xlabel(f"p = {p_c}")
plt.savefig('graphs/cases_by_party.png')
plt.show()

In [None]:
#Deaths: mean county death rate by the party that won the county in 2020
#look the two parties up by name instead of relying on the row order of
#group_df, so an extra or missing party cannot silently mislabel the bars
party_means = group_df.set_index('2020 Election Result')
plt.rcParams["figure.figsize"] = (5,5)
plt.bar(['Democratic Counties', 'Republican Counties'],
        party_means.loc[['DEM', 'REP'], '% Deaths'])
plt.yticks(np.arange(0,0.5,0.05))
plt.title("COVID-19 Deaths in Counties by 2020 Election Results")
plt.ylabel("COVID-19 Deaths as a % of Population")
#p_d is the t-test p-value for deaths computed in the t-test cell
plt.xlabel(f"p = {p_d}")
plt.savefig('graphs/deaths_by_party.png')
plt.show()

In [None]:
#poverty rate v. cases scatter plot
#black = all counties, blue = counties won by DEM, red = counties won by REP
x = df['poverty rate']
y = df['% Cases']
xd = dem_df['poverty rate']
yd = dem_df['% Cases']
xr = rep_df['poverty rate']
yr = rep_df['% Cases']

plt.rcParams["figure.figsize"] = (8,8)
plt.title("US Counties: Poverty Rate v. COVID-19 Cases")
plt.ylabel("COVID-19 Cases as a % of Population")
plt.xlabel("Poverty Rate")
plt.scatter(xd,yd,s = 1, color = 'b')
plt.scatter(xr,yr,s = 1, color = 'r')

#least-squares fits: overall, DEM counties, REP counties.
#the DEM p-value is stored as p_dem, NOT pd: unpacking into `pd` would
#overwrite the pandas alias and break every pd.read_csv / pd.merge cell
#on the next run in the same kernel
slope, intercept, r, p, std_err = st.linregress(x,y)
sloped, interceptd, rd, p_dem, std_errd = st.linregress(xd,yd)
sloper, interceptr, rr, pr, std_errr = st.linregress(xr,yr)

#draw each fitted line over the range of its own data
plt.plot(x,slope*x+intercept, color = 'k')
plt.plot(xd,sloped*xd+interceptd, color = 'b')
plt.plot(xr,sloper*xr+interceptr, color = 'r')

#correlation coefficients, coloured to match their lines
plt.text(10, 45, f"r = {round(r,4)}")
plt.text(10, 47, f"r = {round(rd,4)}", c='b')
plt.text(10, 49, f"r = {round(rr,4)}", c='r')
plt.savefig('graphs/poverty_v_cases.png')
plt.show()


In [None]:
#poverty rate v. deaths scatter plot
#black = all counties, blue = counties won by DEM, red = counties won by REP
x2 = df['poverty rate']
y2 = df['% Deaths']
xd2 = dem_df['poverty rate']
yd2 = dem_df['% Deaths']
xr2 = rep_df['poverty rate']
yr2 = rep_df['% Deaths']

#least-squares fits: overall, DEM counties, REP counties.
#the DEM p-value is stored as p_dem, NOT pd: unpacking into `pd` would
#overwrite the pandas alias and break every pd.read_csv / pd.merge cell
#on the next run in the same kernel
slope, intercept, r, p, std_err = st.linregress(x2,y2)
sloped, interceptd, rd, p_dem, std_errd = st.linregress(xd2,yd2)
sloper, interceptr, rr, pr, std_errr = st.linregress(xr2,yr2)

plt.title("US Counties: Poverty Rate v. COVID-19 Deaths")
plt.ylabel("COVID-19 Deaths as a % of Population")
plt.xlabel("Poverty Rate")
plt.scatter(xd2,yd2,s = 1, color = 'b')
plt.scatter(xr2,yr2,s = 1, color = 'r')

#draw each fitted line over the range of its own data
plt.plot(x2,slope*x2+intercept, color = 'k')
plt.plot(xd2,sloped*xd2+interceptd, color = 'b')
plt.plot(xr2,sloper*xr2+interceptr, color = 'r')

#correlation coefficients, coloured to match their lines.
#the overall r is drawn once only; it used to be written twice at the same
#coordinates with different rounding, producing overlapping text
plt.text(35, 0.1, f"r = {round(r,4)}")
plt.text(35, 0.2, f"r = {round(rd,4)}", c='b')
plt.text(35, 0.3, f"r = {round(rr,4)}", c='r')

plt.savefig('graphs/poverty_v_deaths.png')
plt.show()

In [None]:
#per-group population percentage v. cases scatter plot
#one entry per group: (column in df, plot colour, label, y position of the r annotation)
#a single loop replaces four copy-pasted stanzas, so every group is
#guaranteed to be scattered, fitted and annotated the same way
groups = [
    ('Black-Index', 'b', 'Black/African American', 10),
    ('Native-Index', 'g', 'Native American', 8),
    ('Asian-Index', 'm', 'Asian American', 6),
    ('Latin-Index', 'y', 'Hispanic/Latino', 4),
]
case_rate = df['% Cases']

for column, colour, label, text_y in groups:
    share = df[column]
    #least-squares fit of case rate against this group's percentage
    fit = st.linregress(share, case_rate)
    plt.scatter(share, case_rate, s = 1, c = colour)
    plt.plot(share, fit.slope*share + fit.intercept, color = colour)
    plt.text(50, text_y, f"{label}: r = {round(fit.rvalue,4)}", c = colour)

plt.title("US Counties: Diversity v. COVID-19 Cases")
plt.ylabel("COVID-19 Cases as a % of Population")
#NOTE(review): the x values are the per-group percentage columns, not the
#Diversity-Index, so this axis label may deserve rewording — confirm
plt.xlabel("Diversity (0 = all white)")

plt.savefig('graphs/diversity_breakdown_v_cases.png')
plt.show()

In [None]:
#per-group population percentage v. deaths scatter plot
#one entry per group: (column in df, plot colour, label, y position of the r annotation)
#a single loop replaces four copy-pasted stanzas, so every group is
#guaranteed to be scattered, fitted and annotated the same way
groups = [
    ('Black-Index', 'b', 'Black/African American', .08),
    ('Native-Index', 'g', 'Native American', .12),
    ('Asian-Index', 'm', 'Asian American', .16),
    ('Latin-Index', 'y', 'Hispanic/Latino', .20),
]
death_rate = df['% Deaths']

for column, colour, label, text_y in groups:
    share = df[column]
    #least-squares fit of death rate against this group's percentage
    fit = st.linregress(share, death_rate)
    plt.scatter(share, death_rate, s = 1, c = colour)
    plt.plot(share, fit.slope*share + fit.intercept, color = colour)
    plt.text(60, text_y, f"{label}: r = {round(fit.rvalue,3)}", c = colour)

plt.title("US Counties: Diversity v. COVID-19 Deaths")
plt.ylabel("COVID-19 Deaths as a % of Population")
#NOTE(review): the x values are the per-group percentage columns, not the
#Diversity-Index, so this axis label may deserve rewording — confirm
plt.xlabel("Diversity (0 = all white)")

#fix the y range to 0-1.1
plt.ylim(0,1.1)

plt.savefig('graphs/diversity_breakdown_v_deaths.png')

plt.show()


In [None]:
#Diversity-Index v. cases: scatter of all counties with a least-squares line
x = df['Diversity-Index']
y = df['% Cases']

#fit first, then draw the points and the fitted line (black)
slope, intercept, r, p, std_err = st.linregress(x,y)

plt.scatter(x, y, s = 1)
plt.plot(x, slope*x+intercept, color = 'k')

plt.title("US Counties: Diversity v. COVID-19 Cases")
plt.xlabel("Diversity Index (0 = all white)")
plt.ylabel("COVID-19 Cases as a % of Population")

#the r annotation and the savefig call for this figure are disabled:
#plt.text(10, 45, f"r = {round(r,4)}")
#plt.savefig('graphs/diversity_v_cases.png')
plt.show()


In [None]:
#Diversity-Index v. deaths: scatter of all counties with a least-squares line
x = df['Diversity-Index']
y = df['% Deaths']

#fit first, then draw the points and the fitted line (black)
slope, intercept, r, p, std_err = st.linregress(x,y)

plt.scatter(x, y, s = 1)
plt.plot(x, slope*x+intercept, color = 'k')

plt.title("US Counties: Diversity v. COVID-19 Deaths")
plt.xlabel("Diversity Index (0 = all white)")
plt.ylabel("COVID-19 Deaths as a % of Population")

#annotate the correlation coefficient near the top-left of the plot
plt.text(0.1, 1.0, f"r = {round(r,4)}")

plt.savefig('graphs/diversity_v_deaths.png')
plt.show()