In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats
from scipy.stats import pearsonr

# Linear Regression and Correlation Analysis

## Pre-COVID Unemployment Data (2015-2019)

In [2]:
# Read the merged employment / opioid-death table; the first CSV column is
# used as the row index instead of being loaded as a data column.
unemployment = pd.read_csv(
    "../team_project/eda_data/Merged_complete_employment_death.csv",
    index_col=[0],
)

In [3]:
# List the distinct county names present in the table.
unemployment.loc[:, "County"].unique()

array(['Alameda', 'Alpine', 'Amador', 'Orange', 'Kern', 'Calaveras',
       'Butte', 'Colusa', 'Contra Costa', 'Del Norte', 'Imperial',
       'El Dorado', 'Fresno', 'Glenn', 'Kings', 'Humboldt', 'Inyo',
       'Lake', 'Lassen', 'Los Angeles', 'Madera', 'Mariposa', 'Mendocino',
       'Merced', 'Stanislaus', 'Modoc', 'Mono', 'Napa', 'Nevada',
       'Ventura', 'Placer', 'Plumas', 'Shasta', 'Riverside', 'Sacramento',
       'Monterey', 'San Benito', 'San Bernardino', 'San Diego',
       'San Francisco', 'San Luis Obispo', 'San Mateo', 'Marin',
       'Santa Clara', 'Santa Cruz', 'Santa Barbara', 'Sonoma', 'Sierra',
       'Siskiyou', 'San Joaquin', 'Sutter', 'Tehama', 'Trinity',
       'Tuolumne', 'Solano', 'Tulare', 'Yolo', 'Yuba'], dtype=object)

In [4]:
# Trim leading/trailing whitespace from the county names. County is used as
# the join key further down, so the labels need to match exactly.
unemployment["County"] = unemployment.loc[:, "County"].str.strip()

In [5]:
# Pre-COVID subset: every row whose Year is anything other than 2020.
rmv_2020 = unemployment.loc[unemployment["Year"].ne(2020)]
rmv_2020

Unnamed: 0,County,Year,Employment,Labor Force,Unemployment,Unemployment Rate,Opioid Death Rate,Opioid Death
0,Alameda,2015,779891.666667,819016.666667,39108.333333,4.775000,2.73,50.0
1,Alameda,2016,796025.000000,831816.666667,35791.666667,4.325000,2.18,39.0
2,Alameda,2017,807658.333333,838700.000000,31041.666667,3.700000,1.36,25.0
3,Alameda,2018,815691.666667,841483.333333,25783.333333,3.066667,2.28,42.0
4,Alameda,2019,815883.333333,841083.333333,25191.666667,3.000000,4.33,78.0
...,...,...,...,...,...,...,...,...
339,Yolo,2019,103383.333333,107925.000000,4525.000000,4.191667,2.49,6.0
341,Yuba,2016,25716.666667,28150.000000,2450.000000,8.641667,3.72,2.0
342,Yuba,2017,26583.333333,28733.333333,2150.000000,7.475000,7.63,5.0
343,Yuba,2018,27341.666667,29233.333333,1883.333333,6.466667,4.45,3.0


In [6]:
# Number of rows available for each pre-2020 year.
rmv_2020.loc[:, "Year"].value_counts()

2016    58
2017    58
2018    58
2019    58
2015    56
Name: Year, dtype: int64

In [7]:
# Per-county average of the unemployment count and unemployment rate,
# taken over the pre-2020 rows only.
precovid_county = rmv_2020.groupby("County")[
    ["Unemployment", "Unemployment Rate"]
].agg("mean")
precovid_county

Unnamed: 0_level_0,Unemployment,Unemployment Rate
County,Unnamed: 1_level_1,Unnamed: 2_level_1
Alameda,31383.333333,3.773333
Alpine,33.833333,6.07
Amador,748.0,5.13
Butte,6048.333333,5.99
Calaveras,1037.166667,4.966667
Colusa,1566.333333,14.385
Contra Costa,21925.0,3.955
Del Norte,667.5,6.853333
El Dorado,4043.333333,4.501667
Fresno,38571.666667,8.665


## During COVID-19 (2020)

In [8]:
# COVID-era subset: only the rows recorded for 2020.
only_2020 = unemployment.loc[unemployment["Year"].eq(2020)]
only_2020

Unnamed: 0,County,Year,Employment,Labor Force,Unemployment,Unemployment Rate,Opioid Death Rate,Opioid Death
5,Alameda,2020,742416.7,813800.0,71383.333333,8.825,7.74,138.0
11,Alpine,2020,460.0,518.3333,59.166667,11.925,0.0,0.0
17,Amador,2020,13105.83,14425.0,1317.5,9.166667,13.07,4.0
23,Orange,2020,1416742.0,1553308.0,136575.0,8.866667,14.13,459.0
29,Kern,2020,335708.3,383775.0,48066.666667,12.55,21.0,193.0
35,Calaveras,2020,19588.33,21208.33,1620.0,7.658333,7.0,2.0
41,Butte,2020,84083.33,92608.33,8508.333333,9.233333,5.19,13.0
47,Colusa,2020,8825.0,10500.83,1675.833333,15.866667,14.72,3.0
53,Contra Costa,2020,493241.7,541241.7,48033.333333,8.916667,11.45,130.0
59,Del Norte,2020,8467.5,9350.0,884.166667,9.5,9.01,3.0


In [9]:
# Attach each county's 2020 row to its pre-COVID averages, matched on county
# name. Column names that exist on both sides receive the _precovid / _covid
# suffixes so they can be told apart.
county_df = precovid_county.join(
    only_2020.set_index("County"),
    lsuffix="_precovid",
    rsuffix="_covid",
)

In [10]:
# Year was brought in from the 2020-only table, so it is not needed here.
# errors="ignore" keeps this cell safe to re-run: county_df is reassigned to
# itself, so without it a second execution raises KeyError because "Year"
# has already been removed.
county_df = county_df.drop(columns="Year", errors="ignore")
county_df.head()

Unnamed: 0_level_0,Unemployment_precovid,Unemployment Rate_precovid,Employment,Labor Force,Unemployment_covid,Unemployment Rate_covid,Opioid Death Rate,Opioid Death
County,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Alameda,31383.333333,3.773333,742416.666667,813800.0,71383.333333,8.825,7.74,138.0
Alpine,33.833333,6.07,460.0,518.333333,59.166667,11.925,0.0,0.0
Amador,748.0,5.13,13105.833333,14425.0,1317.5,9.166667,13.07,4.0
Butte,6048.333333,5.99,84083.333333,92608.333333,8508.333333,9.233333,5.19,13.0
Calaveras,1037.166667,4.966667,19588.333333,21208.333333,1620.0,7.658333,7.0,2.0


In [11]:
# Give the suffixed unemployment columns explicit upper-case names that state
# the period (pre-COVID average vs. 2020) and the measure (counts vs. rates).
county_df = county_df.rename(
    columns={
        "Unemployment_precovid": "PRECOVID_UNEMPLOYMENT_COUNTS",
        "Unemployment Rate_precovid": "PRECOVID_UNEMPLOYMENT_RATES",
        "Unemployment_covid": "COVID_UNEMPLOYMENT_COUNTS",
        "Unemployment Rate_covid": "COVID_UNEMPLOYMENT_RATES",
    }
)
county_df.head()

Unnamed: 0_level_0,PRECOVID_UNEMPLOYMENT_COUNTS,PRECOVID_UNEMPLOYMENT_RATES,Employment,Labor Force,COVID_UNEMPLOYMENT_COUNTS,COVID_UNEMPLOYMENT_RATES,Opioid Death Rate,Opioid Death
County,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Alameda,31383.333333,3.773333,742416.666667,813800.0,71383.333333,8.825,7.74,138.0
Alpine,33.833333,6.07,460.0,518.333333,59.166667,11.925,0.0,0.0
Amador,748.0,5.13,13105.833333,14425.0,1317.5,9.166667,13.07,4.0
Butte,6048.333333,5.99,84083.333333,92608.333333,8508.333333,9.233333,5.19,13.0
Calaveras,1037.166667,4.966667,19588.333333,21208.333333,1620.0,7.658333,7.0,2.0


In [12]:
# Load the per-county table of death rates/counts and hospital-facility metrics.
facility_df = pd.read_csv(
    "../team_project/eda_data/Merged_precovid_covid_deaths_and_hospital_metrics.csv"
)
facility_df.head()

Unnamed: 0,COUNTY,PRECOVID_RATES,PRECOVID_COUNTS,COVID_RATES,COVID_COUNTS,TOTAL_FAC,TOTAL_NUMBER_BEDS,COUNT_NOER_FAC,COUNT_ER_FAC,ER_BEDS,TOTALFAC_PER_POP,TOTALBEDS_PER_POP,ERFAC_PER_POP,ERBEDS_PER_POP,ERFAC_PER_AREA
0,Alameda,3.502,63.0,8.09,144.0,340.0,9074.0,327.0,13.0,2894.0,20.232315,539.964772,0.773588,172.212701,0.017589
1,Alpine,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Amador,6.556,3.0,13.07,4.0,5.0,251.0,4.0,1.0,52.0,12.362162,620.580527,2.472432,128.566484,0.001682
3,Butte,6.9,16.2,11.54,26.0,44.0,1387.0,41.0,3.0,467.0,22.348639,704.490045,1.523771,237.200325,0.001833
4,Calaveras,10.566,4.6,7.0,2.0,5.0,124.0,4.0,1.0,25.0,10.794706,267.708716,2.158941,53.973531,0.00098


In [13]:
# Left-join the facility metrics onto the unemployment table, matching the
# County index of county_df against facility_df's COUNTY column.
county_merged = county_df.join(facility_df.set_index("COUNTY"), how="left")

In [14]:
# Preview the first five rows of the joined table.
county_merged.head(n=5)

Unnamed: 0_level_0,PRECOVID_UNEMPLOYMENT_COUNTS,PRECOVID_UNEMPLOYMENT_RATES,Employment,Labor Force,COVID_UNEMPLOYMENT_COUNTS,COVID_UNEMPLOYMENT_RATES,Opioid Death Rate,Opioid Death,PRECOVID_RATES,PRECOVID_COUNTS,...,TOTAL_FAC,TOTAL_NUMBER_BEDS,COUNT_NOER_FAC,COUNT_ER_FAC,ER_BEDS,TOTALFAC_PER_POP,TOTALBEDS_PER_POP,ERFAC_PER_POP,ERBEDS_PER_POP,ERFAC_PER_AREA
County,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alameda,31383.333333,3.773333,742416.666667,813800.0,71383.333333,8.825,7.74,138.0,3.502,63.0,...,340.0,9074.0,327.0,13.0,2894.0,20.232315,539.964772,0.773588,172.212701,0.017589
Alpine,33.833333,6.07,460.0,518.333333,59.166667,11.925,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Amador,748.0,5.13,13105.833333,14425.0,1317.5,9.166667,13.07,4.0,6.556,3.0,...,5.0,251.0,4.0,1.0,52.0,12.362162,620.580527,2.472432,128.566484,0.001682
Butte,6048.333333,5.99,84083.333333,92608.333333,8508.333333,9.233333,5.19,13.0,6.9,16.2,...,44.0,1387.0,41.0,3.0,467.0,22.348639,704.490045,1.523771,237.200325,0.001833
Calaveras,1037.166667,4.966667,19588.333333,21208.333333,1620.0,7.658333,7.0,2.0,10.566,4.6,...,5.0,124.0,4.0,1.0,25.0,10.794706,267.708716,2.158941,53.973531,0.00098


In [15]:
# Remove columns that are not wanted in the exported table.
# errors="ignore" makes the cell idempotent: county_merged is reassigned to
# itself, so re-running the cell would otherwise raise KeyError on columns
# that the first run already dropped.
county_merged = county_merged.drop(
    columns=["Labor Force", "Opioid Death Rate", "Opioid Death"],
    errors="ignore",
)
county_merged.head()

Unnamed: 0_level_0,PRECOVID_UNEMPLOYMENT_COUNTS,PRECOVID_UNEMPLOYMENT_RATES,Employment,COVID_UNEMPLOYMENT_COUNTS,COVID_UNEMPLOYMENT_RATES,PRECOVID_RATES,PRECOVID_COUNTS,COVID_RATES,COVID_COUNTS,TOTAL_FAC,TOTAL_NUMBER_BEDS,COUNT_NOER_FAC,COUNT_ER_FAC,ER_BEDS,TOTALFAC_PER_POP,TOTALBEDS_PER_POP,ERFAC_PER_POP,ERBEDS_PER_POP,ERFAC_PER_AREA
County,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Alameda,31383.333333,3.773333,742416.666667,71383.333333,8.825,3.502,63.0,8.09,144.0,340.0,9074.0,327.0,13.0,2894.0,20.232315,539.964772,0.773588,172.212701,0.017589
Alpine,33.833333,6.07,460.0,59.166667,11.925,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Amador,748.0,5.13,13105.833333,1317.5,9.166667,6.556,3.0,13.07,4.0,5.0,251.0,4.0,1.0,52.0,12.362162,620.580527,2.472432,128.566484,0.001682
Butte,6048.333333,5.99,84083.333333,8508.333333,9.233333,6.9,16.2,11.54,26.0,44.0,1387.0,41.0,3.0,467.0,22.348639,704.490045,1.523771,237.200325,0.001833
Calaveras,1037.166667,4.966667,19588.333333,1620.0,7.658333,10.566,4.6,7.0,2.0,5.0,124.0,4.0,1.0,25.0,10.794706,267.708716,2.158941,53.973531,0.00098


In [16]:
# Turn the County index into an ordinary column and rename it to COUNTY.
# The guard keeps the cell idempotent: once COUNTY is a column, running
# reset_index() again would move the default integer index into a stray
# "index" column, which would then be written out with the CSV.
if "COUNTY" not in county_merged.columns:
    county_merged = county_merged.reset_index().rename(columns={"County": "COUNTY"})

In [17]:
# Write the merged table to CSV with COUNTY as the index (first) column.
county_merged.set_index("COUNTY").to_csv(
    "../team_project/eda_data/Unemployment_Hospital_Density_Merge.csv"
)