In [2]:
# Following the kaggle outline for data analysis
# Link: https://www.kaggle.com/pmarcelino/comprehensive-data-exploration-with-python/notebook
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
sns.set(style="whitegrid")  # can set style depending on how you'd like it to look
warnings.filterwarnings('ignore')
%matplotlib inline

In [3]:
# Opening the adult tobacco prevalence CSV file. The issue with this data set is that we cannot look at it by county.
# NOTE: suppressed values are stored in the file as the literal string '*', so the
# measurement columns that contain them are loaded as object (text) rather than numeric.
df = pd.read_csv('data/adult-tobacco-prevalence.csv')
df.shape

(52, 6)

In [4]:
# Raise the column display limit, then show every row (the frame has 52 rows).
pd.options.display.max_columns = 100
df.head(52)

Unnamed: 0,YEAR,DEMOGRAPHIC,PERCENT,SE,LOWER95,UPPER95
0,2016,Male,22.7,1.1,20.5,24.9
1,2016,Female,10.5,0.9,8.8,12.1
2,2016,Non-Hispanic African American/Black,16.8,2.5,12,21.6
3,2016,Non-Hispanic American Indian,45.7,15.9,14.6,76.9
4,2016,Non-Hispanic Asian/Pacific Islander,8.5,1.3,5.9,11.1
5,2016,Non-Hispanic Asian,8.7,1.4,6,11.4
6,2016,Non-Hispanic Pacific Islander,*,*,*,*
7,2016,Non-Hispanic White,17.8,1,15.8,19.8
8,2016,Hispanic,15.7,1.3,13.1,18.2
9,2016,LGBTQ,27.7,5.1,17.8,37.7


In [5]:
# Listing the column names to see which fields the data set provides
df.columns

Index(['YEAR', 'DEMOGRAPHIC', 'PERCENT', 'SE', 'LOWER95', 'UPPER95'], dtype='object')

In [6]:
# Checking the dtypes to know how to convert accordingly
# (object means pandas could not parse the column as numbers)
df.dtypes

YEAR            int64
DEMOGRAPHIC    object
PERCENT        object
SE             object
LOWER95        object
UPPER95        object
dtype: object

In [7]:
# PERCENT is an object column, so describe() reports count / unique / top / freq
# instead of numeric statistics; 'top' reveals the most common raw value.
df['PERCENT'].describe()

count     52
unique    38
top        *
freq       5
Name: PERCENT, dtype: object

In [8]:
# Summarising the demographic labels: how many rows, how many distinct groups,
# and which group occurs most often.
df['DEMOGRAPHIC'].describe()

count                     52
unique                    13
top       Non-Hispanic Asian
freq                       4
Name: DEMOGRAPHIC, dtype: object

In [9]:
# Label unavailable PERCENT values as 'Not available'.
# The file marks suppressed values with the literal string '*' instead of leaving
# the cell empty, so the column contains no NaN and fillna() alone changes nothing.
# Replace the placeholder first; fillna() is kept for any genuinely empty cells.
df["PERCENT"] = df["PERCENT"].replace('*', 'Not available').fillna('Not available')

Now let's look at fentanyl deaths per county! Because some of my teammates had data going back to 2015, I decided to use data from that year onward (2015-2020). I was able to find the data by county. One issue was that I had to download a separate CSV file for each year, so I looked for a way to combine them all into one CSV file. I will attempt to do this in the next few cells.

In [10]:
# Opening the 2015 data on its own, just to compare against the combined
# all_fentanyl_dfs frame that is built further down.
# No rows are skipped here, so the file's title line is used as the header.

fent_df_15 = pd.read_csv('fentanyl_data/CA_Fentanyl-Related Overdose _Death_by County_2015_01.10.2022.csv')
fent_df_15.shape

(64, 5)

In [11]:
# Peek at the raw layout: the real column names (County, Rates, ...) show up as a
# data row because the title line above them was taken as the header.
fent_df_15.head()

Unnamed: 0,California Deaths by County -Total Population - 2015,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,Fentanyl-Related Overdose : Age-Adjusted Rate...,,,,
1,County,Rates,95% LCL,95% UCL,Counts
2,Alameda,0.24,0.22,0.63,4
3,Alpine,0,,879.4,0
4,Amador,0,,19.07,0


In [12]:
# Checking the types of the raw 2015 frame
# (title/header text is mixed in with the values, so nothing is parsed as numeric)
fent_df_15.dtypes

California Deaths by County -Total Population - 2015     object
Unnamed: 1                                               object
Unnamed: 2                                               object
Unnamed: 3                                               object
Unnamed: 4                                               object
dtype: object

In [14]:
# Counting missing values per column in the raw 2015 frame
fent_df_15.isna().sum()

California Deaths by County -Total Population - 2015      0
Unnamed: 1                                                4
Unnamed: 2                                               32
Unnamed: 3                                                4
Unnamed: 4                                                3
dtype: int64

In [12]:
# Trying to combine them altogether. 
# Function to load a list of csv files 2015-2020
def load_dataset(name_path_dict, skiprows=2):
    '''Load several CSV files into a dict of dataframes.

    Parameters
    ----------
    name_path_dict : dict
        Maps a dataframe name to the path of the CSV file to read.
    skiprows : int, optional
        Forwarded to ``pd.read_csv``: how many leading lines to skip before
        the header row. Defaults to 2, the value this function always used,
        so existing calls behave exactly as before.

    Returns
    -------
    dict
        The same names, in the same order, each mapped to the DataFrame read
        from its path. An empty input yields an empty dict.
    '''
    return {
        name: pd.read_csv(path, skiprows=skiprows)
        for name, path in name_path_dict.items()
    }

In [14]:
# Name -> path mapping for the yearly county files, 2015 through 2020.
# The file names differ only in the year, so they are generated from a single
# template instead of being written out six times; keys ("data_2015", ...) and
# their order are the same as in the hand-written mapping.
FENTANYL_PATH_TEMPLATE = "fentanyl_data/CA_Fentanyl-Related Overdose _Death_by County_{year}_01.10.2022.csv"
path_dict = {
    f"data_{year}": FENTANYL_PATH_TEMPLATE.format(year=year)
    for year in range(2015, 2021)  # range end is exclusive, so this covers 2015-2020
}

# Load into dictionary of dataframe fentanyl_dfs
fentanyl_dfs = load_dataset(path_dict)

#Checking the data
fentanyl_dfs['data_2015']

Unnamed: 0,County,Rates,95% LCL,95% UCL,Counts
0,Alameda,0.24,0.22,0.63,4.0
1,Alpine,0.00,,879.40,0.0
2,Amador,0.00,,19.07,0.0
3,Butte,0.33,0.36,3.07,1.0
4,Calaveras,2.81,3.07,19.23,1.0
...,...,...,...,...,...
57,Ventura,0.60,0.32,1.45,5.0
58,Yolo,0.00,,2.57,0.0
59,Yuba,0.00,,7.37,0.0
60,Source: CDPH Center for Health Statistics and ...,,,,


In [18]:
# Cleaning up the dataframes
#
# Each yearly frame is reduced to County / Rates / Counts (the 95% LCL and
# 95% UCL columns are dropped), its trailing non-data lines are removed, and a
# Year column is added.

FOOTER_ROWS = 2    # trailing lines to discard from each frame (e.g. the "Source: ..." note)
FIRST_YEAR = 2015  # year of the first entry in fentanyl_dfs; later entries follow in order

for offset, key in enumerate(list(fentanyl_dfs)):
    frame = fentanyl_dfs[key]

    # A frame that already has a Year column was cleaned by an earlier run of
    # this cell. Skipping it keeps the cell idempotent: trimming the "last two
    # rows" a second time would remove real county rows.
    if "Year" in frame.columns:
        continue

    # Build a new frame rather than mutating a column slice in place.
    fentanyl_dfs[key] = (
        frame[["County", "Rates", "Counts"]]   # removing 95% LCL and 95% UCL columns
        .iloc[:-FOOTER_ROWS]                   # remove last 2 lines
        .assign(Year=FIRST_YEAR + offset)      # adding the year into each dataframe
    )

In [22]:
# Checking the updated data table
# NOTE: the statewide 'California' total is listed as a row alongside the counties.
fentanyl_dfs['data_2015']

Unnamed: 0,County,Rates,Counts,Year
0,Alameda,0.24,4.0,2015
1,Alpine,0.0,0.0,2015
2,Amador,0.0,0.0,2015
3,Butte,0.33,1.0,2015
4,Calaveras,2.81,1.0,2015
5,California,0.33,132.0,2015
6,Colusa,0.0,0.0,2015
7,Contra Costa,0.34,4.0,2015
8,Del Norte,0.0,0.0,2015
9,El Dorado,0.0,0.0,2015


In [23]:
# Same check for 2016, to confirm the Year column differs per frame
fentanyl_dfs['data_2016']

Unnamed: 0,County,Rates,Counts,Year
0,Alameda,0.5,9.0,2016
1,Alpine,0.0,0.0,2016
2,Amador,3.97,1.0,2016
3,Butte,0.26,1.0,2016
4,Calaveras,4.07,2.0,2016
5,California,0.59,239.0,2016
6,Colusa,0.0,0.0,2016
7,Contra Costa,0.4,4.0,2016
8,Del Norte,3.34,1.0,2016
9,El Dorado,0.64,1.0,2016


In [24]:
# Concatenate all years into one dataframe.
# Passing the dict directly makes its keys ("data_2015", ...) the outer level of
# the row index, with each frame's own row number as the inner level.
all_fentanyl_dfs = pd.concat(fentanyl_dfs)
all_fentanyl_dfs

Unnamed: 0,Unnamed: 1,County,Rates,Counts,Year
data_2015,0,Alameda,0.24,4.0,2015
data_2015,1,Alpine,0.00,0.0,2015
data_2015,2,Amador,0.00,0.0,2015
data_2015,3,Butte,0.33,1.0,2015
data_2015,4,Calaveras,2.81,1.0,2015
...,...,...,...,...,...
data_2020,51,Sutter,3.83,4.0,2020
data_2020,52,Tehama,1.04,1.0,2020
data_2020,53,Trinity,0.00,0.0,2020
data_2020,54,Tulare,6.25,29.0,2020


In [25]:
# Save into a CSV file for next step (cleaning, visualization).
# index=False leaves the (dataframe name, row number) index out of the file;
# the year of each row is still available through the Year column.
all_fentanyl_dfs.to_csv("fentanyl_data/Fentanyl_Deaths_by_County_2015-2020.csv", index=False)