# Drug Poisoning Data Analysis

Import libraries

In [2]:
import numpy as np
import pandas as pd

from sklearn import preprocessing

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

Load in data

In [3]:
# Read the dataset from data.csv (path is relative to the working directory).
df = pd.read_csv("data.csv")
# Display (rows, columns) as a quick check that the file loaded.
df.shape

(2703, 18)

In [4]:
# Preview only the first ten rows: the frame has a few thousand rows, so
# displaying it whole bloats the notebook without adding information.
df.head(10)

Unnamed: 0,Year,Sex,Age,Race and Hispanic Origin,State,Deaths,Population,Crude Death Rate,Standard Error for Crude Rate,Low Confidence Limit for Crude Rate,Upper Confidence Limit for Crude Rate,Age-adjusted Rate,Standard Error Age-adjusted Rate,Lower Confidence Limit for Age-adjusted rate,Upper Confidence Limit for Age-adjusted Rate,State Crude Rate in Range,US Crude Rate,US Age-adjusted Rate
0,2014,Both Sexes,15-24 years,All Races-All Origins,United States,3798,43979821,8.6,0.1,8.4,8.9,,,,,,14.8,14.7
1,2012,Male,15-24 years,Non-Hispanic White,United States,2137,12857283,16.6,0.4,15.9,17.3,,,,,,13.2,13.1
2,2008,Male,Less than 15 years,Non-Hispanic Black,United States,15,4687984,0.3,0.1,0.2,0.5,,,,,,12.0,11.9
3,2003,Both Sexes,35-44 years,Non-Hispanic White,United States,6533,30270056,21.6,0.3,21.1,22.1,,,,,,8.9,8.9
4,2010,Female,Less than 15 years,Non-Hispanic White,United States,28,16460571,0.2,0.0,0.1,0.2,,,,,,12.4,12.3
5,2001,Both Sexes,45-54 years,Non-Hispanic White,United States,3850,29733531,12.9,0.2,12.5,13.4,,,,,,6.8,6.8
6,2003,Male,Less than 15 years,Non-Hispanic White,United States,38,18562010,0.2,0.0,0.1,0.3,,,,,,8.9,8.9
7,2010,Both Sexes,15-24 years,Non-Hispanic White,United States,3020,25356574,11.9,0.2,11.5,12.3,,,,,,12.4,12.3
8,2011,Both Sexes,75+ years,Hispanic,United States,21,1213487,1.7,0.4,1.1,2.6,,,,,,13.3,13.2
9,2001,Male,55-64 years,Non-Hispanic Black,United States,129,1100021,11.7,1.0,9.7,13.8,,,,,,6.8,6.8


In [5]:
# List the distinct values that occur in the 'Age' column.
df['Age'].unique()

array(['15-24 years', 'Less than 15 years', '35-44 years', '45-54 years',
       '75+ years', '55-64 years', '65-74 years', '25-34 years',
       'All Ages'], dtype=object)

In [6]:
# Show every column name in the raw frame.
df.columns

Index(['Year', 'Sex', 'Age', 'Race and Hispanic Origin', 'State', 'Deaths',
       'Population', 'Crude Death Rate', 'Standard Error for Crude Rate',
       'Low Confidence Limit for Crude Rate',
       'Upper Confidence Limit for Crude Rate', 'Age-adjusted Rate',
       'Standard Error Age-adjusted Rate',
       'Lower Confidence Limit for Age-adjusted rate',
       'Upper Confidence Limit for Age-adjusted Rate',
       'State Crude Rate in Range', 'US Crude Rate', 'US Age-adjusted Rate'],
      dtype='object')

In [7]:
# Split the raw frame: 'Deaths' is the regression target, every other
# column is kept as a candidate feature.
labels = df['Deaths']
feats = df.drop(columns=['Deaths'])

In [9]:
# Remove the age-adjusted rate columns and 'State Crude Rate in Range'
# (all of them are NaN in the rows previewed earlier in this notebook).
df0 = df.drop(
    columns=[
        'Age-adjusted Rate',
        'Standard Error Age-adjusted Rate',
        'Lower Confidence Limit for Age-adjusted rate',
        'Upper Confidence Limit for Age-adjusted Rate',
        'State Crude Rate in Range',
    ]
)
# Preview a bounded number of rows instead of dumping the whole frame.
df0.head(10)

Unnamed: 0,Year,Sex,Age,Race and Hispanic Origin,State,Deaths,Population,Crude Death Rate,Standard Error for Crude Rate,Low Confidence Limit for Crude Rate,Upper Confidence Limit for Crude Rate,US Crude Rate,US Age-adjusted Rate
0,2014,Both Sexes,15-24 years,All Races-All Origins,United States,3798,43979821,8.6,0.1,8.4,8.9,14.8,14.7
1,2012,Male,15-24 years,Non-Hispanic White,United States,2137,12857283,16.6,0.4,15.9,17.3,13.2,13.1
2,2008,Male,Less than 15 years,Non-Hispanic Black,United States,15,4687984,0.3,0.1,0.2,0.5,12.0,11.9
3,2003,Both Sexes,35-44 years,Non-Hispanic White,United States,6533,30270056,21.6,0.3,21.1,22.1,8.9,8.9
4,2010,Female,Less than 15 years,Non-Hispanic White,United States,28,16460571,0.2,0.0,0.1,0.2,12.4,12.3
5,2001,Both Sexes,45-54 years,Non-Hispanic White,United States,3850,29733531,12.9,0.2,12.5,13.4,6.8,6.8
6,2003,Male,Less than 15 years,Non-Hispanic White,United States,38,18562010,0.2,0.0,0.1,0.3,8.9,8.9
7,2010,Both Sexes,15-24 years,Non-Hispanic White,United States,3020,25356574,11.9,0.2,11.5,12.3,12.4,12.3
8,2011,Both Sexes,75+ years,Hispanic,United States,21,1213487,1.7,0.4,1.1,2.6,13.3,13.2
9,2001,Male,55-64 years,Non-Hispanic Black,United States,129,1100021,11.7,1.0,9.7,13.8,6.8,6.8


In [10]:
# Confirm which columns remain after the drop.
df0.columns

Index(['Year', 'Sex', 'Age', 'Race and Hispanic Origin', 'State', 'Deaths',
       'Population', 'Crude Death Rate', 'Standard Error for Crude Rate',
       'Low Confidence Limit for Crude Rate',
       'Upper Confidence Limit for Crude Rate', 'US Crude Rate',
       'US Age-adjusted Rate'],
      dtype='object')

In [11]:
# Inspect the dtype of each remaining column.
df0.dtypes

Year                                       int64
Sex                                       object
Age                                       object
Race and Hispanic Origin                  object
State                                     object
Deaths                                     int64
Population                                 int64
Crude Death Rate                         float64
Standard Error for Crude Rate            float64
Low Confidence Limit for Crude Rate      float64
Upper Confidence Limit for Crude Rate    float64
US Crude Rate                            float64
US Age-adjusted Rate                     float64
dtype: object

In [12]:
# Remove every rate-derived column, leaving Year, Sex, Age,
# Race and Hispanic Origin, State, Deaths and Population.
df1 = df0.drop(
    columns=[
        'Crude Death Rate',
        'Standard Error for Crude Rate',
        'Low Confidence Limit for Crude Rate',
        'Upper Confidence Limit for Crude Rate',
        'US Crude Rate',
        'US Age-adjusted Rate',
    ]
)

In [13]:
# Preview the first rows of the reduced frame.
df1.head()

Unnamed: 0,Year,Sex,Age,Race and Hispanic Origin,State,Deaths,Population
0,2014,Both Sexes,15-24 years,All Races-All Origins,United States,3798,43979821
1,2012,Male,15-24 years,Non-Hispanic White,United States,2137,12857283
2,2008,Male,Less than 15 years,Non-Hispanic Black,United States,15,4687984
3,2003,Both Sexes,35-44 years,Non-Hispanic White,United States,6533,30270056
4,2010,Female,Less than 15 years,Non-Hispanic White,United States,28,16460571


In [14]:
# Inspect the dtypes before the categorical conversion.
df1.dtypes

Year                         int64
Sex                         object
Age                         object
Race and Hispanic Origin    object
State                       object
Deaths                       int64
Population                   int64
dtype: object

In [19]:
# Convert each grouping column (including the integer 'Year') to the
# pandas 'category' dtype; 'Deaths' and 'Population' stay numeric.
for col in ('Year', 'Sex', 'Age', 'Race and Hispanic Origin', 'State'):
    df1[col] = df1[col].astype('category')

df1.dtypes

Year                        category
Sex                         category
Age                         category
Race and Hispanic Origin    category
State                       category
Deaths                         int64
Population                     int64
dtype: object

In [22]:
# One-hot encode the five categorical columns; the columns not listed
# ('Deaths', 'Population') are carried over unchanged.
df1o = pd.get_dummies(
    data=df1,
    columns=['Year', 'Sex', 'Age', 'Race and Hispanic Origin', 'State'],
)
df1o.columns

Index(['Deaths', 'Population', 'Year_1999', 'Year_2000', 'Year_2001',
       'Year_2002', 'Year_2003', 'Year_2004', 'Year_2005', 'Year_2006',
       'Year_2007', 'Year_2008', 'Year_2009', 'Year_2010', 'Year_2011',
       'Year_2012', 'Year_2013', 'Year_2014', 'Year_2015', 'Sex_Both Sexes',
       'Sex_Female', 'Sex_Male', 'Age_15-24 years', 'Age_25-34 years',
       'Age_35-44 years', 'Age_45-54 years', 'Age_55-64 years',
       'Age_65-74 years', 'Age_75+ years', 'Age_All Ages',
       'Age_Less than 15 years',
       'Race and Hispanic Origin_All Races-All Origins',
       'Race and Hispanic Origin_Hispanic',
       'Race and Hispanic Origin_Non-Hispanic Black',
       'Race and Hispanic Origin_Non-Hispanic White', 'State_Alabama',
       'State_Alaska', 'State_Arizona', 'State_Arkansas', 'State_California',
       'State_Colorado', 'State_Connecticut', 'State_Delaware',
       'State_District of Columbia', 'State_Florida', 'State_Georgia',
       'State_Hawaii', 'State_Idaho', 'Stat

## Decision Tree Model

In [23]:
# Fit a regression tree that predicts 'Deaths' from all one-hot encoded
# columns plus 'Population'.
#
# The split criterion is left at its default, which is mean squared error.
# The explicit spelling criterion='mse' was deprecated in scikit-learn 1.0
# (renamed 'squared_error') and removed in 1.2, so omitting it keeps this
# cell runnable on both old and new scikit-learn versions with the same model.
dTree_mse = DecisionTreeRegressor(random_state=0)
dTree_mse.fit(df1o.drop(['Deaths'], axis=1), df1o['Deaths'])

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=0, splitter='best')

In [24]:
# Raw importance array: one value per feature column, in the same order
# as the columns passed to fit().
dTree_mse.feature_importances_

array([8.95536891e-01, 5.97523931e-04, 9.71686184e-04, 8.17226327e-04,
       1.04639769e-03, 4.86193213e-04, 3.20803149e-04, 2.76898727e-04,
       2.26566547e-04, 8.07143772e-05, 5.87011110e-05, 3.61438904e-05,
       7.28825101e-05, 4.60371118e-05, 4.33792948e-05, 9.42476329e-05,
       5.01230794e-04, 3.18308319e-03, 6.04974440e-04, 2.25035938e-02,
       6.12660790e-03, 2.89987518e-04, 6.81804880e-03, 9.98074476e-03,
       1.33138105e-02, 2.43542273e-03, 1.42214144e-05, 5.59126225e-06,
       1.49953799e-04, 1.91915356e-02, 8.89187583e-03, 1.10699521e-04,
       3.22419117e-04, 3.72772288e-03, 6.67495534e-06, 4.70925250e-08,
       1.12859992e-06, 1.49954339e-07, 1.56804300e-04, 2.48729348e-06,
       1.99625266e-08, 3.99313901e-07, 0.00000000e+00, 1.59402821e-04,
       6.20384023e-06, 1.27378347e-07, 2.10750451e-07, 2.35915902e-05,
       1.09617514e-05, 5.61527305e-06, 1.20220980e-08, 9.82688665e-06,
       5.92460885e-07, 3.54186491e-08, 1.99751227e-06, 1.10406786e-05,
      

In [25]:
# Print each feature column next to its importance.
# The feature index is built once: the previous form re-ran
# df1o.drop(...) (a full DataFrame copy) on every loop iteration.
feature_names = df1o.drop(['Deaths'], axis=1).columns
for name, importance in zip(feature_names, dTree_mse.feature_importances_):
    print(name, "has importance", importance)

Population has importance 0.8955368908956396
Year_1999 has importance 0.0005975239307293114
Year_2000 has importance 0.0009716861842844031
Year_2001 has importance 0.0008172263270101003
Year_2002 has importance 0.0010463976851457632
Year_2003 has importance 0.0004861932131133754
Year_2004 has importance 0.00032080314858082764
Year_2005 has importance 0.00027689872706007873
Year_2006 has importance 0.00022656654714549722
Year_2007 has importance 8.07143771741021e-05
Year_2008 has importance 5.870111100469168e-05
Year_2009 has importance 3.614389043082414e-05
Year_2010 has importance 7.288251008580415e-05
Year_2011 has importance 4.603711181064502e-05
Year_2012 has importance 4.337929483183987e-05
Year_2013 has importance 9.424763291164542e-05
Year_2014 has importance 0.0005012307938595268
Year_2015 has importance 0.003183083192992117
Sex_Both Sexes has importance 0.0006049744404203129
Sex_Female has importance 0.02250359378213672
Sex_Male has importance 0.006126607895009445
Age_15-24 ye

In [26]:
# Scratch check: does the text 'Year' occur in the name of the feature
# column at position 1?
'Year' in df1o.drop(['Deaths'],axis=1).columns[1]

True

In [30]:
# Sum the importances of all dummy columns that belong to the same original
# categorical variable. `tots` is a list of [variable name, total] pairs.
#
# get_dummies names each dummy column '<variable>_<value>', so a column is
# attributed to a variable by that exact prefix. A plain substring test
# would also count a column whose category *value* merely contains the
# variable name. The feature index is built once rather than on every
# loop iteration.
feature_names = df1o.drop(['Deaths'], axis=1).columns
tots = [['Year', 0], ['Sex', 0], ['Age', 0], ['Race and Hispanic Origin', 0], ['State', 0]]
for elem in tots:
    prefix = elem[0] + '_'
    for name, importance in zip(feature_names, dTree_mse.feature_importances_):
        if name.startswith(prefix):
            elem[1] = elem[1] + importance

In [31]:
tots

[['Year', 0.008859715678170555],
 ['Sex', 0.029235176117566476],
 ['Age', 0.05219931643106293],
 ['Race and Hispanic Origin', 0.013052717339867653],
 ['State', 0.0011161835376927336]]