# Drug Poisoning Data Analysis

Import libraries

In [1]:
import numpy as np
import pandas as pd

from sklearn import preprocessing

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

Load in data

In [2]:
# Load the raw table from the CSV that sits next to the notebook, then show
# (rows, columns) as a quick sanity check on the load.
DATA_PATH = "data.csv"
df = pd.read_csv(DATA_PATH)
df.shape

(2703, 18)

In [3]:
# Preview only the first rows; a bare `df` asks the notebook to render the
# whole 2703-row frame.
df.head(10)

Unnamed: 0,Year,Sex,Age,Race and Hispanic Origin,State,Deaths,Population,Crude Death Rate,Standard Error for Crude Rate,Low Confidence Limit for Crude Rate,Upper Confidence Limit for Crude Rate,Age-adjusted Rate,Standard Error Age-adjusted Rate,Lower Confidence Limit for Age-adjusted rate,Upper Confidence Limit for Age-adjusted Rate,State Crude Rate in Range,US Crude Rate,US Age-adjusted Rate
0,2014,Both Sexes,15-24 years,All Races-All Origins,United States,3798,43979821,8.6,0.1,8.4,8.9,,,,,,14.8,14.7
1,2012,Male,15-24 years,Non-Hispanic White,United States,2137,12857283,16.6,0.4,15.9,17.3,,,,,,13.2,13.1
2,2008,Male,Less than 15 years,Non-Hispanic Black,United States,15,4687984,0.3,0.1,0.2,0.5,,,,,,12.0,11.9
3,2003,Both Sexes,35-44 years,Non-Hispanic White,United States,6533,30270056,21.6,0.3,21.1,22.1,,,,,,8.9,8.9
4,2010,Female,Less than 15 years,Non-Hispanic White,United States,28,16460571,0.2,0.0,0.1,0.2,,,,,,12.4,12.3
5,2001,Both Sexes,45-54 years,Non-Hispanic White,United States,3850,29733531,12.9,0.2,12.5,13.4,,,,,,6.8,6.8
6,2003,Male,Less than 15 years,Non-Hispanic White,United States,38,18562010,0.2,0.0,0.1,0.3,,,,,,8.9,8.9
7,2010,Both Sexes,15-24 years,Non-Hispanic White,United States,3020,25356574,11.9,0.2,11.5,12.3,,,,,,12.4,12.3
8,2011,Both Sexes,75+ years,Hispanic,United States,21,1213487,1.7,0.4,1.1,2.6,,,,,,13.3,13.2
9,2001,Male,55-64 years,Non-Hispanic Black,United States,129,1100021,11.7,1.0,9.7,13.8,,,,,,6.8,6.8


In [4]:
# Distinct age-group labels. Note that 'All Ages' appears alongside the
# individual age bands -- presumably an aggregate row; verify before treating
# rows as independent observations.
df['Age'].unique()

array(['15-24 years', 'Less than 15 years', '35-44 years', '45-54 years',
       '75+ years', '55-64 years', '65-74 years', '25-34 years',
       'All Ages'], dtype=object)

In [5]:
# Column labels of the raw frame.
df.columns

Index(['Year', 'Sex', 'Age', 'Race and Hispanic Origin', 'State', 'Deaths',
       'Population', 'Crude Death Rate', 'Standard Error for Crude Rate',
       'Low Confidence Limit for Crude Rate',
       'Upper Confidence Limit for Crude Rate', 'Age-adjusted Rate',
       'Standard Error Age-adjusted Rate',
       'Lower Confidence Limit for Age-adjusted rate',
       'Upper Confidence Limit for Age-adjusted Rate',
       'State Crude Rate in Range', 'US Crude Rate', 'US Age-adjusted Rate'],
      dtype='object')

In [6]:
# Target is the death count; features are every other column of the raw frame.
# NOTE(review): neither `feats` nor `labels` is referenced again in the cells
# shown in this notebook -- the models are fit on `df1o` instead.
labels = df['Deaths']
feats = df.drop('Deaths', axis='columns')

In [7]:
# Remove the age-adjusted-rate columns and 'State Crude Rate in Range'; in the
# preview of `df` these fields are empty for every row shown.
age_adjusted_and_range_cols = [
    'Age-adjusted Rate',
    'Standard Error Age-adjusted Rate',
    'Lower Confidence Limit for Age-adjusted rate',
    'Upper Confidence Limit for Age-adjusted Rate',
    'State Crude Rate in Range',
]
df0 = df.drop(age_adjusted_and_range_cols, axis=1)
# Show a short preview rather than rendering the whole frame.
df0.head(10)

Unnamed: 0,Year,Sex,Age,Race and Hispanic Origin,State,Deaths,Population,Crude Death Rate,Standard Error for Crude Rate,Low Confidence Limit for Crude Rate,Upper Confidence Limit for Crude Rate,US Crude Rate,US Age-adjusted Rate
0,2014,Both Sexes,15-24 years,All Races-All Origins,United States,3798,43979821,8.6,0.1,8.4,8.9,14.8,14.7
1,2012,Male,15-24 years,Non-Hispanic White,United States,2137,12857283,16.6,0.4,15.9,17.3,13.2,13.1
2,2008,Male,Less than 15 years,Non-Hispanic Black,United States,15,4687984,0.3,0.1,0.2,0.5,12.0,11.9
3,2003,Both Sexes,35-44 years,Non-Hispanic White,United States,6533,30270056,21.6,0.3,21.1,22.1,8.9,8.9
4,2010,Female,Less than 15 years,Non-Hispanic White,United States,28,16460571,0.2,0.0,0.1,0.2,12.4,12.3
5,2001,Both Sexes,45-54 years,Non-Hispanic White,United States,3850,29733531,12.9,0.2,12.5,13.4,6.8,6.8
6,2003,Male,Less than 15 years,Non-Hispanic White,United States,38,18562010,0.2,0.0,0.1,0.3,8.9,8.9
7,2010,Both Sexes,15-24 years,Non-Hispanic White,United States,3020,25356574,11.9,0.2,11.5,12.3,12.4,12.3
8,2011,Both Sexes,75+ years,Hispanic,United States,21,1213487,1.7,0.4,1.1,2.6,13.3,13.2
9,2001,Male,55-64 years,Non-Hispanic Black,United States,129,1100021,11.7,1.0,9.7,13.8,6.8,6.8


In [8]:
# Confirm which columns remain after the drop.
df0.columns

Index(['Year', 'Sex', 'Age', 'Race and Hispanic Origin', 'State', 'Deaths',
       'Population', 'Crude Death Rate', 'Standard Error for Crude Rate',
       'Low Confidence Limit for Crude Rate',
       'Upper Confidence Limit for Crude Rate', 'US Crude Rate',
       'US Age-adjusted Rate'],
      dtype='object')

In [9]:
# Inspect dtypes: Sex, Age, Race and Hispanic Origin and State load as object;
# Year, Deaths and Population as int64; the rate columns as float64.
df0.dtypes

Year                                       int64
Sex                                       object
Age                                       object
Race and Hispanic Origin                  object
State                                     object
Deaths                                     int64
Population                                 int64
Crude Death Rate                         float64
Standard Error for Crude Rate            float64
Low Confidence Limit for Crude Rate      float64
Upper Confidence Limit for Crude Rate    float64
US Crude Rate                            float64
US Age-adjusted Rate                     float64
dtype: object

In [10]:
# Strip every rate / standard-error / confidence-limit column so that only
# Year, Sex, Age, Race and Hispanic Origin, State, Deaths and Population remain.
# Presumably done because the crude rates are derived from Deaths and
# Population and would leak the target -- confirm intent.
rate_columns = [
    'Crude Death Rate',
    'Standard Error for Crude Rate',
    'Low Confidence Limit for Crude Rate',
    'Upper Confidence Limit for Crude Rate',
    'US Crude Rate',
    'US Age-adjusted Rate',
]
df1 = df0.drop(rate_columns, axis='columns')

In [11]:
# Preview the first five rows of the reduced frame.
df1.head()

Unnamed: 0,Year,Sex,Age,Race and Hispanic Origin,State,Deaths,Population
0,2014,Both Sexes,15-24 years,All Races-All Origins,United States,3798,43979821
1,2012,Male,15-24 years,Non-Hispanic White,United States,2137,12857283
2,2008,Male,Less than 15 years,Non-Hispanic Black,United States,15,4687984
3,2003,Both Sexes,35-44 years,Non-Hispanic White,United States,6533,30270056
4,2010,Female,Less than 15 years,Non-Hispanic White,United States,28,16460571


In [12]:
# dtypes of the reduced frame before the categorical conversion that follows.
df1.dtypes

Year                         int64
Sex                         object
Age                         object
Race and Hispanic Origin    object
State                       object
Deaths                       int64
Population                   int64
dtype: object

In [13]:
# Treat Year and the four descriptive columns as categorical; Deaths and
# Population stay numeric.
categorical_cols = ['Year', 'Sex', 'Age', 'Race and Hispanic Origin', 'State']
for col in categorical_cols:
    df1[col] = df1[col].astype('category')

df1.dtypes

Year                        category
Sex                         category
Age                         category
Race and Hispanic Origin    category
State                       category
Deaths                         int64
Population                     int64
dtype: object

In [14]:
# One-hot encode the categorical columns: each becomes a set of indicator
# columns named "<column>_<level>" (e.g. Year_1999, Sex_Female). Deaths and
# Population pass through unchanged.
one_hot_cols = ['Year', 'Sex', 'Age', 'Race and Hispanic Origin', 'State']
df1o = pd.get_dummies(data=df1, columns=one_hot_cols)
# Trailing ';' suppresses the display of the column index.
df1o.columns;

## Decision Tree Model

### Using MSE

In [15]:
# Single regression tree using the squared-error split criterion.
# Inputs are every column of df1o except 'Deaths'; the target is 'Deaths'.
# The tree is fit on the full table (no held-out split in this notebook).
# NOTE: 'mse' is the spelling accepted by the scikit-learn release that
# produced the recorded output; scikit-learn 1.0 renamed it to
# 'squared_error' and 1.2 removed the old name.
dTree_mse = DecisionTreeRegressor(criterion='mse', random_state=0)
dTree_mse.fit(X=df1o.drop('Deaths', axis='columns'), y=df1o['Deaths'])

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=0, splitter='best')

In [16]:
# Raw importance array of the fitted tree; the trailing ';' suppresses the
# display, so this cell shows nothing.
dTree_mse.feature_importances_;

In [17]:
# Print each model input column next to its importance. The feature columns
# are taken once, outside the loop, because drop() returns a new frame on
# every call. Order matches the frame the tree was fit on.
feature_names = df1o.drop(['Deaths'], axis=1).columns
for name, importance in zip(feature_names, dTree_mse.feature_importances_):
    print(name, "has importance", importance)

Population has importance 0.8955368908956396
Year_1999 has importance 0.0005975239307293114
Year_2000 has importance 0.0009716861842844031
Year_2001 has importance 0.0008172263270101003
Year_2002 has importance 0.0010463976851457632
Year_2003 has importance 0.0004861932131133754
Year_2004 has importance 0.00032080314858082764
Year_2005 has importance 0.00027689872706007873
Year_2006 has importance 0.00022656654714549722
Year_2007 has importance 8.07143771741021e-05
Year_2008 has importance 5.870111100469168e-05
Year_2009 has importance 3.614389043082414e-05
Year_2010 has importance 7.288251008580415e-05
Year_2011 has importance 4.603711181064502e-05
Year_2012 has importance 4.337929483183987e-05
Year_2013 has importance 9.424763291164542e-05
Year_2014 has importance 0.0005012307938595268
Year_2015 has importance 0.003183083192992117
Sex_Both Sexes has importance 0.0006049744404203129
Sex_Female has importance 0.02250359378213672
Sex_Male has importance 0.006126607895009445
Age_15-24 ye

In [18]:
# Collapse the one-hot importances back to one total per original column.
# get_dummies names each indicator "<column>_<level>", so match on that exact
# prefix: a bare substring test would also count any indicator whose *level*
# text happens to contain another column's name.
# 'Population' is not one-hot encoded and is therefore not part of these totals.
feature_names = df1o.drop(['Deaths'], axis=1).columns  # built once, not per iteration
tots_DTmse = [['Year', 0], ['Sex', 0], ['Age', 0], ['Race and Hispanic Origin', 0], ['State', 0]]
for elem in tots_DTmse:
    prefix = elem[0] + '_'
    for name, importance in zip(feature_names, dTree_mse.feature_importances_):
        if name.startswith(prefix):
            elem[1] = elem[1] + importance

In [19]:
# Per-column importance totals for the MSE decision tree, as [column, total] pairs.
tots_DTmse

[['Year', 0.008859715678170555],
 ['Sex', 0.029235176117566476],
 ['Age', 0.05219931643106293],
 ['Race and Hispanic Origin', 0.013052717339867653],
 ['State', 0.0011161835376927336]]

### Friedman MSE

In [20]:
# Same single-tree setup as the MSE model, but splits are scored with the
# 'friedman_mse' criterion. Inputs are every column of df1o except 'Deaths';
# the target is 'Deaths'.
dTree_fmse = DecisionTreeRegressor(criterion='friedman_mse', random_state=0)
dTree_fmse.fit(X=df1o.drop('Deaths', axis='columns'), y=df1o['Deaths'])

DecisionTreeRegressor(criterion='friedman_mse', max_depth=None,
           max_features=None, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=0,
           splitter='best')

In [21]:
# Raw importance array of the fitted tree; the trailing ';' suppresses the
# display, so this cell shows nothing.
dTree_fmse.feature_importances_;

In [22]:
# Print each model input column next to its importance. The feature columns
# are taken once, outside the loop, because drop() returns a new frame on
# every call. Order matches the frame the tree was fit on.
feature_names = df1o.drop(['Deaths'], axis=1).columns
for name, importance in zip(feature_names, dTree_fmse.feature_importances_):
    print(name, "has importance", importance)

Population has importance 0.8960823209838781
Year_1999 has importance 0.0009942132149369808
Year_2000 has importance 0.0009673492893829504
Year_2001 has importance 0.0009700849463314784
Year_2002 has importance 0.0010683016527733055
Year_2003 has importance 0.0004593562012992512
Year_2004 has importance 0.000363601234455017
Year_2005 has importance 0.0003786923609807
Year_2006 has importance 0.00012548103196157354
Year_2007 has importance 8.155434834424169e-05
Year_2008 has importance 5.6848515057301824e-05
Year_2009 has importance 3.854060725818639e-05
Year_2010 has importance 8.435539819474644e-05
Year_2011 has importance 4.451179243350439e-05
Year_2012 has importance 4.275716162950531e-05
Year_2013 has importance 0.00013558100783502586
Year_2014 has importance 0.0005985754702824375
Year_2015 has importance 0.0033717541859635113
Sex_Both Sexes has importance 0.0022451736677538093
Sex_Female has importance 0.022503207427365383
Sex_Male has importance 0.006174795450125016
Age_15-24 yea

In [23]:
# Collapse the one-hot importances back to one total per original column.
# get_dummies names each indicator "<column>_<level>", so match on that exact
# prefix: a bare substring test would also count any indicator whose *level*
# text happens to contain another column's name.
# 'Population' is not one-hot encoded and is therefore not part of these totals.
feature_names = df1o.drop(['Deaths'], axis=1).columns  # built once, not per iteration
tots_DTfmse = [['Year', 0], ['Sex', 0], ['Age', 0], ['Race and Hispanic Origin', 0], ['State', 0]]
for elem in tots_DTfmse:
    prefix = elem[0] + '_'
    for name, importance in zip(feature_names, dTree_fmse.feature_importances_):
        if name.startswith(prefix):
            elem[1] = elem[1] + importance

In [24]:
# Per-column importance totals for the Friedman-MSE decision tree, as [column, total] pairs.
tots_DTfmse

[['Year', 0.009781558419119717],
 ['Sex', 0.030923176545244208],
 ['Age', 0.05233345372291544],
 ['Race and Hispanic Origin', 0.009773643805595185],
 ['State', 0.0011058465232472547]]

### MAE

In [25]:
# Single regression tree using the absolute-error split criterion.
# Inputs are every column of df1o except 'Deaths'; the target is 'Deaths'.
# NOTE: 'mae' is the spelling accepted by the scikit-learn release that
# produced the recorded output; scikit-learn 1.0 renamed it to
# 'absolute_error' and 1.2 removed the old name.
dTree_mae = DecisionTreeRegressor(criterion='mae', random_state=0)
dTree_mae.fit(X=df1o.drop('Deaths', axis='columns'), y=df1o['Deaths'])

DecisionTreeRegressor(criterion='mae', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=0, splitter='best')

In [26]:
# Raw importance array of the fitted tree; the trailing ';' suppresses the
# display, so this cell shows nothing.
dTree_mae.feature_importances_;

In [27]:
# Print each model input column next to its importance. The feature columns
# are taken once, outside the loop, because drop() returns a new frame on
# every call. Order matches the frame the tree was fit on.
feature_names = df1o.drop(['Deaths'], axis=1).columns
for name, importance in zip(feature_names, dTree_mae.feature_importances_):
    print(name, "has importance", importance)

Population has importance 0.6748110877558902
Year_1999 has importance 0.007477949870776992
Year_2000 has importance 0.0075138661857630734
Year_2001 has importance 0.00769712593752941
Year_2002 has importance 0.00654996749140767
Year_2003 has importance 0.004491486643349909
Year_2004 has importance 0.002758848990286367
Year_2005 has importance 0.0047569644655663145
Year_2006 has importance 0.003214942917940956
Year_2007 has importance 0.0016084450940152813
Year_2008 has importance 0.000906346045040357
Year_2009 has importance 0.0011069148642698654
Year_2010 has importance 0.0017992775627967555
Year_2011 has importance 0.003167559345760522
Year_2012 has importance 0.0026932908972696025
Year_2013 has importance 0.004981982342587553
Year_2014 has importance 0.007494177121523716
Year_2015 has importance 0.0107106345828677
Sex_Both Sexes has importance 0.008812911698874147
Sex_Female has importance 0.02462517755316858
Sex_Male has importance 0.014492882186914125
Age_15-24 years has importanc

In [28]:
# Collapse the one-hot importances back to one total per original column.
# get_dummies names each indicator "<column>_<level>", so match on that exact
# prefix: a bare substring test would also count any indicator whose *level*
# text happens to contain another column's name.
# 'Population' is not one-hot encoded and is therefore not part of these totals.
feature_names = df1o.drop(['Deaths'], axis=1).columns  # built once, not per iteration
tots_DTmae = [['Year', 0], ['Sex', 0], ['Age', 0], ['Race and Hispanic Origin', 0], ['State', 0]]
for elem in tots_DTmae:
    prefix = elem[0] + '_'
    for name, importance in zip(feature_names, dTree_mae.feature_importances_):
        if name.startswith(prefix):
            elem[1] = elem[1] + importance

In [29]:
# Per-column importance totals for the MAE decision tree, as [column, total] pairs.
tots_DTmae

[['Year', 0.07892978035875205],
 ['Sex', 0.047930971438956846],
 ['Age', 0.16466440422730697],
 ['Race and Hispanic Origin', 0.018531087626072213],
 ['State', 0.01513266859302163]]

## Random Forests

### MSE

In [30]:
# Random forest of 10000 trees with the squared-error criterion, trained with
# n_jobs=8 parallel jobs. Inputs are every column of df1o except 'Deaths';
# the target is 'Deaths'.
# NOTE: 'mse' is the spelling accepted by the scikit-learn release that
# produced the recorded output; scikit-learn 1.0 renamed it to
# 'squared_error' and 1.2 removed the old name.
rf_mse = RandomForestRegressor(
    criterion='mse',
    n_estimators=10000,
    n_jobs=8,
    random_state=0,
)
rf_mse.fit(X=df1o.drop('Deaths', axis='columns'), y=df1o['Deaths'])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10000, n_jobs=8,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [31]:
# Raw importance array of the fitted forest; the trailing ';' suppresses the
# display, so this cell shows nothing.
rf_mse.feature_importances_;

In [32]:
# Print each model input column next to its importance. The feature columns
# are taken once, outside the loop, because drop() returns a new frame on
# every call. Order matches the frame the forest was fit on.
feature_names = df1o.drop(['Deaths'], axis=1).columns
for name, importance in zip(feature_names, rf_mse.feature_importances_):
    print(name, "has importance", importance)

Population has importance 0.8892889411052245
Year_1999 has importance 0.003927399919424608
Year_2000 has importance 0.0035375685505226856
Year_2001 has importance 0.0024984933683859173
Year_2002 has importance 0.0011347700467770803
Year_2003 has importance 0.0006633078760154232
Year_2004 has importance 0.0005109139076380317
Year_2005 has importance 0.0004961837670953116
Year_2006 has importance 0.0003327725491594124
Year_2007 has importance 0.00015788936060293578
Year_2008 has importance 0.00012146766274621428
Year_2009 has importance 0.00011863974378093672
Year_2010 has importance 0.00019872472095456832
Year_2011 has importance 0.00035179539281850663
Year_2012 has importance 0.000361738995733794
Year_2013 has importance 0.0007196209951316378
Year_2014 has importance 0.0018057145126137825
Year_2015 has importance 0.006361912195503464
Sex_Both Sexes has importance 0.001669461940752128
Sex_Female has importance 0.01871268170762662
Sex_Male has importance 0.007197540673013594
Age_15-24 ye

In [33]:
# Collapse the one-hot importances back to one total per original column.
# get_dummies names each indicator "<column>_<level>", so match on that exact
# prefix: a bare substring test would also count any indicator whose *level*
# text happens to contain another column's name.
# 'Population' is not one-hot encoded and is therefore not part of these totals.
feature_names = df1o.drop(['Deaths'], axis=1).columns  # built once, not per iteration
tots_RFmse = [['Year', 0], ['Sex', 0], ['Age', 0], ['Race and Hispanic Origin', 0], ['State', 0]]
for elem in tots_RFmse:
    prefix = elem[0] + '_'
    for name, importance in zip(feature_names, rf_mse.feature_importances_):
        if name.startswith(prefix):
            elem[1] = elem[1] + importance

In [34]:
# Per-column importance totals for the MSE random forest, as [column, total] pairs.
tots_RFmse

[['Year', 0.023298913564904312],
 ['Sex', 0.027579684321392343],
 ['Age', 0.05028931579569215],
 ['Race and Hispanic Origin', 0.008496226509616026],
 ['State', 0.0010469187031671625]]

In [35]:
# NOTE(review): hard-coded literals, not computed from any model in this
# notebook. They are close to, but not equal to, the tots_RFmse values
# displayed just before this cell -- presumably pasted from an earlier
# random-forest run; confirm the provenance or remove the cell.
# `huh` is not referenced by any other cell shown here.
huh= [['Year', 0.023630291051921028],
 ['Sex', 0.02763024027599163],
 ['Age', 0.0507013106827508],
 ['Race and Hispanic Origin', 0.008424596871325608],
 ['State', 0.0010475050266154822]]

### MAE

In [36]:
# Random forest of 10000 trees with the absolute-error criterion, trained with
# n_jobs=8 parallel jobs. Inputs are every column of df1o except 'Deaths';
# the target is 'Deaths'.
# NOTE: 'mae' is the spelling accepted by the scikit-learn release that
# produced the recorded output; scikit-learn 1.0 renamed it to
# 'absolute_error' and 1.2 removed the old name. The scikit-learn docs also
# warn that this criterion trains significantly more slowly than squared error.
rf_mae = RandomForestRegressor(
    criterion='mae',
    n_estimators=10000,
    n_jobs=8,
    random_state=0,
)
rf_mae.fit(X=df1o.drop('Deaths', axis='columns'), y=df1o['Deaths'])

RandomForestRegressor(bootstrap=True, criterion='mae', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10000, n_jobs=8,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [37]:
# Raw importance array of the fitted forest; the trailing ';' suppresses the
# display, so this cell shows nothing.
rf_mae.feature_importances_;

In [38]:
# Print each model input column next to its importance. The feature columns
# are taken once, outside the loop, because drop() returns a new frame on
# every call. Order matches the frame the forest was fit on.
feature_names = df1o.drop(['Deaths'], axis=1).columns
for name, importance in zip(feature_names, rf_mae.feature_importances_):
    print(name, "has importance", importance)

Population has importance 0.6425088186564889
Year_1999 has importance 0.011654844396037884
Year_2000 has importance 0.01133945781103143
Year_2001 has importance 0.010112625869549998
Year_2002 has importance 0.00641651397931943
Year_2003 has importance 0.0046856536147907085
Year_2004 has importance 0.004082343878191998
Year_2005 has importance 0.0038873257537108733
Year_2006 has importance 0.0036269414811978544
Year_2007 has importance 0.002446229401200715
Year_2008 has importance 0.0019185290862902952
Year_2009 has importance 0.002098718673989757
Year_2010 has importance 0.0026898578275783803
Year_2011 has importance 0.003203462414693287
Year_2012 has importance 0.0032622135441710215
Year_2013 has importance 0.004947827560341437
Year_2014 has importance 0.008074804180667413
Year_2015 has importance 0.015411495502935293
Sex_Both Sexes has importance 0.010881445028548873
Sex_Female has importance 0.024333051247794987
Sex_Male has importance 0.021135837179143278
Age_15-24 years has import

In [39]:
# Collapse the one-hot importances back to one total per original column.
# get_dummies names each indicator "<column>_<level>", so match on that exact
# prefix: a bare substring test would also count any indicator whose *level*
# text happens to contain another column's name.
# 'Population' is not one-hot encoded and is therefore not part of these totals.
feature_names = df1o.drop(['Deaths'], axis=1).columns  # built once, not per iteration
tots_RFmae = [['Year', 0], ['Sex', 0], ['Age', 0], ['Race and Hispanic Origin', 0], ['State', 0]]
for elem in tots_RFmae:
    prefix = elem[0] + '_'
    for name, importance in zip(feature_names, rf_mae.feature_importances_):
        if name.startswith(prefix):
            elem[1] = elem[1] + importance

In [40]:
# Per-column importance totals for the MAE random forest, as [column, total] pairs.
tots_RFmae

[['Year', 0.09985884497569779],
 ['Sex', 0.05635033345548714],
 ['Age', 0.1646589129875856],
 ['Race and Hispanic Origin', 0.02150485293106854],
 ['State', 0.015118236993674447]]