# Drug Poisoning Data Analysis - using RATE

Import libraries

In [1]:
import numpy as np
import pandas as pd

from sklearn import preprocessing

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

Load in data

In [2]:
df0 = pd.read_csv("data.csv")
df0.shape

(2703, 18)

In [3]:
df0.head()

Unnamed: 0,Year,Sex,Age,Race and Hispanic Origin,State,Deaths,Population,Crude Death Rate,Standard Error for Crude Rate,Low Confidence Limit for Crude Rate,Upper Confidence Limit for Crude Rate,Age-adjusted Rate,Standard Error Age-adjusted Rate,Lower Confidence Limit for Age-adjusted rate,Upper Confidence Limit for Age-adjusted Rate,State Crude Rate in Range,US Crude Rate,US Age-adjusted Rate
0,2014,Both Sexes,15-24 years,All Races-All Origins,United States,3798,43979821,8.6,0.1,8.4,8.9,,,,,,14.8,14.7
1,2012,Male,15-24 years,Non-Hispanic White,United States,2137,12857283,16.6,0.4,15.9,17.3,,,,,,13.2,13.1
2,2008,Male,Less than 15 years,Non-Hispanic Black,United States,15,4687984,0.3,0.1,0.2,0.5,,,,,,12.0,11.9
3,2003,Both Sexes,35-44 years,Non-Hispanic White,United States,6533,30270056,21.6,0.3,21.1,22.1,,,,,,8.9,8.9
4,2010,Female,Less than 15 years,Non-Hispanic White,United States,28,16460571,0.2,0.0,0.1,0.2,,,,,,12.4,12.3


In [4]:
df0['Age'].unique()

array(['15-24 years', 'Less than 15 years', '35-44 years', '45-54 years',
       '75+ years', '55-64 years', '65-74 years', '25-34 years',
       'All Ages'], dtype=object)

In [5]:
df0.columns

Index(['Year', 'Sex', 'Age', 'Race and Hispanic Origin', 'State', 'Deaths',
       'Population', 'Crude Death Rate', 'Standard Error for Crude Rate',
       'Low Confidence Limit for Crude Rate',
       'Upper Confidence Limit for Crude Rate', 'Age-adjusted Rate',
       'Standard Error Age-adjusted Rate',
       'Lower Confidence Limit for Age-adjusted rate',
       'Upper Confidence Limit for Age-adjusted Rate',
       'State Crude Rate in Range', 'US Crude Rate', 'US Age-adjusted Rate'],
      dtype='object')

In [6]:
feats = df0.drop(['Deaths'], axis=1)
labels = df0['Deaths']

In [7]:
df1 = df0.drop(['Age-adjusted Rate',
                'Standard Error Age-adjusted Rate',
                'Lower Confidence Limit for Age-adjusted rate',
                'Upper Confidence Limit for Age-adjusted Rate',
                'State Crude Rate in Range',
                'US Age-adjusted Rate'], axis=1)
df1.head()

Unnamed: 0,Year,Sex,Age,Race and Hispanic Origin,State,Deaths,Population,Crude Death Rate,Standard Error for Crude Rate,Low Confidence Limit for Crude Rate,Upper Confidence Limit for Crude Rate,US Crude Rate
0,2014,Both Sexes,15-24 years,All Races-All Origins,United States,3798,43979821,8.6,0.1,8.4,8.9,14.8
1,2012,Male,15-24 years,Non-Hispanic White,United States,2137,12857283,16.6,0.4,15.9,17.3,13.2
2,2008,Male,Less than 15 years,Non-Hispanic Black,United States,15,4687984,0.3,0.1,0.2,0.5,12.0
3,2003,Both Sexes,35-44 years,Non-Hispanic White,United States,6533,30270056,21.6,0.3,21.1,22.1,8.9
4,2010,Female,Less than 15 years,Non-Hispanic White,United States,28,16460571,0.2,0.0,0.1,0.2,12.4


In [8]:
df1.columns

Index(['Year', 'Sex', 'Age', 'Race and Hispanic Origin', 'State', 'Deaths',
       'Population', 'Crude Death Rate', 'Standard Error for Crude Rate',
       'Low Confidence Limit for Crude Rate',
       'Upper Confidence Limit for Crude Rate', 'US Crude Rate'],
      dtype='object')

In [9]:
df1.dtypes

Year                                       int64
Sex                                       object
Age                                       object
Race and Hispanic Origin                  object
State                                     object
Deaths                                     int64
Population                                 int64
Crude Death Rate                         float64
Standard Error for Crude Rate            float64
Low Confidence Limit for Crude Rate      float64
Upper Confidence Limit for Crude Rate    float64
US Crude Rate                            float64
dtype: object

In [10]:
df2 = df1.drop(['Crude Death Rate', 
                'Standard Error for Crude Rate',
                'Low Confidence Limit for Crude Rate',
                'Upper Confidence Limit for Crude Rate', 
                'US Crude Rate'], axis=1)

In [11]:
df2.head()

Unnamed: 0,Year,Sex,Age,Race and Hispanic Origin,State,Deaths,Population
0,2014,Both Sexes,15-24 years,All Races-All Origins,United States,3798,43979821
1,2012,Male,15-24 years,Non-Hispanic White,United States,2137,12857283
2,2008,Male,Less than 15 years,Non-Hispanic Black,United States,15,4687984
3,2003,Both Sexes,35-44 years,Non-Hispanic White,United States,6533,30270056
4,2010,Female,Less than 15 years,Non-Hispanic White,United States,28,16460571


In [12]:
df2.dtypes

Year                         int64
Sex                         object
Age                         object
Race and Hispanic Origin    object
State                       object
Deaths                       int64
Population                   int64
dtype: object

In [13]:
df2['Year'] = df2['Year'].astype('category')
df2['Sex'] = df2['Sex'].astype('category')
df2['Age'] = df2['Age'].astype('category')
df2['Race and Hispanic Origin'] = df1['Race and Hispanic Origin'].astype('category')
df2['State'] = df2['State'].astype('category')

df1.dtypes

Year                                       int64
Sex                                       object
Age                                       object
Race and Hispanic Origin                  object
State                                     object
Deaths                                     int64
Population                                 int64
Crude Death Rate                         float64
Standard Error for Crude Rate            float64
Low Confidence Limit for Crude Rate      float64
Upper Confidence Limit for Crude Rate    float64
US Crude Rate                            float64
dtype: object

In [14]:
df2o = pd.get_dummies(df2, columns=['Year', 'Sex', 'Age', 'Race and Hispanic Origin', 'State'])
df2o.columns;

In [15]:
df2o['Rate'] = 100000 * df2o['Deaths'] / df2o['Population']

## Decision Tree Model

### Using MSE

In [16]:
dTree_mse = DecisionTreeRegressor(random_state=0, criterion='mse')
dTree_mse.fit(df2o.drop(['Deaths', 'Population', 'Rate'],axis=1), df2o['Rate'])

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=0, splitter='best')

In [17]:
dTree_mse.feature_importances_;

In [18]:
for i in range(len(df2o.drop(['Deaths', 'Population', 'Rate'],axis=1).columns)):
    print(df2o.drop(['Deaths', 'Population', 'Rate'],axis=1).columns[i], "has importance", dTree_mse.feature_importances_[i])

Year_1999 has importance 0.025312568734438427
Year_2000 has importance 0.024970558887373418
Year_2001 has importance 0.021689686344598638
Year_2002 has importance 0.014666054217146292
Year_2003 has importance 0.011083774457541813
Year_2004 has importance 0.009522443400563524
Year_2005 has importance 0.007555971047297281
Year_2006 has importance 0.0036405362954655286
Year_2007 has importance 0.0021396879540066826
Year_2008 has importance 0.0013478480210982232
Year_2009 has importance 0.0022823966818505074
Year_2010 has importance 0.0017233100429176402
Year_2011 has importance 0.0017587698107477386
Year_2012 has importance 0.001924169016262432
Year_2013 has importance 0.006104055151493212
Year_2014 has importance 0.013675262394648148
Year_2015 has importance 0.031909225980641005
Sex_Both Sexes has importance 0.014015131418036326
Sex_Female has importance 0.022043999756405704
Sex_Male has importance 0.06033811976270813
Age_15-24 years has importance 0.03734041605003287
Age_25-34 years has

In [19]:
tots_DTmse = [['Year', 0], ['Sex', 0], ['Age', 0], ['Race and Hispanic Origin', 0], ['State', 0]]
for elem in tots_DTmse:
    for i in range(len(df2o.drop(['Deaths', 'Population', 'Rate'],axis=1).columns)):
        if elem[0] in df2o.drop(['Deaths', 'Population', 'Rate'],axis=1).columns[i]:
            elem[1] = elem[1] + dTree_mse.feature_importances_[i]

In [20]:
tots_DTmse

[['Year', 0.1813063184380905],
 ['Sex', 0.09639725093715015],
 ['Age', 0.4942769942711852],
 ['Race and Hispanic Origin', 0.1441215522778803],
 ['State', 0.08389788407569373]]

### Freidman MSE

In [21]:
dTree_fmse = DecisionTreeRegressor(random_state=0, criterion='friedman_mse')
dTree_fmse.fit(df2o.drop(['Deaths', 'Population', 'Rate'],axis=1), df2o['Rate'])

DecisionTreeRegressor(criterion='friedman_mse', max_depth=None,
           max_features=None, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=0,
           splitter='best')

In [22]:
dTree_fmse.feature_importances_;

In [23]:
for i in range(len(df2o.drop(['Deaths', 'Population', 'Rate'],axis=1).columns)):
    print(df2o.drop(['Deaths', 'Population', 'Rate'],axis=1).columns[i], "has importance", dTree_fmse.feature_importances_[i])

Year_1999 has importance 0.025312556621402475
Year_2000 has importance 0.02497054051638303
Year_2001 has importance 0.021689688751530375
Year_2002 has importance 0.014666080711836555
Year_2003 has importance 0.011083739560626565
Year_2004 has importance 0.00952243120255167
Year_2005 has importance 0.0075559943448839945
Year_2006 has importance 0.003640534568067517
Year_2007 has importance 0.002136423240252491
Year_2008 has importance 0.0013448721994259373
Year_2009 has importance 0.002285265094619711
Year_2010 has importance 0.0017255947121432198
Year_2011 has importance 0.0017593431340396518
Year_2012 has importance 0.0019246588820285055
Year_2013 has importance 0.006104450909179077
Year_2014 has importance 0.013674916842906637
Year_2015 has importance 0.03190922714621336
Sex_Both Sexes has importance 0.014883150415179012
Sex_Female has importance 0.019683872382972933
Sex_Male has importance 0.06183022813899825
Age_15-24 years has importance 0.03734041605003287
Age_25-34 years has imp

In [24]:
tots_DTfmse = [['Year', 0], ['Sex', 0], ['Age', 0], ['Race and Hispanic Origin', 0], ['State', 0]]
for elem in tots_DTfmse:
    for i in range(len(df2o.drop(['Deaths', 'Population', 'Rate'],axis=1).columns)):
        if elem[0] in df2o.drop(['Deaths', 'Population', 'Rate'],axis=1).columns[i]:
            elem[1] = elem[1] + dTree_fmse.feature_importances_[i]

In [25]:
tots_DTfmse

[['Year', 0.18130631843809078],
 ['Sex', 0.0963972509371502],
 ['Age', 0.4942769942711853],
 ['Race and Hispanic Origin', 0.14412155227787993],
 ['State', 0.08389788407569382]]

### MAE

In [26]:
dTree_mae = DecisionTreeRegressor(random_state=0, criterion='mae')
dTree_mae.fit(df2o.drop(['Deaths', 'Population', 'Rate'],axis=1), df2o['Rate'])

DecisionTreeRegressor(criterion='mae', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=0, splitter='best')

In [27]:
dTree_mae.feature_importances_;

In [28]:
for i in range(len(df2o.drop(['Deaths', 'Population', 'Rate'],axis=1).columns)):
    print(df2o.drop(['Deaths', 'Population', 'Rate'],axis=1).columns[i], "has importance", dTree_mae.feature_importances_[i])

Year_1999 has importance 0.02812689219433261
Year_2000 has importance 0.028419680020292357
Year_2001 has importance 0.024418217306358837
Year_2002 has importance 0.019420830798661876
Year_2003 has importance 0.015396434122575048
Year_2004 has importance 0.014676898544161442
Year_2005 has importance 0.011646823190180787
Year_2006 has importance 0.010973372113164539
Year_2007 has importance 0.00797947134215829
Year_2008 has importance 0.006368858465651844
Year_2009 has importance 0.0076976141659287345
Year_2010 has importance 0.0074052931022256085
Year_2011 has importance 0.00872813433130584
Year_2012 has importance 0.009220397715239866
Year_2013 has importance 0.011039969473915051
Year_2014 has importance 0.01485131673386081
Year_2015 has importance 0.024506275471771203
Sex_Both Sexes has importance 0.013143971112738151
Sex_Female has importance 0.03784503441518715
Sex_Male has importance 0.04611705239580685
Age_15-24 years has importance 0.04297175483917
Age_25-34 years has importance 

In [29]:
tots_DTmae = [['Year', 0], ['Sex', 0], ['Age', 0], ['Race and Hispanic Origin', 0], ['State', 0]]
for elem in tots_DTmae:
    for i in range(len(df2o.drop(['Deaths', 'Population', 'Rate'],axis=1).columns)):
        if elem[0] in df2o.drop(['Deaths', 'Population', 'Rate'],axis=1).columns[i]:
            elem[1] = elem[1] + dTree_mae.feature_importances_[i]

In [30]:
tots_DTmae

[['Year', 0.25087647909178473],
 ['Sex', 0.09710605792373214],
 ['Age', 0.39569368350798406],
 ['Race and Hispanic Origin', 0.1170916791228019],
 ['State', 0.1392321003536971]]

## Random Forests

### MSE

In [31]:
rf_mse = RandomForestRegressor(n_estimators=1000, n_jobs=8, criterion='mse', random_state=0)
rf_mse.fit(df2o.drop(['Deaths', 'Population', 'Rate'],axis=1), df2o['Rate'])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=8,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [32]:
rf_mse.feature_importances_;

In [33]:
for i in range(len(df2o.drop(['Deaths', 'Population', 'Rate'],axis=1).columns)):
    print(df2o.drop(['Deaths', 'Population', 'Rate'],axis=1).columns[i], "has importance", rf_mse.feature_importances_[i])

Year_1999 has importance 0.025427077191258165
Year_2000 has importance 0.023520775711967822
Year_2001 has importance 0.019110517078976604
Year_2002 has importance 0.01248645754655496
Year_2003 has importance 0.00899499655159119
Year_2004 has importance 0.007321259042618675
Year_2005 has importance 0.0055969935769717045
Year_2006 has importance 0.0036887057394795523
Year_2007 has importance 0.002016267385346208
Year_2008 has importance 0.0017891131961606495
Year_2009 has importance 0.0022888302552526597
Year_2010 has importance 0.002701062939052827
Year_2011 has importance 0.0045149383623895095
Year_2012 has importance 0.004263005205217341
Year_2013 has importance 0.008860939974954046
Year_2014 has importance 0.016266429074461977
Year_2015 has importance 0.03384497314023263
Sex_Both Sexes has importance 0.01129266782328413
Sex_Female has importance 0.04330269524772443
Sex_Male has importance 0.0427711318666882
Age_15-24 years has importance 0.030828147779751416
Age_25-34 years has impor

In [34]:
tots_RFmse = [['Year', 0], ['Sex', 0], ['Age', 0], ['Race and Hispanic Origin', 0], ['State', 0]]
for elem in tots_RFmse:
    for i in range(len(df2o.drop(['Deaths', 'Population', 'Rate'],axis=1).columns)):
        if elem[0] in df2o.drop(['Deaths', 'Population', 'Rate'],axis=1).columns[i]:
            elem[1] = elem[1] + rf_mse.feature_importances_[i]

In [35]:
tots_RFmse

[['Year', 0.1826923419724865],
 ['Sex', 0.09736649493769675],
 ['Age', 0.4959621432106964],
 ['Race and Hispanic Origin', 0.14055661618598692],
 ['State', 0.0834224036931332]]

In [36]:
huh= [['Year', 0.023630291051921028],
 ['Sex', 0.02763024027599163],
 ['Age', 0.0507013106827508],
 ['Race and Hispanic Origin', 0.008424596871325608],
 ['State', 0.0010475050266154822]]

### MAE

In [37]:
rf_mae = RandomForestRegressor(n_estimators=1000, n_jobs=8, criterion='mae', random_state=0)
rf_mae.fit(df2o.drop(['Deaths', 'Population', 'Rate'],axis=1), df2o['Rate'])

RandomForestRegressor(bootstrap=True, criterion='mae', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=8,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [38]:
rf_mae.feature_importances_;

In [39]:
# for i in range(len(df2o.drop(['Deaths', 'Population', 'Rate'],axis=1).columns)):
#     print(df2o.drop(['Deaths', 'Population', 'Rate'],axis=1).columns[i], "has importance", rf_mae.feature_importances_[i])

In [40]:
tots_RFmae = [['Year', 0], ['Sex', 0], ['Age', 0], ['Race and Hispanic Origin', 0], ['State', 0]]
for elem in tots_RFmae:
    for i in range(len(df2o.drop(['Deaths', 'Population', 'Rate'],axis=1).columns)):
        if elem[0] in df2o.drop(['Deaths', 'Population', 'Rate'],axis=1).columns[i]:
            elem[1] = elem[1] + rf_mae.feature_importances_[i]

In [41]:
tots_RFmae

[['Year', 0.25014109335180545],
 ['Sex', 0.09377136023113697],
 ['Age', 0.40256407595949584],
 ['Race and Hispanic Origin', 0.11814333588860795],
 ['State', 0.1353801345689538]]

## Results

In [42]:
tots_DTmse

[['Year', 0.1813063184380905],
 ['Sex', 0.09639725093715015],
 ['Age', 0.4942769942711852],
 ['Race and Hispanic Origin', 0.1441215522778803],
 ['State', 0.08389788407569373]]

In [43]:
tots_DTfmse

[['Year', 0.18130631843809078],
 ['Sex', 0.0963972509371502],
 ['Age', 0.4942769942711853],
 ['Race and Hispanic Origin', 0.14412155227787993],
 ['State', 0.08389788407569382]]

In [44]:
tots_DTmae

[['Year', 0.25087647909178473],
 ['Sex', 0.09710605792373214],
 ['Age', 0.39569368350798406],
 ['Race and Hispanic Origin', 0.1170916791228019],
 ['State', 0.1392321003536971]]

In [45]:
tots_RFmse

[['Year', 0.1826923419724865],
 ['Sex', 0.09736649493769675],
 ['Age', 0.4959621432106964],
 ['Race and Hispanic Origin', 0.14055661618598692],
 ['State', 0.0834224036931332]]

In [46]:
tots_RFmae

[['Year', 0.25014109335180545],
 ['Sex', 0.09377136023113697],
 ['Age', 0.40256407595949584],
 ['Race and Hispanic Origin', 0.11814333588860795],
 ['State', 0.1353801345689538]]