""Now that we've seen the story that the data tells, it's time to build the model. First we import the libraries and the data file.""

In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import sklearn.model_selection
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Lasso
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import scale

print("Loaded")

Loaded


In [32]:
df = pd.read_csv (r"C:\Users\Laura-Black\Documents\PhD\Data Scientist\Springboard\Capstone Projects\Drug Abuse ED Visits\Output files\Capstone1DataWrangling.csv", 
                     index_col=0, low_memory=False)

In [33]:
print('The number of data points =\n', df.count())
print('The number of features = ',len(df.columns))
print('The target is Case Type.')
print('The number of distinct values in the target = ', df['CASETYPE'].nunique())
print('They are: ',df.CASETYPE.unique())
print('The relative size of each class = \n', df['CASETYPE'].value_counts(normalize=True))

The number of data points =
 METRO           229211
STRATA          229211
PSU             229211
REPLICATE       229211
CASEWGT         229211
                 ...  
ALCOHOL         229211
NONALCILL       229211
PHARMA          229211
NONMEDPHARMA    229211
ALLABUSE        229211
Length: 86, dtype: int64
The number of features =  86
The target is Case Type.
The number of distinct values in the target =  8
They are:  ['Other' 'Adverse Reaction' 'Seeking Detox' 'Suicide Attempt'
 'Overmedication' 'Accidential Ingestion' 'Alcohol (Age<21)'
 'Malicious Poisoning']
The relative size of each class = 
 Adverse Reaction         0.384345
Other                    0.382303
Overmedication           0.079167
Seeking Detox            0.064748
Suicide Attempt          0.039409
Alcohol (Age<21)         0.032376
Accidential Ingestion    0.014192
Malicious Poisoning      0.003460
Name: CASETYPE, dtype: float64


In [34]:
print(df.head())
print(df.info())

              METRO  STRATA  PSU  REPLICATE   CASEWGT  PSUFRAME   AGECAT  \
CASEID                                                                     
1          New York      25  108          2  0.942635         3    18-20   
2          New York      29  129          2  5.992011         9     > 65   
3       Date County       7   25          1  4.723172         6     > 65   
4           Phoenix       8   29          2  4.080147         6  06-11\t   
5            Boston      22   94          2  5.177709        10    25-29   

           SEX      RACE  YEAR  ...       DRUGID_22        ROUTE_22  \
CASEID                          ...                                   
1         Male     Black  2011  ...  Not Applicable  Not Applicable   
2         Male  Hispanic  2011  ...  Not Applicable  Not Applicable   
3       Female     Black  2011  ...  Not Applicable  Not Applicable   
4         Male  Hispanic  2011  ...  Not Applicable  Not Applicable   
5         Male  Hispanic  2011  ...  Not 

""Now, we will create the design matrix (X) and the target vector (y) for
the associated classification problem.""

In [35]:
x_split = df.drop('CASETYPE', axis=1)

X = pd.get_dummies(x_split,drop_first=True)

In [36]:
y = df['CASETYPE'].values

""Next, we will split the data into test and training sets.""

In [37]:
X_train_all, X_test_all, y_train_all, y_test_all = train_test_split(X, y, test_size = 0.5, random_state=42, stratify=y)

In [38]:
print(X_train_all.info())
print(y_train_all.shape)
print(y_train_all.dtype)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 114605 entries, 16884 to 36951
Columns: 7621 entries, STRATA to ALLABUSE_Not All Misuse & Abuse Episode
dtypes: float64(1), int64(7), uint8(7613)
memory usage: 839.9 MB
None
(114605,)
object


In [39]:
print('The number of data points =\n', X_train_all.count())
print('The number of features = ',len(X_train_all.columns))

The number of data points =
 STRATA                                     114605
PSU                                        114605
REPLICATE                                  114605
CASEWGT                                    114605
PSUFRAME                                   114605
                                            ...  
ALCOHOL_Yes                                114605
NONALCILL_No Illicit Drugs                 114605
PHARMA_Pharmaceuticals                     114605
NONMEDPHARMA_Non_Medical Pharma            114605
ALLABUSE_Not All Misuse & Abuse Episode    114605
Length: 7621, dtype: int64
The number of features =  7621


In [40]:
from collections import Counter

c_train = Counter(y_train_all)
print([(i, c_train[i] / len(y_train_all)) for i in c_train])
print('Length of y = ',len(y_train_all))

[('Adverse Reaction', 0.38434623271235985), ('Other', 0.38230443697918937), ('Overmedication', 0.07916757558570743), ('Seeking Detox', 0.06474412111164435), ('Alcohol (Age<21)', 0.032372060555822174), ('Suicide Attempt', 0.03940491252563152), ('Accidential Ingestion', 0.01419658828148859), ('Malicious Poisoning', 0.003464072248156712)]
Length of y =  114605


In [41]:
X_train_all.reset_index(drop = True, inplace = True)
y_train_all_df = pd.DataFrame(y_train_all)
y_train_all_df.reset_index(drop = True, inplace = True)

In [42]:
X_train_reduce, X_test_reduce, y_train_reduce, y_test_reduce = train_test_split(X_train_all, y_train_all, test_size = 0.5, random_state=0, stratify=y_train_all)

In [43]:
from collections import Counter

c_train = Counter(y_train_reduce)
[(i, c_train[i] / len(y_train_reduce)) for i in c_train]


[('Other', 0.3823077728526055),
 ('Adverse Reaction', 0.3843495864018708),
 ('Seeking Detox', 0.06474468604935256),
 ('Overmedication', 0.0791595406792084),
 ('Accidential Ingestion', 0.014187986457715263),
 ('Malicious Poisoning', 0.0034728281735367005),
 ('Suicide Attempt', 0.03940525636103452),
 ('Alcohol (Age<21)', 0.03237234302467628)]

In [44]:
c_test = Counter(y_test_reduce)
[(i, c_test[i] / len(y_test_reduce)) for i in c_test]

[('Adverse Reaction', 0.38434287908137443),
 ('Other', 0.3823011011639879),
 ('Overmedication', 0.07917561035198856),
 ('Suicide Attempt', 0.03940456869622882),
 ('Alcohol (Age<21)', 0.03237177809189746),
 ('Seeking Detox', 0.06474355618379492),
 ('Accidential Ingestion', 0.01420518995515069),
 ('Malicious Poisoning', 0.0034553164755771948)]

In [15]:
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE().fit_resample(X_train_reduce, y_train_reduce)
print(sorted(Counter(y_resampled).items()))

[('Accidential Ingestion', 22024), ('Adverse Reaction', 22024), ('Alcohol (Age<21)', 22024), ('Malicious Poisoning', 22024), ('Other', 22024), ('Overmedication', 22024), ('Seeking Detox', 22024), ('Suicide Attempt', 22024)]


In [45]:
X_train = X_resampled
y_train = y_resampled
X_test = X_test_reduce
y_test = y_test_reduce

In [18]:
# Create the classifier: logreg
logreg = LogisticRegression(multi_class='multinomial',solver='lbfgs', max_iter=1000000000)

# Fit the classifier to the training data
logreg.fit(X_train, y_train)

# Predict the labels of the training and test sets
y_pred_test = logreg.predict(X_test)
y_pred_train = logreg.predict(X_train)

# Compute and print the confusion matrices and classification reports for the Training set
print('Train Set')
print(confusion_matrix(y_train, y_pred_train))
print(classification_report(y_train, y_pred_train))
print(logreg.score(X_train,y_train))

# Compute and print the confusion matrices and classification reports for the Test set
print('Test Set')
print(confusion_matrix(y_test, y_pred_test))
print(classification_report(y_test, y_pred_test))
print(logreg.score(X_test,y_test))




Train Set
[[18923  2678     0    80    93     0     4   246]
 [ 1715 18763     0     3     2     0   132  1409]
 [    0     0 22024     0     0     0     0     0]
 [    0     0     0 20378  1132   482    32     0]
 [    1     1     0  1358 14544  3344  2676   100]
 [    0     1     0   390   390 21243     0     0]
 [   16   670    42    75  2138     0 17832  1251]
 [  121  1613    96    13   524     0  1385 18272]]
                       precision    recall  f1-score   support

Accidential Ingestion       0.91      0.86      0.88     22024
     Adverse Reaction       0.79      0.85      0.82     22024
     Alcohol (Age<21)       0.99      1.00      1.00     22024
  Malicious Poisoning       0.91      0.93      0.92     22024
                Other       0.77      0.66      0.71     22024
       Overmedication       0.85      0.96      0.90     22024
        Seeking Detox       0.81      0.81      0.81     22024
      Suicide Attempt       0.86      0.83      0.84     22024

            

print('Train Set')
print(confusion_matrix(y_train, y_pred_train))
print(classification_report(y_train, y_pred_train))
print(logreg.score(X_train,y_train))

""Both the training set and the test set show an overall accuracy of 0.77. The test set has the following results:
1. Accidental Ingestion: 24% of the predictions were correct. 
2. Adverse Reaction: 97% of the predictions were correct.
3. Alcohol (<21): 99% of the predictions were correct. Because there are so few of these cases, it is odd that the precision is high. There is a possibility of overfitting, but the recall is not good so there is no way to correct it.
4. Malicious Poisoning: 9% of the predictions were correct. The accuracy of this class is affected by the low number of cases in the data set. More cases in this class would have resulted in a higher prediction rate.
5. Other: 87% of the predictions were correct.
6. Overmedication: 56% of the predictions were correct.
7. Seeking Detox: 48% of the predictions were correct.
8. Suicide Attempt: 47% of the predictions were correct.

In [20]:
import pickle
filenameX = 'CP1_Model_X_train_reduce'
outfile = open(filenameX,'w+b')
pickle.dump(X_train,outfile, protocol=4)
outfile.close()

#X_train, X_test, y_train, y_test

In [21]:
filenameXtest = 'CP1_Model_X_test_reduce'
outfile = open(filenameXtest,'w+b')
pickle.dump(X_test,outfile)
outfile.close()

In [22]:
filenameYtest = 'CP1_Model_y_test_reduce'
outfile = open(filenameYtest,'wb')
pickle.dump(y_test,outfile)
outfile.close()

In [23]:
filenameYtrain = 'CP1_Model_y_train_reduce'
outfile = open(filenameYtrain,'wb')
pickle.dump(y_train,outfile)
outfile.close()

In [53]:
#Random Forest
#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier

#Create a Gaussian Classifier
clf=RandomForestClassifier(n_estimators=100)

#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(X_train,y_train)

y_pred_test_rf=clf.predict(X_test)
y_pred_train_rf=clf.predict(X_train)

In [54]:
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.8622061672163761


In [55]:
print('Random Forest Test Set')
print(confusion_matrix(y_test, y_pred_test_rf))
print(classification_report(y_test, y_pred_test_rf))
print(clf.score(X_test,y_test))

Random Forest Test Set
[[   36   749     0     0    25     0     0     4]
 [    4 21762     0     0    99    12    27   120]
 [    0     0  1854     0     0     0     1     0]
 [    0     0     0    56   133     9     0     0]
 [    0     1     0     8 20480  1157   261     0]
 [    0    12     0     4  1572  2946     1     2]
 [    0   436     6     0  1575     4  1635    54]
 [    0  1168    10     0   417    26    86   551]]
                       precision    recall  f1-score   support

Accidential Ingestion       0.90      0.04      0.08       814
     Adverse Reaction       0.90      0.99      0.94     22024
     Alcohol (Age<21)       0.99      1.00      1.00      1855
  Malicious Poisoning       0.82      0.28      0.42       198
                Other       0.84      0.93      0.89     21907
       Overmedication       0.71      0.65      0.68      4537
        Seeking Detox       0.81      0.44      0.57      3710
      Suicide Attempt       0.75      0.24      0.37      2258


In [56]:
print('Random Forest Train Set')
print(confusion_matrix(y_train, y_pred_train_rf))
print(classification_report(y_train, y_pred_train_rf))
print(clf.score(X_train,y_train))

Random Forest Train Set
[[22024     0     0     0     0     0     0     0]
 [    0 22024     0     0     0     0     0     0]
 [    0     0 22024     0     0     0     0     0]
 [    0     0     0 22024     0     0     0     0]
 [    0     0     0     0 22022     0     2     0]
 [    0     0     0     0     1 22023     0     0]
 [    0     0     0     0     3     0 22021     0]
 [    0     1     0     0     0     0     0 22023]]
                       precision    recall  f1-score   support

Accidential Ingestion       1.00      1.00      1.00     22024
     Adverse Reaction       1.00      1.00      1.00     22024
     Alcohol (Age<21)       1.00      1.00      1.00     22024
  Malicious Poisoning       1.00      1.00      1.00     22024
                Other       1.00      1.00      1.00     22024
       Overmedication       1.00      1.00      1.00     22024
        Seeking Detox       1.00      1.00      1.00     22024
      Suicide Attempt       1.00      1.00      1.00     22024