""Now that we've seen the story that the data tells, it's time to build the model. First we import the libraries and the data file.""

In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import sklearn.model_selection
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Lasso
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import scale

# Reached only if every import above succeeded; the message confirms the cell ran to completion.
print("Loaded")

Loaded


In [2]:
# Read the capstone CSV into the working DataFrame.
# NOTE(review): the location below is an absolute, machine-specific Windows
# path, so this cell only runs where that exact file exists.
wrangled_csv_path = r"C:\Users\Laura-Black\Documents\PhD\Data Scientist\Springboard\Capstone Projects\Drug Abuse ED Visits\Output files\Capstone1DataWrangling.csv"

df = pd.read_csv(
    wrangled_csv_path,
    index_col=0,       # the first CSV column becomes the row index
    low_memory=False,  # parse the file in one pass instead of in chunks
)

In [3]:
# Summarise the dataset: per-column non-null counts, the number of columns,
# and the distribution of the CASETYPE target.
case_type = df['CASETYPE']
n_columns = df.shape[1]

print('The number of data points =\n', df.count())
print('The number of features = ', n_columns)
print('The target is Case Type.')
print('The number of distinct values in the target = ', case_type.nunique())
print('They are: ', case_type.unique())
print('The relative size of each class = \n', case_type.value_counts(normalize=True))

The number of data points =
 METRO           229211
STRATA          229211
PSU             229211
REPLICATE       229211
CASEWGT         229211
                 ...  
ALCOHOL         229211
NONALCILL       229211
PHARMA          229211
NONMEDPHARMA    229211
ALLABUSE        229211
Length: 86, dtype: int64
The number of features =  86
The target is Case Type.
The number of distinct values in the target =  8
They are:  ['Other' 'Adverse Reaction' 'Seeking Detox' 'Suicide Attempt'
 'Overmedication' 'Accidential Ingestion' 'Alcohol (Age<21)'
 'Malicious Poisoning']
The relative size of each class = 
 Adverse Reaction         0.384345
Other                    0.382303
Overmedication           0.079167
Seeking Detox            0.064748
Suicide Attempt          0.039409
Alcohol (Age<21)         0.032376
Accidential Ingestion    0.014192
Malicious Poisoning      0.003460
Name: CASETYPE, dtype: float64


In [4]:
# Preview the first rows of the frame.
print(df.head())

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly: wrapping it in print() appends a stray "None" line.
df.info()

              METRO  STRATA  PSU  REPLICATE   CASEWGT  PSUFRAME   AGECAT  \
CASEID                                                                     
1          New York      25  108          2  0.942635         3    18-20   
2          New York      29  129          2  5.992011         9     > 65   
3       Date County       7   25          1  4.723172         6     > 65   
4           Phoenix       8   29          2  4.080147         6  06-11\t   
5            Boston      22   94          2  5.177709        10    25-29   

           SEX      RACE  YEAR  ...       DRUGID_22        ROUTE_22  \
CASEID                          ...                                   
1         Male     Black  2011  ...  Not Applicable  Not Applicable   
2         Male  Hispanic  2011  ...  Not Applicable  Not Applicable   
3       Female     Black  2011  ...  Not Applicable  Not Applicable   
4         Male  Hispanic  2011  ...  Not Applicable  Not Applicable   
5         Male  Hispanic  2011  ...  Not 

""Now, we will create the design matrix (X) and the target vector (y) for
the associated classification problem.""

In [5]:
# Build the feature frame by removing the target column 'CASETYPE'.
# 'CASETYPE-B' is the binary target that a later cell adds to `df`; if this
# cell is re-run after that one, the derived label would otherwise be encoded
# into the features and leak the answer to the model. errors='ignore' keeps
# the first (in-order) run working, when that column does not exist yet.
x_split = (
    df.drop(columns='CASETYPE')
      .drop(columns='CASETYPE-B', errors='ignore')
)

# One-hot encode the categorical columns; drop_first=True omits the first
# level of each encoded column.
X = pd.get_dummies(x_split, drop_first=True)

In [12]:
# Collapse the multi-class CASETYPE column into a two-class label:
# rows equal to 'Adverse Reaction' keep that label, all others become
# 'Not Adverse Reaction'.
is_adverse_reaction = df['CASETYPE'] == 'Adverse Reaction'
df['CASETYPE-B'] = np.where(
    is_adverse_reaction,
    'Adverse Reaction',
    'Not Adverse Reaction',
)
print(df['CASETYPE-B'])

CASEID
1         Not Adverse Reaction
2             Adverse Reaction
3             Adverse Reaction
4             Adverse Reaction
5         Not Adverse Reaction
                  ...         
229207        Adverse Reaction
229208    Not Adverse Reaction
229209        Adverse Reaction
229210        Adverse Reaction
229211        Adverse Reaction
Name: CASETYPE-B, Length: 229211, dtype: object


In [13]:
# Target vector: the values of the binary label column.
target_column = 'CASETYPE-B'
y = df[target_column].values

""Next, we will split the data into test and training sets.""

In [14]:
# Hold out 40% of the rows for testing. Stratifying on y keeps the class
# proportions aligned between the two splits, and the fixed random_state
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
    stratify=y,
)

In [16]:
from collections import Counter

# Class balance of the training labels, shown as (label, fraction) pairs.
c_train = Counter(y_train)
n_train = len(y_train)
[(label, count / n_train) for label, count in c_train.items()]


[('Not Adverse Reaction', 0.6156581300990358),
 ('Adverse Reaction', 0.3843418699009642)]

In [17]:
# Class balance of the test labels, shown as (label, fraction) pairs.
c_test = Counter(y_test)
n_test = len(y_test)
[(label, count / n_test) for label, count in c_test.items()]

[('Not Adverse Reaction', 0.6156514151715111),
 ('Adverse Reaction', 0.3843485848284888)]

In [18]:
# Create the classifier: logreg
logreg = LogisticRegression(multi_class='multinomial',solver='lbfgs', max_iter=1000000000)

# Fit the classifier to the training data
logreg.fit(X_train, y_train)

# Predict the labels of the training and test sets
y_pred_test = logreg.predict(X_test)
y_pred_train = logreg.predict(X_train)

# Compute and print the confusion matrices and classification reports for the Training set
print('Train Set')
print(confusion_matrix(y_train, y_pred_train))
print(classification_report(y_train, y_pred_train))
print(logreg.score(X_train,y_train))

# Compute and print the confusion matrices and classification reports for the Test set
print('Test Set')
print(confusion_matrix(y_test, y_pred_test))
print(classification_report(y_test, y_pred_test))
print(logreg.score(X_test,y_test))


Train Set
[[51611  1246]
 [ 2733 81936]]
                      precision    recall  f1-score   support

    Adverse Reaction       0.95      0.98      0.96     52857
Not Adverse Reaction       0.99      0.97      0.98     84669

            accuracy                           0.97    137526
           macro avg       0.97      0.97      0.97    137526
        weighted avg       0.97      0.97      0.97    137526

0.9710672890944257
Test Set
[[34358   881]
 [ 1893 54553]]
                      precision    recall  f1-score   support

    Adverse Reaction       0.95      0.97      0.96     35239
Not Adverse Reaction       0.98      0.97      0.98     56446

            accuracy                           0.97     91685
           macro avg       0.97      0.97      0.97     91685
        weighted avg       0.97      0.97      0.97     91685

0.9697442329715875


# Training-set diagnostics: confusion matrix, classification report, accuracy.
print('Train Set')
for build_report in (confusion_matrix, classification_report):
    print(build_report(y_train, y_pred_train))
print(logreg.score(X_train, y_train))

""Both the training set and the test set show an overall accuracy of 0.97. The test set has the following results:

The Adverse Reaction class has a precision of 0.95 and a recall of 0.97, and the Not Adverse Reaction class has a precision of 0.98 and a recall of 0.97.
""

In [20]:
import pickle

# Serialize the training feature matrix to a file in the working directory.
filenameX = 'CP1_Model_X_train_Bivariate'

# The context manager closes the file even if pickle.dump raises. Plain 'wb'
# is sufficient because the file is only written here, never read back.
with open(filenameX, 'wb') as outfile:
    pickle.dump(X_train, outfile)

#X_train, X_test, y_train, y_test

In [21]:
# Serialize the test feature matrix to a file in the working directory.
filenameXtest = 'CP1_Model_X_test_Bivariate'

# The context manager closes the file even if pickle.dump raises. Plain 'wb'
# is sufficient because the file is only written here, never read back.
with open(filenameXtest, 'wb') as outfile:
    pickle.dump(X_test, outfile)

In [22]:
# Serialize the test labels to a file in the working directory.
filenameYtest = 'CP1_Model_y_test_Bivariate'

# The context manager closes the file even if pickle.dump raises.
with open(filenameYtest, 'wb') as outfile:
    pickle.dump(y_test, outfile)

In [23]:
# Serialize the training labels to a file in the working directory.
filenameYtrain = 'CP1_Model_y_train_Bivariate'

# The context manager closes the file even if pickle.dump raises.
with open(filenameYtrain, 'wb') as outfile:
    pickle.dump(y_train, outfile)