In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the cleaned dataset into `data`.
# NOTE(review): this is a machine-specific absolute path; the notebook only
# runs on a machine where this exact file exists.
DATA_CSV_PATH = "/Users/omar.hassan/Documents/DAEN/DAEN 690/dss_cleanv2.csv"
data = pd.read_csv(DATA_CSV_PATH)

In [None]:
# Print a summary of the loaded frame (columns, dtypes, non-null counts).
data.info()

In [None]:
# Duplicate the 'Bucket' column under the name 'Y'; 'Y' is the column that is
# recoded to single letters in a later cell.
data['Y'] = data.loc[:, 'Bucket']

In [None]:
# Columns excluded from the frame before any further preprocessing.
unused_columns = [
    'loc_id', 'id', 'Month', 'Day', 'Year', 'Subsection',
    'StandardNumber', 'complaint_related', 'technical_assistance',
]
data = data.drop(columns=unused_columns)

In [None]:
# Collapse each long bucket description found in 'Y' to a single-letter code.
# The replacements are applied in the order listed (same order as before).
bucket_codes = {
    'Administration of medications and related provisions': 'A',
    'Medication management plan and reference materials': 'B',
    'Medication review': 'C',
    "Physician's or other prescriber's order": 'D',
    "Qualifications and supervision of staff administering medications": 'E',
    'Storage of medications': 'F',
}
for description, code in bucket_codes.items():
    # regex=False forces literal substring matching, so the result does not
    # depend on the installed pandas version's default for `regex`.
    data['Y'] = data['Y'].str.replace(description, code, regex=False)

## Data used for ML models.
### 1) Re-run the lines below before running a new model

In [None]:
# Create an independent copy of the dataframe for the modelling steps.
# A plain `data2 = data` would only alias the same object, so any in-place
# change made to `data2` would also change `data`.
data2 = data.copy()

In [None]:
# Replace every missing value with an empty string.
# The result is reassigned instead of using inplace=True, so a frame that
# `data2` may still share with `data` is never modified behind the scenes and
# re-running this cell always starts from the same input.
# NOTE(review): filling with '' turns any numeric column that contains NaN
# into a mixed object column -- confirm the columns that are not label-encoded
# later have no missing values.
data2 = data2.fillna('')

In [None]:
# Remove the original 'Bucket' column; its values were copied into 'Y' earlier.
data2 = data2.drop(columns=['Bucket'])

In [None]:
# Print a summary of the modelling frame (columns, dtypes, non-null counts).
data2.info()

In [None]:
# Label (integer) encoding -- note this is NOT one-hot encoding.
# Each listed column is replaced by the integer codes LabelEncoder assigns to
# its distinct values, so the models below receive numeric inputs.
from sklearn.preprocessing import LabelEncoder

# One encoder object is refit for every column, so after the loop it only
# holds the classes of the last column it was fitted on.
label_encoder = LabelEncoder()

# Column positions to encode; positions 1 and 4 are not encoded.
for position in (0, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13):
    data2.iloc[:, position] = label_encoder.fit_transform(data2.iloc[:, position])
data2.head()

## STOP HERE IF RUNNING A NEW MODEL BELOW
### 2) Look out for additional notes on which lines to run 

In [None]:
# Features: the first 13 columns by position; target: the last column.
feature_positions = slice(0, 13)
X = data2.iloc[:, feature_positions]
y = data2.iloc[:, -1]

In [None]:
# Feature selection (univariate): score every feature against the target
# with the chi-squared statistic.
from sklearn.feature_selection import SelectKBest, chi2

# Fit a selector configured to keep the 10 highest-scoring features.
bestfeatures = SelectKBest(score_func=chi2, k=10)
fit = bestfeatures.fit(X, y)

# Put feature names and scores side by side for easier reading.
dfcolumns = pd.DataFrame(X.columns)
dfscores = pd.DataFrame(fit.scores_)
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['Specs', 'Score']

# Show the 10 features with the largest scores.
print(featureScores.nlargest(10, 'Score'))

In [None]:
# Feature selection (model-based): rank features by the importances of an
# extremely-randomized-trees classifier.
from sklearn.ensemble import ExtraTreesClassifier

# The estimator is randomized; a fixed random_state makes the importances
# (and therefore the ranking) identical on every run. The seed matches the
# one used for the train/test split further down.
model = ExtraTreesClassifier(random_state=1)
model.fit(X, y)
print(model.feature_importances_)  # one importance value per column of X

# Bar chart of the 10 most important features.
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
feat_importances.nlargest(10).plot(kind='barh')
plt.show()

## 3) Run the cell below

In [None]:
# Positions of the 10 columns kept as model inputs, in the order used below.
selected_positions = [12, 11, 10, 4, 1, 0, 7, 2, 5, 8]
X = data2.iloc[:, selected_positions]

# Target: the last column of the frame.
y = data2.iloc[:, -1]

In [None]:
# Pairwise (Pearson) correlation between the selected features, used to spot
# highly correlated inputs.
corr = X.corr(method='pearson')

# Render the matrix with a colour gradient so strong correlations stand out.
corr.style.background_gradient(cmap='coolwarm')

In [None]:
# import train_test_split function
from sklearn.model_selection import train_test_split

# For evaluating models
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

## 4) Run the cell below

In [None]:
# Model inputs (features) and outputs (target).
inputs, outputs = X, y

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    inputs,
    outputs,
    test_size=0.3,
    random_state=1,
)

In [None]:
# Gaussian Naive Bayes classifier.
from sklearn.naive_bayes import GaussianNB

# Build the classifier and fit it on the training split
# (fit() returns the fitted estimator itself).
classifer = GaussianNB().fit(X_train, y_train)

# Predicted labels for the held-out test split.
y_pred = classifer.predict(X_test)

In [None]:
# Overall accuracy of the Naive Bayes predictions on the test split.
nb_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', nb_accuracy)

# Confusion matrix of actual vs. predicted labels.
nb_cm = confusion_matrix(y_test, y_pred)

# Heatmap with the count written in every cell, saved to disk.
fig, ax = plt.subplots(figsize=(15, 8))
sns.heatmap(nb_cm, annot=True, fmt='.0f', ax=ax)
fig.savefig('confusion.png')

# Per-class precision / recall / F1 summary.
nb_cr = classification_report(y_test, y_pred)
print('Classification Report:')
print(nb_cr)

# Re-run the earlier cells marked 1), 3) and 4) before running the KNN model below.

In [None]:
# K-nearest-neighbours classifier with scikit-learn's default settings.
from sklearn.neighbors import KNeighborsClassifier

knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_train)

# Predicted labels for the held-out test split.
ypred = knn_clf.predict(X_test)

In [None]:
# Overall accuracy of the KNN predictions (`ypred`) on the test split.
knn_accuracy = accuracy_score(y_test, ypred)
print('Accuracy:', knn_accuracy)

# Confusion matrix of actual vs. KNN-predicted labels.
# This must use `ypred` (the KNN output); `y_pred` holds the Naive Bayes
# predictions and would silently reproduce the Naive Bayes matrix here.
knn_cm = confusion_matrix(y_test, ypred)

# Heatmap with the count written in every cell, saved to disk.
plt.figure(figsize = (15,8))
sns.heatmap(knn_cm, annot=True, fmt='.0f')
# NOTE(review): same file name as the Naive Bayes figure, so running this cell
# overwrites that image.
plt.savefig('confusion.png')

# Per-class precision / recall / F1 summary.
knn_cr = classification_report(y_test, ypred)
print('Classification Report:')
print (knn_cr)