# ROC (Pdox_ID)

In [None]:
# load packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline 
from sklearn.model_selection import cross_val_score, validation_curve, learning_curve, train_test_split
from sklearn.metrics import roc_curve, auc
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.linear_model import LogisticRegression

In [None]:
# Read the encoded permit dataset (path is relative to the notebook) and preview it
dat = pd.read_csv(filepath_or_buffer='../data/final_permit_encode.csv')
dat.head(n=5)

In [None]:
# Feature matrix: every column except the target and three other excluded columns
# (presumably an identifier and outcome-related fields that would leak the label —
# TODO confirm against the data dictionary).
X = dat.drop(columns=["elapsed_workdays", "pdox_b1_id", "over_90", "over_150"])
# Target: the "over_150" column
y = dat["over_150"]
# Show which features remain
X.columns

## Naive Bayes

In [None]:
# Gaussian Naive Bayes classifier, fitted here on the full dataset
bayes = GaussianNB()
bayes.fit(X=X, y=y)

In [None]:
# 10-fold cross-validation of the Naive Bayes model; report the mean fold score
# (the estimator's default score) rounded to 3 decimals
nb_cv_scores = cross_val_score(estimator=bayes, X=X, y=y, cv=10)
round(nb_cv_scores.mean(), 3)

In [None]:
# Candidate training-set sizes to evaluate: 100, 200, ..., 1700
sizes = np.arange(100, 1800, 100)
print('Sizes: ', sizes)

# Learning curve for Naive Bayes: for each candidate size, run 10-fold CV and
# collect the per-fold train and test scores
train_sizes, train_scores, test_scores = learning_curve(
    bayes, X, y, train_sizes=sizes, cv=10
)
# Collapse the fold axis so there is one mean score per training size
train_mean = train_scores.mean(axis=1)
test_mean = test_scores.mean(axis=1)

In [None]:
# Mean cross-validated test score at each training-set size (one entry per value in `sizes`)
test_mean

In [None]:
# Split into a training set of `training_size` rows and a test set of all remaining rows.
# The split is shuffled, so a fixed random_state is required for the held-out rows —
# and therefore every ROC/AUC figure computed from them below — to be reproducible
# across "Restart & Run All".
training_size = 1700 # highest score above
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y,
    train_size=training_size,
    test_size=X.shape[0]-training_size,
    random_state=42,
)

# Refit Naive Bayes on the training split only, then label the held-out rows
bayes.fit(Xtrain, ytrain)
y_pred1 = bayes.predict(Xtest)

In [None]:
# ROC curve and AUC for Naive Bayes on the held-out split.
# `bayes` was last fitted on (Xtrain, ytrain), so scoring Xtest is out-of-sample.
# roc_curve sweeps a decision threshold over continuous scores; feeding it hard
# 0/1 predictions collapses the curve to a single operating point. Use the
# predicted probability of the positive class instead.
pos_col1 = list(bayes.classes_).index(1)  # probability column that matches pos_label=1
y_score1 = bayes.predict_proba(Xtest)[:, pos_col1]
false_positive_rate1, true_positive_rate1, thresholds1 = roc_curve(ytest, y_score1, pos_label=1)
rates1 = pd.DataFrame(dict(fpr1=false_positive_rate1, tpr1=true_positive_rate1))
roc_auc1 = auc(rates1['fpr1'], rates1['tpr1'])
# Label the output with the model actually evaluated
print('AUC Naive Bayes: ', round(roc_auc1,3))

## Decision Tree

In [None]:
# Instantiate the decision tree classifier (fitting happens in the following cells).
# random_state is fixed because the tree permutes features when searching for splits,
# so equally good splits can otherwise be chosen differently from run to run.
dt = tree.DecisionTreeClassifier(random_state=42)

In [None]:
# Fit the decision tree on the full dataset (all rows of X and y)
dt.fit(X,y)

In [None]:
# 10-fold cross-validation of the decision tree; report the mean fold score
# (the estimator's default score) rounded to 3 decimals
dt_cv_scores = cross_val_score(estimator=dt, X=X, y=y, cv=10)
round(dt_cv_scores.mean(), 3)

In [None]:
# Refit the tree on the training split only, then label the held-out rows
dt.fit(X=Xtrain, y=ytrain)
y_pred2 = dt.predict(X=Xtest)

In [None]:
# ROC curve and AUC for the decision tree on the held-out split.
# `dt` was last fitted on (Xtrain, ytrain), so scoring Xtest is out-of-sample.
# roc_curve needs continuous scores to sweep thresholds; hard 0/1 predictions give
# only a single operating point. Use the predicted probability of the positive class.
pos_col2 = list(dt.classes_).index(1)  # probability column that matches pos_label=1
y_score2 = dt.predict_proba(Xtest)[:, pos_col2]
false_positive_rate2, true_positive_rate2, thresholds2 = roc_curve(ytest, y_score2, pos_label=1)
rates2 = pd.DataFrame(dict(fpr2=false_positive_rate2, tpr2=true_positive_rate2))
roc_auc2 = auc(rates2['fpr2'], rates2['tpr2'])
# Label the output with the model actually evaluated
print('AUC Decision Tree: ', round(roc_auc2,3))

## Logistic Regression (logit)

In [None]:
# Logistic regression using the liblinear solver, fitted here on the full dataset
logreg = LogisticRegression(solver='liblinear')
logreg.fit(X=X, y=y)

In [None]:
# 10-fold cross-validation of the logistic regression; report the mean fold score
# (the estimator's default score) rounded to 3 decimals
logit_cv_scores = cross_val_score(estimator=logreg, X=X, y=y, cv=10)
round(logit_cv_scores.mean(), 3)

In [None]:
# Refit the logistic regression on the training split only, then label the held-out rows
logreg.fit(X=Xtrain, y=ytrain)
y_pred3 = logreg.predict(X=Xtest)

In [None]:
# ROC curve and AUC for the logistic regression on the held-out split.
# `logreg` was last fitted on (Xtrain, ytrain), so scoring Xtest is out-of-sample.
# roc_curve needs continuous scores to sweep thresholds; hard 0/1 predictions give
# only a single operating point. Use the predicted probability of the positive class.
pos_col3 = list(logreg.classes_).index(1)  # probability column that matches pos_label=1
y_score3 = logreg.predict_proba(Xtest)[:, pos_col3]
false_positive_rate3, true_positive_rate3, thresholds3 = roc_curve(ytest, y_score3, pos_label=1)
rates3 = pd.DataFrame(dict(fpr3=false_positive_rate3, tpr3=true_positive_rate3))
roc_auc3 = auc(rates3['fpr3'], rates3['tpr3'])
# Label the output with the model actually evaluated
print('AUC Logit: ', round(roc_auc3,3))

## KNN

In [None]:
# KNN: scan k = 1..19 and record the mean 10-fold CV accuracy for each k
k_range = range(1, 20)
k_scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(estimator=knn, X=X, y=y, scoring='accuracy', cv=10)
    k_scores.append(scores.mean())

In [None]:
# Mean 10-fold CV accuracy for each k in k_range (k = 1..19), in order
k_scores

In [None]:
# k = 9 was picked by hand from the cross-validation scores listed above.
# Fit that model on the training split only, then label the held-out rows.
knn = KNeighborsClassifier(n_neighbors=9)
knn.fit(X=Xtrain, y=ytrain)
y_pred4 = knn.predict(X=Xtest)

In [None]:
# ROC curve and AUC for KNN on the held-out split.
# `knn` was last fitted on (Xtrain, ytrain), so scoring Xtest is out-of-sample.
# roc_curve needs continuous scores to sweep thresholds; hard 0/1 predictions give
# only a single operating point. Use the predicted probability of the positive class
# (for KNN, the share of neighbours voting for it).
pos_col4 = list(knn.classes_).index(1)  # probability column that matches pos_label=1
y_score4 = knn.predict_proba(Xtest)[:, pos_col4]
false_positive_rate4, true_positive_rate4, thresholds4 = roc_curve(ytest, y_score4, pos_label=1)
rates4 = pd.DataFrame(dict(fpr4=false_positive_rate4, tpr4=true_positive_rate4))
roc_auc4 = auc(rates4['fpr4'], rates4['tpr4'])
# Label the output with the model actually evaluated
print('AUC KNN: ', round(roc_auc4,3))

In [None]:
# ROC curves for the over_150 target: one line per model, with its AUC in the legend.
# Line colour is given only through `color=`. Also passing a 'b' format string
# defines the colour twice and makes matplotlib emit a redundant-colour warning.
plt.plot(rates1.fpr1, rates1.tpr1, color='red', label = 'Naive Bayes = %0.2f' % roc_auc1)
plt.plot(rates4.fpr4, rates4.tpr4, color='grey', label = 'KNN = %0.2f' % roc_auc4)
plt.plot(rates2.fpr2, rates2.tpr2, color='cyan', label = 'Decision Tree = %0.2f' % roc_auc2)
plt.plot(rates3.fpr3, rates3.tpr3, color='blue',label = 'Logit = %0.2f' % roc_auc3)
# Chance-level diagonal, drawn black dashed so it is not confused with the red
# Naive Bayes curve
plt.plot([0, 1], [0, 1],'k--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.title('Pdox Model: Receiver Operating Characteristic')
plt.legend(loc = 'lower right')
# Save before show(): show() may clear the current figure, leaving nothing to save afterwards
plt.savefig('ROC.png')
plt.show()