# ROC (Pdox_ID)

In [None]:
# load packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline 
import graphviz
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score, validation_curve, learning_curve, train_test_split
from sklearn.metrics import roc_curve, auc
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import tree
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [None]:
# Read the permit data set from the CSV next to the notebook and preview it.
dat = pd.read_csv(filepath_or_buffer='final_permit_uncode.csv')
dat.head(n=5)

In [None]:
# Feature matrix: every column except the id and the three outcome columns.
X = dat.drop(columns=["elapsed_workdays", "pdox_b1_id", "over_90", "over_150"])
# Classification target taken from the `over_150` column.
Y2 = dat["over_150"]
X.columns

In [None]:
# Y1: target from the `over_90` column; Y: `elapsed_workdays` as a continuous variable.
Y1, Y = dat["over_90"], dat["elapsed_workdays"]

# over150

In [None]:
# Gaussian Naive Bayes fitted on the full data set for the over_150 target;
# `fit` returns the estimator, which is echoed as the cell output.
bayes = GaussianNB().fit(X, Y2)
bayes

In [None]:
# 10-fold cross-validation scores of Naive Bayes on the over_150 target.
cross_val_score(bayes, X, y=Y2, cv=10)

In [None]:
# Learning curve for Naive Bayes: training-set sizes 100, 200, ..., 1700,
# each evaluated with 10-fold cross-validation.
sizes = 100 * np.arange(1, 18)
print('Sizes: ', sizes)
train_sizes, train_scores2, test_scores2 = learning_curve(
    bayes, X, Y2, train_sizes=sizes, cv=10
)
# Average the per-fold scores for every training-set size.
train_mean2 = train_scores2.mean(axis=1)
test_mean2 = test_scores2.mean(axis=1)
test_mean2

In [None]:
# Split into `training_size` training rows and the remaining rows as a test set.
# random_state pins the shuffle so the split, and every metric computed from it,
# is the same on each Restart & Run All.
training_size = 1700
Xtrain2, Xtest2, ytrain2, ytest2 = train_test_split(
    X, Y2,
    train_size=training_size,
    test_size=X.shape[0] - training_size,
    random_state=42,
)
# Refit Naive Bayes on the training rows only and label the held-out rows.
bayes.fit(Xtrain2, ytrain2)
y_pred4 = bayes.predict(Xtest2)

In [None]:
# ROC / AUC for Naive Bayes on the over_150 test split.
# The curve is built from the predicted probability of class 1, not from hard
# 0/1 labels: labels give a single operating point, so the "curve" and its AUC
# would only reflect one threshold.
# Assumes `bayes` was last fitted on Xtrain2/ytrain2 (the preceding cell).
pos_col4 = list(bayes.classes_).index(1)  # column of predict_proba for label 1
y_score4 = bayes.predict_proba(Xtest2)[:, pos_col4]
false_positive_rate4, true_positive_rate4, thresholds4 = roc_curve(ytest2, y_score4, pos_label=1)
rates4 = pd.DataFrame(dict(fpr4=false_positive_rate4, tpr4=true_positive_rate4))
roc_auc4 = auc(rates4['fpr4'], rates4['tpr4'])
# Label corrected: this cell reports Naive Bayes, not an SVC.
print('AUC Naive Bayes: ', roc_auc4)

In [None]:
# Decision tree classifier with default hyper-parameters.
# random_state fixes the tie-breaking between equally good splits so the fitted
# tree (and its scores) are reproducible across runs.
dt = tree.DecisionTreeClassifier(random_state=42)

In [None]:
# Fit the decision tree on the full data set for the over_150 target.
dt.fit(X, y=Y2)

In [None]:
# 10-fold cross-validation scores of the decision tree on the over_150 target.
cross_val_score(dt, X, y=Y2, cv=10)

In [None]:
# Refit the tree on the training split and label the held-out rows
# (`fit` returns the estimator, so the calls can be chained).
y_pred5 = dt.fit(Xtrain2, ytrain2).predict(Xtest2)

In [None]:
# ROC / AUC for the decision tree on the over_150 test split.
# Uses the predicted probability of class 1 instead of hard 0/1 labels, which
# would collapse the curve to a single operating point.
# Assumes `dt` was last fitted on Xtrain2/ytrain2 (the preceding cell).
pos_col5 = list(dt.classes_).index(1)  # column of predict_proba for label 1
y_score5 = dt.predict_proba(Xtest2)[:, pos_col5]
false_positive_rate5, true_positive_rate5, thresholds5 = roc_curve(ytest2, y_score5, pos_label=1)
rates5 = pd.DataFrame(dict(fpr5=false_positive_rate5, tpr5=true_positive_rate5))
roc_auc5 = auc(rates5['fpr5'], rates5['tpr5'])
# Label corrected: this cell reports the decision tree, not an SVC.
print('AUC Decision Tree: ', roc_auc5)

In [None]:
# Logistic regression (liblinear solver) fitted on the full data set for the
# over_150 target; the fitted estimator is echoed as the cell output.
logreg = LogisticRegression(solver='liblinear').fit(X, Y2)
logreg

In [None]:
# 10-fold cross-validation scores of the logit model on the over_150 target.
cross_val_score(logreg, X, y=Y2, cv=10)

In [None]:
# Refit the logit model on the training split and label the held-out rows.
y_pred6 = logreg.fit(Xtrain2, ytrain2).predict(Xtest2)

In [None]:
# ROC / AUC for logistic regression on the over_150 test split.
# Uses the predicted probability of class 1 instead of hard 0/1 labels, which
# would collapse the curve to a single operating point.
# Assumes `logreg` was last fitted on Xtrain2/ytrain2 (the preceding cell).
pos_col6 = list(logreg.classes_).index(1)  # column of predict_proba for label 1
y_score6 = logreg.predict_proba(Xtest2)[:, pos_col6]
false_positive_rate6, true_positive_rate6, thresholds6 = roc_curve(ytest2, y_score6, pos_label=1)
rates6 = pd.DataFrame(dict(fpr6=false_positive_rate6, tpr6=true_positive_rate6))
roc_auc6 = auc(rates6['fpr6'], rates6['tpr6'])
# Label corrected: this cell reports the logit model, not an SVC.
print('AUC Logit: ', roc_auc6)

In [None]:
# KNN: mean 10-fold CV accuracy for each neighbourhood size k = 1..19
# on the over_150 target; results are collected in k_scores (same order as k_range).
k_range = range(1, 20)
k_scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X, Y2, scoring='accuracy', cv=10)
    k_scores.append(np.mean(scores))

In [None]:
# 7-nearest-neighbours classifier trained on the training split,
# then used to label the held-out rows.
knn = KNeighborsClassifier(n_neighbors=7).fit(Xtrain2, ytrain2)
y_pred2 = knn.predict(Xtest2)

In [None]:
# ROC / AUC for KNN on the over_150 test split.
# Uses the predicted probability of class 1 (share of neighbours voting 1)
# instead of hard 0/1 labels, which would collapse the curve to a single
# operating point.
# Assumes `knn` was last fitted on Xtrain2/ytrain2 (the preceding cell).
pos_col2 = list(knn.classes_).index(1)  # column of predict_proba for label 1
y_score2 = knn.predict_proba(Xtest2)[:, pos_col2]
false_positive_rate2, true_positive_rate2, thresholds2 = roc_curve(ytest2, y_score2, pos_label=1)
rates2 = pd.DataFrame(dict(fpr2=false_positive_rate2, tpr2=true_positive_rate2))
roc_auc2 = auc(rates2['fpr2'], rates2['tpr2'])
# Label corrected: this cell reports KNN, not an SVC.
print('AUC KNN: ', roc_auc2)

In [None]:
# ROC curves for the over_150 target, one line per classifier, with the AUC in
# the legend. Colour is given only through `color=`: the previous extra fmt
# string 'b' also specified a colour, which matplotlib flags as a redundant
# definition and overrides with the keyword anyway.
plt.plot(rates4.fpr4, rates4.tpr4, color='red', label='Naive Bayes = %0.2f' % roc_auc4)
plt.plot(rates2.fpr2, rates2.tpr2, color='grey', label='KNN = %0.2f' % roc_auc2)
plt.plot(rates5.fpr5, rates5.tpr5, color='cyan', label='Decision Tree = %0.2f' % roc_auc5)
plt.plot(rates6.fpr6, rates6.tpr6, color='blue', label='Logit = %0.2f' % roc_auc6)
# Diagonal reference line (TPR == FPR).
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.title('Pdox Model: Receiver Operating Characteristic')
plt.legend(loc='lower right')
# Save before show(): the figure is written to ROC.png in the working directory.
plt.savefig('ROC.png')
plt.show()

# over90

In [None]:
# Fresh Gaussian Naive Bayes fitted on the full data set for the over_90 target;
# the fitted estimator is echoed as the cell output.
bayes = GaussianNB().fit(X, Y1)
bayes

In [None]:
# Split into `training_size` training rows and the remaining rows as a test set
# for the over_90 target. random_state pins the shuffle so the split, and every
# metric computed from it, is the same on each Restart & Run All.
training_size = 1700
Xtrain1, Xtest1, ytrain1, ytest1 = train_test_split(
    X, Y1,
    train_size=training_size,
    test_size=X.shape[0] - training_size,
    random_state=42,
)
# Refit Naive Bayes on the training rows only and label the held-out rows.
bayes.fit(Xtrain1, ytrain1)
y_pred4 = bayes.predict(Xtest1)

In [None]:
# ROC / AUC for Naive Bayes on the over_90 test split.
# Uses the predicted probability of class 1 instead of hard 0/1 labels, which
# would collapse the curve to a single operating point.
# Assumes `bayes` was last fitted on Xtrain1/ytrain1 (the preceding cell).
pos_col4 = list(bayes.classes_).index(1)  # column of predict_proba for label 1
y_score4 = bayes.predict_proba(Xtest1)[:, pos_col4]
false_positive_rate4, true_positive_rate4, thresholds4 = roc_curve(ytest1, y_score4, pos_label=1)
rates4 = pd.DataFrame(dict(fpr4=false_positive_rate4, tpr4=true_positive_rate4))
roc_auc4 = auc(rates4['fpr4'], rates4['tpr4'])
# Label corrected: this cell reports Naive Bayes, not an SVC.
print('AUC Naive Bayes: ', roc_auc4)

In [None]:
# New decision tree classifier (default hyper-parameters) for the over_90 target.
# random_state fixes the tie-breaking between equally good splits so the fitted
# tree (and its scores) are reproducible across runs.
dt = tree.DecisionTreeClassifier(random_state=42)

In [None]:
# Fit the decision tree on the full data set for the over_90 target.
dt.fit(X, y=Y1)

In [None]:
# 10-fold cross-validation scores of the decision tree on the over_90 target.
cross_val_score(dt, X, y=Y1, cv=10)

In [None]:
# Refit the tree on the over_90 training split and label the held-out rows
# (`fit` returns the estimator, so the calls can be chained).
y_pred5 = dt.fit(Xtrain1, ytrain1).predict(Xtest1)

In [None]:
# ROC / AUC for the decision tree on the over_90 test split.
# Uses the predicted probability of class 1 instead of hard 0/1 labels, which
# would collapse the curve to a single operating point.
# Assumes `dt` was last fitted on Xtrain1/ytrain1 (the preceding cell).
pos_col5 = list(dt.classes_).index(1)  # column of predict_proba for label 1
y_score5 = dt.predict_proba(Xtest1)[:, pos_col5]
false_positive_rate5, true_positive_rate5, thresholds5 = roc_curve(ytest1, y_score5, pos_label=1)
rates5 = pd.DataFrame(dict(fpr5=false_positive_rate5, tpr5=true_positive_rate5))
roc_auc5 = auc(rates5['fpr5'], rates5['tpr5'])
# Label corrected: this cell reports the decision tree, not an SVC.
print('AUC Decision Tree: ', roc_auc5)

In [None]:
# Logistic regression (liblinear solver) fitted on the full data set for the
# over_90 target; the fitted estimator is echoed as the cell output.
logreg = LogisticRegression(solver='liblinear').fit(X, Y1)
logreg

In [None]:
# 10-fold cross-validation scores of the logit model on the over_90 target.
cross_val_score(logreg, X, y=Y1, cv=10)

In [None]:
# Refit the logit model on the over_90 training split and label the held-out rows.
y_pred6 = logreg.fit(Xtrain1, ytrain1).predict(Xtest1)

In [None]:
# ROC / AUC for logistic regression on the over_90 test split.
# Uses the predicted probability of class 1 instead of hard 0/1 labels, which
# would collapse the curve to a single operating point.
# Assumes `logreg` was last fitted on Xtrain1/ytrain1 (the preceding cell).
pos_col6 = list(logreg.classes_).index(1)  # column of predict_proba for label 1
y_score6 = logreg.predict_proba(Xtest1)[:, pos_col6]
false_positive_rate6, true_positive_rate6, thresholds6 = roc_curve(ytest1, y_score6, pos_label=1)
rates6 = pd.DataFrame(dict(fpr6=false_positive_rate6, tpr6=true_positive_rate6))
roc_auc6 = auc(rates6['fpr6'], rates6['tpr6'])
# Label corrected: this cell reports the logit model, not an SVC.
print('AUC Logit: ', roc_auc6)

In [None]:
# KNN: mean 10-fold CV accuracy for each neighbourhood size k = 1..19
# on the over_90 target; results are collected in k_scores (same order as k_range).
k_range = range(1, 20)
k_scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X, Y1, scoring='accuracy', cv=10)
    k_scores.append(np.mean(scores))

In [None]:
# 5-nearest-neighbours classifier trained on the over_90 training split,
# then used to label the held-out rows.
knn = KNeighborsClassifier(n_neighbors=5).fit(Xtrain1, ytrain1)
y_pred2 = knn.predict(Xtest1)

In [None]:
# ROC / AUC for KNN on the over_90 test split.
# Uses the predicted probability of class 1 (share of neighbours voting 1)
# instead of hard 0/1 labels, which would collapse the curve to a single
# operating point.
# Assumes `knn` was last fitted on Xtrain1/ytrain1 (the preceding cell).
pos_col2 = list(knn.classes_).index(1)  # column of predict_proba for label 1
y_score2 = knn.predict_proba(Xtest1)[:, pos_col2]
false_positive_rate2, true_positive_rate2, thresholds2 = roc_curve(ytest1, y_score2, pos_label=1)
rates2 = pd.DataFrame(dict(fpr2=false_positive_rate2, tpr2=true_positive_rate2))
roc_auc2 = auc(rates2['fpr2'], rates2['tpr2'])
# Label corrected: this cell reports KNN, not an SVC.
print('AUC KNN: ', roc_auc2)

In [None]:
# ROC curves for the over_90 target, one line per classifier, with the AUC in
# the legend. Colour is given only through `color=`: the previous extra fmt
# string 'b' also specified a colour, which matplotlib flags as a redundant
# definition and overrides with the keyword anyway.
plt.plot(rates4.fpr4, rates4.tpr4, color='red', label='Naive Bayes = %0.2f' % roc_auc4)
plt.plot(rates2.fpr2, rates2.tpr2, color='grey', label='KNN = %0.2f' % roc_auc2)
plt.plot(rates5.fpr5, rates5.tpr5, color='cyan', label='Decision Tree = %0.2f' % roc_auc5)
plt.plot(rates6.fpr6, rates6.tpr6, color='blue', label='Logit = %0.2f' % roc_auc6)
# Diagonal reference line (TPR == FPR).
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.title('Pdox Model: Receiver Operating Characteristic')
plt.legend(loc='lower right')
plt.show()