In [None]:
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt

In [None]:
# Read the cleaned, encoded project CSV into a DataFrame and preview
# its first rows.
csv_path = '~/Google/19Spring/ANLY501ASL/GroupProject/data/cleaned_encode.csv'
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Build the regression inputs: feature matrix X and continuous target y.
# The four columns listed here must exist; drop() raises if one is missing.
non_feature_cols = ["elapsed_workdays", "pdox_b1_id", "perm_id", "elapsed_workdays_20"]
X = data.drop(columns=non_feature_cols)
# A later cell adds the derived label `elapsed_workdays_30` to `data` in place.
# If this cell is re-run after that, the label would otherwise end up in X and
# leak the target into the features. Drop it when present; on a fresh run the
# column does not exist yet and errors="ignore" makes this a no-op.
X = X.drop(columns=["elapsed_workdays_30"], errors="ignore")
y = data["elapsed_workdays"]

## Run regression model

### OLS

In [None]:
# Ordinary least squares on the full data: print the fitted intercept and
# coefficients, then show the mean score over 10 cross-validation folds.
OLS = linear_model.LinearRegression().fit(X, y)
print("sklearn.linear_model", OLS.intercept_, OLS.coef_, sep="\n")
ols_cv_scores = cross_val_score(OLS, X, y, cv=10)
ols_cv_scores.mean()

### KNN Regressor

In [None]:
# Sweep k = 1..193 for the KNN regressor and keep the mean 10-fold
# cross-validation score for each k.
# Upper bound follows the author's rule of thumb: k up to about
# sqrt(number of observations).
k_range = range(1, 194)
k_scores = [
    cross_val_score(KNeighborsRegressor(n_neighbors=n_neighbors), X, y, cv=10).mean()
    for n_neighbors in k_range
]

In [None]:
# Plot the mean cross-validated score against k for the KNN regressor and
# save the figure next to the notebook.
# The scores come from cross_val_score called without `scoring` on a
# regressor, i.e. the estimator's default score (R^2) rather than a
# classification accuracy, so the y-axis is labelled as R^2.
fig, ax = plt.subplots()
ax.plot(k_range, k_scores)
ax.set_xlabel('Value of K for KNN')
ax.set_ylabel('Cross-Validation R^2')
fig.savefig('Choosing K for KNN Regressor.png')
plt.show()

In [None]:
# Zoom in on k = 5..14 for the KNN regressor and record the mean 10-fold
# cross-validation score for each k.
k_range = range(5, 15)
k_scores = []  # the original line had a stray "89" after the list literal (syntax error)
for k in k_range:
    knn = KNeighborsRegressor(n_neighbors=k)
    scores = cross_val_score(knn, X, y, cv=10)
    k_scores.append(scores.mean())

In [None]:
# Plot the mean cross-validated score against k for the narrowed range.
# The values are the regressor's default cross_val_score metric (R^2),
# not an accuracy, so the y-axis is labelled as R^2.
fig, ax = plt.subplots()
ax.plot(k_range, k_scores)
ax.set_xlabel('Value of K for KNN')
ax.set_ylabel('Cross-Validation R^2')
plt.show()

In [None]:
# Selected model: KNN regressor with k = 11. Fit it on all rows, then show
# its mean score over 10 cross-validation folds.
k11 = KNeighborsRegressor(n_neighbors=11).fit(X, y)
k11_cv_scores = cross_val_score(k11, X, y, cv=10)
k11_cv_scores.mean()

## Run classification model (over 30)

In [None]:
# Derive the binary classification target: 1 where elapsed_workdays is
# greater than 30, otherwise 0. The new column is stored on `data` and
# then used as y.
exceeds_30 = data['elapsed_workdays'] > 30
data['elapsed_workdays_30'] = exceeds_30.astype('int64')
print(data['elapsed_workdays_30'].head(10))
y = data["elapsed_workdays_30"]

### Logit

In [None]:
# Fit a liblinear logistic regression on the full data and report its
# in-sample accuracy (predictions are made on the same rows used to fit).
logit = LogisticRegression(solver='liblinear').fit(X, y)
y_predict = logit.predict(X)
accuracy_score(y_true=y, y_pred=y_predict)

In [None]:
# Display the coefficients of the fitted logistic regression model `logit`.
logit.coef_

In [None]:
# 10-fold cross-validation of a fresh liblinear logistic regression:
# print the per-fold scores, then show their mean.
logit = LogisticRegression(solver='liblinear')
score_logit = cross_val_score(estimator=logit, X=X, y=y, cv=10)
print(score_logit)
score_logit.mean()

### KNN Classifier

In [None]:
# Sweep k = 1..193 for the KNN classifier and keep the mean 10-fold
# cross-validated accuracy for each k.
# Upper bound follows the author's rule of thumb: k up to about
# sqrt(number of observations).
k_range = range(1, 194)
k_scores = [
    cross_val_score(
        KNeighborsClassifier(n_neighbors=n_neighbors), X, y, cv=10, scoring='accuracy'
    ).mean()
    for n_neighbors in k_range
]

In [None]:
# Plot mean cross-validated accuracy against k for the KNN classifier
# and save the figure next to the notebook.
fig, ax = plt.subplots()
ax.plot(k_range, k_scores)
ax.set_xlabel('Value of K for KNN')
ax.set_ylabel('Cross-Validation Accuracy')
fig.savefig('Choosing K for KNN Classifier (over 30).png')
plt.show()

In [None]:
# Zoom in on k = 30..40 for the KNN classifier and record the mean 10-fold
# cross-validated accuracy for each k.
# The upper bound is 41 (exclusive) so that k = 40, the value selected in
# the following cell, is actually evaluated; range(30, 40) stopped at 39.
k_range = range(30, 41)
k_scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
    k_scores.append(scores.mean())

In [None]:
# Plot mean cross-validated accuracy against k for the narrowed range.
fig, ax = plt.subplots()
ax.plot(k_range, k_scores)
ax.set_xlabel('Value of K for KNN')
ax.set_ylabel('Cross-Validation Accuracy')
plt.show()

In [None]:
# Selected model: KNN classifier with k = 40. Fit on the full data and
# report in-sample accuracy (predictions on the same rows used to fit).
k40 = KNeighborsClassifier(n_neighbors=40).fit(X, y)
y_k40_predict = k40.predict(X)
accuracy_score(y_true=y, y_pred=y_k40_predict)

In [None]:
# 10-fold cross-validation of a fresh KNN classifier with k = 40:
# print the per-fold scores, then show their mean.
k40 = KNeighborsClassifier(n_neighbors=40)
score_k40 = cross_val_score(estimator=k40, X=X, y=y, cv=10)
print(score_k40)
score_k40.mean()

### Naive Bayes

In [None]:
# Gaussian Naive Bayes: fit on the full data, then show the mean score
# over 10 cross-validation folds.
nb = GaussianNB().fit(X, y)
nb_cv_scores = cross_val_score(nb, X, y, cv=10)
nb_cv_scores.mean()

## Run classification model (over 20)

In [None]:
# Rebuild the inputs for the "over 20" classification task: X excludes the
# raw outcome, the two *_id columns and both thresholded outcome columns;
# y is the existing elapsed_workdays_20 column.
excluded_columns = ["elapsed_workdays","pdox_b1_id","perm_id","elapsed_workdays_30","elapsed_workdays_20"]
X = data.drop(columns=excluded_columns)
y = data["elapsed_workdays_20"]

In [None]:
# Display the column labels of X to check which features remain.
X.columns

### Logit

In [None]:
# Fit a liblinear logistic regression on the full data and report its
# in-sample accuracy (predictions are made on the same rows used to fit).
logit = LogisticRegression(solver='liblinear').fit(X, y)
y_predict = logit.predict(X)
accuracy_score(y_true=y, y_pred=y_predict)

In [None]:
# 10-fold cross-validation of a fresh liblinear logistic regression:
# print the per-fold scores, then show their mean.
logit = LogisticRegression(solver='liblinear')
score_logit = cross_val_score(estimator=logit, X=X, y=y, cv=10)
print(score_logit)
score_logit.mean()

### KNN Classifier

In [None]:
# Sweep k = 1..193 for the KNN classifier and keep the mean 10-fold
# cross-validated accuracy for each k.
# Upper bound follows the author's rule of thumb: k up to about
# sqrt(number of observations).
def _mean_cv_accuracy(n_neighbors):
    """Return the mean 10-fold CV accuracy of a KNN classifier with the given k."""
    candidate = KNeighborsClassifier(n_neighbors=n_neighbors)
    fold_scores = cross_val_score(candidate, X, y, cv=10, scoring='accuracy')
    return fold_scores.mean()


k_range = range(1, 194)
k_scores = list(map(_mean_cv_accuracy, k_range))

In [None]:
# Plot mean cross-validated accuracy against k for the KNN classifier
# and save the figure next to the notebook.
fig, ax = plt.subplots()
ax.plot(k_range, k_scores)
ax.set_xlabel('Value of K for KNN')
ax.set_ylabel('Cross-Validation Accuracy')
fig.savefig('Choosing K for KNN Classifier (over 20).png')
plt.show()

In [None]:
# Zoom in on k = 70..80 for the KNN classifier and keep the mean 10-fold
# cross-validated accuracy for each k.
k_range = range(70, 81)
k_scores = [
    cross_val_score(
        KNeighborsClassifier(n_neighbors=n_neighbors), X, y, cv=10, scoring='accuracy'
    ).mean()
    for n_neighbors in k_range
]

In [None]:
# Plot mean cross-validated accuracy against k for the narrowed range.
fig, ax = plt.subplots()
ax.plot(k_range, k_scores)
ax.set_xlabel('Value of K for KNN')
ax.set_ylabel('Cross-Validation Accuracy')
plt.show()

In [None]:
# Selected model: KNN classifier with k = 80. Fit on the full data and
# report in-sample accuracy (predictions on the same rows used to fit).
k80 = KNeighborsClassifier(n_neighbors=80).fit(X, y)
y_k80_predict = k80.predict(X)
accuracy_score(y_true=y, y_pred=y_k80_predict)

In [None]:
# 10-fold cross-validation of a fresh KNN classifier with k = 80:
# print the per-fold scores, then show their mean.
k80 = KNeighborsClassifier(n_neighbors=80)
score_k80 = cross_val_score(estimator=k80, X=X, y=y, cv=10)
print(score_k80)
score_k80.mean()

### Naive Bayes

In [None]:
# Gaussian Naive Bayes: fit on the full data, then show the mean score
# over 10 cross-validation folds.
nb = GaussianNB().fit(X, y)
nb_cv_scores = cross_val_score(nb, X, y, cv=10)
nb_cv_scores.mean()