### Performing KNN analysis on the Kickstarter data ###

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import pandas_profiling

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn import svm, ensemble, linear_model, neighbors, naive_bayes
from sklearn.metrics import roc_auc_score, roc_curve, auc

# Switch matplotlib to its 'ggplot' style sheet for the figures drawn in this notebook.
plt.style.use('ggplot')

In [2]:
# Read the Kickstarter CSV (described as "ready for modeling" by its file
# name) into a DataFrame. The path is relative, so the file has to sit in the
# notebook's working directory.
kickstarter = pd.read_csv(
    filepath_or_buffer='kickstarter_04_25_19_ready_for_modeling.csv',
)

In [4]:
# Columns used as predictors. The `*_True`, `country_*` and `main_category_*`
# names look like dummy/indicator columns produced upstream -- confirm against
# the data-preparation step.
feature_columns = [
    'goal',
    'staff_pick_True',
    'cam_length',
    'blurb_length',
    'country_US',
    'main_category_comics',
    'main_category_crafts',
    'main_category_dance',
    'main_category_design',
    'main_category_fashion',
    'main_category_film & video',
    'main_category_food',
    'main_category_games',
    'main_category_journalism',
    'main_category_music',
    'main_category_photography',
    'main_category_publishing',
    'main_category_technology',
    'main_category_theater',
]

# Feature matrix (DataFrame restricted to the predictor columns).
X = kickstarter[feature_columns]

# Target column the classifiers are trained to predict.
y = kickstarter['state_successful']

In [5]:
# First split: hold out 20% of the rows as the test partition, stratified on
# the target and seeded for repeatability.
# NOTE: despite the `_sc` suffix, X_test_sc is the *unscaled* frame; the
# StandardScaler cell further down turns it into the scaled X_test.
X_mid, X_test_sc, y_mid, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
    stratify=y,
)

In [6]:
# Second split: take 25% of the remaining rows as the validation partition
# (0.25 of the 80% left after the test hold-out, i.e. 20% of all rows), again
# stratified on the target with the same seed.
# NOTE: X_train_sc / X_val_sc are the *unscaled* frames, despite the suffix.
X_train_sc, X_val_sc, y_train, y_val = train_test_split(
    X_mid,
    y_mid,
    test_size=0.25,
    random_state=42,
    stratify=y_mid,
)

In [7]:
# Standardise the features. The scaler is fitted on the training partition
# only, and those same fitted statistics are then applied to all three
# partitions so nothing from validation/test influences the scaling.
scaler = StandardScaler().fit(X_train_sc)
X_train, X_val, X_test = (
    scaler.transform(partition)
    for partition in (X_train_sc, X_val_sc, X_test_sc)
)

  return self.partial_fit(X, y)
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  """


In [10]:
# Tune the number of neighbours for KNN with an 8-fold cross-validated grid
# search on the scaled training data, selecting by ROC AUC.
ks = [5, 10, 20, 25, 30, 35]
param_grid = [{'n_neighbors': ks}]

# The n_neighbors given here is only a placeholder; the grid search overrides
# it with each candidate from `ks`.
knn = KNeighborsClassifier(n_neighbors=5)
knn_grid = GridSearchCV(knn, param_grid, cv=8, scoring='roc_auc', verbose=1, n_jobs=-1)
knn_grid.fit(X_train, y_train)

# With the default refit=True, predict() uses the best candidate refitted on
# all of X_train.
# NOTE(review): this scores on X_test although a validation split (X_val)
# exists and is what the fixed k=25 model is scored on elsewhere in this
# notebook -- consider evaluating on X_val here and keeping X_test for a
# single final check.
grid_predictions = knn_grid.predict(X_test)

best_params = knn_grid.best_params_
best_estimator = knn_grid.best_estimator_
best_cm = confusion_matrix(y_test, grid_predictions)
best_cr = classification_report(y_test, grid_predictions)

# Print each result on its own line(s). Joining these multi-line objects with
# the default single space ran the confusion matrix into the estimator repr
# and the report header into the matrix's last row.
print(best_params, best_estimator, best_cm, best_cr, sep='\n')

Fitting 8 folds for each of 6 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed: 72.6min finished


{'n_neighbors': 35} KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=35, p=2,
           weights='uniform') [[16914 11431]
 [ 8138 25632]]               precision    recall  f1-score   support

           0       0.68      0.60      0.63     28345
           1       0.69      0.76      0.72     33770

   micro avg       0.68      0.68      0.68     62115
   macro avg       0.68      0.68      0.68     62115
weighted avg       0.68      0.68      0.68     62115



In [19]:
# DISABLED CELL: a narrower follow-up grid search over k = 24 and 25
# (list(range(24, 26))) with 5-fold CV and ROC AUC scoring, reporting the
# confusion matrix and classification report on the test split.
# Every line is commented out, so running this cell does nothing. The output
# stored beneath it comes from an earlier execution -- it logs n_jobs=1 with
# the sequential backend, not the `n_jobs = -1` written below.
#knn = KNeighborsClassifier()
#param_grid = [{'n_neighbors': list(range(24,26))}]
#grid = GridSearchCV(knn,param_grid,refit=True,verbose=4, scoring='roc_auc', cv = 5, n_jobs = -1)
#grid.fit(X_train,y_train)
#grid_predictions = grid.predict(X_test)
#best_params = grid.best_params_
#best_estimator = grid.best_estimator_
#best_cm = confusion_matrix(y_test,grid_predictions)
#best_cr = classification_report(y_test,grid_predictions)
#print(best_params, best_estimator, best_cm, best_cr)

Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV] n_neighbors=24 ..................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ......... n_neighbors=24, score=0.7542461308338198, total= 1.1min
[CV] n_neighbors=24 ..................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  5.1min remaining:    0.0s


[CV] ......... n_neighbors=24, score=0.7546411352950121, total=  52.2s
[CV] n_neighbors=24 ..................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  9.0min remaining:    0.0s


[CV] ......... n_neighbors=24, score=0.7561333741044011, total= 1.1min
[CV] n_neighbors=24 ..................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 13.9min remaining:    0.0s


[CV] ......... n_neighbors=24, score=0.7575519250073043, total=  55.6s
[CV] n_neighbors=24 ..................................................
[CV] ......... n_neighbors=24, score=0.7474287281826297, total=  51.3s
[CV] n_neighbors=25 ..................................................
[CV] ......... n_neighbors=25, score=0.7541927278273894, total= 1.0min
[CV] n_neighbors=25 ..................................................
[CV] ......... n_neighbors=25, score=0.7550050175536506, total=  47.9s
[CV] n_neighbors=25 ..................................................
[CV] ......... n_neighbors=25, score=0.7567518311399236, total= 1.0min
[CV] n_neighbors=25 ..................................................
[CV] ......... n_neighbors=25, score=0.7572033223148901, total=  52.6s
[CV] n_neighbors=25 ..................................................
[CV] ......... n_neighbors=25, score=0.7479259204802873, total=  49.7s


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed: 42.9min finished


{'n_neighbors': 25} KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=25, p=2,
           weights='uniform') [[17093 11252]
 [ 8172 25598]]               precision    recall  f1-score   support

           0       0.68      0.60      0.64     28345
           1       0.69      0.76      0.72     33770

   micro avg       0.69      0.69      0.69     62115
   macro avg       0.69      0.68      0.68     62115
weighted avg       0.69      0.69      0.69     62115



In [9]:
# Fit a KNN classifier with a fixed k of 25 on the scaled training data, then
# report its confusion matrix and per-class precision/recall/F1 on the
# validation split.
# NOTE(review): k is hard-coded rather than read from a grid search's
# best_params_ -- confirm 25 is still the intended choice.
knn_25 = KNeighborsClassifier(n_neighbors=25).fit(X_train, y_train)
predictions = knn_25.predict(X_val)
print(
    confusion_matrix(y_val, predictions),
    classification_report(y_val, predictions),
    sep='\n',
)

[[11501  7485]
 [ 5395 17029]]
              precision    recall  f1-score   support

           0       0.68      0.61      0.64     18986
           1       0.69      0.76      0.73     22424

   micro avg       0.69      0.69      0.69     41410
   macro avg       0.69      0.68      0.68     41410
weighted avg       0.69      0.69      0.69     41410

