### Performing Naive Bayes and SVM analysis on the Kickstarter data ###

In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import pandas_profiling
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import log_loss
from sklearn.metrics import precision_recall_curve
from sklearn import naive_bayes
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.svm import LinearSVC, SVC

plt.style.use('ggplot')

In [2]:
kickstarter = pd.read_csv('kickstarter_04_25_19_ready_for_modeling.csv')

In [3]:
X = kickstarter[['goal', 'staff_pick_True',
       'cam_length', 'blurb_length', 'country_US', 'main_category_comics',
       'main_category_crafts', 'main_category_dance', 'main_category_design',
       'main_category_fashion', 'main_category_film & video',
       'main_category_food', 'main_category_games', 'main_category_journalism',
       'main_category_music', 'main_category_photography',
       'main_category_publishing', 'main_category_technology',
       'main_category_theater']]
y = kickstarter['state_successful']

In [None]:
sns.pairplot(kickstarter, hue = 'state_successful')

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.30, random_state = 101)
print(X_train.shape, y_train.shape)
print(X_test.shape,y_test.shape)

(144934, 19) (144934,)
(62115, 19) (62115,)


In [5]:
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

  return self.partial_fit(X, y)
  
  This is separate from the ipykernel package so we can avoid doing imports until


In [8]:
#var_smoothing = [1e-9, 1e-8]
#param_grid = dict(var_smoothing = var_smoothing)
gaussian = naive_bayes.GaussianNB()
gaussian_grid = GridSearchCV(gaussian, param_grid, cv=5, scoring='roc_auc', verbose=1, n_jobs=-1)
gaussian_grid.fit(X_train_scaled, y_train)
gaussian_grid_predictions = gaussian_grid.predict(X_test_scaled)
best_params = gaussian_grid.best_params_
best_estimator = gaussian_grid.best_estimator_
best_cm = confusion_matrix(y_test,gaussian_grid_predictions)
best_cr = classification_report(y_test,gaussian_grid_predictions)
print(best_params, best_estimator, best_cm, best_cr)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


{'var_smoothing': 1e-09} GaussianNB(priors=None, var_smoothing=1e-09) [[13342 15003]
 [ 6007 27763]]               precision    recall  f1-score   support

           0       0.69      0.47      0.56     28345
           1       0.65      0.82      0.73     33770

   micro avg       0.66      0.66      0.66     62115
   macro avg       0.67      0.65      0.64     62115
weighted avg       0.67      0.66      0.65     62115



[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    3.8s finished


In [16]:
print('Best ROC_AUC for Gaussian: %.3f', gaussian_grid.best_score_)

Best ROC_AUC for Gaussian: %.3f 0.7275681167391118


In [11]:
roc_auc_score(y_test,gaussian_grid_predictions)

0.6464102624641713

In [14]:
from sklearn.metrics import roc_auc_score, roc_curve

roc_curve(y_test,gaussian_grid_predictions)

(array([0.       , 0.5292997, 1.       ]),
 array([0.        , 0.82212023, 1.        ]),
 array([2, 1, 0]))

In [15]:
print("Accuracy: %.3f"% accuracy_score(y_test, gaussian_grid_predictions))
print(classification_report(y_test, gaussian_grid_predictions))

Accuracy: 0.662
              precision    recall  f1-score   support

           0       0.69      0.47      0.56     28345
           1       0.65      0.82      0.73     33770

   micro avg       0.66      0.66      0.66     62115
   macro avg       0.67      0.65      0.64     62115
weighted avg       0.67      0.66      0.65     62115



In [17]:
#alpha = [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 1, 2, 3, 4, 5, 10, 100]
#param_grid = dict(alpha = alpha)
bernoulli = naive_bayes.BernoulliNB()
bernoulli_grid = GridSearchCV(bernoulli, param_grid, cv=5, scoring='roc_auc', verbose=1, n_jobs=-1)
bernoulli_grid.fit(X_train_scaled, y_train)
bernoulli_grid_predictions = bernoulli_grid.predict(X_test_scaled)
best_params = bernoulli_grid.best_params_
best_estimator = bernoulli_grid.best_estimator_
best_cm = confusion_matrix(y_test,bernoulli_grid_predictions)
best_cr = classification_report(y_test,bernoulli_grid_predictions)
print(best_params, best_estimator, best_cm, best_cr)

Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed:    9.8s finished


{'alpha': 10} BernoulliNB(alpha=10, binarize=0.0, class_prior=None, fit_prior=True) [[15427 12918]
 [ 7019 26751]]               precision    recall  f1-score   support

           0       0.69      0.54      0.61     28345
           1       0.67      0.79      0.73     33770

   micro avg       0.68      0.68      0.68     62115
   macro avg       0.68      0.67      0.67     62115
weighted avg       0.68      0.68      0.67     62115



In [19]:
print('Best ROC_AUC for Bernoulli: %.3f', bernoulli_grid.best_score_)
print(roc_auc_score(y_test,bernoulli_grid_predictions))
print(roc_curve(y_test,bernoulli_grid_predictions))
print("Accuracy: %.3f"% accuracy_score(y_test, bernoulli_grid_predictions))

Best ROC_AUC for Bernoulli: %.3f 0.7444423610343852
0.6682055224730314
(array([0.        , 0.45574175, 1.        ]), array([0.       , 0.7921528, 1.       ]), array([2, 1, 0]))
Accuracy: 0.679


In [26]:
#alpha = [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 1, 2, 3, 4, 5, 10, 100]
#param_grid = dict(alpha = alpha)
multinomial = naive_bayes.MultinomialNB()
multinomial_grid = GridSearchCV(multinomial, param_grid, cv=5, scoring='roc_auc', verbose=1, n_jobs=-1)
multinomial_grid.fit(X_train, y_train)
multinomial_grid_predictions = multinomial_grid.predict(X_test)
best_params = multinomial_grid.best_params_
best_estimator = multinomial_grid.best_estimator_
best_cm = confusion_matrix(y_test,multinomial_grid_predictions)
best_cr = classification_report(y_test,multinomial_grid_predictions)
print(best_params, best_estimator, best_cm, best_cr)

Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    3.2s


{'alpha': 3} MultinomialNB(alpha=3, class_prior=None, fit_prior=True) [[ 6170 22175]
 [ 2537 31233]]               precision    recall  f1-score   support

           0       0.71      0.22      0.33     28345
           1       0.58      0.92      0.72     33770

   micro avg       0.60      0.60      0.60     62115
   macro avg       0.65      0.57      0.52     62115
weighted avg       0.64      0.60      0.54     62115



[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed:    5.4s finished


In [27]:
print('Best ROC_AUC for Multinomial: %.3f', multinomial_grid.best_score_)
print(roc_auc_score(y_test,multinomial_grid_predictions))
print(roc_curve(y_test,multinomial_grid_predictions))
print("Accuracy: %.3f"% accuracy_score(y_test, multinomial_grid_predictions))

Best ROC_AUC for Multinomial: %.3f 0.5878696417034942
0.5712746118108903
(array([0.        , 0.78232493, 1.        ]), array([0.        , 0.92487415, 1.        ]), array([2, 1, 0]))
Accuracy: 0.602


In [None]:
C = np.logspace(0, 4, 4)
tolerance = [0.001, 0.1, 1]
param_grid = dict(tol = tolerance, C=C)
linear_SVC = LinearSVC()
linear_SVC_grid = GridSearchCV(linear_SVC, param_grid, cv=5, scoring='roc_auc', verbose=1, n_jobs=-1)
linear_SVC_grid.fit(X_train_scaled, y_train)
linear_SVC_grid_predictions = linear_SVC_grid.predict(X_test_scaled)
best_params = linear_SVC_grid.best_params_
best_estimator = linear_SVC_grid.best_estimator_
best_cm = confusion_matrix(y_test,linear_SVC_grid_predictions)
best_cr = classification_report(y_test,linear_SVC_grid_predictions)
print(best_params, best_estimator, best_cm, best_cr)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


In [32]:
print('Best ROC_AUC for Linear SVC: %.3f', linear_SVC_grid.best_score_)
print(roc_auc_score(y_test,linear_SVC_grid_predictions))
print(roc_curve(y_test,linear_SVC_grid_predictions))
print("Accuracy: %.3f"% accuracy_score(y_test, linear_SVC_grid_predictions))

Best ROC_AUC for Linear SVC: %.3f 0.7456592761418844
0.6729914230477899
(array([0.        , 0.41164226, 1.        ]), array([0.        , 0.75762511, 1.        ]), array([2, 1, 0]))
Accuracy: 0.680


In [None]:
C = np.logspace(0, 4, 5)
gamma = [0.001, 0.01, 0.1, 1]
param_grid = dict(C=C, gamma = gamma)
SVC = SVC()
SVC_grid = GridSearchCV(SVC, param_grid, cv=5, scoring='roc_auc', verbose=1, n_jobs=-1)
SVC_grid.fit(X_train_scaled, y_train)
SVC_grid_predictions = SVC_grid.predict(X_test_scaled)
best_params = SVC_grid.best_params_
best_estimator = SVC_grid.best_estimator_
best_cm = confusion_matrix(y_test,SVC_grid_predictions)
best_cr = classification_report(y_test,SVC_grid_predictions)
print(best_params, best_estimator, best_cm, best_cr)

In [None]:
print('Best ROC_AUC for SVC: %.3f', SVC_grid.best_score_)
print(roc_auc_score(y_test,SVC_grid_predictions))
print(roc_curve(y_test,SVC_grid_predictions))
print("Accuracy: %.3f"% accuracy_score(y_test, SVC_grid_predictions))

In [None]:
# play with gamma, kernel choice (other, rbf), and also C values. gamma defines how far the influence of the single training example reaches, low values mean far, high values meaning close.