### Performing Naive Bayes and SVM analysis on the Kickstarter data ###

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import pandas_profiling
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import log_loss
from sklearn.metrics import precision_recall_curve
from sklearn import naive_bayes
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.svm import LinearSVC, SVC

plt.style.use('ggplot')

In [2]:
kickstarter = pd.read_csv('kickstarter_04_25_19_ready_for_modeling.csv')

In [3]:
kickstarter.columns

Index(['id', 'slug', 'state_successful', 'goal', 'staff_pick_True',
       'cam_length', 'blurb_length', 'country_US', 'main_category_comics',
       'main_category_crafts', 'main_category_dance', 'main_category_design',
       'main_category_fashion', 'main_category_film & video',
       'main_category_food', 'main_category_games', 'main_category_journalism',
       'main_category_music', 'main_category_photography',
       'main_category_publishing', 'main_category_technology',
       'main_category_theater', 'location', 'child_category'],
      dtype='object')

In [4]:
X = kickstarter[['goal', 'staff_pick_True',
       'cam_length', 'blurb_length', 'country_US', 'main_category_comics',
       'main_category_crafts', 'main_category_dance', 'main_category_design',
       'main_category_fashion', 'main_category_film & video',
       'main_category_food', 'main_category_games', 'main_category_journalism',
       'main_category_music', 'main_category_photography',
       'main_category_publishing', 'main_category_technology',
       'main_category_theater']]
y = kickstarter['state_successful']

In [5]:
X_mid, X_test_sc, y_mid, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify = y)

In [6]:
X_train_sc, X_val_sc, y_train, y_val = train_test_split(X_mid, y_mid, test_size=0.25, random_state=42, stratify = y_mid)

In [7]:
scaler = StandardScaler()
scaler.fit(X_train_sc)
X_train = scaler.transform(X_train_sc)
X_val = scaler.transform(X_val_sc)
X_test = scaler.transform(X_test_sc)

  return self.partial_fit(X, y)
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  """


In [8]:
var_smoothing = [1e-9, 1e-8]
param_grid = dict(var_smoothing = var_smoothing)
gaussian = naive_bayes.GaussianNB()
gaussian_grid = GridSearchCV(gaussian, param_grid, cv=5, scoring='roc_auc', verbose=1, n_jobs=-1)
gaussian_grid.fit(X_train, y_train)
gaussian_grid_predictions = gaussian_grid.predict(X_val)
best_params = gaussian_grid.best_params_
best_estimator = gaussian_grid.best_estimator_
best_cm = confusion_matrix(y_val,gaussian_grid_predictions)
best_cr = classification_report(y_val,gaussian_grid_predictions)
print(best_params, best_estimator, best_cm, best_cr)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    3.7s finished


{'var_smoothing': 1e-08} GaussianNB(priors=None, var_smoothing=1e-08) [[ 8234 10752]
 [ 3018 19406]]               precision    recall  f1-score   support

           0       0.73      0.43      0.54     18986
           1       0.64      0.87      0.74     22424

   micro avg       0.67      0.67      0.67     41410
   macro avg       0.69      0.65      0.64     41410
weighted avg       0.68      0.67      0.65     41410



In [9]:
print('Best ROC_AUC for Gaussian: %.3f', gaussian_grid.best_score_)

Best ROC_AUC for Gaussian: %.3f 0.7295262012282722


In [11]:
from sklearn.metrics import roc_auc_score, roc_curve

roc_curve(y_test,gaussian_grid_predictions)

(array([0.        , 0.72916886, 1.        ]),
 array([0.        , 0.72752408, 1.        ]),
 array([2, 1, 0]))

In [13]:
print("Accuracy: %.3f"% accuracy_score(y_val, gaussian_grid_predictions))
print(classification_report(y_val, gaussian_grid_predictions))

Accuracy: 0.667
              precision    recall  f1-score   support

           0       0.73      0.43      0.54     18986
           1       0.64      0.87      0.74     22424

   micro avg       0.67      0.67      0.67     41410
   macro avg       0.69      0.65      0.64     41410
weighted avg       0.68      0.67      0.65     41410



In [14]:
alpha = [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 1, 2, 3, 4, 5, 10, 100]
param_grid = dict(alpha = alpha)
bernoulli = naive_bayes.BernoulliNB()
bernoulli_grid = GridSearchCV(bernoulli, param_grid, cv=5, scoring='roc_auc', verbose=2, n_jobs=-1)
bernoulli_grid.fit(X_train, y_train)
bernoulli_grid_predictions = bernoulli_grid.predict(X_val)
best_params = bernoulli_grid.best_params_
best_estimator = bernoulli_grid.best_estimator_
best_cm = confusion_matrix(y_val,bernoulli_grid_predictions)
best_cr = classification_report(y_val,bernoulli_grid_predictions)
print(best_params, best_estimator, best_cm, best_cr)

Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    2.9s


{'alpha': 0.2} BernoulliNB(alpha=0.2, binarize=0.0, class_prior=None, fit_prior=True) [[10367  8619]
 [ 4494 17930]]               precision    recall  f1-score   support

           0       0.70      0.55      0.61     18986
           1       0.68      0.80      0.73     22424

   micro avg       0.68      0.68      0.68     41410
   macro avg       0.69      0.67      0.67     41410
weighted avg       0.69      0.68      0.68     41410



[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed:    6.2s finished


In [15]:
print('Best ROC_AUC for Bernoulli: %.3f', bernoulli_grid.best_score_)
print(roc_auc_score(y_val,bernoulli_grid_predictions))
print(roc_curve(y_val,bernoulli_grid_predictions))
print("Accuracy: %.3f"% accuracy_score(y_val, bernoulli_grid_predictions))

Best ROC_AUC for Bernoulli: %.3f 0.7438362601487478
0.6728118225123276
(array([0.        , 0.45396608, 1.        ]), array([0.        , 0.79958973, 1.        ]), array([2, 1, 0]))
Accuracy: 0.683
