### Running Logistic Regression analysis on the Kickstarter data ###

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import pandas_profiling
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import log_loss
from sklearn.metrics import precision_recall_curve


plt.style.use('ggplot')

In [2]:
kickstarter = pd.read_csv('kickstarter_04_25_19_ready_for_modeling.csv')

In [3]:
kickstarter.columns

Index(['id', 'slug', 'state_successful', 'goal', 'staff_pick_True',
       'cam_length', 'blurb_length', 'country_US', 'main_category_comics',
       'main_category_crafts', 'main_category_dance', 'main_category_design',
       'main_category_fashion', 'main_category_film & video',
       'main_category_food', 'main_category_games', 'main_category_journalism',
       'main_category_music', 'main_category_photography',
       'main_category_publishing', 'main_category_technology',
       'main_category_theater', 'location', 'child_category'],
      dtype='object')

In [4]:
X = kickstarter[['goal', 'staff_pick_True',
       'cam_length', 'blurb_length', 'country_US', 'main_category_comics',
       'main_category_crafts', 'main_category_dance', 'main_category_design',
       'main_category_fashion', 'main_category_film & video',
       'main_category_food', 'main_category_games', 'main_category_journalism',
       'main_category_music', 'main_category_photography',
       'main_category_publishing', 'main_category_technology',
       'main_category_theater']]
y = kickstarter['state_successful']

In [5]:
X_mid, X_test_sc, y_mid, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify = y)

In [6]:
X_train_sc, X_val_sc, y_train, y_val = train_test_split(X_mid, y_mid, test_size=0.25, random_state=42, stratify = y_mid)

In [7]:
scaler = StandardScaler()
scaler.fit(X_train_sc)
X_train = scaler.transform(X_train_sc)
X_val = scaler.transform(X_val_sc)
X_test = scaler.transform(X_test_sc)

  return self.partial_fit(X, y)
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  """


In [8]:
penalty = ['l1', 'l2']
C = np.logspace(0, 4, 10)
param_grid = dict(C=C, penalty=penalty)

logistic = LogisticRegression()
logistic_grid = GridSearchCV(logistic, param_grid, cv=5, scoring='roc_auc', verbose=1, n_jobs=-1)
logistic_grid.fit(X_train, y_train)
grid_predictions = logistic_grid.predict(X_val)
best_params = logistic_grid.best_params_
best_estimator = logistic_grid.best_estimator_
best_cm = confusion_matrix(y_val,grid_predictions)
best_cr = classification_report(y_val,grid_predictions)
print(best_params, best_estimator, best_cm, best_cr)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   14.6s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   30.2s finished


{'C': 3593.813663804626, 'penalty': 'l1'} LogisticRegression(C=3593.813663804626, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='warn', n_jobs=None, penalty='l1', random_state=None,
          solver='warn', tol=0.0001, verbose=0, warm_start=False) [[11338  7648]
 [ 5026 17398]]               precision    recall  f1-score   support

           0       0.69      0.60      0.64     18986
           1       0.69      0.78      0.73     22424

   micro avg       0.69      0.69      0.69     41410
   macro avg       0.69      0.69      0.69     41410
weighted avg       0.69      0.69      0.69     41410



In [9]:
print('Best ROC_AUC for logit: %.3f', logistic_grid.best_score_)

Best ROC_AUC for logit: %.3f 0.7558716780819794


In [10]:
logistic_grid.best_estimator_.coef_

array([[-8.62782093,  0.73570116, -0.23991466, -0.03516481,  0.13452199,
         0.19252455, -0.12613439,  0.0906861 ,  0.10193355, -0.01273792,
         0.18078175, -0.12604507,  0.15117183, -0.2451385 ,  0.24055556,
        -0.03608308,  0.20219921, -0.23105023,  0.22721966]])