In [55]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline

In [22]:
# Load the pickled frame and keep only the rows that have no missing values.
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source.
df = pd.read_pickle('kickstarter_analysis.pkl').dropna()

In [23]:
## Columns used by the model
# Predictor columns, one per line so additions and removals diff cleanly.
features = [
    'country',
    'spotlight',
    'staff_pick',
    'img_count',
    'vid_count',
    'usd_goal',
    'description_len',
    'blurb_len',
    'slug_len',
    'med_rewards',
    'category_core',
]

# Target column, kept as a one-element list.
dependent = ['reach_goal']

In [29]:
## Build the design matrix and the target
# get_dummies one-hot encodes the non-numeric predictor columns;
# drop_first=True drops the first level of every encoded column.
X = pd.get_dummies(df[features], drop_first=True)

# Indexing with a list keeps y as a one-column DataFrame.
y = df[dependent]

In [30]:
## Convert to NumPy arrays
# np.asarray yields the same array as `.values` for a DataFrame, but it passes
# an existing ndarray through unchanged. That makes this cell safe to re-run:
# `X.values` raises AttributeError once X is already an array.
X = np.asarray(X)

# Flatten the target to one dimension.
y = np.asarray(y).ravel()

In [56]:
## Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [49]:
## Baseline logistic regression
# fit() returns the estimator, so construction and training are chained.
clf = LogisticRegression(solver='liblinear').fit(X_train, y_train)

# Predict the held-out rows and report plain accuracy.
y_pred = clf.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.7022912774117416


In [50]:
## Manual k-fold cross-validation over the full X / y arrays
# Single source of truth for the fold count: it used to be hard-coded both in
# KFold and in the averaging divisor, so the two could drift apart.
n_folds = 5
clf = LogisticRegression(solver='liblinear')

# KFold is used with its defaults (no shuffling), so each test fold is a
# consecutive slice of rows.
fold_scores = []
for train, test in KFold(n_splits=n_folds).split(X):
    clf.fit(X[train], y[train])
    pred = clf.predict(X[test])
    fold_scores.append(accuracy_score(y[test], pred))

# Mean accuracy over the folds that actually ran.
final = float(np.mean(fold_scores))
print(final)

0.7067635428751593


In [58]:
## Tune the regularisation strength C with a 5-fold grid search
# NOTE(review): the grid steps from 0.001 straight to 0.1 (no 0.01) -- confirm
# that the gap is intentional.
params = {'C': [0.0001, 0.001, 0.1, 1, 10, 100]}
clf = LogisticRegression(solver='liblinear')

# fit() returns the search object itself, so it is assigned in one step.
gs_cv = GridSearchCV(estimator=clf, param_grid=params, cv=5).fit(X_train, y_train)

print(
    'The best parameters are: {} \nand the best score is: {}'.format(
        gs_cv.best_params_,
        gs_cv.best_score_,
    )
)

The best parameters are: {'C': 0.0001} 
and the best score is: 0.7707694123200378
