In [61]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

from sklearn.pipeline import Pipeline

In [62]:
df = pd.read_pickle('kickstarter_analysis.pkl')
df = df.dropna()

In [63]:
## Selecting Features
features=['country','spotlight', 'staff_pick','img_count', 'vid_count',
          'usd_goal', 'description_len', 'blurb_len', 'slug_len', 'med_rewards','category_core']
dependent=['reach_goal']

In [64]:
X=df[features]
y=df[dependent]

## Getting dummy variables
X = pd.get_dummies(X, drop_first = True)

In [65]:
## To an array
X = X.values
y = y.values.ravel()

In [66]:
## split into train test
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                   test_size = 0.3, random_state = 42)

In [67]:
## Initial Model
clf = LogisticRegression(solver='liblinear')
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(accuracy_score(y_test, y_pred))

0.7022912774117416


In [68]:
## Utilizing CV
final = 0
clf = LogisticRegression(solver='liblinear')
for train, test in KFold(n_splits=5).split(X):
    clf.fit(X[train], y[train])
    pred = clf.predict(X[test])
    acc = accuracy_score(y[test], pred)
    final += acc
    
final = final / 5 # n of folds
print(final)

0.7067635428751593


In [80]:
## Utilizing GridSearchCV
params = {'C':[0.0001, 0.001, 0.1, 1, 10, 100]}
clf = LogisticRegression(solver='liblinear')
gs_cv = GridSearchCV(clf, param_grid=params, cv = 5)
gs_cv.fit(X_train, y_train)

print('The best parameters are: {} \nand the best score is: {}'.format(gs_cv.best_params_, gs_cv.best_score_))

The best parameters are: {'C': 0.0001} 
and the best score is: 0.7707694123200378


In [90]:
## Initial Model
iters = 5000
xclf = LogisticRegression(solver='saga', max_iter = iters)
xclf.fit(X_train, y_train)

y_pred = xclf.predict(X_test)

print(accuracy_score(y_test, y_pred))
print(iters)

0.687579899695152
5000




In [91]:
df.describe()

Unnamed: 0,spotlight,staff_pick,usd_pledged,img_count,vid_count,usd_goal,percent_goal,reach_goal,description_len,blurb_len,slug_len,med_rewards,reward_len
count,169481.0,169481.0,169481.0,169481.0,169481.0,169481.0,169481.0,169481.0,169481.0,169481.0,169481.0,169481.0,169481.0
mean,0.562588,0.128115,12079.02,6.766233,0.116644,40888.42,471.1358,0.562588,3189.024192,112.576548,32.272803,127.086065,8.122651
std,0.496069,0.334218,82761.08,11.39726,0.603345,1108715.0,25263.66,0.496069,3139.806469,26.497262,13.665054,415.662633,6.181222
min,0.0,0.0,0.0,0.0,0.0,0.7167626,0.0,0.0,1.0,1.0,1.0,0.0,1.0
25%,0.0,0.0,100.0,0.0,0.0,1500.0,2.0,0.0,1147.0,101.0,21.0,30.0,4.0
50%,1.0,0.0,1475.0,2.0,0.0,5000.0,101.3583,1.0,2205.0,124.0,32.0,56.835,7.0
75%,1.0,0.0,6260.0,9.0,0.0,13219.12,122.9,1.0,4140.0,132.0,46.0,100.0,11.0
max,1.0,1.0,8596475.0,205.0,28.0,152350100.0,6876410.0,1.0,60430.0,196.0,53.0,11201.63,253.0
