# Kickstarter Model

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score, roc_auc_score

from datetime import datetime

Define function for ttraining and testing.

In [2]:
def fittest(X_train, X_test, y_train, y_test, estimator, scaler=None):

    if scaler != None:
        scaler.fit(X_train)
        X_train_scaled = scaler.transform(X_train)
        X_test_scaled = scaler.transform(X_test)
    else:
        X_train_scaled = X_train
        X_test_scaled = X_test

    estimator.fit(X_train_scaled, y_train)
    
    y_pred = estimator.predict(X_test_scaled) 
    y_train_pred = estimator.predict(X_train_scaled) 
    
    return {'cm_test':confusion_matrix(y_test, y_pred), 
            'ac_test':accuracy_score(y_test, y_pred),
            'rc_test':recall_score(y_test, y_pred),
            'pr_test':precision_score(y_test, y_pred),
            'f1_test':f1_score(y_test, y_pred),
            'ra_test':roc_auc_score(y_test, y_pred), 
            'pred':y_pred,
            #'proba':estimator.predict_proba(X_test_scaled),
            'cm_train':confusion_matrix(y_train, y_train_pred), 
            'ac_train':accuracy_score(y_train, y_train_pred),
            'rc_train':recall_score(y_train, y_train_pred),
            'pr_train':precision_score(y_train, y_train_pred),
            'f1_train':f1_score(y_train, y_train_pred),
            'ra_train':roc_auc_score(y_train, y_train_pred)}

Import data.

In [3]:
df = pd.read_csv('../data/kickstarter_cleaned.csv')
print(df.shape)
print(df.columns)

(18124, 11)
Index(['country', 'goal', 'state', 'cat_id', 'location_type', 'location_score',
       'delta_funding', 'name_length', 'name_words', 'projects_successful',
       'projects_failed'],
      dtype='object')


Turn `successful` and `failed` into 1 and 0, respectively, and check feature balance.

In [4]:
df.state.replace({'successful':1, 'failed':0}, inplace=True)
df.state.value_counts()

1    15914
0     2210
Name: state, dtype: int64

Turn datetime columns into datetime format and extract features.

In [5]:
df.created_at = df.created_at.astype('datetime64')
df.launched_at = df.launched_at.astype('datetime64')
df.deadline = df.deadline.astype('datetime64')

df['month_created'] = df['created_at'].dt.month
df['month_launched'] = df['launched_at'].dt.month
df['month_deadline'] = df['deadline'].dt.month

df['delta_public'] = (df['created_at'] - df['launched_at']) / pd.offsets.Day(-1)
df['delta_funding'] = (df['launched_at'] - df['deadline']) / pd.offsets.Day(-1)

df['delta_total'] = ((df['created_at'] - df['deadline']) / pd.offsets.Day(-1)).round(0)

df['delta_public'] = df.delta_public.round(0)
df['delta_funding'] = df.delta_funding.round(0)

AttributeError: 'DataFrame' object has no attribute 'created_at'

Correlations

In [None]:
df[['location_score', 'gender', 'delta_public',
    #'blurb_length', 'blurb_words', 'blurb_?', 'blurb_.', 'blurb_,', 'blurb_;', 'blurb_pm', 'blurb_mwl',
    #'name_length', 'name_words', 
    'projects_successful', 'projects_failed',
    'state']].corr('pearson')

Select features to be used in the model and sort them into numerical, categorical, time. Dummify the categorical features.

In [None]:
feat_num = ['goal', 
            #'blurb_length', 'blurb_words', 'blurb_!', 'blurb_?', 'blurb_.', 'blurb_,', 'blurb_;', 
            'name_length', 'name_words', 
            'projects_successful', 'projects_failed', 'location_score'
           ]
feat_cat = ['country', 'cat_id'#, 'location_state', 'gender'
           ]
feat_time = [#'month_created', 'month_launched', 'month_deadline', 'delta_public', 
             'delta_funding']

X = df[feat_num+feat_cat+feat_time]
y = df.state

X = pd.get_dummies(X, columns=feat_cat, drop_first=True)

Shape

In [None]:
X.shape

(170070, 180)

Train-test split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

Instantiate scalers and classifiers to be studied.

In [None]:
ss = StandardScaler()
mm = MinMaxScaler()
lr = LogisticRegression(C = 10, max_iter=1000, random_state=1)
nb = GaussianNB()
rf = RandomForestClassifier(n_estimators=1000, max_features='sqrt', max_depth=15, min_samples_leaf=10)
ad = AdaBoostClassifier(base_estimator=LogisticRegression(), random_state=1, n_estimators=500, learning_rate=.5)
sv = SVC()

Train and check performance.

In [None]:
start = datetime.now()
lr_ss = fittest(X_train, X_test, y_train, y_test, lr, ss)
print(format(datetime.now()-start))
lr_ss

0:00:03.199896


In [None]:
start = datetime.now()
lr_mm = fittest(X_train, X_test, y_train, y_test, lr, mm)
print(format(datetime.now()-start))
lr_mm

0:00:15.350423


In [None]:
start = datetime.now()
nb_ss = fittest(X_train, X_test, y_train, y_test, nb, ss)
print(format(datetime.now()-start))
nb_ss

0:00:02.450005


In [None]:
start = datetime.now()
nb_mm = fittest(X_train, X_test, y_train, y_test, nb, mm)
print(format(datetime.now()-start))
nb_mm

0:00:02.205504


In [None]:
start = datetime.now()
rf_ss = fittest(X_train, X_test, y_train, y_test, rf, ss)
print(format(datetime.now()-start))
rf_ss

In [None]:
start = datetime.now()
rf_mm = fittest(X_train, X_test, y_train, y_test, rf, mm)
print(format(datetime.now()-start))
rf_mm

In [None]:
start = datetime.now()
ad_ss = fittest(X_train, X_test, y_train, y_test, ad, ss)
print(format(datetime.now()-start))
ad_ss

In [None]:
start = datetime.now()
ad_mm = fittest(X_train, X_test, y_train, y_test, ad, mm)
print(format(datetime.now()-start))
ad_mm

In [None]:
start = datetime.now()
sv_ss = fittest(X_train, X_test, y_train, y_test, sv, ss)
print(format(datetime.now()-start))
sv_ss

In [None]:
start = datetime.now()
sv_mm = fittest(X_train, X_test, y_train, y_test, sv, mm)
print(format(datetime.now()-start))
sv_mm

0:00:00.000069


### Experimental

Try to blend classifiers to improve accuracy of prediction (work in progress)-

In [None]:
#ad_ss_pred = ad_ss['pred']
#ad_mm_pred = ad_mm['pred']
lr_ss_pred = lr_ss['pred']
lr_mm_pred = lr_mm['pred']
#rf_ss_pred = rf_ss['pred']
#rf_mm_pred = rf_mm['pred']
nb_ss_pred = nb_ss['pred']
nb_mm_pred = nb_mm['pred']
#sv_ss_pred = rf_ss['pred']
#sv_mm_pred = rf_mm['pred']

In [None]:
# total coincidence test-pred 
d = len([i for i in range(len(y_test)) if 
               #y_test.iloc[i] != ad_ss_pred[i] and 
               y_test.iloc[i] != lr_ss_pred[i] and 
               y_test.iloc[i] != lr_mm_pred[i] and 
               #y_test.iloc[i] != rf_ss_pred[i] and 
               #y_test.iloc[i] != rf_mm_pred[i] and 
               y_test.iloc[i] != nb_ss_pred[i] 
               and y_test.iloc[i] != nb_mm_pred[i]
              ])
# individual coincidences ss-mm
#d_ad_ss_mm = len([i for i in range(len(y_test)) if ad_ss_pred[i] != ad_mm_pred[i]])
d_lr_ss_mm = len([i for i in range(len(y_test)) if lr_ss_pred[i] != lr_mm_pred[i]])
d_nb_ss_mm = len([i for i in range(len(y_test)) if nb_ss_pred[i] != nb_mm_pred[i]])
#d_rf_ss_mm = len([i for i in range(len(y_test)) if rf_ss_pred[i] != rf_mm_pred[i]])
#d_sv_ss_mm = len([i for i in range(len(y_test)) if sv_ss_pred[i] != sv_mm_pred[i]])

d, d_lr_ss_mm, d_nb_ss_mm

(5143, 134, 28)

In [None]:
blend_proba = (0*lr_ss['proba'][:,1] + 100*ad_ss['proba'][:,1] + nb_ss['proba'][:,1])/101
y_pred_blend = [int(x > .5) for x in blend_proba]

print(confusion_matrix(y_test, y_pred_blend))
print(accuracy_score(y_test, y_pred_blend))

[[18442    58]
 [14420  9598]]
0.6594853944211864


## Observations / Todo

- (Y) add time deltas and keep only `launched_at`
- (N) add blurb length as numerical feature: no improvement
- (Y) add name length and name word count: helps
- (Y) analyze effect of cities: extract state in US
- (Y) Fine categorization performs much better than just category_parent_id
- (N) Categorization performs much better than cat_score
- (?) Compare location --> state with location_score