In [1]:
# Load necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime

# For SVM stuff
import nltk
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from scipy.stats import uniform
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from copy import deepcopy

In [2]:
def load_preprocessed_data(name):
    # TODO: as additional preprocessing is completed, add options here.
    if name == 'basic_name':
        x_train = pd.read_csv("data/preprocess-{}-x-train-correct.csv".format(name), index_col=0)
        y_train = pd.read_csv("data/preprocess-{}-y-train-correct.csv".format(name), squeeze=True)
        x_valid = pd.read_csv("data/preprocess-{}-x-valid-correct.csv".format(name), index_col=0)
        y_valid = pd.read_csv("data/preprocess-{}-y-valid-correct.csv".format(name), squeeze=True)
    elif name == 'base' or name == 'doc2vec':
        x_train = pd.read_csv("data/preprocess-{}-x-train.csv".format(name), index_col=0)
        y_train = pd.read_csv("data/preprocess-{}-y-train.csv".format(name), squeeze=True)
        x_valid = pd.read_csv("data/preprocess-{}-x-valid.csv".format(name), index_col=0)
        y_valid = pd.read_csv("data/preprocess-{}-y-valid.csv".format(name), squeeze=True)
    else:
        raise Exception("No preprocessd data available for that name.")
    return x_train, y_train, x_valid, y_valid

In [3]:
x_train_base, y_train_base, x_valid_base, y_valid_base = load_preprocessed_data('base')
x_train_d2v, y_train_d2v, x_valid_d2v, y_valid_d2v = load_preprocessed_data('doc2vec')
x_train_basic, y_train_basic, x_valid_basic, y_valid_basic = load_preprocessed_data('basic_name')
# TODO: load addtional preprocessed data

In [4]:
print(len(x_train_base))
print(len(y_train_base))
print(y_train_base.head())
x_train_base.head()

249956
249956
0     True
1     True
2    False
3    False
4    False
Name: success, dtype: bool


Unnamed: 0,main_category,category,duration,usd_goal_real,launched_month,deadline_month
0,12,19,-0.316333,-0.036515,4.0,5.0
2,8,136,2.025723,-0.03174,10.0,0.0
4,7,44,-0.272265,-0.032463,7.0,8.0
5,5,31,-0.400095,-0.034948,2.0,3.0
6,6,93,-0.326225,0.00814,0.0,2.0


In [5]:
print(len(x_train_d2v))
print(len(y_train_d2v))
print(y_train_d2v.head())
x_train_d2v.head()

249956
249956
0     True
1     True
2    False
3    False
4    False
Name: success, dtype: bool


Unnamed: 0,main_category,category,duration,usd_goal_real,launched_month,deadline_month,doc2vec_names_0,doc2vec_names_1,doc2vec_names_2,doc2vec_names_3,...,doc2vec_names_10,doc2vec_names_11,doc2vec_names_12,doc2vec_names_13,doc2vec_names_14,doc2vec_names_15,doc2vec_names_16,doc2vec_names_17,doc2vec_names_18,doc2vec_names_19
0,12,19,-0.316333,-0.036515,4.0,5.0,0.009077,-0.029281,-0.014818,-0.017581,...,0.013778,-0.047939,-0.063131,0.015074,-0.003315,-0.072243,0.029401,-0.058894,-0.008851,-0.004987
2,8,136,2.025723,-0.03174,10.0,0.0,-0.015903,0.037791,-0.035279,-0.029695,...,-0.055231,-0.034383,0.003102,0.029496,-0.097409,-0.059037,0.045664,-0.052574,0.034218,0.041212
4,7,44,-0.272265,-0.032463,7.0,8.0,0.018435,0.037622,0.031674,-0.049406,...,-0.048317,0.023729,-0.057103,0.023843,0.044732,-0.049317,-0.029703,-0.100054,0.014423,0.055918
5,5,31,-0.400095,-0.034948,2.0,3.0,0.020175,-0.015965,-0.012979,-0.058172,...,-0.008836,-0.055694,-0.00587,0.016705,-0.025267,0.002448,0.018852,-0.024194,0.007968,0.03114
6,6,93,-0.326225,0.00814,0.0,2.0,0.005605,0.003581,0.019905,-0.063847,...,-0.033168,-0.017154,-0.019477,0.027361,-0.083247,0.00659,-0.020938,-0.052568,-0.026672,0.040495


In [6]:
# Had to rename due to a change in preprocessing. The values are the same, just not the column names.x_train_basic.columns = ['index', 'main_category', 'category', 'duration', 'usd_goal_real', 'launched_month', 'deadline_month', 'num_chars', 'num_unusual_words', 'num_weird_chars']
x_train_basic.set_index('Unnamed: 0.1', drop=True, inplace=True)

print(len(x_train_basic))
print(len(y_train_basic))
print(y_train_basic.head())
x_train_basic.head()

249956
249956
0     True
1     True
2    False
3    False
4    False
Name: 1, dtype: bool


Unnamed: 0_level_0,main_category,category,duration,usd_goal_real,launched_month_deadline_month_0,launched_month_deadline_month_1,num_chars,num_unusual_words,num_weird_chars
Unnamed: 0.1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,12,19,-0.316333,-0.036515,4.0,5.0,56,1,2
2,8,136,2.025723,-0.03174,10.0,0.0,30,2,0
4,7,44,-0.272265,-0.032463,7.0,8.0,60,4,0
5,5,31,-0.400095,-0.034948,2.0,3.0,34,3,1
6,6,93,-0.326225,0.00814,0.0,2.0,49,2,6


In [12]:
params = {
    'C' : [0.01, 0.1, 0.5, 1.0, 1.5],
    'gamma' : ['auto', 'scale'],
}

In [17]:
search = GridSearchCV(
    SVC(kernel='rbf', max_iter=10000),
    param_grid=params,
    n_jobs=-1,
    verbose=50,
)

In [18]:
_ = search.fit(x_train_basic, y_train_basic)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 40 concurrent workers.




[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  5.2min
[Parallel(n_jobs=-1)]: Done   2 out of  30 | elapsed:  5.2min remaining: 72.6min
[Parallel(n_jobs=-1)]: Done   3 out of  30 | elapsed:  5.2min remaining: 47.2min
[Parallel(n_jobs=-1)]: Done   4 out of  30 | elapsed:  5.4min remaining: 35.2min
[Parallel(n_jobs=-1)]: Done   5 out of  30 | elapsed:  5.4min remaining: 27.1min
[Parallel(n_jobs=-1)]: Done   6 out of  30 | elapsed:  5.5min remaining: 21.8min
[Parallel(n_jobs=-1)]: Done   7 out of  30 | elapsed:  5.5min remaining: 18.1min
[Parallel(n_jobs=-1)]: Done   8 out of  30 | elapsed:  5.5min remaining: 15.1min
[Parallel(n_jobs=-1)]: Done   9 out of  30 | elapsed:  5.6min remaining: 13.0min
[Parallel(n_jobs=-1)]: Done  10 out of  30 | elapsed:  5.6min remaining: 11.2min
[Parallel(n_jobs=-1)]: Done  11 out of  30 | elapsed:  5.6min remaining:  9.7min
[Parallel(n_jobs=-1)]: Done  12 out of  30 | elapsed:  5.7min remaining:  8.5min
[Parallel(n_jobs=-1)]: Done  13 out of  30 | el



In [19]:
print(search.best_score_)
print(search.best_params_)

0.6138000288050697
{'C': 0.5, 'gamma': 'auto'}


In [20]:
search_d2v = GridSearchCV(
    SVC(kernel='rbf', max_iter=10000),
    param_grid=params,
    n_jobs=-1,
    verbose=50,
)
_ = search_d2v.fit(x_train_base, y_train_base)
print(search_d2v.best_score_)
print(search_d2v.best_params_)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 40 concurrent workers.




[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done   2 out of  30 | elapsed:  5.2min remaining: 72.1min
[Parallel(n_jobs=-1)]: Done   3 out of  30 | elapsed:  5.2min remaining: 47.0min
[Parallel(n_jobs=-1)]: Done   4 out of  30 | elapsed:  5.2min remaining: 34.0min
[Parallel(n_jobs=-1)]: Done   5 out of  30 | elapsed:  5.3min remaining: 26.6min
[Parallel(n_jobs=-1)]: Done   6 out of  30 | elapsed:  5.4min remaining: 21.8min
[Parallel(n_jobs=-1)]: Done   7 out of  30 | elapsed:  5.5min remaining: 17.9min
[Parallel(n_jobs=-1)]: Done   8 out of  30 | elapsed:  5.5min remaining: 15.1min
[Parallel(n_jobs=-1)]: Done   9 out of  30 | elapsed:  5.5min remaining: 12.9min
[Parallel(n_jobs=-1)]: Done  10 out of  30 | elapsed:  5.5min remaining: 11.1min
[Parallel(n_jobs=-1)]: Done  11 out of  30 | elapsed:  5.6min remaining:  9.7min
[Parallel(n_jobs=-1)]: Done  12 out of  30 | elapsed:  5.6min remaining:  8.4min
[Parallel(n_jobs=-1)]: Done  13 out of  30 | el



In [7]:
params = {
    'C' : [0.25, 0.5, 0.75],
    'gamma' : ['auto'],
}

search_d2v = GridSearchCV(
    SVC(kernel='rbf', max_iter=10000),
    param_grid=params,
    n_jobs=-1,
    verbose=50,
)
_ = search_d2v.fit(x_train_d2v, y_train_d2v)
print(search_d2v.best_score_)
print(search_d2v.best_params_)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 40 concurrent workers.




[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  9.3min
[Parallel(n_jobs=-1)]: Done   2 out of   9 | elapsed:  9.3min remaining: 32.5min
[Parallel(n_jobs=-1)]: Done   3 out of   9 | elapsed:  9.5min remaining: 19.0min
[Parallel(n_jobs=-1)]: Done   4 out of   9 | elapsed:  9.6min remaining: 12.0min
[Parallel(n_jobs=-1)]: Done   5 out of   9 | elapsed:  9.6min remaining:  7.7min
[Parallel(n_jobs=-1)]: Done   6 out of   9 | elapsed: 10.1min remaining:  5.1min
[Parallel(n_jobs=-1)]: Done   7 out of   9 | elapsed: 10.5min remaining:  3.0min
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed: 11.2min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed: 11.2min finished
0.5515130662996688
{'C': 0.75, 'gamma': 'auto'}




In [8]:
print(len(x_train_basic))
print(len(y_train_basic))
print(y_train_basic.head())
x_train_basic.head()

249956
249956
0     True
1     True
2    False
3    False
4    False
Name: 1, dtype: bool


Unnamed: 0_level_0,main_category,category,duration,usd_goal_real,launched_month_deadline_month_0,launched_month_deadline_month_1,num_chars,num_unusual_words,num_weird_chars
Unnamed: 0.1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,12,19,-0.316333,-0.036515,4.0,5.0,56,1,2
2,8,136,2.025723,-0.03174,10.0,0.0,30,2,0
4,7,44,-0.272265,-0.032463,7.0,8.0,60,4,0
5,5,31,-0.400095,-0.034948,2.0,3.0,34,3,1
6,6,93,-0.326225,0.00814,0.0,2.0,49,2,6


In [27]:
def get_knn_basic_mapper():
    mapper = DataFrameMapper([
        (['launched_month_deadline_month_0'], OneHotEncoder()),
        (['launched_month_deadline_month_1'], OneHotEncoder()),
        (['num_chars'], MinMaxScaler()),
        (['num_unusual_words'], MinMaxScaler()),
        (['num_weird_chars'], MinMaxScaler()),
        (['main_category'], OneHotEncoder()),
        (['category'], OneHotEncoder()),
        (['duration'], None),
        (['usd_goal_real'], None)
    ], df_out=True)
    return mapper

In [28]:
basic_mapper = get_knn_basic_mapper()
x_train_basic_scaled = basic_mapper.fit_transform(x_train_basic)
x_valid_basic_scaled = basic_mapper.transform(x_valid_basic)
x_train_basic_scaled.head()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Unnamed: 0_level_0,launched_month_deadline_month_0_x0_0.0,launched_month_deadline_month_0_x0_1.0,launched_month_deadline_month_0_x0_2.0,launched_month_deadline_month_0_x0_3.0,launched_month_deadline_month_0_x0_4.0,launched_month_deadline_month_0_x0_5.0,launched_month_deadline_month_0_x0_6.0,launched_month_deadline_month_0_x0_7.0,launched_month_deadline_month_0_x0_8.0,launched_month_deadline_month_0_x0_9.0,...,category_x0_151.0,category_x0_152.0,category_x0_153.0,category_x0_154.0,category_x0_155.0,category_x0_156.0,category_x0_157.0,category_x0_158.0,duration,usd_goal_real
Unnamed: 0.1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.316333,-0.036515
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.025723,-0.03174
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.272265,-0.032463
5,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.400095,-0.034948
6,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.326225,0.00814


In [30]:
params = {
    'C' : [0.25, 0.5, 0.75],
}

search_basic_scaled = GridSearchCV(
    SVC(kernel='rbf', gamma='auto', max_iter=10000),
    param_grid=params,
    n_jobs=-1,
    verbose=50,
)
_ = search_basic_scaled.fit(x_train_basic_scaled, y_train_basic)
print(search_basic_scaled.best_score_)
print(search_basic_scaled.best_params_)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 40 concurrent workers.




[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed: 35.6min
[Parallel(n_jobs=-1)]: Done   2 out of   9 | elapsed: 35.7min remaining: 124.9min
[Parallel(n_jobs=-1)]: Done   3 out of   9 | elapsed: 35.8min remaining: 71.6min
[Parallel(n_jobs=-1)]: Done   4 out of   9 | elapsed: 39.2min remaining: 49.0min
[Parallel(n_jobs=-1)]: Done   5 out of   9 | elapsed: 40.0min remaining: 32.0min
[Parallel(n_jobs=-1)]: Done   6 out of   9 | elapsed: 40.3min remaining: 20.2min
[Parallel(n_jobs=-1)]: Done   7 out of   9 | elapsed: 40.8min remaining: 11.6min
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed: 43.8min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed: 43.8min finished
0.4113804029509194
{'C': 0.5}




Doesn't look great, but best so far is:

For Bas:
SVC w/ rbf kernel, C=.75, gamma=auto on base dataset. Let's continue with this thought.

For Doc2Vec, best current found is:
0.6121397365936405
{'C': 0.5, 'gamma': 'auto'

In [7]:
x_test_base = pd.read_csv("data/preprocess-base-x-test.csv", index_col=0)
y_test_base = pd.read_csv("data/preprocess-base-y-test.csv", squeeze=True)

x_test_d2v = pd.read_csv("data/preprocess-doc2vec-x-test.csv", index_col=0)
y_test_d2v = pd.read_csv("data/preprocess-doc2vec-y-test.csv", squeeze=True)

In [9]:
svc = SVC(kernel='rbf', C=.75, gamma='auto', max_iter=75000)

In [10]:
%%time
svc.fit(x_train_base, y_train_base)

CPU times: user 36min 43s, sys: 2.35 s, total: 36min 45s
Wall time: 36min 42s




SVC(C=0.75, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=75000, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [11]:
base_score_test = svc.score(x_test_base, y_test_base)
print(base_score_test)

0.6531885536024956


In [12]:
%%time
svc2 = SVC(kernel='rbf', C=.75, gamma='auto', max_iter=75000)
svc2.fit(x_train_d2v, y_train_d2v)

CPU times: user 1h 4min 39s, sys: 3.48 s, total: 1h 4min 43s
Wall time: 1h 4min 39s




SVC(C=0.75, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=75000, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [13]:
d2v_score_test = svc2.score(x_test_d2v, y_test_d2v)
print(d2v_score_test)

0.6459295698602195
