In [76]:
import graphlab
import numpy as np
import pandas as pd
import re
import datetime
from graphlab import model_parameter_search
import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 15, 6
graphlab.canvas.set_target('ipynb')

#  FEATURE ENGINEERING TRAIN DATA

In [5]:
train_data_prime = pd.read_csv('train.csv')

In [6]:
unix_cols = ['deadline','state_changed_at','launched_at','created_at']

for x in unix_cols:
    train_data_prime[x] = train_data_prime[x].apply(lambda k: datetime.date.fromtimestamp(int(k)).strftime('%Y-%m-%d'))

In [7]:
train_data_prime.to_csv('train_data_TS.csv',
             float_format=None,
             header=True,
             index=False,
             index_label=None,
             mode='w',
             encoding = 'utf-8',
             line_terminator='\n',
             tupleize_cols=False,
             date_format='%Y%m%d',
             decimal = '.'
            )

In [8]:
TS_set = ['deadline', 'state_changed_at', 'created_at', 'launched_at']
dateparse = lambda dates: pd.datetime.strptime(dates, '%Y-%m-%d')
TS_train_data = pd.read_csv('train_data_TS.csv', parse_dates= TS_set, date_parser=dateparse)

In [9]:
TS_train_data.head()

Unnamed: 0,project_id,name,desc,goal,keywords,disable_communication,country,currency,deadline,state_changed_at,created_at,launched_at,backers_count,final_status
0,kkst1451568084,drawing for dollars,I like drawing pictures. and then i color them...,20.0,drawing-for-dollars,False,US,USD,2009-05-03,2009-05-03,2009-04-24,2009-04-24,3,1
1,kkst1474482071,Sponsor Dereck Blackburn (Lostwars) Artist in ...,"I, Dereck Blackburn will be taking upon an inc...",300.0,sponsor-dereck-blackburn-lostwars-artist-in-re...,False,US,USD,2009-05-15,2009-05-15,2009-04-28,2009-04-28,2,0
2,kkst183622197,Mr. Squiggles,So I saw darkpony's successfully funded drawin...,30.0,mr-squiggles,False,US,USD,2009-05-22,2009-05-22,2009-05-12,2009-05-12,0,0
3,kkst597742710,Help me write my second novel.,Do your part to help out starving artists and ...,500.0,help-me-write-my-second-novel,False,US,USD,2009-05-28,2009-05-28,2009-04-28,2009-04-28,18,1
4,kkst1913131122,Support casting my sculpture in bronze,"I'm nearing completion on a sculpture, current...",2000.0,support-casting-my-sculpture-in-bronze,False,US,USD,2009-05-31,2009-05-31,2009-05-01,2009-05-01,1,0


In [10]:
TS_train_data['keywords'] = TS_train_data['keywords'].replace(['-'], [' '], regex=True)
TS_train_data['name'] = TS_train_data['keywords']
TS_train_data['desc'] = TS_train_data['desc'].str.replace(":"," ").astype('str') 
TS_train_data['desc'] = TS_train_data['desc'].str.replace('"',' ').astype('str')
TS_train_data['desc'] = TS_train_data['desc'].str.replace('-',' ').astype('str')
TS_train_data['desc'] = TS_train_data['desc'].replace(['Ã','¶', 'Ã', 'Â', '©', '¥', '¤', '\s'], ['a','', 'a', 'a', '', 'y', '', ' '], regex=True)
TS_train_data['name'] = TS_train_data['name'].replace(['-'], [' '], regex=True)
TS_train_data['desc'] = TS_train_data['desc'].map(lambda x: x.strip())

TS_train_data['deadline-created_at'] = (TS_train_data.deadline - TS_train_data.created_at)
TS_train_data['state_changed_at-created_at'] = TS_train_data.deadline - TS_train_data.created_at
TS_train_data['state_changed_at-deadline'] = TS_train_data.state_changed_at - TS_train_data.deadline
TS_train_data['deadline-launched_at'] = TS_train_data.deadline - TS_train_data.launched_at
TS_train_data['state_changed_at-launched_at'] = TS_train_data.state_changed_at - TS_train_data.launched_at

normal_goal = []
for x, y in zip(TS_train_data.currency.tolist(), TS_train_data.goal.tolist()):
    if x == 'USD':
        normal_goal += [y]
    if x == 'GBP':
        normal_goal += [1.5 * y]
    if x == 'EUR':
        normal_goal += [1.2 * y]
    if x == 'CAD':
        normal_goal += [0.85 * y]
    if x == 'AUD':
        normal_goal += [0.85 * y]
    if x == 'SEK':
        normal_goal += [0.14 * y]
    if x == 'NZD':
        normal_goal += [0.70 * y]
    if x == 'DKK':
        normal_goal += [0.17 * y]
    if x == 'NOK':
        normal_goal += [0.15 * y]
    if x == 'CHF':
        normal_goal += [y]
    if x == 'MXN':
        normal_goal += [0.07 * y]
    if x == 'SGD':
        normal_goal += [0.73 * y]
    if x == 'HKD':
        normal_goal += [0.13 * y]

TS_train_data['normal_goal'] = normal_goal

In [11]:
TS_train_data.to_csv('train_data_FE1.csv',
             float_format=None,
             header=True,
             index=False,
             index_label=None,
             mode='w',
             encoding = 'utf-8',
             line_terminator='\n',
             tupleize_cols=False,
             date_format='%Y%m%d',
             decimal = '.'
            )

In [23]:
kickstarter = graphlab.SFrame('train_data_FE1.csv')

------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,float,str,str,str,str,long,long,long,long,long,long,str,str,str,str,str,float]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


In [25]:
#to understand which type of product is being funded

remove = ['\r', '\x0b', '\n', '\x0c', '\t', ' ', '.', "'", '&', '-', '$', '!']
#create a cluster of words using desc
kickstarter['word_count_trim'] = graphlab.text_analytics.trim_rare_words(kickstarter['desc'], to_lower=True, delimiters=remove)
kickstarter['word_count'] = graphlab.text_analytics.count_words(kickstarter['word_count_trim'])
kickstarter['stop_words_trim'] = kickstarter['word_count'].dict_trim_by_keys(graphlab.text_analytics.stopwords(), True) 

#tfidf still doesn't get rid of simple words like and, the in and stuff, doesn't still give a better/true rare words
tfidf = graphlab.text_analytics.tf_idf(kickstarter['stop_words_trim'])

kickstarter['tfidf'] = tfidf

In [26]:
kickstarter.head()

project_id,name,desc,goal,keywords
kkst1451568084,drawing for dollars,I like drawing pictures. and then i color them ...,20.0,drawing for dollars
kkst1474482071,sponsor dereck blackburn lostwars artist in ...,"I, Dereck Blackburn will be taking upon an ...",300.0,sponsor dereck blackburn lostwars artist in ...
kkst183622197,mr squiggles,So I saw darkpony's successfully funded ...,30.0,mr squiggles
kkst597742710,help me write my second novel ...,Do your part to help out starving artists and ...,500.0,help me write my second novel ...
kkst1913131122,support casting my sculpture in bronze ...,"I'm nearing completion on a sculpture, currently ...",2000.0,support casting my sculpture in bronze ...
kkst1085176748,daily digest,I'm a fledgling videoblogger living in ...,700.0,daily digest
kkst1468954715,igoozex free iphone app,I am an independent iPhone developer that ...,250.0,igoozex free iphone app
kkst194050612,drive a faster car 20,Drive A Faster Car (http //www.driveafastercar ...,1000.0,drive a faster car 20
kkst708883590,lostles at tinys giant,"Opening Friday, June 5 2009, on view through ...",5000.0,lostles at tinys giant
kkst890976740,choose your own adventure a robot painting series ...,This project is for a Choose Your Own Adven ...,3500.0,choose your own adventure a robot painting series ...

disable_communication,country,currency,deadline,state_changed_at,created_at,launched_at,backers_count
False,US,USD,20090503,20090503,20090424,20090424,3
False,US,USD,20090515,20090515,20090428,20090428,2
False,US,USD,20090522,20090522,20090512,20090512,0
False,US,USD,20090528,20090528,20090428,20090428,18
False,US,USD,20090531,20090531,20090501,20090501,1
False,US,USD,20090531,20090531,20090429,20090504,14
False,US,USD,20090601,20090601,20090507,20090507,2
False,US,USD,20090603,20090603,20090504,20090504,32
False,US,USD,20090606,20090606,20090504,20090504,44
False,US,USD,20090613,20090613,20090513,20090513,18

final_status,deadline-created_at,state_changed_at- created_at ...,state_changed_at-deadline
1,9 days 00:00:00.000000000,9 days 00:00:00.000000000,0 days 00:00:00.000000000
0,17 days 00:00:00.000000000 ...,17 days 00:00:00.000000000 ...,0 days 00:00:00.000000000
0,10 days 00:00:00.000000000 ...,10 days 00:00:00.000000000 ...,0 days 00:00:00.000000000
1,30 days 00:00:00.000000000 ...,30 days 00:00:00.000000000 ...,0 days 00:00:00.000000000
0,30 days 00:00:00.000000000 ...,30 days 00:00:00.000000000 ...,0 days 00:00:00.000000000
0,32 days 00:00:00.000000000 ...,32 days 00:00:00.000000000 ...,0 days 00:00:00.000000000
0,25 days 00:00:00.000000000 ...,25 days 00:00:00.000000000 ...,0 days 00:00:00.000000000
1,30 days 00:00:00.000000000 ...,30 days 00:00:00.000000000 ...,0 days 00:00:00.000000000
0,33 days 00:00:00.000000000 ...,33 days 00:00:00.000000000 ...,0 days 00:00:00.000000000
0,31 days 00:00:00.000000000 ...,31 days 00:00:00.000000000 ...,0 days 00:00:00.000000000

deadline-launched_at,state_changed_at- launched_at ...,normal_goal,word_count_trim
9 days 00:00:00.000000000,9 days 00:00:00.000000000,20.0,i like drawing pictures and then i color them ...
17 days 00:00:00.000000000 ...,17 days 00:00:00.000000000 ...,300.0,"i, will be taking upon an incredible journey in ..."
10 days 00:00:00.000000000 ...,10 days 00:00:00.000000000 ...,30.0,so i saw s successfully funded drawing for ...
30 days 00:00:00.000000000 ...,30 days 00:00:00.000000000 ...,500.0,do your part to help out starving artists and ...
30 days 00:00:00.000000000 ...,30 days 00:00:00.000000000 ...,2000.0,"i m nearing completion on a sculpture, currently ..."
27 days 00:00:00.000000000 ...,27 days 00:00:00.000000000 ...,700.0,"i m a fledgling living in brooklyn, ny the webcast ..."
25 days 00:00:00.000000000 ...,25 days 00:00:00.000000000 ...,250.0,i am an independent iphone developer that ...
30 days 00:00:00.000000000 ...,30 days 00:00:00.000000000 ...,1000.0,drive a faster car (http //www com) has been a ...
33 days 00:00:00.000000000 ...,33 days 00:00:00.000000000 ...,5000.0,"opening friday, june 5 2009, on view through ..."
31 days 00:00:00.000000000 ...,31 days 00:00:00.000000000 ...,3500.0,this project is for a choose your own adven ...

word_count,stop_words_trim,tfidf
"{'wants': 1L, 'me': 1L, 'draw': 1L, 'someone': ...","{'draw': 1L, 'color': 1L, 'suggest': 1L, 'thoug ...","{'draw': 6.3707244126912395, ..."
"{'be': 2L, 'in': 1L, 'to': 1L, 'incredible': ...","{'incredible': 1L, 'hometown': 1L, 'taki ...","{'incredible': 6.045902793290002, ..."
"{'and': 1L, 'project': 1L, 'guy,': 1L, ...","{'project': 1L, 'guy,': 1L, 'dollars': 1L, ...","{'project': 3.2161420939341983, ..."
"{'and': 2L, 'do': 1L, 'starving': 1L, 'novel': ...","{'starving': 1L, 'writing': 1L, 'write': ...","{'starving': 8.064719713153403, ..."
"{'and': 1L, 'on': 1L, '13': 1L, 'titled': 1L, ...","{'completion': 1L, '13': 1L, 'titled': 1L, ...","{'completion': 6.854881789375069, '13': ..."
"{'a': 1L, 'living': 1L, 'http': 1L, 'which': 1L, ...","{'living': 1L, 'http': 1L, 'brooklyn,': 1L, ...","{'living': 4.895281320711073, ..."
"{'independent': 1L, 'software': 1L, ...","{'independent': 1L, 'recently': 1L, 'users': ...","{'independent': 5.115647521065474, ..."
"{'and': 1L, 'project': 1L, 'love': 1L, 'fast ...","{'project': 1L, 'love': 1L, 'faster': 1L, ...","{'project': 3.2161420939341983, ..."
"{'on': 1L, 'giant': 1L, 'sandwich': 1L, 'shop,': ...","{'september,': 1L, 'giant': 1L, 'sandwich': ...","{'september,': 9.193184964971193, ..."
"{'and': 1L, 'own': 1L, 'pledge': 1L, 'series': ...","{'pledge': 1L, 'series': 1L, '35': 1L, 'project': ...","{'pledge': 6.0189262055918, ..."


In [75]:
kickstarter['final_status'].show(view = 'Categorical')

Canvas is accessible via web browser at the URL: http://localhost:51249/index.html
Opening Canvas in default web browser.


In [27]:
kickstarter["funded"]  = kickstarter["final_status"].apply(lambda click:+1 if click ==1 else -1)

# Here we will undersample the larger class(no clicks)

#find the ratio of sizes and use that percentage to undersample no clicks
funded_raw = kickstarter[kickstarter["final_status"]==+1]
not_funded_raw = kickstarter[kickstarter["final_status"]==0]
percentage = len(funded_raw)/float(len(not_funded_raw))
funded = funded_raw
not_funded = not_funded_raw.sample(percentage)

kickstarter = funded.append(not_funded)

In [20]:
kickstarter['funded'].show(view = 'Categorical')

Canvas is updated and available in a tab in the default browser.


In [28]:
train_data, validation_data = kickstarter.random_split(.8)

selected_features = ['tfidf', 'normal_goal', 'country', 'desc', 'deadline-created_at', 'state_changed_at-created_at',
                     'state_changed_at-deadline', 'deadline-launched_at', 'state_changed_at-launched_at']

# TRAINING

In [32]:
#logistic classification model

logistic_classification_model = graphlab.logistic_classifier.create(train_data,
                                                                      target = 'funded',
                                                                      features = selected_features,
                                                                      max_iterations = 10,
                                                                   verbose = False)
prediction = logistic_classification_model.classify(validation_data)
results = logistic_classification_model.evaluate(validation_data)

print "Accuracy         : %s" % results['accuracy']
print "Confusion Matrix : \n%s" % results['confusion_matrix']

Accuracy         : 0.614005123826
Confusion Matrix : 
+--------------+-----------------+-------+
| target_label | predicted_label | count |
+--------------+-----------------+-------+
|      1       |        1        |  3688 |
|      1       |        -1       |  3337 |
|      -1      |        1        |  2087 |
|      -1      |        -1       |  4940 |
+--------------+-----------------+-------+
[4 rows x 3 columns]



In [33]:
#svm classification model

svm_classification_model = graphlab.svm_classifier.create(train_data,
                                                          target = 'funded',
                                                          features = selected_features,
                                                          max_iterations = 10,
                                                         verbose = False)
prediction = svm_classification_model.classify(validation_data)
results = svm_classification_model.evaluate(validation_data)

print "Accuracy         : %s" % results['accuracy']
print "Confusion Matrix : \n%s" % results['confusion_matrix']

Accuracy         : 0.621975519499
Confusion Matrix : 
+--------------+-----------------+-------+
| target_label | predicted_label | count |
+--------------+-----------------+-------+
|      1       |        1        |  4406 |
|      1       |        -1       |  2619 |
|      -1      |        -1       |  4334 |
|      -1      |        1        |  2693 |
+--------------+-----------------+-------+
[4 rows x 3 columns]



In [34]:
#gradient boosted tree classification model

gb_classification_model = graphlab.boosted_trees_classifier.create(train_data,
                                                                      target = 'funded',
                                                                      features = selected_features,
                                                                      max_iterations = 15,
                                                                  verbose = False)
prediction = gb_classification_model.classify(validation_data)
results = gb_classification_model.evaluate(validation_data)

print "Accuracy         : %s" % results['accuracy']
print "Confusion Matrix : \n%s" % results['confusion_matrix']

Accuracy         : 0.638272132081
Confusion Matrix : 
+--------------+-----------------+-------+
| target_label | predicted_label | count |
+--------------+-----------------+-------+
|      1       |        1        |  4965 |
|      1       |        -1       |  2060 |
|      -1      |        1        |  3023 |
|      -1      |        -1       |  4004 |
+--------------+-----------------+-------+
[4 rows x 3 columns]



In [35]:
#parameter search with logistic classifier
params = {'target': 'funded'}
logistic_parameter_search = model_parameter_search.create((train_data, validation_data),graphlab.logistic_classifier.create, params,
                                                         max_models= 15)

[INFO] graphlab.deploy.job: Validating job.
[INFO] graphlab.deploy.job: Creating a LocalAsync environment called 'async'.
[INFO] graphlab.deploy.map_job: Validation complete. Job: 'Model-Parameter-Search-Aug-28-2017-05-30-1400000' ready for execution
[INFO] graphlab.deploy.map_job: Job: 'Model-Parameter-Search-Aug-28-2017-05-30-1400000' scheduled.
[INFO] graphlab.deploy.job: Validating job.
[INFO] graphlab.deploy.map_job: A job with name 'Model-Parameter-Search-Aug-28-2017-05-30-1400000' already exists. Renaming the job to 'Model-Parameter-Search-Aug-28-2017-05-30-1400000-3de8a'.
[INFO] graphlab.deploy.map_job: Validation complete. Job: 'Model-Parameter-Search-Aug-28-2017-05-30-1400000-3de8a' ready for execution
[INFO] graphlab.deploy.map_job: Job: 'Model-Parameter-Search-Aug-28-2017-05-30-1400000-3de8a' scheduled.
[INFO] graphlab.deploy.job: Validating job.
[INFO] graphlab.deploy.map_job: Validation complete. Job: 'Model-Parameter-Search-Aug-28-2017-05-30-1400001' ready for execution


In [36]:
logistic_parameter_search.get_results()

model_id,l1_penalty,l2_penalty,target,training_accuracy,validation_accuracy
9,0.01,0.001,funded,1.0,0.709294050669
8,100.0,0.001,funded,0.999981939679,0.894249928836
1,100.0,0.001,funded,0.999981939679,0.894249928836
0,0.01,0.0001,funded,1.0,0.709294050669
3,0.001,0.01,funded,1.0,0.709365214916
2,0.0,0.01,funded,1.0,0.713492741247
5,0.01,0.01,funded,1.0,0.709294050669
4,0.0,0.1,funded,1.0,0.713563905494
7,1.0,100.0,funded,1.0,0.714062055223
6,0.001,10.0,funded,1.0,0.709365214916


In [41]:
print logistic_parameter_search.get_models()[8]
print logistic_parameter_search.get_models()[7]

Class                          : LogisticClassifier

Schema
------
Number of coefficients         : 386821
Number of examples             : 55370
Number of classes              : 2
Number of feature columns      : 24
Number of unpacked features    : 108560

Hyperparameters
---------------
L1 penalty                     : 100.0
L2 penalty                     : 0.001

Training Summary
----------------
Solver                         : fista
Solver iterations              : 10
Solver status                  : TERMINATED: Iteration limit reached.
Training time (sec)            : 4.7322

Settings
--------
Log-likelihood                 : 17216.6759

Highest Positive Coefficients
-----------------------------
final_status                   : 1.1121
project_id[kkst998212169]      : 0.4932
name[sewing machine for my two pickles] : 0.4932
desc[Two Pickles is my dream business  unique boys clothing and accessories... please help start my business with a basic sewing machine!  )] : 0.4932
keywords

In [42]:
#parameter search with svm classifier
params2 = {'target': 'funded'}
svm_parameter_search = model_parameter_search.create((train_data, validation_data),graphlab.svm_classifier.create, params2,
                                                     max_models= 15)

[INFO] graphlab.deploy.job: Validating job.
[INFO] graphlab.deploy.map_job: Validation complete. Job: 'Model-Parameter-Search-Aug-28-2017-05-35-4700000' ready for execution
[INFO] graphlab.deploy.map_job: Job: 'Model-Parameter-Search-Aug-28-2017-05-35-4700000' scheduled.
[INFO] graphlab.deploy.job: Validating job.
[INFO] graphlab.deploy.map_job: A job with name 'Model-Parameter-Search-Aug-28-2017-05-35-4700000' already exists. Renaming the job to 'Model-Parameter-Search-Aug-28-2017-05-35-4700000-535ec'.
[INFO] graphlab.deploy.map_job: Validation complete. Job: 'Model-Parameter-Search-Aug-28-2017-05-35-4700000-535ec' ready for execution
[INFO] graphlab.deploy.map_job: Job: 'Model-Parameter-Search-Aug-28-2017-05-35-4700000-535ec' scheduled.
[INFO] graphlab.deploy.job: Validating job.
[INFO] graphlab.deploy.map_job: Validation complete. Job: 'Model-Parameter-Search-Aug-28-2017-05-35-4700001' ready for execution
[INFO] graphlab.deploy.map_job: Job: 'Model-Parameter-Search-Aug-28-2017-05-35

In [43]:
svm_parameter_search.get_results()

model_id,penalty,target,training_accuracy,validation_accuracy
9,0.01,funded,0.999909698393,0.686806148591
8,0.1,funded,0.99841069171,0.685027042414
1,10.0,funded,0.563572331588,0.491816111586
0,0.1,funded,0.99841069171,0.685027042414
3,1.0,funded,0.523189452772,0.492812411045
2,0.001,funded,0.999945819036,0.686023341873
5,0.001,funded,0.999945819036,0.686023341873
4,0.01,funded,0.999909698393,0.686806148591
7,1.0,funded,0.523189452772,0.492812411045
6,10.0,funded,0.563572331588,0.491816111586


In [44]:
params3 = {'target': 'funded'}
bt_parameter_search = model_parameter_search.create((train_data, validation_data),graphlab.boosted_trees_classifier.create, params3)

[INFO] graphlab.deploy.job: Validating job.
[INFO] graphlab.deploy.map_job: Validation complete. Job: 'Model-Parameter-Search-Aug-28-2017-05-37-4100000' ready for execution
[INFO] graphlab.deploy.map_job: Job: 'Model-Parameter-Search-Aug-28-2017-05-37-4100000' scheduled.
[INFO] graphlab.deploy.job: Validating job.
[INFO] graphlab.deploy.map_job: A job with name 'Model-Parameter-Search-Aug-28-2017-05-37-4100000' already exists. Renaming the job to 'Model-Parameter-Search-Aug-28-2017-05-37-4100000-e9725'.
[INFO] graphlab.deploy.map_job: Validation complete. Job: 'Model-Parameter-Search-Aug-28-2017-05-37-4100000-e9725' ready for execution
[INFO] graphlab.deploy.map_job: Job: 'Model-Parameter-Search-Aug-28-2017-05-37-4100000-e9725' scheduled.


In [45]:
bt_parameter_search.get_results()

model_id,column_subsample,early_stopping_rounds,max_depth,max_iterations,min_child_weight,min_loss_reduction
9,1.0,5,6,100,1,10
8,1.0,5,8,100,2,10
1,0.8,5,4,100,1,10
0,0.9,5,10,100,1,1
3,0.9,5,10,100,4,10
2,0.9,5,4,100,1,1
5,0.9,5,6,100,16,1
4,0.8,5,6,100,16,10
7,0.9,5,8,100,4,10
6,1.0,5,8,100,8,0

row_subsample,step_size,target,training_accuracy,validation_accuracy
0.9,0.0001,funded,1.0,1.0
0.9,0.5,funded,1.0,1.0
0.9,0.1,funded,1.0,1.0
0.9,1e-05,funded,1.0,1.0
0.9,1e-05,funded,1.0,1.0
0.9,0.25,funded,1.0,1.0
0.9,0.1,funded,1.0,1.0
0.9,0.1,funded,1.0,1.0
0.9,0.5,funded,1.0,1.0
1.0,0.25,funded,1.0,1.0


In [46]:
print bt_parameter_search.get_models()[9]
print bt_parameter_search.get_models()[6]

Class                          : BoostedTreesClassifier

Schema
------
Number of examples             : 55370
Number of feature columns      : 24
Number of unpacked features    : 108560
Number of classes              : 2

Settings
--------
Number of trees                : 100
Max tree depth                 : 6
Training time (sec)            : 7.5351
Training accuracy              : 1.0
Validation accuracy            : 1.0
Training log_loss              : 0.6832
Validation log_loss            : 0.6832

Class                          : BoostedTreesClassifier

Schema
------
Number of examples             : 55370
Number of feature columns      : 24
Number of unpacked features    : 108560
Number of classes              : 2

Settings
--------
Number of trees                : 30
Max tree depth                 : 8
Training time (sec)            : 2.4214
Training accuracy              : 1.0
Validation accuracy            : 1.0
Training log_loss              : 0.0003
Validation log_loss         

In [51]:
#tuned boosted tree model

tuned_bt_model_1 = graphlab.boosted_trees_regression.create(train_data,
                                                          target = 'funded',
                                                          features=selected_features,
                                                          max_iterations=100,
                                                          max_depth=6,
                                                          step_size=0.0001,
                                                          min_loss_reduction=10,
                                                          min_child_weight=1,
                                                          row_subsample=0.9,
                                                          column_subsample=1.0,
                                                          verbose=False)
#what does max error mean??
results_bt_model_1 = tuned_bt_model_1.evaluate(validation_data)
print results_bt_model_1

{'max_error': 1.5007004141807556, 'rmse': 1.115018747876845}


In [49]:
tuned_bt_model_2 = graphlab.boosted_trees_regression.create(train_data,
                                                          target = 'funded',
                                                          features=selected_features,
                                                          max_iterations=30,
                                                          max_depth=8,
                                                          step_size=0.25,
                                                          min_loss_reduction=0,
                                                          min_child_weight=8,
                                                          row_subsample=1.0,
                                                          column_subsample=1.0,
                                                          verbose=False)
#what does max error mean??
results_bt_model_2 = tuned_bt_model_2.evaluate(validation_data)
print results_bt_model_2

{'max_error': 1.9140050411224365, 'rmse': 0.9213641278671888}


In [53]:
#tuned logistic classification model

tuned_lc_model_1 = graphlab.logistic_classifier.create(train_data,
                                                     target = 'funded',
                                                     features=selected_features,
                                                     l2_penalty=0.001,
                                                     l1_penalty=100.0,
                                                     solver='fista',
                                                     feature_rescaling=True,
                                                       verbose=False,
                                                     max_iterations=10
                                                    )

prediction_lc_1 = tuned_lc_model_1.classify(validation_data)
results_lc_1 = tuned_lc_model_1.evaluate(validation_data)

print "Accuracy         : %s" % results_lc_1['accuracy']
print "Confusion Matrix : \n%s" % results_lc_1['confusion_matrix']

Accuracy         : 0.652362653003
Confusion Matrix : 
+--------------+-----------------+-------+
| target_label | predicted_label | count |
+--------------+-----------------+-------+
|      1       |        1        |  4496 |
|      1       |        -1       |  2529 |
|      -1      |        1        |  2356 |
|      -1      |        -1       |  4671 |
+--------------+-----------------+-------+
[4 rows x 3 columns]



In [54]:
tuned_lc_model_2 = graphlab.logistic_classifier.create(train_data,
                                                     target = 'funded',
                                                     features=selected_features,
                                                     l2_penalty=100,
                                                     l1_penalty=1.0,
                                                     solver='fista',
                                                     feature_rescaling=True,
                                                       verbose=False,
                                                     max_iterations=10
                                                    )

prediction_lc_2 = tuned_lc_model_2.classify(validation_data)
results_lc_2 = tuned_lc_model_2.evaluate(validation_data)

print "Accuracy         : %s" % results_lc_2['accuracy']
print "Confusion Matrix : \n%s" % results_lc_2['confusion_matrix']

Accuracy         : 0.62781098776
Confusion Matrix : 
+--------------+-----------------+-------+
| target_label | predicted_label | count |
+--------------+-----------------+-------+
|      1       |        1        |  4344 |
|      1       |        -1       |  2681 |
|      -1      |        1        |  2549 |
|      -1      |        -1       |  4478 |
+--------------+-----------------+-------+
[4 rows x 3 columns]



# FEATURE ENGINEERING TEST DATA

In [55]:
test_data_prime = pd.read_csv('test.csv')

In [56]:
unix_cols = ['deadline','state_changed_at','launched_at','created_at']

for x in unix_cols:
    test_data_prime[x] = test_data_prime[x].apply(lambda k: datetime.date.fromtimestamp(int(k)).strftime('%Y-%m-%d'))

test_data_prime.to_csv('test_data_TS.csv',
             float_format=None,
             header=True,
             index=False,
             index_label=None,
             mode='w',
             encoding = 'utf-8',
             line_terminator='\n',
             tupleize_cols=False,
             date_format='%Y%m%d',
             decimal = '.'
            )

In [57]:
TS_set = ['deadline', 'state_changed_at', 'created_at', 'launched_at']
dateparse = lambda dates: pd.datetime.strptime(dates, '%Y-%m-%d')
TS_test_data = pd.read_csv('test_data_TS.csv', parse_dates= TS_set, date_parser=dateparse)

In [58]:
TS_test_data['keywords'] = TS_test_data['keywords'].replace(['-'], [' '], regex=True)
TS_test_data['name'] = TS_test_data['keywords']
TS_test_data['desc'] = TS_test_data['desc'].str.replace(":"," ").astype('str') 
TS_test_data['desc'] = TS_test_data['desc'].str.replace('"',' ').astype('str')
TS_test_data['desc'] = TS_test_data['desc'].str.replace('-',' ').astype('str')
TS_test_data['desc'] = TS_test_data['desc'].replace(['Ã','¶', 'Ã', 'Â', '©', '¥', '¤', '\s'], ['a','', 'a', 'a', '', 'y', '', ' '], regex=True)
TS_test_data['name'] = TS_test_data['name'].replace(['-'], [' '], regex=True)
TS_test_data['desc'] = TS_test_data['desc'].map(lambda x: x.strip())

TS_test_data['deadline-created_at'] = (TS_test_data.deadline - TS_test_data.created_at)
TS_test_data['state_changed_at-created_at'] = TS_test_data.deadline - TS_test_data.created_at
TS_test_data['state_changed_at-deadline'] = TS_test_data.state_changed_at - TS_test_data.deadline
TS_test_data['deadline-launched_at'] = TS_test_data.deadline - TS_test_data.launched_at
TS_test_data['state_changed_at-launched_at'] = TS_test_data.state_changed_at - TS_test_data.launched_at

normal_goal = []
for x, y in zip(TS_test_data.currency.tolist(), TS_test_data.goal.tolist()):
    if x == 'USD':
        normal_goal += [y]
    if x == 'GBP':
        normal_goal += [1.5 * y]
    if x == 'EUR':
        normal_goal += [1.2 * y]
    if x == 'CAD':
        normal_goal += [0.85 * y]
    if x == 'AUD':
        normal_goal += [0.85 * y]
    if x == 'SEK':
        normal_goal += [0.14 * y]
    if x == 'NZD':
        normal_goal += [0.70 * y]
    if x == 'DKK':
        normal_goal += [0.17 * y]
    if x == 'NOK':
        normal_goal += [0.15 * y]
    if x == 'CHF':
        normal_goal += [y]
    if x == 'MXN':
        normal_goal += [0.07 * y]
    if x == 'SGD':
        normal_goal += [0.73 * y]
    if x == 'HKD':
        normal_goal += [0.13 * y]

TS_test_data.to_csv('test_data_FE1.csv',
             float_format=None,
             header=True,
             index=False,
             index_label=None,
             mode='w',
             encoding = 'utf-8',
             line_terminator='\n',
             tupleize_cols=False,
             date_format='%Y%m%d',
             decimal = '.'
            )

# TESTING

In [59]:
data = graphlab.SFrame('test_data_FE1.csv')

------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,float,str,str,str,str,long,long,long,long,str,str,str,str,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


In [60]:
#create a cluster of words using desc

data['word_count_trim'] = graphlab.text_analytics.trim_rare_words(data['desc'], to_lower=True, delimiters=remove)
data['word_count'] = graphlab.text_analytics.count_words(data['word_count_trim'])
data['stop_words_trim'] = data['word_count'].dict_trim_by_keys(graphlab.text_analytics.stopwords(), True) 

#tfidf still doesn't get rid of simple words like and, the in and stuff, doesn't still give a better/true rare words
tfidf = graphlab.text_analytics.tf_idf(data['stop_words_trim'])

data['tfidf'] = tfidf

# PREDICT

In [65]:
#using the test data to make predictions

probability_predictions_Tuned_LC1 = tuned_lc_model_1.predict(data)

In [67]:
probability_predictions_Tuned_BT2 = tuned_bt_model_2.predict(data)

In [62]:
data.head(2)

project_id,name,desc,goal,keywords,disable_communication
kkst917493670,brathair,"My first film, of many to come. Trying to pursu ...",7000.0,brathair,False
kkst1664901914,the screenwriter,A young man that has earned his master's in ...,35000.0,the screenwriter,False

country,currency,deadline,state_changed_at,created_at,launched_at,deadline-created_at
US,USD,20151208,20151208,20151027,20151029,42 days 00:00:00.000000000 ...
US,USD,20160121,20160121,20151216,20151217,36 days 00:00:00.000000000 ...

state_changed_at- created_at ...,state_changed_at-deadline,deadline-launched_at,state_changed_at- launched_at ...
42 days 00:00:00.000000000 ...,0 days 00:00:00.000000000,40 days 00:00:00.000000000 ...,40 days 00:00:00.000000000 ...
36 days 00:00:00.000000000 ...,0 days 00:00:00.000000000,35 days 00:00:00.000000000 ...,35 days 00:00:00.000000000 ...

word_count_trim,word_count,stop_words_trim,tfidf
"my first film, of many to come trying to pursue my ...","{'come': 1L, 'full': 1L, 'film,': 1L, 'pursue': ...","{'full': 1L, 'film,': 1L, 'pursue': 1L, ...","{'full': 4.199678817030913, ..."
a young man that has earned his master s in ...,"{'blind': 1L, 'a': 3L, 'his': 1L, 'deal': 1L, ...","{'blind': 1L, 'deal': 1L, 'contest': 1L, 'movie': ...","{'blind': 7.015192583987727, ..."


# EXPORT PREDICTION FILE

In [66]:
#to export the file

sub = pd.DataFrame()
sub['project_id'] = data['project_id']
sub['final_status'] = probability_predictions_Tuned_LC1
sub['final_status'] = [1 if x > 0.5 else 0 for x in sub['final_status']]
sub.to_csv("prediction_file_lc_model_1.csv",index=False)

In [68]:
sub2 = pd.DataFrame()
sub2['project_id'] = data['project_id']
sub2['final_status'] = probability_predictions_Tuned_BT2
sub2['final_status'] = [1 if x > 0.5 else 0 for x in sub['final_status']]
sub2.to_csv("prediction_file_bt_model_2.csv",index=False)

In [73]:
check2 = graphlab.SFrame('prediction_file_lc_model_1.csv')
check2['final_status'].show(view = 'Categorical')

------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,long]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------
