In [1]:
#will a project get funded
import graphlab

graphlab.canvas.set_target('ipynb')

In [4]:
#feature engineering training data
import re
import datetime
import numpy as np

remove = ['\r', '\x0b', '\x0c', '\t', ' ', '.', "'", '&', '-', '$', '!', ':']

kickstarter = graphlab.SFrame.read_csv('train.csv',
                                       delimiter=remove,
                                       column_type_hints=[str,str,str,float,str,str,str,str,int,int,int,int,int,int]
                                      )

unix_cols = ['deadline','state_changed_at','launched_at','created_at']

for x in unix_cols:
    kickstarter[x] = kickstarter[x].apply(lambda k: datetime.datetime.fromtimestamp(int(k)).strftime('%Y-%m-%d %H:%M:%S'))

In [5]:
kickstarter.head(5)

project_id,name,desc,goal,keywords,disable_communication
kkst1451568084,drawing for dollars,I like drawing pictures. and then i color them ...,20.0,drawing-for-dollars,False
kkst1474482071,Sponsor Dereck Blackburn (Lostwars) Artist in ...,"I, Dereck Blackburn will be taking upon an ...",300.0,sponsor-dereck-blackburn- lostwars-artist-in- ...,False
kkst183622197,Mr. Squiggles,So I saw darkpony's successfully funded ...,30.0,mr-squiggles,False
kkst1913131122,Support casting my sculpture in bronze ...,"I'm nearing completion on a sculpture, currently ...",2000.0,support-casting-my- sculpture-in-bronze ...,False
kkst1085176748,daily digest,I'm a fledgling videoblogger living in ...,700.0,daily-digest,False

country,currency,deadline,state_changed_at,created_at,launched_at,backers_count
US,USD,2009-05-03 01:59:59,2009-05-03 02:00:17,2009-04-24 14:15:07,2009-04-24 14:52:03,3
US,USD,2009-05-15 18:10:00,2009-05-15 19:00:18,2009-04-28 18:10:24,2009-04-28 22:26:32,2
US,USD,2009-05-22 16:26:00,2009-05-22 16:30:18,2009-05-12 16:26:53,2009-05-12 16:39:58,0
US,USD,2009-05-31 06:38:00,2009-05-31 06:45:17,2009-05-01 06:38:34,2009-05-01 07:22:21,1
US,USD,2009-05-31 19:20:00,2009-05-31 19:30:19,2009-04-29 19:19:59,2009-05-04 14:14:28,14

final_status
1
0
0
0
0


In [6]:
#to understand which type of product is being funded

remove = ['\r', '\x0b', '\n', '\x0c', '\t', ' ', '.', "'", '&', '-', '$', '!']
#create a cluster of words using desc
kickstarter['word_count_trim'] = graphlab.text_analytics.trim_rare_words(kickstarter['desc'], to_lower=True, delimiters=remove)
kickstarter['word_count'] = graphlab.text_analytics.count_words(kickstarter['word_count_trim'])
kickstarter['stop_words_trim'] = kickstarter['word_count'].dict_trim_by_keys(graphlab.text_analytics.stopwords(), True) 

#tfidf still doesn't get rid of simple words like and, the in and stuff, doesn't still give a better/true rare words
tfidf = graphlab.text_analytics.tf_idf(kickstarter['stop_words_trim'])

if graphlab.version <= '1.6.1':
    tfidf = tfidf['docs']
    
kickstarter['tfidf'] = tfidf

In [7]:
#create a topic model
topic_model = graphlab.topic_model.create(kickstarter['stop_words_trim'])

# if graphlab.version <= '1.6.1':
#     topic_model = topic_model['docs']
    
#kickstarter['topics'] = topic_model

In [None]:
#how to move topic model into a column in kickstarter

In [8]:
#can i query the topic model with the bm25 score?
query = ['art', 'comics', 'crafts', 'dance', 'design','fashion','film', 'video','food','games', 'journalism', 'music',
         'photography', 'publishing', 'technology', 'theater']

bm25_scores = graphlab.text_analytics.bm25(kickstarter['tfidf'], query)

In [11]:
#using a classification model to predict- final_status vs tfidf, goal

train_data, test_data = kickstarter.random_split(.8)

kickstarter['funded'] = kickstarter['final_status'] == 1

selected_features = ['tfidf', 'goal', 'country', 'launched_at']

In [12]:
#svm classification model

svm_classification_model = graphlab.svm_classifier.create(train_data,
                                                                      target = 'funded',
                                                                 penalty = 0.001,
                                                                      features = selected_features,
                                                                      max_iterations = 2 )
prediction = svm_classification_model.classify(test_data)
results = svm_classification_model.evaluate(test_data)

print "Accuracy         : %s" % results['accuracy']
print "Confusion Matrix : \n%s" % results['confusion_matrix']

PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.
          You can set ``validation_set=None`` to disable validation tracking.



Accuracy         : 0.685105170695
Confusion Matrix : 
+--------------+-----------------+-------+
| target_label | predicted_label | count |
+--------------+-----------------+-------+
|      0       |        1        |  528  |
|      1       |        1        |  685  |
|      0       |        0        | 13744 |
|      1       |        0        |  6104 |
+--------------+-----------------+-------+
[4 rows x 3 columns]



In [15]:
#parameter search with logistic classifier
from graphlab import model_parameter_search

params = {'target': 'final_status'}

logistic_parameter_search = model_parameter_search.create((train_data, test_data),graphlab.logistic_classifier.create, params)

[INFO] graphlab.deploy.job: Validating job.
[INFO] graphlab.deploy.map_job: Validation complete. Job: 'Model-Parameter-Search-Aug-26-2017-10-18-4600000' ready for execution
[INFO] graphlab.deploy.map_job: Job: 'Model-Parameter-Search-Aug-26-2017-10-18-4600000' scheduled.
[INFO] graphlab.deploy.job: Validating job.
[INFO] graphlab.deploy.map_job: A job with name 'Model-Parameter-Search-Aug-26-2017-10-18-4600000' already exists. Renaming the job to 'Model-Parameter-Search-Aug-26-2017-10-18-4600000-b51c9'.
[INFO] graphlab.deploy.map_job: Validation complete. Job: 'Model-Parameter-Search-Aug-26-2017-10-18-4600000-b51c9' ready for execution
[INFO] graphlab.deploy.map_job: Job: 'Model-Parameter-Search-Aug-26-2017-10-18-4600000-b51c9' scheduled.


In [16]:
logistic_parameter_search.get_results().sort('validation_accuracy', ascending= False)

model_id,l1_penalty,l2_penalty,target,training_accuracy,validation_accuracy
1,10.0,10.0,final_status,0.999988133522,0.724799392242
4,10.0,10.0,final_status,0.999988133522,0.724799392242
9,0.0001,0.1,final_status,0.999988133522,0.7203836475
8,0.001,0.0,final_status,0.999988133522,0.7203836475
2,0.001,0.001,final_status,0.999988133522,0.7203836475
5,0.001,0.0,final_status,0.999988133522,0.7203836475
0,0.1,0.0001,final_status,0.999988133522,0.720336166374
3,0.1,0.1,final_status,0.999988133522,0.720336166374
6,0.1,0.0,final_status,0.999988133522,0.720336166374
7,0.0,1.0,final_status,1.0,0.719861355111


In [17]:
print logistic_parameter_search.get_models()[1]

Class                          : LogisticClassifier

Schema
------
Number of coefficients         : 868556
Number of examples             : 84271
Number of classes              : 2
Number of feature columns      : 18
Number of unpacked features    : 120131

Hyperparameters
---------------
L1 penalty                     : 10.0
L2 penalty                     : 10.0

Training Summary
----------------
Solver                         : fista
Solver iterations              : 10
Solver status                  : TERMINATED: Iteration limit reached.
Training time (sec)            : 8.6086

Settings
--------
Log-likelihood                 : 2918.5395

Highest Positive Coefficients
-----------------------------
project_id[kkst229390481]      : 0.6821
name[Lowcountry Street Grocery : Mobile Farmers' Market] : 0.6821
desc[Charleston's first mission-driven mobile farmers' market committed to improving healthy food access & bolstering our local food economy] : 0.6821
keywords[lowcountry-street-grocery

In [18]:
#tuned logistic classification model

tuned_lc_model = graphlab.logistic_classifier.create(train_data,
                                                     target = 'funded',
                                                     features=None,
                                                     l2_penalty=10,
                                                     l1_penalty=10.0,
                                                     solver='fista',
                                                     feature_rescaling=True,
                                                     max_iterations=10,
                                                     validation_set='auto',
                                                     verbose=True
                                                    )

prediction = tuned_lc_model.classify(test_data)
results = tuned_lc_model.evaluate(test_data)

print "Accuracy         : %s" % results['accuracy']
print "Confusion Matrix : \n%s" % results['confusion_matrix']

PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.
          You can set ``validation_set=None`` to disable validation tracking.



Accuracy         : 0.724229618727
Confusion Matrix : 
+--------------+-----------------+-------+
| target_label | predicted_label | count |
+--------------+-----------------+-------+
|      0       |        1        |  490  |
|      0       |        0        | 13782 |
|      1       |        1        |  1471 |
|      1       |        0        |  5318 |
+--------------+-----------------+-------+
[4 rows x 3 columns]



In [9]:
#logistic classification model

logistic_classification_model = graphlab.logistic_classifier.create(train_data,
                                                                      target = 'funded',
                                                                      features = selected_features,
                                                                      l2_penalty = 0,
                                                                      max_iterations = 3 )
prediction = logistic_classification_model.classify(test_data)
results = logistic_classification_model.evaluate(test_data)

print "Accuracy         : %s" % results['accuracy']
print "Confusion Matrix : \n%s" % results['confusion_matrix']

PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.
          You can set ``validation_set=None`` to disable validation tracking.



Accuracy         : 0.691906866302
Confusion Matrix : 
+--------------+-----------------+-------+
| target_label | predicted_label | count |
+--------------+-----------------+-------+
|      0       |        1        |  1904 |
|      0       |        0        |  8721 |
|      1       |        0        |  2767 |
|      1       |        1        |  1769 |
+--------------+-----------------+-------+
[4 rows x 3 columns]



In [101]:
#neural network classification model


# nn_classification_model = graphlab.neuralnet_classifier.create(kickstarter,
#                                                               target = 'funded',
#                                                               max_iterations = 3)

# kickstarter['features'] = nn_classification_model.extract_features(kickstarter)

# nn_classification_model = graphlab.classifier.create(train_data,
#                                                      target = 'funded',
#                                                      features = ['features']
#                                                     )

# prediction = nn_classification_model.classify(test_data)
# results = nn_classification_model.evaluate(test_data)

# print "Accuracy         : %s" % results['accuracy']
# print "Confusion Matrix : \n%s" % results['confusion_matrix']

In [145]:
#feature engineering test data

import re
import datetime
from graphlab import SFrame


remove = ['\r', '\x0b', '\x0c', '\t', ' ', '.', "'", '&', '-', '$', '!', ':', '"', '/']

# data = graphlab.SFrame.read_csv('test1.csv',
#                                 delimiter=remove,
#                                 quote_char= '\0',
#                                 error_bad_lines=False
#                                )

data = graphlab.SFrame('test1.csv')

# (sf, bad_lines) = graphlab.SFrame.read_csv_with_errors('test1.csv')
# bad_lines

------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,float,str,str,str,str,str,str,str,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


In [146]:

#create a cluster of words using desc

data['word_count_trim'] = graphlab.text_analytics.trim_rare_words(data['desc'], to_lower=True, delimiters=remove)
data['word_count'] = graphlab.text_analytics.count_words(data['word_count_trim'])
data['stop_words_trim'] = data['word_count'].dict_trim_by_keys(graphlab.text_analytics.stopwords(), True) 

#tfidf still doesn't get rid of simple words like and, the in and stuff, doesn't still give a better/true rare words
tfidf = graphlab.text_analytics.tf_idf(data['stop_words_trim'])

if graphlab.version <= '1.6.1':
    tfidf = tfidf['docs']
    
data['tfidf'] = tfidf

In [147]:
#using the test data to make predictions

probability_predictions = tuned_lc_model.predict(data, output_type='probability')

In [149]:
#to export the file

sub = pandas.DataFrame()
sub['project_id'] = data['project_id']
sub['final_status'] = probability_predictions
sub['final_status'] = [1 if x > 0.5 else 0 for x in sub['final_status']]
sub.to_csv("prediction_file.csv",index=False)

In [96]:
check = graphlab.SFrame('prediction_file.csv')
check.sort('final_status', ascending=True
          )

------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,float]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


project_id,final_status
kkst1482006802,0.0320340245643
kkst1838463372,0.0384586149138
kkst1901963057,0.046790186118
kkst795481289,0.0470349892302
kkst1521088282,0.0476284232623
kkst1997327184,0.0480018783814
kkst759934009,0.0484265720539
kkst599549428,0.0486086876088
kkst2045263573,0.267023145945
kkst649181190,0.267039627328


In [139]:
import numpy as np
import pandas
import re

datum = pandas.read_csv('test.csv')

unix_cols = ['deadline','state_changed_at','launched_at','created_at']

for x in unix_cols:
    datum[x] = datum[x].apply(lambda k: datetime.datetime.fromtimestamp(int(k)).strftime('%Y-%m-%d %H:%M:%S'))

def desc_clean(word):
    p1 = re.sub(pattern='(\W+)|(\d+)|(\s+)',repl=' ',string=word)
    p1 = p1.lower().strip('"')
    return p1

kickdesc = kickdesc.map(desc_clean)
datum['keywords'] = datum['keywords'].replace(['-'], [' '], regex=True)
datum['name'] = datum['keywords']
datum['desc'] = datum['desc'].str.replace(":"," ").astype('str') 
datum['desc'] = datum['desc'].str.replace('"',' ').astype('str')
datum['desc'] = datum['desc'].str.replace('-',' ').astype('str')
datum['desc'] = datum['desc'].replace(['Ã','¶', 'Ã', 'Â', '©', '¥', '¤', '\s'], ['a','', 'a', 'a', '', 'y', '', ' '], regex=True)
datum['name'] = datum['name'].replace(['-'], [' '], regex=True)
datum['desc'] = datum['desc'].map(lambda x: x.strip())

In [142]:
datum['desc']

0        My first film, of many to come. Trying to purs...
1        A young man that has earned his master's in sc...
2        Film about a high school constructed for negro...
3        The acclaimed series about a group of transgen...
4        Blackdom's history offers  a new narrative tha...
5        Sexual immorality is Satan's weapon to wage wa...
6        Film focuses on connection between social alie...
7        Paris is hired by a jazz singer to kill an old...
8                   new web series created by jonney terry
9        A martyr faces execution at the hands of the S...
10       A Brighter Day                                ...
11       A tender short film about a young man who need...
12       The Heart of a Woman and The Heart of a Man is...
13       This film is a fictional crime drama following...
14       A family dramedy about a grandfather  and gran...
15       A short drama based on a true events. Story of...
16       An hour long pilot about a group of suburban L.

In [143]:
datum.to_csv('test1.csv',
             float_format=None,
             header=True,
             index=False,
             index_label=None,
             mode='w',
             encoding = 'utf-8',
             line_terminator='\n',
             tupleize_cols=False,
             date_format='%Y%m%d',
             decimal = '.'
            )

In [48]:
from graphlab import SArray

datum['word_count_trim'] = SArray(datum['desc'], dtype=str)
type(datum['word_count_trim'])

pandas.core.series.Series

In [40]:

#create a cluster of words using desc
remove = ['\r', '\x0b', '\x0c', '\t', ' ', '.', "'", '&', '-', '$', '!', ':', '"', '/']


datum['word_count_trim'] = graphlab.text_analytics.trim_rare_words(datum['desc'], to_lower=True, delimiters=remove)
datum['word_count'] = graphlab.text_analytics.count_words(datum['word_count_trim'])
datum['stop_words_trim'] = datum['word_count'].dict_trim_by_keys(graphlab.text_analytics.stopwords(), True) 

#tfidf still doesn't get rid of simple words like and, the in and stuff, doesn't still give a better/true rare words
tfidf = graphlab.text_analytics.tf_idf(datum['stop_words_trim'])

if graphlab.version <= '1.6.1':
    tfidf = tfidf['docs']
    
datum['tfidf'] = tfidf

ToolkitError: Input sa is not an SArray.

In [None]:
#using the test data to make predictions

probability_predictions = tuned_lc_model.predict(datum, output_type='probability')

In [None]:
#to export the file

sub = pd.DataFrame()
sub['project_id'] = data['project_id']
sub['final_status'] = probability_predictions
sub['final_status'] = [1 if x > 0.5 else 0 for x in sub['final_status']]
sub.to_csv("prediction_file.csv",index=False)