In [1]:
%pip install sklearn-pandas

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Load necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime

# For SVM stuff
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import WhitespaceTokenizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, OrdinalEncoder
from sklearn_pandas import DataFrameMapper
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.pipeline import make_pipeline

In [3]:
# Taken from Michael's notebook
def preprocess_df(df):
    df["deadline"] = pd.to_datetime(df["deadline"])
    df["launched"] = pd.to_datetime(df["launched"])
    df["success"] = df["pledged"] >= df["goal"]
    df["duration"] = df["deadline"] - df["launched"]
    
    return df

In [4]:
# Taken from Michael's notebook
def clean_df(df):
    df = df.drop(df.loc[df["duration"] < datetime.timedelta(days=1)].index)
    df = df.drop(df.loc[df["state"] == "live"].index)
    df = df.drop(df.loc[df["state"] == "canceled"].index)
    
    return df

In [5]:
# Taken from Michael's notebook
train_full = pd.read_csv("data/2018-train.csv").dropna()
validate_full = pd.read_csv("data/2018-validate.csv").dropna()

In [6]:
# Taken from Michael's notebook
train_pp = preprocess_df(train_full)
validate_pp = preprocess_df(validate_full)

In [7]:
# Taken from Michael's notebook
train_clean = clean_df(train_pp)
valid_clean = clean_df(validate_pp)
train_clean.head()

Unnamed: 0.1,Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,success,duration
0,319122,695425648,Peace-building through story-making with youth...,Children's Books,Publishing,SEK,2015-06-09,6000.0,2015-05-10 14:53:53,6251.0,successful,15,SE,756.92,762.02,731.42,True,29 days 09:06:07
2,175494,189251239,Colored Baggies for Boardgames,Tabletop Games,Games,USD,2013-01-07,6000.0,2012-11-08 20:06:31,15151.0,successful,518,US,15151.0,15151.0,6000.0,True,59 days 03:53:29
4,141771,1720248225,Two Scoops of Beauty health wellness women eve...,Events,Food,USD,2016-09-04,5202.0,2016-08-05 01:26:56,0.0,failed,0,US,0.0,0.0,5202.0,False,29 days 22:33:04
5,236644,272982453,Gavarcia - Haute Couture in Canada,Couture,Fashion,CAD,2015-04-16,3000.0,2015-03-18 16:27:44,556.0,failed,8,CA,434.86,456.07,2460.83,False,28 days 07:32:16
6,231360,245912004,"""One Last Crazy F*cking Night"" (#OLCFN) The Movie",Narrative Film,Film & Video,USD,2013-03-01,50000.0,2013-01-30 17:55:02,3041.0,failed,28,US,3041.0,3041.0,50000.0,False,29 days 06:04:58


In [8]:
# Define a count vectorizer
def tokenize(text): 
    tknzr = WhitespaceTokenizer()
    return tknzr.tokenize(text)

def get_count_vectorizer():
    nltk.download('stopwords')
    en_stopwords = set(stopwords.words("english")) 
    count_vectorizer = CountVectorizer(stop_words=en_stopwords, analyzer='word', tokenizer=tokenize, min_df=1)
    return count_vectorizer

def get_main_category_encoder():
    main_category_le = LabelEncoder()
    main_category_le.fit(train_clean['main_category'])
    return main_category_le

def get_category_encoder():
    category_le = LabelEncoder()
    category_le.fit(train_clean['category'])
    return category_le

def get_mapper():
    # Define a mapper for transforming the dataframe
    main_category_le = get_main_category_encoder()
    category_le = get_category_encoder()
    count_vectorizer = get_count_vectorizer()

    mapper = DataFrameMapper([
        ('name', count_vectorizer),
        ('main_category', main_category_le),
        ('category', category_le),
    ], df_out=True)
    return mapper

'''
        (['duration'], StandardScaler()),
        (['usd_goal_real'], StandardScaler()),
        (['launched_month', 'deadline_month'], OrdinalEncoder()),
'''


"\n        (['duration'], StandardScaler()),\n        (['usd_goal_real'], StandardScaler()),\n        (['launched_month', 'deadline_month'], OrdinalEncoder()),\n"

In [9]:
def prep(df):
    X = df[["name", "main_category", "category", "duration", "usd_goal_real"]].copy()
    X["launched_month"] = df["launched"].apply(lambda x: x.month)
    X["deadline_month"] = df["deadline"].apply(lambda x: x.month)
    X["duration"] = X["duration"].apply(lambda x: x.seconds)

    y = df["success"].copy()
    return X, y

In [10]:
X_train, y_train = prep(train_clean)
X_valid, y_valid = prep(valid_clean)

In [11]:
mapper = get_mapper()

[nltk_data] Error loading stopwords: <urlopen error [Errno 8] nodename
[nltk_data]     nor servname provided, or not known>


In [12]:
skf = StratifiedKFold(5)
kfolds = skf.get_n_splits(X_train, y_train)

pipeline = make_pipeline(
    mapper,
    SVC(),
)

In [None]:
params = {
        'svc__C': [0.01, 1.0, 10.0],
        'svc__degree': [2, 3],
        'svc__gamma': [0.1, 0.5, 1],
        'svc__kernel': ['linear', 'poly', 'rbf'],
}


random_svm = RandomizedSearchCV(pipeline,
                                params,
                                cv=kfolds,
                                scoring="accuracy",
                                verbose=1,   
                                n_jobs=-1)

_ = random_svm.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.


In [None]:
def report_results(pred, y):      
    acc = accuracy_score(y, pred)
    f1 = f1_score(y, pred)
    prec = precision_score(y, pred)
    rec = recall_score(y, pred)
    result = {'f1': f1, 'acc': acc, 'precision': prec, 'recall': rec}
    return result

def print_results(results):
    print("Results: ")
    for key, value in results.items():
        print("\t{}: {}".format(key, value))