In [1]:
# This is needed for DataFrameMapper
# %pip install sklearn-pandas

In [2]:
# Load necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime

# For SVM stuff
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import WhitespaceTokenizer
from sklearn.preprocessing import LabelEncoder, StandardScaler, OrdinalEncoder
from sklearn_pandas import DataFrameMapper

Format does basic work to change the format of columns into something we can use.

In [3]:
def format_df(df):
    df["deadline"] = pd.to_datetime(df["deadline"])
    df["launched"] = pd.to_datetime(df["launched"])
    df["success"] = df["pledged"] >= df["goal"]
    df["duration"] = df["deadline"] - df["launched"]
    return df

Clean removes columns we don't care about. Namely:
* When the duration is less than one day
* If the project state is 'live'
* If the project state is cancelled

In [4]:
# Taken from Michael's notebook
def clean_df(df):
    df = df.drop(df.loc[df["duration"] < datetime.timedelta(days=1)].index)
    df = df.drop(df.loc[df["state"] == "live"].index)
    df = df.drop(df.loc[df["state"] == "canceled"].index)
    return df

These functions define the transformations of the columns we care about into the forms we're interested in running actual algorithms on. 

In [5]:
def tokenize(text): 
    tknzr = WhitespaceTokenizer()
    return tknzr.tokenize(text)

# TODO: use an embedding instead??
def get_count_vectorizer():
    nltk.download('stopwords')
    en_stopwords = set(stopwords.words("english")) 
    count_vectorizer = CountVectorizer(stop_words=en_stopwords, analyzer='word', tokenizer=tokenize, min_df=1)
    return count_vectorizer

def get_main_category_encoder():
    main_category_le = LabelEncoder()
    main_category_le.fit(train_clean['main_category'])
    return main_category_le

def get_category_encoder():
    category_le = LabelEncoder()
    category_le.fit(train_clean['category'])
    return category_le

def get_mapper():
    main_category_le = get_main_category_encoder()
    category_le = get_category_encoder()
    count_vectorizer = get_count_vectorizer()

    mapper = DataFrameMapper([
        ('main_category', main_category_le),
        ('category', category_le),
        (['duration'], StandardScaler()),
        (['usd_goal_real'], StandardScaler()),
        (['launched_month', 'deadline_month'], OrdinalEncoder()),
    ], df_out=True)
    return mapper

def transform_df(df, mapper, fit=False):
    X = df[["name", "main_category", "category", "duration", "usd_goal_real"]].copy()
    X["launched_month"] = df["launched"].apply(lambda x: x.month)
    X["deadline_month"] = df["deadline"].apply(lambda x: x.month)
    X["duration_seconds"] = X["duration"].apply(lambda x: x.seconds)
    
    if fit:
        X_mapped = mapper.fit_transform(X)
    else:
        X_mapped = mapper.transform(X)
        
    y = df["success"].copy()
    
    return X_mapped, y, mapper
    

In [6]:
# Taken from Michael's notebook
train_full = pd.read_csv("data/2018-train.csv").dropna()
validate_full = pd.read_csv("data/2018-validate.csv").dropna()

In [7]:
# Taken from Michael's notebook
train_format = format_df(train_full)
validate_format = format_df(validate_full)

In [8]:
# Taken from Michael's notebook
train_clean = clean_df(train_format)
valid_clean = clean_df(validate_format)
train_clean.head(3)

Unnamed: 0.1,Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,success,duration
0,319122,695425648,Peace-building through story-making with youth...,Children's Books,Publishing,SEK,2015-06-09,6000.0,2015-05-10 14:53:53,6251.0,successful,15,SE,756.92,762.02,731.42,True,29 days 09:06:07
2,175494,189251239,Colored Baggies for Boardgames,Tabletop Games,Games,USD,2013-01-07,6000.0,2012-11-08 20:06:31,15151.0,successful,518,US,15151.0,15151.0,6000.0,True,59 days 03:53:29
4,141771,1720248225,Two Scoops of Beauty health wellness women eve...,Events,Food,USD,2016-09-04,5202.0,2016-08-05 01:26:56,0.0,failed,0,US,0.0,0.0,5202.0,False,29 days 22:33:04


In [9]:
%%time
mapper = get_mapper()
X_train, y_train, mapper = transform_df(train_clean, mapper, fit=True)
X_valid, y_valid, mapper = transform_df(valid_clean, mapper, fit=False)

[nltk_data] Downloading package stopwords to /home/delta/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


CPU times: user 5.51 s, sys: 62.4 ms, total: 5.57 s
Wall time: 5.65 s


In [10]:
X_train.head(5)

Unnamed: 0,main_category,category,duration,usd_goal_real,launched_month_deadline_month_0,launched_month_deadline_month_1
0,12,19,-0.316333,-0.036515,4.0,5.0
2,8,136,2.025723,-0.03174,10.0,0.0
4,7,44,-0.272265,-0.032463,7.0,8.0
5,5,31,-0.400095,-0.034948,2.0,3.0
6,6,93,-0.326225,0.00814,0.0,2.0


In [11]:
y_train.head(5)

0     True
2     True
4    False
5    False
6    False
Name: success, dtype: bool

In [14]:
# TODO: write to file
X_train.to_csv("data/preprocess-base-x-train.csv")
y_train.to_csv("data/preprocess-base-y-train.csv")
X_valid.to_csv("data/preprocess-base-x-valid.csv")
y_valid.to_csv("data/preprocess-base-y-valid.csv")

  y_train.to_csv("data/preprocess-base-y-train.csv")
  y_valid.to_csv("data/preprocess-base-y-valid.csv")
