In [None]:
# This is needed for DataFrameMapper
# %pip install sklearn-pandas

In [None]:
# Load necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime

# For SVM stuff
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import WhitespaceTokenizer
from sklearn.preprocessing import LabelEncoder, StandardScaler, OrdinalEncoder
from sklearn_pandas import DataFrameMapper
from gensim.models.doc2vec import Doc2Vec

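`format_df` parses the `deadline` and `launched` columns as datetimes and derives two new columns: `success` (whether `pledged` reached `goal`) and `duration` (`deadline` minus `launched`).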

In [None]:
def format_df(df):
    """Return a copy of ``df`` with parsed dates plus derived columns.

    The caller's frame is left untouched, so re-running a cell that calls
    this function always starts from the same raw data.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``deadline``, ``launched``, ``pledged``
        and ``goal``.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``deadline`` and ``launched`` are datetimes and
        two columns are added: ``success`` (``pledged >= goal``) and
        ``duration`` (``deadline - launched``).
    """
    # Work on a copy: assigning columns directly on `df` would silently
    # modify the frame the caller passed in.
    df = df.copy()
    df["deadline"] = pd.to_datetime(df["deadline"])
    df["launched"] = pd.to_datetime(df["launched"])
    df["success"] = df["pledged"] >= df["goal"]
    df["duration"] = df["deadline"] - df["launched"]
    return df

`clean_df` removes rows we don't care about. Namely:
* When the duration is less than one day
* If the project state is 'live'
* If the project state is 'canceled'

In [None]:
# Taken from Michael's notebook
def clean_df(df):
    """Return the rows of ``df`` that are usable for modelling.

    A row is removed when any of the following holds:

    * its ``duration`` is shorter than one day,
    * its ``state`` is ``"live"``,
    * its ``state`` is ``"canceled"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a timedelta ``duration`` column and a ``state`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame containing only the surviving rows; ``df`` itself is
        not modified and the surviving rows keep their index labels.
    """
    too_short = df["duration"] < datetime.timedelta(days=1)
    unresolved = df["state"].isin(["live", "canceled"])
    # Filter with a boolean mask rather than dropping by index label:
    # label-based drops also remove unrelated rows that happen to share a
    # label whenever the index is not unique.
    return df[~(too_short | unresolved)].copy()

These functions define the transformations of the columns we care about into the forms we're interested in running actual algorithms on. 

In [None]:
def tokenize(text):
    """Split ``text`` into a list of tokens with NLTK's ``WhitespaceTokenizer``."""
    return WhitespaceTokenizer().tokenize(text)

# TODO: use an embedding instead??
def get_count_vectorizer():
    """Build an unfitted bag-of-words vectorizer for project names.

    Downloads the NLTK ``stopwords`` corpus via ``nltk.download``, then
    configures a ``CountVectorizer`` that splits text with ``tokenize`` and
    ignores English stop words.

    Returns
    -------
    sklearn.feature_extraction.text.CountVectorizer
        A vectorizer that has not been fitted yet.
    """
    nltk.download('stopwords')
    # Pass the stop words as a list, not a set: newer scikit-learn releases
    # validate `stop_words` and only accept 'english', a list or None.
    en_stopwords = list(stopwords.words("english"))
    count_vectorizer = CountVectorizer(stop_words=en_stopwords, analyzer='word', tokenizer=tokenize, min_df=1)
    return count_vectorizer

def get_main_category_encoder():
    """Return a ``LabelEncoder`` fitted on the global ``train_clean['main_category']`` column."""
    # LabelEncoder.fit returns the encoder itself, so it can be returned directly.
    return LabelEncoder().fit(train_clean['main_category'])

def get_category_encoder():
    """Return a ``LabelEncoder`` fitted on the global ``train_clean['category']`` column."""
    # LabelEncoder.fit returns the encoder itself, so it can be returned directly.
    return LabelEncoder().fit(train_clean['category'])

def build_doc2vec(names, embedding_length=20):
    """Train a gensim ``Doc2Vec`` model on a series of project names.

    Parameters
    ----------
    names : pandas.Series
        One name (string) per entry; each is split with ``tokenize``.
    embedding_length : int, default 20
        Dimensionality of the learned document vectors.

    Returns
    -------
    gensim.models.doc2vec.Doc2Vec
        The trained model.

    Notes
    -----
    As a side effect the tokenized names are written to
    ``data/train_names.txt`` (one document per line); the ``data``
    directory must already exist.
    """
    tokenized = names.apply(tokenize)

    # this is fairly important, having a corpus_file instead of in-memory data
    # speeds up building the model significantly
    # https://github.com/RaRe-Technologies/gensim/issues/2218
    # The encoding is explicit so non-ASCII names are written the same way
    # on every platform instead of depending on the locale default.
    with open("data/train_names.txt", "w", encoding="utf-8") as f:
        f.writelines(" ".join(doc) + "\n" for doc in tokenized)

    # https://radimrehurek.com/gensim/models/doc2vec.html
    # note that infer_vector is NOT deterministic
    # https://github.com/RaRe-Technologies/gensim/issues/447
    # and they shouldn't be forced to be deterministic
    # vector_size now honours `embedding_length` (it used to be hard-coded to 20,
    # so the argument had no effect).
    model = Doc2Vec(corpus_file="data/train_names.txt", vector_size=embedding_length, min_count=1, workers=7)
    return model
    
# expects a series of names
def doc2vec_names(names, model):
    """Infer one vector per name.

    Each entry of ``names`` is split with ``tokenize`` and passed to
    ``model.infer_vector``; the results are returned as a list in the same
    order as ``names``.
    """
    return [model.infer_vector(tokens) for tokens in names.apply(tokenize)]

def _stack_doc2vec_vectors(column):
    """Stack a 1-D column whose cells are equal-length vectors into a 2-D matrix."""
    return np.vstack(list(column))


def get_mapper():
    """Build the (unfitted) ``DataFrameMapper`` that turns the feature frame into model inputs.

    Column handling:

    * ``name``                             -> bag-of-words counts (``get_count_vectorizer``)
    * ``main_category`` / ``category``     -> integer labels (``LabelEncoder``)
    * ``duration`` / ``usd_goal_real``     -> standardized (``StandardScaler``)
    * ``launched_month`` / ``deadline_month`` -> ``OrdinalEncoder``
    * ``doc2vec_names``                    -> each cell's vector expanded into
      one numeric column per dimension

    Returns
    -------
    sklearn_pandas.DataFrameMapper
        Configured with ``df_out=True``.
    """
    # Local import: FunctionTransformer is not part of the notebook's import cell.
    from sklearn.preprocessing import FunctionTransformer

    main_category_le = get_main_category_encoder()
    category_le = get_category_encoder()
    count_vectorizer = get_count_vectorizer()

    mapper = DataFrameMapper([
        ('name', count_vectorizer),
        ('main_category', main_category_le),
        ('category', category_le),
        (['duration'], StandardScaler()),
        (['usd_goal_real'], StandardScaler()),
        (['launched_month', 'deadline_month'], OrdinalEncoder()),
        # The doc2vec column holds a float vector per row. It must not go
        # through OrdinalEncoder (array cells are not valid categories, and
        # inferred vectors seen at transform time would never match the
        # fitted ones), so the vectors are stacked into plain numeric columns.
        ('doc2vec_names', FunctionTransformer(_stack_doc2vec_vectors, validate=False)),
    ], df_out=True)
    return mapper

def transform_df(df, mapper, d2v_model, fit=False):
    """Build the feature frame for ``df`` and run it through ``mapper``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``name``, ``main_category``, ``category``, ``duration``,
        ``usd_goal_real``, ``launched``, ``deadline`` and ``success``.
        ``launched``/``deadline`` must be datetimes and ``duration`` a
        timedelta column.
    mapper : object with ``fit_transform`` / ``transform``
        The feature mapper (see ``get_mapper``).
    d2v_model : object with ``infer_vector``
        Model used by ``doc2vec_names`` to embed the project names.
    fit : bool, default False
        If True the mapper is fitted on this data (``fit_transform``);
        otherwise the already-fitted mapper is applied (``transform``).

    Returns
    -------
    tuple
        ``(X_mapped, y, mapper)`` where ``y`` is a copy of ``df["success"]``.
    """
    X = df[["name", "main_category", "category", "duration", "usd_goal_real"]].copy()
    X["launched_month"] = df["launched"].dt.month
    X["deadline_month"] = df["deadline"].dt.month
    # Use the full length of the interval. `Timedelta.seconds` is only the
    # seconds component (0-86399) and silently discards whole days.
    # NOTE(review): the mapper built by get_mapper selects 'duration', not
    # this column - confirm which of the two is meant to be scaled.
    X["duration_seconds"] = X["duration"].dt.total_seconds()
    X["doc2vec_names"] = doc2vec_names(X["name"], d2v_model)

    X_mapped = mapper.fit_transform(X) if fit else mapper.transform(X)

    y = df["success"].copy()

    return X_mapped, y, mapper
    

In [None]:
# Taken from Michael's notebook
# Load the 2018 train/validation CSVs; dropna() discards every row that has
# a missing value in any column.
train_full = pd.read_csv("data/2018-train.csv").dropna()
validate_full = pd.read_csv("data/2018-validate.csv").dropna()

In [None]:
# Taken from Michael's notebook
# Run both splits through format_df (date parsing plus the derived
# `success` and `duration` columns).
train_format = format_df(train_full)
validate_format = format_df(validate_full)

In [None]:
# Taken from Michael's notebook
# Filter both splits with clean_df, then preview the first three rows of
# the cleaned training set.
train_clean = clean_df(train_format)
valid_clean = clean_df(validate_format)
train_clean.head(3)

In [None]:
%%time
# Train the Doc2Vec model on the training-set names only; the %%time cell
# magic above reports how long this cell takes to run.
d2v_model = build_doc2vec(train_clean["name"])

In [None]:
# Fit the mapper on the training split (fit=True), then reuse the returned
# mapper to transform the validation split without refitting (fit=False).
mapper = get_mapper()
X_train, y_train, mapper = transform_df(train_clean, mapper, d2v_model, fit=True)
X_valid, y_valid, mapper = transform_df(valid_clean, mapper, d2v_model, fit=False)

In [None]:
# TODO: write to file

# Preview the first five rows of the transformed training features.
X_train.head(5)