In [1]:
# This is needed for DataFrameMapper
# %pip install sklearn-pandas

In [2]:
# Load necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime

# For SVM stuff
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import WhitespaceTokenizer
from sklearn.preprocessing import LabelEncoder, StandardScaler, OrdinalEncoder
from sklearn_pandas import DataFrameMapper
from gensim.models.doc2vec import Doc2Vec

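`format_df` parses the `deadline` and `launched` columns as datetimes and adds two derived columns: `success` (whether `pledged` reached `goal`) and `duration` (`deadline` minus `launched`).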

In [3]:
def format_df(df):
    """Return a copy of ``df`` with parsed dates and derived columns.

    The caller's frame is left untouched, so a cell that calls this function
    can be re-run and always starts from the same input.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``deadline``, ``launched``, ``pledged`` and ``goal``.

    Returns
    -------
    pandas.DataFrame
        New frame in which ``deadline`` and ``launched`` are datetimes,
        ``success`` is ``pledged >= goal`` and ``duration`` is
        ``deadline - launched``.
    """
    deadline = pd.to_datetime(df["deadline"])
    launched = pd.to_datetime(df["launched"])
    # ``assign`` builds a new frame: existing columns keep their position and
    # the two derived columns are appended in the order given here.
    return df.assign(
        deadline=deadline,
        launched=launched,
        success=df["pledged"] >= df["goal"],
        duration=deadline - launched,
    )

`clean_df` removes rows we don't care about. Namely:
* When the duration is less than one day
* If the project state is `live`
* If the project state is `canceled`

In [4]:
# Taken from Michael's notebook
def clean_df(df):
    """Drop rows that should not be used for modelling.

    A row is removed when its ``duration`` is shorter than one day or when its
    ``state`` is ``"live"`` or ``"canceled"``.

    Rows are selected with a boolean mask rather than dropped by index label,
    so a frame with repeated index labels only loses the rows that actually
    match (a label-based ``drop`` removes every row sharing the label).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a timedelta ``duration`` column and a ``state`` column.

    Returns
    -------
    pandas.DataFrame
        New frame holding the surviving rows; ``df`` itself is not modified.
    """
    too_short = df["duration"] < datetime.timedelta(days=1)
    unwanted_state = df["state"].isin(["live", "canceled"])
    # Negate the "bad row" mask (rather than testing ``>=``) so that rows with
    # a missing duration compare False above and are therefore kept.
    return df.loc[~(too_short | unwanted_state)]

These functions define how the columns we care about are transformed into the form we actually run the algorithms on.

In [5]:
# Size of the doc2vec vector produced for each project name
DOC2VEC_OUTPUT_LENGTH = 20

In [6]:
def tokenize(text):
    """Split ``text`` into tokens using NLTK's ``WhitespaceTokenizer``.

    Returns whatever the tokenizer's ``tokenize`` method produces for ``text``.
    """
    return WhitespaceTokenizer().tokenize(text)

def get_main_category_encoder(train_clean):
    """Fit and return a ``LabelEncoder`` on ``train_clean['main_category']``."""
    # ``fit`` hands back the encoder itself, so the fitted object is returned directly
    return LabelEncoder().fit(train_clean['main_category'])

def get_category_encoder(train_clean):
    """Fit and return a ``LabelEncoder`` on ``train_clean['category']``."""
    # ``fit`` hands back the encoder itself, so the fitted object is returned directly
    return LabelEncoder().fit(train_clean['category'])

def build_doc2vec(names, corpus_path="data/train_names.txt", workers=7):
    """Train a Doc2Vec model on a series of project names.

    The names are tokenized with ``tokenize`` and written, one document per
    line with tokens separated by single spaces, to ``corpus_path``. The model
    is then trained from that file.

    Parameters
    ----------
    names : pandas.Series
        Series of name strings.
    corpus_path : str, optional
        Location of the intermediate corpus file (overwritten on every call).
    workers : int, optional
        Passed through to ``Doc2Vec`` as ``workers``.

    Returns
    -------
    gensim.models.doc2vec.Doc2Vec
        Model with ``vector_size=DOC2VEC_OUTPUT_LENGTH`` and ``min_count=1``.
    """
    # this is fairly important, having a corpus_file instead of in-memory data
    # speeds up building the model significantly
    # https://github.com/RaRe-Technologies/gensim/issues/2218
    #
    # The encoding is pinned to UTF-8 because names can contain non-ASCII
    # characters and the platform default encoding is not guaranteed to be able
    # to write them.
    # NOTE(review): gensim is expected to read corpus_file as UTF-8 - confirm.
    with open(corpus_path, "w", encoding="utf-8") as f:
        f.writelines(" ".join(tokens) + "\n" for tokens in names.apply(tokenize))

    # https://radimrehurek.com/gensim/models/doc2vec.html
    # note that infer_vector is NOT deterministic
    # https://github.com/RaRe-Technologies/gensim/issues/447
    # and it shouldn't be forced to be deterministic
    return Doc2Vec(
        corpus_file=corpus_path,
        vector_size=DOC2VEC_OUTPUT_LENGTH,
        min_count=1,
        workers=workers,
    )
    
# expects a series of names
def doc2vec_names(names, model):
    """Infer one doc2vec vector per name.

    Every entry of the ``names`` series is tokenized with ``tokenize`` and the
    resulting token list is passed to ``model.infer_vector``. The vectors are
    returned as a list in the same order as ``names``.
    """
    token_lists = list(names.apply(tokenize))
    return [model.infer_vector(tokens) for tokens in token_lists]

def get_mapper(train_clean):
    """Build the ``DataFrameMapper`` used to turn rows into model features.

    The two category columns get label encoders fitted on ``train_clean``;
    ``duration`` and ``usd_goal_real`` get a ``StandardScaler``; the two month
    columns get an ``OrdinalEncoder``. Each of the ``DOC2VEC_OUTPUT_LENGTH``
    doc2vec columns is listed on its own with ``None`` as its transformer.

    Returns the (not yet fitted) mapper, configured with ``df_out=True``.
    """
    encoded_and_scaled = [
        ('main_category', get_main_category_encoder(train_clean)),
        ('category', get_category_encoder(train_clean)),
        (['duration'], StandardScaler()),
        (['usd_goal_real'], StandardScaler()),
        (['launched_month'], OrdinalEncoder()),
        (['deadline_month'], OrdinalEncoder()),
    ]
    # One entry per doc2vec component, each mapped individually
    doc2vec_passthrough = [
        ("doc2vec_names_%s" % i, None) for i in range(DOC2VEC_OUTPUT_LENGTH)
    ]
    return DataFrameMapper(encoded_and_scaled + doc2vec_passthrough, df_out=True)

def transform_df(df, mapper, d2v_model, fit=False):
    """Turn a formatted and cleaned frame into mapped features and a target.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``name``, ``main_category``, ``category``, ``duration``,
        ``usd_goal_real``, ``launched``, ``deadline`` and ``success``.
    mapper : DataFrameMapper
        Applied to the assembled feature frame.
    d2v_model
        Model handed to ``doc2vec_names`` to vectorize the ``name`` column.
    fit : bool, optional
        When true the mapper is fitted on this data (``fit_transform``);
        otherwise it is only applied (``transform``).

    Returns
    -------
    tuple
        ``(X_mapped, y, mapper)`` where ``y`` is a copy of ``df["success"]``.
    """
    X = df[["name", "main_category", "category", "duration", "usd_goal_real"]].copy()
    X["launched_month"] = df["launched"].dt.month
    X["deadline_month"] = df["deadline"].dt.month
    # ``total_seconds()`` is the full length of the duration; ``.seconds`` is
    # only the sub-day component (0-86399) and silently discards whole days.
    X["duration_seconds"] = X["duration"].dt.total_seconds()

    doc2vec_names_array = np.array(doc2vec_names(X["name"], d2v_model))
    for i in range(DOC2VEC_OUTPUT_LENGTH):
        X["doc2vec_names_%s" % i] = doc2vec_names_array[:, i]

    X_mapped = mapper.fit_transform(X) if fit else mapper.transform(X)
    y = df["success"].copy()

    # it is ridiculous that this is necessary
    # Over-long doc2vec column names are shortened back to
    # "doc2vec_names_<index>". The index is taken as everything after the last
    # underscore, so two-digit indices (10-19) stay distinct instead of
    # collapsing onto their final digit.
    # NOTE(review): presumably these long names come from mapping several
    # columns with a single mapper entry - confirm against sklearn-pandas.
    shortened = {
        col: "doc2vec_names_%s" % col.rsplit("_", 1)[-1]
        for col in X_mapped.columns
        if len(col) > 25 and "doc2vec" in col
    }
    X_mapped = X_mapped.rename(columns=shortened)

    return X_mapped, y, mapper
    

In [7]:
# Returns (X_train, y_train, X_valid, y_valid, X_test, y_test)
def get_train_and_test(train_file="data/2018-train.csv", valid_file="data/2018-validate.csv", test_file="data/2018-test.csv"):
    """Load the three CSV splits and return their transformed features and targets.

    Each file is read, rows containing missing values are dropped, and the
    result is passed through ``format_df`` and then ``clean_df``. The doc2vec
    model and the mapper are built from the training split only; the mapper is
    fitted while transforming the training split and reused for the other two.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_valid, y_valid, X_test, y_test)``
    """
    train_clean, valid_clean, test_clean = [
        clean_df(format_df(pd.read_csv(csv_path).dropna()))
        for csv_path in (train_file, valid_file, test_file)
    ]

    print("building d2v model")
    d2v_model = build_doc2vec(train_clean["name"])

    print("getting mapper")
    mapper = get_mapper(train_clean)

    # Only the training split fits the mapper; the others reuse the fitted one
    stages = (
        ("transforming train", train_clean, True),
        ("transforming valid", valid_clean, False),
        ("transforming test", test_clean, False),
    )
    results = []
    for message, split_frame, should_fit in stages:
        print(message)
        X_split, y_split, mapper = transform_df(split_frame, mapper, d2v_model, fit=should_fit)
        results.extend((X_split, y_split))

    return tuple(results)

In [8]:
# Run the whole pipeline with the default file paths: load, clean and transform the three splits
X_train, y_train, X_valid, y_valid, X_test, y_test = get_train_and_test()

building d2v model
getting mapper
transforming train
transforming valid
transforming test


In [9]:
# Persist every split. The feature frames are written with their index column,
# the target series without it.
csv_exports = (
    (X_train, "data/preprocess-doc2vec-x-train.csv", True),
    (y_train, "data/preprocess-doc2vec-y-train.csv", False),
    (X_valid, "data/preprocess-doc2vec-x-valid.csv", True),
    (y_valid, "data/preprocess-doc2vec-y-valid.csv", False),
    (X_test, "data/preprocess-doc2vec-x-test.csv", True),
    (y_test, "data/preprocess-doc2vec-y-test.csv", False),
)
for split_data, csv_path, keep_index in csv_exports:
    split_data.to_csv(csv_path, header=True, index=keep_index)