In [1]:
from pprint import pprint
import datetime
import pickle

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.keras.optimizers import SGD

In [2]:
def one_hot_df(df, one_hot_cols, drop_cols):
    """Return a one-hot encoded copy of ``df``.

    Args:
        df: Source DataFrame; it is copied first and never modified.
        one_hot_cols: Column names to expand into indicator columns.
        drop_cols: Column names to remove after the encoding step.

    Returns:
        A new DataFrame with ``one_hot_cols`` replaced by their indicator
        columns and ``drop_cols`` removed.
    """
    working = df.copy()
    encoded = pd.get_dummies(working, columns=one_hot_cols)
    return encoded.drop(columns=drop_cols)

# NOTE: despite the `valid_*` variable names, this returns the held-out TEST split, not a validation split.
def get_data(data_category="base", data_suffix="", one_hot_main=False, one_hot_both=False):
    """Load a preprocessed train/test split from ``data/`` as NumPy arrays.

    The evaluation split is read from the ``*-test*.csv`` files; the local
    ``valid_*`` names are historical and do not denote a separate validation set.

    Args:
        data_category: Dataset family, interpolated into the CSV file names.
        data_suffix: Optional file-name suffix inserted before ``.csv``.
        one_hot_main: One-hot encode ``main_category`` and the two month
            columns and drop ``category``. Ignored when ``one_hot_both`` is set.
        one_hot_both: One-hot encode ``main_category``, ``category`` and the
            two month columns.

    Returns:
        ``(train_x, train_y, valid_x, valid_y)``. The ``*_y`` arrays hold 0/1
        labels with shape ``(n, 1)``. When one-hot encoding is requested,
        ``valid_x`` has exactly the same columns, in the same order, as
        ``train_x``.
    """
    file_tag = (data_category, data_suffix)
    train_x = pd.read_csv("data/preprocess-%s-x-train%s.csv" % file_tag, index_col=0)
    train_y = pd.read_csv("data/preprocess-%s-y-train%s.csv" % file_tag)
    valid_x = pd.read_csv("data/preprocess-%s-x-test%s.csv" % file_tag, index_col=0)
    valid_y = pd.read_csv("data/preprocess-%s-y-test%s.csv" % file_tag)

    if data_category == "basic_name" and data_suffix == "-correct":
        # This export uses different column names; normalise them to the
        # names the rest of the function expects.
        month_names = {
            "launched_month_deadline_month_0": "launched_month",
            "launched_month_deadline_month_1": "deadline_month",
        }
        train_x = train_x.rename(columns=month_names)
        valid_x = valid_x.rename(columns=month_names)

        # Remove every "Unnamed..." column found in the training frame from both frames.
        unnamed_cols = [col for col in train_x.columns if "Unnamed" in col]
        train_x = train_x.drop(columns=unnamed_cols)
        valid_x = valid_x.drop(columns=unnamed_cols)

        # In this export the label lives in a column literally named "1".
        train_y["success"] = train_y["1"]
        valid_y["success"] = valid_y["1"]

    # Collapse any truthy/falsy label representation to integer 1/0.
    train_labels = train_y["success"].apply(lambda x: 1 if x else 0)
    valid_labels = valid_y["success"].apply(lambda x: 1 if x else 0)

    if one_hot_both:
        one_hot_cols = ['main_category', 'category', 'launched_month', 'deadline_month']
        drop_cols = []
    elif one_hot_main:
        one_hot_cols = ['main_category', 'launched_month', 'deadline_month']
        drop_cols = ['category']
    else:
        one_hot_cols = None

    if one_hot_cols is not None:
        train_x = one_hot_df(train_x, one_hot_cols, drop_cols)
        valid_x = one_hot_df(valid_x, one_hot_cols, drop_cols)
        # The two frames are encoded independently, so a category that occurs
        # in only one split would otherwise give them different columns.
        # Align the test frame to the training layout: indicators missing from
        # the test split become all-zero columns, and indicators unseen during
        # training are dropped because the model has no input for them.
        valid_x = valid_x.reindex(columns=train_x.columns, fill_value=0)

    train_x = np.array(train_x)
    train_y = np.array(train_labels).reshape((-1, 1))
    valid_x = np.array(valid_x)
    valid_y = np.array(valid_labels).reshape((-1, 1))

    return train_x, train_y, valid_x, valid_y

In [3]:
# Each value is the 4-tuple returned by get_data: (train_x, train_y, test_x, test_y),
# with both category columns and both month columns one-hot encoded.
data_base_one_hot_both = get_data("base", one_hot_both=True)
data_doc2vec_one_hot_both = get_data("doc2vec", one_hot_both=True)
#data_basicname_one_hot_both = get_data(data_category="basic_name", data_suffix="-correct", one_hot_both=True)

In [4]:
def build_and_evaluate_model(
    train_x, train_y, valid_x, valid_y,
    epochs=20,
    lr=0.01,
    layer_sizes=(13, 13),
    dropout=None,
    verbose=True):
    """Train a fully connected binary classifier and score it on held-out data.

    The network is a stack of ReLU ``Dense`` layers (one per entry in
    ``layer_sizes``), each optionally followed by ``Dropout``, ending in a
    single sigmoid unit. It is trained with Nesterov-momentum SGD on binary
    cross-entropy using a batch size of 128.

    Args:
        train_x, train_y: Training features and labels passed to ``model.fit``.
        valid_x, valid_y: Held-out features and labels passed to ``model.evaluate``.
        epochs: Number of training epochs.
        lr: SGD learning rate.
        layer_sizes: Width of each hidden layer, in order.
        dropout: Dropout rate applied after every hidden layer, or ``None``
            to add no dropout layers.
        verbose: Forwarded to ``model.fit`` to control per-epoch logging.

    Returns:
        ``(scores, model)`` where ``scores`` maps each name in
        ``model.metrics_names`` to its value on the held-out data and
        ``model`` is the trained Keras model.
    """
    # The hyperparameter summary is printed regardless of `verbose`.
    print("epochs:", epochs, end=" ")
    print("layer_sizes:", layer_sizes, end=" ")
    print("dropout:", dropout)

    model = Sequential()
    for layer_size in layer_sizes:
        model.add(Dense(layer_size, activation="relu"))
        if dropout is not None:
            model.add(Dropout(dropout))
    model.add(Dense(1, activation="sigmoid"))

    # `learning_rate` is the supported keyword; `lr` is only a deprecated alias.
    # NOTE(review): handling of the `decay` argument differs between Keras
    # releases - verify it is still accepted before upgrading TensorFlow.
    sgd = SGD(learning_rate=lr, decay=1e-6, momentum=0.9, nesterov=True)
    model.compile(loss="binary_crossentropy",
                  optimizer=sgd,
                  metrics=["accuracy"])

    model.fit(train_x, train_y, epochs=epochs, batch_size=128, verbose=verbose)
    score = model.evaluate(valid_x, valid_y, batch_size=128, verbose=False)
    # Pair each reported metric name with its evaluated value.
    return dict(zip(model.metrics_names, score)), model

In [5]:
# Train on the doc2vec feature set with the default hyperparameters and score it on the held-out split.
doc2vec_results, doc2vec_model = build_and_evaluate_model(*data_doc2vec_one_hot_both)

# The current timestamp is embedded in the path so each run saves to its own location.
doc2vec_saved_at = datetime.datetime.now().timestamp()
doc2vec_model.save("nn-final-models/doc2vec_model-%s.kerasmodel" % doc2vec_saved_at)

pprint(doc2vec_results)

epochs: 20 layer_sizes: (13, 13) dropout: None
Train on 249956 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: nn-final-models/doc2vec_model-1575783152.668142.kerasmodel/assets
{'accuracy': 0.66882634, 'loss': 0.5987510227147721}


In [6]:
# Train on the base feature set with the default hyperparameters and score it on the held-out split.
base_results, base_model = build_and_evaluate_model(*data_base_one_hot_both)

# The current timestamp is embedded in the path so each run saves to its own location.
base_saved_at = datetime.datetime.now().timestamp()
base_model.save("nn-final-models/base_model-%s.kerasmodel" % base_saved_at)

pprint(base_results)

epochs: 20 layer_sizes: (13, 13) dropout: None
Train on 249956 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
INFO:tensorflow:Assets written to: nn-final-models/base_model-1575783299.202439.kerasmodel/assets
{'accuracy': 0.6693663, 'loss': 0.5987539006547273}


In [7]:
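# Disabled: the basic_name dataset is not loaded in this notebook (its get_data call is commented out).
# build_and_evaluate_model returns (results, model), so unpack both values when re-enabling:
#basicname_results, basicname_model = build_and_evaluate_model(*data_basicname_one_hot_both)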