In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np
import pandas as pd

# Fix the random seeds so the stochastic parts of the neural network are repeatable
np.random.seed(10)
from tensorflow import set_random_seed
set_random_seed(15)
from tensorflow import keras
import tensorflow as tf

from keras.models import Model
from keras.layers import Input, Dense, Concatenate, Reshape, Dropout
from keras.layers.embeddings import Embedding

from keras.callbacks import EarlyStopping
from keras import optimizers

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import StratifiedKFold
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

PATH = "../input/"
import os
print(os.listdir(PATH))
# Any results you write to the current directory are saved as output.

In [None]:
## Helpers
def missing_impute(df):
    """Fill missing values column by column and return the same frame.

    Columns of dtype ``object`` get the string "other"; columns of dtype
    ``int64`` or ``float64`` get -1. Columns of any other dtype are left
    untouched. The frame passed in is modified (columns are reassigned) and
    also returned for convenience.
    """
    numeric_dtypes = ("int64", "float64")
    for col in df.columns:
        col_dtype = df[col].dtype
        if col_dtype == "object":
            fill_value = "other"
        elif any(col_dtype == name for name in numeric_dtypes):
            fill_value = -1
        else:
            # Datetimes, booleans, categoricals, ... are deliberately skipped.
            continue
        df[col] = df[col].fillna(fill_value)
    return df

def _iso_week(stamps):
    """Return the ISO-8601 week number of each timestamp in a datetime Series."""
    accessor = stamps.dt
    if hasattr(accessor, "isocalendar"):
        # ``.dt.week`` / ``.dt.weekofyear`` were deprecated in pandas 1.1 and
        # removed in 2.0; ``isocalendar()`` is their replacement.
        week = accessor.isocalendar().week
        # isocalendar() yields a nullable UInt32 column. Convert it to the plain
        # NumPy dtypes the old accessor produced (float64 only when NaT is present).
        return week.astype("float64" if week.isna().any() else "int64")
    # Older pandas without isocalendar(): the legacy accessor is still available.
    return accessor.week


def prepare_time_features(df, time_feature):
    """Parse a timestamp column and derive calendar features from it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the timestamp column. It is modified in place.
    time_feature : str
        Name of a string column formatted as ``'%Y-%m-%d %H:%M:%S'``, optionally
        followed by a literal " UTC" suffix, which is stripped before parsing.

    Returns
    -------
    pandas.DataFrame
        The same ``df``, with ``time_feature`` converted to datetime and the
        columns ``hour_of_day``, ``week``, ``month``, ``year``, ``day_of_year``,
        ``week_of_year``, ``weekday``, ``quarter`` and ``day_of_month`` added.
    """
    raw = df[time_feature].str.replace(" UTC", "", regex=False)
    df[time_feature] = pd.to_datetime(raw, format='%Y-%m-%d %H:%M:%S')

    # Index by the *value* of ``time_feature``; attribute access
    # (``df.time_feature``) would look for a column literally named "time_feature".
    stamps = df[time_feature]
    iso_week = _iso_week(stamps)

    df['hour_of_day'] = stamps.dt.hour
    df['week'] = iso_week
    df['month'] = stamps.dt.month
    df["year"] = stamps.dt.year
    df['day_of_year'] = stamps.dt.dayofyear
    df['week_of_year'] = iso_week
    df["weekday"] = stamps.dt.weekday
    df["quarter"] = stamps.dt.quarter
    df["day_of_month"] = stamps.dt.day

    return df

# def preproc(X_train, X_val, X_test):
#     input_list_train = []
#     input_list_val = []
#     input_list_test = []
    
#     #the cols to be embedded: rescaling to range [0, # values)
#     for c in embed_cols:
#         raw_vals = np.unique(X_train[c])
#         val_map = {}
#         for i in range(len(raw_vals)):
#             val_map[raw_vals[i]] = i       
#         input_list_train.append(X_train[c].map(val_map).values)
#         input_list_val.append(X_val[c].map(val_map).fillna(0).values)
#         input_list_test.append(X_test[c].map(val_map).fillna(0).values)
        
# #     other_cols = [c for c in X_train.columns if (not c in embed_cols)]
# #     input_list_train.append(X_train[other_cols].values)
# #     input_list_val.append(X_val[other_cols].values)
# #     input_list_test.append(X_test[other_cols].values)
    
#     return input_list_train, input_list_val, input_list_test   

In [None]:
# Read the train and test tables from the input directory.
train = pd.read_csv(f"{PATH}train.csv")
test = pd.read_csv(f"{PATH}test.csv")

# Keep a copy of the test card ids; they become the id column of the submission.
testdex = test["card_id"].copy()

# Auxiliary tables, currently not loaded:
# merchants = pd.read_csv(f"{PATH}merchants.csv")
# hist_tran = pd.read_csv(f"{PATH}historical_transactions.csv")
# new_merch_tran = pd.read_csv(f"{PATH}new_merchant_transactions.csv")

In [None]:
# from sklearn.preprocessing import MinMaxScaler

# mms = MinMaxScaler()
# mms.fit(train["target"].values.reshape(-1, 1))
# y = mms.transform(train["target"].values.reshape(-1, 1))
# y = np.array([x[0] for x in y])

In [None]:
from sklearn import preprocessing

# Standardise the regression target. The fitted scaler is kept so that
# predictions can later be mapped back to the original target scale.
scaler = preprocessing.StandardScaler()
target_column = train["target"].values.reshape(-1, 1)
y = scaler.fit_transform(target_column)
# fit_transform returns an (n, 1) array; keep its single column as a Series.
y = pd.Series(y[:, 0])

In [None]:
# Sizes of the fully connected layers stacked on top of the embeddings.
hidden_units = (32, 4)
# NOTE(review): not referenced by the model below (embedding widths are derived
# from the data); kept in case another cell reads it.
f1_embedding_size = 8

# (input name, training column) for each categorical feature that gets its own
# embedding. Each training instance supplies one integer code per feature.
embedded_features = (
    ('f1', 'feature_1'),
    ('f2', 'feature_2'),
    ('f3', 'feature_3'),
)

feature_inputs = []
feature_embeddings = []
for prefix, column in embedded_features:
    codes = train[column]
    # The raw codes are fed to the Embedding layer as row indices, so the table
    # needs max(code) + 1 rows. nunique() rows are only enough when the codes
    # are exactly 0..n-1; otherwise (e.g. codes starting at 1) the largest code
    # indexes past the end of the table.
    # Assumes the codes are non-negative integers - TODO confirm against the data.
    table_rows = int(codes.max()) + 1
    # Same width as before (number of distinct values minus one), but never 0.
    table_width = max(1, codes.nunique() - 1)

    feature_input = keras.Input(shape=(1,), name=prefix)
    feature_inputs.append(feature_input)
    feature_embeddings.append(
        keras.layers.Embedding(table_rows, table_width,
                               input_length=1, name=prefix + '_layer')(feature_input)
    )

# Keep the individual names available for any cell that refers to them.
input_f1, input_f2, input_f3 = feature_inputs
f1_embedded, f2_embedded, f3_embedded = feature_embeddings

# Concatenate the embeddings (and remove the useless extra dimension)
concatenated = keras.layers.Concatenate()(feature_embeddings)
out = keras.layers.Flatten()(concatenated)

# Add one or more hidden layers
for n_hidden in hidden_units:
    out = keras.layers.Dense(n_hidden, activation='relu')(out)

# A single linear output: the predicted target value
out = keras.layers.Dense(1, activation='linear', name='prediction')(out)

model = keras.Model(
    inputs=feature_inputs,
    outputs=out,
)
model.summary(line_length=88)

In [None]:
# Optimizer choice: one of the tf.train optimizers
# (https://www.tensorflow.org/api_guides/python/train#Optimizers) is used rather
# than a Keras optimizer string such as 'adam' or 'SGD'. Per the original
# author's note, the tf.keras.optimizers variants seem to be much slower with
# embedding layers because they do not handle sparse gradient updates efficiently.
model.compile(
    optimizer=tf.train.AdamOptimizer(0.005),
    loss='MSE',
    metrics=['MAE'],
)

In [None]:
# Train on the three categorical features, holding out 10% of the rows for
# validation. The returned history is used afterwards to pick the best epoch.
hist = model.fit(
    x=[train[column] for column in ("feature_1", "feature_2", "feature_3")],
    y=y,
    batch_size=5000,
    epochs=20,
    verbose=1,
    validation_split=0.1,
);

In [None]:
# Preview the test-set model inputs without dumping the full columns.
test[["feature_1", "feature_2", "feature_3"]].head()

In [None]:
# Model Evaluation
# Pick the epoch with the lowest validation loss (history is 0-indexed).
best = np.argmin(hist.history["val_loss"])
print("Optimal Epoch: ",best+1)
print("Train Score: {}, Validation Score: {}".format(hist.history["loss"][best],hist.history["val_loss"][best]))

# Loss curves. The model is compiled with loss='MSE', so the plotted values are
# mean squared errors (not RMSE), measured on the standardised target.
plt.plot(hist.history['loss'], label='train')
plt.plot(hist.history['val_loss'], label='validation')
plt.xlabel("Epochs")
plt.ylabel("Mean Squared Error")
plt.title("Train and Validation Error")
plt.legend()
plt.savefig("Train and Validation MSE Progression.png")
plt.show()

# Predict on the test set and undo the target standardisation.
pred = model.predict([test.feature_1, test.feature_2,test.feature_3])
pred = scaler.inverse_transform(pred)
# Predictions come back as an (n, 1) array; keep the single column.
pred1 = pred[:, 0]

# Submission file: one row per test card id.
sub_df = pd.DataFrame({"card_id":testdex.values})
sub_df["target"] = pred1
sub_df.to_csv("submit.csv", index=False)

In [None]:
# Summary statistics of the submitted predictions, as a quick sanity check.
sub_df.loc[:, "target"].describe()