In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np
import pandas as pd

# Seed NumPy and TensorFlow so the stochastic parts of the neural network
# are repeatable from one run of the notebook to the next.
np.random.seed(10)
# NOTE(review): importing set_random_seed from the top-level tensorflow package
# presumably requires TensorFlow 1.x -- confirm against the installed version.
from tensorflow import set_random_seed
set_random_seed(15)

from keras.models import Model
from keras.layers import Input, Dense, Concatenate, Reshape, Dropout
from keras.layers.embeddings import Embedding

from keras.callbacks import EarlyStopping
from keras import optimizers

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import StratifiedKFold
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

# Root directory of the input data files read by the cells below.
PATH = "../input/"
import os
# List the directory contents so the available input files are visible in the output.
print(os.listdir(PATH))
# Any results you write to the current directory are saved as output.

In [None]:
## Helpers
def missing_impute(df):
    """Fill missing values in every text or numeric column of ``df``.

    Text columns (object dtype, or pandas' dedicated string dtype when the
    installed pandas provides one) receive the placeholder ``"other"``.
    Numeric columns of *any* width (``float32``, ``int32``, nullable integers,
    ...) receive the sentinel ``-1``; previously only ``int64``/``float64``
    were handled, so narrower numeric columns silently kept their NaNs.
    Columns of any other dtype (datetimes, categoricals, ...) are left as is.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # ``pd.StringDtype`` does not exist in very old pandas releases; an empty
    # tuple makes the isinstance() check below evaluate to False in that case.
    string_dtype = getattr(pd, "StringDtype", ())

    for col in df.columns:
        values = df[col]
        is_text = (pd.api.types.is_object_dtype(values)
                   or isinstance(values.dtype, string_dtype))
        if is_text:
            df[col] = values.fillna("other")
        elif pd.api.types.is_numeric_dtype(values):
            df[col] = values.fillna(-1)
    return df

def prepare_time_features(df, time_feature):
    """Parse a timestamp column and derive calendar features from it.

    The column named by ``time_feature`` is expected to hold strings such as
    ``"2018-01-15 13:45:00 UTC"``. The ``" UTC"`` suffix is stripped, the
    column is converted to datetimes, and the following integer columns are
    added: ``hour_of_day``, ``week``, ``month``, ``year``, ``day_of_year``,
    ``week_of_year``, ``weekday``, ``quarter`` and ``day_of_month``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the timestamp column. It is modified in place.
    time_feature : str
        Name of the timestamp column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the parsed column and the new features.
    """
    df[time_feature] = df[time_feature].str.replace(" UTC", "")
    df[time_feature] = pd.to_datetime(df[time_feature], format='%Y-%m-%d %H:%M:%S')

    # Look the column up by the *value* of ``time_feature``; attribute access
    # (``df.time_feature``) would search for a column literally called
    # "time_feature" and fail for every real column name.
    stamps = df[time_feature].dt

    # ``.dt.week`` / ``.dt.weekofyear`` no longer exist in current pandas;
    # prefer the ISO-calendar accessor and fall back for older releases.
    if hasattr(stamps, "isocalendar"):
        iso_week = stamps.isocalendar().week
    else:
        iso_week = stamps.week

    df['hour_of_day'] = stamps.hour
    df['week'] = iso_week
    df['month'] = stamps.month
    df["year"] = stamps.year
    df['day_of_year'] = stamps.dayofyear
    df['week_of_year'] = iso_week
    df["weekday"] = stamps.weekday
    df["quarter"] = stamps.quarter
    df["day_of_month"] = stamps.day

    return df

def preproc(X_train, X_val, X_test, cols=None):
    """Label-encode categorical columns into one array per column.

    For each column, the sorted unique values seen in ``X_train`` are mapped
    to the integers ``0 .. n_unique - 1``. The same mapping is applied to the
    validation and test frames; values that never occurred in training map to
    NaN and are then replaced by ``0``, so they share the index of the
    smallest training value.

    Parameters
    ----------
    X_train, X_val, X_test : pandas.DataFrame
        Frames that each contain every column listed in ``cols``.
    cols : sequence of str, optional
        Columns to encode. When omitted, the notebook-level ``embed_cols``
        list is used, which keeps existing three-argument calls working.

    Returns
    -------
    tuple of (list, list, list)
        Lists of 1-D arrays for train, validation and test, one array per
        column, in the order of ``cols``.
    """
    if cols is None:
        # Backward-compatible fallback to the global defined in the notebook.
        cols = embed_cols

    input_list_train = []
    input_list_val = []
    input_list_test = []

    # Rescale each embedded column to the range [0, number of unique values).
    for c in cols:
        val_map = {raw: idx for idx, raw in enumerate(np.unique(X_train[c]))}
        input_list_train.append(X_train[c].map(val_map).values)
        input_list_val.append(X_val[c].map(val_map).fillna(0).values)
        input_list_test.append(X_test[c].map(val_map).fillna(0).values)

    return input_list_train, input_list_val, input_list_test

In [None]:
# Build the file paths with os.path.join so loading works whether or not PATH
# ends with a path separator.
train = pd.read_csv(os.path.join(PATH, "train.csv"))
test = pd.read_csv(os.path.join(PATH, "test.csv"))
# Keep the test card ids aside; they are needed to assemble the submission file.
testdex = test["card_id"].copy()
# Other competition files that this notebook does not load:
# merchants.csv, historical_transactions.csv, new_merchant_transactions.csv

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Fit a min-max scaler on the training target and keep the scaled values as a
# flat 1-D array. The fitted scaler is reused later to undo the scaling.
mms = MinMaxScaler()
y = mms.fit_transform(train["target"].values.reshape(-1, 1)).ravel()

In [None]:
# y = np.log(train["target"].values+0.00001)

In [None]:
# Summary statistics of the min-max scaled target produced by the scaler above.
pd.Series(y).describe()

In [None]:
from sklearn.model_selection import train_test_split

# Categorical columns that are fed to the embedding layers.
embed_cols = ["feature_1","feature_2","feature_3"]

# Hold out 20% of the training rows for validation.
X_train, X_valid, y_train, y_valid = train_test_split(train[embed_cols], y, test_size=.2)

# NOTE(review): `test` is rebound from a DataFrame to a list of encoded arrays,
# so this cell cannot be re-run without reloading the data first.
X_train, X_valid, test = preproc(X_train, X_valid, test[embed_cols])

In [None]:
# Print the value distribution and the number of distinct values per feature.
for feature_name in ("feature_1", "feature_2", "feature_3"):
    feature_values = train[feature_name]
    print(feature_values.value_counts())
    print("NUNIQUE: ", feature_values.nunique())

In [None]:
from keras import backend as K
def root_mean_squared_error(y_true, y_pred):
    """Root mean squared error over the whole batch, for use as a Keras loss.

    The mean is taken over every element before the square root. Reducing only
    over the last axis (as the previous version did) collapses a length-1
    output axis, which turns the expression into ``sqrt(err ** 2) = |err|`` per
    sample, so the reported "RMSE" was actually the mean absolute error.

    Parameters
    ----------
    y_true : tensor
        Ground-truth targets.
    y_pred : tensor
        Model predictions, broadcastable against ``y_true``.

    Returns
    -------
    tensor
        Scalar tensor holding the batch RMSE.
    """
    return K.sqrt(K.mean(K.square(y_pred - y_true)))

def build_embedding_network(embedding_specs=((5, 3), (3, 2), (2, 2))):
    """Build an (uncompiled) entity-embedding regression network.

    One branch is created per categorical input: a single-value ``Input``, an
    ``Embedding`` layer and a ``Reshape`` that flattens the embedding to a
    vector. The branch outputs are concatenated and passed through
    Dense(80, relu) -> Dropout(0.05) -> Dense(20, relu) -> Dropout(0.05) ->
    Dense(1, linear).

    Parameters
    ----------
    embedding_specs : sequence of (int, int), optional
        One ``(n_categories, embedding_dim)`` pair per categorical input, in
        the order the inputs will be supplied to the model. The default
        reproduces the original hard-coded network with three inputs.
        NOTE(review): the default sizes presumably match the unique-value
        counts of feature_1, feature_2 and feature_3 -- confirm against data.

    Returns
    -------
    keras.models.Model
        Model taking ``len(embedding_specs)`` inputs and producing one output.

    Raises
    ------
    ValueError
        If ``embedding_specs`` is empty.
    """
    embedding_specs = list(embedding_specs)
    if not embedding_specs:
        raise ValueError("embedding_specs must contain at least one (n_categories, dim) pair")

    inputs = []
    embeddings = []

    # Layers are created branch by branch, in the same order as before, so
    # automatically generated layer names are unchanged for the default specs.
    for n_categories, embedding_dim in embedding_specs:
        categorical_input = Input(shape=(1,))
        embedding = Embedding(n_categories, embedding_dim, input_length=1)(categorical_input)
        embedding = Reshape(target_shape=(embedding_dim,))(embedding)
        inputs.append(categorical_input)
        embeddings.append(embedding)

    # Concatenate needs at least two tensors; a single branch is used directly.
    if len(embeddings) == 1:
        x = embeddings[0]
    else:
        x = Concatenate()(embeddings)

    x = Dense(80, activation='relu')(x)
    x = Dropout(.05)(x)
    x = Dense(20, activation='relu')(x)
    x = Dropout(.05)(x)
    output = Dense(1, activation= 'linear')(x)

    model = Model(inputs, output)

    return model

In [None]:
# Compile

# Keyword arguments forwarded to NN.fit below.
NN_Params = {"batch_size":4000,
              "verbose":1,
              "epochs":10}

opt = optimizers.Adam(lr= 0.0001)

NN = build_embedding_network()
NN.compile(loss=root_mean_squared_error, optimizer=opt)

# Early stopping: the target was min-max scaled, so the loss lives on a scale of
# roughly 0..1. The previous min_delta of 0.1 was larger than any realistic
# epoch-to-epoch improvement, so no epoch after the first counted as progress
# and training always stopped after `patience` further epochs. Use a threshold
# that matches the scale of the loss instead.
callbacks_list=[EarlyStopping(monitor="val_loss", min_delta=1e-4, patience=3, mode='auto')]
hist = NN.fit(X_train, y_train,
                      validation_data=(X_valid, y_valid),
                      callbacks=callbacks_list,
                      **NN_Params)

# Model Evaluation
# Report the epoch with the lowest validation loss (epochs are shown 1-based).
best = np.argmin(hist.history["val_loss"])
print("Optimal Epoch: ",best+1)
print("Train Score: {}, Validation Score: {}".format(hist.history["loss"][best],hist.history["val_loss"][best]))

# Plot the training and validation loss curves and save the figure to disk.
plt.plot(hist.history['loss'], label='train')
plt.plot(hist.history['val_loss'], label='validation')
plt.xlabel("Epochs")
plt.ylabel("Root Mean Square Error")
plt.title("Train and Validation Error")
plt.legend()
plt.savefig("Train and Validation MSE Progression.png")
plt.show()

In [None]:
# Predict on the encoded test inputs; keep the single output column as a 1-D array.
pred = NN.predict(test)
pred1 = np.array([x[0] for x in pred])

# The network was trained on the min-max scaled target, so `pred1` is in scaled
# units. Map it back to the original target scale before writing the file;
# writing `pred1` directly produced a submission in the wrong units.
pred_original_scale = mms.inverse_transform(pred1.reshape(-1, 1))[:, 0]

sub_df = pd.DataFrame({"card_id":testdex.values})
sub_df["target"] = pred_original_scale
sub_df.to_csv("submit.csv", index=False)

In [None]:
# Summary statistics of the raw (unscaled) training target.
train.target.describe()

In [None]:
# Undo the min-max scaling so the predictions are back on the original target
# scale, flatten to 1-D, and display their summary statistics.
pred2 = mms.inverse_transform(pred1.reshape(-1, 1)).ravel()
pd.Series(pred2).describe()

In [None]:
# Assemble the submission (card id + prediction on the original target scale)
# and write it out without the index column.
sub_df = pd.DataFrame({"card_id": testdex.values, "target": pred2})
sub_df.to_csv("submit.csv", index=False)