In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
# Global plot styling applied to every figure drawn after this call.
sns.set(context="notebook", palette="Spectral", style = 'darkgrid', font_scale = 1.5, color_codes=True)
import warnings
# NOTE(review): this hides *every* warning for the rest of the session,
# including deprecation notices from pandas/Keras — consider narrowing the filter.
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
# NOTE(review): not referenced by any cell visible in this notebook — confirm
# whether it is still meant to cap the number of CSV rows read.
n_rows_to_load = 1000000

# Display up to 50 columns when a wide DataFrame is rendered.
pd.set_option('display.max_columns', 50)

### Load train + test dataset

In [None]:
# Load every CSV the notebook uses and build the per-transaction training frame.
#
# Later cells reference `merchants` and `main_df`, so both must exist after a
# fresh "Restart & Run All"; previously they were only present as commented-out
# code. The transaction/train/test frames keep their default integer index and
# carry `card_id` as an ordinary column (later cells read it as a column), so
# the joins below match on key columns rather than on the index.
# NOTE(review): machine-specific absolute path — point DATA_DIR at your copy.
DATA_DIR = "/home/nikolaevra/datasets/elo"
# Fixed seed so the shuffle (and therefore the positional train/validation
# split performed further down) is reproducible between runs.
SHUFFLE_SEED = 42

hist_transactions = pd.read_csv(DATA_DIR + "/historical_transactions.csv")
merchants = pd.read_csv(DATA_DIR + "/merchants.csv", index_col='merchant_id')
train_df = pd.read_csv(DATA_DIR + "/train.csv")
test_df_orig = pd.read_csv(DATA_DIR + "/test.csv")

# First inner join: historical transactions with the train frame on card_id.
main_hist = pd.merge(
    hist_transactions,
    train_df,
    how='inner',
    on='card_id',
    suffixes=('_transac', '_train')
)

# Second inner join: attach merchant attributes through merchant_id
# (`join(on=...)` matches the caller's column against `merchants`' index).
# The two key columns are identifiers, not features, so they are dropped
# before de-duplication — otherwise they would be picked up as "continuous"
# columns by the preprocessing cells.
main_df = main_hist.join(
    merchants,
    on='merchant_id',
    how='inner',
    rsuffix='_merchant'
).drop(columns=['card_id', 'merchant_id']).drop_duplicates().reset_index(drop=True)

# Shuffle the data.
main_df = main_df.sample(frac=1, random_state=SHUFFLE_SEED)

# Parse the datetime string into datetime object.
main_df['purchase_date'] = pd.to_datetime(main_df['purchase_date'])

# Derive day-of-month and month columns from the parsed purchase date.
main_df['purchase_date_day'] = main_df['purchase_date'].dt.day
main_df['purchase_date_month'] = main_df['purchase_date'].dt.month

### Preprocess data

In [None]:
# Columns removed from the frame before modelling (the same list is reused
# for the test frame further down).
drop_cols = [
    'city_id_merchant',
    'state_id_merchant',
    'city_id',
    'category_2_merchant',
    'category_1_merchant',
    'purchase_date',
    'merchant_category_id_merchant',
    'subsector_id_merchant',
    'merchant_group_id',
    'merchant_category_id',
    'first_active_month',
    'avg_sales_lag3',
    'avg_purchases_lag3',
    'active_months_lag3',
    'avg_sales_lag6',
    'avg_purchases_lag6',
    'active_months_lag6',
    'avg_sales_lag12',
    'active_months_lag12',
]

main_df = main_df.drop(drop_cols, axis=1)

In [None]:
# Replace the gaps in every column that contains at least one NaN with the
# sentinel value -1.
missing_mask = main_df.isna().any()
cols_missing_data = list(missing_mask[missing_mask].index)

for col in cols_missing_data:
    main_df[col] = main_df[col].fillna(-1)

In [None]:
# Columns treated as categorical: each one is one-hot encoded later and then
# removed from the frame. Order matters — it fixes the column order of the
# encoded feature matrix.
categorical_cols = (
    ['category_1', 'category_2', 'category_3', 'category_4']
    + ['authorized_flag', 'state_id', 'subsector_id']
    + ['most_recent_sales_range', 'most_recent_purchases_range']
    + ['purchase_date_day', 'purchase_date_month']
    + ['feature_1', 'feature_2', 'feature_3']
)

In [None]:
# Everything that is neither categorical nor the regression target is handled
# as a continuous feature.
non_continuous = set(categorical_cols) | {'target'}
continuous_cols = [col for col in main_df.columns if col not in non_continuous]
continuous_cols

In [None]:
# Record the distinct values seen in each categorical column; these lists are
# reused as the fixed category sets for both the training and the test frame.
unique_vals = {
    cat: list(main_df[cat].unique())
    for cat in categorical_cols
}

In [None]:
# Pin every categorical column to the category list recorded in `unique_vals`,
# so one-hot encoding always yields the same columns in the same order.
# `Series.astype('category', categories=...)` is no longer accepted by pandas;
# the supported spelling is an explicit CategoricalDtype.
for cat in categorical_cols:
    # CategoricalDtype rejects null categories, so guard against a stray NaN.
    known_values = [val for val in unique_vals[cat] if pd.notna(val)]
    main_df[cat] = main_df[cat].astype(
        pd.api.types.CategoricalDtype(categories=known_values)
    )

In [None]:
# One-hot encode every categorical column and lay the indicator blocks side by
# side, in `categorical_cols` order. The blocks are collected in a list and
# concatenated once, instead of re-copying the growing array per column and
# special-casing the first column.
encoded_blocks = []
encoded_width = 0
for cat_col in categorical_cols:
    dummies = pd.get_dummies(main_df[cat_col]).values
    encoded_blocks.append(dummies)
    encoded_width += dummies.shape[1]
    # Report the running shape of the encoded matrix after this column.
    print('added feature:', cat_col, '->', (dummies.shape[0], encoded_width))

raw_cat = np.concatenate(encoded_blocks, axis=1)

# Drop all of the categorical columns because we have
# replaced them with one hot encoded ones.
clean_oh_df = main_df.drop(columns = categorical_cols)

# Convert target column into its own numpy array.
raw_Y = clean_oh_df['target'].values

# Remove target column, so that we can use the rest
# as input columns.
clean_oh_df = clean_oh_df.drop(columns=['target'])

print('Non-categorical data:')
clean_oh_df.head()

In [None]:
from sklearn import preprocessing

# MinMaxScaler rescales each feature (column) independently, so a single
# scaler fitted on the whole continuous block produces the same values as one
# scaler per column — without re-copying the growing array on every iteration,
# and leaving a fitted scaler for all columns in `min_max_scaler`.
min_max_scaler = preprocessing.MinMaxScaler()
raw_norm_cont = min_max_scaler.fit_transform(clean_oh_df[continuous_cols].values)

# Final design matrix: one-hot block first, scaled continuous block second.
raw_X = np.concatenate((raw_cat, raw_norm_cont), axis=1)
print("Final Shape:", raw_X.shape)

In [None]:
# Peek at the first assembled feature row (one-hot block followed by the scaled continuous block).
raw_X[0]

In [None]:
# Spot-check a single target value (the row at position 1).
raw_Y[1]

In [None]:
# Sanity check: the feature matrix and the target vector should have the same number of rows.
print("X:", raw_X.shape, "Y:", raw_Y.shape)

In [None]:
# Positional hold-out: the leading `split_ratio` share of the rows is used for
# fitting, the remaining rows for validation/testing.
split_ratio = 0.9
split_point = int(raw_X.shape[0] * split_ratio)

print('Splitting at:', split_point)

train_X, test_X = raw_X[:split_point, :], raw_X[split_point:, :]
train_Y, test_Y = raw_Y[:split_point], raw_Y[split_point:]

for label, part in (
    ("train_X", train_X),
    ("train_Y", train_Y),
    ("test_X", test_X),
    ("test_Y", test_Y),
):
    print(label, part.shape)

## Building the network

In [None]:
# NOTE(review): these imports sit mid-notebook; consider moving them to the
# import cell at the top so all dependencies are visible in one place.
from keras import backend as K
from keras import models
from keras import layers

import livelossplot
# Callback instance handed to `model.fit(callbacks=[...])` in the training cell.
plot_losses = livelossplot.PlotLossesKeras()

In [None]:
# Network definition: one 256-unit ReLU hidden layer feeding a single linear
# output unit (scalar regression). Input width follows the feature matrix.
n_features = train_X.shape[1]
model = models.Sequential([
    layers.Dense(256, activation='relu', input_shape=(n_features, )),
    # layers.Dropout(0.25),
    layers.Dense(1, activation='linear'),
])

model.summary()

In [None]:
def root_mean_squared_error(y_true, y_pred):
    """Root mean squared error over the whole batch, as a Keras loss/metric.

    Args:
        y_true: backend tensor of ground-truth targets.
        y_pred: backend tensor of model outputs, same shape as ``y_true``.

    Returns:
        A scalar backend tensor: sqrt(mean((y_pred - y_true) ** 2)).
    """
    # Average over *all* elements before taking the root. Reducing only over
    # the last axis (size 1 for this single-output model) would turn the
    # expression into per-sample |error|, i.e. mean absolute error, not RMSE.
    return K.sqrt(K.mean(K.square(y_pred - y_true)))

In [None]:
# Configure training: RMSprop optimiser minimising mean squared error, with
# MSE also reported as a metric.
model.compile(loss='mse', optimizer='rmsprop', metrics=['mse'])

# Fit on the training split; the held-out split doubles as validation data
# and `plot_losses` is invoked as a callback during training.
fit_options = dict(
    batch_size=256,
    epochs=30,
    callbacks=[plot_losses],
    verbose=1,
    validation_data=(test_X, test_Y),
)
model.fit(train_X, train_Y, **fit_options)

# Evaluate on the held-out split and report the result.
score = model.evaluate(test_X, test_Y, verbose=0)
print('Test loss:', score)

## Import test data

In [None]:
# (rows, columns) of the test CSV as loaded.
test_df_orig.shape

In [None]:
# (rows, columns) of the historical transactions CSV as loaded.
hist_transactions.shape

In [None]:
# Attach each test card's historical transactions by matching on the
# `card_id` column. Both frames are loaded with the default integer index, so
# merging on the index would pair rows by position rather than by card.
# A left join keeps every test card, including cards without any history.
test_hist = pd.merge(
    test_df_orig, # left
    hist_transactions, # right
    how='left',
    on='card_id'
)

In [None]:
# (rows, columns) after joining the test cards with their transactions.
test_hist.shape

In [None]:
# Attach merchant attributes to every test transaction through `merchant_id`.
# `DataFrame.join(on=...)` matches the caller's column against the *index* of
# the other frame, so make sure `merchants` is keyed by merchant_id whichever
# way it was loaded.
merchants_by_id = (
    merchants
    if merchants.index.name == 'merchant_id'
    else merchants.set_index('merchant_id')
)

# A left join keeps every test transaction even when its merchant is unknown
# (the missing merchant fields are filled later); a right join would instead
# add rows for merchants that never appear in the test data.
test_df = test_hist.join(
    merchants_by_id,
    on='merchant_id',
    how='left',
    rsuffix='_merchant'
).drop_duplicates().reset_index(drop=True)

# Parse the datetime string into datetime object.
test_df['purchase_date'] = pd.to_datetime(test_df['purchase_date'])

# Derive day-of-month and month columns from the parsed purchase date.
test_df['purchase_date_day'] = test_df['purchase_date'].dt.day
test_df['purchase_date_month'] = test_df['purchase_date'].dt.month

In [None]:
# (rows, columns) of the fully joined test frame.
test_df.shape

In [None]:
# List the joined column names before any of them are dropped.
test_df.columns

In [None]:
# Remove the same `drop_cols` columns that were removed from the training frame.
# NOTE(review): re-running this cell raises KeyError because the columns are already gone.
test_df = test_df.drop(columns=drop_cols)

In [None]:
# Replace the gaps in every test column that contains at least one NaN with
# the sentinel value -1 (same treatment as the training frame).
missing_mask = test_df.isna().any()
cols_missing_data = list(missing_mask[missing_mask].index)

for col in cols_missing_data:
    test_df[col] = test_df[col].fillna(-1)

In [None]:
# Cast the test columns to the category lists recorded in `unique_vals`, so
# one-hot encoding of the test frame follows the same column layout; values
# not present in those lists become missing.
# `Series.astype('category', categories=...)` is no longer accepted by pandas;
# the supported spelling is an explicit CategoricalDtype.
for cat in categorical_cols:
    # CategoricalDtype rejects null categories, so guard against a stray NaN.
    known_values = [val for val in unique_vals[cat] if pd.notna(val)]
    test_df[cat] = test_df[cat].astype(
        pd.api.types.CategoricalDtype(categories=known_values)
    )

In [None]:
# Check dtypes and non-null counts after the categorical cast.
test_df.info()

In [None]:
# One-hot encode every categorical test column and lay the indicator blocks
# side by side, in `categorical_cols` order. The blocks are collected in a
# list and concatenated once, instead of re-copying the growing array per
# column and special-casing the first column.
encoded_blocks = []
encoded_width = 0
for cat_col in categorical_cols:
    dummies = pd.get_dummies(test_df[cat_col]).values
    encoded_blocks.append(dummies)
    encoded_width += dummies.shape[1]
    # Report the running shape of the encoded matrix after this column.
    print('added feature:', cat_col, '->', (dummies.shape[0], encoded_width))

raw_cat = np.concatenate(encoded_blocks, axis=1)

In [None]:
# Column names still present on the test frame at this point.
test_df.columns

In [None]:
# Drop all of the categorical columns; they are represented by the
# one hot encoded array built above.
clean_cnt_df = test_df.drop(columns=categorical_cols)

# Keep the card_id of every row as its own numpy array so that each
# prediction can be labelled with its card later on.
card_ids = clean_cnt_df['card_id'].values

# Remove the card_id column: it is an identifier, not a model input.
clean_cnt_df = clean_cnt_df.drop(columns=['card_id'])

print('Non-categorical data:')
clean_cnt_df.head()

In [None]:
from sklearn import preprocessing

# Scale the test features with the min/max learned from the *training* frame.
# Fitting a fresh scaler on the test data would map the same raw value to a
# different number than the model saw during training. The scaler is fitted
# here on `clean_oh_df` (the training continuous columns) so this cell does
# not rely on a scaler object left over from an earlier cell. Test values
# outside the training range simply fall outside [0, 1].
min_max_scaler = preprocessing.MinMaxScaler()
min_max_scaler.fit(clean_oh_df[continuous_cols].values)
raw_norm_cont = min_max_scaler.transform(clean_cnt_df[continuous_cols].values)

# Final test design matrix: one-hot block first, scaled continuous block second.
raw_X = np.concatenate((raw_cat, raw_norm_cont), axis=1)
print("Final Shape:", raw_X.shape)

In [None]:
# Peek at the first assembled test feature row.
raw_X[0]

In [None]:
# NOTE(review): `predictions` is first assigned by `model.predict(...)` in a
# later cell, so on a fresh top-to-bottom run this cell raises NameError.
predictions.shape

In [None]:
# Number of card ids collected; should match the number of rows in `raw_X`.
card_ids.shape

In [None]:
# Score every test row, then pair each prediction with its card id.
predictions = model.predict(raw_X)
# `card_ids` is one-dimensional while `predictions` is a 2-D column, so the
# ids are reshaped into a column before concatenating along axis 1
# (concatenating the raw 1-D array raises a dimension-mismatch error).
answer = np.concatenate((card_ids.reshape(-1, 1), predictions), axis=1)

In [None]:
# NOTE(review): this renders the whole array; consider `answer[:5]` to keep the output small.
answer

In [None]:
# Build the output table straight from the card ids and the model output.
# The previous version referenced `DataFrame` and `data`, neither of which is
# defined in this notebook (pandas is imported as `pd`).
# NOTE(review): there is one row per scored transaction, so a card_id can
# appear several times, and the CSV also carries the frame's index column —
# confirm both are acceptable for the intended consumer of this file.
df = pd.DataFrame({'card_id': card_ids, 'target': np.ravel(predictions)})
df.to_csv("predictions.csv", sep=',')

In [None]:
# Small hand-written frame "A" (subject ids 1-5), built from row tuples.
column_names = ['subject_id', 'first_name', 'last_name']
people = [
    ('1', 'Alex', 'Anderson'),
    ('2', 'Amy', 'Ackerman'),
    ('3', 'Allen', 'Ali'),
    ('4', 'Alice', 'Aoni'),
    ('5', 'Ayoung', 'Atiches'),
]
# Pivot the rows into the column-oriented dict the frame is created from.
raw_data = {
    name: [person[position] for person in people]
    for position, name in enumerate(column_names)
}
df_a = pd.DataFrame(raw_data, columns = column_names)
df_a

In [None]:
# Small hand-written frame "B" (subject ids 4-8), built from row tuples.
column_names = ['subject_id', 'first_name', 'last_name']
people = [
    ('4', 'Billy', 'Bonder'),
    ('5', 'Brian', 'Black'),
    ('6', 'Bran', 'Balwner'),
    ('7', 'Bryce', 'Brice'),
    ('8', 'Betty', 'Btisan'),
]
# Pivot the rows into the column-oriented dict the frame is created from.
raw_data = {
    name: [person[position] for person in people]
    for position, name in enumerate(column_names)
}
df_b = pd.DataFrame(raw_data, columns = column_names)
df_b

In [None]:
# Per column: True only if *every* value in that column of the test frame is missing.
test_df_orig.isna().all()

In [None]:
# Move the current index of both frames back into an ordinary column.
# NOTE(review): reset_index() without drop=True inserts the old index as a
# new column, so on a frame that still has the default integer index this
# adds an extra `index` column — confirm that is intended. The cell also
# rebinds the frames used by earlier cells, so it is not safe to re-run blindly.
test_df_orig = test_df_orig.reset_index()
hist_transactions = hist_transactions.reset_index()

In [None]:
# Number of distinct card ids in the test frame.
len(test_df_orig['card_id'].unique())

In [None]:
# Key both frames by card_id so the index-based join in the following cells matches on it.
# NOTE(review): after this cell `card_id` is the index, no longer a column, in both frames.
test_df_orig = test_df_orig.set_index('card_id')
hist_transactions = hist_transactions.set_index('card_id')

In [None]:
# (rows, columns) of the transactions frame once it is indexed by card_id.
hist_transactions.shape

In [None]:
# Index-on-index left join: every transaction row is kept and the test-frame
# columns are attached where the index value (card_id, set in the cell above) matches.
df_new = hist_transactions.join(test_df_orig, how='left')
df_new.shape

In [None]:
# Distinct card ids in the test frame (index temporarily turned back into a column).
len(test_df_orig.reset_index()['card_id'].unique())

In [None]:
# Distinct card ids in the joined frame, for comparison with the count above.
len(df_new.reset_index()['card_id'].unique())

In [None]:
# (rows, columns) of the joined frame.
df_new.shape