# Elo Merchant

## Introduction
This notebook represents Sean Ng's submission to elo merchant. 

I got some feature engineering ideas from:

https://www.kaggle.com/denzo123/a-closer-look-at-date-variables


## Helper functions to manage memory

In [121]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [122]:
#To reset all variables
def my_reset(*varnames):
    """
    varnames are what you want to keep
    """
    globals_ = globals()
    to_save = {v: globals_[v] for v in varnames}
    to_save['my_reset'] = my_reset  # lets keep this function by default
    del globals_
    get_ipython().magic("reset")
    globals().update(to_save)

In [123]:
def find_variables():
    import sys

    # These are the usual ipython objects, including this one you are creating
    ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']

    # Get a sorted list of the objects and their sizes
    sorted([(x, sys.getsizeof(globals().get(x))) for x in dir() if not x.startswith('_') and x not in sys.modules and x not in ipython_vars], key=lambda x: x[1], reverse=True)

In [124]:
## Importing data

In [125]:
#Declaring imports
import csv
import os
import pandas as pd

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, ExtraTreesClassifier)
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import xgboost as xgd


In [126]:
EXPLORE = False

In [127]:
#Files
train_data_path = '../input/train.csv'
test_data_path = '../input/test.csv'
history_path = '../input/historical_transactions.csv'
merchant_path = '../input/merchants.csv'
new_transactions_path = '../input/new_merchant_transactions.csv'


In [128]:
train_df = reduce_mem_usage(pd.read_csv(train_data_path, parse_dates=["first_active_month"]))
test_df = reduce_mem_usage(pd.read_csv(test_data_path, parse_dates=["first_active_month"]))
n_train = train_df.shape[0]


Mem. usage decreased to  4.04 Mb (56.2% reduction)
Mem. usage decreased to  2.24 Mb (52.5% reduction)


In [129]:
merchants = pd.read_csv(merchant_path)

In [130]:
new_transactions_df = reduce_mem_usage(
    pd.read_csv(new_transactions_path, parse_dates=["purchase_date"])
    .append(
        pd.read_csv(history_path, parse_dates=["purchase_date"]),
        ignore_index=True
    )
)

Mem. usage decreased to 1867.06 Mb (43.7% reduction)


In [131]:
all_data = train_df.append(test_df, ignore_index=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


## Simple exploration

In [132]:
all_data.describe()

Unnamed: 0,feature_1,feature_2,feature_3,target
count,325540.0,325540.0,325540.0,201917.0
mean,3.10681,1.744038,0.565116,
std,1.18728,0.75054,0.495742,0.0
min,1.0,1.0,0.0,-33.21875
25%,2.0,1.0,0.0,-0.883301
50%,3.0,2.0,1.0,-0.023438
75%,4.0,2.0,1.0,0.765625
max,5.0,3.0,1.0,17.96875


In [133]:
if EXPLORE:
    all_transactions.describe()

In [134]:
print("All data shape:" + str(all_data.shape))
print("-----------------------------------------------------")
print("Train data shape:" + str(train_df.shape))
print("Test data shape:" + str(test_df.shape))
print("=====================================================")
print("All transactions data shape:" + str(all_transactions.shape))
print("-----------------------------------------------------")
print("Old transactions data shape:" + str(history_df.shape))
print("New transactions data shape:" + str(new_transactions_df.shape))

All data shape:(325540, 6)
-----------------------------------------------------
Train data shape:(201917, 6)
Test data shape:(123623, 5)
All transactions data shape:(31075392, 20)
-----------------------------------------------------
Old transactions data shape:(29112361, 14)
New transactions data shape:(31075392, 14)


In [135]:
def get_df_name(df):
    for x in globals():
        if globals()[x] is df:
            return x

In [136]:
#Finding number of null values
def count_nulls(df_list):
    result = {}
    for df in df_list:
        vals = {}
        for key in df.keys():
            count = df[key].isnull().sum()
            if count > 0:
                vals[key] = count
        name = get_df_name(df)
        result[name] = vals
    return pd.DataFrame.from_dict(result, dtype=int)
count_nulls([all_data, all_transactions, merchants])


Unnamed: 0,all_data,all_transactions,merchants
avg_sales_lag12,,,13.0
avg_sales_lag3,,,13.0
avg_sales_lag6,,,13.0
category_2,,,11887.0
first_active_month,1.0,,
merchant_id,,164697.0,
target,123623.0,,


# Handling Nulls

1. Viewing the distribution of non-null values

In [137]:
#putting each col into a bucket
def classify_categories(df, cols):
    discrete = []
    continuous = []
    for col in cols:
        
        length = len(df[col].unique())
        if length <= 25 :
            discrete.append(col)
        else:
            continuous.append(col)
    return discrete, continuous


In [138]:
def plot_na(df, cols):
    discrete, continuous = classify_categories(df, cols)
    all_count = len(continuous) + len(discrete)
    #Distributions of continuous dVata
    fig, axes = plt.subplots(nrows=all_count, ncols=1, figsize=(3,5*all_count))
    if all_count == 1:
        axes = [axes]
    for i, col_name in enumerate(discrete):
        df[col_name].value_counts().plot(kind='bar', ax=axes[i], title = col_name)
    for i, col_name in enumerate(continuous):
        df[col_name].plot(kind='density', ax=axes[i+len(discrete)], title = col_name)
    


In [139]:
if EXPLORE:
    plot_na(all_transactions, ["category_2", "category_3"])

In [140]:
if EXPLORE:
    plot_na(merchants, ['avg_sales_lag12','avg_sales_lag3','avg_sales_lag6','category_2'])

In [141]:
if EXPLORE:
    #Plot date to get a rough idea
    all_data.set_index(['first_active_month']).groupby('first_active_month').card_id.count().plot()

In [142]:
if EXPLORE:
    merchants.avg_sales_lag3[merchants.avg_sales_lag3!=np.nan].sort_values(ascending=False)[500:].plot()

# Feature Engineering

## Converting date to corresponding date difference

In [143]:

from datetime import datetime
from dateutil import relativedelta
def convert_date_to_timediff(df, cols):
    today = datetime.today()
    for col in cols:
        diff = df[col].dropna().map(lambda x: relativedelta.relativedelta(today,x))
        diff = diff.map(lambda x: x.years * 12 + x.months).astype('int64')
        df[col] = diff
    return df 

In [144]:
#all_data = convert_date_to_timediff(all_data, ["first_active_month"])

In [145]:
#all_transactions = convert_date_to_timediff(all_transactions, ["purchase_date"])


## Making the day of week relevant

In [149]:
%load_ext Cython

In [150]:
%%cython
def f(x):
    return x.weekday()

In [None]:
all_transactions["day_of_week"] = all_transactions["purchase_date"].apply(f)


Index(['authorized_flag', 'card_id', 'city_id', 'category_1', 'installments',
       'merchant_category_id', 'merchant_id', 'month_lag', 'purchase_amount',
       'purchase_date', 'state_id', 'subsector_id', 'category_2_1.0',
       'category_2_2.0', 'category_2_3.0', 'category_2_4.0', 'category_2_5.0',
       'category_3_A', 'category_3_B', 'category_3_C', 'day_of_week'],
      dtype='object')

## LabelEncode/ Hot Encode the necessary values


In [27]:
from sklearn.preprocessing import LabelEncoder
def LabelEncodeCols(df, cols):
    new_df = df.copy()
    for col in cols:
        lbl = LabelEncoder()
        new_df[col] = lbl.fit_transform(new_df[col])
    return new_df



In [153]:
all_transactions = LabelEncodeCols(all_transactions, ["authorized_flag", "category_1"])

In [156]:
all_transactions = pd.get_dummies(all_transactions, columns=["category_2", "category_3", "day_of_week"])

In [159]:
all_transactions.keys()


Index(['authorized_flag', 'card_id', 'city_id', 'category_1', 'installments',
       'merchant_category_id', 'merchant_id', 'month_lag', 'purchase_amount',
       'purchase_date', 'state_id', 'subsector_id', 'category_2_1.0',
       'category_2_2.0', 'category_2_3.0', 'category_2_4.0', 'category_2_5.0',
       'category_3_A', 'category_3_B', 'category_3_C', 'day_of_week_0',
       'day_of_week_1', 'day_of_week_2', 'day_of_week_3', 'day_of_week_4',
       'day_of_week_5', 'day_of_week_6'],
      dtype='object')

In [160]:
#TODO: Consider if I should flag out data that is not authoized (authorized flag)
def aggregate_transactions(history):
    
    agg = {
        "authorized_flag":["mean"],
        "category_1":["mean"],
        "category_2_1.0":["mean"], 
        "category_2_2.0":["mean"],
        "category_2_3.0":["mean"],
        "category_2_4.0":["mean"],
        "category_2_5.0":["mean"],
        "category_3_A":["mean"],
        "category_3_B":["mean"],
        "category_3_C":["mean"],
        "city_id":['nunique'],
        "merchant_id":['nunique'],
        "state_id":['nunique'],
        "subsector_id":["nunique"],
        "purchase_amount":["mean", "std", np.ptp],
        "day_of_week_0":["mean"],
        "day_of_week_1":["mean"],
        "day_of_week_2":["mean"],
        "day_of_week_3":["mean"],
        "day_of_week_4":["mean"],
        "day_of_week_5":["mean"],
        "day_of_week_6":["mean"]
    }
    
    new_df = history.groupby("card_id").agg(agg)
    #Replace columns
    new_df.columns = [' '.join(col).strip() for col in new_df.columns.values]
    return new_df
    

In [161]:
aggregate_transactions = aggregate_transactions(all_transactions)

In [162]:
all_data = pd.merge(all_data, aggregate_transactions, on='card_id', how='left')

In [34]:
#Consider adding other columns. i.e. grouping the values across time (!!!!!)

In [35]:
#tuning hyperparameters

In [36]:
#Features that others added
#Handling month_lag?

1. Month difference between today and purchase date (might be more significant than just date(?). I need to be able to convert to something significant anyway
2. Aggregation based statistics


In [163]:
months = all_data['first_active_month']
del all_data['first_active_month']

# Training Phase

In [164]:
all_data = reduce_mem_usage(all_data)

Mem. usage decreased to 22.35 Mb (66.2% reduction)


In [165]:
if "target" in all_data.keys():
    target = all_data["target"][:n_train]
to_be_deleted = ["target", "card_id"]
for col in to_be_deleted:
    if col in all_data.keys():
        del all_data[col]
train = all_data[:n_train]
test = all_data[n_train:]

In [166]:
# Selecting the right features
param = {'num_leaves': 111,
         'min_data_in_leaf': 149, 
         'objective':'regression',
         'max_depth': 9,
         'learning_rate': 0.005,
         "boosting": "gbdt",
         "feature_fraction": 0.7522,
         "bagging_freq": 1,
         "bagging_fraction": 0.7083 ,
         "bagging_seed": 11,
         "metric": 'rmse',
         "lambda_l1": 0.2634,
         "random_state": 133,
         "verbosity": -1}

In [None]:
if EXPLORE:
    plt.figure(figsize=(14,25))
    plt.title('LightGBM feature importances')
    sns.barplot(x="importance", y="feature", data=feature_importances)

In [None]:
sub_df = pd.DataFrame({"card_id":test_df.card_id.values})
sub_df["target"] = predictions
sub_df.to_csv("submit.csv", index=False)

In [None]:
if EXPLORE:
    corrmat = all_data.corr()
    plt.subplots(figsize=(12,9))
    sns.heatmap(corrmat, vmax=0.9, square=True)

In [84]:
n_folds=5
def rmsle_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(train.values)
    rmse= np.sqrt(-cross_val_score(model, train.values, target, scoring="neg_mean_squared_error", cv = kf, verbose=1))
    return(rmse)

In [85]:
from sklearn.linear_model import ElasticNet, Lasso, Ridge, BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb


In [184]:
lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.005, random_state=1, tol=0.0000001))

ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.5, l1_ratio=.9, random_state=3))
ridge = make_pipeline(RobustScaler(), Ridge(alpha=10))

In [186]:
score = rmsle_cv(lasso)
print("\nLasso score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))




Lasso score: 3.8408 (0.0334)



[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   12.8s finished


In [169]:
score = rmsle_cv(ENet)
print("ElasticNet score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))



ElasticNet score: 3.8434 (0.0342)



[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   12.7s finished


In [170]:
score = rmsle_cv(ridge)
print("Ridge score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number/precision: 1.6656206307859994e-13 / 5.960464477539063e-08
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number/precision: 4.3172836833305084e-11 / 5.960464477539063e-08
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number/precision: 1.6568339853296338e-13 / 5.960464477539063e-08
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number/precision: 1.6614152816094713e-13 / 5.960464477539063e-08


Ridge score: 3.8399 (0.0332)



Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number/precision: 1.6733459778407733e-13 / 5.960464477539063e-08
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    1.5s finished


In [187]:
hi

NameError: name 'hi' is not defined

#Doesn't seem to work
KRR =  KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
score = rmsle_cv(KRR)
print("Kernel Ridge score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

In [171]:
GBoost = GradientBoostingRegressor(n_estimators=100, learning_rate=0.05,
                                   max_depth=3, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)

score = rmsle_cv(GBoost)
print("Gradient Boosting score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))


Gradient Boosting score: 3.8614 (0.0356)



[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.0min finished


In [172]:
model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             n_estimators=100,
                             random_state =7, nthread = -1)

score = rmsle_cv(model_xgb)
print("Xgboost score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

Xgboost score: 3.8374 (0.0332)



[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.4min finished


In [70]:
print(train.shape)
print(test.shape)

(201917, 20)
(123623, 20)


In [63]:
model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=5,
                              learning_rate=0.05, n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11)
score = rmsle_cv(model_lgb)
print("LGBM score: {:.4f} ({:.4f})\n" .format(score.mean(), score.std()))

[CV]  ................................................................
[CV] ...................... , score=-14.596460013018843, total=   3.5s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.5s remaining:    0.0s


[CV] ...................... , score=-14.641113285954846, total=   3.2s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    6.7s remaining:    0.0s


[CV] ...................... , score=-14.518480013374495, total=   3.2s
[CV]  ................................................................
[CV] ...................... , score=-14.651496713617208, total=   3.3s
[CV]  ................................................................
[CV] ...................... , score=-15.217078129992197, total=   3.2s
LGBM score: 3.8372 (0.0325)



[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   16.5s finished


{'max_depth': 3, 'n_estimators': 100}