In [1]:
import os
import json
import time
import numpy as np

import warnings
warnings.filterwarnings('ignore')

import pandas as pd
from pandas.io.json import json_normalize
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()

%matplotlib inline

import datetime

In [57]:
from sklearn.model_selection import KFold
from sklearn import model_selection, preprocessing, metrics
from sklearn import model_selection
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge, RidgeCV, Lasso
from sklearn.ensemble import RandomForestRegressor

import lightgbm as lgb
import xgboost as xgb
import catboost as cb

pd.options.mode.chained_assignment = None
pd.options.display.max_columns = 999

In [3]:
df_train = pd.read_csv('train.csv', parse_dates=['first_active_month'])
df_test = pd.read_csv('test.csv', parse_dates=['first_active_month'])

In [4]:
df_train["month"] = df_train["first_active_month"].dt.month
df_test["month"] = df_test["first_active_month"].dt.month

df_train["year"] = df_train["first_active_month"].dt.year
df_test["year"] = df_test["first_active_month"].dt.year

df_train['elapsed_time'] = (datetime.date(2018, 2, 1) - df_train['first_active_month'].dt.date).dt.days
df_test['elapsed_time'] = (datetime.date(2018, 2, 1) - df_test['first_active_month'].dt.date).dt.days


In [5]:
df_train = pd.get_dummies(df_train, columns=['feature_1', 'feature_2'])
df_test = pd.get_dummies(df_test, columns=['feature_1', 'feature_2'])

In [6]:
df_hist = pd.read_csv('historical_transactions.csv')
df_hist = pd.get_dummies(df_hist, columns=['category_2', 'category_3'])
df_hist['authorized_flag'] = df_hist['authorized_flag'].map({'Y': 1, 'N': 0})
df_hist['category_1'] = df_hist['category_1'].map({'Y': 1, 'N': 0})

In [7]:
def aggregate_transactions(df, prefix):  
    df.loc[:, 'purchase_date'] = pd.DatetimeIndex(df['purchase_date']).\
                                      astype(np.int64) * 1e-9
    
    agg_func = {
        'authorized_flag': ['sum', 'mean'],
        'category_1': ['mean'],
        'category_2_1.0': ['mean'],
        'category_2_2.0': ['mean'],
        'category_2_3.0': ['mean'],
        'category_2_4.0': ['mean'],
        'category_2_5.0': ['mean'],
        'category_3_A': ['mean'],
        'category_3_B': ['mean'],
        'category_3_C': ['mean'],
        'merchant_id': ['nunique'],
        'purchase_amount': ['sum', 'mean', 'max', 'min', 'std'],
        'installments': ['sum', 'mean', 'max', 'min', 'std'],
        'purchase_date': [np.ptp],
        'month_lag': ['min', 'max']
    }
    agg_df = df.groupby(['card_id']).agg(agg_func)
    agg_df.columns = [prefix + '_'.join(col).strip() 
                           for col in agg_df.columns.values]
    agg_df.reset_index(inplace=True)
    
    df = (df.groupby('card_id')
          .size()
          .reset_index(name='{}transactions_count'.format(prefix)))
    
    agg_df = pd.merge(df, agg_df, on='card_id', how='left')
    
    return agg_df

In [8]:
df_hist = aggregate_transactions(df_hist, prefix='hist_')

df_train = pd.merge(df_train, df_hist, on='card_id',how='left')
df_test = pd.merge(df_test, df_hist, on='card_id',how='left')

print(df_train.shape, df_test.shape)

(201917, 41) (123623, 40)


In [9]:
df_new = pd.read_csv('new_merchant_transactions.csv')

df_new = pd.get_dummies(df_new, columns=['category_2', 'category_3'])
df_new['authorized_flag'] = df_new['authorized_flag'].map({'Y': 1, 'N': 0})
df_new['category_1'] = df_new['category_1'].map({'Y': 1, 'N': 0})

In [10]:
df_new = aggregate_transactions(df_new, prefix='new_')

df_train = pd.merge(df_train, df_new, on='card_id',how='left')
df_test = pd.merge(df_test, df_new, on='card_id',how='left')

print(df_train.shape, df_test.shape)

(201917, 67) (123623, 66)


In [11]:
target = df_train['target']
cols_to_drop = ['card_id', 'first_active_month', 'target']
use_cols = [c for c in df_train.columns if c not in cols_to_drop]
features = list(df_train[use_cols].columns)

## 1. LightGBM

In [12]:
lgb_params = {'num_leaves': 50,
         'min_data_in_leaf': 30, 
         'objective':'regression',
         'max_depth': -1,
         'learning_rate': 0.005,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9,
         "bagging_seed": 11,
         "metric": 'rmse',
         "lambda_l1": 0.1,
         "verbosity": -1}

folds = KFold(n_splits=5, shuffle=True, random_state=15)
oof_lgb = np.zeros(len(df_train))
predictions_lgb = np.zeros(len(df_test))

for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_train.values, target.values)):
    print('-')
    print("Fold {}".format(fold_ + 1))
    trn_data = lgb.Dataset(df_train.iloc[trn_idx][features], label=target.iloc[trn_idx])
    val_data = lgb.Dataset(df_train.iloc[val_idx][features], label=target.iloc[val_idx])

    num_round = 10000
    clf = lgb.train(lgb_params, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=100, early_stopping_rounds=100)
    oof_lgb[val_idx] = clf.predict(df_train.iloc[val_idx][features], num_iteration=clf.best_iteration)
    predictions_lgb += clf.predict(df_test[features], num_iteration=clf.best_iteration) / folds.n_splits

-
Fold 1
Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 3.75147	valid_1's rmse: 3.81823
[200]	training's rmse: 3.69496	valid_1's rmse: 3.78503
[300]	training's rmse: 3.65231	valid_1's rmse: 3.7654
[400]	training's rmse: 3.61679	valid_1's rmse: 3.75126
[500]	training's rmse: 3.58676	valid_1's rmse: 3.74122
[600]	training's rmse: 3.56052	valid_1's rmse: 3.73348
[700]	training's rmse: 3.53924	valid_1's rmse: 3.7287
[800]	training's rmse: 3.51838	valid_1's rmse: 3.72433
[900]	training's rmse: 3.50043	valid_1's rmse: 3.72129
[1000]	training's rmse: 3.48334	valid_1's rmse: 3.71835
[1100]	training's rmse: 3.46742	valid_1's rmse: 3.71616
[1200]	training's rmse: 3.45332	valid_1's rmse: 3.71452
[1300]	training's rmse: 3.4399	valid_1's rmse: 3.71248
[1400]	training's rmse: 3.42768	valid_1's rmse: 3.7113
[1500]	training's rmse: 3.41651	valid_1's rmse: 3.71049
[1600]	training's rmse: 3.40581	valid_1's rmse: 3.70992
[1700]	training's rmse: 3.39496	valid_1's rms

[2200]	training's rmse: 3.35592	valid_1's rmse: 3.63996
Early stopping, best iteration is:
[2107]	training's rmse: 3.36397	valid_1's rmse: 3.63987


In [15]:
validation_score = np.sqrt(mean_squared_error(target, oof_lgb))
validation_score

3.689544862590313

## 2. XGBoost

In [18]:
xgb_params = {'eta': 0.005, 
              'max_depth': 10, 
              'subsample': 0.8, 
              'colsample_bytree': 0.8, 
              'objective': 'reg:linear', 
              'eval_metric': 'rmse', 
              'silent': True}

folds = KFold(n_splits=5, shuffle=True, random_state=15)
oof_xgb = np.zeros(len(df_train))
predictions_xgb = np.zeros(len(df_test))

for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_train.values, target.values)):
    print('-')
    print("Fold {}".format(fold_ + 1))
    trn_data = xgb.DMatrix(data=df_train.iloc[trn_idx][features], label=target.iloc[trn_idx])
    val_data = xgb.DMatrix(data=df_train.iloc[val_idx][features], label=target.iloc[val_idx])
    watchlist = [(trn_data, 'train'), (val_data, 'valid')]
    print("xgb " + str(fold_) + "-" * 50)
    num_round = 10000
    xgb_model = xgb.train(xgb_params, trn_data, num_round, watchlist, early_stopping_rounds=50, verbose_eval=200)
    oof_xgb[val_idx] = xgb_model.predict(xgb.DMatrix(df_train.iloc[val_idx][features]), ntree_limit=xgb_model.best_ntree_limit+50)

    predictions_xgb += xgb_model.predict(xgb.DMatrix(df_test[features]), ntree_limit=xgb_model.best_ntree_limit+50) / folds.n_splits

-
Fold 1
xgb 0--------------------------------------------------
[0]	train-rmse:3.94002	valid-rmse:3.98691
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 50 rounds.
[1000]	train-rmse:2.96408	valid-rmse:3.7128
Stopping. Best iteration:
[1279]	train-rmse:2.87598	valid-rmse:3.71108

-
Fold 2
xgb 1--------------------------------------------------
[0]	train-rmse:3.96009	valid-rmse:3.90614
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 50 rounds.
[1000]	train-rmse:2.94587	valid-rmse:3.6586
Stopping. Best iteration:
[1010]	train-rmse:2.94149	valid-rmse:3.65854

-
Fold 3
xgb 2--------------------------------------------------
[0]	train-rmse:3.96731	valid-rmse:3.87636
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 50 rounds.
[1000]	train

In [48]:
validation_score = np.sqrt(mean_squared_error(target, oof_xgb))
validation_score

3.6937509061097757

## 3. CatBoost

In [45]:
kfolds = KFold(n_splits=5, shuffle=True, random_state=15)
oof_cb = np.zeros(len(df_train))
predictions_cb = np.zeros(len(df_test))

for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_train.values, target.values)):
    print('-')
    print("Fold {}".format(fold_ + 1))
    X_train, y_train = df_train[features].iloc[trn_idx], target.iloc[trn_idx]
    X_valid, y_valid = df_train[features].iloc[val_idx], target.iloc[val_idx]
    
    print("cb " + str(fold_) + "-" * 50)
    
    # CatBoost Regressor estimator
    model = cb.CatBoostRegressor(learning_rate = 0.005,
        iterations = 10000,
        eval_metric = 'RMSE',
        allow_writing_files = False,
        od_type = 'Iter',
        bagging_temperature = 0.2,
        depth = 10,
        od_wait = 20,
        silent = True)
    
            
    # Fit
    model.fit(
        X_train, y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        early_stopping_rounds=50,
        verbose_eval=400
    )
    
    oof_cb[val_idx] = model.predict(X_valid)
    predictions_cb += model.predict(df_test[features]) / kfolds.n_splits                            

-
Fold 1
cb 0--------------------------------------------------
0:	learn: 3.8605329	test: 3.8605329	test1: 3.9041308	best: 3.9041308 (0)	total: 148ms	remaining: 24m 41s
400:	learn: 3.6696745	test: 3.6696745	test1: 3.7545042	best: 3.7545042 (400)	total: 1m 2s	remaining: 24m 58s
800:	learn: 3.6117062	test: 3.6117062	test1: 3.7331421	best: 3.7331421 (800)	total: 2m 3s	remaining: 23m 41s
1200:	learn: 3.5713340	test: 3.5713340	test1: 3.7236637	best: 3.7236613 (1199)	total: 3m 5s	remaining: 22m 39s
1600:	learn: 3.5371728	test: 3.5371728	test1: 3.7189568	best: 3.7189568 (1600)	total: 4m 6s	remaining: 21m 33s
2000:	learn: 3.5064442	test: 3.5064442	test1: 3.7155374	best: 3.7155296 (1999)	total: 5m 17s	remaining: 21m 8s
2400:	learn: 3.4766375	test: 3.4766375	test1: 3.7127417	best: 3.7127417 (2400)	total: 6m 22s	remaining: 20m 10s
2800:	learn: 3.4495197	test: 3.4495197	test1: 3.7112050	best: 3.7111923 (2798)	total: 7m 25s	remaining: 19m 5s
3200:	learn: 3.4241141	test: 3.4241141	test1: 3.7101523	b


Iteration with suspicious time 557 sec ignored in overall statistics.

Iteration with suspicious time 558 sec ignored in overall statistics.


1200:	learn: 3.5814457	test: 3.5814457	test1: 3.6503238	best: 3.6503238 (1200)	total: 3m 4s	remaining: 22m 30s



Iteration with suspicious time 81.5 sec ignored in overall statistics.


1600:	learn: 3.5464226	test: 3.5464226	test1: 3.6471027	best: 3.6471027 (1600)	total: 4m 17s	remaining: 22m 31s
2000:	learn: 3.5149179	test: 3.5149179	test1: 3.6451506	best: 3.6451506 (2000)	total: 5m 13s	remaining: 20m 56s
2400:	learn: 3.4863355	test: 3.4863355	test1: 3.6439584	best: 3.6439318 (2356)	total: 6m 7s	remaining: 19m 26s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 3.643931809
bestIteration = 2356

Shrink model to first 2357 iterations.


In [49]:
validation_score = np.sqrt(mean_squared_error(target, oof_cb))
validation_score

3.692218444327492

In [66]:
folds = KFold(n_splits=5, shuffle=True, random_state=15)
oof_ridge = np.zeros(len(df_train))
predictions_ridge = np.zeros(len(df_test))

tst_data = df_test.copy()
tst_data.fillna((tst_data.mean()), inplace=True)

tst_data = tst_data[features].values

for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_train, target)):
    print("fold no.{}".format(fold_+1))
    trn_data, trn_y = df_train.iloc[trn_idx][features], target.iloc[trn_idx].values
    val_data, val_y = df_train.iloc[val_idx][features], target.iloc[val_idx].values
    
    trn_data.fillna((trn_data.mean()), inplace=True)
    val_data.fillna((val_data.mean()), inplace=True)
    
    trn_data = trn_data.values
    val_data = val_data.values

    clf = Ridge(alpha=100)
    clf.fit(trn_data, trn_y)
    
    oof_ridge[val_idx] = clf.predict(val_data)
    predictions_ridge += clf.predict(tst_data) / folds.n_splits

fold no.1
fold no.2
fold no.3
fold no.4
fold no.5


In [67]:
validation_score = np.sqrt(mean_squared_error(target, oof_ridge))
validation_score

3.8285953928413603

In [68]:
folds = KFold(n_splits=5, shuffle=True, random_state=15)
oof_lasso = np.zeros(len(df_train))
predictions_lasso = np.zeros(len(df_test))

tst_data = df_test.copy()
tst_data.fillna((tst_data.mean()), inplace=True)

tst_data = tst_data[features].values

for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_train, target)):
    print("fold no.{}".format(fold_+1))
    trn_data, trn_y = df_train.iloc[trn_idx][features], target.iloc[trn_idx].values
    val_data, val_y = df_train.iloc[val_idx][features], target.iloc[val_idx].values
    
    trn_data.fillna((trn_data.mean()), inplace=True)
    val_data.fillna((val_data.mean()), inplace=True)
    
    trn_data = trn_data.values
    val_data = val_data.values

    clf = Lasso(alpha=100)
    clf.fit(trn_data, trn_y)
    
    oof_ridge[val_idx] = clf.predict(val_data)
    predictions_lasso += clf.predict(tst_data) / folds.n_splits

fold no.1
fold no.2
fold no.3
fold no.4
fold no.5


In [69]:
validation_score = np.sqrt(mean_squared_error(target, oof_lasso))
validation_score

3.8705589161316296

In [72]:
train_stack = np.vstack([oof_cb, oof_lgb, oof_xgb, oof_ridge, oof_lasso]).transpose()
test_stack = np.vstack([predictions_cb, predictions_lgb, predictions_xgb, predictions_ridge, predictions_lasso]).transpose()

folds = KFold(n_splits=5, shuffle=True, random_state=15)
oof = np.zeros(train_stack.shape[0])
predictions = np.zeros(test_stack.shape[0])

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_stack, target)):
    print("fold no. {}".format(fold_ + 1))
    trn_data, trn_y = train_stack[trn_idx], target.iloc[trn_idx].values
    val_data, val_y = train_stack[val_idx], target.iloc[val_idx].values

    clf = Ridge(alpha=500)
    clf.fit(trn_data, trn_y)
    
    oof[val_idx] = clf.predict(val_data)
    predictions += clf.predict(test_stack) / folds.n_splits


np.sqrt(mean_squared_error(target, oof))

fold no. 1
fold no. 2
fold no. 3
fold no. 4
fold no. 5


3.686160383006063

In [73]:
df_submission = pd.DataFrame({"card_id": df_test["card_id"].values})
df_submission["target"] = predictions
df_submission.to_csv("mds_einstein_stack3.csv", index=False)