In [1]:
class Config:
    name = "EDA/Agg-RFE"

    n_splits = 5
    seed = 2022
    target = "target"

    # Colab Env
    upload_from_colab = True
    api_path = "/content/drive/MyDrive/workspace/kaggle.json"
    drive_path = "/content/drive/MyDrive/workspace/kaggle-amex"

    # Kaggle Env
    kaggle_dataset_path = None

    # Reka Env
    dir_path = '/home/abe/kaggle/kaggle-amex'

In [2]:
import os
import json
import warnings
import shutil
import logging
import joblib
import random
import datetime
import sys
import gc
import multiprocessing
import joblib
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.auto import tqdm
from IPython import get_ipython
tqdm.pandas()
warnings.filterwarnings('ignore')

## Environment Settings

In [3]:
INPUT = os.path.join(Config.dir_path, 'input')
OUTPUT = os.path.join(Config.dir_path, 'output')
SUBMISSION = os.path.join(Config.dir_path, 'submissions')
OUTPUT_EXP = os.path.join(OUTPUT, Config.name)
EXP_MODEL = os.path.join(OUTPUT_EXP, "model")
EXP_FIG = os.path.join(OUTPUT_EXP, "fig")
EXP_PREDS = os.path.join(OUTPUT_EXP, "preds")

# make dirs
for d in [INPUT, SUBMISSION, EXP_MODEL, EXP_FIG, EXP_PREDS]:
    os.makedirs(d, exist_ok=True)

## Load data

In [4]:
train = pd.read_pickle(os.path.join(INPUT, 'train_agg.pkl'), compression='gzip')
test = pd.read_pickle(os.path.join(INPUT, 'test_agg.pkl'), compression='gzip')
train = train.sample(10000)
test = test.sample(15000)

In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
CategoricalIndex: 10000 entries, a728d85084a40e92bc5d76204de756ea36383417f8fee4672defcf258570b1da to 0d70a858b5a48194c828610b3d5ca553968a4e5fddf75ea03f8413d0545037ed
Columns: 919 entries, P_2_mean to target
dtypes: category(2), float16(713), float64(178), int64(22), int8(4)
memory usage: 48.6 MB


In [6]:
train.head()

Unnamed: 0_level_0,P_2_mean,P_2_std,P_2_min,P_2_max,P_2_last,D_39_mean,D_39_std,D_39_min,D_39_max,D_39_last,...,D_64_count,D_64_last,D_64_nunique,D_66_count,D_66_last,D_66_nunique,D_68_count,D_68_last,D_68_nunique,target
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
a728d85084a40e92bc5d76204de756ea36383417f8fee4672defcf258570b1da,0.67627,0.10562,0.542969,0.827637,0.619629,0.369141,0.269638,0.003632,0.915527,0.003632,...,13,O,1,13,1.0,1,13,5.0,1,0
52f7455e4fd9b07b60f1c8a572e37972aa9e969ab2f8b59257d6f5b98f5fa7f9,0.446533,0.082594,0.330811,0.538574,0.538574,0.337891,0.329613,0.000892,0.862305,0.008835,...,13,U,2,0,,0,13,6.0,2,1
fdd72c75e5e7fe9ba0283fb993f6df9b2f20fbdd287e534c76f47b617457a7c9,0.944824,0.023044,0.908691,0.979492,0.908691,0.27002,0.266927,0.000709,0.677734,0.589355,...,13,O,1,0,,0,13,6.0,1,0
ef86b680f200eb37a24ee52d187c2b73d8759c123948f4ddf32a8b5cc5209844,0.401855,0.118077,0.113647,0.51416,0.113647,0.063782,0.134368,0.004238,0.478271,0.064087,...,11,O,1,0,,0,11,6.0,1,1
1f3862359e8e5860fec2d2767404f32163eea836da56a1a123bce7402dd5def5,0.60498,0.081648,0.375244,0.702148,0.696289,0.186401,0.280836,0.002653,0.915039,0.002653,...,11,O,1,0,,0,11,6.0,1,0


## Evaluation Metric

In [7]:
# https://www.kaggle.com/code/inversion/amex-competition-metric-python

def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        four_pct_cutoff = int(0.04 * df['weight'].sum())
        df['weight_cumsum'] = df['weight'].cumsum()
        df_cutoff = df.loc[df['weight_cumsum'] <= four_pct_cutoff]
        return (df_cutoff['target'] == 1).sum() / (df['target'] == 1).sum()
        
    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        df['random'] = (df['weight'] / df['weight'].sum()).cumsum()
        total_pos = (df['target'] * df['weight']).sum()
        df['cum_pos_found'] = (df['target'] * df['weight']).cumsum()
        df['lorentz'] = df['cum_pos_found'] / total_pos
        df['gini'] = (df['lorentz'] - df['random']) * df['weight']
        return df['gini'].sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        y_true_pred = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, y_true_pred)

    g = normalized_weighted_gini(y_true, y_pred)
    d = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (g + d)

def lgb_amex_metric(y_true, y_pred):
    """The competition metric with lightgbm's calling convention"""
    return ('amex',
            amex_metric(pd.DataFrame({'target': y_true}), pd.Series(y_pred, name='prediction')),
            True)

## Transform data type

In [8]:
float64_cols = [col for col in train.columns if train[col].dtype == 'float64']
int64_cols = [col for col in train.columns if train[col].dtype == 'int64']

print(train.info())
print(test.info())
print()
print("-"*50+f' data type transformation '+'-'*50)
print()

def transform_dtype(df):
  for col in df.columns:
    if df[col].dtype == 'float64':
      df[col] = df[col].astype('float16')
    if df[col].dtype == 'float32':
      df[col] = df[col].astype('float16')
    if df[col].dtype == 'int64':
      df[col] = df[col].astype('int8')
    if df[col].dtype == 'int32':
      df[col] = df[col].astype('int8')
  return df

train = transform_dtype(train)
test = transform_dtype(test)

print(train.info())
print(test.info())

<class 'pandas.core.frame.DataFrame'>
CategoricalIndex: 10000 entries, a728d85084a40e92bc5d76204de756ea36383417f8fee4672defcf258570b1da to 0d70a858b5a48194c828610b3d5ca553968a4e5fddf75ea03f8413d0545037ed
Columns: 919 entries, P_2_mean to target
dtypes: category(2), float16(713), float64(178), int64(22), int8(4)
memory usage: 48.6 MB
None
<class 'pandas.core.frame.DataFrame'>
CategoricalIndex: 15000 entries, 7e1ce7131c3e15238564c77ff26045f0d799714d7ce91830ad162be0942b3160 to caa1eb02f9006b8ed10497801923332504ab22bec741057ad299412297e0b15a
Columns: 918 entries, P_2_mean to D_68_nunique
dtypes: category(2), float16(713), float64(178), int32(3), int64(22)
memory usage: 82.8 MB
None

-------------------------------------------------- data type transformation --------------------------------------------------

<class 'pandas.core.frame.DataFrame'>
CategoricalIndex: 10000 entries, a728d85084a40e92bc5d76204de756ea36383417f8fee4672defcf258570b1da to 0d70a858b5a48194c828610b3d5ca553968a4e5fddf75

## Prerocess

In [9]:
from sklearn.preprocessing import LabelEncoder
cat_cols = [col for col in train.columns if train[col].dtype == 'category']

for col in cat_cols:
    le = LabelEncoder()
    le.fit(train[col])
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])

## Select Features to Use

In [10]:
features = []
unuse = ['target', 'customer_ID', 'S_2']

for col in train.columns:
  if col not in unuse:
    features.append(col)

# print(features)

## Forward Selection

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train[features].values, train[Config.target].values,
                 train_size=0.8, 
                 random_state=Config.seed, 
                 shuffle=True)

In [12]:
from sklearn.feature_selection import RFE
from lightgbm import LGBMClassifier, early_stopping

lgb_params = {"learning_rate": 0.01,
              'num_leaves': 127,
              'min_child_samples': 2400}

fit_params = {
    'callbacks': [early_stopping(stopping_rounds=10, verbose=0)],
    'eval_set': [(X_test, y_test)],
    'eval_metric': lgb_amex_metric,
    'verbose': 0
}

model = LGBMClassifier(**lgb_params,
                       boosting_type='gbdt',
                       objective='binary',
                       n_estimators=10000,
                       random_state=Config.seed,
                       force_col_wise=True,
                       n_jobs=32,
                       verbose=-1)

rfe = RFE(model,
          n_features_to_select=150,
          step=4,
          verbose=1)

rfe.fit(X_train, y_train, **fit_params)

Fitting estimator with 918 features.
Fitting estimator with 914 features.
Fitting estimator with 910 features.
Fitting estimator with 906 features.
Fitting estimator with 902 features.
Fitting estimator with 898 features.
Fitting estimator with 894 features.
Fitting estimator with 890 features.
Fitting estimator with 886 features.
Fitting estimator with 882 features.
Fitting estimator with 878 features.
Fitting estimator with 874 features.
Fitting estimator with 870 features.
Fitting estimator with 866 features.
Fitting estimator with 862 features.
Fitting estimator with 858 features.
Fitting estimator with 854 features.
Fitting estimator with 850 features.
Fitting estimator with 846 features.
Fitting estimator with 842 features.
Fitting estimator with 838 features.
Fitting estimator with 834 features.
Fitting estimator with 830 features.
Fitting estimator with 826 features.
Fitting estimator with 822 features.
Fitting estimator with 818 features.
Fitting estimator with 814 features.
F

RFE(estimator=LGBMClassifier(force_col_wise=True, learning_rate=0.01,
                             min_child_samples=2400, n_estimators=10000,
                             n_jobs=32, num_leaves=127, objective='binary',
                             random_state=2022, verbose=-1),
    n_features_to_select=800, step=4, verbose=1)

## Generate new train data

In [36]:
train_new = pd.DataFrame(rfe.transform(train[features]), 
                     columns=train[features].columns.values[rfe.get_support()])
result = pd.DataFrame(rfe.get_support(), index=train[features].columns.values, columns=['used'])
result['ranking'] = rfe.ranking_
result = result.sort_values('ranking', ascending=True).rename({result.index.name: 'feature'}).reset_index(drop=False).rename({'index': 'feature'}, axis=1)
result.to_csv(f'{EXP_MODEL}/rfe_features.csv', index=False)

## Training

In [None]:
from lightgbm.plotting import plot_metric
from lightgbm import LGBMClassifier, early_stopping
from sklearn.model_selection import StratifiedKFold

def fit_lgbm(X, y, params=None):
  models = []
  scores = []

  skf = StratifiedKFold(n_splits=Config.n_splits, shuffle=True, random_state=Config.seed)
  
  for fold, (train_indices, valid_indices) in enumerate(tqdm(skf.split(X, y))):
    print("-"*50+f' fold{fold} '+'-'*50)
    X_train, y_train = X.iloc[train_indices], y.iloc[train_indices]
    X_valid, y_valid = X.iloc[valid_indices], y.iloc[valid_indices]

    model = LGBMClassifier(**params,
                           boosting_type='gbdt',
                           objective='binary',
                           n_estimators=10000,
                           random_state=Config.seed,
                           force_col_wise=True,
                           n_jobs=32,
                           verbose=-1)
    
    model.fit(X_train, y_train, 
              eval_set=[(X_train, y_train), (X_valid, y_valid)],
              eval_names=['train', 'valid'],
              eval_metric=lgb_amex_metric,
              callbacks=[early_stopping(stopping_rounds=10, verbose=0)],
              verbose=50)
    
    # ------------------- prediction -------------------
    pred = model.predict_proba(X_valid)[:, 1]
    score = amex_metric(pd.DataFrame({'target': y_valid.values}), pd.Series(pred, name='prediction'))

    # ------------------- plot -------------------
    plot_metric(model)

    # ------------------- save -------------------
    file = f'{EXP_MODEL}/lgbm_fold{fold}.pkl'
    joblib.dump(model, file)
    scores.append(score)
    models.append(model)
    print(f'fold{fold} amex meric: {score}')
    print()

  print(f"OOF Score: {np.mean(scores):.5f}")
  return models

def inference_lgbm(models, X):
    pred = np.array([model.predict_proba(X) for model in models])
    pred = np.mean(pred, axis=0)[:, 1]
    return pred

In [38]:
feature_df = pd.read_csv(f'{EXP_MODEL}/rfe_features.csv')
features = feature_df[feature_df['used'] == True].loc[:, 'feature'].values.tolist()

In [None]:
lgb_params = {"learning_rate": 0.01,
              'num_leaves': 127,
              'min_child_samples': 2400}

models = fit_lgbm(train[features], train[Config.target], params=lgb_params)
# models = [joblib.load(f'{EXP_MODEL}/lgbm_fold{i}.pkl') for i in range(Config.n_splits)]
pred = inference_lgbm(models, test[features])

## Plot importance

In [None]:
def plot_importances(models):
    importance_df = pd.DataFrame(models[0].feature_importances_, 
                                 index=features, 
                                 columns=['importance'])\
                        .sort_values("importance", ascending=False)

    plt.subplots(figsize=(len(features) // 4, 5))
    plt.bar(importance_df.index, importance_df.importance)
    plt.grid()
    plt.xticks(rotation=90)
    plt.ylabel("importance")
    plt.tight_layout()
    plt.show()

plot_importances(models)

## Submission

In [None]:
sub = pd.DataFrame({'customer_ID': test.index,
                    'prediction': pred})
sub.to_csv(f'{EXP_PREDS}/submission.csv', index=False)

In [None]:
! kaggle competitions submit -c amex-default-prediction -f /home/abe/kaggle/kaggle-amex/submissions/submission.csv -m "Recuresive Feature Elimination for Aggregation Features"