In [None]:
import gc
from glob import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import time
import re

import category_encoders as ce
import lightgbm as lgb

from sklearn import metrics
from sklearn.model_selection import StratifiedKFold

# No column limit when displaying DataFrames, so wide frames are shown in full.
pd.set_option('display.max_columns', None)

In [None]:
# Version tag of the processed dataset; it selects the
# ../data/processed/dsv<version>/ directory the train/test parquet files are read from.
DATASET_VERSION = "06"

***
## loading data

In [None]:
# Only the customer id and the statement date are read from the raw test file.
raw = pd.read_parquet(
    "../data/ext/amex-data-integer-dtypes-parquet-format/test.parquet",
    columns=["customer_ID", "S_2"],
)
raw["S_2"] = pd.to_datetime(raw["S_2"])

# One row per customer: the most recent statement date and its calendar month.
raw = raw.groupby("customer_ID")["S_2"].max().reset_index()
raw["S_2_month"] = raw["S_2"].dt.month

# Last statement in April -> "pub", in October -> "priv"; any other month
# is left as NaN.
raw["which"] = raw["S_2_month"].map({4: "pub", 10: "priv"})
raw

In [None]:
# Reduce the frame to a single "which" column indexed by customer_ID,
# ready to be joined onto the processed test set by index.
raw = raw.set_index("customer_ID")[["which"]]
raw

In [None]:
# Processed feature tables for the dataset version selected by DATASET_VERSION.
train, test = (
    pd.read_parquet(f"../data/processed/dsv{DATASET_VERSION}/train.parquet"),
    pd.read_parquet(f"../data/processed/dsv{DATASET_VERSION}/test.parquet"),
)

In [None]:
# Attach the "which" tag to the test rows by joining on the index of both
# frames (`raw` is indexed by customer_ID); the inner join keeps only index
# values present on both sides.
test = test.merge(raw, how="inner", left_index=True, right_index=True)

# Independent copies of each subset, so the combined frame can be released.
test_pub = test[test["which"] == "pub"].copy()
test_priv = test[test["which"] == "priv"].copy()

del test
gc.collect()

In [None]:
# Candidate features: every column of `train` except the bookkeeping columns
# this notebook itself creates ("target" = adversarial label, "which" = test
# split tag). Excluding them keeps this cell safe to re-run after
# `train["target"]` has been assigned; otherwise the label would be picked up
# as a feature and the adversarial classifier would separate the sets trivially.
_non_feature_cols = {"target", "which"}
input_feats = [col for col in train.columns if col not in _non_feature_cols]
len(input_feats)

In [None]:
# Adversarial label: 0 marks training rows, 1 marks test rows (both subsets).
for _frame, _label in ((train, 0), (test_pub, 1), (test_priv, 1)):
    _frame["target"] = _label

In [None]:
# Stack the training rows on top of ONE test subset, with a fresh 0..n-1 index.
# Swap in the commented line to run the analysis against the "pub" subset.
#dset = pd.concat([train,test_pub], ignore_index=True)
dset = pd.concat(objs=[train, test_priv], axis=0, ignore_index=True)
dset["target"].value_counts()

***
## remove features already known to differ between train and test

In [None]:
# Base names of the variables whose derived features are dropped up front.
# Each base name appears exactly once, so `feats_to_remove` holds no duplicate
# entries and its length is the true number of dropped features.
_known_shifted = ["B_29", "S_9", "R_1", "D_59", "S_11", "D_121"]

feats_to_remove = list()
for _base in _known_shifted:
    # Matches every feature whose name contains "<base>_"; the trailing
    # underscore keeps e.g. "R_1" from also matching "R_10...".
    _group = list(filter(re.compile(f".*{_base}_.*").match, input_feats))
    print(_group, "\n")
    # Skip anything already collected, in case two patterns overlap.
    feats_to_remove.extend(col for col in _group if col not in feats_to_remove)

In [None]:
# Full list of feature names collected for removal.
print(feats_to_remove)

In [None]:
# Number of entries in the removal list.
len(feats_to_remove)

In [None]:
# Drop the flagged features from the candidate list. A set makes each
# membership test constant-time instead of scanning the removal list for
# every column; the original column order is preserved.
_drop = set(feats_to_remove)
input_feats = [col for col in input_feats if col not in _drop]
len(input_feats)

***
## training the adversarial classifier

In [None]:
# LightGBM parameters handed to `lgb.train` for the binary train-vs-test
# classifier, scored by AUC.
model_params = dict(
    # task
    objective="binary",
    metric="auc",
    # boosting schedule and tree size
    num_iterations=500,
    num_leaves=31,
    learning_rate=0.05,
    # histogram construction
    max_bin=127,
    bin_construct_sample_cnt=100000000,
    # row / column sampling
    bagging_freq=1,
    bagging_fraction=1.0,
    feature_fraction=0.2,
    # regularisation
    lambda_l1=6,
    lambda_l2=2,
    min_data_in_leaf=1000,
    path_smooth=1.5,
    min_gain_to_split=0.5,
    # reproducibility and runtime
    seed=2112,
    force_col_wise=True,
    feature_pre_filter=True,
    verbosity=-1,
)

In [None]:
# Two shuffled folds, stratified on the train/test label. The split is
# materialised once so every removal round reuses exactly the same folds.
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=2112)
skf_split = [
    (fit_idx, val_idx)
    for fit_idx, val_idx in skf.split(dset, dset["target"].to_numpy())
]

In [None]:
# Iterative adversarial feature elimination.
# Each round trains the train-vs-test classifier on every fold, averages the
# normalised gain importances and the validation AUC across folds, then drops
# the N_REMOVE most important features from `input_feats` (mutated in place).
# `removed` accumulates the dropped names in order; `scores_log` holds the mean
# AUC measured at the start of each round (i.e. before that round's removal).
ITERATIONS = 100
N_REMOVE = 5  # number of top-importance features dropped per round
removed = list()
scores_log = list()

for i in range(ITERATIONS):
    # ITERATIONS * N_REMOVE may exceed the number of available features;
    # stop cleanly instead of training on an empty feature set.
    if not input_feats:
        print("No features left to evaluate; stopping.")
        break

    print("-"*100)
    print(f"ITER: {i+1}/{ITERATIONS}")

    tic = time.time()

    importances = list()
    scores = list()

    for train_idx,valid_idx in skf_split:
        # NOTE: `.loc` with the fold indices relies on `dset` carrying the
        # default 0..n-1 index (it is built with ignore_index=True).
        train_dset = lgb.Dataset(
            data=dset.loc[train_idx,input_feats],
            label=dset.loc[train_idx,"target"].values,
            free_raw_data=True,
        )
        valid_dset = lgb.Dataset(
            data=dset.loc[valid_idx,input_feats],
            label=dset.loc[valid_idx,"target"].values,
            free_raw_data=True,
        )
        model = lgb.train(
            params=model_params,
            train_set=train_dset,
            valid_sets=[valid_dset,],
            callbacks=[lgb.log_evaluation(period=50), lgb.early_stopping(50)],
        )

        imp = model.feature_importance(importance_type="gain")
        total_gain = imp.sum()
        if total_gain > 0:
            # Plot only when the model actually split on something; a model
            # without splits has all-zero importances and nothing to draw.
            lgb.plot_importance(model, figsize=(8,10), importance_type="split", max_num_features=15)
            lgb.plot_importance(model, figsize=(8,10), importance_type="gain", max_num_features=15)
            plt.show()
            # Release the figures explicitly so they cannot pile up across
            # rounds on backends that keep them open after show().
            plt.close("all")
            # Normalise so folds contribute on the same scale; skipped for a
            # zero total to avoid 0/0 -> NaN importances.
            imp = imp / total_gain
        importances.append(imp)
        scores.append(model.best_score["valid_0"]["auc"])

        del train_dset,valid_dset,model; gc.collect()

    importances = np.mean(importances, axis=0)
    score = np.mean(scores)
    scores_log.append(score)

    # No fold found any useful split: the remaining features carry no signal
    # to rank, so removing more of them would be arbitrary.
    if not np.any(importances > 0):
        print("No split gain in any fold; nothing left to rank. Stopping.")
        break

    # removes the N_REMOVE features with the highest mean importance
    ranked_idx = importances.argsort()[::-1][:N_REMOVE].tolist()
    to_remove = [input_feats[idx] for idx in ranked_idx]
    for col in to_remove:
        input_feats.remove(col)
    removed.extend(to_remove)
    # Print exactly what this round removed (may be fewer than N_REMOVE).
    print("removed:", to_remove)

    tac = time.time()
    print(f"Elapsed time: {(tac-tic)/60.} min.")

In [None]:
# Names of all features dropped by the loop, in removal order.
print(removed)

In [None]:
# Mean validation AUC recorded for each iteration of the loop.
print(scores_log)

In [None]:
# Mean validation AUC of the train-vs-test classifier per removal iteration.
# Each point is measured before that iteration's features are dropped.
fig, ax = plt.subplots(figsize=(15,5))
ax.plot(scores_log, "o-")
ax.set_xlabel("iteration (0-based)")
ax.set_ylabel("mean validation AUC")
ax.set_title("Adversarial validation AUC per feature-removal iteration")
ax.grid()
plt.show()

***