# 0. Load Dataset (feather)

In [2]:
# One-time Colab setup: upload the Kaggle API token, download the competition
# files plus the feather version of the data set, and unpack all archives.
# If the Kaggle CLI is missing, install it first with: !pip install kaggle
from google.colab import files
# Interactive upload; the recorded output shows kaggle.json being saved.
files.upload()

# Put the token in the ~/.kaggle/ directory.
!mkdir -p ~/.kaggle//
!cp kaggle.json ~/.kaggle/
# Restrict the token's permissions so that no permission warning is raised.
!chmod 600 ~/.kaggle/kaggle.json

# Labels and sample submission from the competition itself.
!kaggle competitions download amex-default-prediction -f train_labels.csv
!kaggle competitions download amex-default-prediction -f sample_submission.csv

# Feather data set (12.7G download according to the recorded output).
!kaggle datasets download -d munumbutt/amexfeather

from zipfile import ZipFile

# Extract every archive into the same input directory.
with ZipFile('./sample_submission.csv.zip') as zipper:
  zipper.extractall(path='./input/amex-default-prediction/')
with ZipFile('./train_labels.csv.zip') as zipper:
  zipper.extractall(path='./input/amex-default-prediction/')
with ZipFile('./amexfeather.zip') as zipper:
  zipper.extractall(path='./input/amex-default-prediction/')

Saving kaggle.json to kaggle.json
Downloading train_labels.csv.zip to /content
 56% 9.00M/16.2M [00:00<00:00, 28.4MB/s]
100% 16.2M/16.2M [00:00<00:00, 48.3MB/s]
Downloading sample_submission.csv.zip to /content
 31% 10.0M/32.4M [00:00<00:00, 104MB/s]
100% 32.4M/32.4M [00:00<00:00, 205MB/s]
Downloading amexfeather.zip to /content
100% 12.6G/12.7G [01:18<00:00, 270MB/s]
100% 12.7G/12.7G [01:18<00:00, 172MB/s]


# 1. Import libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns
from cycler import cycler
from IPython.display import display
import datetime
import scipy.stats
import warnings
! pip install colorama
from colorama import Fore, Back, Style
import gc

from sklearn.model_selection import StratifiedKFold
from sklearn.calibration import CalibrationDisplay
! pip install lightgbm==3.3.2
from lightgbm import LGBMClassifier, log_evaluation

# Global switch: when True the test data is loaded as well, every fold model
# predicts on it, and a submission file is written; when False only the
# training data is read and cross-validated.
INFERENCE = True # set to False if you only want to cross-validate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# 2. Evaluation metric

In [2]:
def amex_metric(y_true, y_pred, return_components=False):
  """Amex competition metric for array-likes.

  The metric is the mean of two components computed on the samples ranked
  by descending prediction, with negatives (target == 0) weighted 20 and
  all other samples weighted 1:

  * g -- weighted Gini of the predictions divided by the weighted Gini of
    a perfect ranking,
  * d -- share of positives (target == 1) found within the first 4 % of
    the cumulative weight.

  Parameters
  ----------
  y_true : array-like
      Binary targets; any shape, flattened internally.
  y_pred : array-like
      Predicted scores, same number of elements as ``y_true``.
  return_components : bool, default False
      If True, return the tuple ``(g, d, 0.5 * (g + d))`` instead of only
      the final score.

  Returns
  -------
  float or tuple of three floats
      ``0.5 * (g + d)``, or ``(g, d, 0.5 * (g + d))`` when
      ``return_components`` is True. The result is NaN when ``y_true``
      contains no positive sample.
  """
  def sample_weights(target):
    """Weight 20 for negatives, 1 otherwise (vectorised)."""
    return np.where(target == 0, 20, 1)

  def top_four_percent_captured(target) -> float:
    """Correspond to the recall for a threshold of 4 %"""
    weight = sample_weights(target)
    four_pct_cutoff = int(.04 * weight.sum())
    within_cutoff = weight.cumsum() <= four_pct_cutoff
    positives = target == 1
    return (positives & within_cutoff).sum() / positives.sum()

  def weighted_gini(target) -> float:
    """Weighted Gini of targets that are already in ranked order."""
    weight = sample_weights(target)
    random = (weight / weight.sum()).cumsum()
    found = target * weight
    lorentz = found.cumsum() / found.sum()
    return ((lorentz - random) * weight).sum()

  # np.asarray lets lists, Series and ndarrays of any shape through alike.
  frame = pd.DataFrame({'target': np.asarray(y_true).ravel(),
                        'prediction': np.asarray(y_pred).ravel()})
  # Only the target order is needed once the samples are ranked.
  ranked = frame.sort_values('prediction', ascending=False)['target'].to_numpy()
  # Perfect ranking: targets sorted in descending order.
  ideal = np.sort(ranked)[::-1]

  g = weighted_gini(ranked) / weighted_gini(ideal)  # normalized weighted Gini
  d = top_four_percent_captured(ranked)

  if return_components: return g, d, .5 * (g + d)
  return .5 * (g + d)

def lgb_amex_metric(y_true, y_pred):
  """Adapter exposing the competition metric as a LightGBM eval function.

  Returns the triple (metric name, metric value, is_higher_better).
  """
  score = amex_metric(y_true, y_pred)
  higher_is_better = True
  return 'amex', score, higher_is_better

# 3. Reading and Preprocessing the data

In [81]:
#%%time
# Columns aggregated per customer with the mean; the results get an "_avg" suffix.
features_avg = ['B_1', 'B_11', 'B_16', 'B_17', 'B_18', 'B_2', 'B_20',
                'B_28', 'B_3', 'B_4', 'B_5', 'B_7', 'B_9', 'D_112',
                'D_121', 'D_141', 'D_39', 'D_41', 'D_42', 'D_43',
                'D_44', 'D_45', 'D_46', 'D_47', 'D_48', 'D_49', 
                'D_50', 'D_51', 'D_53', 'D_54', 'D_56', 'D_58', 
                'D_59', 'D_60', 'D_91', 'P_2', 'P_3', 'R_1', 'R_2', 
                'R_27', 'R_3', 'R_7', 'S_11', 'S_26', 'S_3', 'S_5']
# Columns aggregated per customer with the maximum; the results get a "_max" suffix.
features_max = ['B_1', 'B_11', 'B_13', 'B_15', 'B_16', 'B_17', 'B_18', 'B_19', 'B_2', 
                'B_22', 'B_24', 'B_27', 'B_28', 'B_29', 'B_3', 'B_31', 'B_33', 'B_36', 
                'B_4', 'B_42', 'B_5', 'B_7', 'B_9', 'D_102', 'D_103', 'D_105', 'D_109', 
                'D_110', 'D_112', 'D_113', 'D_115', 'D_121', 'D_124', 'D_128', 'D_129', 
                'D_131', 'D_139', 'D_141', 'D_144', 'D_145', 'D_39', 'D_41', 'D_42', 
                'D_43', 'D_44', 'D_45', 'D_46', 'D_47', 'D_48', 'D_50', 'D_51', 'D_52', 
                'D_53', 'D_56', 'D_58', 'D_59', 'D_60', 'D_62', 'D_70', 'D_72', 'D_74', 
                'D_75', 'D_79', 'D_81', 'D_83', 'D_84', 'D_88', 'D_89', 'P_2', 'P_3', 
                'R_1', 'R_10', 'R_11', 'R_26', 'R_28', 'R_3', 'R_4', 'R_5', 'R_7', 'R_8', 
                'S_11', 'S_12', 'S_23', 'S_25', 'S_26', 'S_27', 'S_3', 'S_5', 'S_7', 'S_8', ]
# Columns taken from each customer's last statement; the results get a "_last" suffix.
features_last = ['B_1', 'B_10', 'B_11', 'B_12', 'B_13', 'B_15', 'B_16',
                 'B_17', 'B_18', 'B_19', 'B_2', 'B_20', 'B_22', 'B_23',
                 'B_24', 'B_25', 'B_26', 'B_27', 'B_28', 'B_29', 'B_3',
                 'B_32', 'B_33', 'B_36', 'B_38', 'B_39', 'B_4', 'B_40',
                 'B_41', 'B_42', 'B_5', 'B_6', 'B_7', 'B_8', 'B_9',
                 'D_102', 'D_103', 'D_105', 'D_106', 'D_107', 'D_109',
                 'D_112', 'D_115', 'D_117', 'D_118', 'D_119', 'D_120',
                 'D_121', 'D_122', 'D_123', 'D_124', 'D_125', 'D_127', 
                 'D_129', 'D_132', 'D_133', 'D_135', 'D_136', 'D_137', 
                 'D_140', 'D_141', 'D_143', 'D_145', 'D_39', 'D_41',
                 'D_42', 'D_43', 'D_44', 'D_45', 'D_46', 'D_47', 'D_48',
                 'D_49', 'D_50', 'D_51', 'D_52', 'D_53', 'D_54', 'D_55',
                 'D_56', 'D_58', 'D_59', 'D_60', 'D_61', 'D_62', 'D_63',
                 'D_64', 'D_66', 'D_70', 'D_72', 'D_73', 'D_74', 'D_75',
                 'D_76', 'D_77', 'D_78', 'D_79', 'D_80', 'D_82', 'D_83',
                 'D_84', 'D_86', 'D_91', 'D_92', 'D_93', 'D_94', 'D_96',
                 'P_2', 'P_3', 'R_1', 'R_10', 'R_11', 'R_12', 'R_13',
                 'R_14', 'R_15', 'R_17', 'R_18', 'R_19', 'R_2', 'R_20', 
                 'R_21', 'R_22', 'R_24', 'R_25', 'R_26', 'R_27', 'R_3',
                 'R_4', 'R_5', 'R_7', 'R_8', 'R_9', 'S_11', 'S_12',
                 'S_13', 'S_15', 'S_17', 'S_20', 'S_22', 'S_23', 
                 'S_24', 'S_25', 'S_26', 'S_27', 'S_3', 'S_5', 'S_6',
                 'S_7', 'S_8', 'S_9']

# NOTE(review): NROWS is not referenced anywhere in the visible cells -- confirm
# whether it is still needed.
NROWS=50000
# Directory holding the extracted '<split>_data.ftr' feather files.
DATA_DIRECTORY = '/content/input/amex-default-prediction/'

# Build one row per customer for each split: the values of the last statement
# plus the per-customer maximum and mean of selected columns.
# Results are left in the notebook globals `train`, `test` (only when
# INFERENCE is True) and `target`.
for i in ['test', 'train'] if INFERENCE else ['train']:
    df = pd.read_feather(f'{DATA_DIRECTORY + i}_data.ftr')
    cid = pd.Categorical(df.pop('customer_ID'), ordered=True)

    # Mask for the last statement of every customer: a row is "last" when the
    # next row belongs to another customer; the final row is always "last".
    # Comparing the integer category codes avoids comparing ID strings, and
    # not wrapping around keeps the mask correct even for a single customer.
    # Assumes each customer's statements are stored contiguously -- TODO confirm.
    codes = cid.codes
    last = np.ones(len(codes), dtype=bool)
    last[:-1] = codes[1:] != codes[:-1]

    if i == 'train':
        # Label of the last statement per customer; keeps the original row index.
        target = df.pop('target').loc[last]
    gc.collect()
    print('Read', i)

    # Select the columns *before* aggregating: only the needed columns are
    # reduced, and non-numeric columns never reach mean()/max().
    df_avg = (df
              .groupby(cid, observed=True)[features_avg]
              .mean()
              .rename(columns={f: f"{f}_avg" for f in features_avg})
             )
    gc.collect()
    print('Computed avg', i)
    df_max = (df
              .groupby(cid, observed=True)[features_max]
              .max()
              .rename(columns={f: f"{f}_max" for f in features_max})
             )
    gc.collect()
    print('Computed max', i)
    # Index the last-statement rows by customer ID so they align with the
    # grouped frames in the concat below.
    df = (df.loc[last, features_last]
          .rename(columns={f: f"{f}_last" for f in features_last})
          .set_index(np.asarray(cid[last]))
         )
    gc.collect()
    print('Computed last', i)
    df = pd.concat([df, df_max, df_avg], axis=1)
    if i == 'train':
        train = df
    else:
        test = df
    print(f"{i} shape: {df.shape}")
    # Free the large intermediates before the next split is read.
    del df, df_avg, df_max, cid, codes, last

print(f"target shape: {target.shape}")

Read test
Computed avg test




Computed max test
Computed last test
test shape: (924621, 287)
Read train
Computed avg train
Computed max train
Computed last train
train shape: (458913, 287)
target shape: (458913,)


# 4. Modeling with LGBM

## 4.2 LGBM

In [85]:
# 5-fold stratified cross-validation of a LightGBM classifier.
# Each fold model is scored with the competition metric, saved with joblib,
# and (when INFERENCE is True) used to predict the test set.
import joblib

SEED = 1117

# All columns of `train` are used except ID/label columns, should they be present.
features = [f for f in train.columns if f != 'customer_ID' and f != 'target']
print(f'{len(features)} features')

score_list = []   # validation score of every fold
y_pred_list = []  # test-set predictions of every fold model
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

params = {
    'n_estimators': 1200,
    'learning_rate': .03,
    'reg_lambda': 50,
    'min_child_samples': 2400,
    'num_leaves': 95,
    'max_bins': 511,
    'random_state': SEED,
    'scale_pos_weight': 2,
    'n_jobs': -1
}

for fold, (idx_train, idx_valid) in enumerate(kf.split(train, target)):
  start_time = datetime.datetime.now()

  # Split into training and validation sets
  X_train = train.iloc[idx_train][features]
  X_valid = train.iloc[idx_valid][features]
  y_train = target.iloc[idx_train]
  y_valid = target.iloc[idx_valid]

  # Define the model with the hyperparameters
  model = LGBMClassifier(**params)

  # UserWarnings raised during fitting are silenced; the competition metric
  # is evaluated on both eval sets and logged every 100 iterations.
  with warnings.catch_warnings():
    warnings.filterwarnings('ignore', category=UserWarning)
    model.fit(X_train, y_train,
              eval_set=[(X_train, y_train), (X_valid, y_valid)],
              eval_metric=[lgb_amex_metric],
              callbacks=[log_evaluation(100)])

  # Predicted probabilities of the positive class on the validation data
  y_valid_pred = model.predict_proba(X_valid)[:, 1]
  score = amex_metric(y_valid.values, y_valid_pred)

  # Best number of trees; falls back to n_estimators when best_iteration_ is
  # None (presumably because no early-stopping callback is used -- confirm).
  n_trees = model.best_iteration_
  if n_trees is None: n_trees = model.n_estimators

  print(f'{Fore.GREEN}{Style.BRIGHT}Fold {fold}')
  # Slice of the elapsed timedelta string (minutes:seconds part)
  print(f'{str(datetime.datetime.now() - start_time)[-12:-7]}') 
  print(f'{n_trees:5} trees')
  print(f'Score = {score:.5f}{Style.RESET_ALL}')

  # NOTE(review): the target is a Google Drive path, but no drive mount is
  # visible in this part of the notebook -- confirm it exists before training.
  joblib.dump(model, filename=f'/content/drive/MyDrive/tabular/amex/model/v1_fold{fold}.pkl')
  score_list.append(score)

  if INFERENCE:
    y_pred_list.append(model.predict_proba(test[features])[:, 1])

  # NOTE(review): end_time is assigned but not read in the visible cells.
  end_time = datetime.datetime.now()

# Final score: the mean of the per-fold validation scores (labelled "OOF")
print(f'{Fore.GREEN}{Style.BRIGHT}OOF Score:{np.mean(score_list):.5f}{Style.RESET_ALL}') 

287 features
[100]	training's binary_logloss: 0.252983	training's amex: 0.774016	valid_1's binary_logloss: 0.257003	valid_1's amex: 0.766785
[200]	training's binary_logloss: 0.230444	training's amex: 0.798563	valid_1's binary_logloss: 0.239782	valid_1's amex: 0.782348
[300]	training's binary_logloss: 0.218617	training's amex: 0.812534	valid_1's binary_logloss: 0.234403	valid_1's amex: 0.788167
[400]	training's binary_logloss: 0.209668	training's amex: 0.824121	valid_1's binary_logloss: 0.231912	valid_1's amex: 0.791663
[500]	training's binary_logloss: 0.201802	training's amex: 0.834206	valid_1's binary_logloss: 0.230436	valid_1's amex: 0.792142
[600]	training's binary_logloss: 0.194581	training's amex: 0.844039	valid_1's binary_logloss: 0.229405	valid_1's amex: 0.793119
[700]	training's binary_logloss: 0.187843	training's amex: 0.853941	valid_1's binary_logloss: 0.228551	valid_1's amex: 0.793944
[800]	training's binary_logloss: 0.181497	training's amex: 0.863502	valid_1's binary_loglos

In [86]:
if INFERENCE:
  # Submission frame: one row per test customer, with the prediction taken as
  # the element-wise mean over the models of all folds.
  sub = pd.DataFrame(dict(customer_ID=test.index,
                          prediction=np.mean(y_pred_list, axis=0)))
  sub.to_csv('submission.csv', index=False)
  display(sub)

Unnamed: 0,customer_ID,prediction
0,00000469ba478561f23a92a868bd366de6f6527a684c9a...,0.046107
1,00001bf2e77ff879fab36aa4fac689b9ba411dae63ae39...,0.002477
2,0000210045da4f81e5f122c6bde5c2a617d03eef67f82c...,0.033112
3,00003b41e58ede33b8daf61ab56d9952f17c9ad1c3976c...,0.334773
4,00004b22eaeeeb0ec976890c1d9bfc14fd9427e98c4ee9...,0.961043
...,...,...
924616,ffff952c631f2c911b8a2a8ca56ea6e656309a83d2f64c...,0.019818
924617,ffffcf5df59e5e0bba2a5ac4578a34e2b5aa64a1546cd3...,0.865190
924618,ffffd61f098cc056dbd7d2a21380c4804bbfe60856f475...,0.583054
924619,ffffddef1fc3643ea179c93245b68dca0f36941cd83977...,0.487606


In [87]:
!kaggle competitions submit -c amex-default-prediction -f submission.csv -m "submission v2"

100% 75.1M/75.1M [00:01<00:00, 75.3MB/s]
Successfully submitted to American Express - Default Prediction