## About this NB

This notebook contains ideas taken from the following great NBs:

1 From https://www.kaggle.com/code/ambrosm/amex-lightgbm-quickstart I took the idea of masking, which I used to build the feature last / next-to-last

2 From https://www.kaggle.com/code/kunheekimkr/amex-lgbm-gpu-starter-0-795/notebook I took the idea of reading the test set by chunks.I also took the idea that predict(df, raw_score = True) gives the log-odds. Amex metric is invariant to log-odds.

3 From https://www.kaggle.com/code/thedevastator/lag-features-are-all-you-need/ I took ideas for new features (last/mean, last - first, etc.)

4 https://www.kaggle.com/competitions/amex-default-prediction/discussion/335892 gives a great overview of topics and tricks on tabular classification.

In [1]:
import pandas as pd
import numpy as np

from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from sklearn.model_selection import train_test_split

import gc
import warnings
warnings.filterwarnings('ignore')

In [3]:
class config:
    random_state = 4222
    #kaggle = True
    #path = '../input/amexfeather'
    #local_path = ''

## **Data Preprocessing**

In [4]:
def read_file(path = '', usecols = None):
    
    if usecols is not None: df = pd.read_parquet(path,columns = usecols)
    else: df = pd.read_parquet(path)
   
    print('ajá:')
    df.S_2 = pd.to_datetime( df.S_2 )
    #df = df.fillna(NaN_value) 
    print('shape of data:', df.shape)
    
    return df

In [5]:
def preprocessing(df, cat_features, num_features, i = 'train'):
    
    cid = pd.Categorical(df.pop('customer_ID'), ordered = True)
    last = (cid != np.roll(cid, -1))
    penul = np.roll(last, -1)
    
    if 'target' in df.columns:
        df.drop(columns=['target'], inplace=True)
    gc.collect()
    print('Read', i)
    
    df_num = (df.groupby(cid)[num_features]
              .agg(['first','mean','last'])
             )
    df_num.columns = ['_'.join(x) for x in df_num.columns]
    print('Computed df_num', i)
    
    df_penul = (df.loc[penul,num_features]
              .rename(columns={f: f"{f}_pl" for f in num_features})
              .set_index(np.asarray(cid[last]))
             )
    print('Computed penul', i)
    
    df_num = pd.concat([df_num, df_penul], axis=1)
    print('Computed concat penul', i)
         
    for col in df_num:
        if 'last' in col and col.replace('last', 'pl') in df_num:
                df_num[col + '_lg'] = df_num[col] / df_num[col.replace('last', 'pl')]         
    print('Computed lg', i)
    
    new_cols = [col for col in df_num.columns if '_pl' not in col]
    df_num = df_num[new_cols]  
    
    for col in df_num:
        if 'last' in col and col.replace('last', 'mean') in df_num:
                df_num[col + '_lm'] = df_num[col] / df_num[col.replace('last', 'mean')]     
    print('Computed lm', i)
    
    for col in df_num:
        if 'last' in col and col.replace('last', 'first') in df_num:
                df_num[col + '_lf'] = df_num[col] - df_num[col.replace('last', 'first')]     
    print('Computed lf', i)
                  
    df_cat = (df.groupby(cid)[cat_features]
              .agg(['first','last'])
             )
    df_cat.columns = ['_'.join(x) for x in df_cat.columns]
    
    df = pd.concat([df_num, df_cat], axis=1)
    
    del df_num, df_cat, df_penul,cid, last, penul, new_cols
    
    for col in df.columns:
        if df[col].dtype=='float64': df[col] = df[col].astype('float16')
        if df[col].dtype=='int64': df[col] = df[col].astype('int16')
    
    return df

In [6]:
print('Reading train data...')
train_path = '../input/amex-data-integer-dtypes-parquet-format/train.parquet'
train = read_file(path = train_path)

Reading train data...
ajá:
shape of data: (5531451, 190)


In [7]:
features = train.drop(['customer_ID','S_2'], axis = 1).columns.to_list()
#cat_features = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
#cat_features =["B_4","S_11","S_13","S_15","D_39","D_51","D_59","D_74","D_75","D_80","D_91","D_92"]

cat_features =["B_4",'B_30','B_38',"S_11","S_13","S_15","D_39","D_51","D_59",'D_63','D_64','D_66','D_68',"D_74","D_75","D_80","D_91","D_92",'D_114','D_116','D_117','D_120','D_126']

num_features = [col for col in features if col not in cat_features]

In [8]:
%%time
train = preprocessing(train,cat_features,num_features)

Read train
Computed df_num train
Computed penul train
Computed concat penul train
Computed lg train
Computed lm train
Computed lf train
CPU times: user 51.9 s, sys: 20.3 s, total: 1min 12s
Wall time: 1min 11s


In [9]:
features = [feat for feat in train.columns if feat != 'customer_ID' and feat != 'target' and feat != "S_2"]
len(features)

1036

In [10]:
target = pd.read_csv('../input/amex-default-prediction/train_labels.csv').target.values
print(f"target shape: {target.shape}")

target shape: (458913,)


## **Model Training**

In [11]:
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:

    if isinstance(y_true, np.ndarray):
            y_true = pd.DataFrame(y_true, columns = ["target"])
    
    if isinstance(y_pred, np.ndarray):
            y_pred = pd.DataFrame(y_pred, columns = ["prediction"])
            #y_pred["prediction"] = y_pred
    
    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
      
        df['weight'] = df["target"].apply(lambda x: 20 if x==0 else 1)
        four_pct_cutoff = int(0.04 * df['weight'].sum())
        df['weight_cumsum'] = df['weight'].cumsum()
        df_cutoff = df.loc[df['weight_cumsum'] <= four_pct_cutoff]
        return (df_cutoff['target'] == 1).sum() / (df["target"] == 1).sum()
        
    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df["target"].apply(lambda x: 20 if x==0 else 1)
        df['random'] = (df['weight'] / df['weight'].sum()).cumsum()
        total_pos = (df["target"] * df['weight']).sum()
        df['cum_pos_found'] = (df["target"] * df['weight']).cumsum()
        df['lorentz'] = df['cum_pos_found'] / total_pos
        df['gini'] = (df['lorentz'] - df['random']) * df['weight']
        return df['gini'].sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        y_true_pred = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, y_true_pred)

    d = top_four_percent_captured(y_true, y_pred)
    g = normalized_weighted_gini(y_true, y_pred)

    return 0.5 * (g + d)

In [12]:
def lgb_amex_metric(y_true, y_pred):
    """The competition metric with lightgbm's calling convention"""
    return ('amex',
            amex_metric(y_true, y_pred),
            True)

In [13]:
search_params = { 
    'learning_rate' : 0.03, #0.065,
    #'lambda_l1': 7.200056653766078,
    'lambda_l2': 50, #9.35685026658397 
    'num_leaves': 100, #55, 
    'feature_fraction': 0.4, #0.19,
    'bagging_fraction': 0.9, #1.0, 
    'bagging_freq': 0,
    'min_child_samples': 2400, #100
}

fixed_params={
    'objective': 'binary',
    'metric': 'custom', 
    'boosting_type' : 'gbdt',
    'random_state' : config.random_state,
    #'n_jobs': -1,
    #'extra_trees' : True,
    #'feature_pre_filter': False,
    'n_estimators': 1200, 
    'early_stopping_round': 100
}

In [14]:
NaN_value = -127

In [15]:
def train_modelo(df,target,features):
    
    df = df.fillna(NaN_value)
    x = df[features]
    y = pd.Series(target)
    
    X_train, X_test, y_train, y_test = train_test_split(x,y,test_size = 0.3,
                                random_state = 4222, stratify = y)
    
    model = LGBMClassifier(**fixed_params, **search_params)
    
    model.fit(
        X_train, y_train, 
        eval_set=[(X_test,y_test)],
        eval_metric= lgb_amex_metric,
        callbacks=[log_evaluation(100)]
    )
    
    del x,y, X_train, y_train
    
    return model, X_test, y_test

In [16]:
%%time
model, X_test, y_test = train_modelo(train,target,features)

[100]	valid_0's amex: 0.752465
[200]	valid_0's amex: 0.758414
[300]	valid_0's amex: 0.766652
[400]	valid_0's amex: 0.770188
[500]	valid_0's amex: 0.772823
[600]	valid_0's amex: 0.776127
[700]	valid_0's amex: 0.779317
[800]	valid_0's amex: 0.781793
[900]	valid_0's amex: 0.783138
[1000]	valid_0's amex: 0.784932
[1100]	valid_0's amex: 0.786593
[1200]	valid_0's amex: 0.786962
[1300]	valid_0's amex: 0.788928
[1400]	valid_0's amex: 0.789935
[1500]	valid_0's amex: 0.790412
[1600]	valid_0's amex: 0.790405
[1700]	valid_0's amex: 0.790825
[1800]	valid_0's amex: 0.7917
[1900]	valid_0's amex: 0.792431
[2000]	valid_0's amex: 0.792308
[2100]	valid_0's amex: 0.792636
[2200]	valid_0's amex: 0.793215
[2300]	valid_0's amex: 0.792452
[2400]	valid_0's amex: 0.792885
[2500]	valid_0's amex: 0.792786
[2600]	valid_0's amex: 0.792895
[2700]	valid_0's amex: 0.792952
[2800]	valid_0's amex: 0.793114
[2900]	valid_0's amex: 0.792974
[3000]	valid_0's amex: 0.793384
[3100]	valid_0's amex: 0.793264
[3200]	valid_0's am

In [17]:
y_test = pd.DataFrame(y_test, columns = ["target"])
y_pred = pd.DataFrame(y_test.copy(), columns = ["prediction"])

y_pred["prediction"] = model.predict_proba(X_test)[:,1]
amex_metric(y_test, y_pred)

0.7930688054931098

In [16]:
del train, target,features, X_test, y_test, y_pred
gc.collect()

NameError: name 'y_pred' is not defined

In [None]:
model.booster_.save_model("./amex-model.txt")