Inspired by https://www.kaggle.com/code/slowlearnermack/amex-lgbm-dart-cv-0-7963-improved

In [1]:
# ====================================================
# Library
# ====================================================
import os
import gc
import warnings
warnings.filterwarnings('ignore')
import random
import scipy as sp
import numpy as np
import pandas as pd
import joblib
import itertools
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
#from tqdm.auto import tqdm
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from itertools import combinations

import utils
utils.widen_ipython_window()

In [2]:
# Paths
#
# Every location below is derived from MAIN_PATH. The project root can be
# changed without editing the notebook by exporting AMEX_MAIN_PATH; when the
# variable is not set the original location is used. os.path.join(..., "")
# guarantees the trailing separator that the string concatenations rely on.
MAIN_PATH = os.path.join(
    os.environ.get("AMEX_MAIN_PATH", "/home/mahesh/Desktop/ML/kaggle/amex/"), ""
)

# Data
PATH_TO_DATA                = MAIN_PATH + "data/"
PATH_TO_PROCESSED2_DATA     = PATH_TO_DATA + "processed2/"
PATH_TO_PROCESSED3_DATA     = PATH_TO_DATA + "processed3/"
PATH_TO_PROCESSED4_DATA     = PATH_TO_DATA + "processed4/"


# Processed data
FILENAME_TRAIN_PROCESSED3_DATA_PARAQUET  = PATH_TO_PROCESSED3_DATA + "train_data.pq"
FILENAME_TEST_PROCESSED2_DATA_FEATHER    = PATH_TO_PROCESSED2_DATA + "test_data.f"
FILENAME_TEST_PROCESSED2_LGBM_DATA_FEATHER  = PATH_TO_PROCESSED2_DATA + "test_LGBM_data.f"

# Prediction files: the first is the final submission, the second is the
# intermediate file written by evaluate_test_data.
FILENAME_LGBM_SUBMISSION        = PATH_TO_PROCESSED2_DATA + "submission_lgbm.csv"
FILENAME_LGBM_NEW_CID_SUBMISSION        = PATH_TO_PROCESSED2_DATA + "submission_lgbm_new_cid.csv"

# Lookup table merged on 'customer_ID' when writing the final submission.
FILENAME_TEST_CID_OLD_NEW_MAP  = PATH_TO_PROCESSED2_DATA + "test_cid_old_new_map.f"


# Models
PATH_TO_MODEL   = MAIN_PATH + "models/"

#OOF
PATH_TO_OOF = PATH_TO_DATA + "oof/"


# ====================================================
# Configurations
# ====================================================
# Run-wide settings read by the cells below.
VERBOSE, SEED, NUM_FOLDS = 2, 42, 5

# Column names shared by every frame in this notebook.
TARGET_LABEL, CUSTOMER_ID_LABEL = 'target', "customer_ID"

# Non-zero runs the training cell at the bottom of the notebook.
TRAIN_MODEL = 1

# Raw categorical columns. They are aggregated with count/last/nunique, and
# their "<name>_last" aggregate is label encoded before modelling.
CAT_FEATURES = "B_30 B_38 D_114 D_116 D_117 D_120 D_126 D_63 D_64 D_66 D_68".split()

In [3]:
def process_features(data, data_agg, features, stats_list, num_features_at_a_time = 1):
    """Aggregate `features` per customer, a few columns at a time.

    The columns are processed in chunks so that only one chunk's aggregation
    is materialised at once; each chunk result is merged into the running
    aggregate.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a "customer_ID" column plus every column in `features`.
        It is only read, never modified.
    data_agg : pandas.DataFrame or None
        Aggregate to extend (indexed by customer_ID), or None to start fresh.
    features : list
        Column names to aggregate.
    stats_list : list
        Aggregations handed to `GroupBy.agg`, e.g. ['mean', 'last'].
    num_features_at_a_time : int
        Chunk size; must be at least 1.

    Returns
    -------
    pandas.DataFrame or None
        `data_agg` extended with (feature, statistic) columns for every
        feature, or `data_agg` unchanged when `features` is empty.

    Raises
    ------
    ValueError
        If `num_features_at_a_time` is smaller than 1 (the chunk loop could
        never advance).
    """
    if num_features_at_a_time < 1:
        raise ValueError(
            f'num_features_at_a_time must be >= 1, got {num_features_at_a_time}'
        )

    # Slicing clamps at the end of the list, so the last chunk may be shorter.
    for low_idx in range(0, len(features), num_features_at_a_time):
        f = features[low_idx:low_idx + num_features_at_a_time]
        utils.pt(f'Processing feature/s {f} ... ')
        f_e = data.groupby("customer_ID")[f].agg(stats_list)
        # The previous `data.drop(columns=f)` discarded its result: it never
        # shrank `data` but still copied the whole frame for every chunk, so
        # it has been removed.
        if data_agg is None:
            data_agg = f_e
        else:
            data_agg = data_agg.merge(f_e, how = 'inner', on = 'customer_ID')
        gc.collect()
    return data_agg

In [4]:
def preprocess_data(data):
    """Collapse a per-row feature frame into one row per customer.

    Categorical columns (CAT_FEATURES) are summarised with count / last /
    nunique; every other column except 'customer_ID' and 'S_2' is treated as
    numerical and summarised with mean / std / min / max / last. Aggregate
    columns are named "<feature>_<statistic>".

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing 'customer_ID', 'S_2' and the feature columns.

    Returns
    -------
    pandas.DataFrame
        'customer_ID' plus the numerical and categorical aggregates, inner
        joined on 'customer_ID'.

    Raises
    ------
    KeyError
        If 'customer_ID' or 'S_2' is missing from `data`.
    """
    utils.pt('Starting training feature engineer...')
    # Index.drop works on the column labels alone; DataFrame.drop would copy
    # the whole frame just to read the remaining column names.
    features = data.columns.drop(['customer_ID', 'S_2']).to_list()

    num_features = [col for col in features if col not in CAT_FEATURES]

    utils.pt('Processing categorical features ...')
    data_cat_agg = process_features(data, None, CAT_FEATURES, stats_list = ['count', 'last', 'nunique'], num_features_at_a_time = 3)
    utils.pt('Joining aggregate data ...')
    # Flatten the (feature, statistic) column pairs into "feature_statistic".
    data_cat_agg.columns = ['_'.join(x) for x in data_cat_agg.columns]
    data_cat_agg.reset_index(inplace = True)

    utils.pt('Processing numerical features ...')
    data_num_agg = process_features(data, None, num_features, stats_list = ['mean', 'std', 'min', 'max', 'last'], num_features_at_a_time = 10)
    utils.pt('Joining aggregate data ...')
    data_num_agg.columns = ['_'.join(x) for x in data_num_agg.columns]
    data_num_agg.reset_index(inplace = True)

    # The raw rows are not needed past this point.
    utils.gc_l([data])

    utils.pt('Merging numerical and categorical features ...')
    data = data_num_agg.merge(data_cat_agg, how = 'inner', on = 'customer_ID')
    utils.gc_l([data_num_agg, data_cat_agg])
    return data

In [5]:
# Set to 1 to run preprocess_data on the processed2 test feather file and
# cache the per-customer aggregates as the LGBM test feather file that
# evaluate_test_data reads. With 0 the cell does nothing.
PREPROCESS_TEST_DATA = 0

if PREPROCESS_TEST_DATA:
    test_df = pd.read_feather(FILENAME_TEST_PROCESSED2_DATA_FEATHER)
    # Rebinding test_df lets the raw frame be released once aggregation ends.
    test_df = preprocess_data(test_df)
    test_df.info(memory_usage="deep")
    test_df.to_feather(FILENAME_TEST_PROCESSED2_LGBM_DATA_FEATHER)

In [6]:
def evaluate_test_data(write):
    """Predict the cached test features with every saved fold model.

    The training frame is loaded only to fit the label encoders and to decide
    which float "last" columns receive a rounded copy, so that the test
    features are built the same way as in train_and_evaluate.

    Parameters
    ----------
    write : bool
        When truthy, the fold-averaged predictions are written to
        FILENAME_LGBM_NEW_CID_SUBMISSION.

    Returns
    -------
    list
        One `model.predict` result per fold, in fold order.
    """
    utils.pt('Reading train data')
    train_frame = pd.read_parquet(FILENAME_TRAIN_PROCESSED3_DATA_PARAQUET)

    utils.pt('Reading test data')
    test_frame = pd.read_feather(FILENAME_TEST_PROCESSED2_LGBM_DATA_FEATHER)

    utils.pt('Label encode categorical features')
    for raw_name in CAT_FEATURES:
        last_col = f"{raw_name}_last"
        label_encoder = LabelEncoder()
        # Fit on train, then reuse the same mapping for test.
        train_frame[last_col] = label_encoder.fit_transform(train_frame[last_col])
        test_frame[last_col] = label_encoder.transform(test_frame[last_col])

    utils.pt('Round last float features to 2 decimal place')
    # Column choice is driven by the *train* dtypes, evaluated after the
    # label encoding above.
    float_last_cols = [
        col
        for col, dtype in train_frame.dtypes.items()
        if (dtype == 'float32' or dtype == 'float64') and 'last' in col
    ]
    for col in float_last_cols:
        test_frame[col + '_round2'] = test_frame[col].round(2)

    # Everything except the id and the target is a model input.
    excluded = [CUSTOMER_ID_LABEL, TARGET_LABEL]
    features = [col for col in test_frame.columns if col not in excluded]

    utils.gc_l([train_frame])

    fold_predictions = []
    prediction_sum = np.zeros(len(test_frame))

    for fold in range(NUM_FOLDS):
        utils.pt(f'Predicting for test data using LGBM model for fold {fold}')
        # joblib.load unpickles the file; only load model files you trust.
        fold_model = joblib.load(f'{PATH_TO_MODEL}lgbm_fold{fold}_seed{SEED}.pkl')

        fold_pred = fold_model.predict(test_frame[features])
        prediction_sum += fold_pred
        fold_predictions.append(fold_pred)

    mean_prediction = prediction_sum / NUM_FOLDS
    # The frame is local; callers only receive the per-fold predictions.
    submission = pd.DataFrame({'customer_ID': test_frame['customer_ID'], 'prediction': mean_prediction})
    if write:
        submission.to_csv(FILENAME_LGBM_NEW_CID_SUBMISSION, index = False)

    return fold_predictions

# Set to 1 to score the cached test features with the saved fold models.
EVALUATE_TEST_DATA = 0

if EVALUATE_TEST_DATA:
    test_preds_list = evaluate_test_data(write=False)
    # evaluate_test_data returns only the per-fold predictions; the frame it
    # builds is local to the function. Summarise what was actually returned
    # rather than a `test_df` global that this cell never creates (that
    # raised NameError on a fresh kernel, or showed a stale frame).
    pd.DataFrame(
        {f'fold_{fold}': preds for fold, preds in enumerate(test_preds_list)}
    ).info()
    
    

In [7]:
# Set to 1 to turn the intermediate prediction CSV into the final submission:
# the 'customer_ID' values are swapped for the 'customer_ID_orig' values found
# in the id map file.
WRITE_SUBMISSION_FILE = 0
if WRITE_SUBMISSION_FILE:
    test_df = pd.read_csv(FILENAME_LGBM_NEW_CID_SUBMISSION)
    test_cids = pd.read_feather(FILENAME_TEST_CID_OLD_NEW_MAP)
    test_df = (
        test_df
        .merge(test_cids, how = 'inner', on = 'customer_ID')
        .drop(columns = ['customer_ID'])
        .rename(columns = {'customer_ID_orig': 'customer_ID'})
        [['customer_ID', 'prediction']]
    )
    test_df.to_csv(FILENAME_LGBM_SUBMISSION, index = False)


In [8]:
# ====================================================
# Seed everything
# ====================================================
def seed_everything(seed):
    """Seed the global `random` and NumPy generators and export PYTHONHASHSEED.

    Parameters
    ----------
    seed : int
        Value handed to both generators and stored, as text, in the
        PYTHONHASHSEED environment variable.
    """
    for seed_rng in (random.seed, np.random.seed):
        seed_rng(seed)
    os.environ['PYTHONHASHSEED'] = f'{seed}'

# ====================================================
# Read data
# ====================================================
def read_data():
    """Load the training frame from the processed3 parquet file.

    Returns
    -------
    tuple
        `(train, test)` where `train` is the loaded frame and `test` is an
        empty list: no test frame is read here, the slot only keeps the
        two-value return shape.
    """
    train_frame = pd.read_parquet(FILENAME_TRAIN_PROCESSED3_DATA_PARAQUET)
    return train_frame, []


def extract_X_Y(FE_data, features, cids):
    """Select the rows of `FE_data` for the given customers and split X / Y.

    Parameters
    ----------
    FE_data : pandas.DataFrame
        Feature frame holding CUSTOMER_ID_LABEL, TARGET_LABEL and every
        column named in `features`.
    features : list
        Columns to return as model inputs.
    cids : pandas.DataFrame
        Frame whose CUSTOMER_ID_LABEL column lists the customers to keep.

    Returns
    -------
    tuple
        `(X, Y)`: the `features` columns and the target column of the
        selected rows, in `FE_data` order and renumbered from 0.
    """
    keep_rows = FE_data[CUSTOMER_ID_LABEL].isin(cids[CUSTOMER_ID_LABEL].values)
    # drop=True renumbers the rows without materialising the old index as an
    # extra "index" column; the plain reset_index() added that stray column
    # and raised when FE_data already had a column of that name.
    data = FE_data.loc[keep_rows].reset_index(drop = True)

    Y = data[TARGET_LABEL]
    X = data[features]

    return (X,Y)

def extract_val_train_data(fold, train_FE_data, features):
    """Build the training and validation splits of one fold.

    The customer IDs of each split are read from the per-fold feather files
    under PATH_TO_PROCESSED4_DATA and used to select rows of `train_FE_data`.

    Returns
    -------
    tuple
        `(X_train, Y_train, X_val, Y_val)`.
    """
    cid_paths = (
        f'{PATH_TO_PROCESSED4_DATA}/train_{CUSTOMER_ID_LABEL}_fold_{fold}.f',
        f'{PATH_TO_PROCESSED4_DATA}/val_{CUSTOMER_ID_LABEL}_fold_{fold}.f',
    )
    # Both id files are read before any rows are selected.
    fit_cids, holdout_cids = [pd.read_feather(path) for path in cid_paths]

    fit_xy = extract_X_Y(train_FE_data, features, fit_cids)
    holdout_xy = extract_X_Y(train_FE_data, features, holdout_cids)

    return (*fit_xy, *holdout_xy)

def extract_test_data(train_FE_data, features):
    """Return `(X, Y)` for the customers listed in the test id feather file.

    The rows are taken from `train_FE_data`, selected by the customer IDs
    stored under PATH_TO_PROCESSED4_DATA.
    """
    cids_path = f'{PATH_TO_PROCESSED4_DATA}/test_{CUSTOMER_ID_LABEL}.f'
    listed_cids = pd.read_feather(cids_path)
    X_part, Y_part = extract_X_Y(train_FE_data, features, listed_cids)
    return (X_part, Y_part)


# ====================================================
# Train & Evaluate
# ====================================================
def train_and_evaluate(train, test, folds = (0,)):
    """Train a LightGBM DART model per fold and report validation/test scores.

    For every requested fold the model is fitted on that fold's training
    customers, saved under PATH_TO_MODEL, and scored with utils.amex_metric on
    the fold's validation customers and on the customers listed in the test
    id file (both are subsets of `train`).

    Parameters
    ----------
    train : pandas.DataFrame
        Per-customer feature frame including CUSTOMER_ID_LABEL and
        TARGET_LABEL. It is modified in place: the categorical "<name>_last"
        columns are label encoded and a "<col>_round2" column is added for
        every float column whose name contains 'last'.
    test : object
        Not used; kept so existing calls keep working.
    folds : iterable of int
        Fold numbers to train. Defaults to fold 0 only, as before. Pass
        `range(NUM_FOLDS)` to produce every model that evaluate_test_data
        loads.

    Returns
    -------
    None
    """
    # Label encode categorical features
    cat_features = [f"{cf}_last" for cf in CAT_FEATURES]
    for cat_col in cat_features:
        encoder = LabelEncoder()
        train[cat_col] = encoder.fit_transform(train[cat_col])

    # Round last float features to 2 decimal place.
    # `train` is mutated in place, so on a second call it already contains the
    # "*_round2" columns. They also contain 'last' and must be skipped,
    # otherwise every re-run would add "*_round2_round2" features.
    float_cols = train.dtypes[(train.dtypes == 'float32') | (train.dtypes == 'float64')].index
    num_cols = [col for col in float_cols if 'last' in col and not col.endswith('_round2')]
    for col in num_cols:
        train[col + '_round2'] = train[col].round(2)

    # Get feature list
    features = [col for col in train.columns if col not in [CUSTOMER_ID_LABEL, TARGET_LABEL]]
    params = {
        'objective': 'binary',
        'metric': "binary_logloss",
        'boosting': 'dart',
        'seed': SEED,
        'num_leaves': 100,
        'learning_rate': 0.01,
        'feature_fraction': 0.20,
        'bagging_freq': 10,
        'bagging_fraction': 0.50,
        'n_jobs': -1,
        'lambda_l2': 2,
        'min_data_in_leaf': 40,
        'saved_feature_importance_type' : 1
        }

    # The test customers are the same for every fold; extract them once.
    (x_test, y_test) = extract_test_data(train, features)

    for fold in folds:
        print(' ')
        print('-'*50)
        utils.pt(f'Training fold {fold} with {len(features)} features...')
        (x_train, y_train, x_val, y_val) = extract_val_train_data(fold, train, features)
        lgb_train = lgb.Dataset(x_train, y_train, categorical_feature = cat_features)
        lgb_valid = lgb.Dataset(x_val, y_val, categorical_feature = cat_features)
        model = lgb.train(
            params = params,
            train_set = lgb_train,
            num_boost_round = 10500,
            valid_sets = [lgb_train, lgb_valid],
            feval = utils.lgb_amex_metric,
            # Callback form of the deprecated `early_stopping_rounds = 100`
            # and `verbose_eval = 500` keyword arguments.
            # NOTE(review): LightGBM reports early stopping as unavailable in
            # dart mode, so the first callback is presumably a no-op with
            # these params - confirm before relying on a "best" iteration.
            callbacks = [
                lgb.early_stopping(stopping_rounds = 100),
                lgb.log_evaluation(period = 500),
            ],
            )
        # Persist this fold's model; evaluate_test_data loads it by this name.
        joblib.dump(model, f'{PATH_TO_MODEL}lgbm_fold{fold}_seed{SEED}.pkl')

        # Score the fold on its validation customers
        val_pred = model.predict(x_val)
        score = utils.amex_metric(y_val, val_pred)
        utils.pt(f'Our fold {fold} CV score is {score}')

        # Score the fold on the test customers
        test_pred = model.predict(x_test)
        score = utils.amex_metric(y_test, test_pred)
        utils.pt(f'Our fold {fold} Test score is {score}')

        utils.gc_l([x_train, x_val, y_train, y_val, lgb_train, lgb_valid])

    # Out-of-fold and averaged test predictions are not collected here; the
    # scores above are only logged.


In [9]:

# Training entry point, switched by TRAIN_MODEL from the configuration cell.
# `train` and `test` are deliberately left as notebook globals: the NaN
# summary cell further down reads `train` after train_and_evaluate has
# modified it in place.
if TRAIN_MODEL:
    seed_everything(SEED)
    train, test = read_data()
    train_and_evaluate(train, test)

 
--------------------------------------------------
2022-08-30 13:16:32.578175 : Training fold 0 with 1094 features...
[LightGBM] [Info] Number of positive: 90309, number of negative: 258464
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 241147
[LightGBM] [Info] Number of data points in the train set: 348773, number of used features: 1093
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258933 -> initscore=-1.051519
[LightGBM] [Info] Start training from score -1.051519
[500]	training's binary_logloss: 0.337449	training's amex_metric: 0.777646	valid_1's binary_logloss: 0.338931	valid_1's amex_metric: 0.771833
[1000]	training's binary_logloss: 0.246641	training's amex_metric: 0.794261	valid_1's binary_logloss: 0.251325	valid_1's amex_metric: 0.780651
[1500]	training's binary_logloss: 0.222451	training's amex_metric: 0.809052	valid_1's binary_logloss: 0.231105	valid_1's amex_metric: 0.787661
[2000]	training's binary_logloss: 0.20861	training's ame

In [34]:
# Columns of the global `train` that contain missing values, as
# (column, missing count) pairs sorted by count, largest first. `train` is
# the frame from the training cell (the recorded output lists the "_round2"
# columns added by train_and_evaluate), so that cell has to run first.
# One vectorised isna() pass replaces scanning every column twice in a loop.
nan_counts = train.isna().sum()
nan_cols = [(col, count) for col, count in nan_counts[nan_counts > 0].items()]
nan_cols.sort(key = (lambda x: x[1]), reverse = True)
print((nan_cols))

[('D_87_std', 458441), ('D_87_mean', 458084), ('D_87_min', 458084), ('D_87_max', 458084), ('D_87_last', 458084), ('D_87_last_round2', 458084), ('D_88_std', 456929), ('D_88_mean', 456494), ('D_88_min', 456494), ('D_88_max', 456494), ('D_88_last', 456494), ('D_88_last_round2', 456494), ('D_110_std', 454994), ('D_111_std', 454994), ('B_39_std', 454964), ('D_110_mean', 454826), ('D_110_min', 454826), ('D_110_max', 454826), ('D_110_last', 454826), ('D_111_mean', 454826), ('D_111_min', 454826), ('D_111_max', 454826), ('D_111_last', 454826), ('D_110_last_round2', 454826), ('D_111_last_round2', 454826), ('B_39_mean', 454807), ('B_39_min', 454807), ('B_39_max', 454807), ('B_39_last', 454807), ('B_39_last_round2', 454807), ('D_108_std', 452822), ('B_42_std', 452592), ('B_42_mean', 452404), ('B_42_min', 452404), ('B_42_max', 452404), ('B_42_last', 452404), ('B_42_last_round2', 452404), ('D_73_std', 451403), ('D_73_mean', 450905), ('D_73_min', 450905), ('D_73_max', 450905), ('D_73_last', 450905), 