In [1]:
from google.colab import files
import pandas as pd

In [2]:
! pip install -q kaggle

In [3]:
# Bind the return value to a name so the notebook does not echo it as the cell
# result: the returned dict holds the raw bytes of kaggle.json, i.e. the API key.
uploaded = files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"nanabi","key":"<REDACTED>"}'}

In [4]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
! kaggle datasets list

ref                                                       title                                               size  lastUpdated          downloadCount  voteCount  usabilityRating  
--------------------------------------------------------  -------------------------------------------------  -----  -------------------  -------------  ---------  ---------------  
ruchi798/data-science-job-salaries                        Data Science Job Salaries                            7KB  2022-06-15 08:59:12           6465        219  1.0              
surajjha101/bigbasket-entire-product-list-28k-datapoints  BigBasket Entire Product List (~28K datapoints)      6MB  2022-06-22 12:51:18           2062         91  1.0              
victorsoeiro/netflix-tv-shows-and-movies                  Netflix TV Shows and Movies                          2MB  2022-05-15 00:01:23          16804        487  1.0              
sameepvani/nasa-nearest-earth-objects                     NASA - Nearest Earth Objects         

In [None]:
!kaggle competitions list

ref                                            deadline             category            reward  teamCount  userHasEntered  
---------------------------------------------  -------------------  ---------------  ---------  ---------  --------------  
contradictory-my-dear-watson                   2030-07-01 23:59:00  Getting Started     Prizes         51           False  
gan-getting-started                            2030-07-01 23:59:00  Getting Started     Prizes         99           False  
store-sales-time-series-forecasting            2030-06-30 23:59:00  Getting Started  Knowledge        622           False  
tpu-getting-started                            2030-06-03 23:59:00  Getting Started  Knowledge        124           False  
digit-recognizer                               2030-01-01 00:00:00  Getting Started  Knowledge       1245           False  
titanic                                        2030-01-01 00:00:00  Getting Started  Knowledge      15130           False  
house-pr

# Download data from kaggle

In [None]:
!kaggle competitions download -c amex-default-prediction 

Downloading amex-default-prediction.zip to /content
100% 20.5G/20.5G [02:00<00:00, 206MB/s]
100% 20.5G/20.5G [02:00<00:00, 182MB/s]


In [None]:
!kaggle datasets download -d raddar/amex-data-integer-dtypes-parquet-format

Downloading amex-data-integer-dtypes-parquet-format.zip to /content
100% 4.05G/4.07G [00:17<00:00, 152MB/s]
100% 4.07G/4.07G [00:17<00:00, 251MB/s]


In [None]:
!unzip '/content/amex-default-prediction.zip'

Archive:  /content/amex-default-prediction.zip
  inflating: sample_submission.csv   
  inflating: test_data.csv           
  inflating: train_data.csv          
  inflating: train_labels.csv        


In [None]:
!unzip '/content/amex-data-integer-dtypes-parquet-format.zip'

Archive:  /content/amex-data-integer-dtypes-parquet-format.zip
  inflating: test.parquet            
  inflating: train.parquet           


In [None]:
import psutil
psutil.virtual_memory()

# Submit template

In [None]:
!kaggle competitions submit -c amex-default-prediction -f /content/sample_submission.csv -m test_submission1

100% 59.1M/59.1M [00:00<00:00, 69.7MB/s]
Successfully submitted to American Express - Default Prediction

# EDA

In [1]:
# ====================================================
# Library
# ====================================================
import gc
import warnings
warnings.filterwarnings('ignore')
import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
import itertools
import pyarrow.parquet as pq
from scipy import stats

## Check stable columns

### Characteristic Stability Index (CSI)

The Characteristic Stability Index (CSI) is used to evaluate the stability or drift of each feature so that we can find the problematic ones. Whereas the PSI is concerned with the effects of population drift on the model’s predictions, the CSI is concerned with understanding how the feature distributions have changed.

- CSI < 0.1 = The characteristic hasn’t changed, and we can use it to train the model
- 0.1 ≤ CSI < 0.2 = The characteristic has slightly changed, and it is advisable to evaluate the impacts of these changes
- CSI ≥ 0.2 = The changes in the characteristic are significant, and the characteristic should not be used in the model.



In [None]:
def ks_test(train_df=None, test_df=None):
    """Run a two-sample Kolmogorov-Smirnov test on every non-categorical feature.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame, optional
        Frames to compare column by column. When omitted, the module-level
        ``train`` and ``test`` frames are used (the original behaviour).

    Returns
    -------
    pandas.DataFrame
        One row per tested column with columns ``name``, ``ks`` (the KS
        statistic) and ``pvalue``, in the column order of ``train_df``.
    """
    cat_features = [
        "B_30",
        "B_38",
        "D_114",
        "D_116",
        "D_117",
        "D_120",
        "D_126",
        "D_63",
        "D_64",
        "D_66",
        "D_68",
    ]
    binary_features = ['B_31', 'D_87']
    # 'customer_ID' and 'S_2' are not features (features_info() drops the same
    # two columns), so they must not be KS-tested alongside the numeric columns.
    non_features = ['customer_ID', 'S_2']
    excluded = set(cat_features + binary_features + non_features)

    if train_df is None:
        train_df = train
    if test_df is None:
        test_df = test

    # Walk the columns in frame order rather than through a set difference so
    # the row order of the result is reproducible between runs.
    num_features = [col for col in train_df.columns if col not in excluded]
    # num_features = list(train.dtypes[(train.dtypes == 'float32') | (train.dtypes == 'float64')].index)

    statistic = []
    pvalue = []
    # NOTE(review): missing values are handed to ks_2samp unchanged; confirm the
    # installed SciPy treats NaN the way this analysis expects.
    for name in num_features:
        statistic_, pvalue_ = stats.ks_2samp(train_df[name], test_df[name])
        statistic.append(statistic_)
        pvalue.append(pvalue_)
    return pd.DataFrame({'name': num_features, 'ks': statistic, 'pvalue': pvalue})

def psi(var):
    """Population Stability Index of column ``var`` between ``train`` and ``test``.

    Reads the module-level ``train`` and ``test`` frames. Only values of
    ``var`` that occur in both frames contribute, because the per-value counts
    are inner-joined.

    Returns the sum over shared values of
    ``(p_train - p_test) * ln(p_train / p_test)``.
    """
    # Row count per distinct value; .size() gives an unnamed column labelled 0.
    train_counts = train.groupby(var).size().to_frame().reset_index()
    test_counts = test.groupby(var).size().to_frame().reset_index()

    # After the merge the two clashing count columns are '0_x' (train) and
    # '0_y' (test).
    counts = train_counts.merge(test_counts, how='inner', on=var)

    share_train = counts['0_x'] / sum(counts['0_x'])
    share_test = counts['0_y'] / sum(counts['0_y'])
    contributions = (share_train - share_test) * np.log(share_train / share_test)
    return sum(contributions)

def psi_test():
    """Compute the PSI of every categorical and binary feature.

    Calls ``psi(name)`` once per feature.

    Returns
    -------
    pandas.DataFrame
        One row per feature with columns ``name`` and ``psi``.
    """
    cat_features = [
        "B_30",
        "B_38",
        "D_114",
        "D_116",
        "D_117",
        "D_120",
        "D_126",
        "D_63",
        "D_64",
        "D_66",
        "D_68",
    ]
    binary_features = ['B_31', 'D_87']
    cat_features = cat_features + binary_features
    # Collect one PSI value per feature. The previous loop overwrote the result
    # list with a scalar and then appended an undefined name, so it always raised.
    psi_out = [psi(name) for name in cat_features]
    return pd.DataFrame({'name': cat_features, 'psi': psi_out})

In [None]:
train = pd.read_parquet('/content/drive/MyDrive/amex/train.parquet')
test = pd.read_parquet('/content/drive/MyDrive/amex/test.parquet')

In [None]:
psi_table = psi_test().sort_values('psi')

In [None]:
ks_table = ks_test().sort_values('ks')

In [None]:
ks_table

Unnamed: 0,name,ks,pvalue
102,R_18,2e-05,1.0
118,R_24,2.5e-05,1.0
92,R_13,8.7e-05,1.0
103,D_88,9.4e-05,1.0
162,R_28,0.000139,0.9999997
80,R_8,0.000216,0.9950105
34,R_4,0.000228,0.9902162
108,R_20,0.000245,0.9784611
101,R_17,0.000257,0.9665147
113,R_23,0.000269,0.9498369


# Feature engineering

In [2]:
def features_info():
    """Describe the feature columns of the raw training parquet file.

    Only the parquet schema is read, not the data.

    Returns
    -------
    tuple of four lists
        ``features``      every schema column except 'customer_ID' and 'S_2';
        ``cat_features``  the categorical columns followed by the binary ones;
        ``num_features``  the members of ``features`` not in ``cat_features``;
        ``rm_var_by_ks``  the fixed list of columns callers drop from the
                          numeric set (named after the KS check).
    """
    categorical = [
        "B_30", "B_38", "D_114", "D_116", "D_117", "D_120",
        "D_126", "D_63", "D_64", "D_66", "D_68",
    ]
    binary = ['B_31', 'D_87']
    cat_features = categorical + binary

    schema = pq.read_schema('/content/drive/MyDrive/amex/train.parquet', memory_map=True)

    non_feature_cols = ('customer_ID', 'S_2')
    features = []
    num_features = []
    # Single pass over the schema, keeping its column order in both lists.
    for col in schema.names:
        if col in non_feature_cols:
            continue
        features.append(col)
        if col not in cat_features:
            num_features.append(col)

    rm_var_by_ks = ['R_1', 'S_11', 'D_59', 'B_29', 'S_9']
    return features, cat_features, num_features, rm_var_by_ks


def preprocess_train_data(nlag):
    """Build the per-customer training feature table.

    Parameters
    ----------
    nlag : int
        Number of trailing rows per customer fed into the numeric aggregations.

    Returns
    -------
    pandas.DataFrame
        One row per customer_ID with '<feature>_<agg>' columns for the numeric
        features (mean, std, max, min, last over the last ``nlag`` rows), the
        categorical features (count, last, nunique over all rows) and the
        columns of train_labels.csv.
    """
    features, cat_features, num_features, rm_var_by_ks = features_info()
    # Filter with a comprehension instead of a set difference so the column
    # order is the same on every run.
    num_features = [col for col in num_features if col not in rm_var_by_ks]
    print('Starting training feature engineer...')
    train = pd.read_parquet('/content/drive/MyDrive/amex/train.parquet')
    # assumes rows are already in chronological order within each customer, so
    # tail(nlag) picks the most recent statements — TODO confirm
    # The aggregations are passed as a list, not a set: a set has no defined
    # iteration order, so the resulting column order could change between runs
    # and disagree with the test pipeline.
    train_num_agg = train.groupby("customer_ID").tail(nlag).groupby(
        "customer_ID")[num_features].agg(['mean', 'std', 'max', 'min', 'last'])
    train_num_agg.columns = ['_'.join(x) for x in train_num_agg.columns]
    train_num_agg = train_num_agg.reset_index()
    # Categorical aggregates use every row of the customer, not only the tail.
    train_cat_agg = train.groupby("customer_ID")[cat_features].agg(
        ['count', 'last', 'nunique'])
    train_cat_agg.columns = ['_'.join(x) for x in train_cat_agg.columns]
    train_cat_agg = train_cat_agg.reset_index()
    train_labels = pd.read_csv('/content/drive/MyDrive/amex/train_labels.csv')
    train = train_num_agg.merge(train_cat_agg, how='inner',
                                on='customer_ID').merge(train_labels,
                                                        how='inner',
                                                        on='customer_ID')
    gc.collect()
    print('Finished train feature engineer!')
    return train


def preprocess_test_data(nlag=3):
    """Build the per-customer test feature table.

    Parameters
    ----------
    nlag : int, default 3
        Number of trailing rows per customer fed into the numeric aggregations.
        The default keeps the previously hard-coded window; pass the same value
        that was given to ``preprocess_train_data`` so train and test features
        are computed over the same window.

    Returns
    -------
    pandas.DataFrame
        One row per customer_ID with '<feature>_<agg>' columns for the numeric
        features (mean, std, max, min, last over the last ``nlag`` rows) and the
        categorical features (count, last, nunique over all rows).
    """
    features, cat_features, num_features, rm_var_by_ks = features_info()
    # Filter with a comprehension instead of a set difference so the column
    # order is the same on every run.
    num_features = [col for col in num_features if col not in rm_var_by_ks]
    gc.collect()
    test = pd.read_parquet('/content/drive/MyDrive/amex/test.parquet')
    print('Starting test feature engineer...')
    # assumes rows are already in chronological order within each customer, so
    # tail(nlag) picks the most recent statements — TODO confirm
    test_num_agg = test.groupby("customer_ID").tail(nlag).groupby(
        "customer_ID")[num_features].agg(['mean', 'std', 'max', 'min', 'last'])
    test_num_agg.columns = ['_'.join(x) for x in test_num_agg.columns]
    test_num_agg = test_num_agg.reset_index()
    # Categorical aggregates use every row of the customer, not only the tail.
    test_cat_agg = test.groupby("customer_ID")[cat_features].agg(
        ['count', 'last', 'nunique'])
    test_cat_agg.columns = ['_'.join(x) for x in test_cat_agg.columns]
    test_cat_agg = test_cat_agg.reset_index()
    test = test_num_agg.merge(test_cat_agg, how='inner', on='customer_ID')
    gc.collect()
    print('Finished test feature engineer!')
    return test

In [None]:
preprocess_train_data(nlag = 13).to_parquet('/content/drive/MyDrive/amex/train_data_fe.parquet')

Starting training feature engineer...
Finished train feature engineer!


In [None]:
preprocess_train_data(nlag = 6).to_parquet('/content/drive/MyDrive/amex/train_data_lag6_fe.parquet')

Starting training feature engineer...
Finished train feature engineer!


In [None]:
preprocess_test_data().to_parquet('/content/drive/MyDrive/amex/test_data_fe.parquet')

Starting test feature engineer...
Finished test feature engineer!


In [19]:
!pip install lightgbm==3.3.1 --install-option=--gpu # 2.2.3

  cmdoptions.check_install_build_global(options)
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting lightgbm==3.3.1
  Using cached lightgbm-3.3.1.tar.gz (1.5 MB)
Skipping wheel build for lightgbm, due to binaries being disabled for it.
Installing collected packages: lightgbm
  Attempting uninstall: lightgbm
    Found existing installation: lightgbm 3.3.2
    Uninstalling lightgbm-3.3.2:
      Successfully uninstalled lightgbm-3.3.2
    Running setup.py install for lightgbm ... [?25l[?25hdone
Successfully installed lightgbm-3.3.1


# Modelling

In [None]:
# ====================================================
# Library
# ====================================================
import os
import gc
import warnings
warnings.filterwarnings('ignore')
import random
import scipy as sp
import numpy as np
import pandas as pd
import joblib
import itertools
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from itertools import combinations

# ====================================================
# Configurations
# ====================================================
class CFG:
    # --- data location and target column ---
    input_dir = '/content/drive/MyDrive/amex/'  # prefix for engineered parquet inputs
    target = 'target'                           # label column name

    # --- cross-validation ---
    n_folds = 5   # number of StratifiedKFold splits
    seed = 108    # shared by the RNG seeding, the fold split and LightGBM

    # --- LightGBM ---
    boosting_type = 'dart'     # passed as the 'boosting' parameter
    metric = 'binary_logloss'  # passed as the 'metric' parameter

# ====================================================
# Seed everything
# ====================================================
def seed_everything(seed):
    """Seed Python's and NumPy's global RNGs and export PYTHONHASHSEED.

    NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so setting it
    here presumably only affects child processes — confirm if hash ordering
    matters for reproducibility.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

# ====================================================
# Read data
# ====================================================
def read_data():
    """Load the engineered train and test tables from ``CFG.input_dir``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)`` read from 'train_data_fe.parquet' and
        'test_data_fe.parquet'.
    """
    train_path = f'{CFG.input_dir}train_data_fe.parquet'
    test_path = f'{CFG.input_dir}test_data_fe.parquet'
    return pd.read_parquet(train_path), pd.read_parquet(test_path)

# ====================================================
# Amex metric
# ====================================================
def amex_metric(y_true, y_pred):
    """Competition score: mean of the normalised weighted Gini and the top-4% capture rate.

    Rows whose label is 0 carry weight 20, all other rows weight 1.

    Parameters
    ----------
    y_true : array-like of labels (0 marks the down-weighted class).
    y_pred : array-like of predicted scores, same length as ``y_true``.

    Returns
    -------
    float
        ``0.5 * (gini_by_prediction / gini_by_label + top_four_capture)``.
    """
    def _ordered_desc(sort_col):
        # Stack labels (column 0) and predictions (column 1), then order the
        # rows by the requested column, largest first.
        stacked = np.transpose(np.array([y_true, y_pred]))
        return stacked[stacked[:, sort_col].argsort()[::-1]]

    def _weighted_gini(sort_col):
        # Weighted area between the Lorentz curve of captured positives and the
        # cumulative weight share, for the given ordering.
        ordered = _ordered_desc(sort_col)
        w = np.where(ordered[:, 0] == 0, 20, 1)
        random_curve = np.cumsum(w / np.sum(w))
        weighted_pos = ordered[:, 0] * w
        lorentz = np.cumsum(weighted_pos) / np.sum(weighted_pos)
        return np.sum((lorentz - random_curve) * w)

    # Share of all positives found in the top 4% of cumulative weight when rows
    # are ranked by prediction.
    by_pred = _ordered_desc(1)
    w_pred = np.where(by_pred[:, 0] == 0, 20, 1)
    cutoff = int(0.04 * np.sum(w_pred))
    captured = by_pred[np.cumsum(w_pred) <= cutoff]
    top_four = np.sum(captured[:, 0]) / np.sum(by_pred[:, 0])

    gini_model = _weighted_gini(1)    # ranked by prediction
    gini_perfect = _weighted_gini(0)  # ranked by the true label
    return 0.5 * (gini_model / gini_perfect + top_four)

# ====================================================
# LGBM amex metric
# Output include:
# - eval_name
# - eval_result
# - is_higher_better
# ====================================================
def lgb_amex_metric(y_pred, y_true):
    """Adapter that exposes ``amex_metric`` as a LightGBM ``feval`` callback.

    ``y_true`` is the dataset object handed over by LightGBM; its labels are
    pulled with ``get_label()``. Returns the
    ``(eval_name, eval_result, is_higher_better)`` triple.
    """
    true_labels = y_true.get_label()
    score = amex_metric(true_labels, y_pred)
    return ('amex_metric', score, True)

# ====================================================
# Train & Evaluate
# ====================================================
def train_and_evaluate(train, test):
    """Add derived features, train one LightGBM model per CV fold and save the results.

    Both frames are modified in place (label-encoded categoricals, rounded and
    difference columns, float16 downcast).

    Parameters
    ----------
    train : pandas.DataFrame
        Engineered training table containing 'customer_ID', the
        ``CFG.target`` column and the '<feature>_<agg>' columns.
    test : pandas.DataFrame
        Engineered test table with 'customer_ID' and the same feature columns.

    Side effects
    ------------
    Writes, under ``CFG.input_dir``: one pickled model per fold in 'Models/',
    the out-of-fold predictions CSV in 'OOF/' and the fold-averaged test
    predictions CSV in 'Predictions/'. Prints per-fold and overall scores.
    """
    # Only the categorical feature names are needed from features_info().
    _, cat_features, _, _ = features_info()

    # Label encode the '<name>_last' column of every categorical feature.
    cat_features = [f"{cf}_last" for cf in cat_features]

    for cat_col in cat_features:
        encoder = LabelEncoder()
        train[cat_col] = encoder.fit_transform(train[cat_col])
        # transform() raises on a category that never appeared in train.
        test[cat_col] = encoder.transform(test[cat_col])
    # Round last float features to 2 decimal place
    num_cols = list(train.dtypes[(train.dtypes == 'float32') | (train.dtypes == 'float64')].index)
    num_cols = [col for col in num_cols if 'last' in col]
    for col in num_cols:
        train[col + '_round2'] = train[col].round(2)
        test[col + '_round2'] = test[col].round(2)
    # Get the difference between last and mean
    num_cols = [col for col in train.columns if 'last' in col]
    # Strip the trailing '_last' to recover the base feature name.
    num_cols = [col[:-5] for col in num_cols if 'round' not in col]
    for col in num_cols:
        last_col, mean_col = f'{col}_last', f'{col}_mean'
        # Categorical features have no '<name>_mean' column; skip them
        # explicitly rather than swallowing every exception with a bare except.
        if last_col not in train.columns or mean_col not in train.columns:
            continue
        train[f'{col}_last_mean_diff'] = train[last_col] - train[mean_col]
        test[f'{col}_last_mean_diff'] = test[last_col] - test[mean_col]
    # Transform float64 and float32 to float16
    num_cols = list(train.dtypes[(train.dtypes == 'float32') | (train.dtypes == 'float64')].index)
    for col in tqdm(num_cols):
        train[col] = train[col].astype(np.float16)
        test[col] = test[col].astype(np.float16)
    # Get feature list
    features = [col for col in train.columns if col not in ['customer_ID', CFG.target]]
    params = {
        'objective': 'binary',
        'metric': CFG.metric,
        'boosting': CFG.boosting_type,
        'seed': CFG.seed,
        'num_leaves': 108,
        'learning_rate': 0.0158,
        'feature_fraction': 0.22,
        'bagging_freq': 10,
        'bagging_fraction': 0.50,
        'n_jobs': -1,
        'lambda_l2': 2,
        'min_data_in_leaf': 42,
        'max_bin':63,
        'device' : 'gpu'
        }
    # Create a numpy array to store test predictions
    test_predictions = np.zeros(len(test))
    # Create a numpy array to store out of folds predictions
    oof_predictions = np.zeros(len(train))
    kfold = StratifiedKFold(n_splits = CFG.n_folds, shuffle = True, random_state = CFG.seed)
    for fold, (trn_ind, val_ind) in enumerate(kfold.split(train, train[CFG.target])):
        print(' ')
        print('-'*50)
        print(f'Training fold {fold} with {len(features)} features...')
        x_train, x_val = train[features].iloc[trn_ind], train[features].iloc[val_ind]
        y_train, y_val = train[CFG.target].iloc[trn_ind], train[CFG.target].iloc[val_ind]
        lgb_train = lgb.Dataset(x_train, y_train, categorical_feature = cat_features)
        lgb_valid = lgb.Dataset(x_val, y_val, categorical_feature = cat_features)
        model = lgb.train(
            params = params,
            train_set = lgb_train,
            num_boost_round = 9500,
            valid_sets = [lgb_train, lgb_valid],
            early_stopping_rounds = 1500,
            verbose_eval = 500,
            feval = lgb_amex_metric
            )

        # Save the fold model.
        # NOTE(review): with boosting='dart' LightGBM reportedly disables early
        # stopping, so this is presumably the model after all boosting rounds,
        # not a "best iteration" model — confirm against the training log.
        joblib.dump(model, f'{CFG.input_dir}Models/lgbm_{CFG.boosting_type}_fold{fold}_seed{CFG.seed}.pkl')
        # Predict validation
        val_pred = model.predict(x_val)
        # Add to out of folds array
        oof_predictions[val_ind] = val_pred
        # Predict the test set
        test_pred = model.predict(test[features])
        test_predictions += test_pred / CFG.n_folds
        # Compute fold metric
        score = amex_metric(y_val, val_pred)
        print(f'Our fold {fold} CV score is {score}')
        del x_train, x_val, y_train, y_val, lgb_train, lgb_valid
        gc.collect()
    # Compute out of folds metric
    score = amex_metric(train[CFG.target], oof_predictions)
    print(f'Our out of folds CV score is {score}')
    # Create a dataframe to store out of folds predictions
    oof_df = pd.DataFrame({'customer_ID': train['customer_ID'], 'target': train[CFG.target], 'prediction': oof_predictions})
    oof_df.to_csv(f'{CFG.input_dir}OOF/oof_lgbm_{CFG.boosting_type}_baseline_{CFG.n_folds}fold_seed{CFG.seed}.csv', index = False)
    # Create a dataframe to store test prediction
    test_df = pd.DataFrame({'customer_ID': test['customer_ID'], 'prediction': test_predictions})
    test_df.to_csv(f'{CFG.input_dir}Predictions/test_lgbm_{CFG.boosting_type}_baseline_{CFG.n_folds}fold_seed{CFG.seed}.csv', index = False)
    print('Finished!')
    
seed_everything(CFG.seed)
train, test = read_data()
train_and_evaluate(train, test)

  0%|          | 0/870 [00:00<?, ?it/s]

 
--------------------------------------------------
Training fold 0 with 1149 features...
[LightGBM] [Info] Number of positive: 95062, number of negative: 272068
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 55868
[LightGBM] [Info] Number of data points in the train set: 367130, number of used features: 1143
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 619 dense feature groups (217.08 MB) transferred to GPU in 0.128998 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258933 -> initscore=-1.051523
[LightGBM] [Info] Start training from score -1.051523
[500]	training's binary_logloss: 0.278605	training's amex_metric: 0.787885	valid_1's binary_logloss: 0.283354	valid_1's amex_metric: 0.772838
[1000]	training's binary_logloss: 0.220068

In [2]:
!pip install session_info

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting session_info
  Downloading session_info-1.0.0.tar.gz (24 kB)
Collecting stdlib_list
  Downloading stdlib_list-0.8.0-py3-none-any.whl (63 kB)
[K     |████████████████████████████████| 63 kB 1.8 MB/s 
[?25hBuilding wheels for collected packages: session-info
  Building wheel for session-info (setup.py) ... [?25l[?25hdone
  Created wheel for session-info: filename=session_info-1.0.0-py3-none-any.whl size=8048 sha256=5099a851f622bbe4be00ede41694810c3af7d5381e7df76517c1b8ed2b8e7214
  Stored in directory: /root/.cache/pip/wheels/bd/ad/14/6a42359351a18337a8683854cfbba99dd782271f2d1767f87f
Successfully built session-info
Installing collected packages: stdlib-list, session-info
Successfully installed session-info-1.0.0 stdlib-list-0.8.0


In [3]:
import session_info
import lightgbm as lgb
session_info.show()

# Submit


In [None]:
sub = pd.read_csv('../input/amex-sub/test_lgbm_baseline_5fold_seed_blend.csv')
sub.to_csv('test_lgbm_baseline_5fold_seed_blend.csv', index = False)

In [None]:
!kaggle competitions submit -c amex-default-prediction -f /content/drive/MyDrive/amex/Predictions/test_lgbm_dart_baseline_5fold_seed42_3monthlag.csv -m 3mth_lag

100% 75.2M/75.2M [00:02<00:00, 28.2MB/s]
Successfully submitted to American Express - Default Prediction