<a href="https://colab.research.google.com/github/nguyenngocbinh/amex/blob/master/AMEX_%7C_EDA_%7C_NNB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import files
import pandas as pd

In [2]:
! pip install -q kaggle

In [3]:
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"nanabi","key":"c6e5949212bedaf99b1c9049460a697e"}'}

In [4]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

In [5]:
! kaggle datasets list

ref                                                       title                                               size  lastUpdated          downloadCount  voteCount  usabilityRating  
--------------------------------------------------------  -------------------------------------------------  -----  -------------------  -------------  ---------  ---------------  
ruchi798/data-science-job-salaries                        Data Science Job Salaries                            7KB  2022-06-15 08:59:12           5576        193  1.0              
surajjha101/bigbasket-entire-product-list-28k-datapoints  BigBasket Entire Product List (~28K datapoints)      6MB  2022-06-22 12:51:18           1692         80  1.0              
victorsoeiro/netflix-tv-shows-and-movies                  Netflix TV Shows and Movies                          2MB  2022-05-15 00:01:23          16318        475  1.0              
sameepvani/nasa-nearest-earth-objects                     NASA - Nearest Earth Objects         

In [6]:
!kaggle competitions list

ref                                            deadline             category            reward  teamCount  userHasEntered  
---------------------------------------------  -------------------  ---------------  ---------  ---------  --------------  
contradictory-my-dear-watson                   2030-07-01 23:59:00  Getting Started     Prizes         51           False  
gan-getting-started                            2030-07-01 23:59:00  Getting Started     Prizes         99           False  
store-sales-time-series-forecasting            2030-06-30 23:59:00  Getting Started  Knowledge        622           False  
tpu-getting-started                            2030-06-03 23:59:00  Getting Started  Knowledge        124           False  
digit-recognizer                               2030-01-01 00:00:00  Getting Started  Knowledge       1245           False  
titanic                                        2030-01-01 00:00:00  Getting Started  Knowledge      15130           False  
house-pr

# Download data from kaggle

In [7]:
!kaggle competitions download -c amex-default-prediction 

Downloading amex-default-prediction.zip to /content
100% 20.5G/20.5G [01:27<00:00, 240MB/s]
100% 20.5G/20.5G [01:27<00:00, 251MB/s]


In [8]:
!kaggle datasets download -d raddar/amex-data-integer-dtypes-parquet-format

Downloading amex-data-integer-dtypes-parquet-format.zip to /content
100% 4.07G/4.07G [00:31<00:00, 140MB/s]
100% 4.07G/4.07G [00:31<00:00, 137MB/s]


In [9]:
!unzip '/content/amex-default-prediction.zip'

Archive:  /content/amex-default-prediction.zip
  inflating: sample_submission.csv   
  inflating: test_data.csv           
  inflating: train_data.csv          
  inflating: train_labels.csv        


In [10]:
!unzip '/content/amex-data-integer-dtypes-parquet-format.zip'

Archive:  /content/amex-data-integer-dtypes-parquet-format.zip
  inflating: test.parquet            
  inflating: train.parquet           


In [None]:
import psutil
psutil.virtual_memory()

# Submit template

In [None]:
!kaggle competitions submit -c amex-default-prediction -f /content/sample_submission.csv -m test_submission1

100% 59.1M/59.1M [00:00<00:00, 69.7MB/s]
Successfully submitted to American Express - Default Prediction

# EDA

In [18]:
# ====================================================
# Library
# ====================================================
import gc
import warnings
warnings.filterwarnings('ignore')
import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
import itertools
import pyarrow.parquet as pq

## Feature engineering

In [27]:
def features_info():
    cat_features = [
        "B_30",
        "B_38",
        "D_114",
        "D_116",
        "D_117",
        "D_120",
        "D_126",
        "D_63",
        "D_64",
        "D_66",
        "D_68",
    ]
    schema = pq.read_schema('/content/train.parquet', memory_map=True)
    features = list(schema.names)
    features = [col for col in features if col not in ['customer_ID', 'S_2']]
    num_features = [col for col in features if col not in cat_features]
    return features, cat_features, num_features


def preprocess_train_data(nlag):
    features, cat_features, num_features = features_info()
    print('Starting training feature engineer...')
    train = pd.read_parquet('/content/train.parquet')
    train_num_agg = train.groupby("customer_ID").tail(nlag).groupby(
        "customer_ID")[num_features].agg({'mean', 'std', 'min', 'max', 'last'})
    train_num_agg.columns = ['_'.join(x) for x in train_num_agg.columns]
    train_num_agg.reset_index(inplace=True)
    train_cat_agg = train.groupby("customer_ID")[cat_features].agg(
        ['count', 'last', 'nunique'])
    train_cat_agg.columns = ['_'.join(x) for x in train_cat_agg.columns]
    train_cat_agg.reset_index(inplace=True)
    train_labels = pd.read_csv('/content/train_labels.csv')
    train = train_num_agg.merge(train_cat_agg, how='inner',
                                on='customer_ID').merge(train_labels,
                                                        how='inner',
                                                        on='customer_ID')
    gc.collect()
    print('Finished train feature engineer!')
    return train


def preprocess_test_data():
    features, cat_features, num_features = features_info()
    gc.collect()
    test = pd.read_parquet('/content/test.parquet')
    print('Starting test feature engineer...')
    test_num_agg = test.groupby("customer_ID").tail(3).groupby(
        "customer_ID")[num_features].agg(['mean', 'std', 'min', 'max', 'last'])
    test_num_agg.columns = ['_'.join(x) for x in test_num_agg.columns]
    test_num_agg.reset_index(inplace=True)
    test_cat_agg = test.groupby("customer_ID")[cat_features].agg(
        ['count', 'last', 'nunique'])
    test_cat_agg.columns = ['_'.join(x) for x in test_cat_agg.columns]
    test_cat_agg.reset_index(inplace=True)
    test = test_num_agg.merge(test_cat_agg, how='inner', on='customer_ID')
    gc.collect()
    print('Finished test feature engineer!')
    return test

In [None]:
preprocess_train_data(nlag = 13).to_parquet('/content/train_data_fe.parquet')

Starting training feature engineer...


In [None]:
preprocess_train_data(nlag = 6).to_parquet('/content/train_data_lag6_fe.parquet')

In [22]:
preprocess_test_data().to_parquet('/content/test_data_fe.parquet')

Starting test feature engineer...
Finished test feature engineer!


# Modelling

In [24]:
# ====================================================
# Library
# ====================================================
import os
import gc
import warnings
warnings.filterwarnings('ignore')
import random
import scipy as sp
import numpy as np
import pandas as pd
import joblib
import itertools
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from itertools import combinations

# ====================================================
# Configurations
# ====================================================
class CFG:
    input_dir = '/content/'
    seed = 42
    n_folds = 5
    target = 'target'
    boosting_type = 'dart'
    metric = 'binary_logloss'

# ====================================================
# Seed everything
# ====================================================
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

# ====================================================
# Read data
# ====================================================
def read_data():
    train_lag13 = pd.read_parquet(
        CFG.input_dir + 'train_data_fe.parquet').groupby(
            'target', group_keys=False).apply(lambda x: x.sample(frac=0.8))  
         
    train_lag6 = pd.read_parquet(
        CFG.input_dir + 'train_data_fe.parquet').groupby(
            'target', group_keys=False).apply(lambda x: x.sample(frac=0.3)) 
 
    train = pd.concat([train_lag13, train_lag6])
 
    test = pd.read_parquet(CFG.input_dir + 'test_data_fe.parquet')
    return train, test

# ====================================================
# Amex metric
# ====================================================
def amex_metric(y_true, y_pred):
    labels = np.transpose(np.array([y_true, y_pred]))
    labels = labels[labels[:, 1].argsort()[::-1]]
    weights = np.where(labels[:,0]==0, 20, 1)
    cut_vals = labels[np.cumsum(weights) <= int(0.04 * np.sum(weights))]
    top_four = np.sum(cut_vals[:,0]) / np.sum(labels[:,0])
    gini = [0,0]
    for i in [1,0]:
        labels = np.transpose(np.array([y_true, y_pred]))
        labels = labels[labels[:, i].argsort()[::-1]]
        weight = np.where(labels[:,0]==0, 20, 1)
        weight_random = np.cumsum(weight / np.sum(weight))
        total_pos = np.sum(labels[:, 0] *  weight)
        cum_pos_found = np.cumsum(labels[:, 0] * weight)
        lorentz = cum_pos_found / total_pos
        gini[i] = np.sum((lorentz - weight_random) * weight)
    return 0.5 * (gini[1]/gini[0] + top_four)

# ====================================================
# LGBM amex metric
# ====================================================
def lgb_amex_metric(y_pred, y_true):
    y_true = y_true.get_label()
    return 'amex_metric', amex_metric(y_true, y_pred), True

# ====================================================
# Train & Evaluate
# ====================================================
def train_and_evaluate(train, test):
    # Label encode categorical features
    cat_features = [
        "B_30",
        "B_38",
        "D_114",
        "D_116",
        "D_117",
        "D_120",
        "D_126",
        "D_63",
        "D_64",
        "D_66",
        "D_68"
    ]
    cat_features = [f"{cf}_last" for cf in cat_features]
    for cat_col in cat_features:
        encoder = LabelEncoder()
        train[cat_col] = encoder.fit_transform(train[cat_col])
        test[cat_col] = encoder.transform(test[cat_col])
    # Round last float features to 2 decimal place
    num_cols = list(train.dtypes[(train.dtypes == 'float32') | (train.dtypes == 'float64')].index)
    num_cols = [col for col in num_cols if 'last' in col]
    for col in num_cols:
        train[col + '_round2'] = train[col].round(2)
        test[col + '_round2'] = test[col].round(2)
    # Get the difference between last and mean
    num_cols = [col for col in train.columns if 'last' in col]
    num_cols = [col[:-5] for col in num_cols if 'round' not in col]
    for col in num_cols:
        try:
            train[f'{col}_last_mean_diff'] = train[f'{col}_last'] - train[f'{col}_mean']
            test[f'{col}_last_mean_diff'] = test[f'{col}_last'] - test[f'{col}_mean']
        except:
            pass
    # Transform float64 and float32 to float16
    num_cols = list(train.dtypes[(train.dtypes == 'float32') | (train.dtypes == 'float64')].index)
    for col in tqdm(num_cols):
        train[col] = train[col].astype(np.float16)
        test[col] = test[col].astype(np.float16)
    # Get feature list
    features = [col for col in train.columns if col not in ['customer_ID', CFG.target]]
    params = {
        'objective': 'binary',
        'metric': CFG.metric,
        'boosting': CFG.boosting_type,
        'seed': CFG.seed,
        'num_leaves': 100,
        'learning_rate': 0.01,
        'feature_fraction': 0.20,
        'bagging_freq': 10,
        'bagging_fraction': 0.50,
        'n_jobs': -1,
        'lambda_l2': 2,
        'min_data_in_leaf': 40,
        }
    # Create a numpy array to store test predictions
    test_predictions = np.zeros(len(test))
    # Create a numpy array to store out of folds predictions
    oof_predictions = np.zeros(len(train))
    kfold = StratifiedKFold(n_splits = CFG.n_folds, shuffle = True, random_state = CFG.seed)
    for fold, (trn_ind, val_ind) in enumerate(kfold.split(train, train[CFG.target])):
        print(' ')
        print('-'*50)
        print(f'Training fold {fold} with {len(features)} features...')
        x_train, x_val = train[features].iloc[trn_ind], train[features].iloc[val_ind]
        y_train, y_val = train[CFG.target].iloc[trn_ind], train[CFG.target].iloc[val_ind]
        lgb_train = lgb.Dataset(x_train, y_train, categorical_feature = cat_features)
        lgb_valid = lgb.Dataset(x_val, y_val, categorical_feature = cat_features)
        model = lgb.train(
            params = params,
            train_set = lgb_train,
            num_boost_round = 10500,
            valid_sets = [lgb_train, lgb_valid],
            early_stopping_rounds = 1500,
            verbose_eval = 500,
            feval = lgb_amex_metric
            )
        
        # Save best model
        joblib.dump(model, f'/content/drive/MyDrive/amex/Models/lgbm_{CFG.boosting_type}_fold{fold}_seed{CFG.seed}.pkl')
        # Predict validation
        val_pred = model.predict(x_val)
        # Add to out of folds array
        oof_predictions[val_ind] = val_pred
        # Predict the test set
        test_pred = model.predict(test[features])
        test_predictions += test_pred / CFG.n_folds
        # Compute fold metric
        score = amex_metric(y_val, val_pred)
        print(f'Our fold {fold} CV score is {score}')
        del x_train, x_val, y_train, y_val, lgb_train, lgb_valid
        gc.collect()
    # Compute out of folds metric
    score = amex_metric(train[CFG.target], oof_predictions)
    print(f'Our out of folds CV score is {score}')
    # Create a dataframe to store out of folds predictions
    oof_df = pd.DataFrame({'customer_ID': train['customer_ID'], 'target': train[CFG.target], 'prediction': oof_predictions})
    oof_df.to_csv(f'/content/drive/MyDrive/amex/OOF/oof_lgbm_{CFG.boosting_type}_baseline_{CFG.n_folds}fold_seed{CFG.seed}.csv', index = False)
    # Create a dataframe to store test prediction
    test_df = pd.DataFrame({'customer_ID': test['customer_ID'], 'prediction': test_predictions})
    test_df.to_csv(f'/content/drive/MyDrive/amex/Predictions/test_lgbm_{CFG.boosting_type}_baseline_{CFG.n_folds}fold_seed{CFG.seed}.csv', index = False)
    print('Finished!')
    
seed_everything(CFG.seed)
train, test = read_data()
train_and_evaluate(train, test)

  0%|          | 0/903 [00:00<?, ?it/s]

 
--------------------------------------------------
Training fold 0 with 1190 features...
[500]	training's binary_logloss: 0.292561	training's amex_metric: 0.925498	valid_1's binary_logloss: 0.295626	valid_1's amex_metric: 0.916977
[1000]	training's binary_logloss: 0.152723	training's amex_metric: 0.976737	valid_1's binary_logloss: 0.157753	valid_1's amex_metric: 0.970396
[1500]	training's binary_logloss: 0.106614	training's amex_metric: 0.98869	valid_1's binary_logloss: 0.113085	valid_1's amex_metric: 0.983545
[2000]	training's binary_logloss: 0.0825812	training's amex_metric: 0.993137	valid_1's binary_logloss: 0.0907175	valid_1's amex_metric: 0.987424
[2500]	training's binary_logloss: 0.0679775	training's amex_metric: 0.99598	valid_1's binary_logloss: 0.0772747	valid_1's amex_metric: 0.991003
[3000]	training's binary_logloss: 0.0551336	training's amex_metric: 0.997641	valid_1's binary_logloss: 0.0657066	valid_1's amex_metric: 0.992791
[3500]	training's binary_logloss: 0.0435062	trai

KeyError: ignored

# Submit


In [None]:
sub = pd.read_csv('../input/amex-sub/test_lgbm_baseline_5fold_seed_blend.csv')
sub.to_csv('test_lgbm_baseline_5fold_seed_blend.csv', index = False)

In [15]:
!kaggle competitions submit -c amex-default-prediction -f /content/drive/MyDrive/amex/Predictions/test_lgbm_dart_baseline_5fold_seed42_3monthlag.csv -m 3mth_lag

100% 75.2M/75.2M [00:02<00:00, 28.2MB/s]
Successfully submitted to American Express - Default Prediction