In [1]:
from collections import OrderedDict
import gc
import pathlib
from pprint import pprint
import random
from typing import Dict, List, Tuple, Union

from catboost import CatBoostClassifier
from joblib import load
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OrdinalEncoder


# Make sure automatic (cyclic) garbage collection is switched on; the
# notebook also calls gc.collect() explicitly after dropping large frames.
gc.enable()

In [2]:
# Competition data; the parquet tables are read from its `parquet_files` folder.
competition_dir = pathlib.Path('/kaggle/input/home-credit-credit-risk-model-stability')
# Dataset holding the `*_features.csv` tables that list columns per dtype.
feature_dtype_dir = pathlib.Path('/kaggle/input/0-home-credit-make-dtype-csv')
# Dataset holding the per-fold CatBoost models and `training_features.npy`.
model_dir = pathlib.Path('/kaggle/input/home-credit-04251529')

In [3]:
# One lookup table per logical dtype. Each table gets a `cast_dtype` column
# holding the polars dtype that `merge_dataset` casts the listed columns to.
# Plain item assignment is used on purpose: it broadcasts the dtype object to
# every row (`DataFrame.assign` would try to *call* the dtype class instead).
bool_features = pd.read_csv(feature_dtype_dir / 'bool_features.csv')
bool_features['cast_dtype'] = pl.Int8

float64_features = pd.read_csv(feature_dtype_dir / 'float64_features.csv')
float64_features['cast_dtype'] = pl.Float32

string_features = pd.read_csv(feature_dtype_dir / 'string_features.csv')
string_features['cast_dtype'] = pl.String

date_features = pd.read_csv(feature_dtype_dir / 'date_features.csv')
date_features['cast_dtype'] = pl.Date

In [4]:
from collections import OrderedDict
import pathlib


def get_depth_paths(load_dir: pathlib.Path, prefix: str):
    """Collect the parquet files of every source table for one data split.

    Args:
        load_dir: Directory that contains the `test` / `train` sub-folders.
        prefix: Split name, either ``'test'`` or ``'train'``. It is used both
            as the sub-folder name and as the file-name prefix.

    Returns:
        An ``OrderedDict`` mapping each table key (e.g. ``'static_0'``) to the
        sorted list of paths matching ``<prefix>_<key>*.parquet``. Keys keep
        the fixed order below; a table without files maps to an empty list.
    """
    assert prefix in ('test', 'train')

    # The trailing digit of each key is the table depth (0, 1 or 2).
    table_keys = (
        'static_0',
        'static_cb_0',
        'applprev_1',
        'other_1',
        'tax_registry_a_1',
        'tax_registry_b_1',
        'tax_registry_c_1',
        'credit_bureau_a_1',
        'credit_bureau_b_1',
        'deposit_1',
        'person_1',
        'debitcard_1',
        'applprev_2',
        'person_2',
        'credit_bureau_a_2',
        'credit_bureau_b_2',
    )

    split_dir = load_dir / prefix
    return OrderedDict(
        (key, sorted(split_dir.glob(f'{prefix}_{key}*.parquet')))
        for key in table_keys
    )

In [5]:
# Locate every test-split parquet file, grouped by source table.
test_depth_paths = get_depth_paths(competition_dir / 'parquet_files', 'test')
print(f'the number of test paths: {sum(map(len, test_depth_paths.values()))}')

the number of test paths: 35


In [6]:
from datetime import datetime
from typing import List

import polars as pl
import polars.selectors as cs


def process_date_features(data: pl.DataFrame) -> pl.DataFrame:
    """Replace every Date column by its year, month and day as Float32.

    For each column of dtype ``pl.Date`` three columns are added, named
    ``year_<col>``, ``month_<col>`` and ``day_<col>``; afterwards all Date
    columns are dropped. Columns of other dtypes are left as they are.
    """
    date_parts = pl.col(pl.Date).dt
    expanded = data.with_columns(
        date_parts.year().cast(pl.Float32).name.prefix('year_'),
        date_parts.month().cast(pl.Float32).name.prefix('month_'),
        date_parts.day().cast(pl.Float32).name.prefix('day_'),
    )
    return expanded.drop(cs.date())

In [7]:
import gc
from itertools import combinations
import pathlib
from typing import Dict, List

import numpy as np
import pandas as pd
import polars as pl


def merge_dataset(
        base_data: pl.DataFrame,
        depth_paths: Dict[str, List[pathlib.Path]],
        bool_features: pd.DataFrame,
        float64_features: pd.DataFrame,
        string_features: pd.DataFrame,
        date_features: pd.DataFrame,
        useful_features: pd.DataFrame = None,
        depth: str = '012',
    ) -> pd.DataFrame:
    """Load, type-cast, aggregate and left-join the source tables onto `base_data`.

    For every table in `depth_paths` (in iteration order) each parquet file is
    read, its columns are cast according to the four dtype tables, renamed
    with a `_<depth digit>` suffix, optionally filtered, and - for depth 1 and
    2 tables - aggregated down to one row per `case_id`. The files of a table
    are concatenated and left-joined onto `base_data` on `case_id`. Finally
    Date columns are expanded by `process_date_features` and the result is
    converted to pandas.

    Args:
        base_data: Frame with one row per `case_id`; all tables are joined
            onto it.
        depth_paths: Mapping from table key to its parquet paths. The last
            character of the key is taken as the table depth.
        bool_features, float64_features, string_features, date_features:
            Tables with a `Variable` column (original column name) and a
            `cast_dtype` column (polars dtype to cast to).
        useful_features: Optional table with a `Variable` column. When given,
            every column that is not listed there is dropped before
            aggregation. The comparison uses the already renamed (suffixed)
            column names.
        depth: `'0'`, `'1'` or `'2'` to process only tables of that depth, or
            `'012'` to process all of them.

    Returns:
        The merged data as a pandas DataFrame.

    Raises:
        AssertionError: If `depth` is not one of the accepted values.
    """

    assert depth in ['0', '1', '2', '012']

    # Per-group aggregations; every output column is prefixed with the name
    # of the aggregate so the origin of a feature can be read from its name.
    # NOTE(review): after the casts below, presumably only the `num_group*`
    # keys are still integer-typed when these run - confirm.
    aggs = [
        pl.col(pl.Float32).max().cast(pl.Float32).name.prefix('max_'),
        pl.col(pl.Float32).median().cast(pl.Float32).name.prefix('median_'),
        pl.col(pl.Float32).sum().cast(pl.Float32).name.prefix('sum_'),
        pl.col(pl.Int32, pl.Int64).max().cast(pl.Int32).name.prefix('max_'),
        pl.col(pl.Int32, pl.Int64).median().cast(pl.Int32).name.prefix('median_'),
        pl.col(pl.Int32, pl.Int64).sum().cast(pl.Int32).name.prefix('sum_'),
        pl.col(pl.Date, pl.String).first().name.prefix('first_'),
    ]

    for i, (k, path_list) in enumerate(depth_paths.items()):

        # Unless all depths are requested, skip tables whose depth digit
        # (last character of the key) differs from `depth`.
        if depth == '012':
            pass
        elif depth != k[-1]:
            continue

        print(f'loading `{k}`')
        depth_data = []
        for p in path_list:
            sub_data = pl.read_parquet(p).cast({'case_id': pl.Int64})

            '''
            cast dtypes
            '''
            # Boolean columns: nulls/NaNs become the extra category 2 before
            # the cast, so "missing" stays distinguishable from 0 and 1.
            for _, row in bool_features[['Variable', 'cast_dtype']].iterrows():
                col = row['Variable']
                cast_dtype = row['cast_dtype']
                if col in sub_data.columns:
                    sub_data = sub_data.with_columns(pl.col(col).fill_null(2).fill_nan(2).cast(cast_dtype))

            for _, row in float64_features[['Variable', 'cast_dtype']].iterrows():
                col = row['Variable']
                cast_dtype = row['cast_dtype']
                if col in sub_data.columns:
                    sub_data = sub_data.with_columns(pl.col(col).cast(cast_dtype))

            # String columns: nulls are replaced by the literal 'none'.
            for _, row in string_features[['Variable', 'cast_dtype']].iterrows():
                col = row['Variable']
                cast_dtype = row['cast_dtype']
                if col in sub_data.columns:
                    sub_data = sub_data.with_columns(pl.col(col).fill_null('none').cast(cast_dtype))

            for _, row in date_features[['Variable', 'cast_dtype']].iterrows():
                col = row['Variable']
                cast_dtype = row['cast_dtype']
                if col in sub_data.columns:
                    sub_data = sub_data.with_columns(pl.col(col).cast(cast_dtype))

            '''
            rename columns
            '''
            # Every column except case_id / num_group1 / num_group2 gets the
            # table's depth digit appended (see `rename_column`).
            sub_data = sub_data.rename(lambda c: rename_column(c, k[-1]))

            '''
            delete useless features
            '''
            # Runs after the rename, so `useful_features['Variable']` is
            # matched against the suffixed column names.
            if useful_features is not None:
                sub_data = sub_data.drop(
                    [
                        col
                        for col in sub_data.columns
                        if (
                            (col not in useful_features['Variable'].to_list())
                            and
                            (col != 'case_id' and col != 'date_decision' and col != 'MONTH' and col != 'WEEK_NUM')
                        )
                    ]
                )

            '''
            aggregation
            '''
            if k[-1] == '1' :
                # Depth 1: one aggregation step down to one row per case_id.
                # `ratio_` is first value / max value of each Float32 column.
                aggs_12 = aggs + [
                    (pl.col(col).first() / pl.col(col).max()).cast(pl.Float32).name.prefix('ratio_')
                    for col in sub_data.select(pl.col(pl.Float32)).columns
                ]

                sub_data = sub_data.group_by('case_id').agg(aggs_12).sort('case_id')
                # (disabled experiment)
                # for col in sub_data.columns:
                #     if col.startswith('head'):
                #         sub_data = sub_data.with_columns(
                #             pl.col(col).list.get(0).name.prefix('first_'),
                #             pl.col(col).list.get(1).name.prefix('second_'),
                #         )
                #         sub_data = sub_data.drop(col)
            elif k[-1] == '2':
                # Depth 2: aggregate per (case_id, num_group1) first, then
                # aggregate those results per case_id. Prefixes therefore
                # stack, e.g. `max_sum_<col>`.
                aggs_12 = aggs + [
                    (pl.col(col).first() / pl.col(col).max()).cast(pl.Float32).name.prefix('ratio_')
                    for col in sub_data.select(pl.col(pl.Float32)).columns
                ]
                sub_data = sub_data.group_by(['case_id', 'num_group1']).agg(aggs_12).group_by('case_id').agg(aggs).sort('case_id')
            # Group indices (raw or aggregated) are not used as features.
            sub_data = sub_data.drop([col for col in sub_data.columns if 'num_group' in col])

            depth_data.append(sub_data)
            print(f'\t{sub_data.shape}')

            # Free the per-file frame right away to keep peak memory low.
            del sub_data
            gc.collect()

        # Stack the files of this table and attach them to the base rows.
        # Clashing non-key column names from the right side get `_<i>`
        # appended, where `i` is the table's position in `depth_paths`.
        # NOTE(review): `depth_data` is empty when a table has no files;
        # confirm how `pl.concat` behaves on an empty list.
        depth_data = pl.concat(depth_data, how='vertical_relaxed')
        base_data = base_data.join(depth_data, how='left', on='case_id', suffix=f'_{i}')

        del depth_data
        gc.collect()

    # (disabled experiment: pairwise sum/diff features of high-importance
    # depth-0 columns)
    # '''
    # add new features
    # '''
    # depth_0_P_high_fimp_features = [
    #     'avgdpdtolclosure24_3658938P_0',
    #     'maxdbddpdtollast12m_3658940P_0',
    #     'maxdpdlast3m_392P_0',
    # ]

    # depth_0_A_high_fimp_features = [
    #     'price_1097A_0',
    #     'pmtssum_45A_0',
    #     'annuity_780A_0',
    #     'credamount_770A_0',
    # ]

    # depth_0_L_high_fimp_features = [
    #     'pmtnum_254L_0',
    #     'mobilephncnt_593L_0',
    #     'days180_256L_0',
    #     'days120_123L_0',
    #     'eir_270L_0',
    #     'numrejects9m_859L_0',
    #     'isbidproduct_1095L_0',
    #     'days90_310L_0',
    #     'days360_512L_0',
    #     'pctinstlsallpaidlate1d_3546856L_0',
    #     'numinstpaidearly3d_3546850L_0',
    #     'pmtscount_423L_0',
    #     'numinstunpaidmax_3546851L_0',
    #     'cntpmts24_3658933L_0',
    #     'numinstlsallpaid_934L_0',
    # ]

    # display(base_data.select(depth_0_L_high_fimp_features))

    # P_aggs = []
    # for col1, col2 in combinations(depth_0_P_high_fimp_features, 2):
    #     P_aggs.append((pl.col(col1) - pl.col(col2)).alias(f'diff_{col1}_{col2}'))
    #     P_aggs.append((pl.col(col1) + pl.col(col2)).alias(f'sum_{col1}_{col2}'))

    # A_aggs = []
    # for col1, col2 in combinations(depth_0_A_high_fimp_features, 2):
    #     A_aggs.append((pl.col(col1) - pl.col(col2)).alias(f'diff_{col1}_{col2}'))
    #     A_aggs.append((pl.col(col1) + pl.col(col2)).alias(f'sum_{col1}_{col2}'))

    # L_aggs = []
    # for col1, col2 in combinations(depth_0_L_high_fimp_features, 2):
    #     L_aggs.append((pl.col(col1) - pl.col(col2)).alias(f'diff_{col1}_{col2}'))
    #     L_aggs.append((pl.col(col1) + pl.col(col2)).alias(f'sum_{col1}_{col2}'))

    # base_data = base_data.with_columns(*A_aggs)

    '''
    process date features
    '''
    # Expands every Date column into year_/month_/day_ Float32 columns.
    base_data = process_date_features(base_data)

    '''
    convert polars DataFrame into pandas DataFrame
    '''
    base_data = base_data.to_pandas()

    return base_data


def rename_column(column, depth):
    """Tag a column name with its table depth.

    The join key `case_id` and the group indices `num_group1` / `num_group2`
    are returned unchanged; every other name gets `_<depth>` appended.
    """
    key_columns = ('case_id', 'num_group1', 'num_group2')
    if column not in key_columns:
        return column + f'_{depth}'
    return column

In [8]:
# Base table of the test split: one row per case with its decision date and
# the MONTH / WEEK_NUM columns. `date_decision` is kept as a string.
base_schema = {
    'case_id': pl.Int64,
    'date_decision': pl.String,
    'MONTH': pl.Int64,
    'WEEK_NUM': pl.Int64,
}
test_base_data = pl.read_parquet(
    competition_dir / 'parquet_files/test/test_base.parquet'
).cast(base_schema)
display(test_base_data)

case_id,date_decision,MONTH,WEEK_NUM
i64,str,i64,i64
57543,"""2021-05-14""",202201,100
57549,"""2022-01-17""",202201,100
57551,"""2020-11-27""",202201,100
57552,"""2020-11-27""",202201,100
57569,"""2021-12-20""",202201,100
57630,"""2021-03-16""",202201,100
57631,"""2022-06-04""",202201,100
57632,"""2022-02-05""",202201,100
57633,"""2022-01-25""",202201,100
57634,"""2021-01-27""",202201,100


In [9]:
%%time


# Build the full feature frame for the test split from tables of all depths;
# no feature pre-filtering is applied at this stage.
depth_data = merge_dataset(
    base_data=test_base_data,
    depth_paths=test_depth_paths,
    bool_features=bool_features,
    float64_features=float64_features,
    string_features=string_features,
    date_features=date_features,
    useful_features=None,
    depth='012',
)
display(depth_data)

loading `static_0`
	(10, 168)
	(10, 168)
	(10, 168)
loading `static_cb_0`
	(10, 53)
loading `applprev_1`
	(1, 92)
	(1, 92)
	(2, 92)
loading `other_1`
	(9, 21)
loading `tax_registry_a_1`
	(2, 7)
loading `tax_registry_b_1`
	(2, 7)
loading `tax_registry_c_1`
	(0, 7)
loading `credit_bureau_a_1`
	(1, 240)
	(1, 240)
	(1, 240)
	(1, 240)
	(1, 240)
loading `credit_bureau_b_1`
	(3, 134)
loading `deposit_1`
	(5, 7)
loading `person_1`
	(6, 46)
loading `debitcard_1`
	(5, 14)
loading `applprev_2`
	(1, 4)
loading `person_2`
	(3, 9)
loading `credit_bureau_a_2`
	(1, 127)
	(1, 127)
	(1, 127)
	(1, 127)
	(1, 127)
	(1, 127)
	(1, 127)
	(1, 127)
	(1, 127)
	(1, 127)
	(1, 127)
	(1, 127)
loading `credit_bureau_b_2`
	(1, 26)


Unnamed: 0,case_id,date_decision,MONTH,WEEK_NUM,actualdpdtolerance_344P_0,amtinstpaidbefduel24m_4187115A_0,annuity_780A_0,annuitynextmonth_57A_0,applicationcnt_361L_0,applications30d_658L_0,...,day_first_contractmaturitydate_151D_1,day_first_lastupdate_260D_1,day_first_contractenddate_991D_1,day_first_openingdate_313D_1,day_first_birth_259D_1,day_first_birthdate_87D_1,day_first_empl_employedfrom_271D_1,day_first_openingdate_857D_1,day_first_first_empls_employedfrom_796D_2,day_first_first_pmts_date_1107D_2
0,57543,2021-05-14,202201,100,0.0,191767.359375,3674.600098,1218.200073,0.0,0.0,...,,,,,1.0,,,,,
1,57549,2022-01-17,202201,100,0.0,129704.398438,5742.600098,3546.600098,0.0,2.0,...,,,,,1.0,,,,,
2,57551,2020-11-27,202201,100,0.0,71036.398438,2844.600098,0.0,0.0,1.0,...,,,,,1.0,,8.0,,,
3,57552,2020-11-27,202201,100,0.0,183992.0,6298.800293,12155.400391,0.0,0.0,...,,,,,1.0,,,,,
4,57569,2021-12-20,202201,100,0.0,0.0,4682.600098,0.0,0.0,1.0,...,,,,,,1.0,,,,
5,57630,2021-03-16,202201,100,0.0,0.0,8905.0,0.0,0.0,0.0,...,,,,,1.0,,,,,
6,57631,2022-06-04,202201,100,0.0,,2540.600098,0.0,0.0,0.0,...,,,,,,,,,,
7,57632,2022-02-05,202201,100,0.0,63647.402344,4732.0,0.0,0.0,0.0,...,,,,,,,,,,
8,57633,2022-01-25,202201,100,0.0,,8273.0,0.0,0.0,0.0,...,,,,,,,,,,
9,57634,2021-01-27,202201,100,0.0,39948.800781,1165.800049,0.0,0.0,0.0,...,,,,,,,,,,


CPU times: user 7.1 s, sys: 172 ms, total: 7.27 s
Wall time: 7.42 s


In [10]:
# One CatBoost classifier per fold (folds are numbered 1..5 in the file names).
# LightGBM alternative (disabled):
#   lgb.Booster(model_file=model_dir.joinpath(f'lgb_fold{k+1}_04250706.txt'))
models = [
    CatBoostClassifier().load_model(model_dir / f'catboost_fold{fold}_04251529.cbm')
    for fold in range(1, 6)
]

# Column names saved with the models; selecting by this array keeps only
# those columns, in the stored order.
# NOTE: allow_pickle=True unpickles the file contents - only load arrays
# from a source you trust.
training_features = np.load(model_dir / 'training_features.npy', allow_pickle=True)
depth_data = depth_data[training_features]
display(depth_data)

Unnamed: 0,first_incometype_1044T_1,year_first_birth_259D_1,first_sex_738L_1,price_1097A_0,avgdpdtolclosure24_3658938P_0,pmtnum_254L_0,sum_amount_4527230A_1,isbidproduct_1095L_0,mobilephncnt_593L_0,max_sum_pmts_month_158T_2,...,median_numberofoverdueinstls_725L_1,mindbdtollast24m_4525191P_0,first_district_544M_1,median_sum_pmts_year_1139T_2,sum_sum_pmts_month_706T_2,month_responsedate_1012D_0,first_housetype_905L_1,median_instlamount_768A_1,max_monthlyinstlamount_332A_1,sum_credamount_590A_1
0,SALARIED_GOVT,1980.0,F,0.0,1.0,6.0,,1,2.0,2.0,...,0.0,-7.0,P98_137_111,0.0,20.0,,none,4725.399902,4725.399902,368253.0
1,RETIRED_PENSIONER,1959.0,F,,0.0,18.0,,1,3.0,2.0,...,0.0,-2.0,P158_150_171,0.0,20.0,,none,13542.400391,13542.400391,187718.59375
2,SALARIED_GOVT,1982.0,F,27095.201172,1.0,12.0,,0,1.0,2.0,...,0.0,-4.0,,0.0,20.0,,none,16432.363281,17791.400391,
3,RETIRED_PENSIONER,1955.0,M,,0.0,24.0,,1,1.0,,...,,-13.0,,,,,OWNED,,,
4,none,,none,,2517.0,24.0,,1,2.0,0.0,...,,2783.0,,0.0,56.0,,none,,,
5,SALARIED_GOVT,1967.0,F,96174.0,0.0,12.0,,0,2.0,54.0,...,,,,9094.5,56.0,,OWNED,,,
6,,,,24920.0,,12.0,,0,3.0,54.0,...,,,,9085.5,56.0,,,,,
7,,,,25998.0,0.0,6.0,,0,1.0,65.0,...,,-9.0,,20190.0,0.0,,,,,
8,,,,0.0,,48.0,,0,1.0,5.0,...,0.0,,,2021.0,21.0,,,10307.400391,70888.398438,
9,,,,13998.0,0.0,12.0,,0,2.0,,...,,-26.0,,,,,,,,


In [11]:
# def encode_objects(depth_data: pd.DataFrame) -> pd.DataFrame:
#     object_columns = depth_data.dtypes.index[depth_data.dtypes==object].to_list()
#     if 'date_decision' in object_columns:
#         object_columns.remove('date_decision')
    
#     object_data = []
#     for col in object_columns:
#         encoder = load(model_dir.joinpath(f'encoder_{col}.joblib'))
#         object_data.append(encoder.transform(depth_data[col].values.reshape(-1, 1)).astype(np.float32))
#     depth_data.drop(columns=object_columns, inplace=True)
#     object_data = np.concatenate(object_data, axis=1)
#     object_data = pd.DataFrame(object_data, columns=object_columns)
    
#     return pd.concat([depth_data, object_data], axis=1)
    
# display(depth_data)
# depth_data = encode_objects(depth_data)
# display(depth_data)

In [12]:
def predict(
        models: List[object],
        submit: pd.DataFrame,
        X: pd.DataFrame,
        batch_size: int = 1024,
    ) -> pd.DataFrame:
    """Score `X` with every model and store the averaged probability in `submit`.

    The rows of `X` are processed in batches of `batch_size`. For each batch
    the positive-class probability (`predict_proba(...)[:, 1]`) of all models
    is summed and finally divided by the number of models.

    Missing values in object-dtype (categorical) columns are replaced by the
    string ``'none'`` on a per-batch copy, so the caller's `X` is left
    untouched and no full-size copy of `X` is made.

    Args:
        models: Fitted models exposing `predict_proba(frame)` that returns an
            array of shape (n_rows, 2).
        submit: Frame with one row per row of `X`; it receives a `score`
            column (assigned in place) and is returned.
        X: Feature frame, row-aligned with `submit`.
        batch_size: Number of rows scored per call; must be positive.

    Returns:
        `submit`, with the float32 `score` column added.

    Raises:
        ValueError: If `models` is empty or `batch_size` is not positive.
    """
    # An empty ensemble would divide by zero and silently yield NaN scores.
    if len(models) == 0:
        raise ValueError('`models` must contain at least one model')
    # A negative step would skip the loop and silently yield all-zero scores.
    if batch_size <= 0:
        raise ValueError('`batch_size` must be a positive integer')

    categorical_features = X.dtypes.index[X.dtypes == object].to_list()
    fill_values = {col: 'none' for col in categorical_features}

    n_rows = len(X)
    probas = np.zeros(n_rows, dtype=np.float32)
    for start in range(0, n_rows, batch_size):
        stop = min(start + batch_size, n_rows)
        batch_X = X.iloc[start:stop]
        if fill_values:
            # `fillna` without `inplace` returns a new frame, so neither `X`
            # nor the frame it may have been sliced from is modified.
            batch_X = batch_X.fillna(fill_values)

        for model in models:
            # proba = model.predict(batch_X, num_iteration=model.best_iteration)
            proba = model.predict_proba(batch_X)[:, 1]
            probas[start:stop] += proba.astype(np.float32)

        # Release the batch copy before the next one is materialised.
        del batch_X
        gc.collect()

    probas /= len(models)

    submit['score'] = probas

    return submit

In [13]:
# Score the test features and attach the averaged probability to the base rows.
submission = predict(
    models=models,
    submit=test_base_data.to_pandas(),
    X=depth_data,
    batch_size=2048,
)
display(submission)

# Keep only the two columns that are written to the submission file.
submission = submission[['case_id', 'score']]
display(submission)

Unnamed: 0,case_id,date_decision,MONTH,WEEK_NUM,score
0,57543,2021-05-14,202201,100,0.021224
1,57549,2022-01-17,202201,100,0.093742
2,57551,2020-11-27,202201,100,0.007561
3,57552,2020-11-27,202201,100,0.009043
4,57569,2021-12-20,202201,100,0.501462
5,57630,2021-03-16,202201,100,0.006948
6,57631,2022-06-04,202201,100,0.01753
7,57632,2022-02-05,202201,100,0.004528
8,57633,2022-01-25,202201,100,0.114834
9,57634,2021-01-27,202201,100,0.014634


Unnamed: 0,case_id,score
0,57543,0.021224
1,57549,0.093742
2,57551,0.007561
3,57552,0.009043
4,57569,0.501462
5,57630,0.006948
6,57631,0.01753
7,57632,0.004528
8,57633,0.114834
9,57634,0.014634


# 

In [14]:
# Write the (case_id, score) frame to disk without the pandas index column.
submission.to_csv('submission.csv', index=False)