In [1]:
from collections import OrderedDict
import gc
from pathlib import Path
import pickle
import sys
sys.path.append('..')
from typing import Dict, List, Tuple, Union, Literal
import warnings
import joblib
import lightgbm as lgb
import numpy as np
import pandas as pd
import polars as pl
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import StratifiedGroupKFold

gc.enable()

In [2]:
# Runtime switches for the whole notebook:
#   KAGGLE - pick the Kaggle input locations instead of the local relative ones.
#   MODE   - which split is loaded, and whether models are trained or only applied.
KAGGLE = False
MODE: Literal['train', 'test'] = 'test'

In [3]:
# Resolve the three input locations for the selected environment.
# NOTE: FEATURE_DTYPE_DIR is handed directly to pd.read_parquet; in the local
# configuration it names a single parquet file rather than a directory.
COMPETITION_DIR, MODEL_DIR, FEATURE_DTYPE_DIR = (
    (
        Path('/kaggle/input/home-credit-credit-risk-model-stability'),
        Path('/kaggle/input/240407-home-credit-test-submit'),
        Path('/kaggle/input/home-credit-enhanced-feature-definitions'),
    )
    if KAGGLE
    else (
        Path('../../data/inputs'),
        Path('../../data/models/submit-test'),
        Path('../../data/configs/feature_definitions_dtypes_tables.parquet'),
    )
)

In [4]:
# Load the feature dictionary (one row per variable with its declared dtype).
df_features = pd.read_parquet(FEATURE_DTYPE_DIR)
display(df_features)

# Split the dictionary by declared dtype. The three frames that receive a new
# column are explicit copies, so the assignment below never writes into a
# slice of `df_features` and no warning suppression is needed.
bool_features = df_features.query('dtype == "Boolean"').copy()
float64_features = df_features.query('dtype == "Float64"').copy()
string_features = df_features.query('dtype == "String"').copy()
date_features = df_features.query('dtype == "Date"')

# Target polars dtype used when casting the merged table.
# Float64 features are deliberately stored as Float32.
bool_features['cast_dtype'] = pl.Boolean
float64_features['cast_dtype'] = pl.Float32
string_features['cast_dtype'] = pl.String

cast_features = pd.concat([bool_features, float64_features, string_features])
display(cast_features)

Unnamed: 0,Variable,Description,dtype,tables
0,actualdpd_943P,Days Past Due (DPD) of previous contract (actu...,Float64,"[train_applprev_1_0, train_applprev_1_1]"
1,actualdpdtolerance_344P,DPD of client with tolerance.,Float64,"[train_static_0_0, train_static_0_1]"
2,addres_district_368M,District of the person's address.,String,[train_person_2]
3,addres_role_871L,Role of person's address.,String,[train_person_2]
4,addres_zip_823M,Zip code of the address.,String,[train_person_2]
...,...,...,...,...
460,totinstallast1m_4525188A,Total amount of monthly instalments paid in th...,Float64,"[train_static_0_0, train_static_0_1]"
461,twobodfilling_608L,Type of application process.,String,"[train_static_0_0, train_static_0_1]"
462,type_25L,Contact type of a person.,String,[train_person_1]
463,typesuite_864L,Persons accompanying the client during the loa...,String,"[train_static_0_0, train_static_0_1]"


Unnamed: 0,Variable,Description,dtype,tables,cast_dtype
78,contaddr_matchlist_1032L,Indicates whether the contact address is found...,Boolean,[train_person_1],Boolean
79,contaddr_smempladdr_334L,Indicates whether the contact address is the s...,Boolean,[train_person_1],Boolean
174,equalitydataagreement_891L,Flag indicating sudden changes in client's soc...,Boolean,"[train_static_0_0, train_static_0_1]",Boolean
175,equalityempfrom_62L,Flag indicating a sudden change in the client'...,Boolean,"[train_static_0_0, train_static_0_1]",Boolean
219,isbidproduct_1095L,Flag indicating if the product is a cross-sell.,Boolean,"[train_static_0_0, train_static_0_1]",Boolean
...,...,...,...,...,...
444,subjectroles_name_541M,Name of subject role in closed credit contract...,String,"[train_credit_bureau_a_2_0, train_credit_burea...",String
445,subjectroles_name_838M,Name of subject role in active credit contract...,String,"[train_credit_bureau_a_2_0, train_credit_burea...",String
461,twobodfilling_608L,Type of application process.,String,"[train_static_0_0, train_static_0_1]",String
462,type_25L,Contact type of a person.,String,[train_person_1],String


In [5]:
def get_depth_paths(load_dir: Path, prefix: Literal['test', 'train']):
    """Collect the parquet files of every known table for one split.

    Args:
        load_dir: Directory that contains `parquet_files/<prefix>/`.
        prefix: Split name; also the file-name prefix of each parquet file.

    Returns:
        An OrderedDict mapping each table key (e.g. 'static_0') to the sorted
        list of paths matching `<prefix>_<key>*.parquet`. Keys keep the fixed
        order below; a table without files maps to an empty list.

    Raises:
        AssertionError: if `prefix` is neither 'test' nor 'train'.
    """
    assert prefix in ['test', 'train']

    # The last character of each key is the table depth (0, 1 or 2).
    table_keys = (
        'static_0',
        'static_cb_0',
        'applprev_1',
        'other_1',
        'tax_registry_a_1',
        'tax_registry_b_1',
        'tax_registry_c_1',
        'credit_bureau_a_1',
        'credit_bureau_b_1',
        'deposit_1',
        'person_1',
        'debitcard_1',
        'applprev_2',
        'person_2',
        'credit_bureau_a_2',
        'credit_bureau_b_2',
    )
    parquet_dir = load_dir.joinpath(f'parquet_files/{prefix}')

    depth_paths = OrderedDict()
    for key in table_keys:
        matches = list(parquet_dir.glob(f'{prefix}_{key}*.parquet'))
        matches.sort()
        depth_paths[key] = matches
    return depth_paths

In [6]:
# Discover the parquet files of the selected split and report how many were found.
depth_paths = get_depth_paths(load_dir=COMPETITION_DIR, prefix=MODE)

print(f'the number of {MODE} paths: {sum(map(len, depth_paths.values()))}')

the number of test paths: 35


In [7]:
def process_D_features(data: pl.DataFrame, D_columns: List[str]):
    """Replace each listed date-string column by three Int16 columns.

    Every column in `D_columns` is split on '-' into exactly three parts that
    become `year_<col>`, `month_<col>` and `day_<col>`. Nulls are first
    replaced by '0000-00-00', so a missing date yields 0 in all three parts.
    The original column is dropped.

    Args:
        data: Frame holding the columns to expand.
        D_columns: Names of the string columns to expand.

    Returns:
        The frame with every listed column replaced by its three parts.
    """
    for name in D_columns:
        part_names = [f'year_{name}', f'month_{name}', f'day_{name}']
        # Build the struct of integer parts under the temporary name 'fields'.
        date_parts = (
            pl.col(name)
            .fill_null('0000-00-00')
            .str.split_exact('-', n=2)
            .struct.rename_fields(part_names)
            .cast(pl.Int16)
            .alias('fields')
        )
        # Swap the source column for the unnested struct fields.
        data = data.with_columns(date_parts).drop(name).unnest('fields')
    return data

In [8]:
def merge_dataset(
        base_data: pl.DataFrame,
        depth_paths: Dict[str, List[Path]],
        bool_features: pd.DataFrame,
        float64_features: pd.DataFrame,
        string_features: pd.DataFrame,
        depth: Literal['0', '1', '2', '012'],
    ) -> pd.DataFrame:
    """Left-join the requested depth tables onto the base table.

    For every entry of `depth_paths` whose key ends with the requested depth
    digit (or every entry when `depth == '012'`):
      * each parquet file is read and `case_id` is cast to Int64;
      * depth-1 / depth-2 files are reduced to one row per `case_id` by
        dropping the `num_group*` columns and taking the per-column max;
      * the files are concatenated and left-joined onto `base_data` on
        `case_id`; clashing column names get the suffix `_<table position>`.
    Afterwards the columns named in the three feature frames are cast to
    their `cast_dtype`, and every column whose name ends in 'D' is expanded
    by `process_D_features`.

    Args:
        base_data: Base table with a `case_id` column.
        depth_paths: Table key -> parquet paths; the last character of the
            key is the table depth.
        bool_features: Frame with `Variable` and `cast_dtype` columns.
        float64_features: Frame with `Variable` and `cast_dtype` columns.
        string_features: Frame with `Variable` and `cast_dtype` columns.
        depth: Which depths to merge.

    Returns:
        The merged table converted to pandas.

    Raises:
        AssertionError: if `depth` is not one of the accepted values.
    """
    assert depth in ['0', '1', '2', '012']

    for i, (k, path_list) in enumerate(depth_paths.items()):
        table_depth = k[-1]
        if depth != '012' and depth != table_depth:
            continue

        print(f'loading `{k}`')

        if not path_list:
            # pl.concat raises on an empty list, so a table without any file
            # is reported and skipped instead of aborting the whole merge.
            print('\tno files found, skipped')
            continue

        frames = []
        for p in path_list:
            sub_data = pl.read_parquet(p).cast({'case_id': pl.Int64})
            if table_depth == '1':
                sub_data = sub_data.drop('num_group1').group_by('case_id').max().sort('case_id')
            elif table_depth == '2':
                sub_data = sub_data.drop(['num_group1', 'num_group2']).group_by('case_id').max().sort('case_id')

            frames.append(sub_data)

            print(f'\t{p.stem}: {sub_data.shape}')

            del sub_data
            gc.collect()

        depth_data = pl.concat(frames, how='vertical_relaxed')
        # `i` is the position of the table in depth_paths (skipped tables
        # included), so the suffix is stable whatever `depth` is requested.
        base_data = base_data.join(depth_data, how='left', on='case_id', suffix=f'_{i}')

        del frames, depth_data
        gc.collect()

    # Cast dtypes: Boolean features first, then Float64, then String.
    # NOTE(review): only Boolean columns get fill_null(np.nan) before the
    # cast; confirm how nulls end up encoded after casting back to Boolean.
    for features, fill_nulls in (
        (bool_features, True),
        (float64_features, False),
        (string_features, False),
    ):
        for col, cast_dtype in zip(features['Variable'], features['cast_dtype']):
            if col not in base_data.columns:
                continue
            expr = pl.col(col)
            if fill_nulls:
                expr = expr.fill_null(np.nan)
            base_data = base_data.with_columns(expr.cast(cast_dtype))

    # Process D features: columns whose name ends in 'D' become year/month/day.
    # Columns renamed by the join suffix end in a digit and are not expanded.
    D_features = [col for col in base_data.columns if col[-1] == 'D']
    base_data = process_D_features(base_data, D_features)
    base_data = base_data.to_pandas()
    return base_data


def encode(depth_data:pd.DataFrame, mode:Literal['train', 'test'], save_dir:Path) -> pd.DataFrame:
    """Ordinal-encode every object-dtype column except `date_decision`.

    In 'train' mode one `OrdinalEncoder` per column is fitted (unknown values
    map to -1) and saved as `save_dir/encoder_<col>.joblib`; `save_dir` is
    created if missing. In 'test' mode the saved encoders are loaded instead.
    Encoded columns are float32 and are appended after the remaining columns,
    in their original relative order.

    The input frame is not modified. The encoded columns reuse the input
    index so the final concat stays aligned for any index.

    Args:
        depth_data: Table to encode.
        mode: 'train' to fit and save encoders, 'test' to load them.
        save_dir: Directory holding the encoder files.

    Returns:
        A new frame with the object columns replaced by their codes. When
        there is nothing to encode, a copy of the input is returned.

    Raises:
        AssertionError: if `mode` is not 'train' or 'test'.
    """
    assert mode in ['train', 'test']

    object_columns = depth_data.dtypes.index[depth_data.dtypes==object].to_list()
    if 'date_decision' in object_columns:
        object_columns.remove('date_decision')

    if not object_columns:
        # np.concatenate raises on an empty list; nothing needs encoding.
        return depth_data.copy()

    if mode == 'train':
        save_dir.mkdir(parents=True, exist_ok=True)

    encoded = []
    for col in object_columns:
        values = depth_data[col].values.reshape(-1, 1)
        encoder_path = save_dir.joinpath(f'encoder_{col}.joblib')
        if mode == 'train':
            encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
            encoder.fit(values)
            joblib.dump(encoder, encoder_path)
        else:
            # joblib.load unpickles the file: only load encoders you trust.
            encoder = joblib.load(encoder_path)
        encoded.append(encoder.transform(values).astype(np.float32))

    object_data = pd.DataFrame(
        np.concatenate(encoded, axis=1),
        columns=object_columns,
        index=depth_data.index,
    )
    return pd.concat([depth_data.drop(columns=object_columns), object_data], axis=1)


In [9]:
# Load the base table of the selected split and pin the dtypes of its columns.
base_data = pl.read_parquet(
    COMPETITION_DIR / 'parquet_files' / MODE / f'{MODE}_base.parquet'
).cast(
    dict(
        case_id=pl.Int64,
        date_decision=pl.String,
        MONTH=pl.Int64,
        WEEK_NUM=pl.Int64,
    )
)

print(MODE)
display(base_data)

test


case_id,date_decision,MONTH,WEEK_NUM
i64,str,i64,i64
57543,"""2020-10-06""",202010,92
57549,"""2020-10-06""",202010,92
57551,"""2020-10-06""",202010,92
57552,"""2020-10-07""",202010,92
57569,"""2020-10-06""",202010,92
57630,"""2020-10-06""",202010,92
57631,"""2020-10-06""",202010,92
57632,"""2020-10-06""",202010,92
57633,"""2020-10-06""",202010,92
57634,"""2020-10-06""",202010,92


In [10]:
# Merge every depth (0, 1 and 2) onto the base table.
depth_data = merge_dataset(
    base_data=base_data,
    depth_paths=depth_paths,
    bool_features=bool_features,
    float64_features=float64_features,
    string_features=string_features,
    depth='012',
)
display(depth_data)

loading `static_0`
	test_static_0_0: (10, 168)


	test_static_0_1: (10, 168)
	test_static_0_2: (10, 168)
loading `static_cb_0`
	test_static_cb_0: (10, 53)
loading `applprev_1`
	test_applprev_1_0: (6, 40)
	test_applprev_1_1: (4, 40)
	test_applprev_1_2: (4, 40)
loading `other_1`
	test_other_1: (10, 6)
loading `tax_registry_a_1`
	test_tax_registry_a_1: (2, 4)
loading `tax_registry_b_1`
	test_tax_registry_b_1: (2, 4)
loading `tax_registry_c_1`
	test_tax_registry_c_1: (0, 4)
loading `credit_bureau_a_1`
	test_credit_bureau_a_1_0: (2, 78)
	test_credit_bureau_a_1_1: (2, 78)
	test_credit_bureau_a_1_2: (1, 78)
	test_credit_bureau_a_1_3: (2, 78)
	test_credit_bureau_a_1_4: (1, 78)
loading `credit_bureau_b_1`
	test_credit_bureau_b_1: (5, 44)
loading `deposit_1`
	test_deposit_1: (10, 4)
loading `person_1`
	test_person_1: (3, 36)
loading `debitcard_1`
	test_debitcard_1: (10, 5)
loading `applprev_2`
	test_applprev_2: (4, 4)
loading `person_2`
	test_person_2: (10, 9)
loading `credit_bureau_a_2`
	test_credit_bureau_a_2_0: (1, 17)
	test_credit_bureau_a

Unnamed: 0,case_id,date_decision,MONTH,WEEK_NUM,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applicationcnt_361L,applications30d_658L,...,day_empl_employedfrom_271D,year_openingdate_857D,month_openingdate_857D,day_openingdate_857D,year_empls_employedfrom_796D,month_empls_employedfrom_796D,day_empls_employedfrom_796D,year_pmts_date_1107D,month_pmts_date_1107D,day_pmts_date_1107D
0,57543,2020-10-06,202010,92,,,7637.200195,0.0,0.0,0.0,...,15,0,0,0,0,0,0,0,0,0
1,57549,2020-10-06,202010,92,,,902.600037,0.0,0.0,0.0,...,4,0,0,0,0,0,0,0,0,0
2,57551,2020-10-06,202010,92,,,3610.199951,0.0,0.0,0.0,...,15,0,0,0,0,0,0,0,0,0
3,57552,2020-10-07,202010,92,,,6964.399902,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,57569,2020-10-06,202010,92,,,5553.399902,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
5,57630,2020-10-06,202010,92,,,7404.800293,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
6,57631,2020-10-06,202010,92,,,2872.800049,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
7,57632,2020-10-06,202010,92,,,6225.800293,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0
8,57633,2020-10-06,202010,92,0.0,,7917.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
9,57634,2020-10-06,202010,92,,,5894.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Encode the object columns; encoders live under MODEL_DIR/encoders.
depth_data = encode(depth_data, mode=MODE, save_dir=MODEL_DIR / 'encoders')
display(depth_data)

Unnamed: 0,case_id,date_decision,MONTH,WEEK_NUM,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applicationcnt_361L,applications30d_658L,...,conts_role_79M,empls_economicalst_849M,empls_employer_name_740M,relatedpersons_role_762T,collater_typofvalofguarant_298M,collater_typofvalofguarant_407M,collaterals_typeofguarante_359M,collaterals_typeofguarante_669M,subjectroles_name_541M,subjectroles_name_838M
0,57543,2020-10-06,202010,92,,,7637.200195,0.0,0.0,0.0,...,5.0,6.0,1.0,10.0,1.0,2.0,1.0,3.0,0.0,1.0
1,57549,2020-10-06,202010,92,,,902.600037,0.0,0.0,0.0,...,5.0,6.0,1.0,10.0,2.0,5.0,5.0,5.0,4.0,4.0
2,57551,2020-10-06,202010,92,,,3610.199951,0.0,0.0,0.0,...,5.0,6.0,1.0,10.0,2.0,5.0,5.0,5.0,4.0,4.0
3,57552,2020-10-07,202010,92,,,6964.399902,0.0,0.0,0.0,...,4.0,5.0,0.0,10.0,2.0,5.0,5.0,5.0,4.0,4.0
4,57569,2020-10-06,202010,92,,,5553.399902,0.0,0.0,0.0,...,4.0,5.0,0.0,10.0,2.0,5.0,5.0,5.0,4.0,4.0
5,57630,2020-10-06,202010,92,,,7404.800293,0.0,0.0,0.0,...,4.0,5.0,0.0,10.0,2.0,5.0,5.0,5.0,4.0,4.0
6,57631,2020-10-06,202010,92,,,2872.800049,0.0,0.0,0.0,...,4.0,5.0,0.0,10.0,2.0,5.0,5.0,5.0,4.0,4.0
7,57632,2020-10-06,202010,92,,,6225.800293,0.0,0.0,1.0,...,4.0,5.0,0.0,10.0,2.0,5.0,5.0,5.0,4.0,4.0
8,57633,2020-10-06,202010,92,0.0,,7917.0,0.0,0.0,0.0,...,4.0,5.0,0.0,10.0,2.0,5.0,5.0,5.0,4.0,4.0
9,57634,2020-10-06,202010,92,,,5894.0,0.0,0.0,0.0,...,4.0,5.0,0.0,10.0,2.0,5.0,5.0,5.0,4.0,4.0


In [12]:
# Split the encoded table into model inputs (X) and the id / label / week
# columns (y). `target` only exists in the training split.
if MODE != 'train':
    X = depth_data.drop(columns=['case_id', 'date_decision'])
    y = depth_data.loc[:, ['case_id', 'WEEK_NUM']]
else:
    X = depth_data.drop(columns=['case_id', 'target', 'date_decision'])
    y = depth_data.loc[:, ['case_id', 'target', 'WEEK_NUM']]

In [None]:
# LightGBM training parameters used by `train`.
params = {
    'objective': 'binary',
    # 'binary_logloss' was misspelled as 'binary_logoss', which is not a
    # LightGBM metric name.
    'metric': ['binary_logloss', 'auc', 'average_precision'],
    'verbosity': -1,
    'boosting_type': 'gbdt',
    'seed': 42,
    'max_depth': 5,
    'num_leaves': 31,
}

In [13]:
def train(X, y, n_splits=5):
    """Train one LightGBM model per fold and return out-of-fold predictions.

    Folds come from `StratifiedGroupKFold`, stratified on `y['target']` and
    grouped by `y['WEEK_NUM']`. Each fold's booster is pickled to
    `MODEL_DIR/model_<fold>.pkl`; the directory is created if missing.

    Args:
        X: Feature frame.
        y: Frame with at least the `target` and `WEEK_NUM` columns, row
            aligned with `X`.
        n_splits: Number of folds. The inference cell of this notebook loads
            `model_0` .. `model_4`, so keep both in sync when changing it.

    Returns:
        A float array of length `len(X)` with the validation prediction of
        each row.
    """
    skf = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=42)
    oof = np.zeros(X.shape[0])

    MODEL_DIR.mkdir(parents=True, exist_ok=True)

    for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y['target'], y['WEEK_NUM'])):
        print(f'Fold {fold+1}')

        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        train_data = lgb.Dataset(X_train, label=y_train['target'])
        valid_data = lgb.Dataset(X_valid, label=y_valid['target'])

        model:lgb.Booster = lgb.train(
            params,
            train_data,
            num_boost_round=100_000,
            valid_sets=[train_data, valid_data],
            callbacks=[
                lgb.early_stopping(stopping_rounds=100),
                lgb.log_evaluation(period=100),
            ]
        )

        oof[valid_idx] = model.predict(X_valid)

        # Use a separate name for the handle so `model_path` stays a Path.
        model_path = MODEL_DIR.joinpath(f'model_{fold}.pkl')
        with open(model_path, 'wb') as model_file:
            pickle.dump(model, model_file)

    return oof


def evaluate(y_true:pd.DataFrame, y_pred, idx_valid):
    """Run the project evaluator on the selected rows and return its gini results.

    Args:
        y_true: Ground-truth frame; only the rows at `idx_valid` are used.
        y_pred: Predictions passed to the evaluator unchanged.
        idx_valid: Positional row indices selecting the evaluated rows.

    Returns:
        The two values returned by `Evaluator.plot_gini()`, as a tuple.
    """
    # Imported here rather than at module level; `scripts` is a project package.
    from scripts import data

    truth = y_true.iloc[idx_valid]
    report = data.Evaluator(truth, y_pred, save_path=MODEL_DIR)

    report.plot_pred(is_log=True)
    report.plot_roc()
    gini_by_week, stability_score = report.plot_gini()
    return gini_by_week, stability_score


def predict(
        models: 'List[lgb.Booster]',
        submit: pd.DataFrame,
        X: pd.DataFrame,
        batch_size: int = 128,
    ) -> pd.DataFrame:
    """Average the predictions of several models and store them in `submit`.

    `X` is scored in consecutive row batches. For each batch every model
    predicts with its own `best_iteration`, and the per-row mean over the
    models is written to `submit['score']` as float32.

    Args:
        models: Fitted boosters; must not be empty.
        submit: Frame that receives the `score` column. It is modified in
            place and also returned. Rows are matched to `X` by position.
        X: Feature frame to score.
        batch_size: Number of rows scored per batch; must be at least 1.

    Returns:
        `submit` with the `score` column set.

    Raises:
        ValueError: if `models` is empty or `batch_size` is smaller than 1.
    """
    if len(models) == 0:
        raise ValueError('`models` must contain at least one model')
    if batch_size < 1:
        raise ValueError('`batch_size` must be a positive integer')

    n_rows = len(X)
    probas = np.zeros(n_rows, dtype=np.float32)
    for start in range(0, n_rows, batch_size):
        stop = min(start + batch_size, n_rows)
        batch_X = X.iloc[start:stop]

        for model in models:
            proba = model.predict(batch_X, num_iteration=model.best_iteration)
            probas[start:stop] += proba.astype(np.float32)

        # One collection per batch is enough; the per-model temporaries are
        # released by reference counting as soon as they are rebound.
        del batch_X
        gc.collect()

    probas /= len(models)

    submit['score'] = probas

    return submit

In [16]:
if MODE == 'train':
    # Train the fold models, then evaluate the out-of-fold predictions on all rows.
    oof = train(X, y)
    df_gini_weeks, stability = evaluate(y, oof, np.arange(X.shape[0]))
    display(df_gini_weeks)
    print(stability)
else:
    # Load the five fold models written by `train`.
    # NOTE: pickle.load can execute arbitrary code; only load model files
    # that you produced yourself.
    models = []
    for i in range(5):
        model_path = MODEL_DIR.joinpath(f'model_{i}.pkl')
        with open(model_path, 'rb') as model_file:
            model = pickle.load(model_file)
            models.append(model)

    # Score the rows of X and key the result by case_id (`y` is row-aligned
    # with X), then attach the scores to the sample submission by case_id
    # rather than by row position, so a different row order in
    # sample_submission.csv cannot mis-assign scores.
    scores = predict(models, y[['case_id']].copy(), X)
    submit = pd.read_csv(COMPETITION_DIR.joinpath('sample_submission.csv'))
    submit = submit.drop(columns='score', errors='ignore').merge(
        scores, on='case_id', how='left', validate='one_to_one'
    )
    submit.to_csv(Path('submission.csv'), index=False)
    display(submit)

Unnamed: 0,case_id,score
0,57543,0.357589
1,57549,0.199786
2,57551,0.074901
3,57552,0.041611
4,57569,0.027802
5,57630,0.072267
6,57631,0.055109
7,57632,0.126449
8,57633,0.096409
9,57634,0.141531
