In [1]:
from collections import OrderedDict
import gc
from itertools import combinations
from pathlib import Path
import pickle
import sys
sys.path.append('..')
from typing import Dict, List, Tuple, Union, Literal
import warnings


import joblib
import lightgbm as lgb
import numpy as np
import optuna
import pandas as pd
import polars as pl
import polars.selectors as cs
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import StratifiedGroupKFold, train_test_split

gc.enable()

In [2]:
# True when running inside a Kaggle kernel; selects the /kaggle/input paths in the next cell.
KAGGLE = False
# Pipeline mode: 'train' fits encoders and models, 'test' loads them and writes a submission.
MODE:Literal['train', 'test'] = 'train'

In [3]:
# Resolve input, model and feature-definition locations for the current environment.
if KAGGLE:
    COMPETITION_DIR = Path('/kaggle/input/home-credit-credit-risk-model-stability')
    MODEL_DIR = Path('/kaggle/input/240407-home-credit-test-submit')
    FEATURE_DTYPE_DIR = Path('/kaggle/input/home-credit-enhanced-feature-definitions')
else:
    COMPETITION_DIR = Path('../../data/inputs')
    MODEL_DIR = Path('../../data/models/lgbm-dart-revised-dataset')
    if not MODEL_DIR.exists():
        MODEL_DIR.mkdir(parents=True)
        print(MODEL_DIR, 'created')
    # Create the encoder directory independently of MODEL_DIR: previously it was
    # only created together with MODEL_DIR, so an existing MODEL_DIR without
    # `encoders/` made the encoder dump in `encode` fail.
    MODEL_DIR.joinpath('encoders').mkdir(exist_ok=True)
    # NOTE(review): locally this points at a single parquet file, while the Kaggle
    # branch points at a dataset directory — confirm pd.read_parquet handles both.
    FEATURE_DTYPE_DIR = Path('../../data/configs/feature_definitions_dtypes_tables.parquet')

In [4]:
# Load the feature dictionary (one row per raw variable with its declared dtype and source tables).
df_features = pd.read_parquet(FEATURE_DTYPE_DIR)
display(df_features)

# `.copy()` gives each subset its own frame, so adding `cast_dtype` below does not
# raise SettingWithCopyWarning and no blanket warning filter is needed.
bool_features = df_features.query('dtype == "Boolean"').copy()
float64_features = df_features.query('dtype == "Float64"').copy()
string_features = df_features.query('dtype == "String"').copy()
date_features = df_features.query('dtype == "Date"')

# Target polars dtype for each group; Float64 variables are stored as Float32.
bool_features['cast_dtype'] = pl.Boolean
float64_features['cast_dtype'] = pl.Float32
string_features['cast_dtype'] = pl.String

cast_features = pd.concat([bool_features, float64_features, string_features])
display(cast_features)

Unnamed: 0,Variable,Description,dtype,tables
0,actualdpd_943P,Days Past Due (DPD) of previous contract (actu...,Float64,"[train_applprev_1_0, train_applprev_1_1]"
1,actualdpdtolerance_344P,DPD of client with tolerance.,Float64,"[train_static_0_0, train_static_0_1]"
2,addres_district_368M,District of the person's address.,String,[train_person_2]
3,addres_role_871L,Role of person's address.,String,[train_person_2]
4,addres_zip_823M,Zip code of the address.,String,[train_person_2]
...,...,...,...,...
460,totinstallast1m_4525188A,Total amount of monthly instalments paid in th...,Float64,"[train_static_0_0, train_static_0_1]"
461,twobodfilling_608L,Type of application process.,String,"[train_static_0_0, train_static_0_1]"
462,type_25L,Contact type of a person.,String,[train_person_1]
463,typesuite_864L,Persons accompanying the client during the loa...,String,"[train_static_0_0, train_static_0_1]"


Unnamed: 0,Variable,Description,dtype,tables,cast_dtype
78,contaddr_matchlist_1032L,Indicates whether the contact address is found...,Boolean,[train_person_1],Boolean
79,contaddr_smempladdr_334L,Indicates whether the contact address is the s...,Boolean,[train_person_1],Boolean
174,equalitydataagreement_891L,Flag indicating sudden changes in client's soc...,Boolean,"[train_static_0_0, train_static_0_1]",Boolean
175,equalityempfrom_62L,Flag indicating a sudden change in the client'...,Boolean,"[train_static_0_0, train_static_0_1]",Boolean
219,isbidproduct_1095L,Flag indicating if the product is a cross-sell.,Boolean,"[train_static_0_0, train_static_0_1]",Boolean
...,...,...,...,...,...
444,subjectroles_name_541M,Name of subject role in closed credit contract...,String,"[train_credit_bureau_a_2_0, train_credit_burea...",String
445,subjectroles_name_838M,Name of subject role in active credit contract...,String,"[train_credit_bureau_a_2_0, train_credit_burea...",String
461,twobodfilling_608L,Type of application process.,String,"[train_static_0_0, train_static_0_1]",String
462,type_25L,Contact type of a person.,String,[train_person_1],String


In [5]:
def get_depth_paths(load_dir: Path, prefix: Literal['test', 'train']):
    """Collect the parquet files of every known table, grouped by table key.

    Args:
        load_dir: Directory that contains `parquet_files/<prefix>/`.
        prefix: Either 'train' or 'test'; selects the sub-directory and the
            file-name prefix.

    Returns:
        An OrderedDict mapping each table key (e.g. 'static_0', 'person_1') to
        the sorted list of paths matching `<prefix>_<key>*.parquet`. Keys with
        no matching file map to an empty list. The last character of each key
        is the table depth ('0', '1' or '2').
    """
    assert prefix in ['test', 'train']

    # Insertion order is the order in which tables are later merged.
    table_keys = (
        'static_0',
        'static_cb_0',
        'applprev_1',
        'other_1',
        'tax_registry_a_1',
        'tax_registry_b_1',
        'tax_registry_c_1',
        'credit_bureau_a_1',
        'credit_bureau_b_1',
        'deposit_1',
        'person_1',
        'debitcard_1',
        'applprev_2',
        'person_2',
        'credit_bureau_a_2',
        'credit_bureau_b_2',
    )

    parquet_dir = load_dir.joinpath(f'parquet_files/{prefix}')
    depth_paths = OrderedDict()
    for key in table_keys:
        depth_paths[key] = sorted(parquet_dir.glob(f'{prefix}_{key}*.parquet'))
    return depth_paths

In [6]:
# Collect the parquet paths for the active MODE, grouped by table key.
depth_paths = get_depth_paths(COMPETITION_DIR, MODE)

# Total file count across all table keys — a quick sanity check on the input directory.
print(f'the number of {MODE} paths: {sum(len(v1) for v1 in depth_paths.values())}')

the number of train paths: 31


In [7]:
def _pairwise_diff_sum(columns: List[str], available) -> list:
    """Build `diff_`/`sum_` expressions for every pair of `columns` found in `available`.

    Columns missing from `available` are skipped so the caller can run with a
    subset of tables loaded. Pair order follows `itertools.combinations`, and
    for each pair the `diff_` expression precedes the `sum_` expression.
    """
    present = [c for c in columns if c in available]
    exprs = []
    for col1, col2 in combinations(present, 2):
        exprs.append((pl.col(col1) - pl.col(col2)).alias(f'diff_{col1}_{col2}'))
        exprs.append((pl.col(col1) + pl.col(col2)).alias(f'sum_{col1}_{col2}'))
    return exprs


def merge_dataset(
        base_data: pl.DataFrame,
        depth_paths: Dict[str, List[Path]],
        depth: str = '012',
    ) -> pd.DataFrame:
    """Join the selected depth tables onto the base table and add engineered features.

    For each table key in `depth_paths` (whose last character is its depth):
    every parquet file is read, high-cardinality text columns are dropped, raw
    dtypes are normalised via `cast_dtypes`, depth-1 tables are aggregated per
    `case_id`, depth-2 tables are aggregated per (`case_id`, `num_group1`) and
    then per `case_id`, floats/ints are downcast, columns get a `_<depth>`
    suffix, and `num_group*` columns are removed. The per-file results are
    concatenated and left-joined onto `base_data` on `case_id`.

    Afterwards pairwise `diff_`/`sum_` features are added for a fixed list of
    depth-0 columns, `process_date` is applied, and the result is converted
    to pandas.

    Args:
        base_data: Base table containing at least `case_id` and `date_decision`.
        depth_paths: Mapping of table key to parquet paths, as returned by
            `get_depth_paths`.
        depth: '0', '1' or '2' to load only that depth, or '012' for all.

    Returns:
        The merged feature table as a pandas DataFrame.
    """
    assert depth in ['0', '1', '2', '012']

    # Free-text / high-cardinality columns that are removed before aggregation.
    drop_features = [
        'employername_160M',
        'name_4527232M',
        'name_4917606M',
        'empls_employer_name_740M',
        'registaddr_zipcode_184M',
        'contaddr_zipcode_807M',
        'empladdr_zipcode_114M',
        'profession_152M',
        'addres_zip_823M',
    ]

    # Aggregations selected by the column-name suffix.
    aggs = [
        cs.ends_with('P', 'A', 'D', 'M', 'T', 'L').last().name.prefix('last_'),
        cs.ends_with('P', 'A', 'D', 'M', 'T', 'L').max().name.prefix('max_'),
        cs.ends_with('P', 'A', 'D').mean().name.prefix('mean_'),
    ]

    for i, (k, path_list) in enumerate(depth_paths.items()):
        table_depth = k[-1]
        if depth != '012' and depth != table_depth:
            continue

        print(f'loading `{k}`')
        depth_data = []
        for p in path_list:
            sub_data = pl.read_parquet(p).cast({'case_id': pl.Int64})

            # Drop only the columns this table actually has: most tables contain
            # none of them, and strict `drop` raises on unknown column names.
            sub_data = sub_data.drop([c for c in drop_features if c in sub_data.columns])

            # Cast raw data dtypes.
            sub_data = sub_data.pipe(cast_dtypes)

            # Aggregate to one row per case_id (depth 0 is already one row per case).
            if table_depth == '1':
                sub_data = sub_data.group_by('case_id').agg(aggs).sort('case_id')
            elif table_depth == '2':
                sub_data = (
                    sub_data
                    .group_by(['case_id', 'num_group1']).agg(aggs)
                    .group_by('case_id').agg(aggs)
                    .sort('case_id')
                )

            # Downcast aggregated data to 32-bit; case_id stays Int64 as the join key.
            sub_data = sub_data.with_columns(
                pl.col(pl.Float32, pl.Float64).cast(pl.Float32),
                pl.col(pl.Int32, pl.Int64).exclude('case_id').cast(pl.Int32)
            )

            # Suffix feature columns with the table depth.
            sub_data = sub_data.rename(lambda c: rename_column(c, table_depth))

            # Drop num_groupN columns.
            sub_data = sub_data.drop([col for col in sub_data.columns if 'num_group' in col])

            depth_data.append(sub_data)
            print(f'\t{sub_data.shape}')

            del sub_data
            gc.collect()

        depth_data = pl.concat(depth_data, how='vertical_relaxed')
        base_data = base_data.join(depth_data, how='left', on='case_id', suffix=f'_{i}')

        del depth_data
        gc.collect()

    # Pairwise difference/sum features between depth-0 columns, grouped by suffix type.
    depth_0_P_high_fimp_features = [
        'avgdpdtolclosure24_3658938P_0',
        'maxdbddpdtollast12m_3658940P_0',
        'maxdpdlast3m_392P_0',
    ]

    depth_0_A_high_fimp_features = [
        'price_1097A_0',
        'pmtssum_45A_0',
        'annuity_780A_0',
        'credamount_770A_0',
    ]

    depth_0_L_high_fimp_features = [
        'pmtnum_254L_0',
        'mobilephncnt_593L_0',
        'days180_256L_0',
        'days120_123L_0',
        'eir_270L_0',
        'numrejects9m_859L_0',
        'isbidproduct_1095L_0',
        'days90_310L_0',
        'days360_512L_0',
        'pctinstlsallpaidlate1d_3546856L_0',
        'numinstpaidearly3d_3546850L_0',
        'pmtscount_423L_0',
        'numinstunpaidmax_3546851L_0',
        'cntpmts24_3658933L_0',
        'numinstlsallpaid_934L_0',
    ]

    # Only pair up columns that were actually merged: with depth='1' or '2' the
    # depth-0 columns are absent and referencing them would raise.
    available = set(base_data.columns)
    pair_exprs = [
        *_pairwise_diff_sum(depth_0_P_high_fimp_features, available),
        *_pairwise_diff_sum(depth_0_A_high_fimp_features, available),
        *_pairwise_diff_sum(depth_0_L_high_fimp_features, available),
    ]
    base_data = base_data.with_columns(pair_exprs)

    # Derive calendar features from date_decision and convert other dates to day offsets.
    base_data = base_data.pipe(process_date)

    # Convert the polars DataFrame into a pandas DataFrame.
    base_data = base_data.to_pandas()

    return base_data


def cast_dtypes(df: pl.DataFrame):
    """Normalise raw column dtypes based on the column-name suffix.

    Columns ending in 'P' or 'A' become Float64, columns ending in 'M' become
    String, and columns ending in 'D' become Date. Other columns are untouched.
    """
    # The three suffix groups are disjoint, so one with_columns call is enough.
    return df.with_columns(
        cs.ends_with('P', 'A').cast(pl.Float64),
        cs.ends_with('M').cast(pl.String),
        cs.ends_with('D').cast(pl.Date),
    )


def rename_column(column: str, depth):
    """Return `column` suffixed with `_<depth>`, leaving key columns unchanged.

    Args:
        column: A single column name.
        depth: Table depth appended as the suffix (formatted with `str`).

    Returns:
        `column` itself for 'case_id', 'num_group1' and 'num_group2';
        otherwise `f'{column}_{depth}'`.
    """
    if column in ('case_id', 'num_group1', 'num_group2'):
        return column
    return f'{column}_{depth}'
    

def process_date(df: pl.DataFrame):
    """Add calendar features and turn Date columns into offsets from `date_decision`.

    Adds `year_`, `month_`, `day_` and `weekday_` columns derived from
    `date_decision`, then replaces every other Date-typed column with the
    number of days between it and `date_decision`, stored as Float32.
    `date_decision` itself is kept as a date.
    """
    decision = pl.col('date_decision')

    df = df.with_columns(
        decision.dt.year().cast(pl.Int16).alias('year_date_decision'),
        decision.dt.month().cast(pl.Int8).alias('month_date_decision'),
        decision.dt.day().cast(pl.Int8).alias('day_date_decision'),
        decision.dt.weekday().cast(pl.Int8).alias('weekday_date_decision'),
    )

    # Every Date column except the reference date itself.
    D_features = [
        name for name in df.select(pl.col(pl.Date)).columns
        if name != 'date_decision'
    ]
    day_offsets = (pl.col(D_features) - decision).dt.total_days().cast(pl.Float32)

    return df.with_columns(day_offsets)

In [8]:
def encode(depth_data:pd.DataFrame, mode:Literal['train', 'test'], save_dir:Path) -> pd.DataFrame:
    """Ordinal-encode every object-dtype column (except `date_decision`).

    In 'train' mode one `OrdinalEncoder` per column is fitted and saved as
    `save_dir/encoder_<col>.joblib`; in 'test' mode the saved encoder is
    loaded instead. Categories unseen at fit time are encoded as -1.

    Note: the object columns are dropped from `depth_data` in place (to avoid
    holding a second copy of a large frame), so the caller's frame is modified.

    Args:
        depth_data: Merged feature table.
        mode: 'train' to fit and persist encoders, 'test' to reuse them.
        save_dir: Directory holding the encoder files; created in 'train' mode
            if missing.

    Returns:
        A DataFrame with the non-object columns followed by the encoded
        columns as float32, sharing the index of `depth_data`.
    """
    assert mode in ['train', 'test']

    object_columns = depth_data.dtypes.index[depth_data.dtypes==object].to_list()
    if 'date_decision' in object_columns:
        object_columns.remove('date_decision')

    if mode == 'train':
        print(f'num of object columns: {len(object_columns)}')
        save_dir.mkdir(parents=True, exist_ok=True)

    # Nothing to encode: np.concatenate would fail on an empty list.
    if not object_columns:
        return depth_data

    encoded = []
    for col in object_columns:
        values = depth_data[col].values.reshape(-1, 1)
        encoder_path = save_dir.joinpath(f'encoder_{col}.joblib')
        if mode == 'train':
            encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
            encoder.fit(values)
            joblib.dump(encoder, encoder_path)
        else:
            # joblib files are pickles: only load encoders produced by this notebook.
            encoder = joblib.load(encoder_path)
        encoded.append(encoder.transform(values).astype(np.float32))

    depth_data.drop(columns=object_columns, inplace=True)

    # Reuse the input index so the column-wise concat aligns row by row even
    # when `depth_data` does not carry a default RangeIndex.
    object_data = pd.DataFrame(
        np.concatenate(encoded, axis=1),
        columns=object_columns,
        index=depth_data.index,
    )

    return pd.concat([depth_data, object_data], axis=1)

In [9]:
# Read the base table for the active MODE and pin the key, date and time-index
# columns to fixed dtypes so later joins and date arithmetic see consistent types.
base_data = pl.read_parquet(
    COMPETITION_DIR.joinpath(f'parquet_files/{MODE}/{MODE}_base.parquet')
).cast(
    {
        'case_id': pl.Int64,
        'date_decision': pl.Date,
        'MONTH': pl.Int64,
        'WEEK_NUM': pl.Int64,
    }
)

print(MODE)
display(base_data)

train


case_id,date_decision,MONTH,WEEK_NUM,target
i64,date,i64,i64,i64
0,2019-01-03,201901,0,0
1,2019-01-03,201901,0,0
2,2019-01-04,201901,0,0
3,2019-01-03,201901,0,0
4,2019-01-04,201901,0,1
…,…,…,…,…
2703450,2020-10-05,202010,91,0
2703451,2020-10-05,202010,91,0
2703452,2020-10-05,202010,91,0
2703453,2020-10-05,202010,91,0


In [10]:
# Join all depth-0/1/2 tables onto the base table and add the engineered
# features; the result is a pandas DataFrame.
depth_data = merge_dataset(
    base_data,
    depth_paths,
    '012'
)
display(depth_data)

loading `static_0`
	(1003757, 168)
	(522902, 168)
loading `static_cb_0`
	(1500476, 53)
loading `applprev_1`
	(782997, 97)
	(438525, 97)
loading `other_1`
	(51109, 16)
loading `tax_registry_a_1`
	(457934, 7)
loading `tax_registry_b_1`
	(150732, 7)
loading `tax_registry_c_1`
	(482265, 7)
loading `credit_bureau_a_1`
	(335275, 193)
	(549263, 193)
	(325127, 193)
	(176608, 193)
loading `credit_bureau_b_1`
	(36500, 110)
loading `deposit_1`
	(105111, 10)
loading `person_1`
	(1526659, 69)
loading `debitcard_1`
	(111772, 13)
loading `applprev_2`
	(1221522, 13)
loading `person_2`
	(1435105, 30)
loading `credit_bureau_a_2`
	(98303, 85)
	(118481, 85)
	(23734, 85)
	(156749, 85)
	(190486, 85)
	(190313, 85)
	(231250, 85)
	(150426, 85)
	(45056, 85)
	(77457, 85)
	(103033, 85)
loading `credit_bureau_b_2`
	(36447, 28)


Unnamed: 0,case_id,date_decision,MONTH,WEEK_NUM,target,actualdpdtolerance_344P_0,amtinstpaidbefduel24m_4187115A_0,annuity_780A_0,annuitynextmonth_57A_0,applicationcnt_361L_0,...,diff_numinstunpaidmax_3546851L_0_cntpmts24_3658933L_0,sum_numinstunpaidmax_3546851L_0_cntpmts24_3658933L_0,diff_numinstunpaidmax_3546851L_0_numinstlsallpaid_934L_0,sum_numinstunpaidmax_3546851L_0_numinstlsallpaid_934L_0,diff_cntpmts24_3658933L_0_numinstlsallpaid_934L_0,sum_cntpmts24_3658933L_0_numinstlsallpaid_934L_0,year_date_decision,month_date_decision,day_date_decision,weekday_date_decision
0,0,2019-01-03,201901,0,0,,,1917.599976,0.000000,0.0,...,,,,,,,2019,1,3,4
1,1,2019-01-03,201901,0,0,,,3134.000000,0.000000,0.0,...,,,,,,,2019,1,3,4
2,2,2019-01-04,201901,0,0,,,4937.000000,0.000000,0.0,...,,,,,,,2019,1,4,5
3,3,2019-01-03,201901,0,0,,,4643.600098,0.000000,0.0,...,,,,,,,2019,1,3,4
4,4,2019-01-04,201901,0,1,,,3390.199951,0.000000,0.0,...,,,,,,,2019,1,4,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1526654,2703450,2020-10-05,202010,91,0,0.0,176561.359375,3675.400146,0.000000,0.0,...,-24.0,24.0,-112.0,112.0,-88.0,136.0,2020,10,5,1
1526655,2703451,2020-10-05,202010,91,0,0.0,301276.468750,7088.600098,6191.600098,0.0,...,-10.0,32.0,-62.0,84.0,-52.0,94.0,2020,10,5,1
1526656,2703452,2020-10-05,202010,91,0,0.0,14232.400391,7788.800293,0.000000,0.0,...,-7.0,7.0,-6.0,6.0,1.0,13.0,2020,10,5,1
1526657,2703453,2020-10-05,202010,91,0,0.0,197371.578125,1195.400024,2827.199951,0.0,...,-6.0,40.0,-72.0,106.0,-66.0,112.0,2020,10,5,1


In [11]:
# Ordinal-encode the object-dtype columns. In 'train' mode the fitted encoders
# are saved under MODEL_DIR/encoders; in 'test' mode they are loaded from there.
depth_data = encode(depth_data, MODE, MODEL_DIR.joinpath('encoders'))
display(depth_data)

num of object columns: 202
num of object columns: 202


Unnamed: 0,case_id,date_decision,MONTH,WEEK_NUM,target,actualdpdtolerance_344P_0,amtinstpaidbefduel24m_4187115A_0,annuity_780A_0,annuitynextmonth_57A_0,applicationcnt_361L_0,...,max_last_collaterals_typeofguarante_359M_2,max_last_collaterals_typeofguarante_669M_2,max_last_subjectroles_name_541M_2,max_last_subjectroles_name_838M_2,max_max_collater_typofvalofguarant_298M_2,max_max_collater_typofvalofguarant_407M_2,max_max_collaterals_typeofguarante_359M_2,max_max_collaterals_typeofguarante_669M_2,max_max_subjectroles_name_541M_2,max_max_subjectroles_name_838M_2
0,0,2019-01-03,201901,0,0,,,1917.599976,0.000000,0.0,...,5.0,4.0,4.0,1.0,2.0,5.0,5.0,5.0,4.0,4.0
1,1,2019-01-03,201901,0,0,,,3134.000000,0.000000,0.0,...,5.0,4.0,4.0,1.0,2.0,5.0,5.0,5.0,4.0,4.0
2,2,2019-01-04,201901,0,0,,,4937.000000,0.000000,0.0,...,5.0,4.0,4.0,1.0,2.0,5.0,5.0,5.0,4.0,4.0
3,3,2019-01-03,201901,0,0,,,4643.600098,0.000000,0.0,...,5.0,4.0,4.0,1.0,2.0,5.0,5.0,5.0,4.0,4.0
4,4,2019-01-04,201901,0,1,,,3390.199951,0.000000,0.0,...,5.0,4.0,4.0,1.0,2.0,5.0,5.0,5.0,4.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1526654,2703450,2020-10-05,202010,91,0,0.0,176561.359375,3675.400146,0.000000,0.0,...,1.0,1.0,0.0,0.0,1.0,2.0,3.0,3.0,1.0,1.0
1526655,2703451,2020-10-05,202010,91,0,0.0,301276.468750,7088.600098,6191.600098,0.0,...,1.0,1.0,0.0,0.0,1.0,2.0,3.0,3.0,1.0,1.0
1526656,2703452,2020-10-05,202010,91,0,0.0,14232.400391,7788.800293,0.000000,0.0,...,1.0,1.0,0.0,0.0,1.0,2.0,3.0,3.0,1.0,1.0
1526657,2703453,2020-10-05,202010,91,0,0.0,197371.578125,1195.400024,2827.199951,0.0,...,1.0,1.0,0.0,0.0,1.0,2.0,3.0,3.0,1.0,1.0


In [12]:
# Column count per dtype after encoding.
depth_data.dtypes.value_counts()

float32           1117
int64                4
int8                 3
datetime64[ms]       1
bool                 1
int16                1
Name: count, dtype: int64

In [13]:
# Split the merged table into model inputs `X` and a metadata frame `y`.
# `y` keeps case_id and WEEK_NUM next to the target (WEEK_NUM is used as the CV group).
# NOTE(review): `X` still contains MONTH and WEEK_NUM as model inputs — confirm that is intended.
if MODE == 'train':
    X, y= (
        depth_data.drop(columns=['case_id', 'target', 'date_decision']),
        depth_data[['case_id', 'target', 'WEEK_NUM']],
    )
else:
    # No target column exists in test mode.
    X = depth_data.drop(columns=['case_id', 'date_decision'])
    y = depth_data[['case_id', 'WEEK_NUM']]

In [14]:
# Preview the feature matrix.
X

Unnamed: 0,MONTH,WEEK_NUM,actualdpdtolerance_344P_0,amtinstpaidbefduel24m_4187115A_0,annuity_780A_0,annuitynextmonth_57A_0,applicationcnt_361L_0,applications30d_658L_0,applicationscnt_1086L_0,applicationscnt_464L_0,...,max_last_collaterals_typeofguarante_359M_2,max_last_collaterals_typeofguarante_669M_2,max_last_subjectroles_name_541M_2,max_last_subjectroles_name_838M_2,max_max_collater_typofvalofguarant_298M_2,max_max_collater_typofvalofguarant_407M_2,max_max_collaterals_typeofguarante_359M_2,max_max_collaterals_typeofguarante_669M_2,max_max_subjectroles_name_541M_2,max_max_subjectroles_name_838M_2
0,201901,0,,,1917.599976,0.000000,0.0,0.0,0.0,0.0,...,5.0,4.0,4.0,1.0,2.0,5.0,5.0,5.0,4.0,4.0
1,201901,0,,,3134.000000,0.000000,0.0,0.0,0.0,0.0,...,5.0,4.0,4.0,1.0,2.0,5.0,5.0,5.0,4.0,4.0
2,201901,0,,,4937.000000,0.000000,0.0,0.0,0.0,0.0,...,5.0,4.0,4.0,1.0,2.0,5.0,5.0,5.0,4.0,4.0
3,201901,0,,,4643.600098,0.000000,0.0,1.0,0.0,2.0,...,5.0,4.0,4.0,1.0,2.0,5.0,5.0,5.0,4.0,4.0
4,201901,0,,,3390.199951,0.000000,0.0,1.0,0.0,0.0,...,5.0,4.0,4.0,1.0,2.0,5.0,5.0,5.0,4.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1526654,202010,91,0.0,176561.359375,3675.400146,0.000000,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,1.0,2.0,3.0,3.0,1.0,1.0
1526655,202010,91,0.0,301276.468750,7088.600098,6191.600098,0.0,0.0,5.0,0.0,...,1.0,1.0,0.0,0.0,1.0,2.0,3.0,3.0,1.0,1.0
1526656,202010,91,0.0,14232.400391,7788.800293,0.000000,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,1.0,2.0,3.0,3.0,1.0,1.0
1526657,202010,91,0.0,197371.578125,1195.400024,2827.199951,0.0,0.0,36.0,0.0,...,1.0,1.0,0.0,0.0,1.0,2.0,3.0,3.0,1.0,1.0


In [15]:
# Hyperparameters were tuned with Optuna, following
# https://zenn.dev/nishimoto/articles/8d575924cc619d

params = {
    'objective': 'binary',
    'metric': ['binary_logloss', 'auc', 'average_precision'],
    'verbosity': -1,
    'boosting_type': 'dart',
    'seed': 42,
    'max_depth': 5,
    # the below params are tuned by optuna
    'feature_fraction': 0.9992344024268899,
    'lambda_l1': 9.52345545278879,
    'lambda_l2': 0.16866740544791267,
    # This key used to appear twice (31, then 206); the later value won, so 206 is kept.
    # NOTE(review): with max_depth=5 a tree can hold at most 2**5 = 32 leaves,
    # so 206 presumably never binds — confirm the intended combination.
    'num_leaves': 206,
    'bagging_fraction': 0.9781735248476767,
    'bagging_freq': 6,
    'drop_rate': 0.255100360976506,
    'skip_drop': 0.45682901135353887,
}

In [16]:
def log_loss(y_true: np.ndarray, y_pred: np.ndarray, eps: float = 1e-15) -> float:
    """Return the mean binary cross-entropy between labels and probabilities.

    Args:
        y_true: Array-like of 0/1 labels.
        y_pred: Array-like of predicted probabilities for the positive class.
        eps: Predictions are clipped to [eps, 1 - eps] so that a probability
            of exactly 0 or 1 does not produce log(0) = -inf (or NaN).

    Returns:
        The average log loss as a Python float.
    """
    labels = np.asarray(y_true, dtype=np.float64)
    probs = np.clip(np.asarray(y_pred, dtype=np.float64), eps, 1 - eps)
    return float(-np.mean(labels * np.log(probs) + (1 - labels) * np.log(1 - probs)))

def objective(trial: optuna.Trial, X: pd.DataFrame, y: pd.DataFrame):
    """Optuna objective: out-of-fold log loss of a GBDT model under 5-fold grouped CV.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X: Feature matrix.
        y: Frame with a 'target' column (labels) and a 'WEEK_NUM' column
            (used as the group for `StratifiedGroupKFold`).

    Returns:
        The log loss of the out-of-fold predictions over all rows.
    """
    params = {
        'objective': 'binary',
        'metric': ['binary_logloss', 'auc', 'average_precision'],
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'seed': 42,
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        # `step` is passed by keyword, matching `objective_no_cv`.
        'num_leaves': trial.suggest_int('num_leaves', 12, 256, step=2),
        # The key was misspelled 'baggig_fraction', so the sampled value never
        # reached LightGBM under its real parameter name.
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
    }

    n_splits = 5
    skf = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=42)
    oof = np.zeros(X.shape[0])

    for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y['target'], y['WEEK_NUM'])):
        print(f'Fold {fold+1}')

        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        train_data = lgb.Dataset(X_train, label=y_train['target'])
        valid_data = lgb.Dataset(X_valid, label=y_valid['target'])

        # A large round budget; early stopping on the validation set ends training.
        model:lgb.Booster = lgb.train(
            params,
            train_data,
            num_boost_round=100_000,
            valid_sets=[train_data, valid_data],
            callbacks=[
                lgb.early_stopping(stopping_rounds=100),
            ]
        )

        oof[valid_idx] = model.predict(X_valid)

    return log_loss(y['target'], oof)

def objective_no_cv(trial: optuna.Trial, train_data: lgb.Dataset, valid_data: lgb.Dataset):
    """Optuna objective: validation log loss of a DART model on a single hold-out split.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        train_data: LightGBM training dataset.
        valid_data: LightGBM validation dataset.

    Returns:
        The 'binary_logloss' that `lgb.train` records for the validation set
        ('valid_1') after the fixed 1,000 boosting rounds.
    """
    params = {
        'objective': 'binary',
        'metric': ['binary_logloss', 'auc', 'average_precision'],
        'verbosity': -1,
        'boosting_type': 'dart',
        'seed': 42,
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 12, 256, step=2),
        # The key was misspelled 'baggig_fraction', so the sampled value never
        # reached LightGBM under its real parameter name.
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        # only for dart
        'drop_rate': trial.suggest_float('drop_rate', 0.1, 0.5),
        'skip_drop': trial.suggest_float('skip_drop', 0.1, 0.5),
    }

    # No early-stopping callback is used here; training runs the full round budget.
    model = lgb.train(
        params,
        train_data,
        num_boost_round=1_000,
        valid_sets=[train_data, valid_data],
        callbacks=[
            lgb.log_evaluation(period=100),
        ]
    )

    # Named `valid_logloss` so it does not shadow the module-level `log_loss` function.
    valid_logloss = model.best_score['valid_1']['binary_logloss']

    return valid_logloss
    

def optimize(X: pd.DataFrame, y: pd.DataFrame, cv: bool = True, reduce_rate:float=0.99)-> optuna.study.Study:
    """Run 100 Optuna trials and pickle the resulting study.

    The study is stored in (and resumed from) `MODEL_DIR/study.db`.

    Args:
        X: Feature matrix.
        y: Frame with at least a 'target' column.
        cv: If True, each trial is scored with `objective` (grouped CV).
            If False, each trial is scored with `objective_no_cv` on a
            stratified 80/20 split.
        reduce_rate: Only used when `cv` is False. Fraction of rows discarded
            (stratified on the target) before the 80/20 split; 0 keeps all rows.

    Returns:
        The optimised study, also saved to `MODEL_DIR/study.pkl`.
    """
    study = optuna.create_study(
        direction='minimize',
        study_name='lgbm-group-tune',
        storage=f'sqlite:///{MODEL_DIR}/study.db',
        load_if_exists=True,
    )

    if not cv:
        print(f'{(1 - reduce_rate)*100:.2f}% of data will be used in tuning.')
        if reduce_rate == 0:
            X_sub, y_sub = X, y
        else:
            # Keep the "train" side of the split; the discarded part is `reduce_rate` of the rows.
            X_sub, _, y_sub, _ = train_test_split(
                X, y, test_size=reduce_rate, random_state=42, stratify=y['target']
            )
        X_train, X_valid, y_train, y_valid = train_test_split(
            X_sub, y_sub, test_size=0.2, random_state=42, stratify=y_sub['target']
        )
        train_data = lgb.Dataset(X_train, label=y_train['target'])
        valid_data = lgb.Dataset(X_valid, label=y_valid['target'])
        study.optimize(lambda trial: objective_no_cv(trial, train_data, valid_data), n_trials=100)
    else:
        study.optimize(lambda trial: objective(trial, X, y), n_trials=100)

    # Persist the study object alongside the sqlite storage.
    with open(MODEL_DIR.joinpath('study.pkl'), 'wb') as study_file:
        pickle.dump(study, study_file)

    return study

def train(X, y):
    """Train one LightGBM model per CV fold with the module-level `params`.

    Folds come from a 5-fold `StratifiedGroupKFold` stratified on
    `y['target']` and grouped by `y['WEEK_NUM']`. Each fold's booster is
    pickled to `MODEL_DIR/model_<fold>.pkl` (0-based fold index).

    Returns:
        Array of out-of-fold predictions, one entry per row of `X`.
    """
    splitter = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)
    oof = np.zeros(X.shape[0])

    fold_indices = splitter.split(X, y['target'], y['WEEK_NUM'])
    for fold, (train_idx, valid_idx) in enumerate(fold_indices):
        print(f'Fold {fold+1}')

        X_valid = X.iloc[valid_idx]
        train_data = lgb.Dataset(X.iloc[train_idx], label=y.iloc[train_idx]['target'])
        valid_data = lgb.Dataset(X_valid, label=y.iloc[valid_idx]['target'])

        booster = lgb.train(
            params,
            train_data,
            num_boost_round=1_000,
            valid_sets=[train_data, valid_data],
            callbacks=[
                lgb.early_stopping(stopping_rounds=100),
                lgb.log_evaluation(period=1),
            ]
        )

        oof[valid_idx] = booster.predict(X_valid)

        # Save the fold model for later inference.
        with open(MODEL_DIR.joinpath(f'model_{fold}.pkl'), 'wb') as model_file:
            pickle.dump(booster, model_file)

    return oof


def evaluate(y_true:pd.DataFrame, y_pred, idx_valid):
    """Plot evaluation charts and return the weekly gini table and stability score.

    Builds a `scripts.data.Evaluator` from the rows of `y_true` selected by
    position with `idx_valid`, together with `y_pred`, and calls its
    prediction, ROC and gini plotting methods in that order.

    Returns:
        The two values returned by `Evaluator.plot_gini()`.
    """
    # Project-local module; resolvable because '..' was appended to sys.path.
    from scripts import data

    valid_rows = y_true.iloc[idx_valid]
    scorer = data.Evaluator(valid_rows, y_pred, save_path=MODEL_DIR)

    scorer.plot_pred(is_log=True)
    scorer.plot_roc()
    gini_by_week, stability_score = scorer.plot_gini()
    return gini_by_week, stability_score


def predict(
        models: "List[lgb.Booster]",
        submit: pd.DataFrame,
        X: pd.DataFrame,
        batch_size: int = 128,
    ) -> pd.DataFrame:
    """Average the predictions of several models and store them in `submit['score']`.

    `X` is scored in row batches of `batch_size` to bound peak memory. Each
    model is called as `model.predict(batch, num_iteration=model.best_iteration)`
    and the per-model outputs are averaged with equal weight.

    Args:
        models: Non-empty sequence of trained boosters.
        submit: Submission frame with one row per row of `X`; it is modified
            in place (a 'score' column is added or overwritten).
        X: Feature matrix, in the same row order as `submit`.
        batch_size: Number of rows scored per `predict` call.

    Returns:
        The same `submit` object with the 'score' column filled in.

    Raises:
        ValueError: If `models` is empty or `submit` and `X` differ in length.
    """
    if len(models) == 0:
        # Previously this fell through to a division by zero and NaN scores.
        raise ValueError('`models` must contain at least one trained model')
    if len(submit) != len(X):
        raise ValueError(
            f'`submit` has {len(submit)} rows but `X` has {len(X)} rows'
        )

    n_rows = len(X)
    probas = np.zeros(n_rows, dtype=np.float32)
    for start in range(0, n_rows, batch_size):
        stop = min(start + batch_size, n_rows)
        batch_X = X.iloc[start:stop]

        # Temporaries are released by reference counting at the next iteration,
        # so no explicit gc.collect() per model and batch is needed.
        for model in models:
            proba = model.predict(batch_X, num_iteration=model.best_iteration)
            probas[start:stop] += np.asarray(proba, dtype=np.float32)

    probas /= len(models)

    submit['score'] = probas

    return submit

In [17]:
# Entry point of the notebook: train + evaluate in 'train' mode, otherwise
# load the saved fold models and write the submission file.
if MODE == 'train':
    # Optional tuning step, currently disabled; when enabled the best Optuna
    # parameters overwrite the matching entries of `params` before training.
    # study = optimize(X, y, cv=False, reduce_rate=0)
    # print(study.best_params)

    # # retrain with the best params
    # params.update(study.best_params)
    oof = train(X, y)
    # Every row has an out-of-fold prediction, so the whole index is evaluated.
    df_gini_weeks, stability = evaluate(y, oof, y.index)
    display(df_gini_weeks)
    display(stability)

else:
    models = []
    # Five fold models, matching the 5 splits used in `train`.
    for i in range(5):
        model_path = MODEL_DIR.joinpath(f'model_{i}.pkl')
        # NOTE: `model_path` is rebound from the Path to the open file handle here.
        # pickle.load executes arbitrary code: only load model files produced by this notebook.
        with open(model_path, 'rb') as model_path:
            model = pickle.load(model_path)
            models.append(model)
    # NOTE(review): scores are assigned positionally — assumes sample_submission.csv
    # lists case_ids in the same order as the rows of X; confirm.
    submit = pd.read_csv(COMPETITION_DIR.joinpath('sample_submission.csv'))
    submit = predict(models, submit, X)
    submit.to_csv(Path('submission.csv'), index=False)
    display(submit)

Fold 1


: 