In [1]:
# import neccessary dependencies
import os
import random
import itertools
import pickle
from pathlib import Path

import polars as pl
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import TargetEncoder, OrdinalEncoder, StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import f_classif, mutual_info_classif

import lightgbm as lgb
import xgboost as xgb
import catboost
from catboost import CatBoostClassifier
import tensorflow as tf

from scipy.stats import linregress
from scipy.stats import ks_2samp, chi2_contingency

2026-01-16 02:10:37.281548: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2026-01-16 02:10:37.322277: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2026-01-16 02:10:38.511589: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.


In [2]:
# random seeding
tf.random.set_seed(0)
random.seed(0)
np.random.seed(0)

In [3]:
# list train files
train_dir='parquet_files/train/'
os.listdir(train_dir)

['train_applprev_1_0.parquet',
 'train_applprev_1_1.parquet',
 'train_applprev_2.parquet',
 'train_base.parquet',
 'train_credit_bureau_a_1_0.parquet',
 'train_credit_bureau_a_1_1.parquet',
 'train_credit_bureau_a_1_2.parquet',
 'train_credit_bureau_a_1_3.parquet',
 'train_credit_bureau_a_2_0.parquet',
 'train_credit_bureau_a_2_1.parquet',
 'train_credit_bureau_a_2_10.parquet',
 'train_credit_bureau_a_2_2.parquet',
 'train_credit_bureau_a_2_3.parquet',
 'train_credit_bureau_a_2_4.parquet',
 'train_credit_bureau_a_2_5.parquet',
 'train_credit_bureau_a_2_6.parquet',
 'train_credit_bureau_a_2_7.parquet',
 'train_credit_bureau_a_2_8.parquet',
 'train_credit_bureau_a_2_9.parquet',
 'train_credit_bureau_b_1.parquet',
 'train_credit_bureau_b_2.parquet',
 'train_debitcard_1.parquet',
 'train_deposit_1.parquet',
 'train_other_1.parquet',
 'train_person_1.parquet',
 'train_person_2.parquet',
 'train_static_0_0.parquet',
 'train_static_0_1.parquet',
 'train_static_cb_0.parquet',
 'train_tax_regis

In [4]:
# wildcard pattern of the the file names for loading data
files=['train_base.parquet',
       'train_person_1.parquet',
       'train_person_2.parquet',                                                                                 
       'train_applprev_1*.parquet',
       'train_applprev_2.parquet',
       'train_credit_bureau_a_1*.parquet',
       'train_credit_bureau_a_2*.parquet',
       'train_credit_bureau_b_1.parquet',
       'train_credit_bureau_b_2.parquet',
       'train_debitcard_1.parquet',
       'train_deposit_1.parquet',
       'train_other_1.parquet',
       'train_static_0*.parquet',
       'train_static_cb_0.parquet',
       'train_tax_registry_a_1.parquet',
       'train_tax_registry_b_1.parquet',
       'train_tax_registry_c_1.parquet']

In [5]:
pl.read_parquet("parquet_files/train/train_base.parquet")

case_id,date_decision,MONTH,WEEK_NUM,target
i64,str,i64,i64,i64
0,"""2019-01-03""",201901,0,0
1,"""2019-01-03""",201901,0,0
2,"""2019-01-04""",201901,0,0
3,"""2019-01-03""",201901,0,0
4,"""2019-01-04""",201901,0,1
…,…,…,…,…
2703450,"""2020-10-05""",202010,91,0
2703451,"""2020-10-05""",202010,91,0
2703452,"""2020-10-05""",202010,91,0
2703453,"""2020-10-05""",202010,91,0


In [6]:
# cast to the correct data type
def cast_data(df):
    for col in df.columns:
        if (col[-1]=='D') | (col=='date_decision'):
            df=df.with_columns(pl.col(col).cast(pl.Date))
        elif df[col].dtype==pl.Boolean:
            df=df.with_columns(pl.col(col).cast(pl.String))
    return df

In [7]:
# aggregate the data (because each observations (customer) in a file may have more than one data row)
def aggregate_data(df):
    base_cols=[col for col in df.columns if col in ['case_id', 'date_decision', 'WEEK_NUM', 'MONTH', 'target']]
    base_agg=[pl.col(col).max() for col in base_cols if col !='case_id']
    
    cat_cols=[col for col in df.columns if (df[col].dtype ==pl.String) | (df[col].dtype==pl.Null) ]
    cat_max=[pl.col(col).max().alias('max_'+col) for col in cat_cols]
    cat_first=[pl.col(col).first().alias('first_'+col) for col in cat_cols]
    cat_last=[pl.col(col).last().alias('last_'+col) for col in cat_cols]
    cat_n_unique=[pl.col(col).n_unique().alias('n_unique_'+col) for col in cat_cols]
    
    d_cols=[col for col in df.columns if (df[col].dtype ==pl.Date)&(col!='date_decision')]
    d_max=[pl.col(col).max().alias('max_'+col) for col in d_cols]
    d_min=[pl.col(col).min().alias('min_'+col) for col in d_cols]
    d_median=[pl.col(col).median().alias('median_'+col) for col in d_cols]

    other_cols=[col for col in df.columns if col not in base_cols+cat_cols+d_cols]
    max_other=[pl.col(col).max().alias('max_'+col) for col in other_cols]
    mean_other=[pl.col(col).mean().alias('mean_'+col) for col in other_cols]
    min_other=[pl.col(col).min().alias('min_'+col) for col in other_cols]
    total=base_agg+cat_max+cat_first+cat_last+cat_n_unique+d_max+d_min+d_median+max_other+mean_other+min_other
    return total

In [8]:
# open, cast, aggregate, and concatinate data
def load_data(files, dir_):
    PATH=Path(dir_)
    for file in files:
        paths=PATH.glob(file)
        chunks=[]
        for path in paths:
            print(path)
            df=pl.read_parquet(path)
            df=cast_data(df)
            if ('1' in file)|('2' in file): 
                df=df.group_by('case_id').agg(aggregate_data(df))
            chunks.append(df)        
        df=pl.concat(chunks, how='vertical_relaxed')
        if file in ['train_base.parquet','test_base.parquet']:
            base=df
            name=file
        else:
            for col in df.columns:
                if 'num_group' in col:
                    if name=='train_base.parquet':
                        df=df.rename({col:col+file[5:]})
                    if name=='test_base.parquet':
                        df=df.rename({col:col+file[4:]})        
            base=base.join(df, on='case_id', how='left')
    return base

In [9]:
df=load_data(files, train_dir)

parquet_files/train/train_base.parquet
parquet_files/train/train_person_1.parquet
parquet_files/train/train_person_2.parquet
parquet_files/train/train_applprev_1_0.parquet
parquet_files/train/train_applprev_1_1.parquet
parquet_files/train/train_applprev_2.parquet
parquet_files/train/train_credit_bureau_a_1_0.parquet
parquet_files/train/train_credit_bureau_a_1_1.parquet
parquet_files/train/train_credit_bureau_a_1_2.parquet
parquet_files/train/train_credit_bureau_a_1_3.parquet
parquet_files/train/train_credit_bureau_a_2_0.parquet
parquet_files/train/train_credit_bureau_a_2_1.parquet
parquet_files/train/train_credit_bureau_a_2_10.parquet
parquet_files/train/train_credit_bureau_a_2_2.parquet
parquet_files/train/train_credit_bureau_a_2_3.parquet
parquet_files/train/train_credit_bureau_a_2_4.parquet
parquet_files/train/train_credit_bureau_a_2_5.parquet
parquet_files/train/train_credit_bureau_a_2_6.parquet
parquet_files/train/train_credit_bureau_a_2_7.parquet
parquet_files/train/train_credit_

In [10]:
# filter duplicated, constant, high_cardinality and high null columns
def filter_cols(df):
    hashes=[df[col].hash().to_numpy().tobytes() for col in df.columns]
    seen_hashes=[]
    duplicated_cols=[]
    for i in range(len(hashes)):
        if hashes[i] not in seen_hashes:
            seen_hashes.append(hashes[i])
        else:
            duplicated_cols.append(df.columns[i])
    print(f'number of duplicated columns is {len(duplicated_cols)}')
    df=df.drop(duplicated_cols)

    constant_cols=[]
    for col in df.columns:
        if df[col].n_unique()<=1:
            constant_cols.append(col)
    print(f'number of constant columns is {len(constant_cols)}')        
    df=df.drop(constant_cols)

    high_cardinality_cols=[]
    for col in df.columns:
        if (df[col].n_unique()>50)&(df[col].dtype==pl.String):
            high_cardinality_cols.append(col)
    print(f'number of high_cardinality columns is {len(high_cardinality_cols)}')        
    df=df.drop(high_cardinality_cols)

    high_null_cols=[]
    for col in df.columns:
        if df[col].null_count()/len(df)>0.95:
            high_null_cols.append(col)
    print(f'number of high null columns is {len(high_null_cols)}')        
    df=df.drop(high_null_cols)
    return df

In [11]:
df=filter_cols(df)

number of duplicated columns is 57
number of constant columns is 3
number of high_cardinality columns is 58
number of high null columns is 237


In [12]:
# optimize RAM used by casting numbers and dates to the minimum data type
def optimize_memory(df):
    for col in df.columns:
        if str(df[col].dtype)[:3]=='Int':
            max_=df[col].max()
            min_=df[col].min()
            if (min_>np.iinfo(np.int8).min) and (max_<np.iinfo(np.int8).max):
                df=df.with_columns(pl.col(col).cast(pl.Int8))
            elif (min_>np.iinfo(np.int16).min) and (max_<np.iinfo(np.int16).max):
                df=df.with_columns(pl.col(col).cast(pl.Int16))
            elif (min_>np.iinfo(np.int32).min) and (max_<np.iinfo(np.int32).max):
                df=df.with_columns(pl.col(col).cast(pl.Int32))
            elif (min_>np.iinfo(np.int64).min) and (max_<np.iinfo(np.int64).max):
                df=df.with_columns(pl.col(col).cast(pl.Int64))
        if str(df[col].dtype)[:3]=='UIn':
            max_=df[col].max()
            min_=df[col].min()
            if (min_>np.iinfo(np.uint8).min) and (max_<np.iinfo(np.uint8).max):
                df=df.with_columns(pl.col(col).cast(pl.UInt8))
            elif (min_>np.iinfo(np.uint16).min) and (max_<np.iinfo(np.uint16).max):
                df=df.with_columns(pl.col(col).cast(pl.UInt16))
            elif (min_>np.iinfo(np.uint32).min) and (max_<np.iinfo(np.uint32).max):
                df=df.with_columns(pl.col(col).cast(pl.UInt32))
        if str(df[col].dtype)[:3]=='Flo':
            max_=df[col].max()
            min_=df[col].min()
            if(min_>np.finfo(np.float32).min) and (max_<np.finfo(np.float32).max):
                df=df.with_columns(pl.col(col).cast(pl.Float32))
            elif(min_>np.finfo(np.float64).min) and (max_<np.finfo(np.float64).max):
                df=df.with_columns(pl.col(col).cast(pl.Float64))
                print(col)
        if df[col].dtype==pl.Datetime:
            df=df.with_columns(pl.col(col).cast(pl.Date))
            
    return df
df=optimize_memory(df)

In [13]:
# handle missing data
def impute_na(df):
    for col in df.columns:
        if df[col].dtype==pl.String:
            df=df.with_columns(pl.col(col).fill_null('None'))
        elif df[col].dtype==pl.Date:
            df=df.with_columns(pl.col(col).fill_null(df[col].median()))
        else:
            df=df.with_columns(pl.col(col).fill_null(df[col].mean()))
    return df
df=impute_na(df)

In [14]:
# convert date columns to numbers
def handle_dates(df):
    for col in df.columns:
        if col[-1]=='D':
            df=df.with_columns((pl.col(col)-pl.col('date_decision')).dt.total_days())
    return df
df=handle_dates(df)    

In [15]:
# create two more features
def feature_eng(df):
    df=df.with_columns(
        month_decision=pl.col('date_decision').dt.month(),
        weekday_decision=pl.col('date_decision').dt.weekday()        
    )
    return df
df=feature_eng(df)

In [16]:
# remove unesscessary columns
df=df.drop('case_id')
df=df.drop('date_decision')

In [17]:
target=df['target']
df=df.drop('target')

In [18]:
df=df.to_pandas()

In [19]:
target=target.to_pandas()

In [20]:
# mark the customers with loan approval from week 78
mask=df['WEEK_NUM']>=78
sum(mask)/len(df)

0.09937189640908677

In [21]:
# split X, y for dev and test set
X=df[mask]
y=target[mask]

In [22]:
groups=X['WEEK_NUM'].unique()

In [23]:
# split X, y into development and test set
X_dev=[]
y_dev=[]
X_test=[]
y_test=[]
for group in groups:
    X_inter=X[X['WEEK_NUM']==group]
    y_inter=y[X['WEEK_NUM']==group]
    X_inter_dev, X_inter_test, y_inter_dev, y_inter_test=train_test_split(X_inter, y_inter, test_size=0.5, random_state=0, stratify=y_inter)
    X_dev.append(X_inter_dev)
    X_test.append(X_inter_test)
    y_dev.append(y_inter_dev)
    y_test.append(y_inter_test)
X_dev=pd.concat(X_dev)
X_test=pd.concat(X_test)
y_dev=pd.concat(y_dev)
y_test=pd.concat(y_test)

In [24]:
# X, y for training
df=df[~mask]
target=target[~mask]

In [25]:
# filter category and number columns
cat_cols=[col for col in df.columns if df[col].dtype=='O']
num_cols=[col for col in df.columns if col not in cat_cols]

In [26]:
# check if devset and testset have the same distribution
# 1. missing numerical columns

# X_dev[]




In [27]:
# encode category columns
encoder=TargetEncoder(random_state=0)
df[cat_cols]=encoder.fit_transform(df[cat_cols], target)
X_dev[cat_cols]=encoder.transform(X_dev[cat_cols])
X_test[cat_cols]=encoder.transform(X_test[cat_cols])


# encoder=OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan)
# df[cat_cols]=encoder.fit_transform(df[cat_cols])
# X_dev[cat_cols]=encoder.transform(X_dev[cat_cols])
# X_test[cat_cols]=encoder.transform(X_test[cat_cols])

In [28]:
# with open('encoder.pkl', 'wb') as f:
#     pickle.dump(encoder,f)    

In [29]:
# for col in df.columns:
#     if df[col].dtype=='O':
#         df[col]=df[col].astype('category')

In [30]:
# optimize pandas dataframe 
def optimize_memory(df):
    for col in df.columns:
        if str(df[col].dtype)[:3]=='int':
            max_=df[col].max()
            min_=df[col].min()
            if (min_>np.iinfo(np.int8).min) and (max_<np.iinfo(np.int8).max):
                df[col]=df[col].astype('int8')
            elif (min_>np.iinfo(np.int16).min) and (max_<np.iinfo(np.int16).max):
                df[col]=df[col].astype('int16')
            elif (min_>np.iinfo(np.int32).min) and (max_<np.iinfo(np.int32).max):
                df[col]=df[col].astype('int32')
            elif (min_>np.iinfo(np.int64).min) and (max_<np.iinfo(np.int64).max):
                df[col]=df[col].astype('int64')
        if str(df[col].dtype)[:3]=='flo':
            max_=df[col].max()
            min_=df[col].min()
            if(min_>np.finfo(np.float16).min) and (max_<np.finfo(np.float16).max):
                df[col]=df[col].astype('float16')
            elif(min_>np.finfo(np.float32).min) and (max_<np.finfo(np.float32).max):
                df[col]=df[col].astype('float32')
            elif(min_>np.finfo(np.float64).min) and (max_<np.finfo(np.float64).max):
                df[col]=df[col].astype('float64')
    return df
df=optimize_memory(df)

In [31]:
# get train, dev and test weeks
train_weeks=df['WEEK_NUM']
dev_weeks=X_dev['WEEK_NUM']
test_weeks=X_test['WEEK_NUM']

In [32]:
# rescale number columns
scaler=StandardScaler()
df[num_cols]=scaler.fit_transform(df[num_cols])
X_dev[num_cols]=scaler.transform(X_dev[num_cols])
X_test[num_cols]=scaler.transform(X_test[num_cols])

In [33]:
# predict function
def predict(models, X_test):
    y_preds=[]
    for model in models:
        if isinstance(model, xgb.Booster):
            xgb_test=xgb.DMatrix(X_test, enable_categorical=True)
            y_pred=model.predict(xgb_test)
        elif isinstance(model, catboost.CatBoost):
            y_pred=model.predict_proba(X_test)[:,1]
        elif isinstance(model, tf.keras.Sequential):
            y_pred=model.predict(X_test).ravel()
        elif isinstance(model, lgb.Booster):
            y_pred=model.predict(X_test)
        else:
                y_pred=model.predict_proba(X_test)[:,1]
        y_preds.append(y_pred)
    return np.mean(y_preds, axis=0)

In [34]:
# base model
early_stopping=tf.keras.callbacks.EarlyStopping(patience=10)
model=tf.keras.Sequential([tf.keras.layers.Dense(1, activation='sigmoid')])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['auc'])
model.fit(df, target, 
      epochs=100, 
      batch_size=512, 
      validation_data=(X_dev, y_dev), 
      callbacks=[early_stopping],
      verbose=0)

E0000 00:00:1768529564.516452   15841 cuda_executor.cc:1309] INTERNAL: CUDA Runtime error: Failed call to cudaGetRuntimeVersion: Error loading CUDA libraries. GPU will not be used.: Error loading CUDA libraries. GPU will not be used.
W0000 00:00:1768529564.529735   15841 gpu_device.cc:2342] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2026-01-16 02:12:47.666534: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:84] Allocation of 4058858304 exceeds 10% of free system memory.


<keras.src.callbacks.history.History at 0x7545a6150c20>

In [35]:
# metric score
def stability_metric(model, X_test, y_test, test_week):
    weeks=[]
    ginis=[]
    for week in sorted(test_week.unique()):
        mask=test_week==week
        y_pred=predict(model, X_test[mask])
        y_true=y_test[mask]
        if len(np.unique(y_true)) < 2:  
            print(f"Skipping week {week}: Only one class in y_true.")
            continue
        gini=2*roc_auc_score(y_true, y_pred)-1
        weeks.append(week)
        ginis.append(gini)
    slope,intercept,_,_,_=linregress(weeks, ginis)            
    std=np.std([slope*week+intercept-gini for week, gini in zip(weeks, ginis)])
    final_score= np.mean(ginis)+88*min(0,slope)-0.5*std
    return final_score

In [36]:
# base model score
stability_metric([model], X_dev, y_dev, dev_weeks)

[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 457us/step
[1m81/81[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 454us/step
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 428us/step
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 417us/step
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 442us/step
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 411us/step
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 385us/step
[1m254/254[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 360us/step
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 365us/step
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 358us/step
[1m223/223[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 370us/step
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 375us/step
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 386us/step
[1m1

np.float64(0.631080412935162)

In [37]:
# ANOVA F-value for each features and target
f_statistic,_ =f_classif(df, target)
f_dict={key: value for key, value in zip(df.columns, f_statistic) if not np.isnan(value)}
f_dict=dict(sorted(f_dict.items(), key=lambda x: x[1], reverse=True))
f_dict

  f = msb / msw


{'pctinstlsallpaidlate1d_3546856L': np.float64(15625.231884411156),
 'pctinstlsallpaidlate4d_3546849L': np.float64(14797.797094136415),
 'pctinstlsallpaidlate6d_3546844L': np.float64(13932.100665627757),
 'pctinstlsallpaidlat10d_839L': np.float64(12599.077746978164),
 'lastrejectreason_759M': np.float64(11378.900209902546),
 'min_dpdmaxdateyear_596T': np.float64(8005.250106444751),
 'numinstlswithdpd10_728L': np.float64(7901.032902240045),
 'lastst_736L': np.float64(7746.018022076062),
 'pctinstlsallpaidearl3d_427L': np.float64(7600.590875670289),
 'last_rejectreason_755M': np.float64(7333.251035401568),
 'mean_dpdmaxdateyear_596T': np.float64(7279.1214376612),
 'lastrejectreasonclient_4145040M': np.float64(7248.640879116201),
 'last_status_219L': np.float64(7215.780127486743),
 'last_rejectreasonclient_4145042M': np.float64(6795.124341954999),
 'days120_123L': np.float64(6721.050105809204),
 'days90_310L': np.float64(6489.297136620554),
 'days180_256L': np.float64(6442.915543810666),


In [38]:
# choose features base on ANOVA F-value
for threshold in [0.2, 0.4, 0.6, 0.8, 1]:
    n_features= round(threshold*len(f_dict))
    chosen_cols=list(f_dict.keys())[:n_features]
    early_stopping=tf.keras.callbacks.EarlyStopping(patience=10)
    model=tf.keras.Sequential([tf.keras.layers.Dense(1, activation='sigmoid')])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['auc'])
    model.fit(df[chosen_cols], target, 
      epochs=100, 
      batch_size=512, 
      validation_data=(X_dev[chosen_cols], y_dev), 
      callbacks=[early_stopping],
      verbose=0)
    print(f"threshold: {threshold*100}, stability score: {stability_metric([model], X_dev[chosen_cols], y_dev, dev_weeks)}")

[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 471us/step
[1m81/81[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 439us/step
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 425us/step
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 420us/step
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 436us/step
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 419us/step
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 385us/step
[1m254/254[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 369us/step
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 367us/step
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 372us/step
[1m223/223[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 371us/step
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 370us/step
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 379us/step
[1m1

2026-01-16 02:16:49.630368: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:84] Allocation of 1583944704 exceeds 10% of free system memory.


[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 457us/step
[1m81/81[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 445us/step
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 423us/step
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 415us/step
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 447us/step
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 415us/step
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 398us/step
[1m254/254[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 360us/step
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 362us/step
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 353us/step
[1m223/223[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 377us/step
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 395us/step
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 379us/step
[1m1

2026-01-16 02:17:24.013315: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:84] Allocation of 2381416864 exceeds 10% of free system memory.


[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 466us/step
[1m81/81[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 470us/step
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 426us/step
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 415us/step
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 448us/step
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 423us/step
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 393us/step
[1m254/254[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 375us/step
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 378us/step
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 360us/step
[1m223/223[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 404us/step
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 407us/step
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 400us/step
[1m1

2026-01-16 02:17:55.083400: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:84] Allocation of 3173389216 exceeds 10% of free system memory.


[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 481us/step
[1m81/81[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 460us/step
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 438us/step
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 430us/step
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 447us/step
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 409us/step
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 395us/step
[1m254/254[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 367us/step
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 374us/step
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 371us/step
[1m223/223[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 375us/step
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 374us/step
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 381us/step
[1m1

2026-01-16 02:18:30.053430: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:84] Allocation of 3965361568 exceeds 10% of free system memory.


[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 460us/step
[1m81/81[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 452us/step
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 422us/step
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 426us/step
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 450us/step
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 436us/step
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 403us/step
[1m254/254[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 368us/step
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 363us/step
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 372us/step
[1m223/223[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 373us/step
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 384us/step
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 383us/step
[1m1

In [39]:
cpu_num=os.cpu_count()
cpu_num

16

In [40]:
# mutual information for each features and target
mi=mutual_info_classif(df, target, n_jobs=cpu_num)
mi_dict={key: value for key, value in zip(df.columns, mi) if not np.isnan(value)}
mi_dict=dict(sorted(mi_dict.items(), key=lambda x: x[1], reverse=True))
mi_dict



{'max_contaddr_matchlist_1032L': np.float64(0.28160545170477014),
 'paytype1st_925L': np.float64(0.2803939065778157),
 'first_contaddr_matchlist_1032L': np.float64(0.2795118683226505),
 'max_empladdr_district_926M': np.float64(0.2774325574495482),
 'max_empladdr_zipcode_114M': np.float64(0.2726370765292868),
 'max_type_25L': np.float64(0.2637049691804706),
 'max_education_927M': np.float64(0.25820273256061754),
 'last_education_927M': np.float64(0.25657416956928325),
 'max_safeguarantyflag_411L': np.float64(0.20510850584955553),
 'first_safeguarantyflag_411L': np.float64(0.202042697408634),
 'max_contaddr_smempladdr_334L': np.float64(0.19952114999892856),
 'first_contaddr_smempladdr_334L': np.float64(0.19375771736446978),
 'first_empls_economicalst_849M': np.float64(0.18442461230942697),
 'last_conts_role_79M': np.float64(0.18431779816356353),
 'max_conts_role_79M': np.float64(0.1838866757618286),
 'first_conts_role_79M': np.float64(0.18280579428767585),
 'education_88M': np.float64(0.

In [41]:
# choose features base on mutual information
for threshold in [0.2, 0.4, 0.6, 0.8, 1]:
    n_features= round(threshold*len(mi_dict))
    chosen_cols=list(mi_dict.keys())[:n_features]
    early_stopping=tf.keras.callbacks.EarlyStopping(patience=10)
    model=tf.keras.Sequential([tf.keras.layers.Dense(1, activation='sigmoid')])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['auc'])
    model.fit(df[chosen_cols], target, 
      epochs=100, 
      batch_size=512, 
      validation_data=(X_dev[chosen_cols], y_dev), 
      callbacks=[early_stopping],
      verbose=0)
    print(f"threshold: {threshold*100}, stability score: {stability_metric([model], X_dev[chosen_cols], y_dev, dev_weeks)}")

[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 472us/step
[1m81/81[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 456us/step
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 422us/step
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 419us/step
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 434us/step
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 410us/step
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 395us/step
[1m254/254[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 398us/step
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 407us/step
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 359us/step
[1m223/223[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 386us/step
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 395us/step
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 417us/step
[1m1

In [42]:
# DNN with 2 layers
early_stopping=tf.keras.callbacks.EarlyStopping(patience=10)
model=tf.keras.Sequential([tf.keras.layers.Dense(50, activation='sigmoid'),
                          tf.keras.layers.Dense(1, activation='sigmoid')])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['auc'])
model.fit(df, target, 
  epochs=100, 
  batch_size=512, 
  validation_data=(X_dev, y_dev), 
  callbacks=[early_stopping],
  verbose=0)
print(stability_metric([model], X_dev, y_dev, dev_weeks))

[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 563us/step
[1m81/81[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 518us/step
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 494us/step
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 481us/step
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 514us/step
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 468us/step
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 452us/step
[1m254/254[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 419us/step
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 425us/step
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 421us/step
[1m223/223[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 433us/step
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 486us/step
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 433us/step
[1m1

In [43]:
# lightgbm
lgb_train=lgb.Dataset(df, target)
lgb_valid=lgb.Dataset(X_dev, y_dev, reference=lgb_train)
params={'objective':'binary', 
                'metrics':'auc',
        'n_estimators':1000,
        'num_leaves':41,
        'learning_rate':0.03,
        'colsample_bytree':0.8,
        'colsample_bynode':0.8,
        'bagging_fraction':0.8,
            'bagging_freq':2,
                 'seed':0,
                'device_type':'cpu',
       'verbose':0}
lgb_model=lgb.train(params,
     lgb_train,
     valid_sets=lgb_valid,
     callbacks=[lgb.early_stopping(10)])
print(stability_metric([lgb_model], X_dev, y_dev, dev_weeks))

Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[104]	valid_0's auc: 0.85941
0.6902055853251603


In [44]:
# xgboost
xgb_train=xgb.DMatrix(df, target)
xgb_valid=xgb.DMatrix(X_dev, y_dev)
params={'objective':'binary:logistic', 
                'eval_metric':'auc',
                 'seed':0,
        'learning_rate':0.05,
        'max_depth':6,
        'colsample_bytree':0.8,
        'colsample_bynode':0.8,
        'subsample':0.8,
                'device':'cpu',
       'verbose':-1}
xgb_model=xgb.train(params,
     xgb_train,
     evals=[(xgb_valid,'xgb_valid')],
     num_boost_round=1000,
     early_stopping_rounds=10)
print(stability_metric([xgb_model], X_dev, y_dev, dev_weeks))

Parameters: { "verbose" } are not used.

  self.starting_round = model.num_boosted_rounds()


[0]	xgb_valid-auc:0.75780
[1]	xgb_valid-auc:0.78945
[2]	xgb_valid-auc:0.79142
[3]	xgb_valid-auc:0.80152
[4]	xgb_valid-auc:0.80279
[5]	xgb_valid-auc:0.80684
[6]	xgb_valid-auc:0.81474
[7]	xgb_valid-auc:0.81914
[8]	xgb_valid-auc:0.82030
[9]	xgb_valid-auc:0.82180
[10]	xgb_valid-auc:0.82424
[11]	xgb_valid-auc:0.82559
[12]	xgb_valid-auc:0.82803
[13]	xgb_valid-auc:0.83006
[14]	xgb_valid-auc:0.83177
[15]	xgb_valid-auc:0.83304
[16]	xgb_valid-auc:0.83348
[17]	xgb_valid-auc:0.83454
[18]	xgb_valid-auc:0.83536
[19]	xgb_valid-auc:0.83683
[20]	xgb_valid-auc:0.83781
[21]	xgb_valid-auc:0.83847
[22]	xgb_valid-auc:0.83987
[23]	xgb_valid-auc:0.84068
[24]	xgb_valid-auc:0.84177
[25]	xgb_valid-auc:0.84294
[26]	xgb_valid-auc:0.84374
[27]	xgb_valid-auc:0.84401
[28]	xgb_valid-auc:0.84476
[29]	xgb_valid-auc:0.84558
[30]	xgb_valid-auc:0.84603
[31]	xgb_valid-auc:0.84671
[32]	xgb_valid-auc:0.84739
[33]	xgb_valid-auc:0.84779
[34]	xgb_valid-auc:0.84831
[35]	xgb_valid-auc:0.84890
[36]	xgb_valid-auc:0.84922
[37]	xgb_va

In [45]:
# catboost
train_pool=catboost.Pool(df, target)
valid_pool=catboost.Pool(X_dev, y_dev)
cat_model=CatBoostClassifier(eval_metric='AUC', 
                             learning_rate=0.05,
                             max_depth=6,
                         random_seed=0,
                         task_type='CPU'
                        )
cat_model.fit(train_pool, eval_set=valid_pool, early_stopping_rounds=10, verbose=0)
print(stability_metric([cat_model], X_dev, y_dev, dev_weeks))

0.7262556996416473


In [46]:
# combine the three boost models with different ratio
best_score=0
for w0, w1 in itertools.product([0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9], repeat=2):
    weeks=[]
    ginis=[]
    for week in sorted(dev_weeks.unique()):
        mask=dev_weeks==week
        y_pred=w0*predict([lgb_model], X_dev[mask])+w1*predict([xgb_model], X_dev[mask])+(1-w0-w1)*predict([cat_model], X_dev[mask])
        y_true=y_dev[mask]
        gini=2*roc_auc_score(y_true, y_pred)-1
        weeks.append(week)
        ginis.append(gini)
    slope,intercept,_,_,_=linregress(weeks, ginis)            
    std=np.std([slope*week+intercept-gini for week, gini in zip(weeks, ginis)])
    final_score= np.mean(ginis)+88*min(0,slope)-0.5*std
    if final_score>best_score:
        best_score=final_score
        best_w0=w0
        best_w1=w1
        best_w2=21-w0-w1
        print(w0, w1, 1-w0-w1, best_score)

0 0 1 0.7262556996416473
0 0.1 0.9 0.7275257209641786
0 0.2 0.8 0.7283256778500252
0 0.3 0.7 0.7289146427436755
0 0.4 0.6 0.7291942713617094
0 0.5 0.5 0.729215861388172


In [47]:
# best combination
weeks=[]
ginis=[]
for week in sorted(dev_weeks.unique()):
    mask=dev_weeks==week
    y_pred=best_w0*predict([lgb_model], X_dev[mask])+best_w1*predict([xgb_model], X_dev[mask])+best_w2*predict([cat_model], X_dev[mask])
    y_true=y_dev[mask]
    gini=2*roc_auc_score(y_true, y_pred)-1
    weeks.append(week)
    ginis.append(gini)
slope,intercept,_,_,_=linregress(weeks, ginis)            
std=np.std([slope*week+intercept-gini for week, gini in zip(weeks, ginis)])
final_score= np.mean(ginis)+88*min(0,slope)-0.5*std
print(final_score)

0.7266271394392513


In [48]:
# lightgbm on testset
stability_metric([lgb_model], X_test, y_test, test_weeks)

np.float64(0.6833542459581605)

In [49]:
# xgboost on testset
stability_metric([xgb_model], X_test, y_test, test_weeks)

np.float64(0.7080029402137891)

In [50]:
# catboost on testset
stability_metric([cat_model], X_test, y_test, test_weeks)

np.float64(0.7098028376044893)

In [51]:
# combine model on testset
weeks=[]
ginis=[]
for week in sorted(test_weeks.unique()):
    mask=test_weeks==week
    y_pred=0*predict([lgb_model], X_test[mask])+0.8*predict([xgb_model], X_test[mask])+0.2*predict([cat_model], X_test[mask])
    y_true=y_test[mask]
    gini=2*roc_auc_score(y_true, y_pred)-1
    weeks.append(week)
    ginis.append(gini)
slope,intercept,_,_,_=linregress(weeks, ginis)            
std=np.std([slope*week+intercept-gini for week, gini in zip(weeks, ginis)])
final_score= np.mean(ginis)+88*min(0,slope)-0.5*std

In [52]:
final_score

np.float64(0.711086270122179)