In [1]:
import polars as pl
import pandas as pd
from pathlib import Path
import lightgbm as lgb
import xgboost as xgb
import catboost
import numpy as np

In [2]:
train_dir='parquet_files/train/'

In [3]:
import os
os.listdir(train_dir)

['train_applprev_1_0.parquet',
 'train_applprev_1_1.parquet',
 'train_applprev_2.parquet',
 'train_base.parquet',
 'train_credit_bureau_a_1_0.parquet',
 'train_credit_bureau_a_1_1.parquet',
 'train_credit_bureau_a_1_2.parquet',
 'train_credit_bureau_a_1_3.parquet',
 'train_credit_bureau_a_2_0.parquet',
 'train_credit_bureau_a_2_1.parquet',
 'train_credit_bureau_a_2_10.parquet',
 'train_credit_bureau_a_2_2.parquet',
 'train_credit_bureau_a_2_3.parquet',
 'train_credit_bureau_a_2_4.parquet',
 'train_credit_bureau_a_2_5.parquet',
 'train_credit_bureau_a_2_6.parquet',
 'train_credit_bureau_a_2_7.parquet',
 'train_credit_bureau_a_2_8.parquet',
 'train_credit_bureau_a_2_9.parquet',
 'train_credit_bureau_b_1.parquet',
 'train_credit_bureau_b_2.parquet',
 'train_debitcard_1.parquet',
 'train_deposit_1.parquet',
 'train_other_1.parquet',
 'train_person_1.parquet',
 'train_person_2.parquet',
 'train_static_0_0.parquet',
 'train_static_0_1.parquet',
 'train_static_cb_0.parquet',
 'train_tax_regis

In [4]:
files=['train_base.parquet',
       'train_person_1.parquet',
       'train_person_2.parquet',                                                                                 
    'train_applprev_1*.parquet',
             'train_applprev_2.parquet',
             'train_credit_bureau_a_1*.parquet',
             'train_credit_bureau_a_2*.parquet',
             'train_credit_bureau_b_1.parquet',
             'train_credit_bureau_b_2.parquet',
             'train_debitcard_1.parquet',
             'train_deposit_1.parquet',
             'train_other_1.parquet',
             'train_static_0*.parquet',
             'train_static_cb_0.parquet',
             'train_tax_registry_a_1.parquet',
             'train_tax_registry_b_1.parquet',
             'train_tax_registry_c_1.parquet']

In [5]:
def cast_data(df):
    for col in df.columns:
        if (col[-1]=='D') | (col=='date_decision'):
            df=df.with_columns(pl.col(col).cast(pl.Date))
        elif df[col].dtype==pl.Boolean:
            df=df.with_columns(pl.col(col).cast(pl.String))
    return df

In [6]:
def aggregate_data(df):
    base_cols=[col for col in df.columns if col in ['case_id', 'date_decision', 'WEEK_NUM', 'MONTH', 'target']]
    base_agg=[pl.col(col).max() for col in base_cols if col !='case_id']
    
    cat_cols=[col for col in df.columns if (df[col].dtype ==pl.String) | (df[col].dtype==pl.Null) ]
    cat_max=[pl.col(col).max().alias('max_'+col) for col in cat_cols]
    cat_first=[pl.col(col).first().alias('first_'+col) for col in cat_cols]
    cat_last=[pl.col(col).last().alias('last_'+col) for col in cat_cols]
    cat_n_unique=[pl.col(col).n_unique().alias('n_unique_'+col) for col in cat_cols]
    
    d_cols=[col for col in df.columns if (df[col].dtype ==pl.Date)&(col!='date_decision')]
    d_max=[pl.col(col).max().alias('max_'+col) for col in d_cols]
    d_min=[pl.col(col).min().alias('min_'+col) for col in d_cols]
    d_median=[pl.col(col).median().alias('median_'+col) for col in d_cols]

    other_cols=[col for col in df.columns if col not in base_cols+cat_cols+d_cols]
    max_other=[pl.col(col).max().alias('max_'+col) for col in other_cols]
    mean_other=[pl.col(col).mean().alias('mean_'+col) for col in other_cols]
    min_other=[pl.col(col).min().alias('min_'+col) for col in other_cols]
    total=base_agg+cat_max+cat_first+cat_last+cat_n_unique+d_max+d_min+d_median+max_other+mean_other+min_other
    return total

In [7]:
def load_data(files, dir_):
    PATH=Path(dir_)
    for file in files:
        paths=PATH.glob(file)
        chunks=[]
        for path in paths:
            print(path)
            df=pl.read_parquet(path)
            df=cast_data(df)
            if ('1' in file)|('2' in file): 
                df=df.group_by('case_id').agg(aggregate_data(df))
            chunks.append(df)        
        df=pl.concat(chunks, how='vertical_relaxed')
        if file in ['train_base.parquet','test_base.parquet']:
            base=df
            name=file
        else:
            for col in df.columns:
                if 'num_group' in col:
                    if name=='train_base.parquet':
                        df=df.rename({col:col+file[5:]})
                    if name=='test_base.parquet':
                        df=df.rename({col:col+file[4:]})        
            base=base.join(df, on='case_id', how='left')
    return base

In [8]:
df=load_data(files, train_dir)

parquet_files/train/train_base.parquet
parquet_files/train/train_person_1.parquet
parquet_files/train/train_person_2.parquet
parquet_files/train/train_applprev_1_0.parquet
parquet_files/train/train_applprev_1_1.parquet
parquet_files/train/train_applprev_2.parquet
parquet_files/train/train_credit_bureau_a_1_0.parquet
parquet_files/train/train_credit_bureau_a_1_1.parquet
parquet_files/train/train_credit_bureau_a_1_2.parquet
parquet_files/train/train_credit_bureau_a_1_3.parquet
parquet_files/train/train_credit_bureau_a_2_0.parquet
parquet_files/train/train_credit_bureau_a_2_1.parquet
parquet_files/train/train_credit_bureau_a_2_10.parquet
parquet_files/train/train_credit_bureau_a_2_2.parquet
parquet_files/train/train_credit_bureau_a_2_3.parquet
parquet_files/train/train_credit_bureau_a_2_4.parquet
parquet_files/train/train_credit_bureau_a_2_5.parquet
parquet_files/train/train_credit_bureau_a_2_6.parquet
parquet_files/train/train_credit_bureau_a_2_7.parquet
parquet_files/train/train_credit_

In [9]:
def filter_cols(df):
    hashes=[df[col].hash().to_numpy().tobytes() for col in df.columns]
    seen_hashes=[]
    duplicated_cols=[]
    for i in range(len(hashes)):
        if hashes[i] not in seen_hashes:
            seen_hashes.append(hashes[i])
        else:
            duplicated_cols.append(df.columns[i])
    print(f'number of duplicated columns is {len(duplicated_cols)}')
    df=df.drop(duplicated_cols)

    constant_cols=[]
    for col in df.columns:
        if df[col].n_unique()<=1:
            constant_cols.append(col)
    print(f'number of constant columns is {len(constant_cols)}')        
    df=df.drop(constant_cols)

    high_cardinality_cols=[]
    for col in df.columns:
        if (df[col].n_unique()>50)&(df[col].dtype==pl.String):
            high_cardinality_cols.append(col)
    print(f'number of high_cardinality columns is {len(high_cardinality_cols)}')        
    df=df.drop(high_cardinality_cols)

    high_null_cols=[]
    for col in df.columns:
        if df[col].null_count()/len(df)>0.95:
            high_null_cols.append(col)
    print(f'number of high null columns is {len(high_null_cols)}')        
    df=df.drop(high_null_cols)
    return df

In [10]:
df=filter_cols(df)

number of duplicated columns is 57
number of constant columns is 3
number of high_cardinality columns is 58
number of high null columns is 237


In [11]:
def optimize_memory(df):
    for col in df.columns:
        if str(df[col].dtype)[:3]=='Int':
            max_=df[col].max()
            min_=df[col].min()
            if (min_>np.iinfo(np.int8).min) and (max_<np.iinfo(np.int8).max):
                df=df.with_columns(pl.col(col).cast(pl.Int8))
            elif (min_>np.iinfo(np.int16).min) and (max_<np.iinfo(np.int16).max):
                df=df.with_columns(pl.col(col).cast(pl.Int16))
            elif (min_>np.iinfo(np.int32).min) and (max_<np.iinfo(np.int32).max):
                df=df.with_columns(pl.col(col).cast(pl.Int32))
            elif (min_>np.iinfo(np.int64).min) and (max_<np.iinfo(np.int64).max):
                df=df.with_columns(pl.col(col).cast(pl.Int64))
        if str(df[col].dtype)[:3]=='UIn':
            max_=df[col].max()
            min_=df[col].min()
            if (min_>np.iinfo(np.uint8).min) and (max_<np.iinfo(np.uint8).max):
                df=df.with_columns(pl.col(col).cast(pl.UInt8))
            elif (min_>np.iinfo(np.uint16).min) and (max_<np.iinfo(np.uint16).max):
                df=df.with_columns(pl.col(col).cast(pl.UInt16))
            elif (min_>np.iinfo(np.uint32).min) and (max_<np.iinfo(np.uint32).max):
                df=df.with_columns(pl.col(col).cast(pl.UInt32))
        if str(df[col].dtype)[:3]=='Flo':
            max_=df[col].max()
            min_=df[col].min()
            if(min_>np.finfo(np.float32).min) and (max_<np.finfo(np.float32).max):
                df=df.with_columns(pl.col(col).cast(pl.Float32))
            elif(min_>np.finfo(np.float64).min) and (max_<np.finfo(np.float64).max):
                df=df.with_columns(pl.col(col).cast(pl.Float64))
                print(col)
        if df[col].dtype==pl.Datetime:
            df=df.with_columns(pl.col(col).cast(pl.Date))
            
    return df

In [12]:
df=optimize_memory(df)

In [13]:
def impute_na(df):
    for col in df.columns:
        if df[col].dtype==pl.String:
            df=df.with_columns(pl.col(col).fill_null('None'))
        elif df[col].dtype==pl.Date:
            df=df.with_columns(pl.col(col).fill_null(df[col].median()))
        else:
            df=df.with_columns(pl.col(col).fill_null(df[col].mean()))
    return df

In [None]:
df=impute_na(df)

In [14]:
def handle_dates(df):
    for col in df.columns:
        if col[-1]=='D':
            df=df.with_columns((pl.col(col)-pl.col('date_decision')).dt.total_days())
    return df

In [15]:
df=handle_dates(df)

In [16]:
def feature_eng(df):
    df=df.with_columns(
        month_decision=pl.col('date_decision').dt.month(),
        weekday_decision=pl.col('date_decision').dt.weekday()        
    )
    return df

In [17]:
df=feature_eng(df)

In [18]:
df=df.drop('case_id')

In [19]:
df=df.drop('date_decision')

In [20]:
target=df['target']
df=df.drop('target')

In [21]:
df=df.to_pandas()

In [22]:
target=target.to_pandas()

In [23]:
mask=df['WEEK_NUM']>=78

In [24]:
sum(mask)/len(df)

0.09937189640908677

In [25]:
X=df[mask]
y=target[mask]

In [26]:
groups=X['WEEK_NUM'].unique()

In [27]:
from sklearn.model_selection import train_test_split

In [28]:
X_dev=[]
y_dev=[]
X_test=[]
y_test=[]
for group in groups:
    X_inter=X[X['WEEK_NUM']==group]
    y_inter=y[X['WEEK_NUM']==group]
    X_inter_dev, X_inter_test, y_inter_dev, y_inter_test=train_test_split(X_inter, y_inter, test_size=0.5, random_state=0, stratify=y_inter)
    X_dev.append(X_inter_dev)
    X_test.append(X_inter_test)
    y_dev.append(y_inter_dev)
    y_test.append(y_inter_test)

In [29]:
X_dev=pd.concat(X_dev)
X_test=pd.concat(X_test)
y_dev=pd.concat(y_dev)
y_test=pd.concat(y_test)

In [30]:
df=df[~mask]
target=target[~mask]

In [31]:
cat_cols=[col for col in df.columns if df[col].dtype=='O' ]

In [None]:
from sklearn.preprocessing import TargetEncoder

In [None]:
encoder=TargetEncoder(random_state=0)

In [None]:
df[cat_cols]=encoder.fit_transform(df[cat_cols], target)
X_dev[cat_cols]=encoder.transform(X_dev[cat_cols])
X_test[cat_cols]=encoder.transform(X_test[cat_cols])

In [32]:
from sklearn.preprocessing import OrdinalEncoder

In [33]:
encoder=OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan)

In [34]:
df[cat_cols]=encoder.fit_transform(df[cat_cols])
X_dev[cat_cols]=encoder.transform(X_dev[cat_cols])
X_test[cat_cols]=encoder.transform(X_test[cat_cols])

In [None]:
import pickle

In [None]:
with open('encoder.pkl', 'wb') as f:
    pickle.dump(encoder,f)    

In [None]:
# for col in df.columns:
#     if df[col].dtype=='O':
#         df[col]=df[col].astype('category')

In [35]:
import numpy as np

In [36]:
def optimize_memory(df):
    for col in df.columns:
        if str(df[col].dtype)[:3]=='int':
            max_=df[col].max()
            min_=df[col].min()
            if (min_>np.iinfo(np.int8).min) and (max_<np.iinfo(np.int8).max):
                df[col]=df[col].astype('int8')
            elif (min_>np.iinfo(np.int16).min) and (max_<np.iinfo(np.int16).max):
                df[col]=df[col].astype('int16')
            elif (min_>np.iinfo(np.int32).min) and (max_<np.iinfo(np.int32).max):
                df[col]=df[col].astype('int32')
            elif (min_>np.iinfo(np.int64).min) and (max_<np.iinfo(np.int64).max):
                df[col]=df[col].astype('int64')
        if str(df[col].dtype)[:3]=='flo':
            max_=df[col].max()
            min_=df[col].min()
            if(min_>np.finfo(np.float16).min) and (max_<np.finfo(np.float16).max):
                df[col]=df[col].astype('float16')
            elif(min_>np.finfo(np.float32).min) and (max_<np.finfo(np.float32).max):
                df[col]=df[col].astype('float32')
            elif(min_>np.finfo(np.float64).min) and (max_<np.finfo(np.float64).max):
                df[col]=df[col].astype('float64')
    return df

In [37]:
df=optimize_memory(df)

In [38]:
train_weeks=df['WEEK_NUM']
dev_weeks=X_dev['WEEK_NUM']
test_weeks=X_test['WEEK_NUM']

In [39]:
num_cols=[col for col in df.columns if col not in cat_cols]

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
scaler=StandardScaler()

In [None]:
df[num_cols]=scaler.fit_transform(df[num_cols])
X_dev[num_cols]=scaler.transform(X_dev[num_cols])
X_test[num_cols]=scaler.transform(X_test[num_cols])

In [40]:
import tensorflow as tf

2025-05-08 03:48:16.223625: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-08 03:48:16.231926: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746676096.238936    1214 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746676096.240987    1214 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1746676096.246592    1214 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [41]:
tf.random.set_seed(0)
import random
random.seed(0)
np.random.seed(0)

In [None]:
early_stopping=tf.keras.callbacks.EarlyStopping(patience=10)
model=tf.keras.Sequential([tf.keras.layers.Dense(1, activation='sigmoid')])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['auc'])
model.fit(df, target, 
      epochs=100, 
      batch_size=512, 
      validation_data=(X_dev, y_dev), 
      callbacks=[early_stopping])

In [42]:
def predict(models, X_test):
    y_preds=[]
    for model in models:
        if isinstance(model, xgb.Booster):
            xgb_test=xgb.DMatrix(X_test, enable_categorical=True)
            y_pred=model.predict(xgb_test)
        elif isinstance(model, catboost.CatBoost):
            y_pred=model.predict_proba(X_test)[:,1]
        elif isinstance(model, tf.keras.Sequential):
            y_pred=model.predict(X_test).ravel()
        elif isinstance(model, lgb.Booster):
            y_pred=model.predict(X_test)
        else:
                y_pred=model.predict_proba(X_test)[:,1]
        y_preds.append(y_pred)
    return np.mean(y_preds, axis=0)

In [43]:
from scipy.stats import linregress
from sklearn.metrics import roc_auc_score

In [44]:
def stability_metric(model, X_test, y_test, test_week):
    weeks=[]
    ginis=[]
    for week in sorted(test_week.unique()):
        mask=test_week==week
        y_pred=predict(model, X_test[mask])
        y_true=y_test[mask]
        if len(np.unique(y_true)) < 2:  
            print(f"Skipping week {week}: Only one class in y_true.")
            continue
        gini=2*roc_auc_score(y_true, y_pred)-1
        weeks.append(week)
        ginis.append(gini)
    slope,intercept,_,_,_=linregress(weeks, ginis)            
    std=np.std([slope*week+intercept-gini for week, gini in zip(weeks, ginis)])
    final_score= np.mean(ginis)+88*min(0,slope)-0.5*std
    return final_score

In [None]:
stability_metric([model], X_dev, y_dev, dev_weeks)

In [None]:
from sklearn.feature_selection import f_classif

In [None]:
f_statistic,_ =f_classif(df, target)
f_dict={key: value for key, value in zip(df.columns, f_statistic) if not np.isnan(value)}
f_dict=dict(sorted(f_dict.items(), key=lambda x: x[1], reverse=True))

In [None]:
for threshold in [0.2, 0.4, 0.6, 0.8, 1]:
    n_features= round(threshold*len(f_dict))
    chosen_cols=list(f_dict.keys())[:n_features]
    early_stopping=tf.keras.callbacks.EarlyStopping(patience=10)
    model=tf.keras.Sequential([tf.keras.layers.Dense(1, activation='sigmoid')])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['auc'])
    model.fit(df[chosen_cols], target, 
      epochs=100, 
      batch_size=512, 
      validation_data=(X_dev[chosen_cols], y_dev), 
      callbacks=[early_stopping])
    print(threshold, stability_metric([model], X_dev[chosen_cols], y_dev, dev_weeks))

In [None]:
from sklearn.feature_selection import mutual_info_classif

In [None]:
os.cpu_count()

In [None]:
mi=mutual_info_classif(df, target, n_jobs=16)
mi_dict={key: value for key, value in zip(df.columns, mi) if not np.isnan(value)}
mi_dict=dict(sorted(mi_dict.items(), key=lambda x: x[1], reverse=True))

In [None]:
for threshold in [0.2, 0.4, 0.6, 0.8, 1]:
    n_features= round(threshold*len(mi_dict))
    chosen_cols=list(mi_dict.keys())[:n_features]
    early_stopping=tf.keras.callbacks.EarlyStopping(patience=10)
    model=tf.keras.Sequential([tf.keras.layers.Dense(1, activation='sigmoid')])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['auc'])
    model.fit(df[chosen_cols], target, 
      epochs=100, 
      batch_size=512, 
      validation_data=(X_dev[chosen_cols], y_dev), 
      callbacks=[early_stopping])
    print(threshold, stability_metric([model], X_dev[chosen_cols], y_dev, dev_weeks))

In [None]:
len(chosen_cols)==len(df.columns)

In [None]:
early_stopping=tf.keras.callbacks.EarlyStopping(patience=10)
model=tf.keras.Sequential([tf.keras.layers.Dense(50, activation='sigmoid'),
                          tf.keras.layers.Dense(1, activation='sigmoid')])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['auc'])
model.fit(df, target, 
  epochs=100, 
  batch_size=512, 
  validation_data=(X_dev, y_dev), 
  callbacks=[early_stopping])
print(stability_metric([model], X_dev, y_dev, dev_weeks))

In [45]:
import lightgbm as lgb

In [46]:
lgb_train=lgb.Dataset(df, target)
lgb_valid=lgb.Dataset(X_dev, y_dev, reference=lgb_train)
params={'objective':'binary', 
                'metrics':'auc',
        'n_estimators':1000,
        'num_leaves':41,
        'learning_rate':0.03,
        'colsample_bytree':0.8,
        'colsample_bynode':0.8,
        'bagging_fraction':0.8,
            'bagging_freq':2,
                 'seed':0,
                'device_type':'cpu',
       'verbose':2}
lgb_model=lgb.train(params,
     lgb_train,
     valid_sets=lgb_valid,
     callbacks=[lgb.early_stopping(10)])

[LightGBM] [Info] Number of positive: 44779, number of negative: 1330173
[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.850916
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.386258
[LightGBM] [Debug] init for col-wise cost 0.140942 seconds, init for row-wise cost 1.872083 seconds
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.977118 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 87608
[LightGBM] [Info] Number of data points in the train set: 1374952, number of used features: 737
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.032568 -> initscore=-3.391325
[LightGBM] [Info] Start training from score -3.391325
[LightGBM] [Debug] Re-bagging, using 1099036 data to train
[LightGBM] [Debug] Trained a tree with leaves = 41 and depth = 9
Training until validation scores don't improve for 10 rounds
[LightGBM] [Debug] Trained a tree with leaves = 41 and dep

In [47]:
stability_metric([lgb_model], X_dev, y_dev, dev_weeks)

np.float64(0.7301301558075108)

In [48]:
import xgboost as xgb

In [49]:
xgb_train=xgb.DMatrix(df, target)
xgb_valid=xgb.DMatrix(X_dev, y_dev)
params={'objective':'binary:logistic', 
                'eval_metric':'auc',
                 'seed':0,
        'learning_rate':0.05,
        'max_depth':6,
        'colsample_bytree':0.8,
        'colsample_bynode':0.8,
        'subsample':0.8,
                'device':'cpu'}
xgb_model=xgb.train(params,
     xgb_train,
     evals=[(xgb_valid,'xgb_valid')],
     num_boost_round=1000,
     early_stopping_rounds=10)

[0]	xgb_valid-auc:0.76278
[1]	xgb_valid-auc:0.78701
[2]	xgb_valid-auc:0.79545
[3]	xgb_valid-auc:0.80625
[4]	xgb_valid-auc:0.80927
[5]	xgb_valid-auc:0.81343
[6]	xgb_valid-auc:0.81681
[7]	xgb_valid-auc:0.81916
[8]	xgb_valid-auc:0.82122
[9]	xgb_valid-auc:0.82235
[10]	xgb_valid-auc:0.82285
[11]	xgb_valid-auc:0.82377
[12]	xgb_valid-auc:0.82374
[13]	xgb_valid-auc:0.82508
[14]	xgb_valid-auc:0.82638
[15]	xgb_valid-auc:0.82824
[16]	xgb_valid-auc:0.82934
[17]	xgb_valid-auc:0.82958
[18]	xgb_valid-auc:0.83152
[19]	xgb_valid-auc:0.83217
[20]	xgb_valid-auc:0.83308
[21]	xgb_valid-auc:0.83453
[22]	xgb_valid-auc:0.83618
[23]	xgb_valid-auc:0.83768
[24]	xgb_valid-auc:0.83837
[25]	xgb_valid-auc:0.83907
[26]	xgb_valid-auc:0.83972
[27]	xgb_valid-auc:0.84032
[28]	xgb_valid-auc:0.84120
[29]	xgb_valid-auc:0.84164
[30]	xgb_valid-auc:0.84239
[31]	xgb_valid-auc:0.84316
[32]	xgb_valid-auc:0.84399
[33]	xgb_valid-auc:0.84449
[34]	xgb_valid-auc:0.84510
[35]	xgb_valid-auc:0.84565
[36]	xgb_valid-auc:0.84626
[37]	xgb_va

In [50]:
stability_metric([xgb_model], X_dev, y_dev, dev_weeks)

np.float64(0.7377228526025422)

In [51]:
from catboost import CatBoostClassifier

In [52]:
train_pool=catboost.Pool(df, target)
valid_pool=catboost.Pool(X_dev, y_dev)
cat_model=CatBoostClassifier(eval_metric='AUC', 
                             learning_rate=0.05,
                             max_depth=6,
                         random_seed=0,
                         task_type='CPU'
                        )
cat_model.fit(train_pool, eval_set=valid_pool, early_stopping_rounds=10)

0:	test: 0.6104762	best: 0.6104762 (0)	total: 204ms	remaining: 3m 23s
1:	test: 0.6582930	best: 0.6582930 (1)	total: 377ms	remaining: 3m 8s
2:	test: 0.6889485	best: 0.6889485 (2)	total: 525ms	remaining: 2m 54s
3:	test: 0.7295184	best: 0.7295184 (3)	total: 692ms	remaining: 2m 52s
4:	test: 0.7474547	best: 0.7474547 (4)	total: 897ms	remaining: 2m 58s
5:	test: 0.7559199	best: 0.7559199 (5)	total: 1.08s	remaining: 2m 59s
6:	test: 0.7536881	best: 0.7559199 (5)	total: 1.28s	remaining: 3m 1s
7:	test: 0.7587315	best: 0.7587315 (7)	total: 1.5s	remaining: 3m 6s
8:	test: 0.7673417	best: 0.7673417 (8)	total: 1.68s	remaining: 3m 5s
9:	test: 0.7666463	best: 0.7673417 (8)	total: 1.83s	remaining: 3m
10:	test: 0.7657717	best: 0.7673417 (8)	total: 2.02s	remaining: 3m 1s
11:	test: 0.7690631	best: 0.7690631 (11)	total: 2.19s	remaining: 3m
12:	test: 0.7753941	best: 0.7753941 (12)	total: 2.37s	remaining: 2m 59s
13:	test: 0.7805817	best: 0.7805817 (13)	total: 2.55s	remaining: 2m 59s
14:	test: 0.7857532	best: 0

<catboost.core.CatBoostClassifier at 0x74b8e877c310>

In [53]:
stability_metric([cat_model], X_dev, y_dev, dev_weeks)

np.float64(0.7303913883610675)

In [54]:
import itertools

In [55]:
best_score=0
for w0, w1 in itertools.product([0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9], repeat=2):
    weeks=[]
    ginis=[]
    for week in sorted(dev_weeks.unique()):
        mask=dev_weeks==week
        y_pred=w0*predict([lgb_model], X_dev[mask])+w1*predict([xgb_model], X_dev[mask])+(1-w0-w1)*predict([cat_model], X_dev[mask])
        y_true=y_dev[mask]
        gini=2*roc_auc_score(y_true, y_pred)-1
        weeks.append(week)
        ginis.append(gini)
    slope,intercept,_,_,_=linregress(weeks, ginis)            
    std=np.std([slope*week+intercept-gini for week, gini in zip(weeks, ginis)])
    final_score= np.mean(ginis)+88*min(0,slope)-0.5*std
    if final_score>best_score:
        best_score=final_score
        print(w0, w1, 1-w0-w1, best_score)

0 0 1 0.7303913883610675
0 0.1 0.9 0.7327129523206786
0 0.2 0.8 0.7344863422700869
0 0.3 0.7 0.7360531322309033
0 0.4 0.6 0.7372951191424512
0 0.5 0.5 0.7380950110556241
0 0.6 0.4 0.7387161372094925
0 0.7 0.30000000000000004 0.7389370787076727
0 0.8 0.19999999999999996 0.738982646668712


In [56]:
weeks=[]
ginis=[]
for week in sorted(dev_weeks.unique()):
    mask=dev_weeks==week
    y_pred=0*predict([lgb_model], X_dev[mask])+0.8*predict([xgb_model], X_dev[mask])+0.2*predict([cat_model], X_dev[mask])
    y_true=y_dev[mask]
    gini=2*roc_auc_score(y_true, y_pred)-1
    weeks.append(week)
    ginis.append(gini)
slope,intercept,_,_,_=linregress(weeks, ginis)            
std=np.std([slope*week+intercept-gini for week, gini in zip(weeks, ginis)])
final_score= np.mean(ginis)+88*min(0,slope)-0.5*std

In [59]:
y_pred=0*predict([lgb_model], X_dev)+0.8*predict([xgb_model], X_dev)+0.2*predict([cat_model], X_dev)

In [60]:
fi_dict={col:value for col,value in zip(X_dev.columns,lgb_model.feature_importance())}

In [61]:
fi_dict=dict(sorted(fi_dict.items(), key= lambda x: x[1], reverse=True))

In [62]:
fi_cols=[col for col, value in fi_dict.items() if value!=0]

In [63]:
mask=abs(y_dev-y_pred)>0.8

In [66]:
sum(mask)/len(X_dev)*100

1.7310481212920237

In [70]:
df1=X_dev[mask]

In [71]:
df2=X_dev[~mask]

In [72]:
from scipy.stats import ks_2samp

In [73]:
ks_dict={}
for col in [col for col in num_cols if col not in []]:
    stat, pval = ks_2samp(df1[col][~df1[col].isnull()],df2[col][~df2[col].isnull()])
    ks_dict[col]= stat, pval

  stat, pval = ks_2samp(df1[col][~df1[col].isnull()],df2[col][~df2[col].isnull()])


In [74]:
ks_dict=dict(sorted(ks_dict.items(), key= lambda x: x[1], reverse=True))

In [75]:
ks_cols=[col for col, value in ks_dict.items() if value[1] <0.05]

In [76]:
len(ks_cols)

321

In [77]:
len([col for col in ks_cols if col in fi_cols])

315

In [None]:
[col for col in ks_cols if col in fi_cols]

In [79]:
cols=[]
count=0
for col in [col for col in ks_cols if col in fi_cols]:
    if fi_cols.index(col)<10:
        count+=1
        cols.append(col)
print(count)

8


In [80]:
from scipy.stats import chi2_contingency

In [81]:
chi2_dict={}
for col in cat_cols:
    contingency_table=pd.concat([df1[col].value_counts(),df2[col].value_counts()], axis=1).fillna(0)
    stat, pval, _, _ = chi2_contingency(contingency_table)
    chi2_dict[col]=stat, pval

In [82]:
chi2_dict=dict(sorted(chi2_dict.items(), key=lambda x: x[1], reverse=True))

In [83]:
chi2_cols=[col for col, value in chi2_dict.items() if value[1] <0.05]

In [84]:
len(chi2_cols)

125

In [None]:
len([col for col in chi2_cols if col in fi_cols])

In [None]:
[col for col in chi2_cols if col in fi_cols]

In [86]:
count=0
for col in [col for col in chi2_cols if col in fi_cols]:
    if fi_cols.index(col)<10:
        count+=1
print(count)

0


In [87]:
stability_metric([lgb_model], X_test, y_test, test_weeks)

np.float64(0.7229123224740361)

In [88]:
stability_metric([xgb_model], X_test, y_test, test_weeks)

np.float64(0.7227238532362731)

In [89]:
stability_metric([cat_model], X_test, y_test, test_weeks)

np.float64(0.7137661608965555)

In [90]:
weeks=[]
ginis=[]
for week in sorted(test_weeks.unique()):
    mask=test_weeks==week
    y_pred=0*predict([lgb_model], X_test[mask])+0.8*predict([xgb_model], X_test[mask])+0.2*predict([cat_model], X_test[mask])
    y_true=y_test[mask]
    gini=2*roc_auc_score(y_true, y_pred)-1
    weeks.append(week)
    ginis.append(gini)
slope,intercept,_,_,_=linregress(weeks, ginis)            
std=np.std([slope*week+intercept-gini for week, gini in zip(weeks, ginis)])
final_score= np.mean(ginis)+88*min(0,slope)-0.5*std

In [91]:
final_score

np.float64(0.7238556591144566)