In [1]:
from IPython.display import clear_output
from google.colab import files
files.upload()
!pip install -qU catboost category_encoders optuna
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!ls ~/.kaggle
!chmod 600 /root/.kaggle/kaggle.json
!kaggle competitions download -c cat-in-the-dat
!unzip cat-in-the-dat.zip
!rm cat-in-the-dat.zip
clear_output()

In [2]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
from sklearn.metrics import classification_report, accuracy_score, roc_curve
from sklearn.model_selection import StratifiedKFold

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer

from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as Pipe

from catboost import Pool, CatBoostClassifier
from lightgbm import LGBMClassifier
import optuna
from optuna.samplers import TPESampler

import category_encoders as ce

np.random.seed(123)

In [3]:
# Competition data; the `id` column becomes the row index of both frames.
TARGET_NAME = 'target'

train = pd.read_csv('train.csv').set_index('id')
test = pd.read_csv('test.csv').set_index('id')

In [21]:
# First five rows, shown with one feature per line.
train.head().T

id,0,1,2,3,4
bin_0,0,0,0,0,0
bin_1,0,1,0,1,0
bin_2,0,0,0,0,0
bin_3,T,T,F,F,F
bin_4,Y,Y,Y,Y,N
nom_0,Green,Green,Blue,Red,Red
nom_1,Triangle,Trapezoid,Trapezoid,Trapezoid,Trapezoid
nom_2,Snake,Hamster,Lion,Snake,Lion
nom_3,Finland,Russia,Russia,Canada,Canada
nom_4,Bassoon,Piano,Theremin,Oboe,Oboe


In [10]:
# Share of rows per target class (normalize=True gives fractions, not counts).
train[TARGET_NAME].value_counts(normalize=True)

0    0.69412
1    0.30588
Name: target, dtype: float64

In [15]:
# Summary restricted to object-typed columns.
train.describe(include=['object'])

Unnamed: 0,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,nom_5,nom_6,nom_7,nom_8,nom_9,ord_1,ord_2,ord_3,ord_4,ord_5
count,300000,300000,300000,300000,300000,300000,300000,300000,300000,300000,300000,300000,300000,300000,300000,300000,300000
unique,2,2,3,6,6,6,4,222,522,1220,2215,11981,5,6,15,26,192
top,T,Y,Green,Trapezoid,Lion,Russia,Oboe,f7821e391,d173ac7ca,3a114adea,c720f85ca,163cc60fa,Novice,Freezing,g,L,od
freq,153535,191633,127341,101181,101295,101123,92293,2801,1218,534,299,72,126583,99816,36405,19066,5019


In [4]:
def cyclical_encode(X: pd.Series, x_max: float) -> pd.DataFrame:
    """Project a periodic feature onto the unit circle.

    Parameters
    ----------
    X : pd.Series
        Values to encode; ``X.name`` is used as the prefix of the output columns.
    x_max : float
        Value treated as one full period (``X == x_max`` maps to the angle 2*pi).

    Returns
    -------
    pd.DataFrame
        Two columns, ``<name>_cos`` and ``<name>_sin``, on a fresh 0..n-1 index
        (the index of ``X`` is discarded).
    """
    angle = 2*np.pi * X / x_max
    return pd.DataFrame({
        f'{X.name}_cos': np.cos(angle).to_numpy(),
        f'{X.name}_sin': np.sin(angle).to_numpy(),
    })

In [5]:
def report(y_train, y_train_pred, y_test, y_test_pred, y_train_proba=None, y_test_proba=None):
    """Print classification metrics for a train and a test split.

    Prints a classification report for each split, the ROC AUC of both splits
    (only when both probability vectors are supplied) and the test confusion
    matrix. Nothing is returned.
    """
    splits = (
        ('Train\n', y_train, y_train_pred),
        ('Test\n', y_test, y_test_pred),
    )
    for heading, truth, predicted in splits:
        print(heading, classification_report(truth, predicted, digits=3))

    probas_missing = y_train_proba is None or y_test_proba is None
    if not probas_missing:
        roc_train = roc_auc_score(y_train, y_train_proba)
        roc_test = roc_auc_score(y_test, y_test_proba)
        print(f'Train ROC_AUC: {roc_train:.3f}, Test ROC_AUC: {roc_test:.3f}')

    confusion = pd.crosstab(y_test, y_test_pred)
    print('Confusion Matrix', '\n', confusion)

In [6]:
class CyclicEncoder(BaseEstimator, TransformerMixin):
    """Append cos/sin encodings of the ``month`` and ``day`` columns.

    The period of each feature is the largest value seen during ``fit``.
    The original ``month`` and ``day`` columns are kept in the output.
    """

    def fit(self, X, y=None):
        """Store the maximum of ``month`` and ``day``; ``y`` is ignored."""
        self.max_month, self.max_day = X['month'].max(), X['day'].max()
        return self

    def transform(self, X):
        """Return a copy of ``X`` with four extra columns and the index of ``X``."""
        # cyclical_encode returns a 0..n-1 index, so work positionally and
        # put the caller's index back at the end.
        positional = X.reset_index(drop=True)
        month_part = cyclical_encode(positional['month'].rename('month'), self.max_month)
        day_part = cyclical_encode(positional['day'].rename('day'), self.max_day)
        encoded = pd.concat([positional, month_part, day_part], axis=1)
        encoded.index = X.index
        return encoded

class DataFrameR(BaseEstimator, TransformerMixin):
    """Wrap array-like data in a DataFrame with fixed column names.

    Parameters
    ----------
    columns : sequence
        Column labels; their number must match the width of the data.
    index : optional
        Row index for the resulting frame; pandas' default index when omitted.
    """

    def __init__(self, columns, index=None):
        self.columns = columns
        self.index = index

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X):
        """Build the DataFrame; an AssertionError is raised on a width mismatch."""
        expected, actual = len(self.columns), X.shape[1]
        assert expected == actual, f'Number of columns {expected} is not equal to data shape {actual}'
        # index=None is the constructor default, so a single call covers both cases.
        return pd.DataFrame(X, columns=self.columns, index=self.index)

In [7]:
# Separate features from the label, then hold out a stratified 30% for validation.
y = train[TARGET_NAME]
X = train.drop(columns=TARGET_NAME)

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    shuffle=True,
    random_state=123,
)

In [9]:
# Column labels handed to DataFrameR(cols) to rebuild a DataFrame after resampling.
# NOTE(review): 'month' and 'day' are not listed, but CyclicEncoder.transform
# currently keeps them (its drop is commented out), so DataFrameR's width check
# would presumably fail — confirm before re-enabling the sampler path.
cols = ['bin_0', 'bin_1', 'bin_2', 'bin_3', 'bin_4', 'nom_0', 'nom_1', 'nom_2',
       'nom_3', 'nom_4', 'nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9', 'ord_0',
       'ord_1', 'ord_2', 'ord_3', 'ord_4', 'ord_5', 'month_cos', 'month_sin',
       'day_cos', 'day_sin']

In [10]:
# Cyclic encoding, optional undersampling, then conversion back to a DataFrame
# (done step by step because a Pipeline is awkward here).
# Remember the test ids now so the final predictions can be indexed correctly.
test_index = test.index

encoder = CyclicEncoder()
sampler = RandomUnderSampler(random_state=123)
df = DataFrameR(cols)


# The encoder is fitted on the training split only and reused for validation.
X_train = encoder.fit_transform(X_train, y_train)
#X_train, y_train = sampler.fit_resample(X_train, y_train)  # uncomment to use the sampler
#X_train = df.fit_transform(X_train) # uncomment together with the sampler

X_valid = encoder.transform(X_valid)

In [11]:
# Every object- or int64-typed column is passed to CatBoost as a categorical feature.
cat_cols = list(X_train.select_dtypes(include=['object', np.int64]).columns)

train_pool, valid_pool = (
    Pool(features, labels, cat_features=cat_cols)
    for features, labels in ((X_train, y_train), (X_valid, y_valid))
)

In [132]:
# Baseline: CatBoost with library-default hyperparameters, fitted on the
# training pool with the validation pool as eval set.
base = CatBoostClassifier(verbose=0)
base.fit(train_pool, eval_set=valid_pool)

# Hard labels and positive-class probabilities for both splits.
base_train_pred, base_test_pred = base.predict(X_train), base.predict(X_valid)
base_train_proba, base_test_proba = (
    base.predict_proba(part)[:, 1] for part in (X_train, X_valid)
)

# Print metrics and the confusion matrix.
report(y_train, base_train_pred, y_valid, base_test_pred,
       y_train_proba=base_train_proba, y_test_proba=base_test_proba)

Train
               precision    recall  f1-score   support

           0      0.775     0.741     0.758     64235
           1      0.752     0.784     0.768     64235

    accuracy                          0.763    128470
   macro avg      0.763     0.763     0.763    128470
weighted avg      0.763     0.763     0.763    128470

Test
               precision    recall  f1-score   support

           0      0.860     0.698     0.771     62471
           1      0.520     0.743     0.612     27529

    accuracy                          0.712     90000
   macro avg      0.690     0.721     0.691     90000
weighted avg      0.756     0.712     0.722     90000

Train ROC_AUC: 0.846, Test ROC_AUC: 0.797
Confusion Matrix 
 col_0       0      1
target              
0       43604  18867
1        7073  20456


In [143]:
# Fixed CatBoost settings shared by every Optuna trial; the tuned values are
# merged with these inside the objective.
params_model = dict(
    eval_metric='AUC',
    iterations=1000,
    auto_class_weights='Balanced',
    silent=True,
    one_hot_max_size=30,
    early_stopping_rounds=10,
    grow_policy='SymmetricTree',
    allow_writing_files=False,
    use_best_model=True,
    random_seed=123,
    max_ctr_complexity=8,
    combinations_ctr=['Counter:CtrBorderCount=40:Prior=0.5/1'],
    simple_ctr=['Counter:CtrBorderCount=40:Prior=0.5/1'],
)

In [146]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a CatBoost model for one trial.

    Samples the tunable hyperparameters from ``trial``, overlays the fixed
    ``params_model`` settings, fits on the module-level ``train_pool`` with
    ``valid_pool`` as eval set and scores the probabilities on ``X_valid``.
    """
    # The suggest_* calls stay in this order so a seeded sampler draws the same values.
    sampled = {}
    sampled['depth'] = trial.suggest_int('depth', 3, 9)
    sampled['learning_rate'] = trial.suggest_float('learning_rate', 0.01, 0.5)
    sampled['l2_leaf_reg'] = trial.suggest_float('l2_leaf_reg', 1., 5.)
    sampled['bagging_temperature'] = trial.suggest_float('bagging_temperature', 1., 3.)
    sampled['subsample'] = trial.suggest_float('subsample', 0.5, 1.)
    sampled['colsample_bylevel'] = trial.suggest_float('colsample_bylevel', 0.5, 1.)
    sampled['boosting_type'] = trial.suggest_categorical('boosting_type', ['Ordered', 'Plain'])

    # Fixed settings win over sampled ones if a key ever appears in both.
    opt_model = CatBoostClassifier(**{**sampled, **params_model})
    opt_model.fit(train_pool, eval_set=valid_pool)

    positive_proba = opt_model.predict_proba(X_valid)[:, 1]
    return roc_auc_score(y_valid, positive_proba)

In [None]:
# Seeded TPE search that maximizes the objective; stops after 100 trials or
# 6000 seconds, whichever comes first.
study = optuna.create_study(direction="maximize", sampler=TPESampler(seed=123))
study.optimize(objective, timeout=6000, n_trials=100)

print(f'Number of completed trials: {len(study.trials)}')
print('Best trial')
trial = study.best_trial
print(f'Best score: {trial.value}')
print('Best params')
for param_name, param_value in trial.params.items():
    print(f'{param_name}: {param_value}')

In [12]:
# Final CatBoost configuration: the fixed settings plus the tuned hyperparameters.
params_model = dict(
    eval_metric='AUC',
    iterations=1000,
    auto_class_weights='Balanced',
    silent=True,
    one_hot_max_size=30,
    early_stopping_rounds=10,
    grow_policy='SymmetricTree',
    allow_writing_files=False,
    use_best_model=True,
    random_seed=123,
    max_ctr_complexity=8,
    combinations_ctr=['Counter:CtrBorderCount=40:Prior=0.5/1'],
    simple_ctr=['Counter:CtrBorderCount=40:Prior=0.5/1'],
    depth=8,
    learning_rate=0.23959129649837493,
    l2_leaf_reg=4.033554446569162,
    bagging_temperature=2.5818845042059597,
    subsample=0.9242531365577448,
    colsample_bylevel=0.972655865355131,
    boosting_type='Ordered',
)

In [14]:
# Fit CatBoost with the configuration from params_model and report both splits.
cat = CatBoostClassifier(**params_model)
cat.fit(train_pool, eval_set=valid_pool)

cat_train_pred, cat_test_pred = cat.predict(X_train), cat.predict(X_valid)
cat_train_proba, cat_test_proba = (
    cat.predict_proba(part)[:, 1] for part in (X_train, X_valid)
)

report(y_train, cat_train_pred, y_valid, cat_test_pred,
       y_train_proba=cat_train_proba, y_test_proba=cat_test_proba)

Train
               precision    recall  f1-score   support

           0      0.847     0.699     0.766    145765
           1      0.511     0.713     0.595     64235

    accuracy                          0.704    210000
   macro avg      0.679     0.706     0.681    210000
weighted avg      0.744     0.704     0.714    210000

Test
               precision    recall  f1-score   support

           0      0.840     0.695     0.761     62471
           1      0.503     0.701     0.586     27529

    accuracy                          0.697     90000
   macro avg      0.672     0.698     0.673     90000
weighted avg      0.737     0.697     0.707     90000

Train ROC_AUC: 0.780, Test ROC_AUC: 0.770
Confusion Matrix 
 col_0       0      1
target              
0       43417  19054
1        8242  19287


In [17]:
test = encoder.transform(test)

In [22]:
submission = pd.DataFrame({'id': test.index, TARGET_NAME: cat.predict_proba(test)[:,1]})
submission.to_csv('submission.csv', index=False, encoding='utf-8')

In [23]:
# Upload submission.csv to the competition via the Kaggle CLI.
!kaggle competitions submit -c cat-in-the-dat -f submission.csv -m 'test1'
# Score (presumably the public leaderboard — confirm): 0.76877
# Private score: 0.76349

100% 5.00M/5.00M [00:03<00:00, 1.60MB/s]
Successfully submitted to Categorical Feature Encoding Challenge

The data contains binary features (bin_*), nominal features (nom_*), ordinal features (ord_*) as well as (potentially cyclical) day (of the week) and month features. The string ordinal features ord_{3-5} are lexically ordered according to string.ascii_letters.

In [8]:
# New features: the 1-based position in string.ascii_letters of each of the
# two characters of ord_5 (0 when the character is not an ASCII letter).
for frame in (train, test):
    for position in (0, 1):
        frame[f'ord_5_n{position + 1}'] = (
            frame['ord_5']
            .apply(lambda code, position=position: string.ascii_letters.find(code[position]) + 1)
            .astype(np.float32)
        )

In [9]:
# Group the feature names by how they will be encoded.
binary_cols, nominal_one_hot, nominal_high_card = [], [], []
for col in train.columns:
    if train[col].dtype != 'object':
        continue
    if col.startswith('bin'):
        binary_cols.append(col)
    elif col.startswith('nom'):
        # Fewer than 10 distinct values -> one-hot; otherwise high cardinality.
        bucket = nominal_one_hot if train[col].nunique() < 10 else nominal_high_card
        bucket.append(col)

# Every ord* column counts here, whatever its dtype.
ordinal_cols = [name for name in train.columns if name.startswith('ord')]
cycle_cols = ['month', 'day']

In [10]:
# Separate features from the label, then hold out a stratified 30% for validation.
y = train[TARGET_NAME]
X = train.drop(columns=TARGET_NAME)

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    shuffle=True,
    random_state=123,
)

In [97]:
# Fit the cyclic encoder on the training split, then apply it to the
# validation split and to the test set.
encoder = CyclicEncoder()
X_train = encoder.fit_transform(X_train, y_train)
X_valid, test_ = (encoder.transform(frame) for frame in (X_valid, test))

final_cols = list(X_train.columns)
test_index = test.index

In [98]:
# One encoder per feature group; columns not listed are passed through unchanged.
columns_transformer = ColumnTransformer(
    transformers=[
        ('binary', ce.BinaryEncoder(), binary_cols),
        ('one-hot', ce.OneHotEncoder(), nominal_one_hot),
        ('target', ce.CatBoostEncoder(random_state=123, a=3), nominal_high_card),
        ('ordinal', ce.OrdinalEncoder(), ordinal_cols),
    ],
    remainder='passthrough',
)

# Fitted on the training split only; validation and test are only transformed.
X_train_ = columns_transformer.fit_transform(X_train, y_train)
X_valid_ = columns_transformer.transform(X_valid)
test_ = columns_transformer.transform(test_)

In [29]:
# Fixed LightGBM settings shared by every Optuna trial.
model_params = {
                'objective': 'cross_entropy', # binary
                'n_estimators': 800,
                'n_jobs': -1,
                # LightGBM's parameter is `is_unbalance`; the former key
                # `is_unbalanced` is not a documented name or alias.
                # NOTE(review): the docs list this flag for the binary /
                # multiclassova objectives only — confirm it matters here.
                'is_unbalance': True,
                'random_state': 123
}
# Keyword arguments forwarded to LGBMClassifier.fit.
fit_params = dict(
    early_stopping_rounds=10,
    eval_set=[(X_valid_, y_valid)],
    eval_metric='auc',
    verbose=False,
)

In [30]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a LightGBM model for one trial.

    Samples the tunable hyperparameters from ``trial``, overlays the fixed
    ``model_params`` settings, fits on the module-level ``X_train_`` /
    ``y_train`` with ``fit_params`` and scores the probabilities on ``X_valid_``.
    """
    # The suggest_* calls stay in this order so a seeded sampler draws the same values.
    sampled = {}
    sampled['max_depth'] = trial.suggest_int('max_depth', 3, 9)
    sampled['learning_rate'] = trial.suggest_float('learning_rate', 0.01, 0.5)
    sampled['reg_alpha'] = trial.suggest_float('reg_alpha', 1., 5.)
    sampled['reg_lambda'] = trial.suggest_float('reg_lambda', 1., 5.)
    sampled['num_leaves'] = trial.suggest_int('num_leaves', 20, 265)
    sampled['subsample'] = trial.suggest_float('subsample', 0.3, 1.)
    sampled['colsample_bytree'] = trial.suggest_float('colsample_bytree', 0.3, 1.)
    sampled['max_bin'] = trial.suggest_int('max_bin', 60, 260)
    sampled['min_child_samples'] = trial.suggest_int('min_child_samples', 120, 260)
    sampled['min_child_weight'] = trial.suggest_float('min_child_weight', 0.001, 0.1)
    sampled['boosting_type'] = trial.suggest_categorical('boosting_type', ['goss', 'gbdt'])

    # Fixed settings win over sampled ones if a key ever appears in both.
    opt_model = LGBMClassifier(**{**sampled, **model_params})
    opt_model.fit(X_train_, y_train, **fit_params)

    positive_proba = opt_model.predict_proba(X_valid_)[:, 1]
    return roc_auc_score(y_valid, positive_proba)

In [None]:
# Seeded TPE search that maximizes the objective; stops after 100 trials or
# 6000 seconds, whichever comes first.
study = optuna.create_study(direction="maximize", sampler=TPESampler(seed=123))
study.optimize(objective, timeout=6000, n_trials=100)

print(f'Number of completed trials: {len(study.trials)}')
print('Best trial')
trial = study.best_trial
print(f'Best score: {trial.value}')
print('Best params')
for param_name, param_value in trial.params.items():
    print(f'{param_name}: {param_value}')

In [106]:
# best params without month, day (only cyclical), without ord_5_n1, ord_5_n2, a=2 (in CatBoostEncoder)
# NOTE: these values were found while the misspelled `is_unbalanced` key was in
# place; with the key corrected and objective='binary' the class re-weighting
# is expected to take effect, so scores may differ from the earlier run.
model_params = {
                'max_depth': 3,
                'learning_rate': 0.29375900834037568, # 0.19375900834037568
                'reg_alpha': 2.126096090703706,
                'reg_lambda': 4.0535041350353165,
                'num_leaves': 221,
                'subsample': 0.6556369768270103, # 0.5556369768270103
                'colsample_bytree': 0.3020488780017279,
                'max_bin': 253,
                'min_child_samples': 208,
                'min_child_weight': 0.0201242511324658,
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'n_estimators': 800,
                'n_jobs': -1,
                # LightGBM's parameter is `is_unbalance` (was `is_unbalanced`).
                'is_unbalance': True,
                'random_state': 123,
}
# Keyword arguments forwarded to LGBMClassifier.fit (patience was 10 before).
fit_params = dict(
    early_stopping_rounds=20,
    eval_set=[(X_valid_, y_valid)],
    eval_metric='auc',
    verbose=False,
)

In [13]:
# best params with month and day, ord_5_n1, ord_5_n2, a=3 in CatBoostEncoder (Smoothing)
model_params = {
                'objective': 'cross_entropy',
                'n_estimators': 800,
                'n_jobs': -1,
                # LightGBM's parameter is `is_unbalance`; the former key
                # `is_unbalanced` is not a documented name or alias.
                # NOTE(review): the docs list this flag for the binary /
                # multiclassova objectives only — confirm it matters here.
                'is_unbalance': True,
                'random_state': 123,
                'max_depth': 3,
                'learning_rate': 0.08558504736138631, # 0.08558504736138631
                'reg_alpha': 4.6185587467243945,
                'reg_lambda': 4.063505933330164,
                'num_leaves': 255,
                'subsample': 0.4461280745191487,
                'colsample_bytree': 0.3032833481032431,
                'max_bin': 254,
                'min_child_samples': 214,
                'min_child_weight': 0.09950864454542213,
                'boosting_type': 'gbdt',
}

In [None]:
# Keyword arguments forwarded to LGBMClassifier.fit (patience was 10 before).
# NOTE(review): newer LightGBM releases configure early stopping and logging
# through callbacks instead of fit keywords — confirm against the installed version.
fit_params = dict(
    early_stopping_rounds=50,
    eval_set=[(X_valid_, y_valid)],
    eval_metric='auc',
    verbose=False,
)

In [110]:
# Fit LightGBM on the encoded training split and report both splits.
lgbm = LGBMClassifier(**model_params)
lgbm.fit(X_train_, y_train, **fit_params)

lgbm_train_pred, lgbm_test_pred = lgbm.predict(X_train_), lgbm.predict(X_valid_)
lgbm_train_proba, lgbm_test_proba = (
    lgbm.predict_proba(part)[:, 1] for part in (X_train_, X_valid_)
)

report(y_train, lgbm_train_pred, y_valid, lgbm_test_pred,
       y_train_proba=lgbm_train_proba, y_test_proba=lgbm_test_proba)

Train
               precision    recall  f1-score   support

           0      0.783     0.905     0.840    145765
           1      0.666     0.432     0.525     64235

    accuracy                          0.760    210000
   macro avg      0.725     0.669     0.682    210000
weighted avg      0.748     0.760     0.743    210000

Test
               precision    recall  f1-score   support

           0      0.784     0.909     0.842     62471
           1      0.676     0.433     0.528     27529

    accuracy                          0.763     90000
   macro avg      0.730     0.671     0.685     90000
weighted avg      0.751     0.763     0.746     90000

Train ROC_AUC: 0.798, Test ROC_AUC: 0.800
Confusion Matrix 
 col_0       0      1
target              
0       56762   5709
1       15615  11914


In [111]:
# Positive-class probabilities from the LightGBM model, keyed by test id.
submission = pd.DataFrame({'id': test_index})
submission[TARGET_NAME] = lgbm.predict_proba(test_)[:, 1]
submission.to_csv('submission.csv', encoding='utf-8', index=False)

In [112]:
# Upload submission.csv to the competition via the Kaggle CLI.
!kaggle competitions submit -c cat-in-the-dat -f submission.csv -m 'test1'

100% 5.06M/5.06M [00:07<00:00, 675kB/s]
Successfully submitted to Categorical Feature Encoding Challenge

In [69]:
# load train, test
# make new features
# define X,y, columns lists
# define model_params with cross_entropy objective

In [14]:
# Fold-averaged test predictions: the encoders and LightGBM are refitted on
# each stratified fold and the test probabilities are averaged at the end.
test_index = test.index

skf = StratifiedKFold(n_splits=3)
encoder = CyclicEncoder()
columns_transformer = ColumnTransformer(
    transformers=[
        ('binary', ce.BinaryEncoder(), binary_cols),
        ('one-hot', ce.OneHotEncoder(), nominal_one_hot),
        ('target', ce.CatBoostEncoder(random_state=123, a=3), nominal_high_card),
        ('ordinal', ce.OrdinalEncoder(), ordinal_cols),
    ],
    remainder='passthrough',
)

test_results = []
for train_index, valid_index in skf.split(X.values, y.values):
    X_train, y_train = X.iloc[train_index, :], y.values[train_index]
    X_valid, y_valid = X.iloc[valid_index, :], y.values[valid_index]

    # Encoders are fitted on the training fold only, then applied to the other sets.
    X_train = columns_transformer.fit_transform(encoder.fit_transform(X_train, y_train), y_train)
    X_valid = columns_transformer.transform(encoder.transform(X_valid))
    test_ = columns_transformer.transform(encoder.transform(test))

    # The validation fold drives early stopping.
    fit_params = dict(
        early_stopping_rounds=10,
        eval_set=[(X_valid, y_valid)],
        eval_metric='auc',
        verbose=False,
    )
    lgbm = LGBMClassifier(**model_params)
    lgbm.fit(X_train, y_train, **fit_params)

    test_results.append(lgbm.predict_proba(test_)[:, 1])

# Element-wise mean over the per-fold probability vectors.
final_test = np.mean(test_results, axis=0)

In [15]:
# Fold-averaged probabilities, keyed by test id.
submission = pd.DataFrame({'id': test_index})
submission[TARGET_NAME] = final_test
submission.to_csv('submission.csv', encoding='utf-8', index=False)

In [16]:
# Upload submission.csv to the competition via the Kaggle CLI.
!kaggle competitions submit -c cat-in-the-dat -f submission.csv -m 'test1'

100% 5.05M/5.05M [00:00<00:00, 18.1MB/s]
Successfully submitted to Categorical Feature Encoding Challenge



```
encoder = ce.BackwardDifferenceEncoder(cols=[...])
encoder = ce.BaseNEncoder(cols=[...])
encoder = ce.BinaryEncoder(cols=[...])
encoder = ce.CatBoostEncoder(cols=[...])
encoder = ce.CountEncoder(cols=[...])
encoder = ce.GLMMEncoder(cols=[...])
encoder = ce.HashingEncoder(cols=[...])
encoder = ce.HelmertEncoder(cols=[...])
encoder = ce.JamesSteinEncoder(cols=[...])
encoder = ce.LeaveOneOutEncoder(cols=[...])
encoder = ce.MEstimateEncoder(cols=[...])
encoder = ce.OneHotEncoder(cols=[...])
encoder = ce.OrdinalEncoder(cols=[...])
encoder = ce.SumEncoder(cols=[...])
encoder = ce.PolynomialEncoder(cols=[...])
encoder = ce.TargetEncoder(cols=[...])
encoder = ce.WOEEncoder(cols=[...])
encoder = ce.QuantileEncoder(cols=[...])

encoder.fit(X, y)
X_cleaned = encoder.transform(X_dirty)
```




Best params, Private - 0.79485, Public - 0.79988
```
model_params = {
                'objective': 'cross_entropy',
                'n_estimators': 800,
                'n_jobs': -1,
                'is_unbalanced': True,
                'random_state': 123,
                'max_depth': 3,
                'learning_rate': 0.08558504736138631, 
                'reg_alpha': 4.6185587467243945,
                'reg_lambda': 4.063505933330164,
                'num_leaves': 255,
                'subsample': 0.4461280745191487,
                'colsample_bytree': 0.3032833481032431,
                'max_bin': 254,
                'min_child_samples': 214,
                'min_child_weight': 0.09950864454542213,
                'boosting_type': 'gbdt',
}

fit_params = {'early_stopping_rounds': 10,  
              'eval_set': [(X_valid_, y_valid)],  
              'eval_metric': 'auc',
              'verbose': False
}
```

