This notebook is a shortened version of a notebook used for internal Kaggle competition. The task was to identyfy user based on their browsing patterns. In essence this was a binary classification task. The dataset was highly imbalanced. Quite a lot of cells have been removed for clarity and only ones that contributed to succesful results were left. To me, the most optimal solution resulted from applying undersampling, using CatBoost classifier and stratifiedKfold to reduce number of features

In [None]:
!pip install tldextract

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install imbalanced-learn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from google.colab import files
import io
import re
import pandas as pd
import numpy as np
import plotly.express as px
import tldextract
import lightgbm as lgb
import tensorflow as tf
import gc
from tensorflow import keras
from sklearn.metrics import roc_auc_score, precision_score, recall_score, roc_curve, auc, average_precision_score, precision_recall_curve, plot_precision_recall_curve, f1_score
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from urllib.parse import urlparse
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK, anneal, rand
from imblearn.under_sampling import ClusterCentroids
from imblearn.over_sampling import SMOTE
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        


In [None]:
uploaded = files.upload()

Saving id_map.parquet to id_map (1).parquet
Saving test.csv to test (1).csv
Saving train.csv to train (1).csv


In [None]:
train = pd.read_csv(io.BytesIO(uploaded['train.csv']))
train.head()

Unnamed: 0,session_id,webpage1,time1,webpage2,time2,webpage3,time3,webpage4,time4,webpage5,...,time6,webpage7,time7,webpage8,time8,webpage9,time9,webpage10,time10,target
0,0,9486,2019-02-20 05:57:45,,,,,,,,...,,,,,,,,,,0
1,1,11722,2019-02-22 07:14:50,12385.0,2019-02-22 07:14:50,50163.0,2019-02-22 07:14:51,12385.0,2019-02-22 07:14:51,12398.0,...,2019-02-22 07:14:51,50163.0,2019-02-22 07:14:52,50150.0,2019-02-22 07:14:52,19860.0,2019-02-22 07:15:15,19886.0,2019-02-22 07:15:16,0
2,2,192149,2018-12-16 12:35:17,659.0,2018-12-16 12:35:18,192136.0,2018-12-16 12:35:19,192149.0,2018-12-16 12:35:19,633.0,...,2018-12-16 12:35:19,192136.0,2018-12-16 12:35:20,192136.0,2018-12-16 12:35:21,192136.0,2018-12-16 12:35:22,192136.0,2018-12-16 12:35:24,0
3,3,10591,2019-02-13 12:40:35,451.0,2019-02-13 12:40:35,77580.0,2019-02-13 12:40:35,227821.0,2019-02-13 12:40:35,633.0,...,2019-02-13 12:42:14,10591.0,2019-02-13 12:42:14,227834.0,2019-02-13 12:42:15,227834.0,2019-02-13 12:42:16,227834.0,2019-02-13 12:42:17,0
4,4,438,2018-04-12 06:22:26,425.0,2018-04-12 06:22:26,529.0,2018-04-12 06:22:28,65685.0,2018-04-12 06:22:29,187638.0,...,2018-04-12 06:22:29,425.0,2018-04-12 06:22:29,65685.0,2018-04-12 06:22:31,187625.0,2018-04-12 06:22:31,187625.0,2018-04-12 06:22:32,0


In [None]:
test = pd.read_csv(io.BytesIO(uploaded['test.csv']))


# Feature engineering part 1 - handling webpage IDs and replacing them with domain groups 

In [None]:
id_map = pd.read_parquet(io.BytesIO(uploaded['id_map.parquet']))
id_map.head()

Unnamed: 0,id,webpage
0,326127,www.abmecatronique.com
1,182113,groups.live.com
2,551820,majeureliguefootball.wordpress.com
3,401995,cdt46.media.tourinsoft.eu
4,105504,www.hdwallpapers.eu


In [None]:
id_map.dtypes

id          int64
webpage    object
dtype: object

In [None]:
id_map.sort_values('id')


Unnamed: 0,id,webpage
17196,165,fpdownload2.macromedia.com
827,178,hotmail.fr
9719,191,login.live.com
16707,204,mail.live.com
4511,217,dub122.mail.live.com
...,...,...
13324,628923,www.mathjax.org
20897,628936,lemonde-educ.blog.lemonde.fr
7131,628949,evcs-crl.ws.symantec.com
31509,628962,www.tunisie.campusfrance.org


In [None]:
id_map.head()

Unnamed: 0,id,webpage
0,326127,www.abmecatronique.com
1,182113,groups.live.com
2,551820,majeureliguefootball.wordpress.com
3,401995,cdt46.media.tourinsoft.eu
4,105504,www.hdwallpapers.eu


In [None]:
id_map['domain'] = id_map['webpage'].str.split(".", n=2, expand=True)[1]
id_map['id_group'], uniques = pd.factorize(id_map['domain'])

In [None]:
id_map.head(10)

Unnamed: 0,id,webpage,domain,id_group
0,326127,www.abmecatronique.com,abmecatronique,0
1,182113,groups.live.com,live,1
2,551820,majeureliguefootball.wordpress.com,wordpress,2
3,401995,cdt46.media.tourinsoft.eu,media,3
4,105504,www.hdwallpapers.eu,hdwallpapers,4
5,487535,img378.imageshack.us,imageshack,5
6,71158,ecologie.nature.free.fr,nature,6
7,460677,www.ibcn.intec.ugent.be,ibcn,7
8,392115,kissanime.com,com,8
9,497636,www.carolineconduiteformation.com,carolineconduiteformation,9


In [None]:
def replace_website_ids(row, id_map):
    for i in range(1, 11):
        col = 'webpage{}'.format(i)
        if row[col] in id_map:
            row[col] = id_map[row[col]]
    return row

train = train.apply(replace_website_ids, axis=1, args=(id_map,))

In [None]:
test = test.apply(replace_website_ids, axis=1, args=(id_map,))

In [None]:
train.columns

Index(['session_id', 'webpage1', 'time1', 'webpage2', 'time2', 'webpage3',
       'time3', 'webpage4', 'time4', 'webpage5', 'time5', 'webpage6', 'time6',
       'webpage7', 'time7', 'webpage8', 'time8', 'webpage9', 'time9',
       'webpage10', 'time10', 'target'],
      dtype='object')

In [None]:
train.dtypes

session_id      int64
webpage1        int64
time1          object
webpage2      float64
time2          object
webpage3      float64
time3          object
webpage4      float64
time4          object
webpage5      float64
time5          object
webpage6      float64
time6          object
webpage7      float64
time7          object
webpage8      float64
time8          object
webpage9      float64
time9          object
webpage10     float64
time10         object
target          int64
dtype: object

# Feature engineering pt. 2 - creating timedate based features

In [None]:
time_columns = [col for col in train.columns if 'time' in col]
for col in time_columns:
    train[col] = pd.to_datetime(train[col])

In [None]:
time_columns = [col for col in test.columns if 'time' in col]
for col in time_columns:
    test[col] = pd.to_datetime(test[col])

In [None]:
train.dtypes

session_id             int64
webpage1               int64
time1         datetime64[ns]
webpage2             float64
time2         datetime64[ns]
webpage3             float64
time3         datetime64[ns]
webpage4             float64
time4         datetime64[ns]
webpage5             float64
time5         datetime64[ns]
webpage6             float64
time6         datetime64[ns]
webpage7             float64
time7         datetime64[ns]
webpage8             float64
time8         datetime64[ns]
webpage9             float64
time9         datetime64[ns]
webpage10            float64
time10        datetime64[ns]
target                 int64
dtype: object

In [None]:
def calculate_session_duration(row):
    for i in range(10, 0, -1):
        if pd.notnull(row['time' + str(i)]):
            return (row['time' + str(i)] - row['time1']).total_seconds()
    return np.nan

train['session_duration'] = train.apply(calculate_session_duration, axis=1)

time_cols = ['time1', 'time2', 'time3', 'time4', 'time5', 'time6', 'time7', 'time8', 'time9', 'time10']
time_diff_cols = ['time_diff_1_2', 'time_diff_2_3', 'time_diff_3_4', 'time_diff_4_5',
                  'time_diff_5_6', 'time_diff_6_7', 'time_diff_7_8', 'time_diff_8_9', 'time_diff_9_10']
for i in range(1, 10):
    train[time_diff_cols[i-1]] = (train[time_cols[i]] - train[time_cols[i-1]]).dt.total_seconds()

train['avg_time_per_site'] = train.session_duration / (train[time_diff_cols].count(axis=1) + 1)

In [None]:
test['session_duration'] = test.apply(calculate_session_duration, axis=1)
for i in range(1, 10):
    test[time_diff_cols[i-1]] = (test[time_cols[i]] - test[time_cols[i-1]]).dt.total_seconds()
test['avg_time_per_site'] = test.session_duration / (test[time_diff_cols].count(axis=1) + 1)

In [None]:
def add_time_features(df, time_cols):
    for col in time_cols:
        df[col + '_hour'] = df[col].dt.hour
        df[col + '_minute'] = df[col].dt.minute
        df[col + '_day_name'] = df[col].dt.day_name()
        df[col + '_month'] = df[col].dt.month
        
    return df

In [None]:
train = add_time_features(train, time_cols)

In [None]:
test = add_time_features(test, time_cols)

In [None]:
def classify_time_of_day(row):
    hour = row['time1'].hour
    if hour >= 6 and hour < 12:
        return 'morning'
    elif hour >= 12 and hour < 18:
        return 'afternoon'
    elif hour >= 18 and hour < 24:
        return 'evening'
    else:
        return 'night'

train['time_of_day'] = train.apply(classify_time_of_day, axis=1)

In [None]:
test['time_of_day'] = test.apply(classify_time_of_day, axis=1)

# More features based on webiste/domain ids

In [None]:
def count_unique_ids(row):
    ids = [row['webpage1'], row['webpage2'], row['webpage3'], row['webpage4'], row['webpage5'],
           row['webpage6'], row['webpage7'], row['webpage8'], row['webpage9'], row['webpage10']]
    filtered_ids = [i for i in ids if not pd.isna(i)]
    return len(set(filtered_ids))

def create_session_features(df):
    df['unique_ids'] = df.apply(count_unique_ids, axis=1)
    return df

train = create_session_features(train)

In [None]:
test = create_session_features(test)

In [None]:
def session_length(df):
    return df[['webpage1', 'webpage2', 'webpage3', 'webpage4', 'webpage5',
               'webpage6', 'webpage7', 'webpage8', 'webpage9', 'webpage10']].notnull().sum(axis=1)
train['session_length'] = session_length(train)

In [None]:
test['session_length'] = session_length(test)

In [None]:
def count_repeated_webpages(df):
    repeated_webpages = df['session_length'] - df['unique_ids']
    df['repeated_webpages'] = repeated_webpages
    return df
train = count_repeated_webpages(train)

In [None]:
test = count_repeated_webpages(test)

In [None]:
train['target'].value_counts()

0    158394
1      1575
Name: target, dtype: int64

Helper functions from workshop notebook

In [None]:
def evaluate_model(classifier, train_set, train_target, test_set, test_target, metrics=[roc_auc_score]):
    prediction = train_model(classifier, train_set, train_target, test_set)
    results = get_model_metrics(classifier.__class__.__name__, test_target, prediction, metrics=[roc_auc_score])
    return results


def train_model(classifier, train_set, train_target, test_set):
    classifier.fit(train_set, train_target)
    return classifier.predict_proba(test_set)


def get_model_metrics(classifier_name, test_target, prediction, metrics=[roc_auc_score]):
    mectrics_result = dict()
    for metric in metrics:
        score = metric(test_target, prediction[:, 1])
        print("{} for model {} = {:.3f}".format(metric.__name__, classifier_name, score))
        mectrics_result[metric.__name__] = score
    mectrics_result['classifier'] = classifier_name
    return mectrics_result


XGBOOST AND RANDOM FOREST STEPS REMOVED

# CatBoost


this is to not repeat preprocessing every try

In [None]:
del [[train_with_features, test_with_features]]
gc.collect()
train_with_features = pd.DataFrame()
test_with_features = pd.DataFrame()

In [None]:
train_with_features = train.copy(deep=True)

In [None]:
test_with_features = test.copy(deep=True)

PREPROCESSING STARTS HERE

In [None]:
CATEGORICAL_FEATURES = ['webpage1', 'webpage2', 'webpage3', 'webpage4', 
                        'webpage5', 'webpage6', 'webpage7', 'webpage8',      
                        'webpage9', 'webpage10', 'time_of_day', 'time1_hour', 'time1_minute', 'time1_day_name', 
                        'time2_hour', 'time2_minute', 'time2_day_name', 'time3_hour', 'time3_minute', 'time3_day_name',       
                        'time4_hour', 'time4_minute', 'time4_day_name', 'time5_hour', 'time5_minute', 'time5_day_name',     
                        'time6_hour', 'time6_minute', 'time6_day_name', 'time7_hour', 'time7_minute', 'time7_day_name',         
                        'time8_hour', 'time8_minute', 'time8_day_name', 'time9_hour', 'time9_minute', 'time9_day_name',         
                        'time10_hour', 'time10_minute', 'time10_day_name', 'time1_month', 'time2_month', 'time3_month',
                        'time4_month', 'time5_month', 'time6_month', 'time7_month', 'time8_month', 'time9_month', 'time10_month']



NUM_FEATURES = ['session_duration', 'time_diff_1_2', 'time_diff_2_3', 
                 'time_diff_3_4', 'time_diff_4_5', 'time_diff_5_6', 
                 'time_diff_6_7', 'time_diff_7_8', 'time_diff_8_9', 
                 'time_diff_9_10', 'avg_time_per_site', 'unique_ids']

def process_categorical_only(dataframe, 
                             cat_features=CATEGORICAL_FEATURES):
    for feature in cat_features:
        dataframe[feature] = dataframe[feature].astype('category')
        dataframe[feature] = dataframe[feature].cat.codes
    return dataframe

def preprocess_frame(dataframe, 
                     cat_features=CATEGORICAL_FEATURES,
                     num_features=NUM_FEATURES):
    dataframe = dataframe.set_index("session_id")
    dataframe.drop(['time1', 'time2', 'time3', 'time4', 'time5', 'time6', 'time7', 'time8', 'time9', 'time10'], axis=1, inplace=True)
    dataframe[['webpage2', 'webpage3', 'webpage4', 'webpage5', 'webpage6', 'webpage7', 'webpage8', 'webpage9', 'webpage10']] = dataframe[['webpage2', 'webpage3', 'webpage4', 'webpage5', 'webpage6', 'webpage7', 'webpage8', 'webpage9', 'webpage10']].fillna(-1, inplace=True)
    dataframe = process_categorical_only(dataframe, cat_features)
    dataframe[num_features].fillna(-1, inplace=True)
    return dataframe


In [None]:
train.fillna(-1, inplace=True)
test.fillna(-1, inplace=True)

In [None]:
train_with_features.fillna(-1, inplace=True)
test_with_features.fillna(-1, inplace=True)

In [None]:
train_with_features = preprocess_frame(train_with_features, cat_features=CATEGORICAL_FEATURES, num_features=NUM_FEATURES)

In [None]:
train_with_features.dtypes

webpage1             int16
webpage2              int8
webpage3              int8
webpage4              int8
webpage5              int8
                     ...  
time10_month          int8
time_of_day           int8
unique_ids           int64
session_length       int64
repeated_webpages    int64
Length: 66, dtype: object

In [None]:
train.isna().sum()

session_id           0
webpage1             0
webpage2             0
webpage3             0
webpage4             0
webpage5             0
webpage6             0
webpage7             0
webpage8             0
webpage9             0
webpage10            0
target               0
session_duration     0
time_diff_1_2        0
time_diff_2_3        0
time_diff_3_4        0
time_diff_4_5        0
time_diff_5_6        0
time_diff_6_7        0
time_diff_7_8        0
time_diff_8_9        0
time_diff_9_10       0
avg_time_per_site    0
time_of_day          0
unique_ids           0
session_length       0
repeated_webpages    0
dtype: int64

In [None]:
test_with_features = preprocess_frame(test_with_features, cat_features=CATEGORICAL_FEATURES, num_features=NUM_FEATURES)

In [None]:
X = train_with_features.drop(columns='target')
y = train_with_features.target.values
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2, stratify=y)

CatBoost has been ran multiple times, both on full data and on undersampled data. ClusterCentroids have been tried for undersampling but the models performed significantly worse

In [None]:
under_sampler = RandomUnderSampler(sampling_strategy='majority', random_state=42)

X_resampled, y_resampled = under_sampler.fit_resample(X_train, y_train)

This is a feature reduction function introduced last moment. Greatly improved performance on unseen data on undersampled model, didn't get to try it on model with all data points - started to late to make it to deadline

In [None]:
clf = CatBoostClassifier(verbose=False)
cv = StratifiedKFold(8)

rfecv = RFECV(
    estimator = clf,
    step = 1,
    cv = cv,
    scoring = 'roc_auc',
    min_features_to_select = 1,
    n_jobs = -1
)

rfecv.fit(X_train, y_train)

RFECV(cv=StratifiedKFold(n_splits=8, random_state=None, shuffle=False),
      estimator=<catboost.core.CatBoostClassifier object at 0x7fe9fd8cb2e0>,
      n_jobs=-1, scoring='roc_auc')

Finished too late for submission, 7 hours 36 minutes, cells below are from a n undersampled dataset with fewer Kfolds

In [None]:
print(f"Optimal number of features: {rfecv.n_features_}")

Optimal number of features: 42


In [None]:
rfecv.n_features_in_

65

In [None]:
rfecv.ranking_

array([ 1, 24, 23, 22, 21, 20, 19, 18, 17, 16,  1, 11,  7, 13,  9, 10, 14,
        8, 12,  3,  1,  1,  1,  1,  1,  1,  2,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  6,  1,  1,  1,  4,  1,  1,  1,  1,
        1,  1,  1,  5,  1,  1,  1,  1,  1,  1,  1,  1, 15,  1])

In [None]:
rfecv.support_

array([ True, False, False, False, False, False, False, False, False,
       False,  True, False, False, False, False, False, False, False,
       False, False,  True,  True,  True,  True,  True,  True, False,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True,  True,
        True, False,  True,  True,  True,  True,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
       False,  True])

In [None]:
def filter_features(train_df, test_df, feature_mask):
    selected_columns = train_df.columns[feature_mask]
    filtered_train_df = train_df[selected_columns]
    filtered_test_df = test_df[selected_columns]
    return filtered_train_df, filtered_test_df


filtered_train, filtered_test = filter_features(X_resampled, X_test, rfecv.support_)

In [None]:
FIL_CATEGORICAL_FEATURES = ['webpage1', 'time1_hour', 'time1_minute', 'time1_day_name', 
                        'time2_hour', 'time2_day_name', 'time3_hour', 'time3_minute', 'time3_day_name',       
                        'time4_hour', 'time4_minute', 'time4_day_name', 'time5_hour', 'time5_minute', 'time5_day_name',     
                        'time6_hour', 'time6_day_name', 'time7_hour', 'time7_day_name',         
                        'time8_hour', 'time8_minute', 'time8_day_name', 'time9_hour', 'time9_day_name',         
                        'time10_hour', 'time10_minute', 'time10_day_name', 'time1_month', 'time2_month', 'time3_month',
                        'time4_month', 'time5_month', 'time6_month', 'time7_month', 'time8_month', 'time9_month', 'time10_month', 'time_of_day']

this function has been reused on and on for differently processed datasets

In [None]:
cat_clf = CatBoostClassifier(random_state=42, verbose=False, cat_features=FIL_CATEGORICAL_FEATURES)


result_dict = evaluate_model(cat_clf, filtered_train, y_resampled, filtered_test, y_test)
results_by_model = results_by_model.append(result_dict, ignore_index=True)  

roc_auc_score for model CatBoostClassifier = 0.992


Best ROC-AUC values: 0.992 for undersampled, 0.937 for data unseen by model. Code below is just a snipped to print out results. Did not get to check full model after feature reduction on unseen data

In [None]:
for col in test_with_features.columns:
    if col not in filtered_train.columns:
      test_with_features = test_with_features.drop(col, axis=1)

In [None]:
predictions = cat_clf.predict_proba(test_with_features)

In [None]:
submission = pd.DataFrame({
    "session_id": test.index,
    "target": predictions[:, 1]
})

In [None]:
submission.to_csv("submission.csv", index=False)

In [None]:
files.download("submission.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Cells from some of the things tried previousle are below:
First tuning of hyperparameters 3+ hours run time for most models, very minor improvement, not worth the time

In [None]:
%%time
def objective(space):
    params = {
        'depth': space['depth'],
        'learning_rate': space['learning_rate'],
        'l2_leaf_reg': space['l2_leaf_reg'],
        'n_estimators': space['n_estimators'],
        'scale_pos_weight': space['scale_pos_weight'],
        'random_seed': 42
    }
    
    clf = CatBoostClassifier(
        loss_function='Logloss',
        **params
    )
    
    score = cross_val_score(clf, X_train, y_train, scoring='roc_auc', cv=10).mean()
    print("AUC {:.3f} params {}".format(score, params))
    return {'loss':1 - score, 'status': STATUS_OK }

space = {
    'depth': hp.quniform('depth', 4, 10, 1),
    'learning_rate': hp.uniform('learning_rate', 0, 1),
    'l2_leaf_reg': hp.uniform('l2_leaf_reg', 0, 10),
    'n_estimators': hp.choice('n_estimators', [100, 250, 500, 1000, 3000]),
    'scale_pos_weight': hp.uniform('scale_pos_weight', 0, 10),
}

best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=18)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
2507:	learn: 0.0007171	total: 1m 42s	remaining: 20.2s
2508:	learn: 0.0007171	total: 1m 42s	remaining: 20.2s
2509:	learn: 0.0007171	total: 1m 43s	remaining: 20.1s
2510:	learn: 0.0007171	total: 1m 43s	remaining: 20.1s
2511:	learn: 0.0007171	total: 1m 43s	remaining: 20s
2512:	learn: 0.0007171	total: 1m 43s	remaining: 20s
2513:	learn: 0.0007171	total: 1m 43s	remaining: 19.9s
2514:	learn: 0.0007171	total: 1m 43s	remaining: 19.9s
2515:	learn: 0.0007171	total: 1m 43s	remaining: 19.9s
2516:	learn: 0.0007171	total: 1m 43s	remaining: 19.8s
2517:	learn: 0.0007171	total: 1m 43s	remaining: 19.8s
2518:	learn: 0.0007171	total: 1m 43s	remaining: 19.7s
2519:	learn: 0.0007171	total: 1m 43s	remaining: 19.7s
2520:	learn: 0.0007170	total: 1m 43s	remaining: 19.7s
2521:	learn: 0.0007170	total: 1m 43s	remaining: 19.6s
2522:	learn: 0.0007170	total: 1m 43s	remaining: 19.6s
2523:	learn: 0.0007170	total: 1m 43s	remaining: 19.5s
2524:	learn: 0.000717

AUC 0.976 params {'depth': 6.0, 'learning_rate': 0.05615257844786148, 'l2_leaf_reg': 2.759108720089063, 'n_estimators': 100, 'scale_pos_weight': 0.8447630666140027, 'random_seed': 42}

In [None]:
best

{'depth': 7.0,
 'l2_leaf_reg': 7.019263148917432,
 'learning_rate': 0.06739298575408503,
 'n_estimators': 1,
 'scale_pos_weight': 4.018230317378517}

after first tuning: {'depth': 8.0,
 'l2_leaf_reg': 9.201292429965548,
 'learning_rate': 0.20215265060327547,
 'n_estimators': 0}

In [None]:
best_cb_clf = CatBoostClassifier(loss_function='Logloss',
              random_seed=42,
              learning_rate=0.05615257844786148,
              l2_leaf_reg=2.759108720089063,
              depth=6,
              n_estimators=100, 
              scale_pos_weight = 0.8447630666140027,
              verbose=False, 
              cat_features=CATEGORICAL_FEATURES)

result_dict = evaluate_model(best_cb_clf, X_train, y_train, X_test, y_test)
results_by_model = results_by_model.append(result_dict, ignore_index=True)   

roc_auc_score for model CatBoostClassifier = 0.970


In [None]:
predictions = cat_clf.predict_proba(test_with_features)

CatBoostError: ignored

In [None]:
submission = pd.DataFrame({
    "session_id": test.index,
    "target": predictions[:, 1]
})

In [None]:
submission.to_csv("submission.csv", index=False)

In [None]:
files.download("submission.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>