In [None]:
import warnings
warnings.simplefilter('ignore')

import gc
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from catboost import CatBoostClassifier
import catboost as ctb

In [None]:

# Load the three pre-built splits (v6 feature pickles) from ../temp_data.
# NOTE(review): pickle files execute code on load; only read files you produced yourself.
df_train, df_valid, df_test = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        '../temp_data/df_train_v6.pickle',
        '../temp_data/df_valid_v6.pickle',
        '../temp_data/df_test_v6.pickle',
    )
)

def reduce_mem_usage(df, use_float16=False):
    """Downcast numeric columns of ``df`` in place to narrower dtypes.

    For every int16/int32/int64/float16/float32/float64 column, the column's
    minimum and maximum are compared (inclusively) against the representable
    range of each narrower candidate dtype, and the column is cast to the
    first candidate that fits. Columns of any other dtype are left untouched,
    and a column is never widened.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    use_float16 : bool, default False
        Allow float columns to be stored as float16. Off by default because
        the range check says nothing about precision: float16 only has an
        11-bit significand, so integers above 2048 (e.g. large counts or
        sums) and most fractional values are rounded when cast.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

    # Candidate targets, ordered from narrowest to widest.
    int_candidates = [(t, np.iinfo(t)) for t in (np.int8, np.int16, np.int32)]
    float_types = (np.float16, np.float32) if use_float16 else (np.float32,)
    float_candidates = [(t, np.finfo(t)) for t in float_types]

    for col in tqdm(df.columns):
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        # min()/max() skip NaN; for an all-NaN (or empty) column they are NaN,
        # every range comparison below is False and the column is left as is.
        c_min = df[col].min()
        c_max = df[col].max()

        is_int = str(col_type)[:3] == 'int'
        candidates = int_candidates if is_int else float_candidates

        for target, limits in candidates:
            # Only ever shrink: stop once candidates are as wide as the column.
            if np.dtype(target).itemsize >= col_type.itemsize:
                break
            # Inclusive bounds: a value equal to the dtype limit still fits.
            if c_min >= limits.min and c_max <= limits.max:
                df[col] = df[col].astype(target)
                break
    return df

# Downcast the numeric columns of every split, then report each split's shape.
df_train, df_valid, df_test = map(reduce_mem_usage, (df_train, df_valid, df_test))
print(df_train.shape, df_valid.shape, df_test.shape)

In [None]:
# Data-quality scan: the same three column checks are run on train and test.
def _single_valued(frame):
    """Return the columns of ``frame`` with at most one distinct non-null value."""
    return [name for name in frame.columns if frame[name].nunique() <= 1]


def _mostly_null(frame):
    """Return the columns of ``frame`` whose share of null rows exceeds 0.9."""
    return [
        name for name in frame.columns
        if frame[name].isnull().sum() / frame.shape[0] > 0.9
    ]


def _dominated(frame):
    """Return the columns whose most frequent value (NaN counted) covers more than 0.9 of rows."""
    return [
        name for name in frame.columns
        if frame[name].value_counts(dropna=False, normalize=True).values[0] > 0.9
    ]


print(f'There are {df_train.isnull().any().sum()} columns in train dataset with missing values.')

one_value_cols = _single_valued(df_train)
one_value_cols_test = _single_valued(df_test)
print(one_value_cols,one_value_cols_test)

many_null_cols = _mostly_null(df_train)
many_null_cols_test = _mostly_null(df_test)
print(many_null_cols,many_null_cols_test)

big_top_value_cols = _dominated(df_train)
big_top_value_cols_test = _dominated(df_test)
print(big_top_value_cols,big_top_value_cols_test)

In [None]:
# Count how often each value of the 'is_finish' column (the training target) occurs in the train split.
df_train['is_finish'].value_counts()

In [None]:
# Target column predicted by this notebook.
ycol = 'is_finish'

# Hand-picked model inputs: 'tag' plus the cnt / sum / mean columns keyed by
# videoid and by userid. The exclusion-based selection that used to precede
# this list was dead code (its result was overwritten before any use), so it
# has been removed; the effective feature set is unchanged.
feats = [
    'tag',
    'videoid_cnt',
    'videoid_is_like_sum',
    'videoid_is_favourite_sum',
    'videoid_is_share_sum',
    'videoid_is_finish_sum',
    'videoid_is_like_mean',
    'videoid_is_favourite_mean',
    'videoid_is_share_mean',
    'videoid_is_finish_mean',
    'userid_cnt',
    'userid_is_like_sum',
    'userid_is_favourite_sum',
    'userid_is_share_sum',
    'userid_is_finish_sum',
    'userid_is_like_mean',
    'userid_is_favourite_mean',
    'userid_is_share_mean',
    'userid_is_finish_mean',
]


In [None]:
# Keyword arguments forwarded to CatBoostClassifier.
# NOTE(review): task_type='GPU' presumably requires a CUDA-capable device — confirm before running elsewhere.
cat_params = dict(
    task_type='GPU',
    learning_rate=0.1,
    loss_function='Logloss',
    iterations=100000,  # upper bound; an earlier setting was 10000
    random_seed=2022,
    max_depth=6,
    reg_lambda=0.05,
    early_stopping_rounds=10,
)

In [None]:
# Fit on the train split, passing the validation split as eval_set;
# progress is logged every 100 iterations.
model = CatBoostClassifier(**cat_params)
model.fit(
    X=df_train[feats],
    y=df_train[ycol],
    eval_set=(df_valid[feats], df_valid[ycol]),
    verbose=100,
)

In [None]:
# Store the positive-class probability on the validation frame and report its log loss.
score_col = f'{ycol}_score'
df_valid[score_col] = model.predict_proba(df_valid[feats])[:, 1]
val_log = log_loss(df_valid[ycol], df_valid[score_col])
print('val log_loss: ', val_log)

# Positive-class probabilities for the test split, used for the submission.
predict = model.predict_proba(df_test[feats])[:, 1]

In [None]:
# Pair every feature name with the importance the fitted model reports for it.
df_importance = pd.DataFrame(
    dict(column=feats, importance=model.feature_importances_)
)
# Display the most important features first (df_importance itself stays unsorted).
df_importance.sort_values('importance', ascending=False)

In [None]:
# Attach the test predictions to the raw test file and write the ID/prediction pair.
# NOTE(review): assumes the rows of test.csv are in the same order as df_test — TODO confirm.
sub = pd.read_csv('../init_data/toUser/test/test.csv')
sub[ycol] = predict
sub.to_csv('../temp_data/cat.csv', columns=['ID', ycol], index=False)

In [None]:
# Show the submission frame, then summary statistics of the predicted column.
display(sub)
sub['is_finish'].describe()