In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from catboost import Pool
from catboost import CatBoostClassifier
from sklearn.model_selection import KFold, StratifiedKFold
import gensim
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the competition's training set, test set and sample submission file.
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/ykc-2nd/train.csv",
        "/kaggle/input/ykc-2nd/test.csv",
        "/kaggle/input/ykc-2nd/sample_submission.csv",
    )
)

In [3]:
# Stack train on top of test into one frame with a fresh 0..n-1 index.
df = pd.concat([train, test], ignore_index=True)

In [4]:
# Modelling configuration.
target = "department_id"  # label column
n_split = 5               # number of cross-validation folds

# Columns handed to CatBoost as text features.
text_features = ["product_name"]
# Full model input: the text column(s) first, then the remaining columns.
features = [
    *text_features,
    "order_rate",
    "order_dow_mode",
    "order_hour_of_day_mode",
]

In [5]:
# Rows with a target value form the training set; rows without one are the test set.
train = df.loc[df[target].notna()]
test = df.loc[df[target].isna()]

In [6]:
# Stratified K-fold cross-validation.
# For every fold: fit a CatBoost classifier on the training part, score it on
# the held-out part (micro-averaged F1) and store its class probabilities for
# the test set so they can be averaged afterwards.
preds_test = []  # one predict_proba array for the test set per fold
scores = []      # one held-out micro-F1 per fold

# The hyper-parameters do not depend on the fold, so they are built once.
params = {"loss_function": "MultiClass",
          "eval_metric": "TotalF1:average=Micro",
          "use_best_model": True,
          "random_seed": 42,
          'iterations': 296,
          'depth': 8,
          'learning_rate': 0.12981336602827964,
          'random_strength': 64,
          'bagging_temperature': 0.19418721946609516,
          'od_type': 'IncToDec',
          'od_wait': 44,
          'verbose': False}

kfold = StratifiedKFold(n_splits=n_split, shuffle = True, random_state=42)
for i_fold, (train_idx, valid_idx) in enumerate(kfold.split(train, train[target])):
    print(f"--------fold {i_fold}-------")
    # split() yields positional row numbers, so rows are selected with .iloc.
    # Label-based .loc only matches when train's index is exactly 0..n-1.
    fold_train = train.iloc[train_idx]
    fold_valid = train.iloc[valid_idx]

    ## train data
    x_tr = fold_train[features]
    y_tr = fold_train[target]

    ## valid data
    x_va = fold_valid[features]
    y_va = fold_valid[target]

    train_pool = Pool(x_tr, y_tr, text_features=text_features)
    validate_pool = Pool(x_va, y_va, text_features=text_features)

    # NOTE: the same held-out fold is passed as eval_set here and scored below.
    model = CatBoostClassifier(**params)
    model.fit(train_pool, eval_set=validate_pool)

    ## predict on valid
    pred_val = model.predict_proba(x_va)

    ## evaluate
    # argmax gives the probability-column index and is compared directly with
    # the labels; this assumes the labels are the integers 0..K-1 in column
    # order -- TODO confirm against model.classes_.
    score = f1_score(y_va, np.argmax(pred_val, axis = 1), average = "micro")
    print(f'F1 = {score}')
    scores.append(score)

    ## predict on test
    pred_test = model.predict_proba(test[features])
    preds_test.append(pred_test)

--------fold 0-------
F1 = 0.769370094586436
--------fold 1-------
F1 = 0.7768162608170658
--------fold 2-------
F1 = 0.7753623188405797
--------fold 3-------
F1 = 0.7739533011272142
--------fold 4-------
F1 = 0.7665056360708534


In [7]:
# Mean of the per-fold F1 scores.
np.asarray(scores).mean()

0.7724015222884297

In [8]:
# The test-set probabilities of every fold were already appended to
# `preds_test` inside the cross-validation loop. Predicting again here with the
# leftover `model` (the one fitted in the last fold) would add that fold a
# second time, giving it double weight in the average, and each re-run of this
# cell would add yet another copy. So nothing is appended; the cell only
# checks that exactly one prediction per fold is present.
assert len(preds_test) == n_split, (
    f"expected {n_split} fold predictions, found {len(preds_test)}; "
    "re-run the cross-validation cell"
)

In [9]:
# Average the stored probability arrays element-wise, then take, for each test
# row, the index of the highest averaged probability.
pred_test_final = np.asarray(preds_test).mean(axis=0).argmax(axis=1)

In [10]:
# Put the predictions into the submission frame, write it to disk without the
# index, and show the first rows.
sub = sub.assign(department_id=pred_test_final)
sub.to_csv("submission-catboost-text-stratified-optimized.csv", index=False)
sub.head()

Unnamed: 0,product_id,department_id
0,24842,18
1,24843,6
2,24844,6
3,24845,6
4,24846,0
