In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from catboost import Pool
from catboost import CatBoostClassifier
from sklearn.model_selection import KFold, StratifiedKFold
import gensim
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Competition tables: labelled training rows, unlabelled test rows and the
# submission template.
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/ykc-2nd/train.csv",
        "/kaggle/input/ykc-2nd/test.csv",
        "/kaggle/input/ykc-2nd/sample_submission.csv",
    )
)

# Outputs of an earlier model, one file for the training rows and one for the
# test rows (presumably neural-network class probabilities - confirm against
# the notebook that produced them).
nn_oof, nn_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/ykc2nn/nn_oof.csv",
        "/kaggle/input/ykc2nn/nn_test.csv",
    )
)

In [3]:
# Attach the extra prediction columns to each table by index; this relies on
# the rows of nn_oof / nn_test being in the same order as train / test.
train, test = train.join(nn_oof), test.join(nn_test)

# Stack train on top of test and renumber the rows 0..n-1 in a single step.
df = pd.concat([train, test], ignore_index=True)

In [4]:
# Name of the label column to predict.
target = "department_id"
# Number of cross-validation folds.
n_split = 5
# Model inputs: three hand-made order statistics followed by the columns named
# "0" .. "20" (presumably the joined NN prediction columns - confirm).
features = [
    "order_rate",
    "order_dow_mode",
    "order_hour_of_day_mode",
] + [str(col_no) for col_no in range(21)]

In [5]:
# Rows with a label form the training set; rows whose label is missing form
# the test set.
has_label = df[target].notna()
train = df[has_label]
test = df[~has_label]

In [6]:
# Cross-validation: fit one CatBoost model per stratified fold, store the
# out-of-fold class probabilities for the training rows, and collect one
# test-set probability matrix per fold for later averaging.
preds_test = []  # one predict_proba array for the test rows per fold
scores = []      # micro-averaged F1 on each validation fold
# One row per training row, one column per distinct label value.
oof_preds = np.zeros((train.shape[0], train[target].nunique()))

# The hyper-parameters are the same for every fold, so build them once.
# NOTE(review): with use_best_model=True and the validation fold passed as
# eval_set, the model is selected on the same rows that are scored below, so
# the reported F1 may be optimistic - confirm this is intended.
params = {"loss_function": "MultiClass",
          "eval_metric": "TotalF1:average=Micro",
          "use_best_model": True,
          "random_seed": 42,
          "verbose": False}

kfold = StratifiedKFold(n_splits=n_split, shuffle=True, random_state=42)
for i_fold, (train_idx, valid_idx) in enumerate(kfold.split(train, train[target])):
    print(f"--------fold {i_fold}-------")

    # kfold.split yields positional row numbers, so rows are selected with
    # .iloc; label-based .loc only gives the same rows when the index of
    # `train` happens to be exactly 0..n-1.
    fold_train = train.iloc[train_idx]
    fold_valid = train.iloc[valid_idx]

    ## train data
    x_tr = fold_train[features]
    y_tr = fold_train[target]

    ## valid data
    x_va = fold_valid[features]
    y_va = fold_valid[target]

    train_pool = Pool(x_tr, y_tr)
    validate_pool = Pool(x_va, y_va)

    model = CatBoostClassifier(**params)
    model.fit(train_pool, eval_set=validate_pool)

    ## predict on valid
    pred_val = model.predict_proba(x_va)
    oof_preds[valid_idx, :] = pred_val

    ## evaluate
    # NOTE(review): argmax returns a column position; comparing it with y_va
    # assumes the labels are exactly 0..n_classes-1 - confirm.
    score = f1_score(y_va, np.argmax(pred_val, axis=1), average="micro")
    print(f'F1 = {score}')
    scores.append(score)

    ## predict on test
    pred_test = model.predict_proba(test[features])
    preds_test.append(pred_test)

--------fold 0-------
F1 = 0.7381766955121756
--------fold 1-------
F1 = 0.7393841819279533
--------fold 2-------
F1 = 0.7330917874396136
--------fold 3-------
F1 = 0.7399355877616747
--------fold 4-------
F1 = 0.7413446054750402


In [7]:
# Average micro-F1 over the validation folds.
np.asarray(scores).mean()

0.7383865716232914

In [8]:
# Every fold's test prediction is already appended to `preds_test` inside the
# cross-validation loop. Predicting again with the leftover `model` (the one
# from the last fold) and appending the result would count that fold twice in
# the averaged ensemble, so this cell only verifies the list instead of
# extending it. It is also safe to re-run.
assert len(preds_test) == n_split, (
    f"expected {n_split} fold predictions, found {len(preds_test)}"
)

In [9]:
# Average the collected test probability matrices element-wise, then take the
# highest-probability column of each row as the predicted class index.
pred_test_prob = np.mean(preds_test, axis=0)
pred_test_value = pred_test_prob.argmax(axis=1)

In [10]:
# Persist the out-of-fold probabilities of the training rows (no index column).
oof_frame = pd.DataFrame(oof_preds)
oof_frame.to_csv('catboost_nn_oof.csv', index=False)

In [11]:
# Persist the averaged test probabilities (no index column).
test_prob_frame = pd.DataFrame(pred_test_prob)
test_prob_frame.to_csv('catboost_nn_test.csv', index=False)

In [12]:
# Fill the submission template with the predicted class of each test row;
# this relies on `sub` listing the rows in the same order as `test`.
sub["department_id"] = pred_test_value
sub.to_csv("submission_nn.csv", index=False)

# Preview the first five rows of the file just written.
sub.head(5)

Unnamed: 0,product_id,department_id
0,24842,18
1,24843,6
2,24844,6
3,24845,6
4,24846,12
