In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from catboost import Pool
from catboost import CatBoostClassifier
from sklearn.model_selection import KFold, StratifiedKFold
import gensim
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Competition tables: the training rows, the test rows, and the submission template.
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/ykc-2nd/train.csv",
        "/kaggle/input/ykc-2nd/test.csv",
        "/kaggle/input/ykc-2nd/sample_submission.csv",
    )
)

# Extra "top_n" tables produced elsewhere. The 'Unnamed: 0' column is dropped right
# after reading (presumably a saved index column -- TODO confirm against the producer).
train_ngt = pd.read_csv('../input/ykc2-ngt-top-n/train_top_n.csv').drop(columns=['Unnamed: 0'])
test_ngt = pd.read_csv('../input/ykc2-ngt-top-n/test_top_n.csv').drop(columns=['Unnamed: 0'])

In [3]:
# Attach the top_n columns to each table by matching row index against row index
# (pandas' default inner join, so only index values present on both sides survive).
index_join = dict(left_index=True, right_index=True)
train = train.merge(train_ngt, **index_join)
test = test.merge(test_ngt, **index_join)

train.head()

Unnamed: 0,product_id,product_name,order_rate,order_dow_mode,order_hour_of_day_mode,department_id,top1,top2,top3,top4,top5
0,0,All-Seasons Salt,2.779494e-06,0,11,12,12.0,18.0,10.0,0.0,2.0
1,1,Smart Ones Classic Favorites Mini Rigatoni Wit...,1.037875e-05,0,16,0,0.0,8.0,7.0,14.0,20.0
2,2,Green Chile Anytime Sauce,4.731054e-07,0,11,12,5.0,12.0,8.0,14.0,0.0
3,3,Dry Nose Oil,2.365527e-07,6,10,10,10.0,12.0,16.0,14.0,0.0
4,4,Pure Coconut Water With Orange,9.166416e-07,3,14,6,6.0,10.0,20.0,12.0,17.0


In [4]:
# Stack train on top of test and renumber the rows 0..N-1 so that features can be
# computed once for both tables.
df = pd.concat([train, test], ignore_index=True)

In [5]:
def _split_product_name(name):
    """Lower-case a product name, strip ',', '&' and apostrophes, and split on single spaces."""
    cleaned = name.lower()
    for unwanted in (",", "&", "'"):
        cleaned = cleaned.replace(unwanted, "")
    return cleaned.split(' ')


df["product_name_split"] = df["product_name"].apply(_split_product_name)

In [6]:
## Load pre-trained word vectors. The words of each product_name are looked up in them and
## combined into a single feature vector per product_id (see to_vec below).

## Reading the .vec file through gensim is slow, so a copy pickled in another notebook is used instead.
## NOTE(review): unpickling can execute arbitrary code -- only load this file from a trusted source.
model = pd.read_pickle("/kaggle/input/yjfasttext/fasttext_gensim_model.pkl") 

## Alternative: read the .vec file with gensim directly (takes about five minutes)
# model = gensim.models.KeyedVectors.load_word2vec_format('../input/ykc-2nd/wiki-news-300d-1M.vec/wiki-news-300d-1M.vec')

from collections import defaultdict

# Token -> number of times it was skipped because the model had no vector for it.
unused_words = defaultdict(int)


def to_vec(x, model):
    """Turn a list of tokens into one unit-length vector.

    The vectors of all tokens known to ``model`` are summed and the sum is scaled
    to length 1 (an all-zero sum stays all-zero thanks to the small epsilon).

    Parameters
    ----------
    x : iterable of str
        Tokens of one product name.
    model : mapping-like
        Object exposing ``vector_size`` and ``model[token]``; a missing token
        must raise ``KeyError``.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``model.vector_size``.

    Side effects
    ------------
    Every token without a vector is counted in the module-level ``unused_words``.
    """
    v = np.zeros(model.vector_size)
    for w in x:
        try:
            v += model[w]  # token has a pre-trained vector
        except KeyError:
            # Only a missing key means "unknown word"; any other exception is a
            # real error and must not be silently recorded as an unused word.
            unused_words[w] += 1
    v = v / (np.sqrt(np.sum(v ** 2)) + 1e-16)  # normalise to length 1
    return v
# One vector per product row, stacked into a 2-D array (rows follow the order of df).
vecs = np.vstack([to_vec(tokens, model) for tokens in df["product_name_split"]])

# Name the embedding dimensions fasttext_pretrain_vec0, fasttext_pretrain_vec1, ...
n_dims = vecs.shape[1]
fasttext_pretrain_cols = ["fasttext_pretrain_vec" + str(k) for k in range(n_dims)]
vec_df = pd.DataFrame(data=vecs, columns=fasttext_pretrain_cols)

# Append the embedding columns to the right of the existing columns.
df = pd.concat([df, vec_df], axis=1)
df.head()

Unnamed: 0,product_id,product_name,order_rate,order_dow_mode,order_hour_of_day_mode,department_id,top1,top2,top3,top4,...,fasttext_pretrain_vec290,fasttext_pretrain_vec291,fasttext_pretrain_vec292,fasttext_pretrain_vec293,fasttext_pretrain_vec294,fasttext_pretrain_vec295,fasttext_pretrain_vec296,fasttext_pretrain_vec297,fasttext_pretrain_vec298,fasttext_pretrain_vec299
0,0,All-Seasons Salt,2.779494e-06,0,11,12.0,12.0,18.0,10.0,0.0,...,-0.057035,0.031508,0.005112,0.000568,-0.008253,0.040396,0.056868,0.07187,-0.009923,-0.000702
1,1,Smart Ones Classic Favorites Mini Rigatoni Wit...,1.037875e-05,0,16,0.0,0.0,8.0,7.0,14.0,...,0.002409,-0.017017,-0.006144,0.038623,0.002476,-0.041209,0.012066,0.092758,0.062074,-0.07998
2,2,Green Chile Anytime Sauce,4.731054e-07,0,11,12.0,5.0,12.0,8.0,14.0,...,0.077602,-0.022785,-0.004824,0.016669,-0.038935,-0.043521,0.035625,0.018629,0.042928,0.042913
3,3,Dry Nose Oil,2.365527e-07,6,10,10.0,10.0,12.0,16.0,14.0,...,-0.048886,-0.037484,0.001347,-0.010236,0.002433,0.105475,-0.012166,0.042813,0.053351,0.032356
4,4,Pure Coconut Water With Orange,9.166416e-07,3,14,6.0,6.0,10.0,20.0,12.0,...,-0.027605,0.020973,0.026813,-0.031821,-0.015133,0.04593,0.020277,0.069537,0.07288,0.021491


In [7]:
# Names of the five top-n columns: top1 ... top5.
cols = ['top' + str(rank) for rank in range(1, 6)]

In [8]:
# Label column and number of cross-validation folds.
target = "department_id"
n_split = 5

# Columns fed to the model: the embedding dimensions, the raw product name,
# the order statistics, and finally the top-n columns.
features = [
    *fasttext_pretrain_cols,
    "product_name",
    "order_rate",
    "order_dow_mode",
    "order_hour_of_day_mode",
    *cols,
]
text_features = ["product_name"]
category_features = cols

In [9]:
# Rows with a known target are the training set; rows where it is missing are the test set.
has_label = df[target].notna()
train = df[has_label]
test = df[~has_label]

In [10]:
# Cast the categorical columns to str.
# `train` and `test` are boolean-mask selections of `df`; writing columns into such
# selections is chained assignment (pandas emits SettingWithCopyWarning, which the
# global warnings filter in this notebook hides). `assign` returns new frames
# instead, so nothing is written into a selection.
train = train.assign(**{col: train[col].astype(str) for col in category_features})
test = test.assign(**{col: test[col].astype(str) for col in category_features})

In [11]:
# Show the first five training rows as a quick sanity check.
train.head(5)

Unnamed: 0,product_id,product_name,order_rate,order_dow_mode,order_hour_of_day_mode,department_id,top1,top2,top3,top4,...,fasttext_pretrain_vec290,fasttext_pretrain_vec291,fasttext_pretrain_vec292,fasttext_pretrain_vec293,fasttext_pretrain_vec294,fasttext_pretrain_vec295,fasttext_pretrain_vec296,fasttext_pretrain_vec297,fasttext_pretrain_vec298,fasttext_pretrain_vec299
0,0,All-Seasons Salt,2.779494e-06,0,11,12.0,12.0,18.0,10.0,0.0,...,-0.057035,0.031508,0.005112,0.000568,-0.008253,0.040396,0.056868,0.07187,-0.009923,-0.000702
1,1,Smart Ones Classic Favorites Mini Rigatoni Wit...,1.037875e-05,0,16,0.0,0.0,8.0,7.0,14.0,...,0.002409,-0.017017,-0.006144,0.038623,0.002476,-0.041209,0.012066,0.092758,0.062074,-0.07998
2,2,Green Chile Anytime Sauce,4.731054e-07,0,11,12.0,5.0,12.0,8.0,14.0,...,0.077602,-0.022785,-0.004824,0.016669,-0.038935,-0.043521,0.035625,0.018629,0.042928,0.042913
3,3,Dry Nose Oil,2.365527e-07,6,10,10.0,10.0,12.0,16.0,14.0,...,-0.048886,-0.037484,0.001347,-0.010236,0.002433,0.105475,-0.012166,0.042813,0.053351,0.032356
4,4,Pure Coconut Water With Orange,9.166416e-07,3,14,6.0,6.0,10.0,20.0,12.0,...,-0.027605,0.020973,0.026813,-0.031821,-0.015133,0.04593,0.020277,0.069537,0.07288,0.021491


In [12]:
# cross validation
preds_test = []
scores = []
oof_preds = np.zeros((train.shape[0], train['department_id'].nunique()))
kfold = StratifiedKFold(n_splits=n_split, shuffle = True, random_state=42)
for i_fold, (train_idx, valid_idx) in enumerate(kfold.split(train, train[target])):
    print(f"--------fold {i_fold}-------")
    ## train data
    x_tr = train.loc[train_idx, features]
    y_tr = train.loc[train_idx, target]

    ## valid data
    x_va = train.loc[valid_idx, features]
    y_va = train.loc[valid_idx, target]

    train_pool = Pool(x_tr, y_tr, text_features=text_features)
    validate_pool = Pool(x_va, y_va, text_features=text_features)
    
    params = {"loss_function": "MultiClass",
              "eval_metric": "TotalF1:average=Micro",
              "use_best_model": True,
              "random_seed": 42,
              "verbose": False,
             "task_type": "GPU"
             }
    
    model = CatBoostClassifier(**params)
    model.fit(train_pool, eval_set=validate_pool)
    
    ## predict on valid
    pred_val = model.predict_proba(x_va)
    oof_preds[valid_idx, :] = pred_val
    
    ## evaluate
    score = f1_score(y_va, np.argmax(pred_val, axis = 1), average = "micro")
    print(f'F1 = {score}')
    scores.append(score)
    
    ## predict on test
    pred_test = model.predict_proba(test[features])
    preds_test.append(pred_test)

--------fold 0-------
F1 = 0.7909036023344738
--------fold 1-------
F1 = 0.7902998591265847
--------fold 2-------
F1 = 0.7898550724637681
--------fold 3-------
F1 = 0.7924718196457327
--------fold 4-------
F1 = 0.7864331723027376


In [13]:
# Average micro-F1 over the folds.
np.asarray(scores).mean()

0.7899927051746594

In [14]:
# The cross-validation loop already stored one test prediction per fold in `preds_test`.
# Predicting with the last fold's model again and appending the result here would count
# that fold twice in the averaged ensemble, so this cell only verifies the expected count.
assert len(preds_test) == n_split, (
    f"expected {n_split} fold predictions, found {len(preds_test)}"
)

In [15]:
# Average the per-fold class probabilities, then take the most probable column per row.
pred_test_prob = np.mean(preds_test, axis=0)
pred_test_value = pred_test_prob.argmax(axis=1)

In [16]:
# Persist the out-of-fold class probabilities (one column per class, no index column).
oof_frame = pd.DataFrame(oof_preds)
oof_frame.to_csv('catboost_text_vector_oof.csv', index=False)

In [17]:
# Persist the averaged test-set class probabilities (one column per class, no index column).
test_prob_frame = pd.DataFrame(pred_test_prob)
test_prob_frame.to_csv('catboost_text_vector_test.csv', index=False)

In [18]:
# Fill the submission template with the predicted class per test row and write it out.
submission_path = "submission_text_vector.csv"
sub["department_id"] = pred_test_value
sub.to_csv(submission_path, index=False)
sub.head()

Unnamed: 0,product_id,department_id
0,24842,18
1,24843,6
2,24844,6
3,24845,6
4,24846,12
