<a href="https://colab.research.google.com/github/otkata19/competition/blob/main/lgbm_baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import Libraries

In [None]:
import lightgbm as lgb
import matplotlib.pyplot as plt
import pandas as pd

from pathlib import Path
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold

In [None]:
%matplotlib inline
# Use the IPAexGothic font family for all matplotlib text.
# NOTE(review): presumably chosen so Japanese labels render; the font must be
# installed in the runtime for this to take effect — confirm.
font = {'family':'IPAexGothic'}
plt.rc('font', **font)

# Show up to 100 rows and 100 columns when a DataFrame is displayed.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

## Load Files

In [None]:
# Mount Google Drive at /content/drive (only works inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
cd /content/drive/My Drive/Nishika/Narou

In [None]:
ls

In [None]:
# Load the sample submission file.
# NOTE(review): df_exp is not referenced again in the visible part of this
# notebook, and the same file is re-read into `submission` a few cells later —
# confirm whether this cell is still needed.
df_exp = pd.read_csv("sample_submission.csv")

In [None]:
# Directory that holds the competition files (the current working directory).
# NOTE(review): `fparh` looks like a typo of `fpath`; it is left unchanged
# because the loading cell refers to it by this name.
fparh = Path('./')

# File names of the training set, the test set and the sample submission.
train_fname = 'train.csv'
test_fname = 'test.csv'
sub_fname = 'sample_submission.csv'

In [None]:
# Read the three competition tables from the data directory.
df_train = pd.read_csv(fparh.joinpath(train_fname))
df_test = pd.read_csv(fparh.joinpath(test_fname))
submission = pd.read_csv(fparh.joinpath(sub_fname))

# Print (rows, columns) of each table as a quick sanity check.
print('df_train shape :', df_train.shape)
print('df_test shape :', df_test.shape)
print('submission shape :', submission.shape)

In [None]:
df_train.head()

In [None]:
df_test.head()

## Distribution of Target

In [None]:
df_train['fav_novel_cnt_bin'].value_counts()

In [None]:
# Plot a 5-bin histogram of the target column on a 16x9 figure.
fig = plt.figure(figsize=(16, 9))
plt.hist(df_train['fav_novel_cnt_bin'], bins=5)
plt.show()

## Modeling

In [None]:
def fit(tr_x, tr_y, va_x, va_y, tr_w=None, va_w=None):
    """
    Train a 5-class LightGBM model with early stopping on a validation set.

    Parameters
    ----------
    tr_x: pd.DataFrame or np.ndarray
        Training features.
    tr_y: pd.Series or pd.DataFrame
        Training labels.
    va_x: pd.DataFrame or np.ndarray
        Validation features.
    va_y: pd.Series or pd.DataFrame
        Validation labels.
    tr_w: array-like, optional
        Per-sample weights for the training set. ``None`` (the default)
        trains unweighted.
    va_w: array-like, optional
        Per-sample weights for the validation set. ``None`` (the default)
        evaluates unweighted.

    Returns
    ----------
    model:
        - trained model (lgb.Booster)
    va_pred:
        - predicted class probabilities for the validation data
    """
    # Training parameters
    params = {
        'objective': 'multiclass',
        'boosting_type': 'gbdt',
        'metrics': 'multi_logloss',
        'num_class': 5,
        'seed': 777,
        'learning_rate': 0.01,
        'n_jobs': -1,
        'verbose': -1
        }

    # Build the LightGBM datasets. The weights were previously accepted by
    # this function but never forwarded, so passing them had no effect.
    lgb_train = lgb.Dataset(tr_x, tr_y, weight=tr_w)
    lgb_eval = lgb.Dataset(va_x, va_y, weight=va_w, reference=lgb_train)

    # Train the model. Early stopping and periodic logging are configured via
    # callbacks: the `early_stopping_rounds` / `verbose_eval` keyword
    # arguments are no longer accepted by `lgb.train` in LightGBM >= 4.0.
    model = lgb.train(
        params,
        train_set=lgb_train,
        valid_sets=[lgb_train, lgb_eval],
        valid_names=['train', 'valid'],
        num_boost_round=1000,
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=20),
        ],
        )

    # Predicted probabilities for the validation data, taken at the best
    # iteration found by early stopping (same convention as `predict`).
    va_pred = model.predict(va_x, num_iteration=model.best_iteration)

    return model, va_pred

In [None]:
def scoring(y_true, y_prob):
    """Return the multi-class log loss of probabilities `y_prob` against labels `y_true`."""
    loss = log_loss(y_true, y_prob)
    return loss

In [None]:
df_train['userid']

In [None]:
!pip install transformers

In [None]:
!pip install sentencepiece

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from glob import glob

import torch
import transformers
from tqdm.notebook import tqdm
tqdm.pandas()

class BertSequenceVectorizer:
    """Turn a sentence into a fixed-size vector with a pretrained transformer.

    The tokenizer is loaded with ``transformers.T5Tokenizer`` and the encoder
    with ``transformers.RobertaModel``, both from ``model_name``. The encoder
    is moved to CUDA when available, otherwise it stays on the CPU.
    """

    def __init__(self, model_name: str):
        """Load the tokenizer and encoder identified by `model_name`."""
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model_name = model_name
        self.tokenizer = transformers.T5Tokenizer.from_pretrained(self.model_name)
        self.tokenizer.do_lower_case = True
        self.bert_model = transformers.RobertaModel.from_pretrained(self.model_name)
        self.bert_model = self.bert_model.to(self.device)
        # Every input is truncated or padded to exactly this many token ids.
        self.max_len = 128

    def vectorize(self, sentence: str) -> np.ndarray:
        """Return the encoder's last hidden state at position 0 for `sentence`.

        The sentence is tokenized, truncated to ``max_len`` ids and
        right-padded with id 0; padded positions get attention mask 0.

        NOTE(review): whether position 0 holds a [CLS]-style token depends on
        how the tokenizer is configured — confirm for the model in use.
        """
        token_ids = self.tokenizer.encode(sentence)[:self.max_len]
        pad_len = self.max_len - len(token_ids)

        # NOTE(review): padding uses token id 0 rather than the tokenizer's
        # pad id; presumably harmless because those positions are masked.
        inputs = token_ids + [0] * pad_len
        masks = [1] * len(token_ids) + [0] * pad_len

        inputs_tensor = torch.tensor([inputs], dtype=torch.long).to(self.device)
        masks_tensor = torch.tensor([masks], dtype=torch.long).to(self.device)

        # Inference only: skip building the autograd graph to save memory/time.
        with torch.no_grad():
            bert_out = self.bert_model(inputs_tensor, masks_tensor)
        seq_out = bert_out['last_hidden_state']

        # `.cpu()` is a no-op for CPU tensors, so one code path covers both devices.
        return seq_out[0][0].detach().cpu().numpy()

# Competition CSV files are read from the current working directory.
DATA_DIR = Path('./')

train = pd.read_csv(DATA_DIR.joinpath('train.csv'))
test = pd.read_csv(DATA_DIR.joinpath('test.csv'))

BSV = BertSequenceVectorizer('rinna/japanese-roberta-base')

# Embed each text column and cache the stacked vectors as .npy files.
for col in ('title', 'story', 'keyword'):
    print('##########' + col + '##########')
    # Missing text becomes the literal string 'NaN' so it can be tokenized.
    train[col] = train[col].fillna('NaN')
    test[col] = test[col].fillna('NaN')
    np.save(
        f'train_{col}_roberta',
        np.stack(train[col].progress_apply(BSV.vectorize)),
    )
    np.save(
        f'test_{col}_roberta',
        np.stack(test[col].progress_apply(BSV.vectorize)),
    )

In [None]:
ls

In [None]:
# Load the cached embedding matrix for the 'story' column of the training set
# (the file written by the vectorization cell for col == 'story').
train_story = np.load('train_story_roberta.npy')

In [None]:
# Columns whose values are all identical and object-dtype columns are not used.
drop_lst = ['end', 'isstop']
object_lst = ['ncode', 'general_firstup', 'title', 'story', 'keyword', 'writer']

# Remove both groups of columns in a single pass per table.
df_train_numeric = df_train.drop(columns=drop_lst + object_lst)
df_test_numeric = df_test.drop(columns=drop_lst + object_lst)

df_train_numeric.head()

In [None]:
# 説明変数,目的変数を分割
# Split into explanatory variables (X) and the target variable (y).
X = df_train_numeric.drop('fav_novel_cnt_bin', axis=1)
y = df_train_numeric['fav_novel_cnt_bin']

In [None]:
# Empty frame, one row per training sample and one column per class,
# that will receive the out-of-fold predicted probabilities.
df_pred = pd.DataFrame(
    columns=['proba_0', 'proba_1', 'proba_2', 'proba_3', 'proba_4'],
    index=X.index,
)
df_pred

In [None]:
#@title
models = []
# Out-of-fold predicted probabilities, one row per training sample.
# A float dtype avoids an object-typed frame of NaN placeholders.
df_pred = pd.DataFrame(
    index=X.index,
    columns=['proba_0', 'proba_1', 'proba_2', 'proba_3', 'proba_4'],
    dtype=float,
)

# Stratified 5-fold cross-validation on the RoBERTa 'story' embeddings.
skf = StratifiedKFold(n_splits=5)
for i, (train_index, valid_index) in enumerate(skf.split(train_story, y), 1):
    # f-string so the fold number is actually printed.
    print(f'---CV{i}---')
    # `split` yields positional indices, so index with iloc rather than loc.
    print(len(train_story[train_index]), len(y.iloc[train_index]))
    X_train, y_train = train_story[train_index], y.iloc[train_index]
    X_valid, y_valid = train_story[valid_index], y.iloc[valid_index]
    # Train the model for this fold
    model, va_pred = fit(X_train, y_train, X_valid, y_valid)
    # Keep the fold model
    models.append(model)
    # Store the predictions for the validation rows of this fold
    df_pred.iloc[valid_index] = va_pred

In [None]:
models = []
# Out-of-fold predicted probabilities, one row per training sample.
# A float dtype avoids an object-typed frame of NaN placeholders.
df_pred = pd.DataFrame(
    index=X.index,
    columns=['proba_0', 'proba_1', 'proba_2', 'proba_3', 'proba_4'],
    dtype=float,
)

# Stratified 5-fold cross-validation on the numeric features.
skf = StratifiedKFold(n_splits=5)
# enumerate binds the fold number `i`; previously `i` was never set by this
# loop and the message was printed without f-string interpolation.
for i, (train_index, valid_index) in enumerate(skf.split(X, y), 1):
    print(f'---CV{i}---')
    # `split` yields positional indices, so index with iloc rather than loc.
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_valid, y_valid = X.iloc[valid_index], y.iloc[valid_index]
    # Train the model for this fold
    model, va_pred = fit(X_train, y_train, X_valid, y_valid)
    # Keep the fold model
    models.append(model)
    # Store the predictions for the validation rows of this fold
    df_pred.iloc[valid_index] = va_pred

In [None]:
models

In [None]:
# モデルの保存
import pickle

# Save the trained fold models to the current directory.
file = 'trained_model.pkl'
# Use a context manager so the file is flushed and closed deterministically.
with open(file, 'wb') as model_file:
    pickle.dump(models, model_file)

In [None]:
# モデルの削除
del models

In [None]:
# Reload the fold models; the context manager closes the file handle.
# NOTE: unpickling can execute arbitrary code — only load files you created.
with open('trained_model.pkl', 'rb') as model_file:
    models = pickle.load(model_file)

In [None]:
df_pred

In [None]:
# CVスコア
scoring(df_train['fav_novel_cnt_bin'], df_pred)

## Prediction

In [None]:
def predict(models, x):
    """
    Average the class probabilities predicted by several trained models.

    For every model a gain-based feature-importance plot is also shown.

    Parameters
    ----------
    models: list
        - trained models; must not be empty
    x: pd.DataFrame
        - features to predict on

    Returns
    ----------
    result: pd.DataFrame
        - mean predicted probability per class (columns proba_0 .. proba_4),
          indexed like `x`

    Raises
    ----------
    ValueError
        - if `models` is empty
    """
    if not models:
        raise ValueError('models must contain at least one trained model')

    columns = ['proba_0', 'proba_1', 'proba_2', 'proba_3', 'proba_4']
    result = pd.DataFrame(0.0, index=x.index, columns=columns)
    for model in models:
        pred_prob = model.predict(x, num_iteration=model.best_iteration)
        lgb.plot_importance(model, figsize=(12,8), max_num_features=50, importance_type='gain')
        plt.tight_layout()
        plt.show()
        result += pd.DataFrame(pred_prob, index=x.index, columns=columns)
    # Divide by the actual number of models instead of a hard-coded 5 so the
    # average stays correct when the number of folds changes.
    return result / len(models)

In [None]:
df_test_numeric

In [None]:
result = predict(models, df_test_numeric)
result

## Create Submission File

In [None]:
# Write the submission file into the current working directory.
output_fpath = Path('./')
# Overwrite every column after the first with the averaged predictions.
# NOTE(review): presumably column 0 is the row identifier — confirm.
submission.iloc[:, 1:] = result
submission.to_csv(output_fpath / 'submission.csv', header=True, index=None)

In [None]:
ls