In [2]:
from tqdm.auto import tqdm
import numpy as np
import pandas as pd
import torch
from torch.utils.data.dataloader import DataLoader
from torch.utils.data.dataset import Dataset
from transformers import AutoModel, AutoTokenizer
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")

In [4]:
MODEL_ID = "intfloat/multilingual-e5-large"
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModel.from_pretrained(MODEL_ID).to(device)

In [5]:
train_df.head()

Unnamed: 0.1,Unnamed: 0,review,score,thumbsUpCount,reviewCreatedVersion,replyContent,timeToReply
0,0,Banyak bug nya!!!! Dikit² eror terus,0,0,33.0,"Hello BANK Friend, sorry for the issues you’re...",0 days 05:06:00
1,1,Cash deposit menu does not appear Even after u...,2,0,32.0,"Hello, Friend BANK. Kindly upgrade the BANKApp...",0 days 14:25:00
2,2,Sangat membantu,4,0,33.0,"Hello BANK Customer, we appreciate you using t...",0 days 00:41:00
3,3,Very cool,4,0,33.0,"Hello BANK Friend, we appreciate your review. ...",0 days 00:30:00
4,4,Improved,4,0,33.0,"Hello BANK Friend, we appreciate your feedback...",0 days 15:05:00


In [6]:
#train_df とtest_dfの差分を確認
set(train_df.columns) - set(test_df.columns)

{'score'}

In [7]:
# "review"のマックス文字長を確認
train_df["review"].str.len().max()

np.int64(636)

Embedding操作(テキストのreviewとreplyContentをベクトル化)

In [5]:
class EmbDataset(Dataset):
    def __init__(self, texts, max_length=650):
        self.texts = texts
        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, ix):
        token = self.tokenizer(
            self.texts[ix], max_length=self.max_length, padding="max_length", truncation=True, return_token_type_ids=True
        )
        return {
            "input_ids": torch.LongTensor(token["input_ids"]),
            "attention_mask": torch.LongTensor(token["attention_mask"]),
            "token_type_ids": torch.LongTensor(token["token_type_ids"]),
        }

In [6]:
from torch.cuda.amp import autocast

embeddings = {}
for key, df in zip(["train", "test"], [train_df, test_df]):
    emb_list_review = []
    emb_list_reply = []

    dataset_review = EmbDataset(df["review"].values, max_length=650)
    dataset_reply = EmbDataset(df["replyContent"].values, max_length=650)

    data_loader_review = DataLoader(
        dataset_review,
        batch_size=32,  # バッチサイズをさらに小さく
        num_workers=0,
        shuffle=False,
    )
    data_loader_reply = DataLoader(
        dataset_reply,
        batch_size=32,  # バッチサイズをさらに小さく
        num_workers=0,
        shuffle=False,
    )

    bar_review = tqdm(enumerate(data_loader_review), total=len(data_loader_review))
    for iter_i, batch in bar_review:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        token_type_ids = batch["token_type_ids"].to(device)

        with autocast():
            with torch.no_grad():
                last_hidden_state, pooler_output = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    token_type_ids=token_type_ids,
                    return_dict=False,
                )
            batch_embs = last_hidden_state.mean(dim=1)

        emb_list_review.append(batch_embs.detach().cpu().numpy())
        del input_ids, attention_mask, token_type_ids, last_hidden_state, pooler_output
        torch.cuda.empty_cache()

    bar_reply = tqdm(enumerate(data_loader_reply), total=len(data_loader_reply))
    for iter_i, batch in bar_reply:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        token_type_ids = batch["token_type_ids"].to(device)

        with autocast():
            with torch.no_grad():
                last_hidden_state, pooler_output = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    token_type_ids=token_type_ids,
                    return_dict=False,
                )
            batch_embs = last_hidden_state.mean(dim=1)

        emb_list_reply.append(batch_embs.detach().cpu().numpy())
        del input_ids, attention_mask, token_type_ids, last_hidden_state, pooler_output
        torch.cuda.empty_cache()

    embeddings[key] = {"review": np.concatenate(emb_list_review), "replyContent": np.concatenate(emb_list_reply)}

  with autocast():
 78%|███████▊  | 213/274 [2:29:47<57:55, 56.98s/it]  

In [None]:
def embeddings_to_dataframe(embeddings, column_name_prefix):
    emb_df = pd.DataFrame(embeddings)
    # カラム名に接頭辞をつけて区別する
    emb_df.columns = [f"{column_name_prefix}_emb_{i}" for i in range(emb_df.shape[1])]
    return emb_df


# 埋め込みデータをdf変換
train_review_embeddings_df = embeddings_to_dataframe(embeddings["train"]["review"], "review")
train_reply_embeddings_df = embeddings_to_dataframe(embeddings["train"]["replyContent"], "replyContent")
test_review_embeddings_df = embeddings_to_dataframe(embeddings["test"]["review"], "review")
test_reply_embeddings_df = embeddings_to_dataframe(embeddings["test"]["replyContent"], "replyContent")

# 埋め込みデータフレームを元dfにマージ
train_df = pd.concat([train_df, train_review_embeddings_df, train_reply_embeddings_df], axis=1)
test_df = pd.concat([test_df, test_review_embeddings_df, test_reply_embeddings_df], axis=1)

KeyError: 'train'

特徴量追加(timeToReplyを数値変換)

In [None]:
def convert_to_hours(time_str):
    if isinstance(time_str, str) and " days " in time_str:
        try:
            days, time = time_str.split(" days ")
            hours, minutes, seconds = map(int, time.split(":"))
            total_hours = int(days) * 24 + hours + minutes / 60 + seconds / 3600
            return total_hours
        except ValueError:
            return np.nan  # 不正なデータの場合はNaNを返す
    else:
        return np.nan  # time_strが文字列でないか有効な形式でない場合はNaNを返す

In [None]:
train_df["total_hours"] = train_df["timeToReply"].apply(convert_to_hours)
test_df["total_hours"] = test_df["timeToReply"].apply(convert_to_hours)

以降学習

In [None]:
lgb_params = {
    "objective": "multiclass",
    "num_class": 5,
    "metric": "multi_logloss",
    "learning_rate": 0.05,  
    "verbosity": -1,
    "boosting_type": "gbdt",
    "lambda_l1": 0.5,  
    "lambda_l2": 0.5,  # 正則化
    "max_depth": 4,  
    "num_leaves": 31, 
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "min_child_samples": 20,  # 子サンプル数の最小値を増やす
    "seed": 42,
}

In [None]:
except_cols = ["review", "replyContent", "timeToReply", "score", "Unnamed: 0"]

features = [col for col in train_df.columns if col not in except_cols]

optunaでのハイパラチューニングON

In [None]:
import optuna
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss


# Optunaの目的関数を定義
def objective(trial):
    # ハイパーパラメータをOptunaでサンプリング
    lgb_params = {
        "objective": "multiclass",
        "num_class": 5,
        "metric": "multi_logloss",
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.1),
        "verbosity": -1,
        "boosting_type": "gbdt",
        "lambda_l1": trial.suggest_loguniform("lambda_l1", 1e-8, 10.0),
        "lambda_l2": trial.suggest_loguniform("lambda_l2", 1e-8, 10.0),
        "max_depth": trial.suggest_int("max_depth", 3, 8),
        "num_leaves": trial.suggest_int("num_leaves", 20, 300),
        "feature_fraction": trial.suggest_uniform("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_uniform("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "seed": 42,
    }

    # 交差検証の設定
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    oof = np.zeros((train_df.shape[0], 5))

    # 各フォールドでのモデルのトレーニング
    for fold_ix, (trn_, val_) in enumerate(skf.split(train_df, train_df["score"])):
        trn_x = train_df.loc[trn_, features]
        trn_y = train_df.loc[trn_, "score"]
        val_x = train_df.loc[val_, features]
        val_y = train_df.loc[val_, "score"]

        trn_data = lgb.Dataset(trn_x, label=trn_y)
        val_data = lgb.Dataset(val_x, label=val_y)

        # LightGBMモデルのトレーニング
        lgb_model = lgb.train(
            lgb_params,
            trn_data,
            valid_sets=[trn_data, val_data],
            num_boost_round=10000,
            callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)],
        )

        # 検証データに対する予測
        oof[val_] = lgb_model.predict(val_x)

    # multi_loglossを計算
    oof_labels = np.argmax(oof, axis=1)
    score = log_loss(train_df["score"], oof)

    return score


# Optunaで最適化
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100)

# 最適なハイパーパラメータを出力
print("Best trial:")
trial = study.best_trial
print("  Value: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

# 最適なハイパーパラメータでモデルを再トレーニング
best_params = trial.params
best_params.update({"objective": "multiclass", "num_class": 5, "metric": "multi_logloss", "seed": 42})
lgb_params = best_params

# モデルの再トレーニングと予測
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
oof = np.zeros((train_df.shape[0], 5))
preds = np.zeros((test_df.shape[0], 5))

for fold_ix, (trn_, val_) in enumerate(skf.split(train_df, train_df["score"])):
    trn_x = train_df.loc[trn_, features]
    trn_y = train_df.loc[trn_, "score"]
    val_x = train_df.loc[val_, features]
    val_y = train_df.loc[val_, "score"]

    trn_data = lgb.Dataset(trn_x, label=trn_y)
    val_data = lgb.Dataset(val_x, label=val_y)

    lgb_model = lgb.train(
        lgb_params,
        trn_data,
        valid_sets=[trn_data, val_data],
        num_boost_round=10000,
        callbacks=[lgb.early_stopping(stopping_rounds=50), lgb.log_evaluation(100)],
    )

    oof[val_] = lgb_model.predict(val_x)
    preds += lgb_model.predict(test_df[features]) / skf.n_splits

oof_labels = np.argmax(oof, axis=1)
preds_labels = np.argmax(preds, axis=1)

In [None]:
# 提出用ファイルを作成.2️列目を書き換える
sub_df = pd.read_csv("../input/sample_submission.csv", header=None)
sub_df[1] = preds_labels
sub_df.to_csv("sub_emb_opt.csv", index=False)