In [2]:
from tqdm.auto import tqdm
import numpy as np
import pandas as pd
import torch
from torch.utils.data.dataloader import DataLoader
from torch.utils.data.dataset import Dataset
from transformers import AutoModel, AutoTokenizer
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Read the train and test tables from the shared input directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("../input/train.csv", "../input/test.csv")
)

In [4]:
# Checkpoint used for both the tokenizer and the encoder.
MODEL_ID = "intfloat/multilingual-e5-large"

# Run on the first GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Load the encoder weights, then move them onto the selected device.
model = AutoModel.from_pretrained(MODEL_ID)
model = model.to(device)

In [5]:
# Preview the first rows of the training table to check the loaded columns.
train_df.head()

Unnamed: 0.1,Unnamed: 0,review,score,thumbsUpCount,reviewCreatedVersion,replyContent,timeToReply
0,0,Banyak bug nya!!!! Dikit² eror terus,0,0,33.0,"Hello BANK Friend, sorry for the issues you’re...",0 days 05:06:00
1,1,Cash deposit menu does not appear Even after u...,2,0,32.0,"Hello, Friend BANK. Kindly upgrade the BANKApp...",0 days 14:25:00
2,2,Sangat membantu,4,0,33.0,"Hello BANK Customer, we appreciate you using t...",0 days 00:41:00
3,3,Very cool,4,0,33.0,"Hello BANK Friend, we appreciate your review. ...",0 days 00:30:00
4,4,Improved,4,0,33.0,"Hello BANK Friend, we appreciate your feedback...",0 days 15:05:00


In [6]:
# Columns that exist in train_df but not in test_df.
set(train_df.columns).difference(test_df.columns)

{'score'}

In [7]:
# Longest "review" in train_df, measured in characters.
# NOTE(review): this counts characters, not tokenizer tokens, so it is only a
# rough guide when choosing a token-level max_length — TODO confirm intent.
train_df["review"].str.len().max()

np.int64(636)

In [10]:
class EmbDataset(Dataset):
    """Dataset that tokenizes one text per item for embedding extraction.

    Args:
        texts: Indexable sequence of texts (e.g. a NumPy array of strings).
            ``None`` / NaN entries are tokenized as the empty string and any
            other non-string value is converted with ``str``.
        max_length: Length every item is padded / truncated to. It is capped
            at the tokenizer's ``model_max_length`` when that is smaller.
        tokenizer: Optional ready-made tokenizer. When omitted, one is loaded
            with ``AutoTokenizer.from_pretrained(MODEL_ID)`` as before.
    """

    def __init__(self, texts, max_length=650, tokenizer=None):
        self.texts = texts
        # Reuse a caller-supplied tokenizer so the same vocabulary files are
        # not re-loaded for every dataset instance.
        if tokenizer is None:
            tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
        self.tokenizer = tokenizer

        # Never pad / truncate beyond what the tokenizer reports as the model
        # limit: longer sequences can overflow the model's position embeddings.
        limit = getattr(tokenizer, "model_max_length", None)
        if isinstance(limit, int) and 0 < limit < max_length:
            max_length = limit
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, ix):
        """Tokenize the ``ix``-th text and return its tensors.

        Returns:
            dict with ``input_ids``, ``attention_mask`` and ``token_type_ids``
            as ``torch.LongTensor`` objects.
        """
        text = self.texts[ix]
        # Missing values read from CSV arrive as None / float NaN, which the
        # tokenizer cannot handle; treat them as empty text instead.
        if text is None or (isinstance(text, float) and text != text):
            text = ""
        elif not isinstance(text, str):
            text = str(text)

        token = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_token_type_ids=True,
        )
        return {
            "input_ids": torch.LongTensor(token["input_ids"]),
            "attention_mask": torch.LongTensor(token["attention_mask"]),
            "token_type_ids": torch.LongTensor(token["token_type_ids"]),
        }

In [8]:
def _embed_texts(texts, max_length=650, batch_size=256):
    """Encode ``texts`` with the global ``model`` and return one vector per text.

    Each text is tokenized by ``EmbDataset``, run through the encoder without
    gradients, and reduced to a single vector by averaging the last hidden
    state over the real (non-padding) tokens only.

    Args:
        texts: Indexable sequence of strings.
        max_length: Padding / truncation length passed to ``EmbDataset``.
        batch_size: Number of texts per forward pass.

    Returns:
        ``np.ndarray`` with one row per text, in the same order as ``texts``.
    """
    loader = DataLoader(
        EmbDataset(texts, max_length=max_length),
        batch_size=batch_size,
        num_workers=0,
        shuffle=False,  # keep row order so embeddings line up with the source frame
    )

    chunks = []
    for batch in tqdm(loader, total=len(loader)):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        token_type_ids = batch["token_type_ids"].to(device)

        with torch.no_grad():
            # Only the last hidden state is needed; not requesting the
            # per-layer hidden states avoids keeping every layer's activations.
            last_hidden_state = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                return_dict=False,
            )[0]

            # Masked mean pooling: padding positions must not contribute,
            # otherwise the fixed-length padding dominates the average.
            mask = attention_mask.unsqueeze(-1).to(last_hidden_state.dtype)
            summed = (last_hidden_state * mask).sum(dim=1)
            counts = mask.sum(dim=1).clamp(min=1)
            batch_embs = summed / counts

        chunks.append(batch_embs.cpu().numpy())

    return np.concatenate(chunks)


# Make sure dropout and similar layers are in inference mode.
model.eval()

# Embed the "review" and "replyContent" columns of both tables.
embeddings = {}
for key, df in zip(["train", "test"], [train_df, test_df]):
    embeddings[key] = {
        # Missing texts are replaced by "" so the tokenizer always gets a string.
        column: _embed_texts(df[column].fillna("").astype(str).to_numpy())
        for column in ("review", "replyContent")
    }

NameError: name 'EmbDataset' is not defined

In [9]:
def embeddings_to_dataframe(embeddings, column_name_prefix):
    """Wrap an embedding matrix in a DataFrame with prefixed column names.

    Args:
        embeddings: 2-D array-like accepted by ``pd.DataFrame``.
        column_name_prefix: Text placed before ``_emb_<i>`` in each column name.

    Returns:
        DataFrame whose i-th column is named ``<column_name_prefix>_emb_<i>``.
    """
    table = pd.DataFrame(embeddings)
    width = table.shape[1]
    # Build prefixed labels so columns from different text fields stay distinct.
    labels = []
    for position in range(width):
        labels.append(f"{column_name_prefix}_emb_{position}")
    table.columns = labels
    return table


def _attach_embeddings(base_df, embedding_frames):
    """Return ``base_df`` with the given embedding frames appended column-wise.

    Args:
        base_df: Table the embeddings were computed from.
        embedding_frames: List of DataFrames with one row per row of ``base_df``.

    Returns:
        New DataFrame: ``base_df`` columns followed by all embedding columns.

    Raises:
        ValueError: If an embedding frame has a different number of rows.
    """
    new_columns = [col for frame in embedding_frames for col in frame.columns]
    # Drop embedding columns left by an earlier run of this cell so that
    # re-running it does not append duplicates.
    stale = [col for col in new_columns if col in base_df.columns]
    base = base_df.drop(columns=stale)

    aligned = []
    for frame in embedding_frames:
        if len(frame) != len(base):
            raise ValueError(
                f"embedding rows ({len(frame)}) do not match table rows ({len(base)})"
            )
        frame = frame.copy()
        # concat(axis=1) aligns on index labels; match rows by position instead.
        frame.index = base.index
        aligned.append(frame)

    return pd.concat([base, *aligned], axis=1)


# Convert the embedding arrays to DataFrames.
train_review_embeddings_df = embeddings_to_dataframe(embeddings["train"]["review"], "review")
train_reply_embeddings_df = embeddings_to_dataframe(embeddings["train"]["replyContent"], "replyContent")
test_review_embeddings_df = embeddings_to_dataframe(embeddings["test"]["review"], "review")
test_reply_embeddings_df = embeddings_to_dataframe(embeddings["test"]["replyContent"], "replyContent")

# Append the embedding columns to the original tables.
train_df = _attach_embeddings(train_df, [train_review_embeddings_df, train_reply_embeddings_df])
test_df = _attach_embeddings(test_df, [test_review_embeddings_df, test_reply_embeddings_df])

KeyError: 'train'