In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
# Read the four input CSVs in one pass. The names on the left are bound in the
# same order as the paths listed on the right.
train, test, anime, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../../data/input/train.csv",
        "../../data/input/test.csv",
        "../../data/input/anime.csv",
        "../../data/input/sample_submission.csv",
    )
)

In [3]:
# Columns of the anime table that are selected as text and later joined,
# in this order, into one string per row.
text = ["genres", "japanese_name", "aired", "producers", "licensors", "studios"]

In [4]:
# Keep every row, but only the columns named in `text`.
anime_text_only = anime.loc[:, text]

In [5]:
# Join the text columns of each row into a single " [SEP] "-delimited string.
# Missing values become "" and every cell is cast to str before joining, because
# str.join raises TypeError as soon as a row contains NaN or any non-string value.
anime["concated"] = (
    anime_text_only.fillna("")
    .astype(str)
    .apply(" [SEP] ".join, axis=1)
)

In [6]:
import torch
import torch.nn as nn
from transformers import AutoModel, BertJapaneseTokenizer,AutoTokenizer
import torch.nn.functional as F

In [7]:
class MeanPooling(nn.Module):
    """Mask-aware mean over dimension 1 of a hidden-state tensor."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Average `last_hidden_state` along dim 1, weighting each position by its mask.

        Args:
            last_hidden_state: hidden-state tensor; presumably
                (batch, seq_len, hidden) as returned by the encoder — TODO confirm.
            attention_mask: mask tensor with one entry per position; it is
                unsqueezed on the last axis and expanded to the shape of
                `last_hidden_state`, so positions with mask 0 contribute nothing.

        Returns:
            The masked sum along dim 1 divided by the mask total along dim 1.
        """
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        masked_total = (last_hidden_state * mask).sum(1)
        # Clamp the denominator so a fully masked row divides by 1e-9 instead of 0.
        mask_total = mask.sum(1).clamp(min=1e-9)
        return masked_total / mask_total

In [8]:
# Choose the compute device instead of hard-coding Apple's "mps" backend, so the
# cell also runs on CUDA or CPU-only machines; MPS is still preferred when present.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Japanese BERT encoder, its matching tokenizer, and a MeanPooling instance.
# NOTE(review): the following cell rebinds `model`, `pool` and `tokenizer` to
# intfloat/multilingual-e5-large before any of them is used, so this load looks
# redundant — confirm whether this cell is still needed.
model = AutoModel.from_pretrained("cl-tohoku/bert-base-japanese-v3")
model = model.to(device)
pool = MeanPooling()
tokenizer = BertJapaneseTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-v3")

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-v3 were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
# Choose the compute device instead of hard-coding Apple's "mps" backend, so the
# cell also runs on CUDA or CPU-only machines; MPS is still preferred when present.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Multilingual E5 encoder, its tokenizer, and a MeanPooling instance.
model = AutoModel.from_pretrained("intfloat/multilingual-e5-large")
model = model.to(device)
pool = MeanPooling()
tokenizer = AutoTokenizer.from_pretrained("intfloat/multilingual-e5-large")

In [10]:
# Show the column index of `anime`; left as the cell's last expression so the
# notebook renders it as the cell output.
anime.columns

Index(['anime_id', 'genres', 'japanese_name', 'type', 'episodes', 'aired',
       'producers', 'licensors', 'studios', 'source', 'duration', 'rating',
       'members', 'watching', 'completed', 'on_hold', 'dropped',
       'plan_to_watch', 'concated'],
      dtype='object')

In [11]:
# Embed every value of anime["genres"]: tokenize one text at a time, run the
# encoder without gradients, mean-pool the token states and L2-normalise them.
# NOTE(review): E5-family models are usually fed texts prefixed with "query: " or
# "passage: "; no prefix is added here — check the model card and confirm intent.
genre_vectors = []
for genre_text in tqdm(anime["genres"]):
    # Named `encoded` rather than `input` so the builtin input() is not shadowed
    # for the rest of the kernel session.
    encoded = tokenizer(
        genre_text,
        return_tensors="pt",
        # Cut over-long texts at the tokenizer's configured maximum length
        # instead of letting them fail inside the model.
        truncation=True,
    )
    # Follow the model's own device instead of assuming it lives on "mps".
    encoded = encoded.to(model.device)

    with torch.no_grad():
        last_hidden_states = model(**encoded)[0]
        pooled = pool(last_hidden_states, encoded["attention_mask"])
        normalized = F.normalize(pooled, p=2, dim=1)

    # One text per forward pass, so row 0 is this text's embedding.
    genre_vectors.append(normalized.cpu().numpy()[0])
embs = np.array(genre_vectors)

  0%|          | 0/2000 [00:00<?, ?it/s]

In [14]:
from sklearn.decomposition import PCA

# Project the embeddings onto 50 principal components. random_state is fixed
# because PCA's default svd_solver="auto" may select the randomized solver, whose
# output otherwise varies between runs and makes the saved features irreproducible.
pca = PCA(n_components=50, random_state=42)
pca_embs = pca.fit_transform(embs)

In [15]:
# Put the anime_id column next to the PCA components. pd.concat with axis=1
# aligns the two frames on their row index.
embs_df = pd.DataFrame(pca_embs)
anime_embs = pd.concat([anime[["anime_id"]], embs_df], axis=1)

In [16]:
# Write the id + component table to CSV without the row index.
output_csv = "../../data/input/created_features/embs/anime_genres_embs50.csv"
anime_embs.to_csv(output_csv, index=False)