In [1]:
import transformers
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm

# DeBERTa でテキストを読み取って 0-shot で埋込表現を得る

def get_embeddings(texts, model_name='microsoft/deberta-base', batch_size=32,
                   max_length=128, device=None):
    """Encode texts with a pretrained transformer and return per-token embeddings.

    The pretrained model is used as-is (no fine-tuning). Every text is padded
    or truncated to exactly ``max_length`` tokens, so all rows share one shape.

    Args:
        texts: Strings to encode. Any iterable is accepted; a single ``str``
            is treated as a one-element batch.
        model_name: Model identifier passed to ``from_pretrained`` for both
            the model and the tokenizer.
        batch_size: Number of texts per forward pass. Must be positive.
        max_length: Token length every text is padded/truncated to.
        device: Torch device to run on. ``None`` picks ``"cuda"`` when it is
            available and falls back to ``"cpu"`` otherwise.

    Returns:
        A tuple ``(embeddings, attention_masks)`` of NumPy arrays.
        ``embeddings`` is the model's last hidden state with shape
        ``(len(texts), max_length, hidden_size)``; ``attention_masks`` is the
        tokenizer's attention mask with shape ``(len(texts), max_length)``.
        Both are empty along the first axis when ``texts`` is empty.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    # Normalize the input to a plain list so that slicing below always yields
    # a list of strings (a Series slice or a substring would not).
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)

    # Load the model and tokenizer.
    model = transformers.AutoModel.from_pretrained(model_name).to(device)
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)

    # Evaluation mode is loop-invariant, so set it once up front.
    model.eval()

    embeddings = []
    attention_masks = []
    with torch.no_grad():
        for i in tqdm(range(0, len(texts), batch_size)):
            batch_texts = texts[i:i + batch_size]
            inputs = tokenizer(
                batch_texts,
                padding="max_length",
                truncation=True,
                return_tensors='pt',
                max_length=max_length,
            ).to(device)

            outputs = model(**inputs)
            # Each batch output is already 3-D (batch, max_length, hidden);
            # batches are stacked along axis 0 after the loop.
            embeddings.append(outputs.last_hidden_state.cpu().numpy())
            attention_masks.append(inputs['attention_mask'].cpu().numpy())

    if embeddings:
        # Join the per-batch arrays into one array per output.
        embeddings = np.concatenate(embeddings, axis=0)
        attention_masks = np.concatenate(attention_masks, axis=0)
    else:
        # np.concatenate rejects an empty list; return correctly shaped empty
        # arrays instead. NOTE(review): dtypes assume a float32 model and an
        # int64 tokenizer mask -- confirm if a different precision is loaded.
        hidden_size = model.config.hidden_size
        embeddings = np.empty((0, max_length, hidden_size), dtype=np.float32)
        attention_masks = np.empty((0, max_length), dtype=np.int64)

    print(embeddings.shape)
    print(attention_masks.shape)
    return embeddings, attention_masks

In [2]:
# Directory that holds the input CSVs and receives the saved .npy arrays.
DATASET_DIR = "../dataset"


def _embed_and_save_split(split):
    """Embed the ``Caption`` column of one dataset split and save the results.

    Reads ``<DATASET_DIR>/<split>.csv``, runs ``get_embeddings`` on its
    ``Caption`` column, and writes the two returned arrays to
    ``deberta_embeddings_<split>.npy`` and
    ``deberta_attention_masks_<split>.npy`` in the same directory.

    Args:
        split: Split name used in the file names, e.g. ``"train"``.

    Returns:
        Tuple ``(df, texts, embeddings, attention_masks)``: the loaded frame,
        the caption list, and the two arrays that were saved.
    """
    df = pd.read_csv(f"{DATASET_DIR}/{split}.csv")
    texts = df['Caption'].tolist()

    # Obtain the embeddings.
    embeddings, attention_masks = get_embeddings(texts)
    # Guard against silently saving arrays misaligned with the CSV rows.
    assert embeddings.shape[0] == len(texts), "expected one embedding per caption"
    assert attention_masks.shape[0] == len(texts), "expected one mask per caption"

    # Save the embeddings and their attention masks.
    np.save(f"{DATASET_DIR}/deberta_embeddings_{split}.npy", embeddings)
    np.save(f"{DATASET_DIR}/deberta_attention_masks_{split}.npy", attention_masks)
    return df, texts, embeddings, attention_masks


train_df, train_texts, embeddings, attention_masks = _embed_and_save_split("train")

# The train arrays are already on disk; drop the references so they are not
# kept alive while the test split is being embedded.
del embeddings, attention_masks

test_df, test_texts, embeddings, attention_masks = _embed_and_save_split("test")

2025-04-27 09:25:07.887794: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-27 09:25:08.024766: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745713508.099326 1087536 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745713508.122690 1087536 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1745713508.225134 1087536 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

(5760, 128, 768)
(5760, 128)


100%|██████████| 20/20 [00:04<00:00,  4.65it/s]


(640, 128, 768)
(640, 128)
