In [1]:
import transformers
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm

# Encode text with a pretrained RoBERTa model, used as-is (no fine-tuning), to obtain token-level embeddings

def get_embeddings(texts, model_name='FacebookAI/roberta-base', batch_size=32, max_length=128, device=None):
    """Encode texts with a pretrained transformer and return token-level embeddings.

    Each text is tokenized, padded/truncated to exactly ``max_length`` tokens and
    run through the model with gradient tracking disabled. The model is used
    as loaded from the checkpoint; nothing is trained here.

    Args:
        texts: Sequence of strings to encode. A single string is treated as a
            one-element list.
        model_name: Hugging Face model identifier (or local path) passed to
            ``AutoModel`` / ``AutoTokenizer``.
        batch_size: Number of texts per forward pass. Must be >= 1.
        max_length: Fixed token length every text is padded/truncated to.
        device: Torch device to run on. When ``None``, uses ``"cuda"`` if
            available and falls back to ``"cpu"`` otherwise.

    Returns:
        Tuple ``(embeddings, attention_masks)`` of NumPy arrays.
        ``embeddings`` is the model's ``last_hidden_state`` stacked over all
        texts, shape ``(len(texts), max_length, hidden_size)``.
        ``attention_masks`` is the tokenizer's ``attention_mask`` for each text,
        shape ``(len(texts), max_length)``. For empty ``texts`` both arrays have
        a leading dimension of 0.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    if device is None:
        # Fall back to CPU so the function still works on machines without a GPU.
        device = "cuda" if torch.cuda.is_available() else "cpu"

    # Slicing a bare string would produce character chunks, and pandas/NumPy
    # containers slice into non-list objects; normalize to a plain list.
    texts = [texts] if isinstance(texts, str) else list(texts)

    # Load the model and tokenizer.
    model = transformers.AutoModel.from_pretrained(model_name).to(device)
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
    # Evaluation mode is loop-invariant, so set it once up front.
    model.eval()

    embeddings = []
    attention_masks = []
    for i in tqdm(range(0, len(texts), batch_size)):
        batch_texts = texts[i:i + batch_size]
        # padding="max_length" gives every batch the same sequence length,
        # which is what allows the per-batch arrays to be concatenated below.
        inputs = tokenizer(
            batch_texts,
            padding="max_length",
            truncation=True,
            return_tensors='pt',
            max_length=max_length,
        ).to(device)

        with torch.no_grad():
            outputs = model(**inputs)
            # Each batch output is 3-D: (batch, max_length, hidden_size).
            embeddings.append(outputs.last_hidden_state.cpu().numpy())
            attention_masks.append(inputs['attention_mask'].cpu().numpy())

    if embeddings:
        # Stack all batches along the sample axis.
        embeddings = np.concatenate(embeddings, axis=0)
        attention_masks = np.concatenate(attention_masks, axis=0)
    else:
        # np.concatenate rejects an empty list; return correctly shaped empty arrays.
        embeddings = np.empty((0, max_length, model.config.hidden_size), dtype=np.float32)
        attention_masks = np.empty((0, max_length), dtype=np.int64)

    print(embeddings.shape)
    print(attention_masks.shape)
    return embeddings, attention_masks

In [2]:
# Training split: read the CSV and pull the 'Caption' column out as a Python list.
# NOTE(review): assumes every caption is a non-null string; a missing value would
# come back from read_csv as NaN and reach the tokenizer as-is. TODO confirm.
train_df = pd.read_csv("../dataset/train.csv")
train_texts = train_df['Caption'].tolist()
# Compute the embeddings and attention masks for the training captions
# (get_embeddings loads the model and tokenizer on each call).
embeddings, attention_masks = get_embeddings(train_texts)
# Save both arrays as .npy files in the dataset directory.
np.save("../dataset/roberta_embeddings_train.npy", embeddings)
np.save("../dataset/roberta_attention_masks_train.npy", attention_masks)

# Test split: same steps as above, applied to test.csv.
test_df = pd.read_csv("../dataset/test.csv")
test_texts = test_df['Caption'].tolist()
# `embeddings` / `attention_masks` are rebound here, so after this cell they
# hold the test-split arrays only; the training arrays live on disk.
embeddings, attention_masks = get_embeddings(test_texts)
# Save the test-split arrays.
np.save("../dataset/roberta_embeddings_test.npy", embeddings)
np.save("../dataset/roberta_attention_masks_test.npy", attention_masks)

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

2025-04-28 23:53:16.400406: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-28 23:53:16.631399: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745851996.713948  125523 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745851996.739833  125523 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1745851996.913133  125523 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

100%|██████████| 180/180 [00:31<00:00,  5.79it/s]


(5760, 128, 768)
(5760, 128)


Some weights of RobertaModel were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 20/20 [00:03<00:00,  6.08it/s]


(640, 128, 768)
(640, 128)
