In [44]:
import polars as pl
from transformers import AutoTokenizer,AutoModel
from huggingface_hub import login
from dotenv import load_dotenv
import os
import re
import torch
import numpy as np
from torch import nn
from torch.utils.data import Dataset, DataLoader

# .envファイルから環境変数を読み込む
load_dotenv()
huggingface_token = os.getenv("HUGGINGFACE_TOKEN")

# トークナイザーを初期化
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', token=huggingface_token)

# 学習データからテキストのみをリストで抽出
TRAIN_FILE_PATH = "../ch6/news+aggregator/train.txt"
TEST_FILE_PATH = "../ch6/news+aggregator/test.txt"
VALID_FILE_PATH = "../ch6/news+aggregator/valid.txt"
TRAIN_LABEL_FILE_PATH = "../ch8/matrix/y_train.npy"

#使用する全てのデータの最長の長さを測定
def get_max_len(train_filepath, test_filepath, valid_filepath):
    train_df = pl.read_csv(train_filepath, separator="\t", new_columns=["text", "categoory"])
    train_text_list = train_df["text"].to_list()
    train_cleaned_text_list = [re.sub(r'[^a-zA-Z\s]', "", text) for text in train_text_list]
    train_tokenized_list = [tokenizer.tokenize(sentence) for sentence in train_cleaned_text_list]
    train_max_len = max([len(sentence) for sentence in train_tokenized_list])

    test_df = pl.read_csv(test_filepath, separator="\t", new_columns=["text", "categoory"])
    test_text_list = test_df["text"].to_list()
    test_cleaned_text_list = [re.sub(r'[^a-zA-Z\s]', "", text) for text in test_text_list]
    test_tokenized_list = [tokenizer.tokenize(sentence) for sentence in test_cleaned_text_list]
    test_max_len = max([len(sentence) for sentence in test_tokenized_list])

    valid_df = pl.read_csv(valid_filepath, separator="\t", new_columns=["text", "categoory"])
    valid_text_list = valid_df["text"].to_list()
    valid_cleaned_text_list = [re.sub(r'[^a-zA-Z\s]', "", text) for text in valid_text_list]
    valid_tokenized_list = [tokenizer.tokenize(sentence) for sentence in valid_cleaned_text_list]
    valid_max_len = max([len(sentence) for sentence in valid_tokenized_list])

    max_len = max([train_max_len, test_max_len, valid_max_len])
    return max_len

max_len = get_max_len(TRAIN_FILE_PATH, TEST_FILE_PATH, VALID_FILE_PATH)

# BERTの入力データは何？要確認！！！！
class BertDataset(Dataset):
    def __init__(self, input_ids, masks, token_type_ids, labels):
        self.input_ids = input_ids
        self.masks = masks
        self.token_type_ids = token_type_ids
        self.labels = labels

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        input_id = self.input_ids[idx]
        mask = self.masks[idx]
        token_type_id = self.token_type_ids[idx]
        label = self.labels[idx]
        return {"input_ids":input_id, "token_type_ids":token_type_id, "x_attention_mask":mask}, label


def create_dataloader(file_path, label_file_path, max_len=max_len):
    df = pl.read_csv(file_path, separator="\t", new_columns=["text", "categoory"])
    text_list = df["text"].to_list()

    cleaned_text_list = [re.sub(r'[^a-zA-Z\s]', "", text) for text in text_list]
    tokenized_list = [tokenizer.tokenize(sentence) for sentence in cleaned_text_list]
    max_len = max([len(sentence) for sentence in tokenized_list])

    input_ids = []
    attention_masks = []
    token_type_ids = []
    for sentence in cleaned_text_list:
        encoded_dict = tokenizer.encode_plus(
                        sentence,
                        add_special_tokens = True, # Special Tokenの追加
                        max_length = max_len+2,           # 文章の長さを固定（Padding/Trancatinating）
                        pad_to_max_length = True,# PADDINGで埋める
                        return_attention_mask = True,   # Attention maksの作成
                        return_tensors = 'pt',     #  Pytorch tensorsで返す
        )
        input_ids.append(encoded_dict["input_ids"])
        attention_masks.append(encoded_dict["attention_mask"])
        token_type_ids.append(encoded_dict["token_type_ids"])
    # リストに入ったtensorを縦方向（dim=0）へ結合
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    token_type_ids = torch.cat(token_type_ids, dim=0)

    Y_train = np.load(label_file_path)
    Y_train =torch.from_numpy(Y_train)
    datasets = BertDataset(input_ids, attention_masks, token_type_ids, Y_train)
    dataloader = DataLoader(datasets, shuffle=True, batch_size=64)
    return dataloader

In [45]:
train_dataloader = create_dataloader(TRAIN_FILE_PATH, TRAIN_LABEL_FILE_PATH, max_len)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [46]:
class BertModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.bert = AutoModel.from_pretrained("bert-base-uncased")
        self.classifier = nn.Linear(in_features = 768, out_features = 4)
    def forward(self, input_ids, attention_mask, token_type_ids):
        outputs = self.bert(input_ids, attention_mask, token_type_ids)
        pooler_output = outputs.pooler_output
        logits = self.classifier(pooler_output).squeeze(-1)
        return logits

In [47]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = BertModel().to(device)
learning_rate = 1e-2
epochs = 500
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

In [48]:
from tqdm import tqdm

for t in tqdm(range(epochs)):
    size = len(train_dataloader.dataset)
    correct = 0
    for batch, (X, y) in enumerate(train_dataloader):
        # 予測と損失の計算
        input_ids = X["input_ids"].to(device)
        x_attention_mask = X["x_attention_mask"].to(device)
        token_type_ids = X["token_type_ids"].to(device)
        y = y.to(device)
        y = y.argmax(dim=1) #loss_fnのyにはクラスインデックスが期待されている
        pred = model.forward(input_ids, x_attention_mask, token_type_ids)
        loss = loss_fn(pred, y)

        # バックプロパゲーション
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        correct += (pred.argmax(dim=1) == y).sum().item()

    loss = loss.item()
    if (t+1)%100 == 0:
        print(f"epoch:{t+1}, loss: {loss:>7f}, accuracy: {correct/size}")

  0%|          | 0/500 [00:08<?, ?it/s]


KeyboardInterrupt: 