In [35]:
import polars as pl
import torch
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import vocab
from collections import Counter
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F

FILE_PATH = "./wiki_corpus_2.01/kyoto_lexicon.csv"
tokenizer_src = get_tokenizer('spacy', language='ja_core_news_sm')
tokenizer_tgt = get_tokenizer('spacy', language='en_core_web_sm')
# どれか一つの行がうまく読み込めなかったため、truncate_tagged_linesでfield数が一致しない行は無視している(元データ51983行)

class datasets(Dataset):
    def __init__(self, text, label):
        self.jp_datas = text
        self.en_datas = label

    def __len__(self):
        return len (self.jp_datas)

    def __getitem__(self, index):
        jp = self.jp_datas[index]
        en = self.en_datas[index]
        return jp,en

class DataLoaderCreater:

    def __init__(self, file_path, src_tokenizer, tgt_tokenizer):
        self.file_path = file_path
        self.src_tokenizer = src_tokenizer
        self.tgt_tokenizer = tgt_tokenizer

    def build_vocab(self, texts, tokenizer):
        counter = Counter()
        for text in texts:
            counter.update(tokenizer(text))
        specials = ['<unk>', '<pad>', '<start>', '<end>']
        v = vocab(counter, specials=specials, min_freq=1)
        v.set_default_index(v['<unk>'])
        return v

    def convert_text_to_indexes(self, text, vocab, tokenizer):
        return [vocab['<start>']] + [
            vocab[token] if token in vocab else vocab['<unk>'] for token in tokenizer(text.strip("\n"))
        ] + [vocab['<end>']]

    def create_dataloader(self):
        df = pl.read_csv(self.file_path, separator=",", encoding="utf-8", has_header=True, truncate_ragged_lines=True)
        df_selected = df.select([df.columns[0], df.columns[1]])
        df_jp = df_selected[:, 0]
        jp_list = df_jp.to_list()
        df_en = df_selected[:, 1]
        en_list = df_en.to_list()

        self.vocab_src = self.build_vocab(jp_list, tokenizer_src)
        self.vocab_tgt = self.build_vocab(en_list, tokenizer_tgt)
        self.vocab_src = self.vocab_src.get_stoi()
        self.vocab_tgt = self.vocab_tgt.get_stoi()
        self.len_src_vocab = len(self.vocab_src)
        self.len_tgt_vocab = len(self.vocab_tgt)

        src_data = pad_sequence([torch.tensor(self.convert_text_to_indexes(text, self.vocab_src, tokenizer=self.src_tokenizer)) for text in jp_list], batch_first = True, padding_value = self.vocab_src["<pad>"])
        tgt_data = pad_sequence([torch.tensor(self.convert_text_to_indexes(text, self.vocab_tgt, tokenizer=self.tgt_tokenizer)) for text in en_list], batch_first = True, padding_value = self.vocab_tgt["<pad>"])

        dataset = datasets(src_data, tgt_data)

        # データセットの長さを取得
        dataset_length = len(dataset)

        # 各分割のサイズを計算
        train_size = int(0.8 * dataset_length)
        val_size = int(0.1 * dataset_length)
        test_size = dataset_length - train_size - val_size

        # データセットをランダムに分割
        train_dataset, valid_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])
        train_dataloader = DataLoader(train_dataset, batch_size=64)
        valid_dataloader = DataLoader(valid_dataset, batch_size=64)
        test_dataloader = DataLoader(test_dataset, batch_size=64)

        return train_dataloader, test_dataloader, valid_dataloader

dataloader_creater = DataLoaderCreater(FILE_PATH, tokenizer_src, tokenizer_tgt)
train_dataloader, test_dataloader, valid_dataloader = dataloader_creater.create_dataloader()
src_vocab_size = dataloader_creater.len_src_vocab
tgt_vocab_size = dataloader_creater.len_tgt_vocab
vocab_src = dataloader_creater.vocab_src
vocab_tgt = dataloader_creater.vocab_tgt

In [40]:


import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.functional import pad
import math

# Transformerモデルの定義
class TransformerModel(nn.Module):
    def __init__(self, vocab_size_src, vocab_size_tgt, embedding_dim, num_heads, num_layers, dropout=0.1):
        super(TransformerModel, self).__init__()
        self.embedding_src = nn.Embedding(vocab_size_src, embedding_dim)
        self.embedding_tgt = nn.Embedding(vocab_size_tgt, embedding_dim)
        self.transformer = nn.Transformer(d_model=embedding_dim, nhead=num_heads, num_encoder_layers=num_layers, num_decoder_layers=num_layers, dropout=dropout, batch_first=True)
        self.fc_out = nn.Linear(embedding_dim, vocab_size_tgt)

    def forward(self, src, tgt):
        src = self.embedding_src(src)
        tgt = self.embedding_tgt(tgt)
        output = self.transformer(src, tgt)
        output = self.fc_out(output)
        return output

# class PositionalEncoding(nn.Module):
#     def __init__(self, d_model, dropout=0.1, max_len=5000):
#         super(PositionalEncoding, self).__init__()
#         self.dropout = nn.Dropout(p=dropout)

#         pe = torch.zeros(max_len, d_model)
#         position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
#         div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
#         pe[:, 0::2] = torch.sin(position * div_term)
#         pe[:, 1::2] = torch.cos(position * div_term)
#         pe = pe.unsqueeze(0).transpose(0, 1)
#         self.register_buffer('pe', pe)

#     def forward(self, x):
#         x = x + self.pe[:x.size(0), :]
#         return self.dropout(x)

# モデルのハイパーパラメータ
embedding_dim = 512
num_heads = 8
num_layers = 6
dropout = 0.1

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# モデルの初期化
model = TransformerModel(src_vocab_size, tgt_vocab_size, embedding_dim, num_heads, num_layers, dropout).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=vocab_tgt['<pad>'])
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# トレーニングループ
num_epochs = 10

def train_epoch(model, dataloader, criterion, optimizer):
    model.train()
    epoch_loss = 0

    for src, tgt in dataloader:
        src = src.to(device)
        tgt = tgt.to(device)
        output = model(src, tgt)
        # tgt = F.one_hot(tgt, tgt_vocab_size)
        print(tgt.shape)
        print(output.shape)
        output = output.permute(0, 2, 1) #なぜ(バッチサイズ、　シークエンス長、　vocab_size) => (バッチサイズ、　vocab_size、　シークエンス長)にする必要があるのか
        loss = criterion(output, tgt)       ##ここが問題　r
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(dataloader)

# def evaluate(model, dataloader, criterion):
#     model.eval()
#     epoch_loss = 0
#     with torch.no_grad():
#         for src, tgt in dataloader:
#             tgt_input = tgt[:, :-1]
#             tgt_output = tgt[:, 1:]
#             output = model(src, tgt_input)
#             output = output.view(-1, output.shape[-1])
#             tgt_output = tgt_output.contiguous().view(-1)
#             loss = criterion(output, tgt_output)
#             epoch_loss += loss.item()
#     return epoch_loss / len(dataloader)

for epoch in range(num_epochs):
    train_loss = train_epoch(model, train_dataloader, criterion, optimizer)
    # valid_loss = evaluate(model, valid_dataloader, criterion)
    print(f'Epoch {epoch+1}, Train Loss: {train_loss:.4f}')

torch.Size([64, 73])
torch.Size([64, 73, 37917])
torch.Size([64, 73])
torch.Size([64, 73, 37917])


KeyboardInterrupt: 