In [1]:
import polars as pl
import torch
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import vocab
from collections import Counter
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F

FILE_PATH = "./wiki_corpus_2.01/kyoto_lexicon.csv"
tokenizer_src = get_tokenizer('spacy', language='ja_core_news_sm')
tokenizer_tgt = get_tokenizer('spacy', language='en_core_web_sm')
# どれか一つの行がうまく読み込めなかったため、truncate_tagged_linesでfield数が一致しない行は無視している(元データ51983行)

class datasets(Dataset):
    def __init__(self, text, label):
        self.jp_datas = text
        self.en_datas = label

    def __len__(self):
        return len (self.jp_datas)

    def __getitem__(self, index):
        jp = self.jp_datas[index]
        en = self.en_datas[index]
        return jp,en

class DataLoaderCreater:

    def __init__(self, file_path, src_tokenizer, tgt_tokenizer):
        self.file_path = file_path
        self.src_tokenizer = src_tokenizer
        self.tgt_tokenizer = tgt_tokenizer

    def build_vocab(self, texts, tokenizer):
        counter = Counter()
        for text in texts:
            counter.update(tokenizer(text))
        specials = ['<unk>', '<pad>', '<start>', '<end>']
        v = vocab(counter, specials=specials, min_freq=1)
        v.set_default_index(v['<unk>'])
        return v

    def convert_text_to_indexes(self, text, vocab, tokenizer):
        return [vocab['<start>']] + [
            vocab[token] if token in vocab else vocab['<unk>'] for token in tokenizer(text.strip("\n"))
        ] + [vocab['<end>']]

    def create_dataloader(self):
        df = pl.read_csv(self.file_path, separator=",", encoding="utf-8", has_header=True, truncate_ragged_lines=True)
        df_selected = df.select([df.columns[0], df.columns[1]])
        df_jp = df_selected[:, 0]
        jp_list = df_jp.to_list()
        df_en = df_selected[:, 1]
        en_list = df_en.to_list()

        self.vocab_src = self.build_vocab(jp_list, tokenizer_src)
        self.vocab_tgt = self.build_vocab(en_list, tokenizer_tgt)
        self.vocab_src = self.vocab_src.get_stoi()
        self.vocab_tgt = self.vocab_tgt.get_stoi()
        self.len_src_vocab = len(self.vocab_src)
        self.len_tgt_vocab = len(self.vocab_tgt)

        src_data = pad_sequence([torch.tensor(self.convert_text_to_indexes(text, self.vocab_src, tokenizer=self.src_tokenizer)) for text in jp_list], batch_first = True, padding_value = self.vocab_src["<pad>"])
        tgt_data = pad_sequence([torch.tensor(self.convert_text_to_indexes(text, self.vocab_tgt, tokenizer=self.tgt_tokenizer)) for text in en_list], batch_first = True, padding_value = self.vocab_tgt["<pad>"])

        dataset = datasets(src_data, tgt_data)

        # データセットの長さを取得
        dataset_length = len(dataset)

        # 各分割のサイズを計算
        train_size = int(0.8 * dataset_length)
        val_size = int(0.1 * dataset_length)
        test_size = dataset_length - train_size - val_size

        # データセットをランダムに分割
        train_dataset, valid_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])
        train_dataloader = DataLoader(train_dataset, batch_size=128)
        valid_dataloader = DataLoader(valid_dataset, batch_size=128)
        test_dataloader = DataLoader(test_dataset, batch_size=128)

        return train_dataloader, test_dataloader, valid_dataloader

dataloader_creater = DataLoaderCreater(FILE_PATH, tokenizer_src, tokenizer_tgt)
train_dataloader, test_dataloader, valid_dataloader = dataloader_creater.create_dataloader()
src_vocab_size = dataloader_creater.len_src_vocab
tgt_vocab_size = dataloader_creater.len_tgt_vocab
vocab_src = dataloader_creater.vocab_src
vocab_tgt = dataloader_creater.vocab_tgt



In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import math
# EncodingとEmbeddingの違い
# 学習を行わないのがPositional Encoding
# 学習を行うのがPositional Embedding

class PositionalEncoding():
    def __init__(self, embedding_dim, len_sequence):
        self.embedding_dim = embedding_dim
        self.len_sequence = len_sequence

    def get_sin(self,i,k):
        return math.sin(i/(10000)**(k/self.len_sequence))

    def get_cos(self,i,k):
        return math.cos(i/(10000)**(k/self.len_sequence))

    def get_positional_vector(self):
        pe = torch.zeros(self.len_sequence, self.embedding_dim)
        for pos in range(self.len_sequence):
            for i in range(0, int(self.embedding_dim/2)):
                pe[pos, 2*i] = self.get_sin(pos, i)
                pe[pos, 2*i+1] = self.get_cos(pos, i)
        return pe


# Transformerモデルの定義
class TransformerModel(nn.Module):
    def __init__(self, vocab_size_src, vocab_size_tgt, embedding_dim, num_heads, num_layers, device,  dropout=0.1):
        super().__init__()
        self.device = device
        # Positional Encoderを加算する必要あり
        self.embedding_src = nn.Embedding(vocab_size_src, embedding_dim)
        self.embedding_tgt = nn.Embedding(vocab_size_tgt, embedding_dim)
        self.pos_embedding_src = PositionalEncoding(embedding_dim, 33)
        self.pos_embedding_tgt = PositionalEncoding(embedding_dim, 73)
        self.transformer = nn.Transformer(d_model=embedding_dim, nhead=num_heads, num_encoder_layers=num_layers, num_decoder_layers=num_layers, dropout=dropout, batch_first=True)
        self.fc_out = nn.Linear(embedding_dim, vocab_size_tgt)

    def forward(self, src, tgt):
        src = self.embedding_src(src)
        tgt = self.embedding_tgt(tgt)
        batch_size = src.shape[0]
        pos_src = self.pos_embedding_src.get_positional_vector().to(device)
        pos_src = pos_src.unsqueeze(0).repeat(batch_size, 1, 1)
        pos_tgt = self.pos_embedding_tgt.get_positional_vector().to(device)
        pos_tgt = pos_tgt.unsqueeze(0).repeat(batch_size, 1, 1)
        src = src + pos_src
        tgt = tgt + pos_tgt
        output = self.transformer(src, tgt)
        output = self.fc_out(output)
        return output



# モデルのハイパーパラメータ
embedding_dim = 512
num_heads = 8
num_layers = 6
lr_rate = 1e-5

device = torch.device('cuda:7' if torch.cuda.is_available() else 'cpu')

# モデルの初期化
model = TransformerModel(src_vocab_size, tgt_vocab_size, embedding_dim, num_heads, num_layers, device).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=vocab_tgt['<pad>'])
optimizer = optim.Adam(model.parameters(), lr=lr_rate)

# トレーニングループ
num_epochs = 100

def train_epoch(model, dataloader, criterion, optimizer):
    model.train()
    epoch_loss = 0
    for src, tgt in dataloader:
        optimizer.zero_grad()
        src = src.to(device)
        tgt = tgt.to(device)
        output = model(src, tgt)
        # tgt = F.one_hot(tgt, tgt_vocab_size)
        output = output.permute(0, 2, 1) #なぜ(バッチサイズ、　シークエンス長、　vocab_size) => (バッチサイズ、　vocab_size、　シークエンス長)にする必要があるのか
        loss = criterion(output, tgt)
        loss.backward()##ここが問題　r
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss/len(dataloader)

for epoch in tqdm(range(num_epochs)):
    train_loss = train_epoch(model, train_dataloader, criterion, optimizer)
    if (epoch+1)%10 == 0:
        print(f'Epoch {epoch+1}, Train Loss: {train_loss:.4f}')

 10%|█         | 10/100 [45:15<6:46:18, 270.88s/it]

Epoch 10, Train Loss: 2.8020


 20%|██        | 20/100 [1:30:35<6:03:37, 272.71s/it]

Epoch 20, Train Loss: 1.9804


 30%|███       | 30/100 [2:14:40<5:13:39, 268.85s/it]

Epoch 30, Train Loss: 1.4751


 40%|████      | 40/100 [3:00:15<4:33:28, 273.48s/it]

Epoch 40, Train Loss: 1.1139


 50%|█████     | 50/100 [3:45:51<3:47:58, 273.58s/it]

Epoch 50, Train Loss: 0.8380


 60%|██████    | 60/100 [4:30:46<3:00:00, 270.02s/it]

Epoch 60, Train Loss: 0.6200


 70%|███████   | 70/100 [5:16:23<2:16:47, 273.57s/it]

Epoch 70, Train Loss: 0.4444


 80%|████████  | 80/100 [6:02:03<1:31:15, 273.75s/it]

Epoch 80, Train Loss: 0.3026


 90%|█████████ | 90/100 [6:47:00<44:47, 268.74s/it]  

Epoch 90, Train Loss: 0.1882


100%|██████████| 100/100 [7:32:39<00:00, 271.59s/it]

Epoch 100, Train Loss: 0.1015





In [5]:
torch.save(model.state_dict(), 'model_weight.pth')