In [1]:
import polars as pl
import torch
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import vocab
from collections import Counter
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence


# Locations of the parallel training corpus: Japanese side and English side.
JP_TRAIN_FILE_PATH = "./kftt-data-1.0/data/orig/kyoto-train.ja"
EN_TRAIN_FILE_PATH = "./kftt-data-1.0/data/orig/kyoto-train.en"

# spaCy-backed tokenizers: Japanese for the source side, English for the target side.
tokenizer_src = get_tokenizer('spacy', language='ja_core_news_sm')
tokenizer_tgt = get_tokenizer('spacy', language='en_core_web_sm')

# Read each file line by line, dropping only the newline characters at the
# edges of every line (other whitespace is kept).
with open(JP_TRAIN_FILE_PATH, mode="r", encoding="utf-8") as f:
    train_jp_list = [line.strip("\n") for line in f]

with open(EN_TRAIN_FILE_PATH, mode="r", encoding="utf-8") as f:
    train_en_list = [line.strip("\n") for line in f]




In [2]:
class datasets(Dataset):
    """Index-aligned pairing of two collections.

    ``text`` is kept as ``jp_datas`` and ``label`` as ``en_datas``; item ``i``
    is the tuple ``(text[i], label[i])``.
    """

    def __init__(self, text, label):
        self.jp_datas, self.en_datas = text, label

    def __len__(self):
        # The length is taken from the first collection only.
        return len(self.jp_datas)

    def __getitem__(self, index):
        return self.jp_datas[index], self.en_datas[index]

class DataLoaderCreater:
    """Builds vocabularies for a parallel corpus and wraps it in a DataLoader.

    The source/target tokenizers passed to the constructor are used for both
    vocabulary construction and text-to-index conversion.

    After ``create_dataloader`` has run, the instance exposes
    ``vocab_src_itos`` / ``vocab_tgt_itos`` (index -> token),
    ``vocab_src_stoi`` / ``vocab_tgt_stoi`` (token -> index) and
    ``vocab_size_src`` / ``vocab_size_tgt``.
    """

    def __init__(self, src_tokenizer, tgt_tokenizer):
        self.src_tokenizer = src_tokenizer
        self.tgt_tokenizer = tgt_tokenizer

    def build_vocab(self, texts, tokenizer):
        """Count the tokens of ``texts`` and build a vocabulary from the counts.

        Args:
            texts: iterable of raw sentences.
            tokenizer: callable mapping a sentence to an iterable of tokens.

        Returns:
            The vocabulary object, with '<unk>' as its default index.
        """
        counter = Counter()
        for text in texts:
            counter.update(tokenizer(text))
        specials = ['<unk>', '<pad>', '<start>', '<end>']
        # Tokens that occur only once are left out of the vocabulary.
        v = vocab(counter, specials=specials, min_freq=2)
        v.set_default_index(v['<unk>'])
        return v

    def convert_text_to_indexes(self, text, vocab, tokenizer):
        """Convert one sentence to ``[<start>, token ids..., <end>]``.

        Args:
            text: raw sentence; newline characters at its edges are stripped.
            vocab: token -> index mapping supporting ``in`` and ``[]``.
            tokenizer: callable mapping a sentence to an iterable of tokens.

        Returns:
            A list of ints; tokens missing from ``vocab`` map to '<unk>'.
        """
        unk_index = vocab['<unk>']
        token_indexes = [
            vocab[token] if token in vocab else unk_index
            for token in tokenizer(text.strip("\n"))
        ]
        return [vocab['<start>']] + token_indexes + [vocab['<end>']]

    def create_dataloader(self, jp_list, en_list, collate_fn, batch_size=64):
        """Build both vocabularies and return a shuffled DataLoader.

        Args:
            jp_list: source-side sentences.
            en_list: target-side sentences, index-aligned with ``jp_list``.
            collate_fn: function merging a list of (src, tgt) tensor pairs
                into a batch.
            batch_size: number of sentence pairs per batch (default 64, the
                value that used to be hard-coded).

        Returns:
            A DataLoader yielding whatever ``collate_fn`` produces.
        """
        # Use the tokenizers injected into this instance rather than
        # module-level globals, so vocabulary building and index conversion
        # always agree on the tokenization.
        vocab_src = self.build_vocab(jp_list, self.src_tokenizer)
        vocab_tgt = self.build_vocab(en_list, self.tgt_tokenizer)
        self.vocab_src_itos = vocab_src.get_itos()
        self.vocab_tgt_itos = vocab_tgt.get_itos()
        self.vocab_src_stoi = vocab_src.get_stoi()
        self.vocab_tgt_stoi = vocab_tgt.get_stoi()
        self.vocab_size_src = len(self.vocab_src_stoi)
        self.vocab_size_tgt = len(self.vocab_tgt_stoi)

        src_data = [
            torch.tensor(self.convert_text_to_indexes(jp_data, self.vocab_src_stoi, self.src_tokenizer))
            for jp_data in jp_list
        ]
        tgt_data = [
            torch.tensor(self.convert_text_to_indexes(en_data, self.vocab_tgt_stoi, self.tgt_tokenizer))
            for en_data in en_list
        ]
        dataset = datasets(src_data, tgt_data)

        return DataLoader(dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)

# Value used to fill the padded positions of a batch.
# NOTE(review): presumably the vocabulary index of '<pad>' — confirm it matches
# the order of the specials used when the vocabularies are built.
PADDING_ID = 1


def collate_fn(batch):
    """Pad a list of (source, target) tensor pairs into two batch-first tensors.

    Every sequence is right-padded with ``PADDING_ID`` up to the longest
    sequence on its own side of the batch.

    Returns:
        ``(src_batch, tgt_batch)``.
    """
    sources, targets = zip(*batch)
    return tuple(
        pad_sequence(sequences, batch_first=True, padding_value=PADDING_ID)
        for sequences in (sources, targets)
    )

# Build the training DataLoader from the two sentence lists.
dataloader_creater = DataLoaderCreater(src_tokenizer=tokenizer_src, tgt_tokenizer=tokenizer_tgt)
train_dataloader = dataloader_creater.create_dataloader(
    jp_list=train_jp_list,
    en_list=train_en_list,
    collate_fn=collate_fn,
)

# Vocabulary sizes are only available once create_dataloader has run.
vocab_size_src, vocab_size_tgt = (
    dataloader_creater.vocab_size_src,
    dataloader_creater.vocab_size_tgt,
)

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
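# Difference between "encoding" and "embedding":
# - Positional Encoding is fixed; it is not learned during training.
# - Positional Embedding is a parameter that is learned during training.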

class PositionalEncoding():
    """Fixed (non-learned) sinusoidal positional encoding.

    For position ``pos`` and pair index ``k`` (``0 <= k < embedding_dim // 2``):

        PE[pos, 2k]     = sin(pos / 10000 ** (2k / embedding_dim))
        PE[pos, 2k + 1] = cos(pos / 10000 ** (2k / embedding_dim))

    The frequency depends on the embedding dimension only, so the encoding of
    a given position is the same whatever the (padded) sequence length is.

    Args:
        embedding_dim: size of each position vector.
        len_sequence: number of positions to encode.
    """

    def __init__(self, embedding_dim, len_sequence):
        self.embedding_dim = embedding_dim
        self.len_sequence = len_sequence

    def get_sin(self, i, k):
        """Return sin(i / 10000 ** (2k / embedding_dim)) as a 0-d tensor."""
        return torch.sin(torch.tensor(i / 10000 ** (2 * k / self.embedding_dim)))

    def get_cos(self, i, k):
        """Return cos(i / 10000 ** (2k / embedding_dim)) as a 0-d tensor."""
        return torch.cos(torch.tensor(i / 10000 ** (2 * k / self.embedding_dim)))

    def get_positional_vector(self):
        """Return the ``(len_sequence, embedding_dim)`` encoding matrix.

        Computed with tensor operations instead of a per-element Python loop.
        When ``embedding_dim`` is odd the last column stays zero.
        """
        num_pairs = self.embedding_dim // 2
        positions = torch.arange(self.len_sequence, dtype=torch.float32).unsqueeze(1)
        pair_index = torch.arange(num_pairs, dtype=torch.float32)
        # 1 / 10000 ** (2k / embedding_dim), one value per (sin, cos) pair.
        inv_freq = torch.pow(torch.tensor(10000.0), -2.0 * pair_index / self.embedding_dim)
        angles = positions * inv_freq  # (len_sequence, num_pairs)

        pe = torch.zeros(self.len_sequence, self.embedding_dim)
        pe[:, 0:2 * num_pairs:2] = torch.sin(angles)
        pe[:, 1:2 * num_pairs:2] = torch.cos(angles)
        return pe


# Transformer model definition
class TransformerModel(nn.Module):
    """Encoder-decoder Transformer mapping source token ids to target logits.

    Args:
        vocab_size_src: number of rows of the source embedding table.
        vocab_size_tgt: number of rows of the target embedding table and size
            of the output layer.
        embedding_dim: embedding size, also used as the Transformer ``d_model``.
        num_heads: number of attention heads.
        num_layers: number of encoder layers and of decoder layers.
        device: stored on the instance as ``self.device``.
        dropout: dropout probability passed to ``nn.Transformer``.
    """

    def __init__(self, vocab_size_src, vocab_size_tgt, embedding_dim, num_heads, num_layers, device,  dropout=0.1):
        super().__init__()
        self.device = device
        self.embedding_dim = embedding_dim
        self.embedding_src = nn.Embedding(vocab_size_src, embedding_dim)
        self.embedding_tgt = nn.Embedding(vocab_size_tgt, embedding_dim)
        self.transformer = nn.Transformer(d_model=embedding_dim, nhead=num_heads, num_encoder_layers=num_layers, num_decoder_layers=num_layers, dropout=dropout, batch_first=True)
        self.fc_out = nn.Linear(embedding_dim, vocab_size_tgt)

    def forward(self, src, tgt):
        """Return logits of shape ``(batch, tgt_len, vocab_size_tgt)``.

        Args:
            src: source token ids, ``(batch, src_len)``.
            tgt: decoder-input token ids, ``(batch, tgt_len)``.
        """
        src = self.embedding_src(src)
        tgt = self.embedding_tgt(tgt)
        # Follow the device the embeddings actually live on.
        device = src.device
        src_len, tgt_len = src.shape[1], tgt.shape[1]

        # The positional encoding has to be added to the token embeddings.
        # Its shape is (seq_len, embedding_dim), so the addition broadcasts
        # over the batch dimension without an explicit repeat.
        pos_src = PositionalEncoding(self.embedding_dim, src_len).get_positional_vector().to(device)
        pos_tgt = PositionalEncoding(self.embedding_dim, tgt_len).get_positional_vector().to(device)
        src = src + pos_src
        tgt = tgt + pos_tgt

        # Causal mask: True marks positions that must NOT be attended to, so
        # target position t only sees positions <= t. Without it the decoder
        # can read the tokens it is supposed to predict.
        causal_mask = torch.triu(
            torch.ones(tgt_len, tgt_len, dtype=torch.bool, device=device),
            diagonal=1,
        )
        output = self.transformer(src, tgt, tgt_mask=causal_mask)
        return self.fc_out(output)

In [8]:
# Model hyperparameters
embedding_dim = 512
num_heads = 8
num_layers = 6
lr_rate = 1e-5

# Model initialisation
# Prefer GPU 3, but fall back to GPU 0 when the host has fewer than four
# visible GPUs, and to the CPU when CUDA is unavailable. Requesting a device
# index that does not exist would otherwise fail when the model is moved.
preferred_gpu_index = 3
if torch.cuda.is_available():
    gpu_index = preferred_gpu_index if preferred_gpu_index < torch.cuda.device_count() else 0
    device = torch.device(f'cuda:{gpu_index}')
else:
    device = torch.device('cpu')
model = TransformerModel(vocab_size_src, vocab_size_tgt, embedding_dim, num_heads, num_layers, device).to(device)
# Padded positions do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=PADDING_ID)
optimizer = optim.Adam(model.parameters(), lr=lr_rate)

# Training loop
num_epochs = 100

def train_epoch(model, dataloader, criterion, optimizer, device):
    """Run one training pass over ``dataloader`` and return the mean batch loss.

    Teacher forcing is used: the decoder is fed the target sequence without
    its last token and is trained to predict the same sequence shifted left
    by one, i.e. the next token at every position.

    Args:
        model: callable as ``model(src, decoder_input)`` returning logits of
            shape ``(batch, seq_len, vocab_size)``.
        dataloader: iterable of ``(src, tgt)`` id tensors that supports ``len``.
        criterion: loss taking ``(logits, labels)``.
        optimizer: optimizer over the model parameters.
        device: device the batches are moved to.

    Returns:
        Sum of the per-batch losses divided by ``len(dataloader)``.
    """
    model.train()
    epoch_loss = 0
    for src, tgt in dataloader:
        optimizer.zero_grad()
        src = src.to(device)
        tgt = tgt.to(device)

        # Feeding the full target and scoring against the same tensor would
        # ask the model to reproduce its own input. Shift by one instead.
        decoder_input = tgt[:, :-1]
        labels = tgt[:, 1:]

        output = model(src, decoder_input)
        # CrossEntropyLoss expects the class dimension at index 1:
        # (batch, seq_len, vocab_size) -> (batch, vocab_size, seq_len).
        output = output.permute(0, 2, 1)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        # Drop references to the per-batch tensors before the next batch.
        del src, tgt, decoder_input, labels, output, loss
        torch.cuda.empty_cache()
    return epoch_loss/len(dataloader)

# Train for num_epochs epochs, reporting the mean loss on every 10th epoch.
for epoch in tqdm(range(num_epochs)):
    train_loss = train_epoch(
        model=model,
        dataloader=train_dataloader,
        criterion=criterion,
        optimizer=optimizer,
        device=device,
    )
    if (epoch + 1) % 10 != 0:
        continue
    print(f'Epoch {epoch+1}, Train Loss: {train_loss:.4f}')

  0%|          | 0/100 [00:12<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 3.48 GiB. GPU  has a total capacity of 47.54 GiB of which 2.43 GiB is free. Process 2834466 has 29.90 GiB memory in use. Including non-PyTorch memory, this process has 15.20 GiB memory in use. Of the allocated memory 13.87 GiB is allocated by PyTorch, and 166.10 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Persist only the model parameters (state dict), not the whole module object.
torch.save(
    model.state_dict(),
    'model_weight.pth',
)