# Thực hành ở nhà Transformers

Hoàn thiện hàm huấn luyện cho mạng Transformer và tiến hành huấn luyện mô hình

### Cài đặt giải thuật tối ưu và huấn luyện mô hình

In [4]:
import torch.nn as nn
import torch
import torch.nn.functional as F
import math

In [9]:
!mkdir data
!wget https://raw.githubusercontent.com/SamLynnEvans/Transformer/master/data/english.txt
!mv english.txt data
!wget https://raw.githubusercontent.com/SamLynnEvans/Transformer/master/data/french.txt data/french.txt
!mv french.txt data

--2025-10-30 16:48:27--  https://raw.githubusercontent.com/SamLynnEvans/Transformer/master/data/english.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... failed: Temporary failure in name resolution.
wget: unable to resolve host address ‘raw.githubusercontent.com’
mv: cannot stat 'english.txt': No such file or directory
--2025-10-30 16:48:59--  https://raw.githubusercontent.com/SamLynnEvans/Transformer/master/data/french.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... failed: Temporary failure in name resolution.
wget: unable to resolve host address ‘raw.githubusercontent.com’
--2025-10-30 16:49:31--  http://data/french.txt
Resolving data (data)... failed: Temporary failure in name resolution.
wget: unable to resolve host address ‘data’
mv: cannot stat 'french.txt': No such file or directory


In [5]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

## MODEL


In [10]:
class Embedder (nn.Module): # vector meaning
    def __init__ (self, vocab_size, dim):
        super().__init__()  
        self.embed_vector = nn.Embedding(vocab_size, dim)

    def forward (self, x):
        return self.embed_vector(x)

In [13]:
# Return vector embedding include meaning and position
class PositionalEmbedding(nn.Module):
    def __init__(self,dim, max_seq = 300):
        super().__init__()
        self.dim = dim

        positional_embedding = torch.zeros(max_seq, dim)

        position = torch.arange(0, max_seq).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, dim, 2).float() * -(math.log(10000)/dim))

        positional_embedding[:, 0::2] = torch.sin(position * div_term)
        positional_embedding[:, 1::2] = torch.cos(position * div_term)

        positional_embedding = positional_embedding.unsqueeze(0)
        self.register_buffer('positional_embedding', positional_embedding)

    
    def forward(self, x):
        x = x* math.sqrt(self.dim)

        seq_len = x.size(1)

        x = x + self.positional_embedding[:, :seq_len].to(device)

        return x


In [12]:
#Multihead Attention
def attention(q, k, v, dim, mask = None, dropout = None):
    score = torch.matmul(q, k.transpose(-2, -1))/math.sqrt(dim) # batch-seq-seq
    if mask is not None:

        score = score.masked_fill(mask == 0, float('-inf'))
    self_attention = F.softmax(score, dim=-1)
    
    if dropout is not None:
        self_attention = dropout(self_attention)
    
    vector_context = torch.matmul(self_attention, v)
    return vector_context


In [14]:
class MultiHeadAttention(nn.Module):
    def __init__(self, heads, dim, dropout = 0.1):
        super().__init__()
        self.dim = dim
        self.dim_head = dim//heads  
        self.heads = heads
        self.q_linear = nn.Linear(self.dim, self.dim)
        self.k_linear = nn.Linear(self.dim, self.dim)
        self.v_linear = nn.Linear(self.dim, self.dim)
        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(dim, dim)

    def forward(self, q, k, v, mask = None): #inputs shape: batch-seq-dim
        batch, sequence_length, dim = q.shape

        q = self.q_linear(q)
        k = self.k_linear(k)
        v = self.v_linear(v)

        q = q.view(batch, sequence_length, self.heads, self.dim_head).transpose(1, 2)
        k = k.view(batch, sequence_length, self.heads, self.dim_head).transpose(1, 2)
        v = v.view(batch, sequence_length, self.heads, self.dim_head).transpose(1, 2)


        vector_context = attention(q, k, v, self.dim_head, mask, self.dropout) # batch - head - sequence - dimhead

        vector_context = vector_context.transpose(1, 2).contiguous().view(batch, sequence_length, dim)
        return self.out(vector_context)


In [16]:
class FeedForward(nn.Module):
    def __init__(self, dim, d_ff = 2048, dropout = 0.1):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(dim, d_ff),
                                 nn.ReLU(),
                                 nn.Dropout(dropout),
                                 nn.Linear(d_ff, dim))

    def forward(self, x):
        return self.net(x)

class Normalization(nn.Module):
    def __init__(self, dim, eps = 1e-6):
        super().__init__()

        self.alpha = nn.Parameter(torch.ones(dim))
        self.bias = nn.Parameter(torch.zeros(dim))

        self.eps = eps

    def forward(self, x):
        norm = self.alpha * (x-x.mean(dim = -1, keepdim = True))/(x.std(dim = -1, keepdim=True) + self.eps)  + self.bias
        return norm

In [17]:
class EncoderBlock(nn.Module):
    def __init__(self, dim, heads, dropout = 0.1):
        super().__init__()
        self.ff = FeedForward(dim)
        self.norm_1 = Normalization(dim) # Should be 2 different norm layer because normalization in each layer different
        self.norm_2 = Normalization(dim)
        self.AttentionLayer = MultiHeadAttention(heads, dim, dropout)
        self.dropout = nn.Dropout(dropout)
    def forward (self, x, mask):
        residual = x
        x = self.AttentionLayer(x, x, x, mask)
        x = residual + self.dropout(x)
        x = self.norm_1(x)

        residual = x
        x = self.ff(x) 
        x = residual + self.dropout(x)
        x = self.norm_2(x)
        return x

class Encoder():
    def __init__(self, vocab_size, dim, heads, num_layers =6 , dropout = 0.1):
        super().__init__()
        self.meaning = Embedder(vocab_size, dim)
        self.postional_embedding = PositionalEmbedding(dim)
        self.encoder_layers == nn.ModuleList([
            EncoderBlock(dim, heads, dropout)
            for _ in range (num_layers)
        ])
        self.norm = Normalization(dim)
    
    def forward (self, src, mask):
        x = self.meaning(src) # vector meaning
        x= self.postional_embedding(x)  # vector meaning + vector position
        for layer in self.encoder_layers:
            x = layer(x, mask)
        encoder_output = self.norm(x)
        return encoder_output



In [18]:
class DecoderBlock(nn.Module):
    def __init__(self, dim, heads, dropout = 0.1):
        super().__init__()
        self.attention_layer = MultiHeadAttention(heads, dim)
        self.norm_1 = Normalization(dim)
        self.cross_attention = MultiHeadAttention(heads, dim)
        self.norm_2 = Normalization(dim)
        self.ff = FeedForward(dim)
        self.norm_3 = Normalization(dim)
        self.dropout = nn.Dropout(dropout)

    def forward (self, x, encoder_output, target_mask, src_mask):
        residual = x
        x = self.attention_layer(x, x, x, target_mask)
        x = residual + self.dropout(x)

        x = self.norm_1(x)

        residual = x
        x = self.cross_attention(x, encoder_output, encoder_output, src_mask)
        x = residual + self.dropout(x)

        x = self.norm_2(x)

        residual = x
        x = self.ff(x)
        x = self.dropout(x) + residual

        return self.norm_3(x)
    
class Decoder(nn.Module):
    def __init__(self, vocab_size, dim, heads, num_layers):
        super().__init__()
        self.meaning = Embedder(vocab_size, dim)
        self.position = PositionalEmbedding(dim)
        self.decoder_layers = nn.ModuleList(DecoderBlock(dim, heads) for _ in range(num_layers))
        self.norm = Normalization(dim)
    def forward(self, x, encoder_output, target_mask, src_mask):
        x = self.meaning(x) # vector_meaning
        x = self.position(x) # vector_meaning + position

        for layer in self.decoder_layers:
            x = layer(x, encoder_output, target_mask, src_mask)
        decoder_output = self.norm(x)
        return decoder_output



In [19]:
class Transformer(nn.Module):
    def __init__(self, src_vocab, target_vocab, dim, heads, num_layer):
        super().__init__()
        self.encoder = Encoder(src_vocab, dim, heads, num_layer)
        self.decoder = Decoder(target_vocab, dim, heads, num_layer)
        self.out = nn.Linear(dim, target_vocab)
    def forward(self, src, target, src_mask, target_mask):
        encoder_output = self.encoder(src, src_mask)
        decoder_output = self.decoder(target, encoder_output, target_mask, src_mask)
        output = self.out(decoder_output)
        
        return output  # Softmax will be performed automatically by our loss_function


## PREPROCESSING


In [20]:
import spacy
import re

class tokennize(object):
    def __init__(self, lang):
        self.nlp = spacy.load(lang)

    def tokenizer (self, sentence):
        sentence = re.sub(r"[\*\"“”\n\\…\+\-\/\=\(\)‘•:\[\]\|’\!;]", " ", str(sentence))
        sentence = re.sub(r"[ ]+", " ", sentence)
        sentence = re.sub(r"\!+", "!", sentence)
        sentence = re.sub(r"\,+", ",", sentence)
        sentence = re.sub(r"\?+", "?", sentence)
        sentence = sentence.lower()
        return [tok.text for tok in self.nlp.tokenizer(sentence) if tok.text != " "]
    
    

In [None]:
!pip install torchtext==0.17.2


[0m[31mERROR: Could not find a version that satisfies the requirement torchtext (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for torchtext[0m[31m
[0m

In [22]:
import torch
import numpy as np
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from torchtext.vocab import build_vocab_from_iterator
import spacy

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def nopeak_mask(size, device=device):
    """
    Returns a mask for preventing attention to future tokens.
    Shape expected by downstream: (1, size, size) (broadcastable)
    """
    # upper triangular with 1s above diagonal
    np_mask = np.triu(np.ones((1, size, size)), k=1).astype('uint8')
    mask = torch.from_numpy(np_mask) == 0  # True where allowed
    return mask.to(device)

def create_masks(src, trg, src_pad, trg_pad, device=device):
    """
    src: LongTensor shape (batch, src_len)
    trg: LongTensor shape (batch, trg_len)  OR None for inference
    Returns: src_mask (batch,1,src_len), trg_mask (batch,1,trg_len, trg_len) or None
    """
    # src_mask: (batch, 1, src_len)
    src_mask = (src != src_pad).unsqueeze(1).to(device)

    if trg is not None:
        # trg_mask: (batch, 1, trg_len)
        trg_mask = (trg != trg_pad).unsqueeze(1).to(device)  # (batch,1,trg_len)
        seq_len = trg.size(1)
        np_mask = nopeak_mask(seq_len, device)  # (1, seq_len, seq_len)
        # combine padding mask and subsequent mask
        # Need to broadcast trg_mask to (batch, seq_len) -> (batch, 1, seq_len) & np_mask (1, seq_len, seq_len)
        # final shape: (batch, seq_len, seq_len) after broadcasting; some models expect (batch, 1, seq_len, seq_len) - adapt as needed
        trg_mask = trg_mask & np_mask  # broadcasting: (batch,1,seq_len) & (1,seq_len,seq_len) -> (batch, seq_len, seq_len) because of alignment
        # For compatibility with many implementations, return shape (batch, 1, seq_len, seq_len)
        trg_mask = trg_mask.unsqueeze(1)  # (batch,1,seq_len,seq_len)
    else:
        trg_mask = None

    return src_mask, trg_mask

# === Batch-sizing helper retained (optional) ===
# If you want dynamic batching by tokens (like original), you can keep this function and use it when creating batches.
global max_src_in_batch, max_tgt_in_batch
def batch_size_fn(new, count, sofar):
    "Keep augmenting batch and calculate total number of tokens + padding."
    global max_src_in_batch, max_tgt_in_batch
    if count == 1:
        max_src_in_batch = 0
        max_tgt_in_batch = 0
    max_src_in_batch = max(max_src_in_batch,  len(new['src']))
    max_tgt_in_batch = max(max_tgt_in_batch,  len(new['trg']) + 2)
    src_elements = count * max_src_in_batch
    tgt_elements = count * max_tgt_in_batch
    return max(src_elements, tgt_elements)


ModuleNotFoundError: No module named 'torchtext'

In [17]:
import os
import pandas as pd
from functools import partial

# Simple wrapper to get spacy tokenizer
def get_spacy_tokenizer(lang_code):
    # lang_code e.g., 'en_core_web_sm' or 'en'
    # Accept either 'en' or 'en_core_web_sm' in opt.src_lang
    name = lang_code if '_' in lang_code else f"{lang_code}_core_web_sm"
    try:
        nlp = spacy.load(name)
    except Exception as e:
        # user may need to install the model
        raise RuntimeError(f"Spacy model '{name}' not found. Install with: python -m spacy download {name}") from e

    def tokenize_text(text):
        return [tok.text.lower() for tok in nlp(text)]
    return tokenize_text

class TranslationDataset(Dataset):
    def __init__(self, src_lines, trg_lines, src_tok_fn, trg_tok_fn, src_vocab, trg_vocab, add_sos_eos=True):
        assert len(src_lines) == len(trg_lines)
        self.src_lines = src_lines
        self.trg_lines = trg_lines
        self.src_tok_fn = src_tok_fn
        self.trg_tok_fn = trg_tok_fn
        self.src_vocab = src_vocab
        self.trg_vocab = trg_vocab
        self.add_sos_eos = add_sos_eos

    def __len__(self):
        return len(self.src_lines)
def __getitem__(self, idx):
        src_text = self.src_lines[idx]
        trg_text = self.trg_lines[idx]
        src_tokens = self.src_tok_fn(src_text)
        trg_tokens = self.trg_tok_fn(trg_text)
        if self.add_sos_eos:
            trg_tokens = ['<sos>'] + trg_tokens + ['<eos>']
        # numericalize lazily in collate
        return {'src': src_tokens, 'trg': trg_tokens}

def yield_tokens(lines, tokenizer):
    for line in lines:
        yield tokenizer(line)

def build_vocabs(opt, src_lines, trg_lines, src_tok_fn, trg_tok_fn, min_freq=2):
    specials = ['<pad>', '<sos>', '<eos>', '<unk>']
    src_vocab = build_vocab_from_iterator(yield_tokens(src_lines, src_tok_fn),
                                         specials=specials,
                                         special_first=True)
    trg_vocab = build_vocab_from_iterator(yield_tokens(trg_lines, trg_tok_fn),
                                         specials=specials,
                                         special_first=True)

    # set default index for unknown tokens
    src_vocab.set_default_index(src_vocab['<unk>'])
    trg_vocab.set_default_index(trg_vocab['<unk>'])
    return src_vocab, trg_vocab

def numericalize(tokens_list, vocab):
    return [vocab[token] for token in tokens_list]

def collate_fn(batch, src_vocab, trg_vocab, max_strlen=None, device=device):
    # batch is a list of {'src': [...tokens...], 'trg': [...tokens...]}
    src_batch = [torch.tensor(numericalize(x['src'], src_vocab), dtype=torch.long) for x in batch]
    trg_batch = [torch.tensor(numericalize(x['trg'], trg_vocab), dtype=torch.long) for x in batch]

    # optionally filter by max length (similar to your mask earlier)
    if max_strlen is not None:
        keep_indices = [i for i, (s, t) in enumerate(zip(src_batch, trg_batch))
                        if s.size(0) <= max_strlen and t.size(0) <= max_strlen]
        if len(keep_indices) != len(batch):
            src_batch = [src_batch[i] for i in keep_indices]
            trg_batch = [trg_batch[i] for i in keep_indices]

    # pad sequences to longest in batch (pad value is index of '<pad>')
    pad_idx_src = src_vocab['<pad>']
    pad_idx_trg = trg_vocab['<pad>']
    src_padded = pad_sequence(src_batch, batch_first=True, padding_value=pad_idx_src).to(device)  # (batch, src_len)
    trg_padded = pad_sequence(trg_batch, batch_first=True, padding_value=pad_idx_trg).to(device)  # (batch, trg_len)

    return src_padded, trg_padded

def read_data(opt):
    """Read text files into lists of lines (keeps your original API)"""
    if opt.src_data is not None:
        try:
            opt.src_data = open(opt.src_data).read().strip().split('\n')
        except Exception as e:
            raise RuntimeError(f"error: '{opt.src_data}' file not found") from e

    if opt.trg_data is not None:
        try:
            opt.trg_data = open(opt.trg_data).read().strip().split('\n')
        except Exception as e:
            raise RuntimeError(f"error: '{opt.trg_data}' file not found") from e
        
def create_dataset_and_dataloader(opt, device=device):
    """
    Replaces your create_fields + create_dataset workflow.
    Returns: dataloader, src_vocab, trg_vocab, pad indices, dataset length
    """
    print("Creating tokenizers...")
    src_lang = opt.src_lang #[0:2]
    trg_lang = opt.trg_lang#[0:2]
    src_tok = get_spacy_tokenizer(src_lang)
    trg_tok = get_spacy_tokenizer(trg_lang)

    # We expect opt.src_data and opt.trg_data to be lists of lines (read_data should be called first)
    src_lines = opt.src_data
    trg_lines = opt.trg_data

    print("Building vocabs...")
    src_vocab, trg_vocab = build_vocabs(opt, src_lines, trg_lines, src_tok, trg_tok)

    print("Creating dataset...")
    dataset = TranslationDataset(src_lines, trg_lines, src_tok, trg_tok, src_vocab, trg_vocab, add_sos_eos=True)

    # create DataLoader with custom collate that closes over vocabs and max_strlen
    my_collate = partial(collate_fn, src_vocab=src_vocab, trg_vocab=trg_vocab, max_strlen=getattr(opt, 'max_strlen', None))
    dataloader = DataLoader(dataset, batch_size=opt.batchsize, shuffle=True, collate_fn=my_collate)

    # pad ids
    opt.src_pad = src_vocab['<pad>']
    opt.trg_pad = trg_vocab['<pad>']

    # compute train_len like original get_len (number of batches)
    train_len = len(dataloader)

    return dataloader, src_vocab, trg_vocab, opt.src_pad, opt.trg_pad, train_len

# Example helper to iterate get_len as before (if you want compatibility)
def get_len(dataloader):
    for i, b in enumerate(dataloader):
        pass
    return i



In [18]:
# Optimizer
class CosineWithRestarts(torch.optim.lr_scheduler._LRScheduler):
    """
    Cosine annealing with restarts.
    Parameters
    ----------
    optimizer : torch.optim.Optimizer
    T_max : int
        The maximum number of iterations within the first cycle.
    eta_min : float, optional (default: 0)
        The minimum learning rate.
    last_epoch : int, optional (default: -1)
        The index of the last epoch.
    """

    def __init__(self,
                 optimizer: torch.optim.Optimizer,
                 T_max: int,
                 eta_min: float = 0.,
                 last_epoch: int = -1,
                 factor: float = 1.) -> None:
        # pylint: disable=invalid-name
        self.T_max = T_max
        self.eta_min = eta_min
        self.factor = factor
        self._last_restart: int = 0
        self._cycle_counter: int = 0
        self._cycle_factor: float = 1.
        self._updated_cycle_len: int = T_max
        self._initialized: bool = False
        super(CosineWithRestarts, self).__init__(optimizer, last_epoch)

def get_lr(self):
        """Get updated learning rate."""
        # HACK: We need to check if this is the first time get_lr() was called, since
        # we want to start with step = 0, but _LRScheduler calls get_lr with
        # last_epoch + 1 when initialized.
        if not self._initialized:
            self._initialized = True
            return self.base_lrs

        step = self.last_epoch + 1
        self._cycle_counter = step - self._last_restart

        lrs = [
            (
                self.eta_min + ((lr - self.eta_min) / 2) *
                (
                    np.cos(
                        np.pi *
                        ((self._cycle_counter) % self._updated_cycle_len) /
                        self._updated_cycle_len
                    ) + 1
                )
            ) for lr in self.base_lrs
        ]

        if self._cycle_counter % self._updated_cycle_len == 0:
            # Adjust the cycle length.
            self._cycle_factor *= self.factor
            self._cycle_counter = 0
            self._updated_cycle_len = int(self._cycle_factor * self.T_max)
            self._last_restart = step

        return lrs


In [19]:
def get_model(opt, src_vocab, trg_vocab):

    assert opt.d_model % opt.heads == 0
    assert opt.dropout < 1

    model = Transformer(src_vocab, trg_vocab, opt.d_model, opt.n_layers, opt.heads)

    if opt.load_weights is not None:
        print("loading pretrained weights...")
        model.load_state_dict(torch.load(f'{opt.load_weights}/model_weights'))
    else:
        for p in model.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    if opt.device == 0:
        model = model.cuda()

    return model

In [20]:
""" BAI TAP VE NHA """

import time
import os

class Opt:
    pass

def train_model(model, dataloader, opt):
    ########################

    model.train()
    start = time.time()

    # Định nghĩa criterion bên trong hàm
    criterion = nn.CrossEntropyLoss(ignore_index=opt.trg_pad)

    for epoch in range(opt.epochs):
        total_loss = 0

        # 1. Lặp qua dataloader
        for i, batch in enumerate(dataloader):

            # 2. Lấy src, trg (đã là batch_first=True và trên device từ collate_fn)
            src, trg = batch

            # trg_input là <sos>...word (ví dụ: [1, 5, 7, 9])
            trg_input = trg[:, :-1]
            # trg_output là word...<eos> (ví dụ: [5, 7, 9, 2])
            trg_output = trg[:, 1:].contiguous().view(-1)

            # 3. Tạo mặt nạ (masks) với đúng tham số
            src_mask, trg_mask = create_masks(src, trg_input, opt.src_pad, opt.trg_pad, opt.device)

            opt.optimizer.zero_grad()

            preds = model(src, trg_input, src_mask, trg_mask)

            preds_flat = preds.contiguous().view(-1, preds.size(-1))

            loss = criterion(preds_flat, trg_output)
            loss.backward()

            opt.optimizer.step()

            total_loss += loss.item()

            if (i + 1) % opt.printevery == 0:
                avg_loss = total_loss / opt.printevery
                print(f"Epoch [{epoch+1}/{opt.epochs}], Step [{i+1}/{opt.train_len}], Loss: {avg_loss:.4f}, Time: {time.time() - start:.2f}s")
                total_loss = 0
                start = time.time()

        #checkpoint
        if opt.checkpoint > 0:
            print(f"--- epoch {epoch+1} finished, saving weights ---")
            if not os.path.exists('weights'):
                os.makedirs('weights')
            torch.save(model.state_dict(), f'weights/model_epoch_{epoch+1}.weights')

    print("training complete.")
    ########################


def main():
    opt = Opt()
    opt.src_data = "data/english.txt"
    opt.trg_data = "data/french.txt"
    opt.src_lang = "en_core_web_sm"
    opt.trg_lang = 'fr_core_news_sm'
    opt.epochs = 2
    opt.d_model=512
    opt.n_layers=6
    opt.heads=8
    opt.dropout=0.1
    opt.batchsize=32
    opt.printevery=100
    opt.lr=0.0001
    opt.max_strlen=80
    opt.checkpoint = 0
    opt.no_cuda = False
    opt.load_weights = None

    # opt.device = 0
    # if opt.device == 0:
    #     assert torch.cuda.is_available()

    # read_data(opt)
    # SRC, TRG = create_fields(opt)
    # opt.train = create_dataset(opt, SRC, TRG)
    # model = get_model(opt, len(SRC.vocab), len(TRG.vocab)).to(device)

    # opt.optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr, betas=(0.9, 0.98), eps=1e-9)

    # if opt.checkpoint > 0:
    #     print("model weights will be saved every %d minutes and at end of epoch to directory weights/"%(opt.checkpoint))

    # train_model(model, opt)

    device = torch.device("cuda" if torch.cuda.is_available() and not opt.no_cuda else "cpu")
    opt.device = device

    read_data(opt)

    dataloader, src_vocab, trg_vocab, opt.src_pad, opt.trg_pad, opt.train_len = \
        create_dataset_and_dataloader(opt, device=device)

    print(f"Train steps per epoch: {opt.train_len}")

    model = get_model(opt, len(src_vocab), len(trg_vocab)).to(device)
    opt.optimizer = torch.optim.Adam(
        model.parameters(),
        lr=opt.lr,
        betas=(0.9, 0.98),
        eps=1e-9
    )

    train_model(model, dataloader, opt)


    # for asking about further training use while true loop, and return
if __name__ == "__main__":
    main()


RuntimeError: error: 'data/english.txt' file not found