In [1]:
import sys
import os

root_path = ""
for path in os.getcwd().split("\\")[:-1]:
    root_path += f"{path}/"
sys.path.insert(1, root_path)
sys.path.insert(1, os.path.join(root_path, "src"))

In [2]:
import datasets
import time
import yaml
import torch
import os
from torch import nn
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def _tokenize(dictionary, path, limit_line=None):
    nb_tokens_in_dictionary = len(dictionary)
    # load document to tokenize
    with open(path, "r", encoding="utf-8") as f:
        document = f.read()

    # Count nb of tokens in text and update the dictionary
    for i, line in enumerate(tqdm(document, desc="Creating dictionary", unit=" lines")):
        if i == limit_line:
            break
        tokens = line.split() + ["<eos>"]
        for token in tokens:
            if token not in dictionary:
                dictionary[token] = nb_tokens_in_dictionary
                nb_tokens_in_dictionary += 1

    # Assign to each token its identifier
    ids = []
    for i, line in enumerate(tqdm(document, desc="Encoding token", unit=" lines")):
        if i == limit_line:
            break
        i += 1
        tokens = line.split() + ["<eos>"]
        for token in tokens:
            ids.append(dictionary[token])
    ids = torch.LongTensor(ids)
    return ids


class Corpus:
    def __init__(self, path=None):
        self._dictionary = {}
        print("Processing train ...")
        self.train = _tokenize(
            dictionary=self._dictionary, path=os.path.join(path, "train.txt")
        )
        print("Processing valid ...")
        self.validation = _tokenize(
            dictionary=self._dictionary, path=os.path.join(path, "validation.txt")
        )
        print("Processing test ...")
        self.test = _tokenize(
            dictionary=self._dictionary, path=os.path.join(path, "test.txt")
        )

    @property
    def vocab_size(self):
        return len(self._dictionary)

In [4]:
def batchify(data: torch.Tensor, batch_size):
    # Tính số batch trên data
    num_batches = data.size(0) // batch_size
    # Lấy đủ số lượng batch có thể lấy trên dữ liệu và cắt bỏ những dữ liệu cuối
    data = data.narrow(0, 0, num_batches * batch_size)
    # Evenly divide the data across the bsz batches.
    data = data.view(batch_size, -1).t().contiguous()
    return data

In [5]:
class RNN_nlp(nn.Module):
    """
    Container module with an encoder, a recurrent module, and a decoder.
    """

    def __init__(self, nonlinearity, ntoken, ninp, nhid, nlayer, dropout=0.5, tie_weights=False):
        super(RNN_nlp, self).__init__()
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.rnn = nn.RNN(ninp, nhid, nlayer, nonlinearity=nonlinearity, dropout=dropout)
        self.decoder = nn.Linear(nhid, ntoken)

        # Optionally tie weights as in:
        # "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016)
        # https://arxiv.org/abs/1608.05859
        # and
        # "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 2016)
        # https://arxiv.org/abs/1611.01462
        if tie_weights:
            if nhid != ninp:
                raise ValueError('When using the tied flag, nhid must be equal to emsize')
            self.decoder.weight = self.encoder.weight

        self._init_weights()

        self.nonlinearity = nonlinearity
        self.nhid = nhid
        self.nlayers = nlayer

    def _init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.fill_(0)
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, input, hidden):
        emb = self.drop(self.encoder(input))
        output, hidden = self.rnn(emb, hidden)
        output = self.drop(output)
        decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))
        return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden

    def init_hidden(self, bsz):
        weight = next(self.parameters()).data
        return weight.new_zeros(self.nlayers, bsz, self.nhid)

In [6]:
def get_batch(data, i, stride, evaluation=False):
    seq_len = min(stride, len(data) - 1 - i)
    inputs = data[i : i + seq_len]
    targets = data[i + 1 : i + 1 + seq_len].view(-1)

    if evaluation:
        # Đảm bảo không cần theo dõi gradient
        with torch.no_grad():
            inputs = inputs.clone()
            targets = targets.clone()

    return inputs, targets

In [None]:
def train(model, train_data, criterion, optimize, num_epochs=5, **train_params):
    batch_size = train_params.get("batch_size", train_data.size(0))
    n_token = train_params.get("n_token", model.decoder.out_features)
    stride = train_params.get("stride", 32)
    log_interval = train_params.get("log_interval", 500)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    
    model.train()
    for epoch in range(num_epochs):
        start_time = time.time()
        hidden = model.init_hidden(bsz=batch_size)

        for batch, i in enumerate(tqdm(range(0, train_data.size(0), stride), desc=f"Epoch {epoch}|", unit=" batchs")):
            inputs, target = get_batch(data=train_data, i=i, stride=stride)
            inputs, target = inputs.to(device=device), target.to(device=device)
            print(inputs.shape, hidden.shape)

            model.zero_grad()
            hidden = hidden.detach()

            outputs, hidden = model(inputs, hidden)
            print(outputs.shape, target.view(-1).shape)
            loss = criterion(outputs.view(-1, n_token), target.view(-1))

            if batch % log_interval == 0:
                elapsed = time.time() - start_time
                print(
                    f"Epoch {epoch+1}/{num_epochs} | Batch {batch}/{int(len(train_data) / stride)} | "
                    f"ms/batch: {elapsed:.2f} | loss: {loss.item():.4f} | ppl: {torch.exp(loss).item():.4f}"
                )
                start_time = time.time()  # Reset timer after logging
            return 0
            loss.backward()
            optimize.step()
            optimize.zero_grad()

In [8]:
if os.path.exists(os.path.join(root_path, "data/wikitext-103/")):
    print("Data exited")
else:
    print("Loading data set ...")
    ds = datasets.load_dataset("Salesforce/wikitext", "wikitext-103-raw-v1")
    for type_ds in ["train", "validation", "test"]:
        lines = ds[type_ds]["text"]
        if not os.path.exists(os.path.join(root_path, "data/wikitext-103")):
            os.makedirs(os.path.join(root_path, "data/wikitext-103"))
        with open(os.path.join(root_path, f"data/wikitext-103/{type_ds}.txt"), "w", encoding="utf-8") as f:
            for line in tqdm(lines, desc=f"Saving {type_ds}.txt", unit=" lines"):
                f.write(line)
        f.close()

if os.path.exists(os.path.join(root_path, "data/wikitext-103/corpus.pt")):
    print("Loading corpus ...")
    corpus = torch.load(os.path.join(root_path, "data/wikitext-103/corpus.pt"))
else:
    print("Creating corpus ...")
    corpus = Corpus(path=os.path.join(root_path, "data/wikitext-103"))
    torch.save(corpus, os.path.join(root_path, "data/wikitext-103/corpus.pt"))

train_data = batchify(data=corpus.train, batch_size=128)

Data exited
Loading corpus ...


  corpus = torch.load(os.path.join(root_path, "data/wikitext-103/corpus.pt"))


In [14]:
with open(os.path.join(root_path, "config/model_params.yaml"),  "r") as f:
    model_params = yaml.safe_load(f)
f.close()
model_params['ntoken'] = corpus.vocab_size
batch_size = 128
model = RNN_nlp(**model_params)

with open(os.path.join(root_path, "config/train_params.yaml"), "r") as f:
    train_params = yaml.safe_load(f)

train_params.update(
    {
        "criterion": nn.CrossEntropyLoss(),
        "optimize": torch.optim.SGD(model.parameters(), lr=float(train_params["lr"])),
        "n_token": corpus.vocab_size,
        "batch_size": batch_size,
    } 
)
train(model= model, train_data=train_data, **train_params)

Epoch 0|:   0%|          | 0/7601 [00:00<?, ? batchs/s]

torch.Size([1000, 128]) torch.Size([3, 128, 32])
torch.Size([1000, 128, 5007]) torch.Size([128000])


Epoch 0|:   0%|          | 0/7601 [00:33<?, ? batchs/s]

Epoch 1/5 | Batch 0/7600 | ms/batch: 32.87 | loss: 8.5296 | ppl: 5062.5996





0

In [15]:
train_params

{'num_epochs': 5,
 'stride': 1000,
 'lr': '1e-3',
 'batch_size': 128,
 'criterion': CrossEntropyLoss(),
 'optimize': SGD (
 Parameter Group 0
     dampening: 0
     differentiable: False
     foreach: None
     fused: None
     lr: 0.001
     maximize: False
     momentum: 0
     nesterov: False
     weight_decay: 0
 ),
 'n_token': 5007}