In [2]:
import sys
import os

# Put the directory two levels above the current working directory on sys.path
# (presumably the project root that holds `config` -- verify for your layout).
# os.path handles the platform separator; splitting the cwd on "\\" only worked
# on Windows and produced an empty root_path everywhere else.
root_path = os.path.dirname(os.path.dirname(os.getcwd())).replace(os.sep, "/")
if not root_path.endswith("/"):
    # Keep the trailing "/" that the previous string-building loop produced.
    root_path += "/"
sys.path.insert(1, root_path)

In [3]:
import config

In [4]:
# Weights & Biases settings; "wandb_flag" is False in this notebook.
wandb_params = dict(
    project_name="project_name",
    job_name="job_name",
    wandb_flag=False,
)

In [5]:
import torch

# Environment setup: every tensor in this notebook is placed on the CPU device.
env_params = dict(device=torch.device("cpu"))

In [6]:
# Trainer configuration. Later cells read "batch_size", "checkpoint_path" and
# "batch_split" from this dict.
trainer_params = dict(
    resume=False,
    batch_size=96,
    checkpoint_path="checkpoint/directory/smoe.pt",
    full_eval_mode=False,
    nb_batches_per_iter=1000,
    batch_split=2,
)

In [19]:
# Dataset configuration; a later cell adds "vocab_size" to this dict.
# NOTE(review): these values name text8, while the corpus built in this
# notebook comes from WikiText-103 -- confirm which dataset is intended.
data_params = dict(data_path="data/text8", data_name="text8")

In [8]:
def _tokenize(ds, type_ds, dictionary_to_update):
    nb_tokens_in_dictionary = len(dictionary_to_update)

    # Count nb of tokens in text and update the dictionary
    for (
        i,
        line,
    ) in enumerate(ds[type_ds]["text"]):
        if i == 10:
            break
        tokens = line.split() + ["<eos>"]
        for token in tokens:
            if token not in dictionary_to_update:
                dictionary_to_update[token] = nb_tokens_in_dictionary
                nb_tokens_in_dictionary += 1

    # Assign to each token its identifier
    ids = []
    for (
        i,
        line,
    ) in enumerate(ds[type_ds]["text"]):
        if i == 10:
            break
        tokens = line.split() + ["<eos>"]
        for token in tokens:
            ids.append(dictionary_to_update[token])
    ids = torch.LongTensor(ids)
    return ids

In [9]:
from datasets import load_dataset

# Load the "wikitext-103-raw-v1" configuration of the "Salesforce/wikitext" dataset.
# Later cells read the "train", "validation" and "test" splits and their "text" column.
ds = load_dataset("Salesforce/wikitext", "wikitext-103-raw-v1")

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
class Corpus:
    """Token-id tensors for the train/validation/test splits plus their shared vocabulary."""

    def __init__(self, ds):
        """Tokenize the three splits of ``ds`` in order, growing one shared dictionary."""
        self._dictionary = {}

        def encode(split_name):
            # All splits update the same token -> id dictionary.
            return _tokenize(
                ds=ds, type_ds=split_name, dictionary_to_update=self._dictionary
            )

        self.train = encode("train")
        self.valid = encode("validation")
        self.test = encode("test")

    @property
    def vocab_size(self):
        """Number of distinct tokens collected across the splits."""
        vocabulary = self._dictionary
        return len(vocabulary)

In [11]:
# File the tokenized corpus is saved to, resolved under config.ROOT_PATH.
corpus_path = os.path.join(config.ROOT_PATH, "data/raw/wikitext-103.pt")
corpus_path

'c:/Users/Admin/OneDrive - Hanoi University of Science and Technology/DANC/source_code/my_source/data/raw/wikitext-103.pt'

In [12]:
# Tokenize the train/validation/test splits of `ds`; the shared vocabulary is built along the way.
corpus = Corpus(ds=ds)

In [20]:
# Record the vocabulary size next to the other dataset settings.
data_params.update(vocab_size=corpus.vocab_size)

In [21]:
# torch.save does not create missing parent directories, so make sure the
# target folder exists first (matters on a fresh checkout).
os.makedirs(os.path.dirname(corpus_path), exist_ok=True)
# NOTE(review): this serializes the notebook-defined Corpus object itself;
# loading the file later presumably needs the same class available -- confirm.
torch.save(corpus, corpus_path)

In [22]:
# Build the token matrix in two steps.
# Step 1: drop the trailing elements so the array length is divisible by batch_size.
# Step 2: reshape the trimmed array into batch_size rows, each row a contiguous slice of the token stream.


def _batchify(data_tensor, batch_size):
    # Work out how cleanly we can divide the dataset into bsz parts.
    nb_batches = data_tensor.size(0) // batch_size
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data_tensor = data_tensor.narrow(0, 0, nb_batches * batch_size)
    # Evenly divide the data across the bsz batches.
    data_tensor = data_tensor.view(batch_size, -1).contiguous()
    return data_tensor


def _get_train_val_test_data(
    corpus: "Corpus", batch_size: int
) -> "list[torch.Tensor]":
    """Batchify the train, validation and test token streams of ``corpus``.

    Args:
        corpus: Object exposing ``train``, ``valid`` and ``test`` tensors.
        batch_size: Number of rows each split is reshaped into.

    Returns:
        A list ``[train, valid, test]`` holding the result of ``_batchify`` for
        each split (the previous annotation wrongly declared a single tensor).
    """
    return [
        _batchify(split, batch_size)
        for split in (corpus.train, corpus.valid, corpus.test)
    ]

In [23]:
# Number of rows used when reshaping each split, taken from the trainer configuration.
batch_size = trainer_params["batch_size"]
batch_size

96

In [24]:
# Reshape every split into batch_size rows, then print the three tensors one per line.
train_data, val_data, test_data = _get_train_val_test_data(
    corpus=corpus,
    batch_size=batch_size,
)
print(train_data, val_data, test_data, sep="\n")

tensor([[  0,   1,   2,   3,   4],
        [  1,   0,   0,   5,   6],
        [  2,   7,   8,   9,   3],
        [ 10,  11,   8,  12,  13],
        [ 14,  15,   2,  16,  17],
        [ 18,   7,  19,  13,  20],
        [ 21,  22,  23,   2,   3],
        [  4,  24,  25,  13,  26],
        [ 27,  28,  29,  30,  31],
        [ 32,  33,  34,  35,  36],
        [ 37,  38,  39,  17,  40],
        [ 41,  15,  42,  43,  44],
        [ 45,  43,  25,  13,  46],
        [ 26,  17,  47,  33,  43],
        [ 17,   2,  48,  15,  49],
        [ 17,  50,  51,  16,  28],
        [ 37,  52,  30,  53,  54],
        [ 23,  55,  56,  13,  17],
        [ 57,  58,  59,  22,  17],
        [ 60,  33,  37,  61,  17],
        [ 62,  63,  62,  13,  27],
        [ 64,  65,  66,  67,  17],
        [ 68,  16,  69,  70,  17],
        [ 71,  72,  73,  74,  75],
        [ 76,  77,  78,  37,  79],
        [ 80,  81,  17,  82,  66],
        [ 62,  83,  84,  62,  15],
        [  0,  85,  33,  86,  87],
        [ 43,  88,  

In [25]:
# Move all three splits to the configured device.
device = env_params["device"]
train_data, val_data, test_data = (
    split.to(device) for split in (train_data, val_data, test_data)
)

# Logging setup

In [26]:
# Checkpoint file name (text after the last "/") cut at its first ".".
fold_name = trainer_params["checkpoint_path"].rpartition("/")[2].partition(".")[0]
fold_name

'smoe'

In [27]:
# Directory part of the checkpoint path: everything before the last "/".
folder_path = trainer_params["checkpoint_path"].rpartition("/")[0]
folder_path

'checkpoint/directory'

In [28]:
import shutil
import functools


def _logging(s, log_path, print_=True, log_=True):
    if print_:
        print(s)
    if log_:
        with open(log_path, "a+") as f_log:
            f_log.write(s + "\n")


def get_logger(log_path, **kwargs):
    """Return ``_logging`` with ``log_path`` and any extra keyword options pre-bound."""
    bound_options = {"log_path": log_path}
    bound_options.update(kwargs)
    return functools.partial(_logging, **bound_options)


def create_exp_dir(dir_path, scripts_to_save=None, debug=False):
    """Create an experiment directory and return a logger that writes into it.

    Args:
        dir_path: Directory to create (missing parent directories included).
        scripts_to_save: Optional iterable of file paths copied into
            ``<dir_path>/scripts``.
        debug: When true, nothing is created and the returned logger only
            prints.

    Returns:
        A callable ``logger(s, ...)``. Outside debug mode it prints and appends
        to ``<dir_path>/log.txt``.
    """
    if debug:
        print("Debug Mode : no experiment dir created")
        return get_logger(log_path=None, log_=False)

    # exist_ok avoids the check-then-create race of exists() followed by makedirs().
    os.makedirs(dir_path, exist_ok=True)

    print("Experiment dir : {}".format(dir_path))
    if scripts_to_save is not None:
        script_path = os.path.join(dir_path, "scripts")
        os.makedirs(script_path, exist_ok=True)
        for script in scripts_to_save:
            dst_file = os.path.join(script_path, os.path.basename(script))
            shutil.copyfile(script, dst_file)

    return get_logger(log_path=os.path.join(dir_path, "log.txt"))

In [29]:
# Create <ROOT_PATH>/<folder_path>/experiments/<fold_name> and get a logger bound to its log.txt.
# NOTE(review): the variable name `logging` matches the standard-library module name;
# this would shadow it if that module were ever imported in this notebook.
logging = create_exp_dir(
    os.path.join(config.ROOT_PATH, f"{folder_path}/experiments/{fold_name}")
)
logging

Experiment dir : c:/Users/Admin/OneDrive - Hanoi University of Science and Technology/DANC/source_code/my_source/checkpoint/directory/experiments/smoe


functools.partial(<function _logging at 0x000001508511F380>, log_path='c:/Users/Admin/OneDrive - Hanoi University of Science and Technology/DANC/source_code/my_source/checkpoint/directory/experiments/smoe\\log.txt')

In [30]:
import datetime

# Write the current timestamp through the logger created above.
current_time = datetime.datetime.now()

# Converted to str because _logging concatenates the message with "\n".
logging(str(current_time))

2025-01-09 23:12:15.550265


In [31]:
# Display the trainer configuration for reference.
trainer_params

{'resume': False,
 'batch_size': 96,
 'checkpoint_path': 'checkpoint/directory/smoe.pt',
 'full_eval_mode': False,
 'nb_batches_per_iter': 1000,
 'batch_split': 2}

# Model

In [65]:
# Model hyper-parameters. Later cells read "block_size", "hidden_size",
# "architecture" and "base_arch" from this dict.
model_params = dict(
    block_size=256,
    hidden_size=128,
    architecture="sgsgsg",
    base_arch="transformer",
)

In [31]:
# Two position counters, both starting at 0; the first one is used as the
# training position in a later cell.
data_pos = [0, 0]
data_pos, data_pos[0]

([0, 0], 0)

In [40]:
# Number of chunks the rows of a batch are divided into (see the split_size cell below).
batch_split = trainer_params["batch_split"]
batch_split

2

In [34]:
# Cut a window of up to block_size columns from the training matrix.
train_pos = data_pos[0]
block_size = model_params["block_size"]
# X is the window starting at train_pos; Y is the same window shifted one column
# to the right (presumably the next-token targets -- TODO confirm).
# NOTE(review): when fewer than block_size + 1 columns remain, slicing clips
# silently and Y ends up one column shorter than X.
X = train_data[:, train_pos : train_pos + block_size].contiguous()
Y = train_data[:, train_pos + 1 : train_pos + block_size + 1].contiguous()
X, Y, X.shape, Y.shape

(tensor([[  0,   1,   2,   3,   4],
         [  1,   0,   0,   5,   6],
         [  2,   7,   8,   9,   3],
         [ 10,  11,   8,  12,  13],
         [ 14,  15,   2,  16,  17],
         [ 18,   7,  19,  13,  20],
         [ 21,  22,  23,   2,   3],
         [  4,  24,  25,  13,  26],
         [ 27,  28,  29,  30,  31],
         [ 32,  33,  34,  35,  36],
         [ 37,  38,  39,  17,  40],
         [ 41,  15,  42,  43,  44],
         [ 45,  43,  25,  13,  46],
         [ 26,  17,  47,  33,  43],
         [ 17,   2,  48,  15,  49],
         [ 17,  50,  51,  16,  28],
         [ 37,  52,  30,  53,  54],
         [ 23,  55,  56,  13,  17],
         [ 57,  58,  59,  22,  17],
         [ 60,  33,  37,  61,  17],
         [ 62,  63,  62,  13,  27],
         [ 64,  65,  66,  67,  17],
         [ 68,  16,  69,  70,  17],
         [ 71,  72,  73,  74,  75],
         [ 76,  77,  78,  37,  79],
         [ 80,  81,  17,  82,  66],
         [ 62,  83,  84,  62,  15],
         [  0,  85,  33,  86

In [41]:
# Remainder of the row count divided by batch_split; 0 means the rows split evenly.
X.size(0) % batch_split

0

In [42]:
# Number of rows in each of the batch_split chunks.
split_size = X.size(0) // batch_split
split_size

48

In [46]:
# Row range covered by chunk number split_ind (here the first chunk).
split_ind = 0
split_slice = slice(split_ind * split_size, (split_ind + 1) * split_size)
split_slice

slice(0, 48, None)

In [49]:
# Keep only the rows of the selected chunk.
# NOTE(review): X and Y are overwritten here, so re-running this cell slices
# the already-sliced tensors again.
X, Y = X[split_slice], Y[split_slice]
X, Y, X.shape, Y.shape

(tensor([[  0,   1,   2,   3,   4],
         [  1,   0,   0,   5,   6],
         [  2,   7,   8,   9,   3],
         [ 10,  11,   8,  12,  13],
         [ 14,  15,   2,  16,  17],
         [ 18,   7,  19,  13,  20],
         [ 21,  22,  23,   2,   3],
         [  4,  24,  25,  13,  26],
         [ 27,  28,  29,  30,  31],
         [ 32,  33,  34,  35,  36],
         [ 37,  38,  39,  17,  40],
         [ 41,  15,  42,  43,  44],
         [ 45,  43,  25,  13,  46],
         [ 26,  17,  47,  33,  43],
         [ 17,   2,  48,  15,  49],
         [ 17,  50,  51,  16,  28],
         [ 37,  52,  30,  53,  54],
         [ 23,  55,  56,  13,  17],
         [ 57,  58,  59,  22,  17],
         [ 60,  33,  37,  61,  17],
         [ 62,  63,  62,  13,  27],
         [ 64,  65,  66,  67,  17],
         [ 68,  16,  69,  70,  17],
         [ 71,  72,  73,  74,  75],
         [ 76,  77,  78,  37,  79],
         [ 80,  81,  17,  82,  66],
         [ 62,  83,  84,  62,  15],
         [  0,  85,  33,  86

In [57]:
# Display the model and dataset settings side by side.
model_params, data_params

({'block_size': 256, 'hide_size': 128},
 {'data_path': 'data/text8', 'data_name': 'text8', 'vocab_size': 532})

In [62]:
import torch.nn as nn

# Embed the token ids in X with an embedding table of vocab_size rows and hidden_size columns.
# NOTE(review): no random seed is set in the visible cells, so the embedding
# values differ between runs.
vocab_size, hidden_size = data_params["vocab_size"], model_params["hidden_size"]
in_emb = nn.Embedding(num_embeddings=vocab_size, embedding_dim=hidden_size)
h = in_emb(X)
h, h.shape

(tensor([[[ 9.5447e-01, -3.0617e-01, -3.8776e-01,  ...,  7.8093e-01,
            3.5337e-01,  1.4196e+00],
          [-2.0225e-02, -3.6762e-01, -7.7904e-01,  ..., -3.7238e-01,
           -1.7133e+00,  2.0066e-02],
          [-6.4545e-01,  3.1715e-01, -5.5263e-01,  ..., -6.2707e-01,
            1.4005e+00, -2.5901e+00],
          [ 3.9238e-01, -1.1544e+00,  3.0694e-01,  ...,  2.5951e+00,
            7.2486e-01, -5.3140e-01],
          [-9.4617e-01,  9.0296e-01, -1.6938e+00,  ...,  2.5571e-01,
           -1.9398e+00,  9.6136e-01]],
 
         [[-2.0225e-02, -3.6762e-01, -7.7904e-01,  ..., -3.7238e-01,
           -1.7133e+00,  2.0066e-02],
          [ 9.5447e-01, -3.0617e-01, -3.8776e-01,  ...,  7.8093e-01,
            3.5337e-01,  1.4196e+00],
          [ 9.5447e-01, -3.0617e-01, -3.8776e-01,  ...,  7.8093e-01,
            3.5337e-01,  1.4196e+00],
          [-1.7920e+00,  4.3585e-01, -1.0626e+00,  ...,  3.7680e-01,
            8.7776e-01,  1.1222e+00],
          [-9.9017e-01, -1.6931e+0

In [64]:
# Three independent all-zero tensors with the same shape and dtype as h.
moment = tuple(torch.zeros_like(h) for _ in range(3))
moment

(tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]],
 
         [[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]],
 
         [[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]],
 
         ...,
 
         [[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]],
 
         [[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
      

## TransformerSeqLayer

In [67]:
# Read the architecture description from the model settings.
base_arch = model_params["base_arch"]
arch = model_params["architecture"]
# Character at index 2 * 0 of the architecture string.
# Presumably layer i is described by the characters at 2 * i and 2 * i + 1 -- TODO confirm.
s = arch[2 * 0]
s

's'

In [None]:
# Concatenate h_cache and h along the first dimension (torch.cat's default).
# NOTE(review): h_cache is not defined in any cell visible here and this cell was
# never executed (In [None]); it raises NameError unless h_cache is defined elsewhere.
h_all = torch.cat([h_cache, h])