In [1]:
import sys
import os
sys.path.append("../")

from data_processors.dataset import load_as_hf_dataset
from config_utils.load_config import load_params_from_yaml, DataParamsSchema


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch

In [3]:
dataset_params = load_params_from_yaml("dataset_params.yaml", DataParamsSchema)
dataset_params.data_params

DataParams(tokenizer_name='bert-base-uncased', train_data_path='data/processed/train.csv', validate_data_path='data/processed/valid.csv', test_data_path='data/processed/test.csv', subreddit1='music', subreddit2='gaming', mask_prob=0.15)

In [4]:
from pathlib import Path
root_dir = Path().resolve().parents[0]

In [5]:
from datasets import Dataset
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained(dataset_params.data_params.tokenizer_name)
dataset = load_as_hf_dataset(root_dir / dataset_params.load_params.raw_data_path)



In [6]:
dataset[:3]

{'title': ['The one feature the iPad is really missing.',
  "Dear Sydney Reddit'ers, Would you like any changes made to the style of this subreddit?",
  'I skipped bail, ran away, and never got caught. AM(A)A.'],
 'body': ["I don't care about the lack of camera. I never use the one on my MacBook, and even if I did the angle would be terrible on the iPad.\n\nI don't care if third party apps can't run in the background. I don't listen to streaming music.\n\nI don't care that the App Store is a closed system. I can jailbreak for myself and I think the closed system works better for most users.\n\nThe one feature I want is User Accounts and a Guest Account. If this device is meant to be a coffee table computer, it needs to be able to accomadate multiple users.",
  'I was going to subtly edit the style of the Sydney subreddit but then I found this post and realised that people have very strong opinions about how their reddit should look. \n\n\n\nSo before I make any changes do you have any 

In [7]:
text = dataset["body"]
text

["I don't care about the lack of camera. I never use the one on my MacBook, and even if I did the angle would be terrible on the iPad.\n\nI don't care if third party apps can't run in the background. I don't listen to streaming music.\n\nI don't care that the App Store is a closed system. I can jailbreak for myself and I think the closed system works better for most users.\n\nThe one feature I want is User Accounts and a Guest Account. If this device is meant to be a coffee table computer, it needs to be able to accomadate multiple users.",
 'I was going to subtly edit the style of the Sydney subreddit but then I found this post and realised that people have very strong opinions about how their reddit should look. \n\n\n\nSo before I make any changes do you have any opinions or suggestions?',
 "Long/short story, I went to work in the United States in the last 90s and was busted in a major drug raid. I risked up to lifetime in jail if caught since I was associated with so many crimes; a

In [8]:
import numpy as np

np.mean(np.array([len(x) for x in text]))

# max_len 512 думаю хватит

616.981018981019

In [9]:
inputs = tokenizer(
    text, 
    return_tensors='pt', 
    max_length=512, 
    truncation=True, 
    padding='max_length')

In [10]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [11]:
inputs

{'input_ids': tensor([[ 101, 1045, 2123,  ...,    0,    0,    0],
        [ 101, 1045, 2001,  ...,    0,    0,    0],
        [ 101, 2146, 1013,  ..., 2031, 2246,  102],
        ...,
        [ 101, 2045, 2024,  ...,    0,    0,    0],
        [ 101, 2003, 2009,  ...,    0,    0,    0],
        [ 101, 1045, 2215,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [12]:
tokenizer.special_tokens_map["mask_token"]

'[MASK]'

In [13]:
inputs['input_ids'].shape

torch.Size([1001, 512])

In [14]:
# create random array of floats in equal dimension to input_ids
rand = torch.rand(inputs.input_ids.shape)
rand

tensor([[0.3330, 0.9380, 0.9112,  ..., 0.1385, 0.9758, 0.3696],
        [0.3008, 0.4528, 0.8277,  ..., 0.3094, 0.2432, 0.1220],
        [0.9802, 0.9555, 0.1588,  ..., 0.8469, 0.6594, 0.9235],
        ...,
        [0.8178, 0.0125, 0.7946,  ..., 0.5088, 0.8331, 0.5948],
        [0.0096, 0.3997, 0.7274,  ..., 0.3286, 0.0833, 0.0826],
        [0.8712, 0.3175, 0.1232,  ..., 0.4938, 0.0904, 0.5452]])

In [15]:
dataset_params.data_params.mask_prob

0.15

In [16]:
tokenizer.cls_token_id

101

In [17]:
tokenizer.mask_token_id

103

In [18]:
tokenizer.sep_token_id

102

In [19]:
# Создаем маску, исключая CLS, SEP и паддинг (0)
mask_arr = (rand < dataset_params.data_params.mask_prob) * \
            (inputs.input_ids != tokenizer.cls_token_id) * \
            (inputs.input_ids != tokenizer.sep_token_id) * \
            (inputs.input_ids != 0)  
mask_arr

tensor([[False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        ...,
        [False,  True, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False,  True,  ..., False, False, False]])

In [20]:
selection = torch.flatten((mask_arr[0]).nonzero()).tolist()
selection

[8,
 12,
 14,
 18,
 31,
 33,
 40,
 41,
 42,
 44,
 46,
 50,
 58,
 60,
 64,
 68,
 69,
 78,
 80,
 81,
 86,
 93,
 100,
 106,
 108,
 109,
 118,
 121,
 123]

In [21]:
inputs['labels'] = inputs.input_ids.detach().clone()
inputs

{'input_ids': tensor([[ 101, 1045, 2123,  ...,    0,    0,    0],
        [ 101, 1045, 2001,  ...,    0,    0,    0],
        [ 101, 2146, 1013,  ..., 2031, 2246,  102],
        ...,
        [ 101, 2045, 2024,  ...,    0,    0,    0],
        [ 101, 2003, 2009,  ...,    0,    0,    0],
        [ 101, 1045, 2215,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[ 101, 1045, 2123,  ...,    0,    0,    0],
        [ 101, 1045, 2001,  ...,    0,    0,    0],
        [ 101, 2146, 1013,  ..., 2031, 2246,  102],
        ...,
      

In [22]:
for i in range(inputs.input_ids.shape[0]):
    selection = torch.flatten((mask_arr[i]).nonzero()).tolist()
    inputs.input_ids[i, selection] = tokenizer.mask_token_id
inputs

{'input_ids': tensor([[ 101, 1045, 2123,  ...,    0,    0,    0],
        [ 101, 1045, 2001,  ...,    0,    0,    0],
        [ 101, 2146, 1013,  ..., 2031, 2246,  102],
        ...,
        [ 101,  103, 2024,  ...,    0,    0,    0],
        [ 101, 2003, 2009,  ...,    0,    0,    0],
        [ 101, 1045,  103,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[ 101, 1045, 2123,  ...,    0,    0,    0],
        [ 101, 1045, 2001,  ...,    0,    0,    0],
        [ 101, 2146, 1013,  ..., 2031, 2246,  102],
        ...,
      

In [23]:
inputs["input_ids"][0]

tensor([  101,  1045,  2123,  1005,  1056,  2729,  2055,  1996,   103,  1997,
         4950,  1012,   103,  2196,   103,  1996,  2028,  2006,   103,  6097,
         8654,  1010,  1998,  2130,  2065,  1045,  2106,  1996,  6466,  2052,
         2022,   103,  2006,   103, 25249,  1012,  1045,  2123,  1005,  1056,
          103,   103,   103,  2283,   103,  2064,   103,  1056,  2448,  1999,
          103,  4281,  1012,  1045,  2123,  1005,  1056,  4952,   103, 11058,
          103,  1012,  1045,  2123,   103,  1056,  2729,  2008,   103,   103,
         3573,  2003,  1037,  2701,  2291,  1012,  1045,  2064,   103, 23890,
          103,   103,  1998,  1045,  2228,  1996,   103,  2291,  2573,  2488,
         2005,  2087,  5198,   103,  1996,  2028,  3444,  1045,  2215,  2003,
          103,  6115,  1998,  1037,  4113,  4070,   103,  2065,   103,   103,
         2003,  3214,  2000,  2022,  1037,  4157,  2795,  3274,   103,  2009,
         3791,   103,  2022,   103,  2000, 16222,  9626, 13701, 

In [24]:
inputs["input_ids"][1]

tensor([  101,  1045,  2001,  2183,  2000, 28797, 10086,  1996,  2806,  1997,
         1996,  3994,  4942,  5596, 23194,  2021,  2059,  1045,  2179,  2023,
         2695,  1998, 11323,  2008,  2111,   103,  2200,  2844, 10740,  2055,
         2129,  2037,  2417, 23194,  2323,  2298,  1012,  2061,  2077,  1045,
         2191,  2151,  3431,   103,  2017,  2031,  2151, 10740,  2030, 15690,
         1029,   102,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

In [25]:
# final
inputs = tokenizer(
    text, 
    return_tensors='pt', 
    max_length=512, 
    truncation=True, 
    padding='max_length')

inputs['labels'] = inputs.input_ids.detach().clone()

# create random array of floats in equal dimension to input_ids
rand = torch.rand(inputs.input_ids.shape)

# create mask, without CLS, SEP and padding (0)
mask_arr = (rand < dataset_params.data_params.mask_prob) * \
            (inputs.input_ids != tokenizer.cls_token_id) * \
            (inputs.input_ids != tokenizer.sep_token_id) * \
            (inputs.input_ids != 0)  

selection = torch.flatten((mask_arr[0]).nonzero()).tolist()

for i in range(inputs.input_ids.shape[0]):
    selection = torch.flatten((mask_arr[i]).nonzero()).tolist()
    inputs.input_ids[i, selection] = tokenizer.mask_token_id

In [26]:
inputs["input_ids"][4]

tensor([  101,  1045,  2572,  1037,  4121,  2189,  5470,  1010,  1045,  2428,
         2572,   103,  2000,  2505,  2104,   103,  3103,  1012,  2021,   103,
         1045,  2572,  2108,  6135,  7481,   103,  2398,  2091,  5440,  4996,
         2024, 13569,  4095,   103,   103,  8808,  5043,  1010,   103,  2061,
         5743,  1998,  2061,  2006,  1012,  1045,  2001,  2074,  2667,  2000,
         2131,   103, 12369,  2004,  2000,  2339,  1006,  3227,  4092,  1007,
         2111,  2123,  1005,  1056,   103,   103,  1013,  2189,  1997,  2023,
         2828,  1037,  4189,  6073,  1012,   102,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

In [27]:
import torch

torch.save(inputs, 'tokenized_dataset.pt')


In [28]:
inputs_loaded = torch.load('tokenized_dataset.pt')

  inputs_loaded = torch.load('tokenized_dataset.pt')


In [29]:
torch.flatten((inputs_loaded["input_ids"][0] == inputs["input_ids"][0]).nonzero()).shape

torch.Size([512])

In [30]:
tokenizer.vocab_size

30522

# Train model

In [31]:
from config_utils.load_config import load_params_from_yaml, ModelParamsSchema

model_params = load_params_from_yaml("model_params.yaml", ModelParamsSchema)

In [32]:
from model.model_main import MoETransformerEncoder

model = MoETransformerEncoder(**model_params.__dict__)

In [33]:
model

MoETransformerEncoder(
  (input_emb): InputEmbedding(
    (input_embedding): Embedding(30522, 256)
    (positional_encoding): PositionalEncoding()
  )
  (moe_transformer): Sequential(
    (0): EncoderBlock(
      (multi_head_attention): MultiHeadAttention_Parallel(
        (key): Linear(in_features=256, out_features=4096, bias=False)
        (query): Linear(in_features=256, out_features=4096, bias=False)
        (value): Linear(in_features=256, out_features=4096, bias=False)
        (dropout): Dropout(p=0.1, inplace=False)
        (proj): Linear(in_features=4096, out_features=256, bias=True)
      )
      (layer_norm1): LayerNorm()
      (layer_norm2): LayerNorm()
      (moe_block): MoELayer(
        (gate): Gate(
          (gate): Linear(in_features=256, out_features=4, bias=True)
        )
        (experts): ModuleList(
          (0-3): 4 x PositionwiseFeedForward(
            (linear1): Linear(in_features=256, out_features=512, bias=True)
            (linear2): Linear(in_features=51

In [34]:
type(inputs_loaded)

transformers.tokenization_utils_base.BatchEncoding

In [35]:
inputs_loaded['input_ids'].shape, inputs_loaded['attention_mask'].shape, inputs_loaded['labels'].shape

(torch.Size([1001, 512]), torch.Size([1001, 512]), torch.Size([1001, 512]))

In [36]:
import datasets
from datasets import DatasetDict

data_dict = {
    'input_ids': torch.tensor(inputs_loaded['input_ids']),
    'attention_mask': torch.tensor(inputs_loaded['attention_mask']),
    'labels': torch.tensor(inputs_loaded['labels'])
}

dataset = Dataset.from_dict(data_dict)
dataset = DatasetDict({'train': dataset})
dataset.set_format(type='torch')
dataset

  'input_ids': torch.tensor(inputs_loaded['input_ids']),
  'attention_mask': torch.tensor(inputs_loaded['attention_mask']),
  'labels': torch.tensor(inputs_loaded['labels'])


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1001
    })
})

In [37]:
from torch.utils.data import DataLoader
import yaml

# TODO: how to upload other yaml files via function
with open("../configs/train_params.yaml", "r") as f:
    train_params = yaml.safe_load(f)
train_params

{'batch_size': 16, 'n_epochs': 3, 'learning_rate': 1e-05}

In [38]:
dataloader = DataLoader(dataset['train'], batch_size=train_params['batch_size'], shuffle=True)
len(dataloader)

63

In [39]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
model.to(model_params.__dict__["device"])

MoETransformerEncoder(
  (input_emb): InputEmbedding(
    (input_embedding): Embedding(30522, 256)
    (positional_encoding): PositionalEncoding()
  )
  (moe_transformer): Sequential(
    (0): EncoderBlock(
      (multi_head_attention): MultiHeadAttention_Parallel(
        (key): Linear(in_features=256, out_features=4096, bias=False)
        (query): Linear(in_features=256, out_features=4096, bias=False)
        (value): Linear(in_features=256, out_features=4096, bias=False)
        (dropout): Dropout(p=0.1, inplace=False)
        (proj): Linear(in_features=4096, out_features=256, bias=True)
      )
      (layer_norm1): LayerNorm()
      (layer_norm2): LayerNorm()
      (moe_block): MoELayer(
        (gate): Gate(
          (gate): Linear(in_features=256, out_features=4, bias=True)
        )
        (experts): ModuleList(
          (0-3): 4 x PositionwiseFeedForward(
            (linear1): Linear(in_features=256, out_features=512, bias=True)
            (linear2): Linear(in_features=51

In [40]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=train_params["learning_rate"])

In [41]:
from tqdm import tqdm

n_epochs = train_params["n_epochs"]

for epoch in range(n_epochs):
    loop = tqdm (dataloader, leave=True)
    model.train()
    for batch in loop:
        optimizer.zero_grad()
        input_ids, attention_mask, labels = batch["input_ids"], batch["attention_mask"], batch["labels"]
        print(input_ids.shape) # torch.Size([16, 512])
        output = model(input_ids)
        print("SUCCESS")
        break
    break

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  0%|          | 0/63 [00:00<?, ?it/s]

torch.Size([16, 512])
input transform shape: torch.Size([16, 512])
input shape: torch.Size([16, 512])
embedded shape: torch.Size([16, 512])
input pos enc shape: torch.Size([16, 512, 256])
emb transfrom shape: torch.Size([16, 512])
transformer output shape: torch.Size([16, 512])


  0%|          | 0/63 [00:03<?, ?it/s]

SUCCESS



