In [67]:
import sys
import os
sys.path.append("../")

from data_processors.dataset import load_as_hf_dataset
from config_utils.load_config import load_params_from_yaml, DataParamsSchema


In [68]:
import torch

In [69]:
# Read the dataset settings from dataset_params.yaml through the project's loader,
# passing DataParamsSchema as the schema, then display the data_params section.
# NOTE(review): the YAML path is relative — presumably resolved against the current
# working directory; confirm in config_utils.load_config.
dataset_params = load_params_from_yaml("dataset_params.yaml", DataParamsSchema)
dataset_params.data_params

DataParams(tokenizer_name='bert-base-uncased', train_data_path='data/processed/train.csv', validate_data_path='data/processed/valid.csv', test_data_path='data/processed/test.csv', subreddit1='music', subreddit2='gaming', mask_prob=0.15)

In [70]:
from pathlib import Path
# Parent of the resolved current working directory; used as the base directory when
# building the raw-data path.
root_dir = Path().resolve().parents[0]

In [71]:
from datasets import Dataset
from transformers import BertTokenizerFast

# Fast BERT tokenizer, selected by the tokenizer_name configured in data_params.
tokenizer = BertTokenizerFast.from_pretrained(dataset_params.data_params.tokenizer_name)
# Load the raw data from the configured raw_data_path, taken relative to root_dir.
dataset = load_as_hf_dataset(root_dir / dataset_params.load_params.raw_data_path)

In [None]:
# Preview the first three entries of the loaded dataset.
dataset[:3]

In [None]:
# Pull the "body" column; this is the text that gets tokenized.
text = dataset["body"]
# Show only a few entries — echoing the whole column floods the cell output.
text[:3]

In [115]:
# Tokenize every text into fixed-length PyTorch tensors: sequences longer than 512
# tokens are truncated and shorter ones are padded up to 512.
inputs = tokenizer(
    text,
    padding="max_length",
    truncation=True,
    max_length=512,
    return_tensors="pt",
)

In [29]:
# List the fields returned by the tokenizer call.
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [116]:
# Display the whole tokenizer output (the tensor printouts are abbreviated).
inputs

{'input_ids': tensor([[ 101, 1045, 2123,  ...,    0,    0,    0],
        [ 101, 1045, 2001,  ...,    0,    0,    0],
        [ 101, 2146, 1013,  ..., 2031, 2246,  102],
        ...,
        [ 101, 2045, 2024,  ...,    0,    0,    0],
        [ 101, 2003, 2009,  ...,    0,    0,    0],
        [ 101, 1045, 2215,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [28]:
# Textual form of the tokenizer's mask token.
tokenizer.special_tokens_map["mask_token"]

'[MASK]'

In [117]:
# Shape of the token-id matrix: one row per text, one column per token position.
inputs['input_ids'].shape

torch.Size([1001, 512])

In [118]:
# Draw one uniform sample in [0, 1) for every token position; comparing these draws
# with the masking probability decides which positions are candidates for masking.
rand = torch.rand(inputs["input_ids"].size())
rand

tensor([[0.2711, 0.9081, 0.3801,  ..., 0.6452, 0.3832, 0.6929],
        [0.6089, 0.0296, 0.8147,  ..., 0.8816, 0.0205, 0.8874],
        [0.2577, 0.4639, 0.8978,  ..., 0.6605, 0.1645, 0.6943],
        ...,
        [0.7656, 0.0294, 0.2186,  ..., 0.3912, 0.1415, 0.1696],
        [0.9782, 0.3017, 0.6712,  ..., 0.3487, 0.6084, 0.7261],
        [0.9038, 0.3582, 0.9781,  ..., 0.6544, 0.7492, 0.8723]])

In [36]:
# Configured masking probability: the threshold the random draws are compared against.
dataset_params.data_params.mask_prob

0.15

In [50]:
# Id of the CLS token; positions holding it are excluded from masking.
tokenizer.cls_token_id

101

In [54]:
# Id written into input_ids at every position selected for masking.
tokenizer.mask_token_id

103

In [56]:
# Id of the SEP token; positions holding it are excluded from masking.
tokenizer.sep_token_id

102

In [119]:
# Build the masking selection: a position is chosen when its random draw falls below
# mask_prob AND it does not hold a CLS, SEP or padding token.
# The padding id is taken from the tokenizer instead of a literal 0, so the filter
# stays correct if the configured tokenizer uses a different pad id.
mask_arr = (
    (rand < dataset_params.data_params.mask_prob)
    & (inputs.input_ids != tokenizer.cls_token_id)
    & (inputs.input_ids != tokenizer.sep_token_id)
    & (inputs.input_ids != tokenizer.pad_token_id)
)
mask_arr

tensor([[False, False, False,  ..., False, False, False],
        [False,  True, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        ...,
        [False,  True, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False]])

In [120]:
# Column indices of the positions selected for masking in the first sequence.
selection = mask_arr[0].nonzero().flatten().tolist()
selection

[5, 10, 15, 22, 28, 36, 63, 67, 83, 99, 116, 129]

In [121]:
# Store an independent copy of the token ids as the training targets, so the original
# ids survive when input_ids is later overwritten with the mask token.
# NOTE(review): every position keeps its real id here, so a loss computed from these
# labels would also cover unmasked and padding positions. Hugging Face MLM heads skip
# positions labelled -100 — consider that if only masked tokens should be scored.
inputs["labels"] = inputs["input_ids"].clone().detach()
inputs

{'input_ids': tensor([[ 101, 1045, 2123,  ...,    0,    0,    0],
        [ 101, 1045, 2001,  ...,    0,    0,    0],
        [ 101, 2146, 1013,  ..., 2031, 2246,  102],
        ...,
        [ 101, 2045, 2024,  ...,    0,    0,    0],
        [ 101, 2003, 2009,  ...,    0,    0,    0],
        [ 101, 1045, 2215,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[ 101, 1045, 2123,  ...,    0,    0,    0],
        [ 101, 1045, 2001,  ...,    0,    0,    0],
        [ 101, 2146, 1013,  ..., 2031, 2246,  102],
        ...,
      

In [122]:
# Overwrite every selected position with the mask token id. Boolean-mask indexing does
# this for all rows in a single in-place assignment, replacing the per-row Python loop
# and its tensor -> list round trip.
inputs.input_ids[mask_arr] = tokenizer.mask_token_id
inputs

{'input_ids': tensor([[ 101, 1045, 2123,  ...,    0,    0,    0],
        [ 101,  103, 2001,  ...,    0,    0,    0],
        [ 101, 2146, 1013,  ..., 2031, 2246,  102],
        ...,
        [ 101,  103, 2024,  ...,    0,    0,    0],
        [ 101, 2003, 2009,  ...,    0,    0,    0],
        [ 101, 1045, 2215,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[ 101, 1045, 2123,  ...,    0,    0,    0],
        [ 101, 1045, 2001,  ...,    0,    0,    0],
        [ 101, 2146, 1013,  ..., 2031, 2246,  102],
        ...,
      

In [126]:
# First sequence after masking: the selected positions now hold the mask token id.
inputs["input_ids"][0]

tensor([  101,  1045,  2123,  1005,  1056,   103,  2055,  1996,  3768,  1997,
          103,  1012,  1045,  2196,  2224,   103,  2028,  2006,  2026,  6097,
         8654,  1010,   103,  2130,  2065,  1045,  2106,  1996,   103,  2052,
         2022,  6659,  2006,  1996, 25249,  1012,   103,  2123,  1005,  1056,
         2729,  2065,  2353,  2283, 18726,  2064,  1005,  1056,  2448,  1999,
         1996,  4281,  1012,  1045,  2123,  1005,  1056,  4952,  2000, 11058,
         2189,  1012,  1045,   103,  1005,  1056,  2729,   103,  1996, 10439,
         3573,  2003,  1037,  2701,  2291,  1012,  1045,  2064,  7173, 23890,
         2005,  2870,  1998,   103,  2228,  1996,  2701,  2291,  2573,  2488,
         2005,  2087,  5198,  1012,  1996,  2028,  3444,  1045,  2215,   103,
         5310,  6115,  1998,  1037,  4113,  4070,  1012,  2065,  2023,  5080,
         2003,  3214,  2000,  2022,  1037,  4157,   103,  3274,  1010,  2009,
         3791,  2000,  2022,  2583,  2000, 16222,  9626, 13701, 

In [125]:
# Second sequence after masking; the padded tail is left untouched.
inputs["input_ids"][1]

tensor([  101,   103,  2001,  2183,  2000, 28797, 10086,  1996,   103,  1997,
         1996,  3994,  4942,  5596, 23194,  2021,   103,   103,  2179,  2023,
         2695,  1998, 11323,  2008,  2111,  2031,  2200,  2844, 10740,  2055,
         2129,   103,  2417, 23194,  2323,  2298,  1012,  2061,  2077,  1045,
         2191,  2151,  3431,  2079,  2017,  2031,  2151, 10740,  2030, 15690,
         1029,   102,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

In [144]:
# final
# Consolidated MLM preprocessing: tokenize, choose the positions to mask, build the
# labels, then overwrite the chosen positions with the mask token.
inputs = tokenizer(
    text,
    return_tensors='pt',
    max_length=512,
    truncation=True,
    padding='max_length',
)

# One uniform [0, 1) draw per token position.
rand = torch.rand(inputs.input_ids.shape)

# A position is selected when its draw is below mask_prob and it is not a CLS, SEP or
# padding token. The pad id is read from the tokenizer rather than assumed to be 0.
mask_arr = (
    (rand < dataset_params.data_params.mask_prob)
    & (inputs.input_ids != tokenizer.cls_token_id)
    & (inputs.input_ids != tokenizer.sep_token_id)
    & (inputs.input_ids != tokenizer.pad_token_id)
)

# Labels must be derived BEFORE input_ids is overwritten. Selected positions keep the
# original token id; every other position (unmasked tokens, special tokens, padding)
# is set to -100, the index the Hugging Face MLM loss ignores, so the loss is computed
# on the masked tokens only instead of being dominated by padding.
inputs['labels'] = inputs.input_ids.masked_fill(~mask_arr, -100)

# Replace all selected positions with the mask token in one in-place assignment.
inputs.input_ids[mask_arr] = tokenizer.mask_token_id




In [145]:
# Spot-check one sequence produced by the consolidated cell.
inputs["input_ids"][4]

tensor([  101,  1045,  2572,  1037,  4121,  2189,   103,  1010,  1045,   103,
         2572,  2330,  2000,   103,  2104,  1996,  3103,  1012,   103,  2065,
         1045,   103,  2108,  6135,  7481,  2026,   103,  2091,   103,  4996,
         2024, 13569,  4095,   103,  5164,  8808,  5043,  1010,  1998,  2061,
         5743,  1998,  2061,  2006,  1012,  1045,  2001,  2074,  2667,  2000,
         2131,  2070, 12369,  2004,  2000,  2339,   103,  3227,  4092,   103,
         2111,  2123,  1005,  1056,  2507,  4996,  1013,  2189,  1997,  2023,
         2828,  1037,  4189,  6073,  1012,   102,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

In [131]:
import torch

# Persist the whole tokenizer output object (ids, masks and labels) to a file in the
# current working directory.
torch.save(obj=inputs, f='tokenized_dataset.pt')


In [134]:
# Reload the saved encoding to verify the round trip.
# The file holds the pickled tokenizer-output object rather than bare tensors, so full
# unpickling is required. weights_only=False is spelled out so the call does not depend
# on the PyTorch version's default (newer releases default to weights_only=True) and
# does not emit the FutureWarning about the implicit default.
# SECURITY: full unpickling can execute arbitrary code — only load files written by
# this notebook or another trusted source.
inputs_loaded = torch.load('tokenized_dataset.pt', weights_only=False)

  inputs_loaded = torch.load('tokenized_dataset.pt')


In [142]:
# Round-trip check on the first sequence: count the positions where the reloaded ids
# equal the in-memory ids. A count equal to the sequence length means the row matches.
(
    inputs_loaded["input_ids"][0]
    .eq(inputs["input_ids"][0])
    .nonzero()
    .flatten()
    .shape
)

torch.Size([512])