In [1]:
!wget -qq --no-check-certificate https://raw.githubusercontent.com/Koziev/NLP_Datasets/master/Conversations/Data/extract_flibusta_dialogues.1.tar.xz -O flibusta1.tar.xz
!wget -qq --no-check-certificate https://raw.githubusercontent.com/Koziev/NLP_Datasets/master/Conversations/Data/extract_flibusta_dialogues.2.tar.xz -O flibusta2.tar.xz
!tar -xvf flibusta1.tar.xz
!tar -xvf flibusta2.tar.xz

extract_flibusta_dialogues.1.txt
extract_flibusta_dialogues.2.txt


In [2]:
!pip install -qq transformers datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m81.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m452.9/452.9 KB[0m [31m49.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 KB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m213.0/213.0 KB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.0/132.0 KB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.6/140.6 KB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[?25h

Архитектура GPT (transformer decoder):

* Входной текст токенизируется в последовательность чисел (токенов).

* Список токенов проходит через Embedding Layer (линейный слой) и превращается в последовательность эмбеддингов.

* К каждому эмбеддингу прибавляется positional embedding.

* Эмбеддинги проходят через стек декодерных слоёв без cross-attention (Transformer Decoder Block).

* Выходной эмбеддинг (равный по размерности входному), соответствующий последнему токену, матрично умножается на транспонированную матрицу Embedding Layer, и после применения SoftMax получается распределение вероятностей следующего токена.

* Из этого распределения выбираем следующий токен (например, с помощью argmax, beam search и т. д.).

* Добавляем этот токен к входному тексту и повторяем шаги 1-6.

In [3]:
import re, gc
import numpy as np

import tensorflow as tf

from transformers import create_optimizer
from transformers import AutoTokenizer, TFAutoModelForCausalLM
from transformers import DataCollatorForLanguageModeling

from datasets import Dataset

gc.enable()

In [12]:
# Run configuration shared by the cells below.
model_name = "sberbank-ai/rugpt3small_based_on_gpt2"  # checkpoint to fine-tune
data_size = 30_000  # upper bound on the number of corpus pieces kept
block_size = 128    # max_length for tokenization and for generation
batch_size = 16     # batch size of the train/validation tf datasets
num_epochs = 2      # epochs for the LR schedule and for fit()

In [5]:
# part 1
with open("extract_flibusta_dialogues.1.txt") as f:
    data = f.read()

# part 2
# with open("extract_flibusta_dialogues.2.txt") as f:
#     data += f.read()

data = data.split("\n\n\n\n")[:data_size]

In [6]:
# Load the tokenizer that belongs to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Register the end/begin/padding markers. This call is the last expression of
# the cell, so its return value is what the cell displays.
tokenizer.add_special_tokens(
    {
        "eos_token": "</s>",
        "bos_token": "<s>",
        "pad_token": "<pad>",
    }
)

Downloading:   0%|          | 0.00/608 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.71M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


0

In [7]:
# Wrap every corpus piece in a one-field record keyed by "text".
list_dataset = [dict(text=dialogue) for dialogue in data]

In [8]:
# Build the dataset from the records, then release the plain-Python copies.
ds = Dataset.from_list(list_dataset)
del list_dataset
del data
# Last expression of the cell: the collector's return value is displayed.
gc.collect()

21

In [9]:
def tokenize_function(examples, tokenizer=tokenizer):
    """Tokenize a batch of dialogues into fixed-length causal-LM features.

    Args:
        examples: batch dict handed over by ``Dataset.map(batched=True)``;
            only its ``"text"`` column (a list of strings) is read.
        tokenizer: tokenizer to apply; defaults to the notebook-level one.

    Returns:
        The tokenizer output (``input_ids`` and ``attention_mask``, padded or
        truncated to ``block_size``) with an extra ``labels`` entry of the
        same length.
    """
    # Append the end-of-sequence marker to every dialogue.
    # NOTE: texts longer than block_size are truncated, which drops the marker.
    texts = [text + tokenizer.eos_token for text in examples["text"]]
    encoded = tokenizer(
        texts,
        add_special_tokens=True,  # Only adds pad not eos and bos
        max_length=block_size,
        truncation=True,
        padding="max_length",
    )
    # Labels are the input ids themselves, NOT shifted by one: the causal-LM
    # head shifts labels internally when it computes the loss, so a manual
    # shift here would make the model predict the token two steps ahead.
    # Padding positions (attention_mask == 0) get -100, which the loss ignores.
    encoded["labels"] = [
        [token_id if is_real else -100 for token_id, is_real in zip(ids, mask)]
        for ids, mask in zip(encoded["input_ids"], encoded["attention_mask"])
    ]
    return encoded


# Tokenize the whole dataset in batches and drop the raw text column.
ds = ds.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
    load_from_cache_file=True,
)

  0%|          | 0/30 [00:00<?, ?ba/s]

In [10]:
# Expose the three feature columns as plain Python objects.
ds.set_format(
    type="python",
    columns=["input_ids", "attention_mask", "labels"],
)
# Hold out 20% of the rows for validation; the fixed seed keeps the shuffled
# split the same from run to run.
ds = ds.train_test_split(test_size=0.20, seed=42, shuffle=True, load_from_cache_file=True)

In [None]:
"""
train_tensor_inputs = tf.convert_to_tensor(ds["train"]["input_ids"])
train_tensor_labels = tf.convert_to_tensor(ds["train"]["labels"])
train_tensor_mask = tf.convert_to_tensor(ds["train"]["attention_mask"])
train = tf.data.Dataset.from_tensor_slices(
    (
        {"input_ids": train_tensor_inputs, "attention_mask": train_tensor_mask},
        train_tensor_labels,
    )
)

test_tensor_inputs = tf.convert_to_tensor(ds["test"]["input_ids"])
test_tensor_labels = tf.convert_to_tensor(ds["test"]["labels"])
test_tensor_mask = tf.convert_to_tensor(ds["test"]["attention_mask"])
test = tf.data.Dataset.from_tensor_slices(
    (
        {"input_ids": test_tensor_inputs, "attention_mask": test_tensor_mask},
        test_tensor_labels,
    )
)
"""

In [11]:
# Batch collator configured for causal (mlm=False) language modelling that
# returns TensorFlow tensors.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    return_tensors="tf",
)

In [13]:
# Load the checkpoint into the TensorFlow causal-LM class (from_pt=True asks
# for conversion from PyTorch weights) and hand over the tokenizer's
# special-token ids together with the load call.
model = TFAutoModelForCausalLM.from_pretrained(
    model_name,
    from_pt=True,
    use_cache=False,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)
# Match the embedding matrix to the tokenizer's vocabulary size; the returned
# embedding layer is what the cell displays.
model.resize_token_embeddings(len(tokenizer))

Downloading:   0%|          | 0.00/551M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFGPT2LMHeadModel: ['transformer.h.6.attn.masked_bias', 'transformer.h.3.attn.masked_bias', 'transformer.h.1.attn.masked_bias', 'transformer.h.2.attn.masked_bias', 'transformer.h.11.attn.masked_bias', 'transformer.h.4.attn.masked_bias', 'transformer.h.0.attn.masked_bias', 'transformer.h.7.attn.masked_bias', 'lm_head.weight', 'transformer.h.5.attn.masked_bias', 'transformer.h.8.attn.masked_bias', 'transformer.h.9.attn.masked_bias', 'transformer.h.10.attn.masked_bias']
- This IS expected if you are initializing TFGPT2LMHeadModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFGPT2LMHeadModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassifica

<transformers.modeling_tf_utils.TFSharedEmbeddings at 0x7fce32e56cd0>

In [14]:
# Training batches: shuffled.
tf_train_set = model.prepare_tf_dataset(
    ds["train"],
    collate_fn=data_collator,
    batch_size=batch_size,
    shuffle=True,
)

# Validation batches: original order, same batching and collation.
tf_validation_set = model.prepare_tf_dataset(
    ds["test"],
    collate_fn=data_collator,
    batch_size=batch_size,
    shuffle=False,
)

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [17]:
# Number of full training batches per epoch, times the number of epochs.
total_train_steps = num_epochs * (len(ds["train"]) // batch_size)
# Optimizer plus learning-rate schedule spanning all training steps, no warm-up.
optimizer, schedule = create_optimizer(
    num_train_steps=total_train_steps,
    num_warmup_steps=0,
    init_lr=2e-5,
)

# No loss is passed, so the model's internal loss computation is used.
model.compile(optimizer=optimizer)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [18]:
# Fine-tune on the training batches and evaluate on the validation batches
# after each epoch; the returned History object is the cell output.
model.fit(
    tf_train_set,
    epochs=num_epochs,
    validation_data=tf_validation_set,
)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fce33b32d30>

In [23]:
prompt = "- Почему ты не входишь в открытую дверь?\n-"

encoded_prompt = tokenizer.encode(prompt, return_tensors="tf")
# Number of prompt tokens; generated sequences start with the prompt, so this
# many leading tokens are stripped from each sample before decoding.
prompt_length = encoded_prompt.shape[1]

# Sample ten continuations. The stop/padding ids come from the tokenizer
# instead of hard-coded numbers so they stay correct if the vocabulary changes.
out = model.generate(
    encoded_prompt,
    max_length=block_size,
    do_sample=True,
    top_k=35,
    top_p=0.85,
    temperature=1.0,
    num_return_sequences=10,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)

for i, tokens in enumerate(out.numpy().tolist(), 1):
    text = tokenizer.decode(tokens[prompt_length:])
    # Keep everything before the first end-of-sequence marker. partition()
    # returns the whole text when the marker is absent (a sample that hit
    # max_length), where str.index() would raise ValueError.
    reply = text.partition(tokenizer.eos_token)[0]
    print(f"Answer {i}:\t{reply}")

Answer 1:	 Не хочу
Answer 2:	 Она открыта, и я не хочу видеть твою мать.
- Мне тоже жаль, я люблю тебя.
Answer 3:	 Я же сказала, что не буду мешать.
Answer 4:	 Ты не можешь меня позвать, потому что у меня нет ключа.
Answer 5:	 Потому что это опасно
- Ты боишься?
Answer 6:	 Не знаю.
Answer 7:	 Потому что боюсь, что тебя найдут
Answer 8:	 Нет.
Answer 9:	 Не хочу.
Answer 10:	 Я хочу остаться в безопасности
