In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForCausalLM

# Fine-tune a pretrained Slovak causal LM on the Facebook messages corpus.
#
# The tokenizer is loaded from the same checkpoint as the model: the
# pretrained embedding matrix is indexed by that tokenizer's token ids, so
# pairing the weights with the separately trained local "tokenizer"
# vocabulary would feed the model ids that stand for different tokens.
checkpoint = "Milos/slovak-gpt-j-162M"
output_path = "saved_model/mond/slovak-fb-msg-gpt2"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint)

# TextDataset caches the tokenized blocks next to the input file, keyed only
# by tokenizer class name, block size and file name. Rebuild the cache so ids
# produced earlier by a different vocabulary of the same tokenizer class are
# never reused.
dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="../../datasets/facebook_messages/messages.txt",
    block_size=64,
    overwrite_cache=True,
)

# mlm=False selects the causal language-modeling objective (no token masking).
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

training_args = TrainingArguments(
    output_dir="./results",
    save_steps=100,          # checkpoint every 100 optimizer steps
    num_train_epochs=3,
    per_device_train_batch_size=32,
    logging_steps=1,         # log the loss on every step
    save_total_limit=4,      # keep only the 4 most recent checkpoints
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)

trainer.train()

# Store the tokenizer beside the weights so the output directory can be
# reloaded on its own with matching token ids.
trainer.save_model(output_path)
tokenizer.save_pretrained(output_path)

In [None]:
import os

from tokenizers import ByteLevelBPETokenizer

# Train a byte-level BPE tokenizer from scratch on the messages corpus.
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(
    files=["../../datasets/facebook_messages/messages.txt"],
    vocab_size=30000,
    min_frequency=2,  # a pair must occur at least twice to be merged
    special_tokens=["<s>", "<pad>", "</s>", "<unk>"],
)

# save_model() writes vocab.json and merges.txt into an *existing* directory;
# create it first so the cell also works on a fresh checkout.
os.makedirs("tokenizer", exist_ok=True)
tokenizer.save_model("tokenizer")

In [None]:
import os

import torch

from transformers import (
    GPT2TokenizerFast, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling,
    Trainer, TrainingArguments, GPT2Config
)
from tokenizers import ByteLevelBPETokenizer

# Build everything needed to train a small GPT-2 style model from scratch on
# the messages corpus: tokenizer, model, dataset, collator and Trainer.
# Training itself is started from a separate cell.

vocab_size = 32768

# --- Tokenizer -------------------------------------------------------------
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(
    files=["../../datasets/facebook_messages/messages.txt"],
    vocab_size=vocab_size,
    min_frequency=2,  # a pair must occur at least twice to be merged
    special_tokens=["<s>", "<pad>", "</s>", "<unk>"],
)
# save_model() does not create the target directory, so make sure it exists.
os.makedirs("tokenizer", exist_ok=True)
tokenizer.save_model("tokenizer")
# Reload the saved vocabulary through the transformers tokenizer class that
# TextDataset and the data collator expect.
tokenizer = GPT2TokenizerFast.from_pretrained("tokenizer")

# --- Model -----------------------------------------------------------------
# Start from the stock "gpt2" configuration and shrink it. The stock config
# carries bos/eos token ids of 50256, which lie outside this much smaller
# vocabulary, so take the ids from the tokenizer instead. len(tokenizer) is
# used for the embedding size because it also counts tokens added on load.
config = GPT2Config.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer),
    n_positions=256,
    n_embd=512,
    n_layer=6,
    n_head=4,
    n_inner=1024,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)
model = GPT2LMHeadModel(config)  # randomly initialised, no pretrained weights

# --- Data ------------------------------------------------------------------
# TextDataset caches tokenized blocks keyed by tokenizer class name, block
# size and file name only. The tokenizer was just retrained, so force a
# rebuild instead of silently reusing ids from an older vocabulary.
dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="../../datasets/facebook_messages/messages.txt",
    block_size=64,
    overwrite_cache=True,
)
# Not used further in this cell; kept available for the rest of the notebook.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# mlm=False selects the causal language-modeling objective (no token masking).
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# --- Trainer ---------------------------------------------------------------
training_args = TrainingArguments(
    output_dir="./results",
    save_steps=100,          # checkpoint every 100 optimizer steps
    num_train_epochs=3,
    per_device_train_batch_size=32,
    logging_steps=1,         # log the loss on every step
    save_total_limit=4,      # keep only the 4 most recent checkpoints
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)

# trainer.train()

# trainer.save_model("saved_model/mond/slovak-fb-msg-gpt2")

In [None]:
# Run the training loop. Relies on `trainer` having been created by an
# earlier cell in the same kernel session.
trainer.train()

# Write the trained model to disk under the given directory.
trainer.save_model("saved_model/mond/slovak-fb-msg-gpt2")

In [None]:
# Total number of elements across all parameter tensors of `model`,
# printed with thousands separators.
print(format(sum(param.numel() for param in model.parameters()), ","))

In [None]:

import torch

from transformers import (
    DataCollatorForLanguageModeling,
    GPT2Config,
    GPT2LMHeadModel,
    GPT2TokenizerFast,
    TextDataset,
    Trainer,
    TrainingArguments,
)

# Restore the trained model from disk, together with the tokenizer stored in
# the local "tokenizer" directory.
model = GPT2LMHeadModel.from_pretrained("saved_model/mond/slovak-fb-msg-gpt2")

tokenizer = GPT2TokenizerFast.from_pretrained("tokenizer")

# Sanity check: total parameter count, printed with thousands separators.
print(format(sum(param.numel() for param in model.parameters()), ","))

In [None]:
# Sample a continuation of a prompt from `model`.
# Relies on `model` and `tokenizer` being loaded by an earlier cell.
input_text = "Veeela lubim! No ako sa mas? :D"

# Encode the prompt as a PyTorch tensor of token ids.
input_ids = tokenizer.encode(input_text, return_tensors="pt")

# Top-k sampling: each step draws from the 128 most probable tokens.
# max_length bounds the whole sequence, prompt included.
output = model.generate(
    input_ids,
    do_sample=True,
    top_k=128,
    max_length=128,
)

# Turn the first (and only) returned sequence back into text and show it.
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)

In [1]:
# Reload imported modules automatically before each cell runs, so edits to
# the project-local `lm` package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

from lm.data.dataset import PlainTextDatasetGenerator
from lm.data.preprocessing import ConversationPreprocessor
from lm.data.reader import FacebookMessagesReader, SubtitlesReader

# Build one plain-text corpus file from two sources: Facebook messages and
# Slovak subtitles. The behaviour of the `lm` helpers is not visible here;
# the notes below are read off their names and arguments only.
# NOTE(review): all paths are absolute and machine-specific (/mnt/e/...).
conversation_preprocessor = ConversationPreprocessor()
# One reader per source, each pointed at the root directory of its raw data.
facebook_messages_reader = FacebookMessagesReader.from_directories("/mnt/e/datasets/FB_messages")
subtitles_reader = SubtitlesReader.from_directories("/mnt/e/datasets/open_subtitles/sk")
dataset_generator = PlainTextDatasetGenerator(conversation_preprocessor)
# First argument is the output file; the remaining arguments are the readers
# to draw from (presumably consumed in the order given -- TODO confirm).
# NOTE(review): the recorded run of this cell ended with InvalidTextSampleFile
# for one subtitles XML file, so the output file may be incomplete.
dataset_generator.generate_from_readers(
    "/mnt/e/datasets/conversation_corpus/dataset.txt", facebook_messages_reader, subtitles_reader
)

InvalidTextSampleFile: could not load subtitles file /mnt/e/datasets/open_subtitles/sk/2003/421357/3997680.xml

In [None]:
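# Candidate marker substrings (presumably for filtering credit/advert lines
# out of the subtitle corpus -- TODO confirm intended use):
# (c)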
# gmail
# hotmail
# .cz
# .sk
# .com
# webzdarma
# titulky
# T I T U L K Y
# preklad
# seznam
# released
# thanks
# @
