In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("Milos/slovak-gpt-j-162M")
model = AutoModelForCausalLM.from_pretrained("Milos/slovak-gpt-j-162M")

In [None]:
prompt = "Moje najobľubenejšie mesto na severe Slovenska je"
encoded_input = tokenizer(prompt, return_tensors="pt")
output = model.generate(**encoded_input)
tokenizer.decode(output[0])

In [None]:
import re
import json
import unicodedata
import pathlib
from typing import Dict, Iterable, Tuple, Any

class MessageFilter:
    def __init__(self, *message_json_file_paths: Tuple[str, ...]) -> None:
        self.message_dumps = []
        for message_json_file_path in message_json_file_paths:
            with open(message_json_file_path, "rt") as in_file:
                message_dump = json.load(in_file)
                self.message_dumps.append(message_dump)

    def __call__(self, sender_name: str) -> Iterable[str]:
        for message_dump in self.message_dumps:
            yield from self._find_messages_from(message_dump, sender_name)

    def _find_messages_from(self, messages_dump: Dict[str, Any], sender_name: str) -> Iterable[str]:
        for message in messages_dump["messages"]:
            content = message.get("content")
            if content is None:
                continue
            curr_sender_name = self._fix_encoding(message["sender_name"])
            if curr_sender_name == sender_name:
                message_text = self._fix_encoding(message["content"])
                message_text_processed = self._process_message(message_text)
                if message_text_processed:
                    yield message_text_processed
    
    @staticmethod
    def _fix_encoding(text: str) -> str:
        return text.encode("latin1").decode("utf-8")
    
    @classmethod
    def _process_message(cls, text: str) -> str:
        text = cls.remove_urls(text)
        text = cls.remove_accents(text)
        text = cls.normalize_whitespaces(text)
        text = cls.normalize_emojis(text)
        text = text.strip()
        return text
    
    @staticmethod
    def remove_urls(text: str) -> str:
        return re.sub(r"http\S+", "", text)

    @staticmethod
    def remove_accents(text: str) -> str:
        nfkd_form = unicodedata.normalize("NFKD", text)
        only_ascii = nfkd_form.encode("ASCII", "ignore").decode("utf-8")
        return only_ascii
    
    @staticmethod
    def normalize_whitespaces(text: str) -> str:
        return re.sub(r"\s+", " ", text)

    @staticmethod
    def normalize_emojis(text: str) -> str:
        replacements = (
            (":-D", ":D"),
            (":-)", ":)"),
            (":-(", ":("),
            (":-P", ":P"),
            (":-*", ":*")
        )
        for old_sequence, new_sequence in replacements:
            text = text.replace(old_sequence, new_sequence)
        return text

message_json_file_paths = [str(file) for file in pathlib.Path("/mnt/e/FB_data/messages").rglob("*.json")]

message_filter = MessageFilter(*message_json_file_paths)
messages = list(message_filter("Milan Ondrašovič"))

In [None]:
with open("../../datasets/facebook_messages/messages.txt", "wt") as out_file:
    out_file.write("\n".join(messages))

In [None]:
len(messages)
lengths = [len(message.split()) for message in messages]
import pandas as pd

df = pd.DataFrame(data={"lengths": lengths})
df.describe()

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

tokenizer = GPT2Tokenizer.from_pretrained("tokenizer")
# model = GPT2LMHeadModel.from_pretrained("gpt2")

from transformers import AutoTokenizer, AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("Milos/slovak-gpt-j-162M")

dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="../../datasets/facebook_messages/messages.txt",
    block_size=64
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False,
)

# training_args = TrainingArguments(
#     output_dir="./results",
#     evaluation_strategy="steps",
#     eval_steps=500,
#     save_steps=500,
#     num_train_epochs=10,
#     per_device_train_batch_size=2,
#     per_device_eval_batch_size=2,
#     logging_steps=100,
#     save_total_limit=2,
# )
training_args = TrainingArguments(
    output_dir="./results",
    save_steps=100,
    num_train_epochs=3,
    per_device_train_batch_size=32,
    logging_steps=1,
    save_total_limit=4,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)

trainer.train()

trainer.save_model("saved_model/mond/slovak-fb-msg-gpt2")

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.train(files=["../../datasets/facebook_messages/messages.txt"])
# tokenizer.save_pretrained("pretrained_tokenizer")

In [None]:
from tokenizers import ByteLevelBPETokenizer

# Initialize a new tokenizer
tokenizer = ByteLevelBPETokenizer()

# Train the tokenizer on a corpus of text
tokenizer.train(
    files=["../../datasets/facebook_messages/messages.txt"],
    vocab_size=30000,
    min_frequency=2,
        special_tokens=[
        "<s>", "<pad>", "</s>", "<unk>",
    ]
)
tokenizer.save_model("tokenizer")

In [None]:
from transformers import GPT2TokenizerFast

tokenizer = GPT2TokenizerFast.from_pretrained("tokenizer")
text = "Ja neviem co by som tu napisal, ale hadam to pojde!"

tokenized_text = tokenizer.encode(text)
tokenized_text
generated_text = tokenizer.decode(tokenized_text, skip_special_tokens=True)
generated_text

In [None]:
import torch

from transformers import (
    GPT2TokenizerFast, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling,
    Trainer, TrainingArguments, GPT2Config
)
from tokenizers import ByteLevelBPETokenizer

vocab_size = 32768

tokenizer = ByteLevelBPETokenizer()
tokenizer.train(
    files=["../../datasets/facebook_messages/messages.txt"],
    vocab_size=vocab_size,
    min_frequency=2,
        special_tokens=[
        "<s>", "<pad>", "</s>", "<unk>",
    ]
)
tokenizer.save_model("tokenizer")
tokenizer = GPT2TokenizerFast.from_pretrained("tokenizer")

config = GPT2Config.from_pretrained(
    "gpt2", vocab_size=len(tokenizer), n_positions=256, n_embd=512, n_layer=6, n_head=4, n_inner=1024
)
model = GPT2LMHeadModel(config)

dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="../../datasets/facebook_messages/messages.txt",
    block_size=64
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# dataset = dataset.with_format("torch", device=device)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False,
)

training_args = TrainingArguments(
    output_dir="./results",
    save_steps=100,
    num_train_epochs=3,
    per_device_train_batch_size=32,
    logging_steps=1,
    save_total_limit=4,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)

# trainer.train()

# trainer.save_model("saved_model/mond/slovak-fb-msg-gpt2")

In [None]:
trainer.train()

trainer.save_model("saved_model/mond/slovak-fb-msg-gpt2")

In [None]:
print(f"{sum(p.numel() for p in model.parameters()):,}")

In [1]:

import torch

from transformers import (
    GPT2TokenizerFast, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling,
    Trainer, TrainingArguments, GPT2Config
)
model = GPT2LMHeadModel.from_pretrained("saved_model/mond/slovak-fb-msg-gpt2")

tokenizer = GPT2TokenizerFast.from_pretrained("tokenizer")

print(f"{sum(p.numel() for p in model.parameters()):,}")

2023-04-02 09:30:42.987152: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-02 09:30:44.107467: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-04-02 09:30:44.107582: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


29,526,528


In [7]:
# Generate text with the model
input_text = "Veeela lubim! No ako sa mas? :D"
input_ids = tokenizer.encode(input_text, return_tensors='pt')
output = model.generate(input_ids, max_length=128, do_sample=True, top_k=128)

# Decode the generated text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

# Print the generated text
print(generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Veeela lubim! No ako sa mas? :D
Uz pojdem dat na krku a ked sa zobudis. :D No k nej prides, potom vecer.
Uz som vstal na chvilku, chcel som taku pol piatej :D Uz som nufik zavolat, dnes tiez mi z toho bol pekne. :D <3
Nazdar, tesim. Som rad. Vraj z ruk je normalne. :D Uvidis, ze si to aj trosku mozes zacat prist, keby uz. Rano som zistil, ze je podla mna. Vsak aj vyzera to asi problem trosku, nemam. :D Tesim sa za chvilku. :*
No ved to, tam je na oboch. Idem ja na obed 3 mesiace, a potom cvicit, tak potom vediet a pojdem aj na vecer, a tak som tam aj na FB uz povedal. :D Potom o 15:00 sa to da. No moooje. Veeela. :*
Videocet sa skoncil.
Videocet sa skoncil.
Moj nufik! O tej osmej, dobre, ako sa mas? Ako ma? :D Za chvilku <3
Aj ja teba. :D Este som dnes nufik to mal. :* <3 Uta!
V stave je ina stranka na par minut v pohode, ale ked je to tak sa ti o tom mne lepsie. :D

