In [None]:
import pickle
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

In [None]:
import sentencepiece
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoModel, AutoModelForMaskedLM
from transformers import pipeline


# Device index: 0 when CUDA is available, otherwise -1.
device = -1
if torch.cuda.is_available():
    device = 0

# The tokenizer and the masked-LM weights are loaded from the same checkpoint.
checkpoint_name = "DeepPavlov/rubert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
model = AutoModelForMaskedLM.from_pretrained(checkpoint_name, num_labels=2)

class dataset_for_xlm_roberta(torch.utils.data.Dataset):
    """Minimal map-style dataset that exposes a sequence of strings by index."""

    def __init__(self, strings):
        """Store ``strings`` as a NumPy array (the input is copied by ``np.array``)."""
        self.strings = np.array(strings)

    def __len__(self):
        """Return the size of the stored array along its first axis."""
        n_items = self.strings.shape[0]
        return n_items

    def __getitem__(self, idx):
        """Return whatever the stored array yields for ``idx``."""
        item = self.strings[idx]
        return item

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Load the pickled emoji collection from disk.
# NOTE(review): unpickling executes arbitrary code embedded in the file, so
# 'emojis.pkl' must come from a trusted source.
with open('emojis.pkl', 'rb') as emoji_file:
    emojis = pickle.loads(emoji_file.read())

In [None]:
# Load the issuers table, using the second CSV column as the index and
# discarding the 'ID человека' column.
issuers = pd.read_csv('data/issuers - issuers.csv', index_col = 1).drop('ID человека', axis= 1)

# Collect ticker strings from both ticker columns.
# Missing values are removed explicitly with dropna(). Slicing the first
# element off a set-derived list is not a reliable way to do that: set
# iteration order is arbitrary, so a real ticker could be dropped while NaN
# stays in the list.
primary_tickers = issuers['BGTicker'].dropna().str.strip().str[0:4]
secondary_tickers = issuers['BGTicker.1'].dropna().str.strip()

# De-duplicate across both columns, skip empty strings, and sort so the list
# (and therefore the ids of the tokens added from it) is identical on every run.
tickers = sorted(
    ticker
    for ticker in set(primary_tickers) | set(secondary_tickers)
    if ticker
)

In [None]:
# Sanity check: print every ticker whose type is not exactly `str`.
# No output means the list contains only plain strings.
non_string_tickers = [ticker for ticker in tickers if type(ticker) is not str]
for ticker in non_string_tickers:
    print(ticker)

In [None]:
# Register every distinct emoji and every ticker as a whole token.
# The de-duplicated emojis are sorted because set iteration order can differ
# between interpreter runs, and new token ids are assigned in the order the
# tokens are passed in. A stable order keeps the ids reproducible.
# NOTE(review): assumes `emojis` contains only strings — confirm against the pickle.
tokenizer.add_tokens(sorted(set(emojis)) + tickers)

0

In [None]:
# Resize the model's token-embedding matrix to the tokenizer's current length,
# so the ids of the tokens added via add_tokens have embedding rows.
model.resize_token_embeddings(len(tokenizer))

Embedding(121425, 768)

In [None]:
# Round-trip a sample sentence through encode/decode to see how the emoji
# characters come back out of the tokenizer.
sample_text = 'I 🙁 hate ☹️ to 😣 see 😖 this 😫 fail 😩 , 🥺 pls 😢 help 😭 me 😤'
sample_ids = tokenizer.encode(sample_text)
tokenizer.decode(sample_ids)

'[CLS] I 🙁 hate ☹️ to 😣 see 😖 this 😫 fail 😩, 🥺 pls 😢 help 😭 me 😤 [SEP]'

In [None]:
from datasets import load_dataset

# Read the raw corpus with the "text" loader; the file is exposed as the
# "train" split.
corpus_files = {"train": 'sentences.txt'}
datasets = load_dataset("text", data_files=corpus_files)

Found cached dataset text (/home/kir1200/.cache/huggingface/datasets/text/default-d9a9ac251803957c/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
def tokenize_function(examples):
    """Run the notebook-level ``tokenizer`` over the ``"text"`` field of ``examples``."""
    texts = examples["text"]
    return tokenizer(texts)

In [None]:
# Length (in tokens) of each chunk produced by group_texts.
block_size = 512

In [None]:
def group_texts(examples, chunk_size=None):
    """Concatenate a batch of tokenized examples and cut it into equal-length chunks.

    Args:
        examples: Mapping from feature name (must include ``"input_ids"``) to a
            list of per-example token lists.
        chunk_size: Length of every output chunk. When ``None`` (the default),
            the notebook-level ``block_size`` is used.

    Returns:
        A dict with the same keys as ``examples``, each holding a list of
        chunks of exactly ``chunk_size`` items, plus a ``"labels"`` entry that
        is a shallow copy of the ``"input_ids"`` chunk list. Trailing items
        that do not fill a whole chunk are dropped.
    """
    if chunk_size is None:
        chunk_size = block_size

    # Flatten each feature in a single pass. The previous `sum(lists, [])`
    # re-copied the accumulated list for every example (quadratic time).
    concatenated_examples = {
        key: [item for sequence in examples[key] for item in sequence]
        for key in examples.keys()
    }
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Drop the remainder that does not fill a whole chunk.
    total_length = (total_length // chunk_size) * chunk_size
    # Split every feature into consecutive chunks of `chunk_size`.
    result = {
        key: [
            values[start : start + chunk_size]
            for start in range(0, total_length, chunk_size)
        ]
        for key, values in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
# Tokenize the whole corpus in batches across 12 worker processes; the raw
# "text" column is dropped so only the tokenizer outputs remain.
tokenized_datasets = datasets.map(
    tokenize_function,
    batched=True,
    num_proc=12,
    remove_columns=["text"],
)

Map (num_proc=12):   0%|          | 0/2497550 [00:00<?, ? examples/s]

In [None]:
# Pack the tokenized sentences into fixed-length blocks.
# group_texts concatenates everything in the batch it receives and discards
# whatever does not fill a whole block. With batch_size=1 it saw one sentence
# at a time, so every sentence shorter than `block_size` tokens was dropped
# entirely. Batches of 1000 sentences are concatenated instead, so only the
# tail of each batch is lost.
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=6,
)

# Hold out 20% of the blocks for evaluation; the fixed seed keeps the split
# reproducible.
train_size = 0.8
test_size = 1 - train_size

lm_datasets_split = lm_datasets["train"].train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)


Map (num_proc=6):   0%|          | 0/2497550 [00:00<?, ? examples/s]

In [None]:
from transformers import Trainer, TrainingArguments

# Checkpoints are written under "<model_name>fine_tuned_ours_add_tickers".
model_name = 'rubert/'
output_dir = f"{model_name}fine_tuned_ours_add_tickers"

# Evaluate once per epoch and save a checkpoint every 5000 steps.
training_args = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",
    learning_rate=5e-7,
    weight_decay=0.01,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=4,
    num_train_epochs=400,
    save_steps=5000,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
from transformers import DataCollatorForLanguageModeling

# Masked-language-modeling collator configured with a masking probability of 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

In [None]:
# Assemble the Trainer from the model, training arguments, train/test splits
# and the masked-LM data collator.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets_split["train"],
    eval_dataset=lm_datasets_split["test"],
    data_collator=data_collator,
    # The vocabulary was extended with add_tokens and the embeddings resized.
    # Passing the tokenizer makes the Trainer save it alongside every
    # checkpoint, so a saved model can be reloaded with the vocabulary it was
    # trained with.
    tokenizer=tokenizer,
)

In [None]:
# Inspect the maximum sequence length configured on the tokenizer.
# NOTE(review): the recorded output is an extremely large number, which suggests
# no practical limit is set — confirm before relying on tokenizer-side truncation.
tokenizer.model_max_length

1000000000000000019884624838656

In [None]:
# Run the training loop on the `trainer` object built earlier in the notebook.
trainer.train()