In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordLevelTrainer
from tokenizers import Tokenizer, models, pre_tokenizers, trainers
from tokenizers.models import WordLevel
from transformers import BertConfig, BertTokenizer, BertForMaskedLM, AutoModel, AutoConfig, AutoTokenizer, AutoModelForMaskedLM
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling
from transformers import LineByLineTextDataset
from torch import nn
import torch

In [None]:
class MLTimeseriesTrainer(Trainer):
    """Trainer with a masked-LM loss that rescales logits by numeric closeness to the target.

    For every masked position the target token is parsed as an integer ``y``.
    Each value ``v`` in ``self.uvalues`` gets the weight
    ``exp(1 - |v - y| / s) / 2.14`` where ``s`` is the sum of ``|v - y|`` over
    all values. The first five vocabulary slots keep weight 1 (the vocabulary
    file written by ``prepare_tokenizer`` starts with five special tokens).
    The weighted logits are then fed to a standard cross-entropy loss.

    ``init_params_ex`` must be called before training so ``self.uvalues`` exists.
    """

    # Leading vocabulary slots whose logits are left unscaled.
    _NUM_UNSCALED_SLOTS = 5
    # Divisor applied to exp(1 - d/s); value kept from the original formula.
    _WEIGHT_DIVISOR = 2.14

    def init_params_ex(self, dmin, dmax, uvalues):
        """Store the value range and the sorted vocabulary values used by the loss."""
        self.dmin = dmin
        self.dmax = dmax
        self.uvalues = uvalues

    def _distance_weights(self, labels, logits):
        """Build a constant (no-grad) weight tensor with the same shape as ``logits``.

        Relies on the notebook-level ``tokenizer`` to map label ids back to
        their integer token strings; ``int()`` raises ``ValueError`` if a
        non-numeric token is ever used as a label.
        """
        tokens = tokenizer.convert_ids_to_tokens(labels.view(-1).cpu().tolist())
        targets = np.array([int(tok) for tok in tokens], dtype=np.float64)
        values = np.asarray(self.uvalues, dtype=np.float64)

        # Row i holds |v - y_i| for every vocabulary value v.
        distances = np.abs(values[None, :] - targets[:, None])
        totals = distances.sum(axis=1, keepdims=True)
        totals[totals == 0] = 1.0  # single-value vocabulary: avoid 0/0
        value_weights = np.exp(1 - distances / totals) / self._WEIGHT_DIVISOR

        weights = torch.ones_like(logits)
        first = self._NUM_UNSCALED_SLOTS
        width = max(0, min(value_weights.shape[1], logits.shape[-1] - first))
        weights[:, first:first + width] = torch.as_tensor(
            value_weights[:, :width], dtype=logits.dtype, device=logits.device
        )
        return weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the distance-weighted masked-LM loss.

        Args:
            model: model called as ``model(**inputs)``; its output must expose ``'logits'``.
            inputs: batch dict containing ``'labels'`` with ``-100`` at unmasked positions.
            return_outputs: when true, return ``(loss, outputs)`` instead of ``loss``.
            **kwargs: accepted and ignored so newer ``Trainer`` versions that pass
                extra keyword arguments keep working.

        Returns:
            The scalar loss, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        labels = inputs['labels']
        outputs = model(**inputs)
        logits = outputs['logits']

        # Keep only the masked positions.
        masked = labels != -100
        labels = labels[masked]
        logits = logits[masked]

        if len(logits) == 0:
            # Nothing was masked in this batch: cross-entropy over zero rows is
            # NaN, so return a zero that is still attached to the graph.
            loss = logits.sum()
        else:
            # Multiply instead of overwriting rows with detached tensors, so the
            # gradient still flows from the loss back into the model.
            weights = self._distance_weights(labels, logits)
            loss = nn.CrossEntropyLoss()(logits * weights, labels)

        return (loss, outputs) if return_outputs else loss
        
def prepare_tokenizer(uvalues, vocab_file_name):
    """Write the vocabulary and tokenizer config files into ``krv_tokenizer/``.

    Creates the directory if needed and writes three files:
    ``<vocab_file_name>`` (five special tokens followed by one line per entry
    of ``uvalues``), ``special_tokens_map.json`` and ``tokenizer_config.json``.

    Args:
        uvalues: iterable of vocabulary values; each is written via ``str()``.
        vocab_file_name: file name of the vocabulary file inside ``krv_tokenizer/``.

    Returns:
        None.
    """
    import os  # local import so this function does not depend on the notebook's import cell

    # Pure-Python replacement for the `!mkdir` shell escape: works outside
    # IPython and does not complain when the directory already exists.
    os.makedirs("krv_tokenizer", exist_ok=True)

    print(f'The size of vacabulary {len(uvalues)}')

    vocab_path = "krv_tokenizer/" + vocab_file_name
    vocab_text = '[PAD]\n[UNK]\n[CLS]\n[SEP]\n[MASK]\n' + ''.join(str(i) + '\n' for i in uvalues)
    with open(vocab_path, "w") as textfile:
        textfile.write(vocab_text)

    # NOTE(review): this WordLevel tokenizer is trained but neither saved nor
    # returned, and its special tokens differ from the ones written above.
    # Kept for behaviour parity; it looks removable - confirm before deleting.
    tokenizer = Tokenizer(WordLevel())
    tokenizer.pre_tokenizer = Whitespace()
    trainer = WordLevelTrainer(special_tokens=["<unk>", "<s>", "</s>", "<pad>", "<mask>"])
    tokenizer.train([vocab_path], trainer)

    stm = '{"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}'
    tc  = '{"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "name_or_path": "krv_tokenizer"}'

    with open("krv_tokenizer/special_tokens_map.json", "w") as textfile:
        textfile.write(stm)

    with open("krv_tokenizer/tokenizer_config.json", "w") as textfile:
        textfile.write(tc)


def make_sin_data(number_of_counts, window):
    """Sample a sine wave and cut it into fixed-length rows.

    The sine is evaluated at ``number_of_counts`` evenly spaced points on
    ``[-2*pi, 2*pi]``. Trailing samples that do not fill a complete window are
    dropped.

    Args:
        number_of_counts: number of samples to generate.
        window: number of consecutive samples per row.

    Returns:
        ``(frame, delta)`` where ``frame`` has columns ``t1`` .. ``t<window>``
        (one window per row) and ``delta`` is the smallest absolute difference
        between consecutive samples.
    """
    grid = np.linspace(-2*np.pi, 2*np.pi, number_of_counts)
    series = pd.DataFrame(np.sin(grid), columns = ['x'])

    # Smallest step between neighbouring samples (the leading NaN is skipped).
    delta = series.diff().abs().min().values[0]

    usable = (len(series) // window) * window
    rows = series[:usable].x.values.reshape((-1, window))

    windows = pd.DataFrame(rows)
    windows.columns = [f't{k}' for k in range(1, window + 1)]
    return windows, delta

def make_ts_data(number_of_counts, window):
    """Return the windowed time series and its step size.

    Currently this simply forwards to ``make_sin_data`` and returns its
    ``(frame, delta)`` result unchanged.
    """
    frame_and_delta = make_sin_data(number_of_counts, window)
    return frame_and_delta

def make_ts_as_history(number_of_counts = 2000, window = 4):
    """Quantise the time series to integer levels and join each window into one string.

    Every sample is mapped to ``int((x - min) / delta)`` (truncating), where
    ``min`` is the global minimum of the data and ``delta`` comes from
    ``make_ts_data``.

    Args:
        number_of_counts: number of samples passed to ``make_ts_data``.
        window: number of samples per row passed to ``make_ts_data``.

    Returns:
        ``(history, uvalues, delta)``: a one-column DataFrame ``history`` with
        the space-joined integer levels of each window, the sorted array of
        distinct levels, and the quantisation step.
    """
    raw, delta = make_ts_data(number_of_counts, window)
    dmin = raw.min().min()

    levels = ((raw - dmin)/delta).astype('int')
    # np.unique flattens, de-duplicates and sorts in one call.
    uvalues = np.unique(levels.values)

    tokens = levels.astype('str')
    tokens['history'] = tokens.agg(' '.join, axis=1)
    return tokens[['history']], uvalues, delta

def make_model(tokenizer, window, dmin, dmax, first_run = True):
    """Create (or reload) the masked-LM model and a trainer configured for it.

    NOTE: this function reads two notebook-level globals that are not
    parameters: ``dataset`` (the training dataset) and ``uvalues`` (the
    vocabulary values handed to the trainer). Both must be defined before
    calling.

    Args:
        tokenizer: tokenizer used for the data collator; its length sets the
            size of the model's token embeddings.
        window: unused here; kept so existing callers keep working.
        dmin: lower bound forwarded to ``MLTimeseriesTrainer.init_params_ex``.
        dmax: upper bound forwarded to ``MLTimeseriesTrainer.init_params_ex``.
        first_run: when true, start from a freshly initialised
            ``BertForMaskedLM``; otherwise load the model saved in ``krv_model``.

    Returns:
        ``(trainer, model)``.
    """
    if first_run:
        model = BertForMaskedLM(config=BertConfig())
    else:
        model = BertForMaskedLM.from_pretrained('krv_model')

    # One resize covers both branches.
    model.resize_token_embeddings(len(tokenizer))

    print(f'Num of model parameters = {model.num_parameters()}')

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=True, mlm_probability=0.15
    )

    training_args = TrainingArguments(
        output_dir="./ml_ts",
        overwrite_output_dir=True,
        num_train_epochs=50,
        # per_gpu_train_batch_size is deprecated in favour of this argument.
        per_device_train_batch_size=4,
        logging_steps=200,
        prediction_loss_only=True,
        #learning_rate=0.0002,
        lr_scheduler_type="cosine",
    )

    trainer = MLTimeseriesTrainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
    )

    # Forward the caller's range instead of a hard-coded (-1, 1).
    trainer.init_params_ex(dmin, dmax, uvalues)
    return trainer, model

In [None]:
# --- Corpus: one line of space-separated quantised values per window ---
window = 20
number_of_counts = 3000
df, uvalues, delta = make_ts_as_history(number_of_counts, window)
np.savetxt('corpus.txt', df['history'].values, fmt='%s')

# --- Tokenizer: write the vocabulary/config files, then load them ---
prepare_tokenizer(uvalues, 'vocab.txt')
tokenizer = BertTokenizer.from_pretrained('./krv_tokenizer')

dataset = LineByLineTextDataset(tokenizer=tokenizer, file_path="./corpus.txt", block_size=512)

# --- Training: a single round; later rounds would reload the saved model ---
first_run = True
for i in range(1):
    trainer, model = make_model(tokenizer, window, -1, 1, first_run=first_run)
    trainer.train()
    model.save_pretrained('krv_model')
    first_run = False

In [None]:
text = ['63145 63760 64373 64984 65592 66198 66802 67403 68001 68596 69189 69779 70366 70949 71529 72107 72680 73250 73817 [MASK]']
# Place the inputs on the model's own device so the cell also runs without a GPU.
encoding = tokenizer(text, return_tensors="pt").to(model.device)

# forward pass: eval mode disables dropout, no_grad skips building the autograd graph
model.eval()
with torch.no_grad():
    outputs = model(**encoding)
preds = outputs.logits.argmax(-1)
preds

In [None]:
# Show links to the files in the krv_tokenizer directory as this cell's output.
from IPython.display import FileLinks
FileLinks(r'krv_tokenizer')