In [1]:
from tqdm.notebook import tqdm
import numpy as np

import sklearn
import sklearn.datasets
import sklearn.model_selection
import sklearn.metrics

import torch
import torchvision
from torch import nn
from torch.nn import functional as F


PYTORCH_DEVICE = torch.device("cpu" if not torch.cuda.is_available() else "cuda:0")
print(PYTORCH_DEVICE)

cuda:0


In [5]:
import validators

def load_text_data(filename):
    LINE_SEPERATOR = "##########"
    with open(filename) as input_file:
        data = []
        current_text_list = []
        current_url = None
        for row in tqdm(input_file):
            if row.strip() == LINE_SEPERATOR:
                if len(current_text_list) > 0 and current_url is not None:
                    data.append({
                        "url": current_url,
                        "text": "".join(current_text_list).replace("\xa0", " "),
                    })
                    current_url = None
                    current_text_list = []
                current_url = next(input_file).strip()
                validators.url(current_url)
            else:
                current_text_list.append(row)
    return data

In [6]:
import random
import sklearn.model_selection

data = load_text_data("porn_dataset.txt")
print("Dats size:", len(data))

random.shuffle(data)

data_train, data_test = sklearn.model_selection.train_test_split(data, test_size=1_000, shuffle=True)


0it [00:00, ?it/s]

Dats size: 74232


In [7]:
import tokenizers
import tokenizers.models
import tokenizers.trainers
import tokenizers.processors
import tokenizers.pre_tokenizers

START_TOKEN = "[START]"
END_TOKEN = "[END]"
UNK_TOKEN = "[UNK]"

VOCAB_SIZE = 10_000
SPECIAL_TOKENS = [UNK_TOKEN, START_TOKEN, END_TOKEN, ".", ",", "!", "?", "-"]
TOKENIZER_TRAIN_SIZE = 1_000

tokenizer = tokenizers.Tokenizer(tokenizers.models.BPE(unk_token=UNK_TOKEN))
tokenizer.normalizer = tokenizers.normalizers.Lowercase()
tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.Whitespace()
trainer = tokenizers.trainers.BpeTrainer(vocab_size=VOCAB_SIZE, special_tokens=SPECIAL_TOKENS)

tokenizer_train_texts = list(map(lambda row: row["text"], data_train))[:TOKENIZER_TRAIN_SIZE]
tokenizer.train_from_iterator(tokenizer_train_texts, trainer=trainer)

tokenizer.post_processor = tokenizers.processors.TemplateProcessing(
    single=f"{START_TOKEN} $A {END_TOKEN}",
    special_tokens=[
        (START_TOKEN, tokenizer.token_to_id(START_TOKEN)),
        (END_TOKEN, tokenizer.token_to_id(END_TOKEN)),
    ],
)

In [8]:
MAX_TOKENS = None

random_text = data_test[random.randint(0, len(data_test) - 1)]["text"]

"_".join(tokenizer.encode(random_text).tokens[:MAX_TOKENS])

'[START]_дальше_они_с_але_ной_ста_би_льно_соз_вани_вались_два_раза_в_неделю_,_она_без_ро_по_тно_приез_жала_в_течение_трех_месяцев_и_давала_свои_ножки_на_ра_стер_за_ние_,_выполня_я_любой_ка_приз_своего_мучи_теля_-_она_поняла_,_что_проти_виться_выходит_всегда_доро_же_._если_он_хотел_с_ней_поиграть_в_его_люби_мую_игру_"_где_тебя_по_щеко_тать_"_или_"_где_твои_ножки_по_лизать_"_-_она_просто_на_зывала_место_,_больше_не_сопротивля_ясь_._приходила_теперь_она_всегда_одна_,_ей_хватило_одного_раза_._и_,_когда_,_в_последний_раз_она_приехала_,_когда_виктор_,_в_очередной_раз_выли_зал_каждый_милли_метр_ее_ступ_ней_,_он_спросил_:_-_ну_вот_и_последняя_наша_встреча_,_ты_мне_вы_пла_тила_дол_г_за_тот_телефон_._мне_очень_понрави_лись_твои_ножки_,_они_просто_беспо_доб_ны_._если_тебе_потре_бу_ются_деньги_-_ты_можешь_ко_мне_обра_титься_,_что_мне_нужно_,_ты_сама_знаешь_._тебе_такое_предложение_интересно_?_-_не_знаю_._-_алена_немного_слу_ка_вила_,_она_уже_привыкла_к_этому_бан_ди_ту_,_привыкла_к_его_игра_м_и_,_п

In [9]:
# precalculate 

def encode_texts_inplace(data):
    for row in tqdm(data):
        row["encoded"] = tokenizer.encode(row["text"])

encode_texts_inplace(data_train)
encode_texts_inplace(data_test)

  0%|          | 0/73232 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

In [33]:
class SimpleLMDataset(torch.utils.data.Dataset):
    def __init__(self, data, max_length=64, is_random_substring=True, device=PYTORCH_DEVICE):
        self.data = data
        self.device = device
        self.is_random_substring = is_random_substring
        self.max_length = max_length

    def __getitem__(self, index):
        row = data[index]
        encoded = row["encoded"]
        tokens_ids = encoded.ids
        attention_mask = encoded.attention_mask
        length = len(tokens_ids)
        assert length == len(tokens_ids) == len(attention_mask)
        
        if self.is_random_substring and length > self.max_length:
            random_start = random.randint(0, length - self.max_length)
            random_end = random_start + self.max_length
            tokens_ids = tokens_ids[random_start:random_end]
            attention_mask = attention_mask[random_start:random_end]

        return {
            "tokens_ids": torch.LongTensor(tokens_ids).to(self.device),
            "attention_mask": torch.LongTensor(attention_mask).to(self.device),
        }

    def __len__(self):
        return len(self.data)

    @staticmethod
    def collate_fn(original_batch):
        batch = {
            "tokens_ids": torch.nn.utils.rnn.pad_sequence(list(map(lambda x: x["tokens_ids"], original_batch)), batch_first=True, padding_value=tokenizer.token_to_id(END_TOKEN)),
            "attention_masks": torch.nn.utils.rnn.pad_sequence(list(map(lambda x: x["attention_mask"], original_batch)), batch_first=True, padding_value=0)
        }
        return batch
    
MAX_TEXT_LENGTH = 64

train_dataset = SimpleLMDataset(data_train, max_length=MAX_TEXT_LENGTH, is_random_substring=True, device=PYTORCH_DEVICE)
test_dataset = SimpleLMDataset(data_test, max_length=MAX_TEXT_LENGTH, is_random_substring=True, device=PYTORCH_DEVICE)

BATCH_SIZE = 10

train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=SimpleLMDataset.collate_fn)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=SimpleLMDataset.collate_fn)

In [34]:
def infinite_dataloader_wrapper(dataloader):
    while True:
        for batch in dataloader:
            yield batch
            
infinite_train_dataloader = infinite_dataloader_wrapper(train_dataloader)
infinite_test_dataloader = infinite_dataloader_wrapper(test_dataloader)

In [35]:
class SimpleLM(torch.nn.Module):
    EMBEDDING_DIM = 64
    
    def __init__(self, dict_size):
        super(SimpleLM, self).__init__()
        self.dict_size = dict_size
        self.embedding = nn.Embedding(dict_size, self.EMBEDDING_DIM)
        self.lstm = nn.LSTM(self.EMBEDDING_DIM, self.EMBEDDING_DIM, batch_first=True)
        self.final_fc = nn.Linear(self.EMBEDDING_DIM, dict_size)

    def forward(self, x):
        x = self.embedding(x)
        x, (_, _) = self.lstm(x)
        x = F.relu(x)
        x = self.final_fc(x)
        return F.softmax(x, dim=2)

In [36]:
# Single run of your model
model = SimpleLM(tokenizer.get_vocab_size())
random_text = data_test[random.randint(0, len(data_test) - 1)]["text"]
result = model(torch.LongTensor([tokenizer.encode(random_text).ids]))
result

tensor([[[1.0779e-04, 1.0674e-04, 8.4647e-05,  ..., 8.3036e-05,
          1.1021e-04, 9.3027e-05],
         [1.0465e-04, 1.1261e-04, 8.3650e-05,  ..., 8.1279e-05,
          1.1061e-04, 1.0199e-04],
         [1.1288e-04, 1.1751e-04, 8.3820e-05,  ..., 8.1542e-05,
          1.1469e-04, 9.8401e-05],
         ...,
         [1.0618e-04, 1.1168e-04, 8.9255e-05,  ..., 8.7125e-05,
          1.0668e-04, 9.6197e-05],
         [1.0905e-04, 1.1439e-04, 8.9307e-05,  ..., 8.5584e-05,
          1.0400e-04, 1.0260e-04],
         [1.0543e-04, 1.1669e-04, 9.0656e-05,  ..., 8.1283e-05,
          1.0115e-04, 1.0447e-04]]], grad_fn=<SoftmaxBackward0>)

In [37]:
model = SimpleLM(tokenizer.get_vocab_size()).to(PYTORCH_DEVICE)
loss_function = torch.nn.CrossEntropyLoss().to(PYTORCH_DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0003)

In [38]:
from torch.utils.tensorboard import SummaryWriter

tensorboard_writer = SummaryWriter(comment=f"_{str(model.__class__.__name__)}")

def process_batch_metrics(results, is_train, iteration):
    if iteration < 1000:
        return
    train_or_test = "train" if is_train else "test"

    accuracy = sklearn.metrics.accuracy_score(
        results["tokens_ids_shifted"].reshape(-1, 1), 
        results["tokens_predicted_shifted_class"].reshape(-1, 1)
    )

    tensorboard_writer.add_scalar(f'{train_or_test}/loss', results["loss_numpy"], iteration)
    tensorboard_writer.add_scalar(f'{train_or_test}/accuracy', accuracy, iteration)


def process_batch(batch, is_train, iteration):
    if is_train:
        model.train()
    else:
        model.eval()

    
    tokens_ids = batch["tokens_ids"]
    attention_masks = batch["attention_masks"]
    
    if is_train:
        optimizer.zero_grad()

    tokens_predicted = model(tokens_ids)
   
    assert not torch.any(torch.isnan(tokens_predicted))
    
    tokens_predicted_shifted = tokens_predicted[:, :-1, :]
    tokens_ids_shifted = tokens_ids[:, 1:]
    
    loss = loss_function(
        tokens_predicted_shifted.reshape(-1, tokens_predicted_shifted.size(-1)),
        tokens_ids_shifted.reshape(-1)
    )
    
#     tokens_predicted_raw_selected = torch.gather(tokens_predicted_raw[:, :-1, :], dim=2, index=tokens_ids[:,1:,None])
#     loss = -F.log_softmax(tokens_predicted_raw_selected, dim=2).mean()

    if is_train:
        loss.backward()
        optimizer.step()
    
    results = {}
    results["tokens_predicted"] = tokens_predicted.cpu().detach().numpy()
    results["tokens_predicted_class"] = np.argmax(results["tokens_predicted"], axis=2)
    results["tokens_predicted_shifted"] = tokens_predicted_shifted.cpu().detach().numpy()
    results["tokens_predicted_shifted_class"] = np.argmax(results["tokens_predicted_shifted"], axis=2)

    results["tokens_ids"] = tokens_ids.cpu().detach().numpy()
    results["tokens_ids_shifted"] = tokens_ids_shifted.cpu().detach().numpy()

    results["loss_numpy"] = loss.cpu().detach().numpy()
    
    process_batch_metrics(results, is_train, iteration)
    
    return results
    
iteration = None
for iteration in tqdm(range(1_000_000_000)):
    process_batch(next(infinite_train_dataloader), True, iteration)
    with torch.no_grad():
        process_batch(next(infinite_test_dataloader), False, iteration)

  0%|          | 0/1000000000 [00:00<?, ?it/s]

KeyboardInterrupt: 