In [1]:
import tqdm
import torch
import torch.nn as nn
import numpy as np
from matplotlib import pyplot as plt
import requests
import dvc.api
import pandas as pd
import pickle

Загружаем датасет твиттера

In [2]:
TWITTER = './twitter.csv'

In [3]:
data = pd.read_csv(TWITTER)

In [4]:
data

Unnamed: 0,tag,message
0,0.0,is so sad for my APL friend.............
1,0.0,I missed the New Moon trailer...
2,1.0,omg its already 7:30 :O
3,0.0,.. Omgaga. Im sooo im gunna CRy. I've been at...
4,0.0,i think mi bf is cheating on me!!! T_T
...,...,...
1578609,1.0,Zzzzzz.... Finally! Night tweeters!
1578610,1.0,"Zzzzzzz, sleep well people"
1578611,0.0,ZzzZzZzzzZ... wait no I have homework.
1578612,0.0,"ZzZzzzZZZZzzz meh, what am I doing up again?"


В качестве токенизатора предложений используем готовый из LaBSE

In [5]:
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/LaBSE")

In [150]:
data["message"][0]

'is so sad for my APL friend.............'

Для словаря создадим отдельный класс, который загружает данный из наборов предложений, также он позволяет менять количество слов в словаре

In [7]:
import heapq
class WordsVocabulary:
    def __init__(self, freq_threshold):
        self.idx2word = {0: '[PAD]', 1: '[CLS]', 2: '[SEP]', 3: '[UNK]'}
        self.word2idx = {'[PAD]': 0, '[CLS]': 1, '[SEP]': 2, '[UNK]': 3}
        self.freq_threshold = freq_threshold
        self.frequencies = {}

    def __len__(self):
        return len(self.idx2word)

    def build_vocabulary(self, sents, tokenizer, voc_size = None):
        idx = len(self.word2idx)
        for sent in tqdm.tqdm(sents):
            if(type(sent) == str):
                sent = tokenizer.tokenize(sent)
                for word in sent:
                    if word not in self.frequencies:
                        self.frequencies[word] = 1
                    else:
                        self.frequencies[word] += 1
        
        
        if voc_size == None:
            for word in self.frequencies.keys():
                if self.frequencies[word] >= self.freq_threshold:
                    self.word2idx[word] = idx
                    self.idx2word[idx] = word
                    idx += 1
        else:
            words_sorted_by_frequencies = heapq.nlargest(voc_size - len(self.idx2word), self.frequencies, key=self.frequencies.get)
            for word in words_sorted_by_frequencies:
                self.word2idx[word] = idx
                self.idx2word[idx] = word
                idx += 1
    def rebuild_vocabulary(self, voc_size):
        self.idx2word = {0: '[PAD]', 1: '[CLS]', 2: '[SEP]', 3: '[UNK]'}
        self.word2idx = {'[PAD]': 0, '[CLS]': 1, '[SEP]': 2, '[UNK]': 3}
        idx = len(self.word2idx)
        words_sorted_by_frequencies = heapq.nlargest(voc_size - len(self.idx2word), self.frequencies, key=self.frequencies.get)
        for word in words_sorted_by_frequencies:
            self.word2idx[word] = idx
            self.idx2word[idx] = word
            idx += 1
    def numericalize(self, tokens):
        return [self.word2idx[token] if token in self.word2idx else self.word2idx['[UNK]']
                for token in tokens]
        

In [8]:
wordvoc = WordsVocabulary(1)
wordvoc.build_vocabulary(data["message"], tokenizer)

100%|██████████| 1578614/1578614 [01:12<00:00, 21890.29it/s]


Сохраняем словарь

In [66]:
with open('wordvoc.pickle', 'wb') as f:
    pickle.dump(wordvoc, f)

Можно загрузить словарь, предварительно создав класс

In [8]:
with open('wordvoc.pickle', 'rb') as f:
    wordvoc = pickle.load(f)

In [9]:
len(wordvoc.word2idx)

88154

Вспомогательные функции для обучения модели

In [10]:
def batch_generator(dataset, word2idx, tokenizer, batch_size=64, shuffle=True):
    X, Y = dataset, dataset
    PAD = word2idx['[PAD]']
    n_samples = len(X)

# генерим список индексов
    list_of_indexes = np.linspace(
        0, n_samples - 1, n_samples, dtype=np.int64)
    List_X = []
    List_Y = []
    
# если нужно перемешать, то перемешиваем
    if shuffle:
        np.random.shuffle(list_of_indexes)
        

# сгенерируем список индексов, по этим индексам,
# сделаем новый перемешаный спиисок токенов и тэгов
    for indx in list_of_indexes:
        if(type(X[indx]) == str) and (type(X[indx]) == str):
            List_X.append(tokenizer.tokenize(X[indx]))
            List_Y.append(tokenizer.tokenize(Y[indx]))
    n_samples = len(List_X)
    n_batches = n_samples//batch_size
    if n_samples%batch_size != 0:
        n_batches+=1
        
    # For each k yield pair x and y
    for k in range(n_batches):
# указываем текущии размер батча
        this_batch_size = batch_size
    
# если мы выдаем последний батч, то его нужно обрезать
        if k == n_batches - 1:
            if n_samples%batch_size > 0:
                this_batch_size = n_samples%batch_size
                
        This_X = List_X[k*batch_size:k*batch_size + this_batch_size]
        This_Y = List_Y[k*batch_size:k*batch_size + this_batch_size]
        
        This_X_line = [
                       [word2idx.get(char, 0) for char in sent]\
                       for sent in This_X]
        This_Y_line = [
                       [word2idx.get('[CLS]', 0)]\
                       + [word2idx.get(char, 0) for char in sent]\
                       + [word2idx.get('[SEP]', 0)]\
                       for sent in This_Y]
        List_of_length_x = [len(sent) for sent in This_X_line]
        length_of_sentence_x = max(List_of_length_x)
        List_of_length_y = [len(sent) for sent in This_Y_line]
        length_of_sentence_y = max(List_of_length_y)

        x_arr = np.ones(shape=[this_batch_size, length_of_sentence_x])*PAD
        y_arr = np.ones(shape=[this_batch_size, length_of_sentence_y])*PAD

        for i in range(this_batch_size):
            x_arr[i, :len(This_X_line[i])] = This_X_line[i]
            y_arr[i, :len(This_Y_line[i])] = This_Y_line[i]

        x = torch.LongTensor(x_arr)
        y = torch.LongTensor(y_arr)
        lengths = torch.LongTensor(List_of_length_x)

        yield x, y

In [64]:
CLIP = 1
def train_on_batch(model, batch_of_x, batch_of_y, optimizer, loss_function):
    encoder, decoder = model
    encoder.train()
    decoder.train()
    encoder.zero_grad()
    decoder.zero_grad()
    def closure():
        d, h, c = encoder(batch_of_x.to(encoder.device))
        output = decoder(
            batch_of_y.to(decoder.device), 
            h=h.to(decoder.device)[:, -decoder.num_layers:, :], 
            c=c.to(decoder.device)[:, -decoder.num_layers:, :])

        loss = loss_function(output[:, :-1, :].transpose(1, 2), batch_of_y.to(decoder.device)[:, 1:])
        loss.backward()
        return loss
    
    torch.nn.utils.clip_grad_norm_(model[0].parameters(), CLIP) # Клипуем градиент на случай его взрыва
    torch.nn.utils.clip_grad_norm_(model[1].parameters(), CLIP)
    optimizer.step(closure)
    
    return closure().cpu().item()

In [12]:

def train_epoch(train_generator, model, loss_function, optimizer, callback = None):
    epoch_loss = 0
    total = 0
    for it, (batch_of_x, batch_of_y) in enumerate(train_generator):
        local_loss = train_on_batch(
            model, batch_of_x, batch_of_y, optimizer, loss_function)
        if callback is not None:
            with torch.no_grad():
                callback(model, local_loss)
        train_generator.set_postfix({'train batch loss': local_loss})

        epoch_loss += local_loss*len(batch_of_x)
        total += len(batch_of_x)
    
    return epoch_loss/total

In [13]:


def trainer(count_of_epoch, 
            batch_size,
            model,
            dataset,
            word2idx,
            loss_function,
            optimizer,tokenizer, callback = None):
    iterations = tqdm.notebook.tqdm(range(count_of_epoch))
    epoch_loss = 0
    for it in iterations:
        optima = optimizer

        number_of_batch = len(dataset)//batch_size + (len(dataset)%batch_size>0)
        generator = tqdm.notebook.tqdm(
            batch_generator(dataset, word2idx, tokenizer, batch_size),
            leave=False, total=number_of_batch)
        
        epoch_loss = train_epoch(
            train_generator = generator, model = model, 
            loss_function = loss_function, 
            optimizer = optima, callback=callback)

        iterations.set_postfix({'train epoch loss': epoch_loss})
    model[0].eval()
    model[1].eval()
    return epoch_loss

Энкодер для нашего автокодировщика, основной блок это LSTM

In [14]:
class Encoder(torch.nn.Module):
    @property
    def device(self):
        return next(self.parameters()).device
        
    def __init__(self,
                 vocab_dim,
                 emb_dim = 10, 
                 hidden_dim = 10,
                 num_layers = 3,
                 bidirectional = False, dropout = 0.5, batch_norm = False):
        super(Encoder, self).__init__()
        
        self.num_direction = int(bidirectional + 1)
        self.emb_dim = emb_dim
        self.hidden_dim = hidden_dim

        self.embedding = torch.nn.Embedding(vocab_dim, emb_dim)

        self.encoder = torch.nn.LSTM(
            emb_dim, hidden_dim, num_layers, bidirectional = bidirectional)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input):
        input = self.dropout(self.embedding(input))
        input = torch.transpose(input, 0, 1)
        d, (h, c) = self.encoder(input)
        return d, torch.transpose(h, 0, 1) , torch.transpose(c, 0, 1)

Декодер для нашего автокодировщика, основной блок это LSTM

In [16]:
class Decoder(torch.nn.Module):
    @property
    def device(self):
        return next(self.parameters()).device

    def __init__(self,
                 vocab_dim,
                 output_dim, word2idx,
                 emb_dim = 10, 
                 hidden_dim = 10,
                 num_layers = 3,
                 bidirectional = False, dropout = 0.5, batch_norm = False):
        super(Decoder, self).__init__()
        
        self.num_direction = int(bidirectional + 1)
        self.emb_dim = emb_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.num_layers = num_layers

        self.embedding = torch.nn.Embedding(vocab_dim, self.emb_dim)

        self.decoder = torch.nn.LSTM(
            emb_dim, hidden_dim, num_layers, bidirectional = bidirectional)

        self.linear = torch.nn.Linear(
            self.num_direction*hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.word2idx = word2idx

    def forward(self, real=None, h = None, c = None, max_len = 50):
        batch_size = 1
        if h is not None:
            batch_size = h.shape[0]
        if c is not None:
            batch_size = c.shape[0]
        if real is not None:
            batch_size = real.shape[0]

        #Для обучения нам нужно, чтобы модель принимала реальную строку в качестве ввода
        if real is not None:
            input = self.dropout(self.embedding(real))

            if h is None:
                h = torch.randn(
                    (batch_size, self.num_layers, self.num_direction*self.hidden_dim)).to(
                        self.device
                    )
            if c is None:
                c = torch.randn(
                    (batch_size, self.num_layers, self.num_direction*self.hidden_dim)).to(
                        self.device
                    )

            input = torch.transpose(input, 0, 1)
            h = torch.transpose(h, 0, 1)
            c = torch.transpose(c, 0, 1)
            d, _ = self.decoder(input, (h, c))
            answers = self.linear(d)
        #Для инференса нам нужно, чтобы модель автоматически подставаля в качесте первого элемента входа символ начала строки
        else:
            input = self.embedding(
                torch.tensor(
                    [[self.word2idx['[CLS]']] for _ in range(
                        batch_size)]).long().to(
                        self.device
                    )
                )

            if h is None:
                h = torch.randn(
                    (batch_size, self.num_layers, self.num_direction*self.hidden_dim)).to(
                        self.device
                    )
            if c is None:
                c = torch.randn(
                    (batch_size, self.num_layers, self.num_direction*self.hidden_dim)).to(
                        self.device
                    )

            input = torch.transpose(input, 0, 1)
            h = torch.transpose(h, 0, 1)
            c = torch.transpose(c, 0, 1)

            answers = torch.zeros(
                (max_len, input.shape[1], self.output_dim)).to(
                    self.device)
                
            for i in range(max_len):
                d, (h, c) = self.decoder(input, (h, c))
                answers[i, :, :] = self.linear(d)[0]
                input = self.embedding(
                    torch.argmax(answers[i:i+1, :, :], dim=-1))

        return torch.transpose(answers, 0, 1)

In [17]:
device = torch.device("cuda")

In [17]:
encoder = Encoder(vocab_dim=len(wordvoc.word2idx), 
                  num_layers=2, emb_dim=100, hidden_dim=100, dropout= 0.5)
encoder.to(device)
decoder = Decoder(vocab_dim=len(wordvoc.word2idx), 
                  output_dim=len(wordvoc.word2idx), word2idx=wordvoc.word2idx, num_layers=2, emb_dim=100, hidden_dim=100, dropout= 0.5)
decoder.to(device)

optimizer = torch.optim.Adam(
    list(encoder.parameters()) + list(decoder.parameters()), lr=1e-3)
loss_function = torch.nn.CrossEntropyLoss(ignore_index=wordvoc.word2idx['[PAD]'])

In [18]:
sent = torch.tensor([[wordvoc.word2idx.get(char, 0) for char in tokenizer.tokenize(data["message"][0])]])

In [19]:
d, h, c = encoder(sent.to(encoder.device))

Восстановление строки из датасета с помощью модели до обучения

In [20]:
sent = torch.tensor([[wordvoc.word2idx.get(char, 0) for char in tokenizer.tokenize(data["message"][0])]])
d, h, c = encoder(sent.to(encoder.device))
encoder.eval()
decoder.eval()
indexes = torch.argmax(
    decoder(max_len=100,
            h=h, 
            c=c), dim=-1).detach().cpu().numpy()[0]
list_of_char = []
for idx in indexes:
    if idx == wordvoc.word2idx['[SEP]']:
        break
    list_of_char.append(wordvoc.idx2word[idx])
print(''.join(list_of_char))

##sonmathsrifdeledele##chargergheTOFmankmankTOFmank##chdHawaiTOF##chd##chdrssmankshaftvlagevlagetratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamiento


Строка, которую дает энкодер на случайном векторе

In [21]:
for _ in range(10):
    indexes = torch.argmax(
        decoder(max_len=100,
                h=0.1*torch.randn(
                    (1, decoder.num_layers, decoder.num_direction*decoder.hidden_dim)).to(
                        decoder.device
                ), 
                c=torch.randn(
                    (1, decoder.num_layers, decoder.num_direction*decoder.hidden_dim)).to(
                    decoder.device
                )), dim=-1).detach().cpu().numpy()[0]
    list_of_char = []
    for idx in indexes:
        if idx == wordvoc.word2idx['[SEP]']:
            break
        list_of_char.append(wordvoc.idx2word[idx])
    print(''.join(list_of_char))

##endoCambridgeCambridgeMinMinMinMinAsiavlageBergmantailoredtodos##ilotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratamientotratam

In [22]:
len(data)

1578614

Обучаем сеть на небольших значениях гиперпараметров

In [None]:
trainer(count_of_epoch = 100,
        batch_size = 16,
        model = (encoder, decoder),
        dataset = data.head(n=10*64 + 10)["message"], 
        tokenizer=tokenizer,
        word2idx = wordvoc.word2idx,
        loss_function = loss_function,
        optimizer = optimizer)

In [24]:
torch.save(encoder.state_dict(), 'encoder_0.pt')
torch.save(decoder.state_dict(), 'decoder_0.pt')

In [25]:
encoder.load_state_dict(torch.load('encoder_0.pt'))
decoder.load_state_dict(torch.load('decoder_0.pt'))

<All keys matched successfully>

In [26]:
d, h, c = encoder(sent.to(encoder.device))

In [27]:
print(d.shape)

torch.Size([20, 1, 100])


In [41]:
data["message"][100]

'no pavel tonight &lt;Tigersfan &gt;'

In [43]:
sent = torch.tensor([[wordvoc.word2idx.get(char, 0) for char in tokenizer.tokenize(data["message"][100])]])
y = torch.tensor([[wordvoc.word2idx.get('[CLS]', 0)]\
                       + [wordvoc.word2idx.get(char, 0) for char in tokenizer.tokenize(data["message"][100])]\
                       + [wordvoc.word2idx.get('[SEP]', 0)]])

In [46]:
encoder.eval()
decoder.eval()
d, h, c = encoder(sent.to(encoder.device))
indexes = torch.argmax(
    decoder(
        max_len=100,
        h=h.to(decoder.device)[:, -decoder.num_layers:, :], 
        c=c.to(decoder.device)[:, -decoder.num_layers:, :]), dim=-1).detach().cpu().numpy()[0]
list_of_char = []
for idx in indexes:
    if idx == wordvoc.word2idx['[SEP]']:
        break
    list_of_char.append(wordvoc.idx2word[idx])
print(''.join(list_of_char))

@dan##rega##n@lini##nini##S##S##S--


Маленькая сеть не позволяет получить что-то вразумительное, в основном либо этой самый популярный в датасете символ точки, либо какая-то бессмыслица. Единственное преимущество такой сети, что она учится быстро

In [47]:
indexes

array([  96, 1409, 1417,   75,   96, 1546, 1547, 1480, 1480, 1480,   80,
         80,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
          2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
          2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
          2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
          2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
          2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
          2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
          2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
          2])

In [51]:
for _ in range(10):
    indexes = torch.argmax(
        decoder(max_len=100,
                h=0.1*torch.randn(
                    (1, decoder.num_layers, decoder.num_direction*decoder.hidden_dim)).to(
                        decoder.device
                ), 
                c=torch.randn(
                    (1, decoder.num_layers, decoder.num_direction*decoder.hidden_dim)).to(
                    decoder.device
                )), dim=-1).detach().cpu().numpy()[0]
    list_of_char = []
    for idx in indexes:
        if idx == wordvoc.word2idx['[SEP]']:
            break
        list_of_char.append(wordvoc.idx2word[idx])
    print(''.join(list_of_char))

##uu##uu##uu
##ay
...
if..
##on
...
www
;.
just
aunt...


Тоже самое получается для случайных векторов

## Подбор гиперпараметров

Для удобство дальнейшего поиска по сетке представим параметры сети в виде словаря

In [18]:
config_encoder = dict()
config_encoder['vocab_dim'] = len(wordvoc.word2idx)
config_encoder['emb_dim'] = 100
config_encoder['hidden_dim'] = 30
config_encoder['num_layers'] = 4
config_encoder['bidirectional'] = False
config_encoder['dropout'] = 0.7
config_encoder['batch_norm'] = False

config_decoder= dict()
config_decoder['vocab_dim'] = len(wordvoc.word2idx)
config_decoder['output_dim'] = len(wordvoc.word2idx)
config_decoder['emb_dim'] = 100
config_decoder['hidden_dim'] = 30
config_decoder['num_layers'] = 4
config_decoder['bidirectional'] = False
config_decoder['dropout'] = 0.7
config_decoder['batch_norm'] = False
config_decoder['word2idx'] = wordvoc.word2idx

encoder, decoder = Encoder(**config_encoder), Decoder(**config_decoder)

In [19]:
import tensorflow as tf
from tensorboard.plugins.hparams import api as hp

2024-04-07 23:49:19.688774: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


В основном callback нам нужен для удобной записи в tensorboard

In [39]:
class callback():
    def __init__(self, writer, batch_generator, loss_function, metrics, hparams,  delimeter = 100, batch_size=64):
        self.step = 0
        self.writer = writer
        self.delimeter = delimeter
        self.loss_function = loss_function
        self.batch_size = batch_size
        self.hparams = hparams
        self.batch_generator = batch_generator
        self.metrics = metrics

    def forward(self, model, loss):
        self.step += 1
        with self.writer.as_default():
            hp.hparams(self.hparams)
            tf.summary.scalar('Loss', loss, self.step)
        
        if self.step % self.delimeter == 0:
            encoder, decoder = model
            pred = []
            real = []
            test_loss = 0
            encoder.eval()
            decoder.eval()
            total = 0
            for its, (x_batch, y_batch) in enumerate(self.batch_generator):
                x_batch = x_batch.to(device)
                y_batch = y_batch.to(device)
                with torch.no_grad():
                    d, h, c = encoder(x_batch.to(encoder.device))
                    output = decoder(
                        y_batch.to(decoder.device), 
                        h=h.to(decoder.device)[:, -decoder.num_layers:, :], 
                        c=c.to(decoder.device)[:, -decoder.num_layers:, :])

                local_loss = loss_function(output[:, :-1, :].transpose(1, 2), y_batch.to(decoder.device)[:, 1:]).cpu().item()
                test_loss += local_loss*len(x_batch)
                total += len(x_batch)
            if(total > 0):
                test_loss /= total
            with self.writer.as_default():
                tf.summary.scalar(self.metrics, test_loss, self.step)
          
    def __call__(self, model, loss):
        return self.forward(model, loss)

Optuna дает удобный интерфейс для оптимизации по гиперпараметрам

In [21]:
import optuna
import optuna_dashboard

In [22]:
optimizer = torch.optim.Adam(
    list(encoder.parameters()) + list(decoder.parameters()), lr=1e-3)
loss_function = torch.nn.CrossEntropyLoss(ignore_index=wordvoc.word2idx['[PAD]'])

hp нужен для записи значений гиперпараметров в tensorboard

In [49]:
HP_VOCAB_DIM = hp.HParam('vocab_dim', hp.IntInterval(1* 1000, 10* 1000))
HP_DROPOUT = hp.HParam('dropout', hp.RealInterval(0.5, 0.8))
HP_BATCH_NORM = hp.HParam('batch_norm', hp.Discrete([True, False]))
HP_HIDDEN_DIM = hp.HParam('hidden_dim', hp.IntInterval(5, 30))
HP_EMB_DIM = hp.HParam('emb_dim', hp.IntInterval(50, 100))
HP_NUM_LAYERS = hp.HParam('num_layers', hp.IntInterval(1, 4))
METRICS_NAME = "Accuracy"
DELIMETER = 100
IMAGE_SIZE = 28
EPOCHS = 50
BATCH_SIZE  = 16
COLORS = 1
MIN_CHANNELS = 4

In [24]:
logs_dir = 'logs/hparam_tuning'
writer_hparam = tf.summary.create_file_writer('logs/hparam_tuning')
with writer_hparam.as_default():
  hp.hparams_config(
    hparams=[HP_VOCAB_DIM,
      HP_EMB_DIM,
      HP_HIDDEN_DIM,
      HP_NUM_LAYERS,
      HP_BATCH_NORM,
      HP_DROPOUT],
    metrics=[hp.Metric(METRICS_NAME , display_name='Accuracy')],
  )

2024-04-07 23:49:30.609091: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:2d:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-04-07 23:49:30.609490: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2251] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [28]:
%load_ext tensorboard
%tensorboard --logdir logs/hparam_tuning

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 342712), started 0:00:23 ago. (Use '!kill 342712' to kill it.)

In [None]:
NUM_TRIALS = 10
def objective(trial):
    #получаем от optuna случайно полученные значения гиперпараметров
    run_number = trial.number
    run_name = logs_dir + f'/run-{run_number}'
    print("Current run: " + run_name)
    vocab_dim =  trial.suggest_int('vocab_dim', 70, 88) * 1000
    dropout = trial.suggest_float('dropout', 0.4, 0.6)
    emb_dim = trial.suggest_int('emb_dim', 5, 15)
    hidden_dim = trial.suggest_int('hidden_dim', 5, 15)
    batch_norm = trial.suggest_categorical('batch_norm', [True, False])
    num_layers = trial.suggest_int('layers_count', 2, 4)

    writer = tf.summary.create_file_writer(run_name)

    hparams = {
      HP_VOCAB_DIM: vocab_dim,
      HP_EMB_DIM:  emb_dim,
      HP_HIDDEN_DIM: hidden_dim,
      HP_NUM_LAYERS: num_layers,
      HP_BATCH_NORM: batch_norm,
      HP_DROPOUT: dropout
    }
    wordvoc.rebuild_vocabulary(hparams[HP_VOCAB_DIM])
    #Параметры декодера
    config_decoder = dict()
    config_decoder['vocab_dim'] = len(wordvoc.word2idx)
    config_decoder['output_dim'] = len(wordvoc.word2idx)
    config_decoder['emb_dim'] = hparams[HP_EMB_DIM]
    config_decoder['hidden_dim'] = hparams[HP_HIDDEN_DIM]
    config_decoder['num_layers'] = hparams[HP_NUM_LAYERS]
    config_decoder['bidirectional'] = False
    config_decoder['dropout'] = hparams[HP_DROPOUT]
    config_decoder['batch_norm'] = hparams[HP_BATCH_NORM]
    config_decoder['word2idx'] = wordvoc.word2idx
    #Параметры энеодера
    config_encoder = dict()
    config_encoder['vocab_dim'] = len(wordvoc.word2idx)
    config_encoder['emb_dim'] = hparams[HP_EMB_DIM]
    config_encoder['hidden_dim'] = hparams[HP_HIDDEN_DIM]
    config_encoder['num_layers'] = hparams[HP_NUM_LAYERS]
    config_encoder['bidirectional'] = False
    config_encoder['dropout'] = hparams[HP_DROPOUT]
    config_encoder['batch_norm'] = hparams[HP_BATCH_NORM]
    
    
    device = torch.device("cuda")
    encoder, decoder = Encoder(**config_encoder), Decoder(**config_decoder)
    encoder, decoder = encoder.to(device), decoder.to(device)

    optimizer = torch.optim.Adam(
      list(encoder.parameters()) + list(decoder.parameters()), lr=1e-3)
    loss_function = torch.nn.CrossEntropyLoss(ignore_index=wordvoc.word2idx['[PAD]'])

    loss = trainer(count_of_epoch = EPOCHS,
        batch_size = BATCH_SIZE,
        model = (encoder, decoder),
        dataset = data.head(n=10*64 + 10)["message"], 
        tokenizer=tokenizer,
        word2idx = wordvoc.word2idx,
        loss_function = loss_function,
        optimizer = optimizer, callback=callback(writer, batch_generator(data[40*64 + 100: 42*64 + 100]["message"].values, wordvoc.word2idx, tokenizer, BATCH_SIZE), prob_accuracy, METRICS_NAME , hparams, DELIMETER))

    
    print(loss)
    return loss
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=NUM_TRIALS)

Лучшие значнеия гиперпараметров

In [51]:
study.best_params

{'vocab_dim': 88,
 'dropout': 0.4971203614510987,
 'emb_dim': 14,
 'hidden_dim': 12,
 'batch_norm': True,
 'layers_count': 3}

Минимальный loss

In [52]:
study.best_value

6.271160375539254

## Вывод

Неожиданным результатом оказалось то, что на обученеи влиет во основном размер словаря, данных не достаточно чтобы сказать, что остальные гиперпараметры влияет на качество восстановления. Возможно это связано с тем, что изначально были подобраны оптимальные параметры модели.

## Результат обучения на подобранных параметрах

In [127]:


wordvoc.rebuild_vocabulary(90000)
config_decoder = dict()
config_decoder['vocab_dim'] = len(wordvoc.word2idx)
config_decoder['output_dim'] = len(wordvoc.word2idx)
config_decoder['emb_dim'] = 256
config_decoder['hidden_dim'] = 512
config_decoder['num_layers'] = 2
config_decoder['bidirectional'] = False
config_decoder['dropout'] = study.best_params['dropout']
config_decoder['batch_norm'] = study.best_params['batch_norm']
config_decoder['word2idx'] = wordvoc.word2idx

config_encoder = dict()
config_encoder['vocab_dim'] = len(wordvoc.word2idx)
config_encoder['emb_dim'] = 256
config_encoder['hidden_dim'] = 512
config_encoder['num_layers'] = 5
config_encoder['bidirectional'] = False
config_encoder['dropout'] = study.best_params['dropout']
config_encoder['batch_norm'] = study.best_params['batch_norm']

In [128]:
encoder, decoder = Encoder(**config_encoder), Decoder(**config_decoder)
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)
encoder.apply(init_weights)
decoder.apply(init_weights)
encoder, decoder = encoder.to(device), decoder.to(device)

In [129]:
optimizer = torch.optim.Adam(list(encoder.parameters()) + list(decoder.parameters()), lr=1e-3)
loss_function = torch.nn.CrossEntropyLoss(ignore_index=wordvoc.word2idx['[PAD]'])

In [130]:
trainer(count_of_epoch = 20,
        batch_size = 16,
        model = (encoder, decoder),
        dataset = data.head(n=100*64 + 10)["message"], 
        tokenizer=tokenizer,
        word2idx = wordvoc.word2idx,
        loss_function = loss_function,
        optimizer = optimizer)

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/401 [00:00<?, ?it/s]

  0%|          | 0/401 [00:00<?, ?it/s]

  0%|          | 0/401 [00:00<?, ?it/s]

  0%|          | 0/401 [00:00<?, ?it/s]

  0%|          | 0/401 [00:00<?, ?it/s]

  0%|          | 0/401 [00:00<?, ?it/s]

  0%|          | 0/401 [00:00<?, ?it/s]

  0%|          | 0/401 [00:00<?, ?it/s]

  0%|          | 0/401 [00:00<?, ?it/s]

  0%|          | 0/401 [00:00<?, ?it/s]

  0%|          | 0/401 [00:00<?, ?it/s]

  0%|          | 0/401 [00:00<?, ?it/s]

  0%|          | 0/401 [00:00<?, ?it/s]

  0%|          | 0/401 [00:00<?, ?it/s]

  0%|          | 0/401 [00:00<?, ?it/s]

KeyboardInterrupt: 

Сохраним сеть, чтобы потом можно ее восстановить

In [131]:
torch.save(encoder.state_dict(), 'encoder_opt.pt')
torch.save(decoder.state_dict(), 'decoder_opt.pt')

Загружаем сохраненную сеть

In [None]:
encoder.load_state_dict(torch.load('encoder_opt.pt'))
decoder.load_state_dict(torch.load('decoder_opt.pt'))

In [148]:
sent = torch.tensor([[wordvoc.word2idx.get(char, 0) for char in tokenizer.tokenize(data["message"][1200])]])

In [146]:
data["message"][1200]

'happy. spending time with mum'

In [149]:
encoder.eval()
decoder.eval()
d, h, c = encoder(sent.to(encoder.device))
indexes = torch.argmax(
    decoder(
        max_len=100,
        h=h.to(decoder.device)[:, -decoder.num_layers:, :], 
        c=c.to(decoder.device)[:, -decoder.num_layers:, :]), dim=-1).detach().cpu().numpy()[0]
list_of_char = []
for idx in indexes:
    if idx == wordvoc.word2idx['[SEP]']:
        break
    list_of_char.append(wordvoc.idx2word[idx])
print(''.join(list_of_char))

itsokayFallagoingtobed.


In [142]:
for _ in range(10):
    indexes = torch.argmax(
        decoder(max_len=100,
                h=0.1*torch.randn(
                    (1, decoder.num_layers, decoder.num_direction*decoder.hidden_dim)).to(
                        decoder.device
                ), 
                c=torch.randn(
                    (1, decoder.num_layers, decoder.num_direction*decoder.hidden_dim)).to(
                    decoder.device
                )), dim=-1).detach().cpu().numpy()[0]
    list_of_char = []
    for idx in indexes:
        if idx == wordvoc.word2idx['[SEP]']:
            break
        list_of_char.append(wordvoc.idx2word[idx])
    print(''.join(list_of_char))

inthesametime
checkedout
I'mstuckin
imiforwardto
imissher.
I'mstuckwith
:://tinyurl.com/x##q##q##x##x
Poutm##r
theco##ochi##k
haveagoodday


Ручная подборка параметров помогла достичь более осознаных предложений. Можно сделать вывод, что при подборке гиперпараметров также важна точна с которй мы начинаем. Для слишком маленькой сети модель просто обучается на самый распространенный символ, но начиная с какого-то порогого размера сети она уже обучается дальше.