**Нейросетевая языковая модель**

Евгений Борисов <esborisov@sevsu.ru>

In [None]:
# Dataset getitem
# DataLoader generate_batches

In [None]:
import gzip
import numpy as np
# from random import sample

from razdel import sentenize
from razdel import tokenize

import torch
from torchtext.vocab  import build_vocab_from_iterator
from torch.utils.data import Dataset

In [None]:
class TextDataset(Dataset):
    """Sentence-level dataset built from a gzip-compressed text file.

    The text is split into sentences and every sentence into word tokens
    (via ``razdel``); a vocabulary is then built over all tokens with
    ``'<unk>'`` registered as the special/default entry.

    Indexing the dataset returns one sentence as a list of token strings.

    Args:
        file_name: path to the gzip-compressed text file.
        encoding: text encoding used to decode the file.
            NOTE(review): the corpus is assumed to be UTF-8 - confirm.
    """

    def __init__(self, file_name, encoding='utf-8'):
        super().__init__()
        self._UNK = '<unk>'
        raw_text = self._load(file_name, encoding=encoding)
        self._tokens = self._tokenize(raw_text)
        self._vocab = self._build_vocab(
            tokens=self._tokens,
            token_default=self._UNK,
        )

    @staticmethod
    def _load(file_name, encoding='utf-8'):
        """Read and return the whole decompressed file as one string."""
        # An explicit encoding keeps decoding independent of the platform
        # locale, which matters for non-ASCII (Cyrillic) text.
        with gzip.open(file_name, 'rt', encoding=encoding) as f:
            return f.read()

    @staticmethod
    def _tokenize(text):
        """Return a list of sentences, each a list of token strings."""
        return [
            [w.text for w in tokenize(s.text)]  # split a sentence into words
            for s in sentenize(text)  # cut the text into separate sentences
        ]

    @staticmethod
    def _build_vocab(tokens, token_default):
        """Build a vocabulary whose default index is that of ``token_default``."""
        vocab = build_vocab_from_iterator(tokens, specials=[token_default])
        # look-ups of unknown tokens resolve to the default token's index
        vocab.set_default_index(vocab[token_default])
        return vocab

    @property
    def vocabulary(self):
        """The vocabulary object built from the loaded text."""
        return self._vocab

    def __getitem__(self, idx):
        """Return the tokens of sentence number ``idx``."""
        return self._tokens[idx]

    def __len__(self):
        """Return the number of sentences."""
        return len(self._tokens)

In [None]:
class ContextDataset(TextDataset):
    """Dataset of (context window, next token) pairs for a language model.

    Every sentence is left-padded with ``context_deep - context_deep_min``
    ``'<unk>'`` tokens and cut into sliding windows of ``context_deep`` token
    indices. The target of a window is the token that immediately follows
    it, so the targets of a sentence are its tokens starting at position
    ``context_deep_min``. A sentence with at most ``context_deep_min`` tokens
    therefore contributes no samples.

    Args:
        file_name: path to the gzip-compressed text file.
        context_deep: length of every context window.
        context_deep_min: number of real (unpadded) tokens in the first
            context window of a sentence.

    Raises:
        ValueError: unless ``0 <= context_deep_min < context_deep``.
    """

    def __init__(self, file_name,context_deep=7,context_deep_min=2):
        # A negative minimum would make the target slice and the window count
        # disagree, so it is rejected together with context_deep_min >= context_deep.
        if context_deep_min < 0 or context_deep <= context_deep_min:
            raise ValueError(
                'expected 0 <= context_deep_min < context_deep, got '
                'context_deep=%r, context_deep_min=%r'
                % (context_deep, context_deep_min)
            )

        super().__init__(file_name)
        self._context_deep = context_deep
        self._context_deep_min = context_deep_min
        self._target = self._flatten_sentences(context_deep_min)
        self._context = self._collect_context(context_deep, context_deep_min)

    def _flatten_sentences(self,context_deep_min):
        """Return the target token indices of all sentences as one flat list."""
        return [
            self._vocab[t]
            for s in self._tokens
            for t in s[context_deep_min:]
        ]

    def _collect_context(self,context_deep,context_deep_min):
        """Return the context windows of all sentences as one flat list."""
        return [
            c
            for s in self._tokens
            for c in self._collect_context_sentence(s, context_deep, context_deep_min)
        ]

    def _collect_context_sentence(self,sentence,context_deep,context_deep_min):
        """Return the sliding context windows (lists of indices) of one sentence."""
        sentence_ = self._sentence_padding(sentence, context_deep, context_deep_min)
        # The last token is only ever a target, hence len - context_deep
        # windows; a non-positive count yields an empty list.
        context_count = len(sentence_) - context_deep
        return [sentence_[i:i + context_deep] for i in range(context_count)]

    def _sentence_padding(self,sentence,context_deep,context_deep_min=None):
        """Left-pad ``sentence`` with '<unk>' and map all tokens to indices.

        ``context_deep_min`` falls back to the value given to the constructor.
        """
        if context_deep_min is None:
            context_deep_min = self._context_deep_min
        pad = [self._UNK] * (context_deep - context_deep_min)
        return [self._vocab[t] for t in (pad + sentence)]

    def __getitem__(self, idx):
        """Return ``(context, target)``: int32 index array and float32 one-hot vector."""
        return (
            np.array(self._context[idx], dtype=np.int32),
            self._target_ohe(self._target[idx]),
        )

    def _target_ohe(self,target):
        """Return a float32 one-hot vector of vocabulary length for index ``target``."""
        # Build only the requested row instead of a full
        # len(vocab) x len(vocab) identity matrix for every sample.
        one_hot = np.zeros(len(self._vocab), dtype=np.float32)
        one_hot[target] = 1.0
        return one_hot

    def __len__(self):
        """Return the number of (context, target) samples."""
        return len(self._target)

In [None]:
# FIXME: check that the minimal sentence length is > context_deep_min

In [None]:
# build the (context, next token) dataset from the gzip-compressed corpus
dataset = ContextDataset(
    '../data/dostoevsky-besy-p2.txt.gz',
    context_deep=7,
)
vocabulary_len = len(dataset.vocabulary)

# report the vocabulary size and the number of training samples
print('vocabulary size:', vocabulary_len)
print('dataset size:', len(dataset))

---

In [None]:
import torch
import torch.nn as nn

In [None]:
class Model(nn.Module):
    """Next-token language model: embedding -> LSTM -> linear layer.

    Args:
        vocab_size: number of entries in the vocabulary (also the output size).
        emb_dim: dimensionality of the token embeddings.
        hid_dim: size of the LSTM hidden state.
    """

    def __init__(self, vocab_size, emb_dim, hid_dim):
        super().__init__()
        self._emb0 = nn.Embedding(vocab_size, emb_dim)
        # batch_first=True: inputs are (batch, sequence); without it the LSTM
        # would treat the batch axis as time and mix samples together.
        self._rnn0 = nn.LSTM(emb_dim, hid_dim, batch_first=True)
        self._lin0 = nn.Linear(hid_dim, vocab_size)
        self._smx0 = nn.Softmax(dim=1)

    def forward(self,x):
        """Return unnormalised scores (logits) of shape (batch, vocab_size).

        ``x`` holds token indices with shape (batch, sequence). Logits rather
        than probabilities are returned because ``nn.CrossEntropyLoss``
        applies log-softmax itself; use ``predict`` for probabilities.
        """
        o = self._emb0(x)
        o, _ = self._rnn0(o)
        # keep only the output of the last time step of every sequence
        return self._lin0(o[:, -1, :])

    def predict(self, x):
        """Return next-token probabilities of shape (batch, vocab_size)."""
        return self._smx0(self.forward(x))

# instantiate the language model over the dataset vocabulary
model = Model(
    vocab_size=vocabulary_len,
    emb_dim=(1024*8),
    hid_dim=1024,
)

In [None]:
from torch.utils.data import DataLoader

# pull a single shuffled batch and run it through the model
# to inspect the tensor shapes
sample_loader = DataLoader(dataset, batch_size=64, shuffle=True)
x, target = next(iter(sample_loader))
predicted = model.predict(x)

x.shape, predicted.shape, target.shape

In [None]:
# predicted,target

In [None]:
# nn.BCELoss()(predicted,target)
# nn.MSELoss()(predicted,target)
# nn.CrossEntropyLoss()(predicted,target)

----

In [None]:
# use the GPU when one is available, otherwise fall back to the CPU
device_name = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
device

In [None]:
# move the model to the selected device
model = model.to(device)

In [None]:
# from sklearn.metrics import accuracy_score
# from tqdm.notebook import tqdm as tqdm

# target    = []
# predicted = []

# with torch.set_grad_enabled(False):
#     for x,y in tqdm(train_dataloader):
#         predicted.append( np.argmax( model.predict(x.to(device)).cpu().numpy(),axis=1 ) )
#         target.append(y.numpy())
        
# accuracy_score( np.hstack( target ), np.hstack( predicted ) )

In [None]:
# from sklearn.metrics import accuracy_score

# def accuracy(x,target):
#     with torch.set_grad_enabled(False):
#         predicted = model.predict(x.to(device)).cpu().numpy()
#     return accuracy_score(np.argmax(target.numpy(),axis=1),np.argmax(predicted,axis=1))

In [None]:
# from tqdm.notebook import tqdm as tqdm

# with torch.set_grad_enabled(False):
#     for x,target in tqdm(test_dataloader):
#         acc_history = [ accuracy(x,target) ] # начальное значение погрешности

# acc_history = []

In [None]:
# loss function
# criterion = nn.MSELoss()
# criterion = nn.BCELoss()
criterion = nn.CrossEntropyLoss()

In [None]:
from torch import optim

# optimisation method for the loss function
# optimizer = optim.Adam(model.parameters(), lr=1e-2) 
# NOTE(review): lr=5e-1 is 50x the commented-out 1e-2 alternative above - confirm this step size is intended
optimizer = optim.Adam(model.parameters(), lr=5e-1) 


In [None]:
# with torch.set_grad_enabled(False):
#     loss = criterion( 
#             torch.Tensor(y_train).to(device), 
#             model.predict( torch.Tensor(X_train).to(device) ) 
#         ).cpu().numpy().flatten()[0]
    
# loss_history = [ loss ] # начальное значение ф-ции потери

# loss_history = [] # начальное значение ф-ции потери

In [None]:
from sklearn.metrics import accuracy_score

def accuracy(predicted,target):
    """Return the accuracy of per-row class scores against per-row targets.

    Both arguments are tensors; for each one the index of the row-wise
    maximum (axis 1) is taken as the class label before comparison.
    """
    true_labels = target.cpu().numpy().argmax(axis=1)
    # detach first: the prediction may still be attached to the autograd graph
    predicted_labels = predicted.detach().cpu().numpy().argmax(axis=1)
    return accuracy_score(true_labels, predicted_labels)

In [None]:
%%time

from tqdm.notebook import tqdm # рисует прогрессбар
from torch.utils.data import DataLoader # генератор батчей


loss_history = []
acc_history = []

# acc_min = .98 # порог минимально допустимой погрешности модели

n_epoch = 2 # количество эпох обучения

# epoch = tqdm(range(n_epoch))

for epoch in range(n_epoch): 
    
    batches = tqdm(DataLoader( dataset, batch_size=256, shuffle=True ) )
    
    for x,target in batches:
        o = model.forward( x.to(device) ) # считаем выход модели
        loss = criterion( target.to(device),o ) # вычисляем значение ф-ции потери
        loss_history.append(loss.item()) # дополняем историю изменения значений ф-ции потери
        optimizer.zero_grad() # очищаем предыдущее значение градиента
        loss.backward()  # вычисляем текущее значение градиента ф-ции потери
        optimizer.step() # корректируем параметры модели
        acc_history.append( accuracy(o,target) ) #значение погрешности

        batches.set_postfix({
            'loss':loss_history[-1], 
             'acc':acc_history[-1],
        })
        
    #if acc_history[-1] > acc_min: # проверяем достижение минимального порога погрешности модели
    #    print('step %i/%i: loss %.03f, acc threshold %.03f reached\n'%(i+1,n_epoch,loss_history[-1],acc_min))
    #    break
        


---

In [None]:
# model.predict(batch_input).shape

In [None]:
# em0 = nn.Embedding(vocabulary_len,1024)
# rc0  = nn.LSTM(1024, 32,)
# dense0 = nn.Linear(32,vocabulary_len) 
# smax0 = nn.Softmax(dim=1)

# o = smax0( dense0( rc0( em0(batch_input) )[0][:,-1,:] ) )

# o.shape

# # o[:,-1,:].shape, len(s), s[0].shape, s[1].shape

In [None]:
# smax1 = nn.Softmax(dim=1)
# x = torch.randn(2, 3)
# smax1(x)

In [None]:
# torch.Tensor( batch_input ) #, batch_target 

In [None]:
# model.predict(batch_input)

In [None]:
# batch_input 
# batch_target

In [None]:
# class LSTMTagger(nn.Module):

#     def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
#         super(LSTMTagger, self).__init__()
#         self.hidden_dim = hidden_dim

#         self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

#         # The LSTM takes word embeddings as inputs, and outputs hidden states
#         # with dimensionality hidden_dim.
#         self.lstm = nn.LSTM(embedding_dim, hidden_dim)

#         # The linear layer that maps from hidden state space to tag space
#         self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

#     def forward(self, sentence):
#         embeds = self.word_embeddings(sentence)
#         lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
#         tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
#         tag_scores = F.log_softmax(tag_space, dim=1)
#         return tag_scores

---

In [None]:
# import torch
# from torch import nn
# from torch import sigmoid

# class MLP(nn.Module): 
    
#     def __init__(self,input_size,output_size):
#         super().__init__()
#         self.dense1 = nn.Linear(input_size,10) # первый - обрабатывающий  слой 
#         self.dense2 = nn.Linear(10,5) # второй - обрабатывающий/скрытый слой
#         self.dense3 = nn.Linear(5,output_size) # третий - обрабатывающий/выходной слой

#     def forward(self,x):
#         o = sigmoid(self.dense1(x))
#         o = sigmoid(self.dense2(o))
#         o = sigmoid(self.dense3(o))
#         return o
    
#     def predict(self, x):    
#         return self.forward(x)
    
    
# model = MLP( input_size=X_train.shape[1], output_size=y_train.shape[1], )

# from torch import optim

# criterion = nn.MSELoss() # ф-ция потери
# optimizer = optim.Adam(model.parameters(), lr=1e-2) # метод оптимизации ф-ции потери


# from torch.utils.data import TensorDataset


# # пакуем данные в формат Torch
# dataset = TensorDataset(torch.Tensor(X_train), torch.Tensor(y_train) )


# # используем GPU если есть
# device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# device

# model = model.to(device)


# from sklearn.metrics import accuracy_score

# def accuracy(x,y):
#     with torch.set_grad_enabled(False):
#         o = np.argmax( model.predict(torch.Tensor(x).to(device)).cpu().numpy(),axis=1 )
#     return accuracy_score(y[:,1],o)

# acc_history = [ accuracy(X_train,y_train) ] # начальное значение погрешности



# with torch.set_grad_enabled(False):
#     loss = criterion( 
#             torch.Tensor(y_train).to(device), 
#             model.predict( torch.Tensor(X_train).to(device) ) 
#         ).cpu().numpy().flatten()[0]
    
# loss_history = [ loss ] # начальное значение ф-ции потери


# %%time

# from tqdm import tqdm # рисует прогрессбар
# from torch.utils.data import DataLoader # генератор батчей

# n_epoch = 500 # количество эпох обучения
# acc_min = .98 # порог минимально допустимой погрешности модели

# for i in tqdm(range(n_epoch)): 
    
#     for x,y in DataLoader(dataset, batch_size=len(y_train)//3, shuffle=True): # получаем батч учебных примеров
#         out = model.forward(x.to(device)) # считаем выход модели
#         loss = criterion( y.to(device),out ) # вычисляем значение ф-ции потери
#         loss_history.append(loss.item()) # дополняем историю изменения значений ф-ции потери
#         optimizer.zero_grad() # очищаем предыдущее значение градиента
#         loss.backward()  # вычисляем текущее значение градиента ф-ции потери
#         optimizer.step() # корректируем параметры модели
        
#     acc_history.append( accuracy(X_train,y_train) ) #значение погрешности
#     if acc_history[-1] > acc_min: # проверяем достижение минимального порога погрешности модели
#         print('step %i/%i: loss %.03f, acc threshold %.03f reached\n'%(i+1,n_epoch,loss_history[-1],acc_min))
#         break
        
        
# # история изменения значений погрешности модели
# plt.plot(acc_history,label='max acc=%.3f'%(max(acc_history)),c='r')
# plt.grid()
# plt.legend()


# # история изменения значений ф-ции потери
# plt.plot(loss_history,label='min loss=%.3f'%(min(loss_history)))
# plt.grid()
# plt.legend()

# with torch.set_grad_enabled(False):
#     s = model.predict( torch.Tensor(X_test).to(device)).cpu().numpy()[:,1]
    
# from sklearn.metrics import roc_curve
# from sklearn.metrics import auc

# fpr, tpr, thresholds = roc_curve( y_test[:,1], s )
# roc_auc = auc(fpr,tpr)

# plt.figure(figsize=(7,7))
# plt.grid(True)
# plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC AUC %0.2f' % roc_auc)
# plt.plot([0, 1], [0, 1], color='navy', lw=1, linestyle='--')
# plt.xlabel('False Positive Rate')
# plt.ylabel('True Positive Rate')
# plt.title('Receiver operating characteristic')
# plt.legend(loc="lower right")
# plt.show()    

---

In [None]:
#     def __len__(self):
#         return len(self._tokens)    



In [None]:
# sentence = [
#  'Я-то',
#  'кой-куда',
#  'еще',
#  'выходил',
#  'и',
#  'по-прежнему',
#  'приносил',
#  'ему',
#  'разные',
#  'вести',
#  ',',
#  'без',
#  'чего',
#  'он',
#  'и',
#  'пробыть',
#  'не',
#  'мог',
#  '.',
# ]

# context_deep = 3

# UNK_KWD = '<unk>'

# sentence_ = [UNK_KWD,]*context_deep + sentence

# [ 
#  [ sentence_[i:i+context_deep] ]  
#  for i in range(len(sentence)) 
# ]




In [None]:
# len(data)
# next(iter(train_dataloader))

----

In [None]:
# vocab['отворил'],vocab['сам'],

In [None]:
# import torchtext
# from torchtext.data import Field
# from torchtext.data import BucketIterator
# from torchtext.data import TabularDataset

# en = spacy.load('en')
# fr = spacy.load('fr')

# def tokenize_en(sentence):
#     return [tok.text for tok in en.tokenizer(sentence)]

# def tokenize_fr(sentence):
#     return [tok.text for tok in fr.tokenizer(sentence)]

# EN_TEXT = Field(tokenize=tokenize_en)
# FR_TEXT = Field(tokenize=tokenize_fr, init_token = "<sos>", eos_token = "<eos>")

In [None]:
# len(voc)
# [ w for w in voc ]

---

In [None]:
# # from nltk.util import bigrams
# from nltk.util import ngrams as nltk_ngrams

# # вынимаем все n-gram из текста
# ngram_len = 3 # работаем с триграммами
# text_ngrams = [ ngram for s in text for ngram in nltk_ngrams(s,ngram_len) ]
# print('количество n-gram: %i'%(len(set(text_ngrams))))
# sample(text_ngrams,5)

----

In [None]:
# from nltk.util import flatten as nltk_flatten

# vocab = { w:i for i,w in enumerate(sorted(set(nltk_flatten(text)))) }
# print(len(vocab))

In [None]:
# [ 
#     [ vocab[w] for w in t ]
#     for t in text 
# ]

----

In [None]:
# from torchtext.data.utils import get_tokenizer
# from torchtext.vocab import build_vocab_from_iterator

# tokenizer = get_tokenizer('basic_english')

# train_iter = AG_NEWS(split='train')

# def yield_tokens(data_iter):
#     for _, text in data_iter:
#         yield tokenizer(text)

# vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
# vocab.set_default_index(vocab["<unk>"])

In [None]:
# import re
# import gzip

# # загружаем текст ...
# file_name = '../data/dostoevsky-besy-p2.txt.gz'
# with gzip.open(file_name,'rt') as f:  
#     text = f.read()[105:] # ...и выкидываем заголовок

# print('символов:%i\n'%(len(text)))
# print(text[:364].strip())

In [None]:
# from random import sample

# from razdel import sentenize
# from razdel import tokenize

# tokens = [ 
#     [ w.text for w in tokenize(s.text) ] # разбиваем предложения на слова
#     for s in sentenize(text) # режем текст на отдельные предложения
# ]

# print('предложений: %i\n'%(len(tokens)))

# sample(tokens,2)

---

In [None]:
# from torchtext.vocab import build_vocab_from_iterator

# vocab = build_vocab_from_iterator(tokens, specials=['<unk>',])

In [None]:
# [ vocab[t] for t in tokens[1] ]
# vocab['<unk>']