**Нейросетевая языковая модель**

Евгений Борисов <esborisov@sevsu.ru>

In [1]:
# import re
# import gzip

# # загружаем текст ...
# file_name = '../data/dostoevsky-besy-p2.txt.gz'
# with gzip.open(file_name,'rt') as f:  
#     text = f.read()[105:] # ...и выкидываем заголовок

# print('символов:%i\n'%(len(text)))
# print(text[:364].strip())

In [2]:
# from random import sample

# from razdel import sentenize
# from razdel import tokenize

# tokens = [ 
#     [ w.text for w in tokenize(s.text) ] # разбиваем предложения на слова
#     for s in sentenize(text) # режем текст на отдельные предложения
# ]

# print('предложений: %i\n'%(len(tokens)))

# sample(tokens,2)

---

In [3]:
# from torchtext.vocab import build_vocab_from_iterator

# vocab = build_vocab_from_iterator(tokens, specials=['<unk>',])

In [4]:
# [ vocab[t] for t in tokens[1] ]
# vocab['<unk>']

In [5]:
# Dataset getitem
# DataLoader generate_batches

In [10]:
import gzip
# from random import sample
from razdel import sentenize
from razdel import tokenize
from torchtext.vocab import build_vocab_from_iterator

from torch.utils.data import Dataset

class TextDataset(Dataset):
    """Sentence-level dataset built from a gzip-compressed text file.

    The text is cut into sentences and every sentence into tokens; a
    vocabulary is then built over all tokens. Item ``idx`` is the list of
    token strings of sentence ``idx``.
    """

    def __init__(self, file_name):
        """Load ``file_name``, tokenize it and build the vocabulary.

        Args:
            file_name: path to a gzip-compressed text file.
        """
        super().__init__()
        self._UNK = '<unk>'
        self._data = self._tokenize(self._load(file_name))
        self._vocab = self._build_vocab(tokens=self._data, token_default=self._UNK)

    @staticmethod
    def _load(file_name, encoding='utf-8'):
        """Return the decompressed content of ``file_name`` as one string.

        Args:
            file_name: path to a gzip-compressed text file.
            encoding: text encoding used to decode the file.

        The encoding is passed explicitly: without it, text-mode decoding
        falls back to the platform locale, so the same corpus could be
        decoded differently (or fail to decode) on different machines.
        NOTE(review): assumes the corpus is stored as UTF-8 -- confirm.
        """
        with gzip.open(file_name, 'rt', encoding=encoding) as f:
            return f.read()

    @staticmethod
    def _tokenize(text):
        """Split ``text`` into sentences, each a list of token strings."""
        return [
            [w.text for w in tokenize(s.text)]  # split the sentence into words
            for s in sentenize(text)  # cut the text into separate sentences
        ]

    @staticmethod
    def _build_vocab(tokens, token_default):
        """Build a vocabulary over ``tokens`` with a fallback entry.

        Args:
            tokens: iterable of token lists (one list per sentence).
            token_default: special token whose index is set as the
                vocabulary's default index.

        Returns:
            The vocabulary object produced by ``build_vocab_from_iterator``.
        """
        vocab = build_vocab_from_iterator(tokens, specials=[token_default])
        vocab.set_default_index(vocab[token_default])
        return vocab

    def __getitem__(self, idx):
        """Return the token list of sentence ``idx``."""
        return self._data[idx]

    def __len__(self):
        """Return the number of sentences."""
        return len(self._data)

In [27]:
class ContextDataset(TextDataset):
    """Pairs every token of the corpus with the ids of the tokens before it.

    Item ``idx`` is ``(context, token)``: ``token`` is the vocabulary id of
    the ``idx``-th token of the corpus (sentences concatenated in order) and
    ``context`` is the list of ``context_deep`` ids that precede it inside
    its own sentence, left-padded with the id of the ``_UNK`` token.
    """

    def __init__(self, file_name, context_deep=3):
        """Build the token ids and their context windows.

        Args:
            file_name: path forwarded to ``TextDataset.__init__``.
            context_deep: number of preceding ids kept as context.
        """
        super().__init__(file_name)
        self._context_deep = context_deep
        # The two lists are index-aligned: one context window per token.
        self._tokens = self._flatten_sentences()
        self._contex = self._collect_context()

    def _flatten_sentences(self):
        """Return the vocabulary ids of all tokens as one flat list."""
        flat_ids = []
        for sentence in self._data:
            for token in sentence:
                flat_ids.append(self._vocab[token])
        return flat_ids

    def _collect_context(self):
        """Return the context windows of all sentences as one flat list."""
        windows = []
        for sentence in self._data:
            encoded = self._encode_tokens(sentence)
            windows.extend(self._collect_context_sentence(encoded))
        return windows

    def _encode_tokens(self, tokens):
        """Left-pad ``tokens`` with ``_UNK`` and map them to vocabulary ids."""
        padding = [self._UNK] * self._context_deep
        encoded = []
        for token in padding + tokens:
            encoded.append(self._vocab[token])
        return encoded

    def _collect_context_sentence(self, sentence):
        """Slice ``sentence`` into consecutive windows of ``context_deep`` ids.

        One window is produced per start position
        ``0 .. len(sentence) - context_deep - 1``.
        """
        width = self._context_deep
        window_count = len(sentence) - width
        windows = []
        for start in range(window_count):
            windows.append(sentence[start:start + width])
        return windows

    def __getitem__(self, idx):
        """Return the ``(context, token)`` pair at position ``idx``."""
        context = self._contex[idx]
        target = self._tokens[idx]
        return context, target

    def __len__(self):
        """Return the number of tokens in the corpus."""
        return len(self._tokens)

In [28]:
# Build the (context, token) dataset from the gzip-compressed corpus,
# keeping the default context_deep.
data = ContextDataset('../data/dostoevsky-besy-p2.txt.gz')
# Show one sample; the notebook displays the value of this bare expression.
data[3]

([6101, 3618, 3521], 6307)

In [32]:
from torch.utils.data import DataLoader

# Wrap the dataset in a loader that yields shuffled batches of 64 samples.
train_dataloader = DataLoader(data, batch_size=64, shuffle=True)

# Pull the first batch; the notebook displays the value of this bare expression.
next(iter(train_dataloader))

[[tensor([  102,     0,     9,     0,     1, 11270,     1,    41,     1,     0,
              5,    56,  5064,   214,    10,   337,     5,     0, 16294,  2498,
            260,     0,   167,   714,   991,   704,     0,     1,     0,     7,
              5,     3,     1,     3,  5592,     0,    10,     5,  1799,     1,
              7,     4,    47,  1510,     0,    72,  4018,     0,   709,    19,
             59,    13,   384,    11,    20,     0, 14405,     5,  6780,     0,
              3,   164,   204,  1491]),
  tensor([ 9381,     0,    78,    79,  4478, 12258,   302,    97,     7,    22,
             10,    18,  1509,    53,  3002,   378, 11849,     0,    13,  2520,
              7,     0,    32,    37,  3714,   937,   202,     3,     3,    12,
          16486,    59,    92,    79,     9,     0,   474,   890,    20,    37,
            132,   272,    24,    25,   147,   216,    13,     0,    51,   197,
            222,     5,  4104,    67,    47,     0,     1, 12382,    33,     3,


In [None]:
    
#     def __len__(self):
#         return len(self._tokens)    



In [None]:
# sentence = [
#  'Я-то',
#  'кой-куда',
#  'еще',
#  'выходил',
#  'и',
#  'по-прежнему',
#  'приносил',
#  'ему',
#  'разные',
#  'вести',
#  ',',
#  'без',
#  'чего',
#  'он',
#  'и',
#  'пробыть',
#  'не',
#  'мог',
#  '.',
# ]

# context_deep = 3

# UNK_KWD = '<unk>'

# sentence_ = [UNK_KWD,]*context_deep + sentence

# [ 
#  [ sentence_[i:i+context_deep] ]  
#  for i in range(len(sentence)) 
# ]




In [None]:
# len(data)
# next(iter(train_dataloader))

----

In [None]:
# vocab['отворил'],vocab['сам'],

In [None]:
# import torchtext
# from torchtext.data import Field
# from torchtext.data import BucketIterator
# from torchtext.data import TabularDataset

# en = spacy.load('en')
# fr = spacy.load('fr')

# def tokenize_en(sentence):
#     return [tok.text for tok in en.tokenizer(sentence)]

# def tokenize_fr(sentence):
#     return [tok.text for tok in fr.tokenizer(sentence)]

# EN_TEXT = Field(tokenize=tokenize_en)
# FR_TEXT = Field(tokenize=tokenize_fr, init_token = "<sos>", eos_token = "<eos>")

In [None]:
# len(voc)
# [ w for w in voc ]

---

In [None]:
# # from nltk.util import bigrams
# from nltk.util import ngrams as nltk_ngrams

# # вынимаем все n-gram из текста
# ngram_len = 3 # работаем с триграммами
# text_ngrams = [ ngram for s in text for ngram in nltk_ngrams(s,ngram_len) ]
# print('количество n-gram: %i'%(len(set(text_ngrams))))
# sample(text_ngrams,5)

----

In [None]:
# from nltk.util import flatten as nltk_flatten

# vocab = { w:i for i,w in enumerate(sorted(set(nltk_flatten(text)))) }
# print(len(vocab))

In [None]:
# [ 
#     [ vocab[w] for w in t ]
#     for t in text 
# ]

----

In [None]:
# from torchtext.data.utils import get_tokenizer
# from torchtext.vocab import build_vocab_from_iterator

# tokenizer = get_tokenizer('basic_english')

# train_iter = AG_NEWS(split='train')

# def yield_tokens(data_iter):
#     for _, text in data_iter:
#         yield tokenizer(text)

# vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
# vocab.set_default_index(vocab["<unk>"])