__Нейросетевая языковая модель на основе LSTM__ 

Евгений Борисов <esborisov@sevsu.ru>

Shivam Bansal   
Language Modelling and Text Generation using LSTMs — Deep Learning for NLP.    
Mar 26, 2018

https://medium.com/@shivambansal36/language-modelling-text-generation-using-lstms-deep-learning-for-nlp-ed36b224b275

In [1]:
# load the text: read the whole gzip-compressed UTF-8 corpus into a single string
import gzip
with gzip.open('../data/dostoevsky-besy-p2.txt.gz', mode='rt', encoding='utf-8') as f:
    data = f.read()
print(len(data))  # corpus size in characters

465595


In [2]:
from nltk.tokenize import sent_tokenize as nltk_sentence_split
from nltk.tokenize import word_tokenize as nltk_tokenize_word

EOS = '<EOS>'  # marker appended to the end of every sentence
PAD = '<PAD>'  # padding token

# cut the text into sentences, split every sentence into word tokens
# and close each token list with the EOS marker
text = [
    tokens + [EOS]
    for tokens in (
        nltk_tokenize_word(raw_sentence, language='russian')
        for raw_sentence in nltk_sentence_split(data, language='russian')
    )
]

# the raw string is not needed any more
del data

In [3]:
# from random import sample

# def get_sample(text,min_len=10):
#     for _ in range(100):
#         sentence = sample(text,1)[0]
#         # print(len(sentence),sentence)
#         if len(sentence)>min_len:
#             return ' '.join(sentence[:(min_len//2)] )
#     return '<empty>'

# get_sample(text)

In [4]:
import itertools

# vocabulary: every distinct token except the two service tokens, in sorted order
words = sorted( set(itertools.chain.from_iterable(text)).difference([PAD,EOS]) )

# ordinary words are numbered from 2, ids 0 and 1 are reserved for PAD and EOS
word2index = dict(zip(words, range(2, len(words)+2)))
del words

word2index.update({PAD:0, EOS:1})

# reverse lookup: id -> token
index2word = dict( (index,word) for word,index in word2index.items() )

# vocabulary size including the service tokens
total_words = len(word2index)

total_words

16660

In [5]:
# for every sentence build all of its prefixes of length 2..len(sentence),
# encoded as word ids; the last id of each prefix later becomes the target
n_grams = [
   list(map(word2index.__getitem__, sentence[:prefix_len]))
   for sentence in text
   for prefix_len in range(2,len(sentence)+1)
]

# the longest prefix defines the padded sequence length
max_sequence_len = max(map(len, n_grams))
max_sequence_len

113

In [6]:
import numpy as np
from keras.preprocessing.sequence import pad_sequences

# bring every n-gram to the common length max_sequence_len, padding on the left ('pre')
padded = pad_sequences(n_grams, maxlen=max_sequence_len, padding='pre')
input_sequences = np.array(padded)

# free the intermediate objects
del n_grams, padded

In [7]:
# input_sequences

In [8]:
import keras.utils as ku 
# split every padded n-gram: all ids but the last are the network input,
# the last id is the word to predict
inputs, targets = input_sequences[:,:-1],input_sequences[:,-1]
# one-hot encode the targets; use the public keras.utils.to_categorical entry point
# instead of the np_utils submodule, which newer Keras releases no longer provide.
# NOTE(review): the result is a dense (n_samples, total_words) float32 matrix
# (about 6 GB for the recorded run) - sparse integer targets with
# 'sparse_categorical_crossentropy' would avoid it, but need a matching change in model.compile
targets = ku.to_categorical(targets, num_classes=total_words)
del input_sequences

In [9]:
# sanity check: shapes and dtypes of the network inputs and of the one-hot targets
inputs.shape, inputs.dtype, targets.shape, targets.dtype

((92194, 112), dtype('int32'), (92194, 16660), dtype('float32'))

In [10]:
# length of the longest n-gram; the network input is one token shorter (the last token is the target)
max_sequence_len

113

In [11]:
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import LSTM
from keras.layers import Dense
# from keras.layers import Dropout

# two stacked LSTM layers on top of a trainable embedding;
# the softmax layer scores every word of the vocabulary as the next word
model = Sequential([
    Embedding(total_words, 32, input_length=inputs.shape[1]),
    LSTM(128, return_sequences=True),  # pass the whole sequence on to the second LSTM
    # Dropout(0.2),
    LSTM(64),
    Dense(targets.shape[1], activation='softmax'),
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

2022-03-22 17:45:04.508519: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-03-22 17:45:04.570806: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-03-22 17:45:04.570988: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-03-22 17:45:04.571429: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

In [12]:
# emb = Embedding(total_words, 16, input_length=inputs.shape[1])
# lstm = LSTM(128)
# dense = Dense(targets.shape[1], activation='softmax')
      
# x = inputs[:3]
# o = emb(x)
# o = lstm(o)
# o = dense(o)
# o.shape

In [13]:
# inputs[:3]

In [14]:
# emb(inputs[:3]).shape

In [15]:
# from tensorflow.keras.utils import plot_model
# plot_model(model,to_file='cnn.png', show_layer_names=True, show_shapes=True )
# print the layer-by-layer architecture with output shapes and parameter counts
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 112, 32)           533120    
                                                                 
 lstm (LSTM)                 (None, 112, 128)          82432     
                                                                 
 lstm_1 (LSTM)               (None, 64)                49408     
                                                                 
 dense (Dense)               (None, 16660)             1082900   
                                                                 
Total params: 1,747,860
Trainable params: 1,747,860
Non-trainable params: 0
_________________________________________________________________


In [16]:
from keras.callbacks import EarlyStopping
# Stop training when the monitored loss has not improved for 5 epochs in a row.
# The active model.fit call below is fed by a generator and has no validation data,
# so 'val_loss' never appears in the logs and the callback could never trigger;
# monitor the training loss instead.
earlystop = EarlyStopping(monitor='loss', min_delta=0, patience=5, verbose=0, mode='auto')

In [17]:
def batch_generator(inputs,targets,batch_size):
    """Yield consecutive (inputs, targets) slices of at most batch_size samples.

    Makes a single pass over the data in order. Every sample is yielded exactly once;
    the last batch may be shorter than batch_size when the number of samples is not
    a multiple of it. Nothing is yielded for empty data.

    Args:
        inputs: sliceable sequence of samples (e.g. a numpy array).
        targets: sliceable sequence of labels, aligned with inputs;
            its length defines the number of samples.
        batch_size: positive number of samples per batch.

    Raises:
        ValueError: if batch_size is smaller than 1 (raised when iteration starts).
    """
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer, got %r' % (batch_size,))
    # step through the data by batch_size; slicing clips the final, possibly partial, batch
    for start in range(0, len(targets), batch_size):
        stop = start + batch_size
        yield (
            inputs[start:stop],
            targets[start:stop],
        )

# g = batch_generator(inputs,targets,batch_size=10)
# next(g)

In [18]:
%%time 

if len(targets) == 0:
    raise ValueError('no training samples')

# roughly 100 batches per pass over the data; never let the batch size drop to 0
batch_size = max(1, len(targets)//100)
# number of batches that make up one epoch
steps_per_epoch = max(1, len(targets)//batch_size)

def repeat_batches():
    """Yield training batches forever, restarting batch_generator after every pass.

    A plain generator is exhausted after a single pass, which would leave
    every epoch after the first one without data.
    """
    while True:
        yield from batch_generator(inputs,targets,batch_size=batch_size)

# history = model.fit(
#     inputs, 
#     targets, 
#     epochs=2, 
#     verbose=1, 
#     callbacks=[earlystop],
#     validation_split=.1,
#     batch_size=batch_size,
# )

# history = model.fit( inputs, targets, epochs=2, verbose=1, batch_size=batch_size,)

history = model.fit( 
    repeat_batches(), 
    steps_per_epoch=steps_per_epoch,  # required: the generator itself never ends
    epochs=2, 
    verbose=1,
    callbacks=[earlystop],
#     validation_split=.1,  # not supported by Keras for generator input
)

Epoch 1/2


2022-03-22 17:45:09.336980: I tensorflow/stream_executor/cuda/cuda_dnn.cc:368] Loaded cuDNN version 8300


Epoch 2/2
CPU times: user 10.2 s, sys: 3.71 s, total: 13.9 s
Wall time: 14.2 s


-----

In [None]:
# from matplotlib import pyplot as plt

# score = model.evaluate(x_test, y_test, verbose=False)
# print('Test accuracy:', score[1])

# history_dict = history.history
# history_dict.keys()

# acc = history.history['accuracy']
# val_acc = history.history['val_accuracy']
# loss = history.history['loss']
# val_loss = history.history['val_loss']

# epochs = range(1, len(acc) + 1)
# plt.plot(epochs, loss, 'b', label='Training loss')
# plt.plot(epochs, val_loss, 'r', label='Validation loss')
# plt.title('Training and validation loss')
# plt.xlabel('Epochs')
# plt.ylabel('Loss')
# plt.legend()
# plt.grid()

# plt.show()

# plt.clf()   # clear figure
# acc_values = history_dict['accuracy']
# val_acc_values = history_dict['val_accuracy']

# plt.plot(epochs, acc, 'b', label='Training acc')
# plt.plot(epochs, val_acc, 'r', label='Validation acc')
# plt.title('Training and validation accuracy')
# plt.xlabel('Epochs')
# plt.ylabel('Accuracy')
# plt.legend()
# plt.grid()

# plt.show()

# ---

-----

In [None]:
from keras.preprocessing.text import Tokenizer
from nltk.tokenize import sent_tokenize as nltk_sentence_split

# shared Keras tokenizer: fitted inside dataset_preparation() and reused by the text generation loop
tokenizer = Tokenizer()

def dataset_preparation(data):
    """Turn raw text into next-word-prediction training data.

    The text is split into sentences, the module-level `tokenizer` is fitted on them,
    and every sentence is expanded into all of its prefixes of two or more tokens.
    Prefixes are left-padded to a common length; the last token of each prefix is
    the label, the preceding tokens are the predictors.

    NOTE(review): `tokenizer` is shared module state, so a repeated call presumably
    keeps extending the same vocabulary - confirm before calling this twice.

    Args:
        data: the corpus as a single string.

    Returns:
        Tuple (predictors, label, max_sequence_len, total_words):
        the padded input id matrix, the one-hot encoded labels,
        the padded prefix length (inputs are one shorter) and the number of classes.

    Raises:
        ValueError: if the text yields no sentence with at least two known tokens.
    """
    # split into sentences with the Russian model, like the first tokenization cell of this notebook
    corpus = nltk_sentence_split(data, language='russian')

    # data.lower().split("\n")

    # tokenization
    tokenizer.fit_on_texts(corpus)
    # +1 leaves room for index 0, which is used for padding
    # (assumes the tokenizer numbers words from 1 - TODO confirm)
    total_words = len(tokenizer.word_index) + 1

    # create input sequences: every prefix of length >= 2 of every encoded sentence
    input_sequences = [
        token_list[:end]
        for token_list in tokenizer.texts_to_sequences(corpus)
        for end in range(2, len(token_list)+1)
    ]
    if not input_sequences:
        raise ValueError('the text contains no sentence with at least two tokens')

    # pad sequences on the left up to the longest prefix
    max_sequence_len = max(map(len, input_sequences))
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

    # create predictors and label
    predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
    # public to_categorical entry point; the np_utils submodule is gone in newer Keras releases
    label = ku.to_categorical(label, num_classes=total_words)

    return predictors, label, max_sequence_len, total_words

In [None]:
# dataset_preparation(data)

In [None]:
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.callbacks import EarlyStopping
from keras.models import Sequential

def create_model(predictors, label, max_sequence_len, total_words):
    """Build, compile and train the two-layer LSTM next-word model.

    Args:
        predictors: padded input id matrix; rows are max_sequence_len-1 long.
        label: one-hot encoded next-word labels with total_words columns.
        max_sequence_len: padded n-gram length (input length + 1).
        total_words: number of output classes / embedding vocabulary size.

    Returns:
        The trained Keras model.
    """
    model = Sequential()
    model.add(Embedding(total_words, 10, input_length=max_sequence_len-1))
    model.add(LSTM(150, return_sequences = True))
    # model.add(Dropout(0.2))
    model.add(LSTM(100))
    model.add(Dense(total_words, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=5, verbose=0, mode='auto')
    # hold out 10% of the samples for validation: without validation data 'val_loss'
    # is never reported, early stopping cannot trigger and all 100 epochs always run
    model.fit(
        predictors, label,
        epochs=100, verbose=1,
        callbacks=[earlystop],
        validation_split=0.1,
    )
    # summary() prints the table itself and returns None, so it must not be wrapped in print()
    model.summary()
    return model 

In [None]:
import gzip
# gzip-compressed UTF-8 text of the corpus
FILE_DATA = '../data/dostoevsky-besy-p2.txt.gz'
with gzip.open(FILE_DATA, mode='rt', encoding='utf-8') as f:
    data = f.read()
print(len(data))  # corpus size in characters

In [None]:
# build the training set with the Keras Tokenizer pipeline; note that this rebinds
# max_sequence_len and total_words computed in the first half of the notebook
predictors, label, max_sequence_len, total_words = dataset_preparation(data)

In [None]:
# build and train the network; this rebinds the global `model` created in the first half of the notebook
model = create_model(predictors, label, max_sequence_len, total_words)

In [None]:
from random import sample
from nltk.tokenize import sent_tokenize as nltk_sentence_split
from nltk.tokenize import word_tokenize as nltk_tokenize_word

# tokenized sentences used only as a source of seed phrases for generation;
# use the Russian models, like the first tokenization cell of this notebook
text = [ 
    nltk_tokenize_word(s,language='russian') # split the sentence into words
    for s in nltk_sentence_split(data,language='russian') # cut the text into separate sentences
]

def get_sample(text,min_len=10):
    """Return the beginning of a randomly chosen sentence as a seed phrase.

    Up to 100 random draws are made from `text` (a list of token lists). The first
    drawn sentence with more than `min_len` tokens wins: its first `min_len//2`
    tokens are joined with spaces and returned. If no draw qualifies,
    the string '<empty>' is returned.
    """
    tries_left = 100
    while tries_left > 0:
        tries_left -= 1
        candidate, = sample(text,1)
        if len(candidate) <= min_len:
            # too short, draw again
            continue
        return ' '.join(candidate[:min_len//2])
    return '<empty>'

In [None]:
# seed phrase: the beginning of a random sentence
init_sentence = get_sample(text) 

sentence = init_sentence
answer = []

# how many words to append to the seed phrase
N_GENERATED_WORDS = 7

for _ in range(N_GENERATED_WORDS):
    # encode the current text and left-pad it to the network input length
    token_list = tokenizer.texts_to_sequences([sentence])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    # greedy decoding: take the most probable next word id
    probabilities = model.predict(token_list, verbose=0)[0]
    predicted_index = int(np.argmax(probabilities))
    # some class ids have no word (e.g. the padding id 0); a plain [] lookup
    # would raise KeyError, so stop generating instead
    predicted_word = tokenizer.index_word.get(predicted_index)
    if predicted_word is None:
        break
    answer.append(predicted_word)
    sentence+=' '+predicted_word

In [None]:
# show the seed phrase and the generated continuation, separated by ' | '
init_sentence + ' | ' + ' '.join(answer)

---

In [None]:
# tokenizer.fit_on_texts(text)

In [None]:
# total_words = len(tokenizer.word_index) + 1

In [None]:
# line = text[1]
# tokenizer.texts_to_sequences([line])[0]


# token_list[:i+1]
# for line in text
# for i,token_idx in enumerate( tokenizer.texts_to_sequences([line])[0] )


In [None]:
# input_sequences = []
# for line in text:
#     token_list = tokenizer.texts_to_sequences([line])[0]
#     for i in range(1, len(token_list)):
#         n_gram_sequence = token_list[:i+1]
#         input_sequences.append(n_gram_sequence)

In [None]:
# import numpy as np
# from keras.preprocessing.sequence import pad_sequences

# max_sequence_len = max([len(x) for x in input_sequences])
# input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
# # input_sequences

In [None]:
 # create predictors and label
# predictors, label = input_sequences[:,:-1],input_sequences[:,-1]

In [None]:
# import keras.utils as ku 
# label = ku.np_utils.to_categorical(label, num_classes=total_words)

In [None]:
# from keras.preprocessing.sequence import pad_sequences
# from keras.layers import Embedding, LSTM, Dense, Dropout
# from keras.preprocessing.text import Tokenizer
# from keras.callbacks import EarlyStopping
# from keras.models import Sequential
# import keras.utils as ku 
# import numpy as np 

In [None]:
# class Tokenizer: Text tokenization utility class.
# Functions
# hashing_trick(...): Converts a text to a sequence of indexes in a fixed-size hashing space.
# one_hot(...): One-hot encodes a text into a list of word indexes of size n.
# text_to_word_sequence(...): Converts a text to a sequence of words (or tokens).
# tokenizer_from_json(...): Parses a JSON tokenizer configuration file and returns a

In [None]:
# text.StateBasedSentenceBreaker 
# break_sentences(
#     doc
# )

In [None]:
# import tensorflow
# tensorflow.__version__

In [None]:
# from keras.preprocessing.text import Tokenizer
# class Tokenizer: Text tokenization utility class.
# Functions
# hashing_trick(...): Converts a text to a sequence of indexes in a fixed-size hashing space.
# one_hot(...): One-hot encodes a text into a list of word indexes of size n.
# text_to_word_sequence(...): Converts a text to a sequence of words (or tokens).
# tokenizer_from_json(...): Parses a JSON tokenizer configuration file and returns a



In [None]:
# from random import sample
# from nltk.tokenize import sent_tokenize as nltk_sentence_split
# from nltk.tokenize import word_tokenize as nltk_tokenize_word

# text = [ 
#     nltk_tokenize_word(s) # разбиваем предложения на слова
#     for s in nltk_sentence_split(text) # режем текст на отдельные предложения
# ]
# print('предложений: %i\n'%(len(text)))

# text = nltk_sentence_split(data)
# sample(text,2)