# Рекурентные сети для обработки последовательностей

#### ДЗ 6. Провести сравнение RNN, LSTM, GRU на датасете отзывов (из предыдущих занятий/материалов)

In [13]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import datetime
from stop_words import get_stop_words
from string import punctuation
from pymorphy2 import MorphAnalyzer
from nltk.tokenize import word_tokenize
import regex as re
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from nltk.probability import FreqDist
import random
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Activation, Input, Embedding, Conv1D, GlobalMaxPool1D, SimpleRNN , LSTM, GRU, Masking
from tensorflow.keras.callbacks import TensorBoard 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
#from keras.objectives import categorical_crossentropy
from tensorflow.keras.callbacks import EarlyStopping 
from gensim.models import Word2Vec
import multiprocessing


In [15]:
seed = 1234
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)

# Constants
max_words = 2000
max_len = 50
epochs = 20
batch_size = 4096
vec_len = 10
num_cl = 2

sw = set(get_stop_words("ru"))
exclude = set(punctuation)
morpher = MorphAnalyzer()

# Load data
rev_df = pd.read_excel('./data/отзывы за лето.xls', parse_dates=[2])

# Specify fields under focus 
TARGET = 'target'
PROCESSED = 'preprocessed'

In [16]:
rev_df

Unnamed: 0,Rating,Content,Date
0,5,It just works!,2017-08-14
1,4,В целом удобноное приложение...из минусов хотя...,2017-08-14
2,5,Отлично все,2017-08-14
3,5,Стал зависать на 1% работы антивируса. Дальше ...,2017-08-14
4,5,"Очень удобно, работает быстро.",2017-08-14
...,...,...,...
20654,1,"Ну и шляпа,с роот правами бесполезная прога,ра...",2017-06-01
20655,5,Ок,2017-06-01
20656,4,Доволен,2017-06-01
20657,1,"Песопаснасть, рут ни нужын",2017-06-01


In [17]:
def preprocess_text(txt):
       
    txt = str(txt)
    txt = re.sub(r'( )+', " ", re.sub(r'[^\w\s]', " ", txt)).strip()
    txt = "".join(c for c in txt if c not in exclude)
    txt = txt.lower()
    txt = [morpher.parse(word)[0].normal_form for word in txt.split() if word not in sw]
    txt = " ".join(txt)
    txt = re.sub("\sне", "не", txt)
    return txt


def get_splits(data):
    
    #Разделение датасета на 3 сплита
        
    print(f"Preparing splits...", end=" ")
    
    # Split dataset on 3 folds
    train, test = train_test_split(data, test_size=0.6, shuffle=True, random_state=seed)
    valid, test = train_test_split(test, test_size=0.5, shuffle=True, random_state=seed)
    
    return train, valid, test   


def prepare_data(df: pd.DataFrame,
                 n_classes: int,
                 processed_field: str = PROCESSED,
                 target_field: str = TARGET) -> pd.DataFrame:
    
    assert n_classes in [2,5], 'Bad Number of classes'
    
    print(f"Data preparing...", end="")
    
    """
    Предобработка данных под тип классификатора. Преобразование таргета для one-hot encoding
    Ожидает на вход n_classes: 2 | 5
    """
    
    
    df[processed_field] = df['Content'].apply(preprocess_text)
    
    if n_classes == 2:
        df = df[df['Rating'] != 3]
        df[target_field] = (df['Rating'] > 3).astype('uint8')
        
    else:
        df[target_field] = df['Rating'] - 1
    
    return df


In [7]:
def build_train_voc(df_train, processed_field=PROCESSED):
    
    # Train словарь из выбранных токенов
       
    
    train_corpus = " ".join(df_train[processed_field])
   
    tokens = word_tokenize(train_corpus)
    
    tokens_filtered = [word for word in tokens if word.isalnum()]
    dist = FreqDist(tokens_filtered)
    tokens_filtered_top = [pair[0] for pair in dist.most_common(max_words-1)]
    
    print('Top-10 filtered tokens:', tokens_filtered_top[:10])
    
    vocabulary = {v: k for k, v in dict(enumerate(tokens_filtered_top, 1)).items()}
    
    return vocabulary


def text_to_sequence(text, maxlen):
              
    result = []
    tokens = word_tokenize(text.lower())
    tokens_filtered = [word for word in tokens if word.isalnum()]
    for word in tokens_filtered:
        if word in vocabulary:
            result.append(vocabulary[word])
    padding = [0]*(maxlen-len(result))
    return padding + result[-maxlen:]



def plot_history(history):
    
    #Потери и точность модели  
        
    fig, ax = plt.subplots(1,2, figsize=(16,4))

    ax[0].plot(history.history['loss'], label='train loss')
    ax[0].plot(history.history['val_loss'], label='validation loss')
    ax[0].set_xlabel('Epochs')
    ax[0].set_ylabel('Loss')
    ax[0].legend()

    ax[1].plot(history.history['accuracy'], label='train accuracy')
    ax[1].plot(history.history['val_accuracy'], label='validation accuracy')
    ax[1].set_xlabel('Epochs')
    ax[1].set_ylabel('Accuracy')
    ax[1].legend()

    plt.show()

In [41]:
def model_run(memory_cells_type: callable,
                num_cl: int,
                max_words: int,
                train_len: int
               ):
    
    """
    Построение рекурентной модели    
    num_cl   Количество классов модели
    max_words Максимальное количество слов в текстовой последовательности
    max_len   Длина вектора последовательностей
        
    """
    model = Sequential()

    model.add(
        Embedding(input_dim=max_words,
                  input_length=train_len,
                  output_dim=30,
                  trainable=True,
                  mask_zero=True))
    model.add(Masking(mask_value=0.0))

    model.add(memory_cells_type(64))
    model.add(Dense(64, activation='tanh'))
    model.add(Dropout(0.3))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(
        optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    
    print("Done. Model Summary:")
    print(model.summary())
    
    return model

In [18]:
data = prepare_data(df = rev_df, n_classes = num_cl)
df_train, df_val, df_test = get_splits(data)


text_train = df_train[PROCESSED].values
text_valid = df_val[PROCESSED].values
text_test = df_test[PROCESSED].values

tokenizer = Tokenizer(num_words=None, 
                      lower = False,
                      split = ' ')
tokenizer.fit_on_texts(text_train)

sequences_train = tokenizer.texts_to_sequences(text_train)
sequences_val = tokenizer.texts_to_sequences(text_valid)
sequences_test = tokenizer.texts_to_sequences(text_test)

word_cnt = len(tokenizer.index_word) + 1
training_length = max([len(i.split()) for i in text_train])

X_train = pad_sequences(sequences_train, maxlen=training_length)
X_valid = pad_sequences(sequences_val, maxlen=training_length)
X_test = pad_sequences(sequences_test, maxlen=training_length)

y_train = df_train[TARGET].values
y_val = df_val[TARGET].values
y_test = df_test[TARGET].values

Data preparing...

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[target_field] = (df['Rating'] > 3).astype('uint8')


Preparing splits... 

In [42]:
model_RNN = model_run(memory_cells_type=SimpleRNN,
                    num_cl=num_cl,
                    max_words=word_cnt,
                    train_len=training_length)

Done. Model Summary:
Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 91, 30)            153120    
_________________________________________________________________
masking_8 (Masking)          (None, 91, 30)            0         
_________________________________________________________________
simple_rnn_6 (SimpleRNN)     (None, 64)                6080      
_________________________________________________________________
dense_14 (Dense)             (None, 64)                4160      
_________________________________________________________________
dropout_7 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_15 (Dense)             (None, 1)                 65        
Total params: 163,425
Trainable params: 163,425
Non-trainable params: 0
___________________________

In [43]:
early_stopping=EarlyStopping(monitor='val_loss')  


history = model_RNN.fit(X_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.2,
                    callbacks=[early_stopping])

Epoch 1/20
Epoch 2/20
Epoch 3/20


In [55]:
score_RNN = model_RNN.evaluate(X_valid, y_val, batch_size=512, verbose=1)
print('\n')
print('Test score:', score_RNN[0])
print('Test accuracy:', score_RNN[1])



Test score: 0.42980125546455383
Test accuracy: 0.8419986367225647


## LSTM

In [45]:
model_LSTM = model_run(memory_cells_type=LSTM,
                    num_cl=num_cl,
                    max_words=word_cnt,
                    train_len=training_length)

Done. Model Summary:
Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 91, 30)            153120    
_________________________________________________________________
masking_9 (Masking)          (None, 91, 30)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                24320     
_________________________________________________________________
dense_16 (Dense)             (None, 64)                4160      
_________________________________________________________________
dropout_8 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_17 (Dense)             (None, 1)                 65        
Total params: 181,665
Trainable params: 181,665
Non-trainable params: 0
___________________________

In [46]:
early_stopping=EarlyStopping(monitor='val_loss')  

history = model_LSTM.fit(X_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.2,
                    callbacks=[early_stopping])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20


In [57]:
score_LSTM = model_LSTM.evaluate(X_valid, y_val, batch_size=512, verbose=1)
print('\n')
print('Test score:', score_LSTM[0])
print('Test accuracy:', score_LSTM[1])



Test score: 0.4343211352825165
Test accuracy: 0.8419986367225647


## GRU

In [47]:
model_GRU = model_run(memory_cells_type=GRU,
                    num_cl=num_cl,
                    max_words=word_cnt,
                    train_len=training_length)

Done. Model Summary:
Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 91, 30)            153120    
_________________________________________________________________
masking_10 (Masking)         (None, 91, 30)            0         
_________________________________________________________________
gru (GRU)                    (None, 64)                18432     
_________________________________________________________________
dense_18 (Dense)             (None, 64)                4160      
_________________________________________________________________
dropout_9 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_19 (Dense)             (None, 1)                 65        
Total params: 175,777
Trainable params: 175,777
Non-trainable params: 0
__________________________

In [48]:
early_stopping=EarlyStopping(monitor='val_loss')  


history = model_GRU.fit(X_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[early_stopping])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20


In [58]:
score_GRU = model_GRU.evaluate(X_valid, y_val, batch_size=512, verbose=1)
print('\n')
print('Test score:', score_GRU[0])
print('Test accuracy:', score_GRU[1])



Test score: 0.37918421626091003
Test accuracy: 0.8419986367225647


## Сравнение результатов

In [60]:
%%time

src = model_RNN.predict(X_test)
print(f'Test score: {round(roc_auc_score(y_test, src),3)}')

Test score: 0.822
CPU times: user 2.45 s, sys: 262 ms, total: 2.71 s
Wall time: 1.01 s


In [61]:
%%time

src = model_LSTM.predict(X_test)
print(f'Test score: {round(roc_auc_score(y_test, src),3)}')

Test score: 0.885
CPU times: user 6.49 s, sys: 448 ms, total: 6.94 s
Wall time: 3.48 s


In [62]:
%%time

src = model_GRU.predict(X_test)
print(f'Test score: {round(roc_auc_score(y_test, src),3)}')

Test score: 0.899
CPU times: user 4.9 s, sys: 284 ms, total: 5.18 s
Wall time: 2.57 s


Самый быстрый и неточный RNN, лучший GRU, при этом не такой медленный как LSTM