In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Input, Embedding, Conv1D, GlobalMaxPool1D, SimpleRNN, LSTM, GRU, Masking
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.callbacks import TensorBoard 
from keras.metrics import categorical_crossentropy
from keras.callbacks import EarlyStopping  

In [2]:
df = pd.read_excel('/content/drive/MyDrive/Colab Notebooks/GB_NLP/отзывы за лето.xls')
df.head(5)

Unnamed: 0,Rating,Content,Date
0,5,It just works!,2017-08-14
1,4,В целом удобноное приложение...из минусов хотя...,2017-08-14
2,5,Отлично все,2017-08-14
3,5,Стал зависать на 1% работы антивируса. Дальше ...,2017-08-14
4,5,"Очень удобно, работает быстро.",2017-08-14


In [3]:
df.Rating.unique()

array([5, 4, 2, 3, 1])

#### Препроцессинг текста

In [4]:
# !pip install razdel

In [5]:
# !pip install pymorphy2

In [6]:
from razdel import tokenize

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords as sw

from pymorphy2 import MorphAnalyzer
import string

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
morph = MorphAnalyzer()

In [8]:
punkt = set(string.punctuation)

In [9]:
stopwords_rus = set(sw.words('russian'))

In [10]:
def do_rem_sw(text):

  return " ".join([_.text for _ in tokenize(text) if _.text not in stopwords_rus])

def do_normalize(text):
  return " ".join([morph.parse(_)[0].normal_form for _ in text.split() if _ not in punkt])


def do_preprocessing(text):
  text = str(text)
  text = text.lower()
  if len(text)>0:
    text = do_rem_sw(text)
    text = do_normalize(text)
  return text

In [11]:
do_preprocessing(df.Content[1])

'целое удобноной приложение ... минус хотеть слишком большой доступ персональный данные телефон приходиться пользоваться ограниченный режим'

In [12]:
df['Content_prep'] = df.Content.apply(lambda x: do_preprocessing(x))
df.head(5)

Unnamed: 0,Rating,Content,Date,Content_prep
0,5,It just works!,2017-08-14,it just works
1,4,В целом удобноное приложение...из минусов хотя...,2017-08-14,целое удобноной приложение ... минус хотеть сл...
2,5,Отлично все,2017-08-14,отлично
3,5,Стал зависать на 1% работы антивируса. Дальше ...,2017-08-14,стать зависать 1 работа антивирус далёкий нику...
4,5,"Очень удобно, работает быстро.",2017-08-14,очень удобно работать быстро


### Разделим рейтинг на 2 класса

In [26]:
df['Class'] = 0
df.loc[df.Rating > 2, 'Class'] = 1
df.loc[df.Rating <= 2, 'Class'] = 0
df.head(3)

Unnamed: 0,Rating,Content,Date,Content_prep,Class
0,5,It just works!,2017-08-14,it just works,1
1,4,В целом удобноное приложение...из минусов хотя...,2017-08-14,целое удобноной приложение ... минус хотеть сл...,1
2,5,Отлично все,2017-08-14,отлично,1


#### Разделим на train и test

In [34]:
X_train, X_val, y_train, y_val = train_test_split(df.Content_prep, df.Class, random_state=42, stratify=df.Rating, test_size=.3)
X_train.shape, X_val.shape

((14461,), (6198,))

In [35]:
tokenizer = Tokenizer(num_words=None, 
                     filters='#$%&()*+-<=>@[\\]^_`{|}~\t\n',
                     lower = False, split = ' ')
tokenizer.fit_on_texts(X_train)

sequences_train = tokenizer.texts_to_sequences(X_train)
sequences_val = tokenizer.texts_to_sequences(X_val)

word_count = len(tokenizer.index_word) + 1
training_length = max([len(i.split()) for i in X_train])

X_train = pad_sequences(sequences_train, maxlen=training_length)
X_valid = pad_sequences(sequences_val, maxlen=training_length)

In [32]:
num_classes = 2

In [36]:
y_train = keras.utils.to_categorical(y_train, num_classes)
y_val = keras.utils.to_categorical(y_val, num_classes)


### Обучим на сверточной сетке

In [55]:
model = Sequential()
model.add(Embedding(input_dim=word_count, output_dim=128, input_length=training_length))
model.add(Conv1D(128, 3))
model.add(GlobalMaxPool1D())
model.add(Dense(10))
model.add(Dropout(0.25))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

In [56]:
model.compile(
    optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [57]:
early_stopping=EarlyStopping(monitor='val_loss')  


history = model.fit(X_train, y_train,
                    batch_size=512,
                    epochs=10,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


#### Обучим на RNN

In [62]:
model_rnn = Sequential()

model_rnn.add(
    Embedding(input_dim=word_count,
              input_length=training_length,
              output_dim=128,
              trainable=True,
              mask_zero=True))
model_rnn.add(Masking(mask_value=0.0))

model_rnn.add(SimpleRNN(64))
model_rnn.add(Dense(64, activation='relu'))
model_rnn.add(Dropout(0.25))
model_rnn.add(Dense(num_classes, activation='softmax'))

model_rnn.compile(
    optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [63]:
early_stopping=EarlyStopping(monitor='val_loss')  


history = model_rnn.fit(X_train, y_train,
                    batch_size=512,
                    epochs=10,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


Применение rnn в данном случае не дало большого прироста

#### Попробуем LSTM

In [60]:
model_lstm = Sequential()

model_lstm.add(
    Embedding(input_dim=word_count,
              input_length=training_length,
              output_dim=30,
              trainable=True,
              mask_zero=True))
model_lstm.add(Masking(mask_value=0.0))
model_lstm.add(LSTM(64, recurrent_dropout=0.2))
model_lstm.add(Dense(64, activation='relu'))
model_lstm.add(Dropout(0.25))
model_lstm.add(Dense(num_classes, activation='softmax'))

model_lstm.compile(
    optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

early_stopping=EarlyStopping(monitor='val_loss')  


history = model_lstm.fit(X_train, y_train,
                    batch_size=512,
                    epochs=10,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[early_stopping])



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


Применение LSTM дало небольшой прирост оценки

#### Попробуем объединить свертку и lstm

In [91]:
model_conv_lstm = Sequential()

model_conv_lstm.add(
    Embedding(input_dim=word_count,
              input_length=training_length,
              output_dim=128,
              trainable=True,
              mask_zero=True))
model_conv_lstm.add(Masking(mask_value=0.0))
model_conv_lstm.add(Conv1D(128, 3, padding='same'))
model_conv_lstm.add(Dense(64, activation='relu'))
model_conv_lstm.add(LSTM(64, recurrent_dropout=0.2))
model_conv_lstm.add(Dropout(0.25))
model_conv_lstm.add(Dense(num_classes, activation='softmax'))

model_conv_lstm.compile(
    optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

early_stopping=EarlyStopping(monitor='val_loss')  


history = model_conv_lstm.fit(X_train, y_train,
                    batch_size=512,
                    epochs=10,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[early_stopping])



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10


В данном примере чистый LSTM показал лучший результат. 