In [1]:
import string

import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import spacy

from gensim.models import Word2Vec

from google.colab import drive

from keras.callbacks import ReduceLROnPlateau, EarlyStopping
from keras.layers import Activation, Dense, Dropout, Embedding, LSTM
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
!pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz

Collecting https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone


# Монтирование гугл-диска, установка констант, чтение датасета


In [3]:
# Mount Google Drive into the Colab filesystem so the dataset can be read from it.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Fetch the NLTK resources used below: 'punkt' for word_tokenize and
# 'stopwords' for the English stop-word list.
# NOTE(review): newer NLTK releases reportedly look up 'punkt_tab' instead of
# 'punkt' — confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# Dataset location. The path is relative, so it is resolved against the
# current working directory.
# NOTE(review): presumably the notebook runs with cwd=/content (where Drive is
# mounted) — confirm.
DATA_DIR = r'drive/MyDrive/Учеба/ОиРС/Семестр 8/Тексты/'
DS_FILENAME = r'restaurant_reviews.csv'

# spaCy English pipeline; no other cell visible in this notebook references it.
nlp = spacy.load('en_core_web_sm')
# Tokens discarded as punctuation: the ASCII punctuation characters plus the
# multi-dot and quote tokens listed here.
PUNCTUATION = set(string.punctuation).union({'..', '...', '....', '.....', '``', "''"})
# English stop words from NLTK, kept as a set for membership tests.
STOP_WORDS = set(stopwords.words('english'))

In [6]:
# Read the comma-separated reviews file (comma is the pandas default separator)
# and show the resulting frame.
reviews = pd.read_csv(f'{DATA_DIR}{DS_FILENAME}')
reviews

Unnamed: 0,review,is_good
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
...,...,...
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0


In [7]:
# Column dtypes and non-null counts of the loaded dataset.
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   review   1000 non-null   object
 1   is_good  1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


In [8]:
# Class balance check: number of positive reviews vs. total number of reviews.
print(f'Число положительных отзывов: {reviews["is_good"].sum()}\n'
      f'Всего отзывов: {reviews.shape[0]}')

Число положительных отзывов: 500
Всего отзывов: 1000


# Подготовка датасета к обучению, обучение модели Word2Vec

In [9]:
def preprocess_text(text):
  """Tokenize a raw review and drop tokens that carry no meaning.

  The text is lower-cased, '/' is treated as a word separator, and the result
  is split with NLTK's word_tokenize. Stop words (STOP_WORDS), punctuation
  tokens (PUNCTUATION) and numeric tokens are then removed.

  Args:
    text: Raw review text (str).

  Returns:
    List of the remaining lower-case tokens, in their original order.
  """
  def is_numeric(token):
    # float() also parses the words 'nan', 'inf' and 'infinity'; require at
    # least one digit so such ordinary words are not discarded as numbers.
    if not any(ch.isdigit() for ch in token):
      return False
    try:
      float(token)
    except ValueError:
      return False
    return True

  words = word_tokenize(text.replace('/', ' ').lower())
  return [word for word in words
          if word not in STOP_WORDS and word not in PUNCTUATION and not is_numeric(word)]

In [10]:
def stamming(text):
  """Stem a sequence of tokens and join the stems into one string.

  Empty tokens and tokens found in STOP_WORDS are skipped; every remaining
  token is reduced to its stem with the English Snowball stemmer.

  Args:
    text: Iterable of word tokens (despite the name, not a raw string).

  Returns:
    The stems joined by single spaces.
  """
  snowball = SnowballStemmer('english')
  kept = filter(lambda token: token and token not in STOP_WORDS, text)
  return ' '.join(map(snowball.stem, kept))

In [11]:
# Replace every raw review with its cleaned, stemmed form (stems joined by spaces).
# NOTE: the 'review' column is overwritten, so re-running this cell feeds
# already-stemmed text through the pipeline again.
reviews['review'] = (
    reviews['review']
    .astype('object')
    .map(lambda raw: stamming(preprocess_text(raw)))
)
reviews

Unnamed: 0,review,is_good
0,wow love place,1
1,crust good,0
2,tasti textur nasti,0
3,stop late may bank holiday rick steve recommen...,1
4,select menu great price,1
...,...,...
995,think food flavor textur lack,0
996,appetit instant gone,0
997,overal impress would go back,0
998,whole experi underwhelm think ll go ninja sush...,0


In [12]:
# Hold out 30% of the reviews for testing; the fixed random_state keeps the split repeatable.
reviews_train, reviews_test = train_test_split(reviews, test_size=0.3, random_state=42)

In [13]:
# Train Word2Vec on the training reviews only: one list of tokens per review.
word_collection = [rev.split() for rev in reviews_train['review']]
# 300-dimensional vectors, context window of 3; min_count=5 excludes words seen fewer than 5 times.
wv_model = Word2Vec(vector_size=300, window=3, min_count=5, workers=8)
wv_model.build_vocab(word_collection)
wv_model.train(word_collection, total_examples=len(word_collection), epochs=32)

(47134, 129824)

In [14]:
# Build the word -> integer index from the training reviews only.
tzer = Tokenizer()
tzer.fit_on_texts(reviews_train['review'])
# One extra slot beyond the indexed words, used as the vocabulary size of the
# embedding matrix and the Embedding layer.
total_words = len(tzer.word_index) + 1

In [15]:
# Convert each review to a sequence of word indices and pad/truncate it to 300 entries.
x_train = pad_sequences(tzer.texts_to_sequences(reviews_train['review']), maxlen=300)
x_test = pad_sequences(tzer.texts_to_sequences(reviews_test['review']), maxlen=300)

In [16]:
# Encode the target labels as column vectors of shape (n_samples, 1).
# The encoder is fitted on the training labels only and then applied to both
# splits; y_test must come from reviews_test so that it lines up with x_test.
encoder = LabelEncoder()
encoder.fit(reviews_train['is_good'].tolist())
y_train = encoder.transform(reviews_train['is_good'].tolist()).reshape(-1, 1)
y_test = encoder.transform(reviews_test['is_good'].tolist()).reshape(-1, 1)

In [17]:
# Initial embedding weights: one 300-dimensional row per tokenizer index.
# Rows of words missing from the Word2Vec vocabulary stay all-zero.
embedding_matrix = np.zeros((total_words, 300))
for token, row in tzer.word_index.items():
  if token not in wv_model.wv:
    continue
  embedding_matrix[row] = wv_model.wv[token]

# Конфигурирование нейросети и обучение

In [18]:
# LSTM sentiment classifier on top of an embedding layer initialised from Word2Vec.
model = Sequential()
# Embedding weights start from embedding_matrix and are fine-tuned (trainable=True).
model.add(Embedding(total_words, 300, weights=[embedding_matrix], input_length=300, trainable=True))
model.add(Dropout(0.2))
model.add(LSTM(100, dropout=0.2))
# NOTE(review): no activation is given, so this layer is linear — confirm that is intended.
model.add(Dense(50))
# Single sigmoid unit: output is read as the probability of the positive class.
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# The metric is registered as 'accuracy', so its validation counterpart is
# logged as 'val_accuracy'. Monitoring 'val_acc' would leave early stopping
# without a value to watch, i.e. it would never trigger.
callbacks = [ReduceLROnPlateau(monitor='val_loss', patience=5, cooldown=4),
             EarlyStopping(monitor='val_accuracy', min_delta=1e-4, patience=5)]

In [19]:
# Train for up to 16 epochs in batches of 32; 20% of the training data is held
# out for validation, which feeds the 'val_*' quantities watched by the callbacks.
history = model.fit(x_train, y_train,
                    batch_size=32,
                    epochs=16,
                    validation_split=0.2,
                    verbose=1,
                    callbacks=callbacks)

Epoch 1/16



Epoch 2/16



Epoch 3/16



Epoch 4/16



Epoch 5/16



Epoch 6/16



Epoch 7/16



Epoch 8/16



Epoch 9/16



Epoch 10/16



Epoch 11/16



Epoch 12/16



Epoch 13/16



Epoch 14/16



Epoch 15/16



Epoch 16/16





# Анализ результатов

In [20]:
# Make DataFrame.plot() render with plotly instead of the default backend.
pd.options.plotting.backend = 'plotly'

In [21]:
# Per-epoch training history as a DataFrame, one column per logged quantity.
# The learning-rate column written by ReduceLROnPlateau is removed so that only
# losses/metrics are plotted. Its name differs between Keras versions
# ('lr' vs 'learning_rate'), so both are listed and a missing one is ignored
# instead of raising KeyError.
history = (
    pd.DataFrame(model.history.history)
    .drop(columns=['lr', 'learning_rate'], errors='ignore')
)
fig = history.plot(title='Потери', template='simple_white',
                   labels=dict(index='Эпоха', value='Значение', variable='option'))
fig.update_xaxes(showgrid=True)
fig.update_yaxes(showgrid=True)
fig.show()

In [22]:
def predict(text):
  """Classify a single raw review with the trained model.

  The text goes through the same cleaning and stemming as the training data,
  is converted to a padded index sequence and fed to the model.

  Args:
    text: Raw review text (str).

  Returns:
    1 if the model's sigmoid output is >= 0.5 (positive review), otherwise 0.
  """
  prepared = stamming(preprocess_text(text))
  # texts_to_sequences expects a list of texts; a bare string would be
  # iterated character by character, yielding one "text" per character.
  text_pad = pad_sequences(tzer.texts_to_sequences([prepared]), maxlen=300)
  pred = model.predict(text_pad)
  # One input row -> one probability; label 1 (is_good) is the positive class.
  return int(pred[0][0] >= 0.5)

In [23]:
# Spot check: a short complaint.
predict('Too salty!!!')



0

In [24]:
# Spot check: a short statement of intent to return.
predict('Will be back again!')



1

In [25]:
# Spot check: a mixed review that praises food and service but ends negatively.
predict('I am not impressed. the food is delicious, the service is good, but I won’t come here again')



0

In [26]:
# Project a handful of sample phrases to 2-D with t-SNE and colour every point
# by the label the model predicts for it.
# TODO: the plot is still rough and needs further work.

raw_texts = ['tasty food', 'terrible service', 'cold soup', 'cool soup', 'the best food in town', 'I got food poisoning in your restaurant']
texts = [stamming(preprocess_text(text)) for text in raw_texts]

x_temp = pad_sequences(tzer.texts_to_sequences(texts), maxlen=300)
y_temp = model.predict(x_temp)

y_pred_binary_new = (y_temp >= 0.5).astype(int)

# perplexity has to stay below the number of samples (6 phrases here).
tsne = TSNE(n_components=2, random_state=42, perplexity=1.0)
embedded_points = tsne.fit_transform(x_temp)

tsne_df = pd.DataFrame(embedded_points, columns=['x', 'y'])
tsne_df['text'] = raw_texts
# Cast to str so plotly treats the label as a discrete category, not a colour scale.
tsne_df['is_good'] = y_pred_binary_new.ravel().astype(str)

# Pass x/y explicitly: without them scatter() plots the columns against the row
# index instead of drawing the 2-D embedding.
fig = px.scatter(tsne_df, x='x', y='y', color='is_good', text='text')
fig.update_traces(textposition='top center')
fig.update_xaxes(showgrid=True)
fig.update_yaxes(showgrid=True)
fig.show()

