В этом проекте обучается модель, определяющая тональность отзыва (положительный или отрицательный), на наборе данных Yelp Review Polarity.

In [1]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GRU, LSTM
from tensorflow.keras import utils
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import ModelCheckpoint
import pandas as pd
import numpy as np

In [2]:
# Vocabulary size passed to the tokenizer and to the embedding layer.
num_words = 10_000
# Every review is padded/truncated to this many tokens.
max_review_len = 200

In [3]:
# загрузка набора данных
!wget https://www.dropbox.com/s/ufbhk3kadtnn6h0/yelp_review_polarity_csv.tgz?dl=1 -O yelp_review_polarity_csv.tgz
# распаковка архива
!tar -xvf yelp_review_polarity_csv.tgz

--2023-10-26 19:06:34--  https://www.dropbox.com/s/ufbhk3kadtnn6h0/yelp_review_polarity_csv.tgz?dl=1
Resolving www.dropbox.com (www.dropbox.com)... 162.125.3.18, 2620:100:6018:18::a27d:312
Connecting to www.dropbox.com (www.dropbox.com)|162.125.3.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: /s/dl/ufbhk3kadtnn6h0/yelp_review_polarity_csv.tgz [following]
--2023-10-26 19:06:34--  https://www.dropbox.com/s/dl/ufbhk3kadtnn6h0/yelp_review_polarity_csv.tgz
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uce105238b319b4594b95a414e21.dl.dropboxusercontent.com/cd/0/get/CGWn1Epcn8InUyngIgZmwH9UgGUC-RoqQ_h8n6-6bfSPzzkNg3411K0W8ybJwf1pjrl27fI_m4-xGOhCcbzEggt1_8Y2BIdO3pRXdEThpvbPzdV_4aGRAayHuI31_FaqX_0skPe7Kkc5V9InFt74wlzn/file?dl=1# [following]
--2023-10-26 19:06:34--  https://uce105238b319b4594b95a414e21.dl.dropboxusercontent.com/cd/0/get/CGWn1Epcn8InUyngIgZmwH9UgGUC-RoqQ_h8n6-6bfSPzzkNg341

In [4]:
# The CSV has no header row: first column is the class label, second is the review text.
train = pd.read_csv('yelp_review_polarity_csv/train.csv', names=['Class', 'Review'], header=None)

In [5]:
# Shift the labels down by one (presumably 1/2 -> 0/1, as needed for the sigmoid output — verify against the data).
y_train = train['Class'].sub(1)
# Raw review texts used to fit the tokenizer and build the training sequences.
reviews = train['Review']

In [6]:
# Build the word index from the training reviews only; the same fitted
# tokenizer is reused later for the test set. `num_words` caps the
# vocabulary used when texts are converted to sequences.
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(reviews)

In [7]:
# Convert each training review into a list of integer word indices.
sequences = tokenizer.texts_to_sequences(reviews)

In [8]:
# Bring every sequence to exactly max_review_len tokens (shorter ones are
# padded, longer ones truncated) so they can be batched for the network.
x_train = pad_sequences(sequences, maxlen=max_review_len)

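Модель строится на базе LSTM-слоя с 16 нейронами: на входе — слой Embedding (векторы размерности 32), на выходе — один нейрон с сигмоидальной функцией активации.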

In [9]:
# Binary sentiment classifier:
# token ids -> 32-dimensional embeddings -> LSTM with 16 units -> sigmoid output.
model = Sequential([
    Embedding(num_words, 32, input_length=max_review_len),
    LSTM(16),
    Dense(1, activation='sigmoid'),
])

In [10]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked
# so the checkpoint callback can monitor `val_accuracy`.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [11]:
# File that receives the best model seen during training.
model_save_path = 'best_model.h5'

# Save only when validation accuracy improves, and log each save.
checkpoint_callback = ModelCheckpoint(
    model_save_path,
    verbose=1,
    save_best_only=True,
    monitor='val_accuracy',
)

In [12]:
# Train for 5 epochs, holding out 10% of the training data for validation.
# The checkpoint callback writes the best model (by val_accuracy) to disk.
# The returned History object is kept in `history` so the per-epoch metrics
# can be inspected or plotted later, instead of only being echoed as a repr.
history = model.fit(
    x_train,
    y_train,
    epochs=5,
    batch_size=128,
    validation_split=0.1,
    callbacks=[checkpoint_callback]
)

Epoch 1/5
Epoch 1: val_accuracy improved from -inf to 0.95146, saving model to best_model.h5
Epoch 2/5
   2/3938 [..............................] - ETA: 4:33 - loss: 0.1429 - accuracy: 0.9375

  saving_api.save_model(


Epoch 2: val_accuracy improved from 0.95146 to 0.95830, saving model to best_model.h5
Epoch 3/5
Epoch 3: val_accuracy did not improve from 0.95830
Epoch 4/5
Epoch 4: val_accuracy improved from 0.95830 to 0.96086, saving model to best_model.h5
Epoch 5/5
Epoch 5: val_accuracy did not improve from 0.96086


<keras.src.callbacks.History at 0x7ed287ccb400>

In [13]:
# Held-out test split; same header-less two-column layout as the training file.
test = pd.read_csv('yelp_review_polarity_csv/test.csv', names=['Class', 'Review'], header=None)

In [14]:
# Encode the test reviews with the tokenizer fitted on the training data,
# then pad/truncate them to the same length as the training sequences.
test_sequences = tokenizer.texts_to_sequences(test['Review'])
x_test = pad_sequences(test_sequences, maxlen=max_review_len)
# Same label shift as applied to the training labels.
y_test = test['Class'] - 1

In [15]:
# Restore the weights from the best checkpoint written during training,
# rather than evaluating the weights left after the final epoch.
model.load_weights(model_save_path)

In [16]:
# Evaluate on the test set; the result is [loss, accuracy], matching the
# loss and metric configured in model.compile.
model.evaluate(x_test, y_test, verbose=1)



[0.12904097139835358, 0.9513947367668152]

Точность модели на тестовой выборке составила ~95 % (0,951 при значении функции потерь 0,129).