In [None]:
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding, Dropout
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from keras.preprocessing.text import Tokenizer
from gensim.models import Word2Vec
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.sequence import pad_sequences 


# Load the IMDB dataset and split it into training and test sets
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=10000)

# Tokenize the text and convert it to sequences
tokenizer = Tokenizer(num_words=10000)
x_train_str = [str(text) for text in x_train]
tokenizer.fit_on_texts(x_train_str)
x_train = tokenizer.texts_to_sequences(x_train_str)
x_test_str = [str(text) for text in x_test]
x_test = tokenizer.texts_to_sequences(x_test_str)

# Pad the sequences to a fixed length
maxlen = 100
x_train = pad_sequences(x_train, maxlen=maxlen)
x_test = pad_sequences(x_test, maxlen=maxlen)

# Load pre-trained Word2Vec model
w2v_model = Word2Vec.load('w2v_model.bin')

# Create embedding matrix
word_index = tokenizer.word_index
embedding_matrix = np.zeros((len(word_index) + 1, 100))
for word, i in word_index.items():
    if word in w2v_model.wv.key_to_index:
        embedding_matrix[i] = w2v_model.wv[word]


# Define the model architecture
model = Sequential()
model.add(Embedding(len(word_index) + 1, 100, weights=[embedding_matrix], input_length=maxlen, trainable=False))
model.add(Dropout(0.2))
model.add(LSTM(32, dropout=0.2, recurrent_dropout=0.2, return_sequences=True))
model.add(LSTM(32, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
# Compile the model
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])

# Define early stopping and learning rate reduction callbacks
early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=1, mode='min')
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=2, verbose=1, mode='min')

# Train the model
history = model.fit(
    x_train, y_train,
    batch_size=128,
    epochs=15,
    validation_data=(x_test, y_test),
    callbacks=[early_stopping, reduce_lr]
)

# Evaluate the model on the test set
score, acc = model.evaluate(x_test, y_test, batch_size=128)
print('Test score:', score)
print('Test accuracy:', acc)



Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
 26/196 [==>...........................] - ETA: 47s - loss: 0.6183 - acc: 0.6689

In [26]:
# Get predictions on the test set
# Get predicted probabilities on the test set
y_pred_prob = model.predict(x_test)

# Convert probabilities to classes
y_pred = np.argmax(y_pred_prob, axis=1)

# Convert the integer labels to sentiment strings
sentiments = ['negative', 'positive']
y_test_str = np.array([sentiments[label] for label in y_test])
y_pred_str = np.array([sentiments[label] for label in y_pred])

# Store the results in a CSV file
results = pd.DataFrame({'Review': x_test_str, 'Actual Sentiment': y_test_str, 'Predicted Sentiment': y_pred_str})
results.to_csv('imdb_sentiments.csv', index=False)

print('Saved results to CSV file.')


Saved results to CSV file.
