In [61]:
# Import necessary libraries
import numpy as np
from sklearn.model_selection import train_test_split
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D
from keras.layers import LSTM, Dropout
from keras.layers import Embedding
from keras.preprocessing import sequence
from keras import callbacks
import re
import pandas as pd
from sklearn import preprocessing
import random
from keras.models import load_model
import warnings

# Set random seed for reproducibility
np.random.seed(7)

# Ignore warnings
warnings.filterwarnings('ignore')

In [9]:
with open(r'reviews.txt', 'r', encoding='utf-8') as file:
    reviews = [review.strip() for review in file]  # Strip to remove any extra spaces and newlines
df = pd.DataFrame(reviews, columns=['review'])
df.to_csv('UnLabeledReviews.csv', index=False)

In [14]:
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk

# Download the VADER lexicon
nltk.download('vader_lexicon')

# Initialize the SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

# Load the preprocessed reviews
df = pd.read_csv('UnLabeledReviews.csv')

# Function to categorize sentiment
def categorize_sentiment(review):
    sentiment_score = sia.polarity_scores(review)
    if sentiment_score['compound'] >= 0.05:
        return 'positive'
    elif sentiment_score['compound'] <= -0.05:
        return 'negative'
    else:
        return 'negative'

# Apply the function to categorize each review
df['Sentiment'] = df['review'].apply(categorize_sentiment)

# Save the labeled reviews to a new CSV file
df.to_csv('LabeledReviews.csv', index=False)

print("Reviews have been categorized and saved to 'LabeledReviews.csv'.")

Reviews have been categorized and saved to 'LabeledReviews.csv'.


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\mariy\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [19]:
df = pd.read_csv("LabeledReviews.csv", header=0)
y = df.pop('Sentiment')
X = df
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [22]:
for i,j in X_train.iterrows():
    if(i==0):
        print(j['review'])

I bought this during the offer time. The installation service was excellent. Don't expect excellent quality viewing. Worth for the money we pay and does it's job wellRead more


In [20]:
def clean_phrase(phrase):
    #Remove punctuation (with a regular expression) and convert to lower case
    words = (re.sub("[^a-zA-Z]", " ", phrase)).lower()
    return words

In [25]:
# run preprocessing function on train dataset
clean_phrases = []

for x in X_train['review']:
    new = clean_phrase(x)
    clean_phrases.append(new)
    
# run preprocessing function  on test dataset
test_clean_phrases = []

for xw in X_test['review']:
    new_test = clean_phrase(xw)
    test_clean_phrases.append(new_test)

In [30]:
from tensorflow.keras.preprocessing.text import Tokenizer

max_features = 10000
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(clean_phrases)
X = tokenizer.texts_to_sequences(clean_phrases)

tokenizer1 = Tokenizer(num_words=max_features, split=' ')
tokenizer1.fit_on_texts(test_clean_phrases)
Y = tokenizer1.texts_to_sequences(test_clean_phrases)


In [32]:
X_train = sequence.pad_sequences(X, maxlen=600)
X_test = sequence.pad_sequences(Y, maxlen=600)
print(X_train.shape)
print(X_test.shape)

(62, 600)
(16, 600)


In [33]:
le = preprocessing.LabelEncoder()
le.fit(['negative','positive'])
y_train = le.transform(y_train)
y_test = le.transform(y_test)

In [34]:
y_test

array([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1])

In [35]:
X_test

array([[  0,   0,   0, ...,  23,  24,  25],
       [  0,   0,   0, ...,  77,  78,  79],
       [  0,   0,   0, ...,  15,   5,  16],
       ...,
       [  0,   0,   0, ...,   0,   0,   1],
       [  0,   0,   0, ..., 154,  24,  25],
       [  0,   0,   0, ...,   0,   0,   1]])

# Creating and Training the model

In [40]:
cbks = [callbacks.ModelCheckpoint(filepath='./model.keras', monitor='val_loss', save_best_only=True),
            callbacks.EarlyStopping(monitor='val_loss', patience=2),
            callbacks.TensorBoard(log_dir='.\logs', histogram_freq=0, write_graph=True, write_images=False,profile_batch = 40000)]

In [58]:
# create the model
top_words = 10000
embedding_vecor_length = 32
max_review_length = 600
model = Sequential()
model.add(Embedding(top_words, embedding_vecor_length, input_shape=(max_review_length,)))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
# filepath="weights_best_cnn.hdf5"
# checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max',save_weights_only=True)
# callbacks_list = [checkpoint]
model.fit(X_train, y_train, epochs=5, batch_size=256,verbose = 1,callbacks = cbks,validation_split=0.2)
scores = model.evaluate(X_train, y_train, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

None
Epoch 1/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - accuracy: 0.7551 - loss: 0.6918 - val_accuracy: 1.0000 - val_loss: 0.6760
Epoch 2/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 490ms/step - accuracy: 0.8571 - loss: 0.6805 - val_accuracy: 1.0000 - val_loss: 0.6593
Accuracy: 88.71%


In [43]:
model = load_model('model.keras')
test_pred = (model.predict(X_train) > 0.5).astype("int32")
print(test_pred)

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 161ms/step
[[1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]]


In [44]:
from matplotlib.colors import ListedColormap
y = list(y_test)
pred = list(test_pred)

In [45]:
# print((y))
x = [0,1,1,1,1,0,0,1,1,1,1,1,1,1,0,1,1,0,0,0,1,1,1]
print(type(y))

<class 'list'>


In [46]:
for i in range(len(y)):
    if( y[i] == 2):
        y[i] = 0.5

In [47]:
y

[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1]

In [49]:
y_train

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

# Evaluating the Model

In [59]:
# Final evaluation of the model
# create the model
top_words = 10000
embedding_vecor_length = 32
model = Sequential()
model.add(Embedding(top_words, embedding_vecor_length, input_shape=(max_review_length,)))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.load_weights("model.keras")
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

None
Accuracy: 87.50%


In [60]:
table = PrettyTable()
table.field_names = ['Model', 'Accuracy']
table.add_row(['CNN using LSTM training', 88.71])
table.add_row(['CNN using LSTM test data', 87.50])
print(table)

+--------------------------+----------+
|          Model           | Accuracy |
+--------------------------+----------+
| CNN using LSTM training  |  88.71   |
| CNN using LSTM test data |   87.5   |
+--------------------------+----------+
