# Sentiment Analysis of Product Reviews using LSTM
This project aims to perform sentiment analysis on product reviews using an LSTM model. The dataset contains the following columns:
- **Product Name**
- **Product Price**
- **Rating**
- **Review**
- **Summary**

Here I will preprocess the text data, convert ratings to binary sentiment labels (positive/negative), and train an LSTM model to classify the sentiment of reviews.



In [None]:
!pip install tensorflow pandas nltk scikit-learn




In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [1]:
!pip install tensorflow pandas nltk scikit-learn




In [3]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [6]:
# Load the dataset
df = pd.read_csv('Reviews.csv')

# Convert the Rate column to numeric, handling errors
df['Rate'] = pd.to_numeric(df['Rate'], errors='coerce')

# Fill any NaN values in the Rate column with 0 (or another appropriate value)
df['Rate'] = df['Rate'].fillna(0)

# Prepare the labels (convert Rate to binary sentiment: 1 for positive, 0 for negative)
df['Sentiment'] = df['Rate'].apply(lambda x: 1 if x > 3 else 0)

# Combine Review and Summary, handling missing values
df['CombinedReview'] = df['Review'].fillna('') + " " + df['Summary'].fillna('')

# Apply preprocessing to the combined text
df['CleanedText'] = df['CombinedReview'].apply(preprocess_text)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['CleanedText'], df['Sentiment'], test_size=0.2, random_state=42)

# Tokenize the text
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad the sequences to ensure equal length
maxlen = 100  # You can adjust this based on the average review length
X_train_pad = pad_sequences(X_train_seq, maxlen=maxlen)
X_test_pad = pad_sequences(X_test_seq, maxlen=maxlen)


In [7]:
# Define the LSTM model
model = Sequential([
    Embedding(input_dim=5000, output_dim=128, input_length=maxlen),
    LSTM(128, return_sequences=True),
    Dropout(0.2),
    LSTM(64),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Summary of the model
model.summary()




In [9]:
# Train the model
history = model.fit(X_train_pad, y_train, epochs=3, batch_size=64, validation_data=(X_test_pad, y_test))


Epoch 1/3
[1m2564/2564[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m974s[0m 380ms/step - accuracy: 0.9664 - loss: 0.0933 - val_accuracy: 0.9628 - val_loss: 0.1028
Epoch 2/3
[1m2564/2564[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m981s[0m 380ms/step - accuracy: 0.9687 - loss: 0.0857 - val_accuracy: 0.9609 - val_loss: 0.1041
Epoch 3/3
[1m2564/2564[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m980s[0m 382ms/step - accuracy: 0.9706 - loss: 0.0799 - val_accuracy: 0.9628 - val_loss: 0.1060


In [10]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test_pad, y_test)
print(f'Test Accuracy: {accuracy:.4f}')

# Predicting and classification report
from sklearn.metrics import classification_report

y_pred = (model.predict(X_test_pad) > 0.5).astype("int32")
print(classification_report(y_test, y_pred))


[1m1282/1282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m99s[0m 77ms/step - accuracy: 0.9621 - loss: 0.1072
Test Accuracy: 0.9628
[1m1282/1282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 75ms/step
              precision    recall  f1-score   support

           0       0.95      0.87      0.91      8804
           1       0.97      0.99      0.98     32207

    accuracy                           0.96     41011
   macro avg       0.96      0.93      0.94     41011
weighted avg       0.96      0.96      0.96     41011



In [11]:
# Save the model
model.save('sentiment_analysis_lstm.h5')

# Load the model
from tensorflow.keras.models import load_model
model = load_model('sentiment_analysis_lstm.h5')




In [12]:
# Sample input
sample_review = "This product is amazing!"
cleaned_review = preprocess_text(sample_review)
review_seq = tokenizer.texts_to_sequences([cleaned_review])
review_pad = pad_sequences(review_seq, maxlen=maxlen)
prediction = model.predict(review_pad)
print("Predicted Sentiment:", 'positive' if prediction[0][0] > 0.5 else 'negative')


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 328ms/step
Predicted Sentiment: positive
