### Sentiment Analysis using an RNN on the Amazon Product Reviews dataset (about 25,000 customer reviews)

### Step 1: Importing necessary libraries. Note: "TensorFlow requires a Python version between 3.7 and 3.10"



In [None]:
pip install pandas

In [None]:
pip install seaborn

In [None]:
pip install nltk

In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import re 
import seaborn as sns 
import nltk 
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
import tensorflow as tf 
from tensorflow.keras.layers import SimpleRNN, LSTM, Dense, Dropout, Embedding, BatchNormalization 
from tensorflow.keras.models import Sequential 
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.utils import pad_sequences 

import warnings 
warnings.filterwarnings('ignore')


### Step 2: Loading the dataset



In [None]:
# Load the review dataset from a CSV located in the notebook's working directory.
data = pd.read_csv('AmazonReview.csv')

# Printing shape of the dataset
print(data.shape)

# printing columns and rows information
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
data.info()


### Step 3: Preprocessing and data cleaning


In [None]:
# Per-column count of missing values before cleaning
missing_before = data.isna().sum()
print("Null Values:\n", missing_before)

# Discard every row that has at least one missing value
data = data.dropna()

# Per-column count of missing values after cleaning
missing_after = data.isna().sum()
print("Null Values after dropping:\n", missing_after)


In [None]:
# How many reviews fall into each distinct value of the Sentiment column
sentiment_column = data['Sentiment']
sentiment_column.value_counts()


In [None]:
import nltk
# Fetch the tokenizer resources used by word_tokenize
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

# Show where the punkt tokenizer was found, confirming it is available
punkt_location = nltk.data.find('tokenizers/punkt')
print(punkt_location)

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

nltk.download('punkt')
# The stop-word list lives in its own NLTK corpus; without this download
# stopwords.words('english') raises LookupError on a fresh environment.
nltk.download('stopwords')

# Define stop words
# A set gives constant-time membership checks when filtering tokens.
stop_words = set(stopwords.words('english'))

# Function to clean reviews
def clean_reviews(text):
    """Lower-case and tokenize a review, then drop the tokens found in stop_words.

    Returns a list of tokens. Any input that is not a string (NaN, numbers,
    lists, None) yields an empty list.
    """
    # Guard clause: nothing to tokenize for non-string values
    if not isinstance(text, str):
        return []

    tokens = word_tokenize(text.lower())
    return [token for token in tokens if token not in stop_words]

# Handle missing values in the 'Review' column.
# The fill must happen BEFORE any cast to str: casting first turns NaN into the
# literal text 'nan', after which there is nothing left for fillna to replace.
raw_reviews = data['Review'].fillna('')

# Apply the cleaning function.
# Values that are already token lists (the cell was run before) are kept as-is,
# so re-running this cell does not stringify and re-tokenize cleaned reviews.
# Everything else is converted to str and cleaned.
data['Review'] = raw_reviews.apply(
    lambda review: review if isinstance(review, list) else clean_reviews(str(review))
)


### Step 4: Tokenization & Text Encoding 


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd

# Initialize the tokenizer
tokenizer = Tokenizer()

# Convert all the reviews to a list (each review is a list of cleaned tokens)
reviews_to_list = data['Review'].tolist()
tokenizer.fit_on_texts(reviews_to_list)

# Generate text sequences: every token is replaced by its integer index
text_sequences = tokenizer.texts_to_sequences(reviews_to_list)

# Length that every encoded review is padded or truncated to
max_words = 500

# Padding sequences to ensure they all have the same length
X = pad_sequences(text_sequences, maxlen=max_words)

# One hot encoding the Sentiment column.
# Guarded so that re-running this cell does not raise KeyError once 'Sentiment'
# has already been replaced by its dummy columns. dtype='float32' keeps the
# labels numeric; recent pandas versions otherwise produce boolean dummies.
if 'Sentiment' in data.columns:
    data = pd.get_dummies(data, columns=['Sentiment'], dtype='float32')

# Generate the output labels y (one column per sentiment class)
y = data[['Sentiment_1', 'Sentiment_2', 'Sentiment_3', 'Sentiment_4', 'Sentiment_5']]

# Print the shapes of X and y
print(X.shape, y.shape)


(25000, 500) (25000, 5)


### Step 5: Train-Test Split (80% - 20%)

In [None]:
pip install scikit-learn

In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# The padded sequences X and the one-hot labels y were already built in Step 4.
# They are reused here instead of refitting the tokenizer and re-padding, which
# only repeated the same work and risked the two copies drifting apart.
assert len(X) == len(y), "features and labels must have the same number of rows"

# Train Test split (80% train / 20% test); random_state fixes the shuffle so the
# split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Print the shapes of the training and testing sets
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)


(20000, 500) (5000, 500) (20000, 5) (5000, 5)


### Step 6: Model Building, Compiling and Training

### 1. Build the Model using RNN.



In [38]:
# Creating a RNN model
rnn = Sequential(name="Simple_RNN")

# Embedding layer: the vocabulary size is len(word_index) + 1 so that index 0,
# the value pad_sequences fills with, also has a row in the embedding table.
# NOTE(review): the embedding dimension reuses max_words (the sequence length);
# these are independent hyperparameters and could be tuned separately.
rnn.add(Embedding(len(tokenizer.word_index) + 1,
                  max_words,
                  input_length=max_words))

# First recurrent layer returns the full sequence so it can feed the next one
rnn.add(SimpleRNN(128, activation='relu', return_sequences=True))

# Second recurrent layer returns only its final state
rnn.add(SimpleRNN(64, activation='relu', return_sequences=False))

# One output unit per sentiment class
rnn.add(Dense(5, activation='softmax'))

# printing model summary
# summary() prints the table itself and returns None, so it is called directly;
# wrapping it in print() would append a stray "None" line to the output.
rnn.summary()


None


### 2. Compiling, Training and Testing the model

In [None]:
# Configure the optimizer, loss and reported metric
rnn.compile(optimizer='adam', loss="categorical_crossentropy", metrics=['accuracy'])

# Fit on the training split.
# NOTE(review): the test split doubles as validation data here, so the
# validation curves and the final score are computed on the same samples.
history = rnn.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=2,
    batch_size=64,
    verbose=1,
)

# Evaluate on the test split and report the result
test_score = rnn.evaluate(X_test, y_test, verbose=1)
print("Score :", test_score)


In [None]:
# Per-epoch values recorded by rnn.fit, keyed by metric name
metrics = history.history
epochs = history.epoch

# One panel for loss, one for accuracy
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(10, 5))

# Each curve is plotted against the epoch axis explicitly. Passing val_loss as a
# third positional argument to a single plot() call would start a new series
# with implicit x values instead.
loss_ax.plot(epochs, metrics['loss'], label='loss')
loss_ax.plot(epochs, metrics['val_loss'], label='val_loss')
loss_ax.set_title('Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

# 'accuracy' / 'val_accuracy' are recorded because the model is compiled with
# metrics=['accuracy'] and fitted with validation data.
acc_ax.plot(epochs, metrics['accuracy'], label='accuracy')
acc_ax.plot(epochs, metrics['val_accuracy'], label='val_accuracy')
acc_ax.set_title('Accuracy')
acc_ax.set_xlabel('Epoch')
acc_ax.set_ylabel('Accuracy')
acc_ax.legend()

fig.tight_layout()
plt.show()



### Testing the trained model


In [None]:
def predict_review_rating(text):
    """Predict the rating (1-5) for a single raw review string.

    Parameters
    ----------
    text : str
        The review text, uncleaned.

    Returns
    -------
    The predicted rating: the index of the most probable output class plus one,
    so that class 0 corresponds to rating 1.
    """
    # Apply the same cleaning used for the training reviews: the tokenizer was
    # fitted on the token lists produced by clean_reviews, so inference has to
    # feed it tokens produced the same way.
    tokens = clean_reviews(text)

    # Encode the tokens with the fitted tokenizer
    text_sequences_test = tokenizer.texts_to_sequences([tokens])

    # Pad the sequence to ensure it matches the input length expected by the model
    testing = pad_sequences(text_sequences_test, maxlen=max_words)

    # Predict with the trained network, which is bound to the name `rnn`
    # (no variable called `model` is defined in this notebook).
    class_probabilities = rnn.predict(testing)
    y_pred_test = np.argmax(class_probabilities, axis=1)

    # Return the predicted rating (class index 0-4 -> rating 1-5)
    return y_pred_test[0] + 1

# Testing the prediction function
rating1 = predict_review_rating('Worst product')
print("The rating according to the review is: ", rating1)

rating2 = predict_review_rating('Awesome product, I will recommend this to other users.')
print("The rating according to the review is: ", rating2)