In [3]:
import tensorflow as tf
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM
from keras.layers import Bidirectional, GlobalMaxPool1D, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import LabelEncoder

In [4]:
df = pd.read_csv(r"Dataset\Product Reviews.csv")

In [5]:
df.head()

Unnamed: 0,asin,name,date,rating,review
0,B07W7CTLD1,Mamaearth-Onion-Growth-Control-Redensyl,2019-09-06,1,I bought this hair oil after viewing so many g...
1,B07W7CTLD1,Mamaearth-Onion-Growth-Control-Redensyl,2019-08-14,5,Used This Mama Earth Newly Launched Onion Oil ...
2,B07W7CTLD1,Mamaearth-Onion-Growth-Control-Redensyl,2019-10-19,1,So bad product...My hair falling increase too ...
3,B07W7CTLD1,Mamaearth-Onion-Growth-Control-Redensyl,2019-09-16,1,Product just smells similar to navarathna hair...
4,B07W7CTLD1,Mamaearth-Onion-Growth-Control-Redensyl,2019-08-18,5,I have been trying different onion oil for my ...


In [6]:
# only keep column of rating and review in df
df[['review']] 

Unnamed: 0,review
0,I bought this hair oil after viewing so many g...
1,Used This Mama Earth Newly Launched Onion Oil ...
2,So bad product...My hair falling increase too ...
3,Product just smells similar to navarathna hair...
4,I have been trying different onion oil for my ...
...,...
2777,Long lasting freshness throughout the day.
2778,My preferred soap
2779,ठीक नहीं लगा
2780,Super Product


In [7]:
df.shape

(2782, 5)

In [8]:
# Making label encoder for all ratings
label_encoder = LabelEncoder()

In [9]:
reviewList = df['review'].tolist()
for i in reviewList:
    print(i)

I bought this hair oil after viewing so many good comments. But this product is not good enough.First of all it's Expensive...Second thing the amount of the product is low (half bottle) YES!The bottle is not completely filled with oil. If you cheating on your customers #Mamaearth trust me on this you can't fool people more than once. Now I know that your Brand is not good enough. I am not going to buy any product from your Brand again.Thumbs down for mamaearth onion oil !!
Used This Mama Earth Newly Launched Onion Oil twice, and i must say im already impressed by the results !It prevents hair loss, helps control premature greying, dryness, dandruff, scalp eruptions and many other hair problems after its regular use !To avoid dry and frizzy hair, make sure that you use this oil to your hairs twice a week.Oiling provides your scalp with essential nutrients, and also strengthens the hair roots. Mamaearth onion oil works best for your hair during all the seasons.
So bad product...My hair f

In [10]:
# Converting the null reviews to strings
tempList = []

for review in reviewList:

    tempList.append(str(review))
    if type(review) != str:
        print(review)

reviewList = tempList
print(reviewList)

nan
nan
nan
nan
nan
nan


Text Preprocessing

In [11]:
print(df['rating'].map(type).value_counts())

rating
<class 'int'>    2782
Name: count, dtype: int64


In [12]:
# Makes tokenizer only remember the top 5000 words
# and replace all other words with "<UnknownWord>"
tokenizer = Tokenizer(num_words=5000, oov_token="<UnknownWord>")

# Tokenizer finally learns on top 5000 words of different reviews
tokenizer.fit_on_texts(reviewList)

In [13]:
# Convert texts to sequences
sequences = tokenizer.texts_to_sequences(reviewList)
sequences

[[6,
  89,
  8,
  40,
  125,
  34,
  3045,
  29,
  211,
  14,
  915,
  16,
  8,
  12,
  5,
  15,
  14,
  239,
  115,
  11,
  39,
  32,
  535,
  481,
  313,
  2,
  482,
  11,
  2,
  12,
  5,
  373,
  458,
  122,
  664,
  2,
  122,
  5,
  15,
  571,
  665,
  18,
  125,
  35,
  19,
  1008,
  22,
  53,
  614,
  290,
  483,
  73,
  22,
  8,
  19,
  459,
  1985,
  173,
  66,
  57,
  346,
  71,
  6,
  206,
  28,
  53,
  110,
  5,
  15,
  14,
  239,
  6,
  93,
  15,
  273,
  7,
  51,
  80,
  12,
  31,
  53,
  110,
  225,
  1009,
  388,
  10,
  290,
  709,
  125],
 [70,
  8,
  572,
  573,
  1147,
  1303,
  709,
  125,
  574,
  4,
  6,
  218,
  333,
  1010,
  334,
  666,
  74,
  2,
  335,
  3,
  1617,
  40,
  916,
  336,
  1304,
  3046,
  3047,
  3048,
  758,
  667,
  3049,
  4,
  211,
  65,
  40,
  818,
  34,
  45,
  291,
  37,
  7,
  819,
  131,
  4,
  1305,
  40,
  219,
  207,
  28,
  19,
  37,
  8,
  125,
  7,
  53,
  575,
  574,
  9,
  374,
  3050,
  615,
  53,
  667,
  18,
  710,
  1986,
 

In [14]:
# show word mapping to each index
print(tokenizer.word_index)

# show word counts of each word in the reviews
print(tokenizer.word_counts)



In [15]:
# Pad sequences to make them of equal length
max_len = 100
X = pad_sequences(sequences, maxlen=max_len, padding='post')

In [16]:
X

array([[  6,  89,   8, ...,   0,   0,   0],
       [ 70,   8, 572, ...,   0,   0,   0],
       [ 29, 147,  12, ...,   0,   0,   0],
       ...,
       [  1,   1,   1, ...,   0,   0,   0],
       [440,  12,   0, ...,   0,   0,   0],
       [ 54, 830, 994, ...,   0,   0,   0]])

In [17]:
# Labels
y = np.array(df['rating'])
print(y)
print(np.unique(y))  # or y_test

[1 5 1 ... 2 4 5]
[1 2 3 4 5]


In [18]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
# subtract 1 so all values start from 0 instead of 1
y_train = y_train - 1
y_test = y_test - 1

In [20]:
model = Sequential([
    Embedding(input_dim=5000, output_dim=128, input_length=max_len),
    Bidirectional(LSTM(64, return_sequences=True)),
    GlobalMaxPool1D(),
    Dense(64, activation='relu'),
    Dense(5, activation='softmax')
])



In [21]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [22]:
print(np.unique(y_train))  # or y_test

[0 1 2 3 4]


In [23]:
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/10
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 235ms/step - accuracy: 0.4964 - loss: 1.3817 - val_accuracy: 0.5081 - val_loss: 1.2689
Epoch 2/10
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 205ms/step - accuracy: 0.5473 - loss: 1.1777 - val_accuracy: 0.6697 - val_loss: 0.8933
Epoch 3/10
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 212ms/step - accuracy: 0.7270 - loss: 0.6892 - val_accuracy: 0.7379 - val_loss: 0.6787
Epoch 4/10
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 192ms/step - accuracy: 0.8402 - loss: 0.4265 - val_accuracy: 0.7989 - val_loss: 0.6417
Epoch 5/10
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 142ms/step - accuracy: 0.8889 - loss: 0.3023 - val_accuracy: 0.8474 - val_loss: 0.4716
Epoch 6/10
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 140ms/step - accuracy: 0.9169 - loss: 0.2159 - val_accuracy: 0.8582 - val_loss: 0.4621
Epoch 7/10
[1m70/70[

<keras.src.callbacks.history.History at 0x1dd0b4684c0>

In [24]:
# Evaluate model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.2f}")

[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 41ms/step - accuracy: 0.9198 - loss: 0.3750
Test Accuracy: 0.92


In [25]:
test_text = ["superb product"]
test_seq = tokenizer.texts_to_sequences(test_text)
test_pad = pad_sequences(test_seq, maxlen=max_len, padding='post')
pred = model.predict(test_pad)

for ratingProbability in pred:
    predictedIndex = np.argmax(ratingProbability)
    predictedRating = predictedIndex + 1
    print(f"Predicted Rating for review is {predictedRating}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
Predicted Rating for review is 5


In [26]:
# Save the model
model.save("reviewToRatingModel.h5")



In [27]:
# Save Tokenizer
import pickle

with open("Reviewtokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)