In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.4.1-py3-none-any.whl (487 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading xx

In [None]:
from datasets import load_dataset

# Fetch the Amazon Reviews 2023 data, selecting the "raw_review_All_Beauty"
# configuration by name and requesting its "full" split.
dataset = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    name="raw_review_All_Beauty",
    split="full",
)

# Peek at the first record to see which fields each review carries.
first_review = dataset[0]
print(first_review)

{'rating': 5.0, 'title': 'Such a lovely scent but not overpowering.', 'text': "This spray is really nice. It smells really good, goes on really fine, and does the trick. I will say it feels like you need a lot of it though to get the texture I want. I have a lot of hair, medium thickness. I am comparing to other brands with yucky chemicals so I'm gonna stick with this. Try it!", 'images': [], 'asin': 'B00YQ6X8EO', 'parent_asin': 'B00YQ6X8EO', 'user_id': 'AGKHLEW2SOWHNMFQIJGBECAF7INQ', 'timestamp': 1588687728923, 'helpful_vote': 0, 'verified_purchase': True}


In [None]:
import pandas as pd
# Keep only the 'text' and 'rating' columns.
# The columns are selected on the dataset *before* converting to pandas, so the
# fields that are discarded anyway (images, ids, timestamps, ...) are never
# converted. `to_pandas()` also returns a standalone frame rather than a slice
# of a wider DataFrame, so later cells can add columns to `df` directly.
df = dataset.select_columns(['text', 'rating']).to_pandas()

# Display first few rows
df.head()

Unnamed: 0,text,rating
0,This spray is really nice. It smells really go...,5.0
1,"This product does what I need it to do, I just...",4.0
2,"Smells good, feels great!",5.0
3,Felt synthetic,1.0
4,Love it,5.0


In [None]:
# Define function to convert ratings to sentiment labels
def label_sentiment(rating):
    """Map a numeric star rating to a sentiment label.

    Args:
        rating: Numeric rating (int or float).

    Returns:
        1 (positive) when ``rating >= 4``,
        0 (neutral) when ``3 <= rating < 4``,
        -1 (negative) otherwise.

    Note:
        The neutral band is a range rather than an exact ``== 3`` match so that
        a fractional rating such as 3.5 is not classified as negative.
        A NaN rating fails both comparisons and therefore falls through to -1.
    """
    if rating >= 4:
        return 1  # Positive
    if rating >= 3:
        return 0  # Neutral
    return -1  # Negative

# Create the sentiment labels from 'rating', then drop 'rating' (not needed afterwards).
# The guard makes this cell safe to re-run: once 'rating' has been dropped, a second
# execution would otherwise raise KeyError when reading df['rating'].
if 'rating' in df.columns:
    df['sentiment'] = df['rating'].apply(label_sentiment)
    df = df.drop(columns=['rating'])

# Show class distribution
df['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
1,500107
-1,145114
0,56307


In [None]:
import re

# Function to clean review text
def clean_text(text):
    """Normalise a review string for tokenisation.

    Steps, in order: strip digit runs, strip every character that is neither a
    word character nor whitespace, collapse whitespace runs to a single space
    (trimming both ends), and lowercase.

    Args:
        text: Raw review text. ``None`` and float NaN are treated as a missing
            review; any other non-string value is converted with ``str()``.

    Returns:
        The cleaned string; ``""`` for a missing review.
    """
    # A missing value would make re.sub raise TypeError and abort a whole
    # DataFrame .apply(); treat it as an empty review instead.
    # (NaN is the only float that is not equal to itself.)
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation (underscore counts as \w and is kept)
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text.lower()  # Lowercase

# Run every review through clean_text and store the result in a new column
df['cleaned_text'] = df['text'].map(clean_text)

# Show the raw and cleaned text side by side for the first rows
df.loc[:, ['text', 'cleaned_text']].head()

Unnamed: 0,text,cleaned_text
0,This spray is really nice. It smells really go...,this spray is really nice it smells really goo...
1,"This product does what I need it to do, I just...",this product does what i need it to do i just ...
2,"Smells good, feels great!",smells good feels great
3,Felt synthetic,felt synthetic
4,Love it,love it


In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenisation settings
VOCAB_SIZE = 25000  # passed to the tokenizer as num_words
MAX_LENGTH = 100  # fixed sequence length (in tokens) for every review

# Build the word index from the cleaned reviews; "<OOV>" stands in for
# words outside the vocabulary.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(df['cleaned_text'])

# Encode each review as a list of integer ids, then bring every list to
# MAX_LENGTH with zeros appended at the end ("post" padding).
sequences = tokenizer.texts_to_sequences(df['cleaned_text'])
padded_sequences = pad_sequences(sequences, padding="post", maxlen=MAX_LENGTH)

# Shift the sentiment labels from {-1, 0, 1} to class ids {0, 1, 2}
label_to_class = {-1: 0, 0: 1, 1: 2}
sentiment_labels = df['sentiment'].map(label_to_class).values

# Show the encoded form of the first review
print(padded_sequences[0])

[   8  298   10   44   64    5  144   44   35  293   16   44  205    4
   82    2 1440    3   42  220    5  195   30   24  139    6  146   12
    5  226    7   53    2  414    3  197    3   21    6  146   12   17
  781 1128    3   97 4185    7   92  579   19 5984 1087   22   72 1525
  369   19    8  155    5    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0]


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout

# Define model parameters
EMBEDDING_DIM = 128  # Size of word embeddings
RNN_UNITS = 64  # Number of RNN units

# Build the RNN model
model = Sequential([
    # Explicit input spec: replaces the deprecated `input_length` argument of
    # Embedding and lets model.summary() report concrete output shapes.
    tf.keras.Input(shape=(MAX_LENGTH,)),
    # mask_zero=True marks id 0 (the padding value) as "no token". The reviews are
    # padded at the END, so without a mask the RNN's final state would be computed
    # after stepping through all trailing zeros; with the mask those steps are
    # skipped and the state after the last real token is used.
    Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM, mask_zero=True),
    SimpleRNN(RNN_UNITS, return_sequences=False),  # Simple RNN layer, emits only the final state
    Dropout(0.3),  # Dropout for regularization
    Dense(32, activation='relu'),
    Dense(3, activation='softmax')  # Output layer (3 classes: Negative, Neutral, Positive)
])

# Compile the model.
# The sparse loss matches the integer class ids (0/1/2) used as labels.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Display model summary
model.summary()



In [None]:
# Split data into training & validation sets
from sklearn.model_selection import train_test_split

# stratify keeps the class proportions identical in both splits; the classes are
# heavily imbalanced, so an unstratified split can shift the share of the
# smaller classes between train and validation.
X_train, X_val, y_train, y_val = train_test_split(
    padded_sequences,
    sentiment_labels,
    test_size=0.2,
    random_state=42,
    stratify=sentiment_labels,
)

# Train the model, reporting loss/accuracy on the validation split after each epoch
history = model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_val, y_val))


Epoch 1/5
[1m17539/17539[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1029s[0m 59ms/step - accuracy: 0.7161 - loss: 0.7418 - val_accuracy: 0.7128 - val_loss: 0.6878
Epoch 2/5
[1m17539/17539[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1072s[0m 61ms/step - accuracy: 0.7217 - loss: 0.6948 - val_accuracy: 0.7142 - val_loss: 0.7654
Epoch 3/5
[1m17539/17539[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1073s[0m 61ms/step - accuracy: 0.7164 - loss: 0.7561 - val_accuracy: 0.7146 - val_loss: 0.7635
Epoch 4/5
[1m17539/17539[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1042s[0m 59ms/step - accuracy: 0.7328 - loss: 0.7095 - val_accuracy: 0.7338 - val_loss: 0.6698
Epoch 5/5
[1m17539/17539[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1092s[0m 59ms/step - accuracy: 0.7510 - loss: 0.6852 - val_accuracy: 0.7243 - val_loss: 0.6560


In [None]:
# Score the trained model on the validation split. The model was compiled with one
# loss and one metric ('accuracy'), so evaluate() yields exactly those two values.
loss, accuracy = model.evaluate(X_val, y_val)
print("Validation Accuracy: {:.4f}".format(accuracy))

[1m4385/4385[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 10ms/step - accuracy: 0.7256 - loss: 0.6533
Validation Accuracy: 0.7243


In [None]:
def predict_sentiment(text):
    """Predict the sentiment class name for a single raw review string.

    The text goes through the same steps as the training data: clean_text,
    the fitted tokenizer, and post-padding to MAX_LENGTH.

    Args:
        text: Raw review text.

    Returns:
        "Negative", "Neutral" or "Positive", chosen as the class with the
        highest predicted score.
    """
    class_names = ["Negative", "Neutral", "Positive"]
    # texts_to_sequences expects a list of texts, hence the one-element list
    encoded = tokenizer.texts_to_sequences([clean_text(text)])
    model_input = pad_sequences(encoded, maxlen=MAX_LENGTH, padding="post")
    scores = model.predict(model_input)
    # Only one sample is scored, so the flat argmax is the class index
    return class_names[scores.argmax()]

# Smoke test on a short sample review
test_review = "wonderful! I love it."
predicted_label = predict_sentiment(test_review)
print("Sentiment:", predicted_label)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 317ms/step
Sentiment: Positive
