In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.w

In [None]:
from datasets import load_dataset
import pandas as pd
from google.colab import drive
import random as random
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Dense, Flatten, Concatenate, Dropout, Lambda
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from tensorflow.keras.regularizers import l2

In [None]:
import os

# Mount Google Drive
drive.mount('/content/drive')

# Number of reviews to pull from the stream and where the snapshot is stored.
# Keep the two in sync: the cell that follows reads this exact path.
N_REVIEWS = 130000
output_path = "/content/drive/My Drive/home_reviews_130000.csv"

if os.path.exists(output_path):
    # The snapshot is already on Drive, so skip the slow streaming download.
    # Delete the file to force a fresh pull.
    print(f"CSV file already exists at: {output_path}")
else:
    # Load the dataset with streaming enabled
    dataset = load_dataset(
        "McAuley-Lab/Amazon-Reviews-2023",
        "raw_review_Home_and_Kitchen",
        streaming=True,  # Enable streaming to handle large data
        trust_remote_code=True
    )

    # zip() stops as soon as range() is exhausted, so at most N_REVIEWS rows
    # are consumed from the stream (fewer if the stream ends first).
    subset = [row for _, row in zip(range(N_REVIEWS), dataset["full"])]

    # Convert the list of dictionaries to a pandas DataFrame
    df = pd.DataFrame(subset)

    # Save the DataFrame to CSV
    df.to_csv(output_path, index=False)

    print(f"CSV file saved at: {output_path}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
CSV file saved at: /content/drive/My Drive/home_reviews_130000.csv


In [None]:
# Reload the saved snapshot from Drive; the rest of the notebook works from this frame.
df = pd.read_csv("/content/drive/My Drive/home_reviews_130000.csv")
print(df.head())
print(df.columns)
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df.info()


   rating                                            title  \
0     1.0   Received Used & scratched item! Purchased new!   
1     5.0         Excellent for moving & storage & floods!   
2     2.0  Lid very loose- needs a gasket imo. Small base.   
3     5.0                              Best purchase ever!   
4     5.0                              Excellent for yarn!   

                                                text  \
0  Livid.  Once again received an obviously used ...   
1  I purchased these for multiple reasons. The ma...   
2  [[VIDEOID:c87e962bc893a948856b0f1b285ce6cc]] I...   
3  If you live at a higher elevation like me (5k ...   
4  I use these to store yarn. They easily hold 12...   

                                              images        asin parent_asin  \
0                                                 []  B007WQ9YNO  B09XWYG6X1   
1                                                 []  B09H2VJW6K  B0BXDLF8TW   
2  [{'small_image_url': 'https://m.media-amazon.c.

In [None]:
# Feature groups referenced by the encoding cell that follows.
categorical_cols = ['user_id', 'asin', 'parent_asin', 'verified_purchase']
numerical_cols = ['rating', 'timestamp']

# Label: 1 when the review has at least one helpful vote, otherwise 0.
df['helpful'] = df['helpful_vote'].gt(0).astype(int)

# Store the purchase flag as an integer instead of a boolean.
df['verified_purchase'] = df['verified_purchase'].astype(int)

# Keep only reviews whose author has at least three reviews in this sample.
# The per-row group size gives the same selection as a per-group filter;
# .copy() makes the result an independent frame for the later column writes.
reviews_per_user = df.groupby('user_id')['user_id'].transform('size')
df = df[reviews_per_user >= 3].copy()

In [None]:
# Replace every categorical column with integer codes, one encoder per column.
# NOTE(review): the encoders and the scaler are fitted on the whole frame,
# before the train/validation/test split made later in this notebook, so the
# scaling statistics include held-out rows — consider fitting on training rows only.
for column_name in categorical_cols:
    le = LabelEncoder()
    le.fit(df[column_name])
    df[column_name] = le.transform(df[column_name])

# Standardise the numeric columns in place.
scaler = StandardScaler()
scaler.fit(df[numerical_cols])
df[numerical_cols] = scaler.transform(df[numerical_cols])

In [None]:
# Step 0: Compute sentiment scores using VADER
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()

# Normalise the review text once so both consumers see plain strings.
# Without this, a missing review reaches the sentence encoder as a NaN float
# (and str(NaN) would score/encode the literal word "nan").
review_texts = df['text'].fillna('').astype(str)
df['sentiment'] = review_texts.apply(lambda text: sid.polarity_scores(text)['compound'])

# Step 1: Generate text embeddings for review texts
print("Generating text embeddings...")
st_model = SentenceTransformer('all-mpnet-base-v2')
text_embeddings = st_model.encode(review_texts.tolist(), show_progress_bar=True)
print("Text embeddings shape:", text_embeddings.shape)  # (num_samples, embedding_dim)

# The embeddings are matched to df rows purely by position later on, so fail
# loudly if the two ever get out of step (e.g. stale kernel state).
assert text_embeddings.shape[0] == len(df), "text_embeddings and df have different row counts"

# Convert to float32 for TensorFlow compatibility
text_embeddings = text_embeddings.astype(np.float32)
# Width of every learned categorical embedding vector.
embedding_size = 8

# Number of distinct values per categorical feature; used to size the embedding tables.
num_users, num_asins, num_parent_asins = (
    df[column].nunique() for column in ('user_id', 'asin', 'parent_asin')
)
num_verified = 2  # binary

# One scalar input per categorical feature ...
user_input, asin_input, parent_asin_input, verified_input = (
    Input(shape=(1,), name=feature)
    for feature in ('user_id', 'asin', 'parent_asin', 'verified_purchase')
)

# ... and one per numeric feature, including the VADER sentiment score.
rating_input, timestamp_input, sentiment_input = (
    Input(shape=(1,), name=feature)
    for feature in ('rating', 'timestamp', 'sentiment')
)

# The precomputed sentence embedding is fed in as a dense vector.
embedding_dim = text_embeddings.shape[1]
text_input = Input(shape=(embedding_dim,), name='text_embedding')

# Embedding lookup for each categorical input (table size = cardinality + 1).
user_embed, asin_embed, parent_asin_embed, verified_embed = (
    Embedding(input_dim=cardinality + 1, output_dim=embedding_size)(id_tensor)
    for cardinality, id_tensor in zip(
        (num_users, num_asins, num_parent_asins, num_verified),
        (user_input, asin_input, parent_asin_input, verified_input),
    )
)

# Drop the length-1 sequence axis so each embedding is a flat vector per sample.
user_embed_flat, asin_embed_flat, parent_asin_embed_flat, verified_embed_flat = (
    Flatten()(embedded)
    for embedded in (user_embed, asin_embed, parent_asin_embed, verified_embed)
)

embeddings = [user_embed_flat, asin_embed_flat, parent_asin_embed_flat, verified_embed_flat]

# FM interaction function
def fm_interaction(embeds):
    """Second-order factorization-machine interaction term.

    Computes 0.5 * sum_k[(sum_f e_fk)^2 - sum_f e_fk^2], reducing first over
    axis 1 and then over the remaining feature axis.

    Args:
        embeds: tensor of stacked field embeddings; the notebook passes the
            embeddings stacked along axis 1, i.e. presumably
            (batch, num_fields, embedding_size).

    Returns:
        Tensor with one interaction score per sample (last axis kept, size 1).
    """
    field_total = tf.reduce_sum(embeds, axis=1)
    squared_total = tf.square(field_total)
    total_of_squares = tf.reduce_sum(tf.square(embeds), axis=1)
    pairwise = squared_total - total_of_squares
    return 0.5 * tf.reduce_sum(pairwise, axis=1, keepdims=True)

def _stack_fields(field_tensors):
    """Stack the per-field embedding vectors along a new axis 1."""
    return tf.stack(field_tensors, axis=1)


# FM branch: stack the flat embeddings, then reduce them to one interaction score.
stacked_embeds = Lambda(_stack_fields)(embeddings)
fm_out = Lambda(fm_interaction)(stacked_embeds)

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Generating text embeddings...


Batches:   0%|          | 0/4063 [00:00<?, ?it/s]

Text embeddings shape: (130000, 768)


In [None]:
# Deep branch input: categorical embeddings, numeric features, the sentence
# embedding and the sentiment score joined into a single vector.
deep_input = Concatenate()([
    user_embed_flat,
    asin_embed_flat,
    parent_asin_embed_flat,
    verified_embed_flat,
    rating_input,
    timestamp_input,
    text_input,
    sentiment_input,
])

# ReLU tower 128 -> 64 -> 32; dropout follows the first two layers only.
x = deep_input
for units, dropout_rate in ((128, 0.5), (64, 0.4), (32, None)):
    x = Dense(units, activation='relu')(x)
    if dropout_rate is not None:
        x = Dropout(dropout_rate)(x)

# Join the FM interaction score with the deep representation and map to a probability.
concat = Concatenate()([fm_out, x])
output = Dense(1, activation='sigmoid')(concat)

model_inputs = [
    user_input, asin_input, parent_asin_input, verified_input,
    rating_input, timestamp_input, text_input, sentiment_input,
]
model = Model(inputs=model_inputs, outputs=output)

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['AUC'])
model.summary()

In [None]:
# Step 3: Prepare target and split data
y = df['helpful'].values  # binary target

# Everything that must be split row-for-row together.
aligned_arrays = (df, y, text_embeddings, df['sentiment'].values)

# First cut: 80% train, 20% held back.
(df_train, df_temp,
 y_train, y_temp,
 text_train, text_temp,
 sentiment_train, sentiment_temp) = train_test_split(
    *aligned_arrays, test_size=0.2, random_state=42)

# Second cut: halve the held-back part into validation (10%) and test (10%).
held_back_arrays = (df_temp, y_temp, text_temp, sentiment_temp)
(df_val, df_test,
 y_val, y_test,
 text_val, text_test,
 sentiment_val, sentiment_test) = train_test_split(
    *held_back_arrays, test_size=0.5, random_state=42)

# Helper function to build input dict including sentiment
def build_input_dict(dataframe, text_embeds, sentiment_scores):
    """Assemble the name -> array mapping passed to the model.

    Args:
        dataframe: frame holding the tabular feature columns.
        text_embeds: text embedding array for the same rows, passed through unchanged.
        sentiment_scores: sentiment values for the same rows, passed through unchanged.

    Returns:
        dict keyed by model input name; tabular entries are the columns' ``.values``.
    """
    tabular_features = (
        'user_id', 'asin', 'parent_asin', 'verified_purchase', 'rating', 'timestamp',
    )
    model_inputs = {feature: dataframe[feature].values for feature in tabular_features}
    model_inputs['text_embedding'] = text_embeds
    model_inputs['sentiment'] = sentiment_scores
    return model_inputs

# Model-ready input dicts for the train, validation and test partitions.
X_train, X_val, X_test = (
    build_input_dict(frame, embeds, scores)
    for frame, embeds, scores in (
        (df_train, text_train, sentiment_train),
        (df_val, text_val, sentiment_val),
        (df_test, text_test, sentiment_test),
    )
)

In [None]:
# Seed the Python, NumPy and TensorFlow RNGs.
# NOTE(review): the model's layers are created in earlier cells, so these seeds
# presumably affect only batch shuffling and dropout, not the initial weights.
# Seed before building the model for fully repeatable runs. Re-running this
# cell alone also continues training from the current weights.
random.seed(88)
np.random.seed(88)
tf.random.set_seed(88)

# Stop once validation loss has not improved for two epochs and roll back to
# the best epoch; without this the run keeps fitting while val_loss climbs.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True,
)

# Step 4: Train the model
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=64,
    callbacks=[early_stopping],
    verbose=2
)

Epoch 1/10
1625/1625 - 11s - 7ms/step - AUC: 0.9961 - loss: 0.0627 - val_AUC: 0.6637 - val_loss: 1.2165
Epoch 2/10
1625/1625 - 6s - 4ms/step - AUC: 0.9989 - loss: 0.0295 - val_AUC: 0.6620 - val_loss: 1.4734
Epoch 3/10
1625/1625 - 10s - 6ms/step - AUC: 0.9993 - loss: 0.0242 - val_AUC: 0.6592 - val_loss: 1.5444
Epoch 4/10
1625/1625 - 11s - 6ms/step - AUC: 0.9994 - loss: 0.0212 - val_AUC: 0.6534 - val_loss: 1.8663
Epoch 5/10
1625/1625 - 5s - 3ms/step - AUC: 0.9996 - loss: 0.0189 - val_AUC: 0.6545 - val_loss: 2.0475
Epoch 6/10
1625/1625 - 10s - 6ms/step - AUC: 0.9996 - loss: 0.0165 - val_AUC: 0.6519 - val_loss: 2.3909
Epoch 7/10
1625/1625 - 6s - 4ms/step - AUC: 0.9997 - loss: 0.0162 - val_AUC: 0.6481 - val_loss: 2.3065
Epoch 8/10
1625/1625 - 9s - 6ms/step - AUC: 0.9997 - loss: 0.0152 - val_AUC: 0.6526 - val_loss: 2.3835
Epoch 9/10
1625/1625 - 6s - 4ms/step - AUC: 0.9997 - loss: 0.0138 - val_AUC: 0.6483 - val_loss: 2.8895
Epoch 10/10
1625/1625 - 10s - 6ms/step - AUC: 0.9997 - loss: 0.0131 -

In [None]:
# evaluate() returns [loss, AUC] in the order given to model.compile().
test_metrics = model.evaluate(X_test, y_test, verbose=2)
test_loss, test_auc = test_metrics[0], test_metrics[1]

# Threshold the predicted probabilities at 0.5 to get class labels.
y_pred = model.predict(X_test)
y_pred_binary = (y_pred > 0.5).astype(int)

# predict() returns shape (N, 1) while y_test is (N,). Comparing them directly
# broadcasts to an (N, N) matrix and averages over every prediction/label
# pair, which is not accuracy. Flatten both sides so rows are compared 1:1.
accuracy = np.mean(y_pred_binary.ravel() == np.asarray(y_test).ravel())

print(f"Test Loss: {test_loss:.4f}, Test AUC: {test_auc:.4f}, Test Accuracy: {accuracy:.4f}")

407/407 - 1s - 3ms/step - AUC: 0.6649 - loss: 3.1560
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
Test Loss: 3.1560, Test AUC: 0.6649, Test Accuracy: 0.5813
