In [2]:
import pandas as pd

# Load the Amazon Books Reviews dataset
amazon_reviews_path = 'data/book_reviews.csv'  # Adjust file path as needed
amazon_df = pd.read_csv(amazon_reviews_path)
print("Amazon Books Reviews dataset loaded:")
print(amazon_df.head())

# Load the GoodReads Books dataset (with Description and Genre)
goodreads_path = 'data/goodreads_dataset.csv'  # Adjust file path as needed
goodreads_df = pd.read_csv(goodreads_path)
print("\nGoodReads Books dataset loaded:")
print(goodreads_df.head())


Amazon Books Reviews dataset loaded:
           Id                               Title  Price         User_id  \
0  0671551345  Night World: Daughters Of Darkness    NaN   ADB0JID2XRFYR   
1  0671551345  Night World: Daughters Of Darkness    NaN             NaN   
2  0671551345  Night World: Daughters Of Darkness    NaN             NaN   
3  0671551345  Night World: Daughters Of Darkness    NaN  A1V0SFB3AXM8JK   
4  0671551345  Night World: Daughters Of Darkness    NaN             NaN   

                                       profileName review/helpfulness  \
0  Harmony-Faith Charisma Izabela Jazmyn McDonague                1/3   
1                                              NaN                1/3   
2                                              NaN                1/3   
3                        K. Davis "The Rose Bride"                0/2   
4                                              NaN                0/0   

   review/score  review/time  \
0           5.0   1076457600   
1  

Preliminary Data Exploration

Next, we check the structure and basic statistics for both datasets. This helps us identify any issues like missing values or duplicates.

In [3]:
# Display basic info about each dataset
print("Amazon Reviews DataFrame Info:")
amazon_df.info()
print("\nGoodReads DataFrame Info:")
goodreads_df.info()

# Check for missing values in each dataset
print("\nMissing values in Amazon Reviews:")
print(amazon_df.isnull().sum())
print("\nMissing values in GoodReads dataset:")
print(goodreads_df.isnull().sum())


Amazon Reviews DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17158 entries, 0 to 17157
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Id                  17158 non-null  object 
 1   Title               17158 non-null  object 
 2   Price               10897 non-null  float64
 3   User_id             14573 non-null  object 
 4   profileName         14572 non-null  object 
 5   review/helpfulness  17158 non-null  object 
 6   review/score        17158 non-null  float64
 7   review/time         17158 non-null  int64  
 8   review/summary      17155 non-null  object 
 9   review/text         17158 non-null  object 
dtypes: float64(2), int64(1), object(7)
memory usage: 1.3+ MB

GoodReads DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14712 entries, 0 to 14711
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           ------

Data Cleaning

We clean the data by removing duplicates and handling missing values. For now, we drop rows with missing data; you can later choose to impute values if necessary.

In [4]:
# Remove duplicate entries from both datasets
amazon_df_clean = amazon_df.drop_duplicates()
goodreads_df_clean = goodreads_df.drop_duplicates()

# Drop rows with missing values
amazon_df_clean = amazon_df_clean.dropna()
goodreads_df_clean = goodreads_df_clean.dropna()

# Verify cleaning steps
print("Cleaned Amazon Reviews shape:", amazon_df_clean.shape)
print("Cleaned GoodReads shape:", goodreads_df_clean.shape)


Cleaned Amazon Reviews shape: (9296, 10)
Cleaned GoodReads shape: (3831, 14)


Merging the Datasets

 we merge the Amazon Reviews into it by matching the GoodReads isbn with the Amazon Id. We include the review/text column from the Amazon dataset.

In [34]:
# Inspect the common columns to verify merge keys
print("Columns in Amazon Reviews:", amazon_df_clean.columns)
print("Columns in GoodReads dataset:", goodreads_df_clean.columns)

# Merge on GoodReads' 'isbn' and Amazon's 'Id'
# We take only the necessary columns from Amazon (Id and review/text)
merged_df = pd.merge(goodreads_df_clean,
                     amazon_df_clean[['Id', 'review/text']],
                     how='left',
                     left_on='isbn',
                     right_on='Id')


merged_df = merged_df.drop(columns=['Id'])

print("Merged DataFrame preview:")
print(merged_df.head())
print("Merged DataFrame shape:", merged_df.shape)


Columns in Amazon Reviews: Index(['Id', 'Title', 'Price', 'User_id', 'profileName', 'review/helpfulness',
       'review/score', 'review/time', 'review/summary', 'review/text'],
      dtype='object')
Columns in GoodReads dataset: Index(['title', 'titleComplete', 'description', 'genres', 'isbn', 'publisher',
       'author', 'characters', 'places', 'ratingHistogram', 'ratingsCount',
       'reviewsCount', 'numPages', 'language'],
      dtype='object')
Merged DataFrame preview:
                       title  \
0          Project Hail Mary   
1    The Talented Mr. Ripley   
2   Tell the Wolves I'm Home   
3      P.S. I Still Love You   
4  The House on Mango Street   

                                       titleComplete  \
0                                  Project Hail Mary   
1               The Talented Mr. Ripley (Ripley, #1)   
2                           Tell the Wolves I'm Home   
3  P.S. I Still Love You (To All the Boys I've Lo...   
4                          The House on Mango 

Exporting the Cleaned and Merged Data

After merging, we export the cleaned individual datasets and the merged dataset for later use in our model training and evaluation stages.

In [6]:
# Export the cleaned individual datasets and merged dataset to CSV files
amazon_df_clean.to_csv('data/amazon_reviews_clean.csv', index=False)
goodreads_df_clean.to_csv('data/goodreads_clean.csv', index=False)
merged_df.to_csv('data/merged_books_data.csv', index=False)

print("Cleaned datasets and merged data exported successfully!")


Cleaned datasets and merged data exported successfully!


Feature Extraction for Content-Based Recommender

First, we’ll create a combined text field from the book description and review text. Then, we’ll use scikit-learn’s TF-IDF vectorizer to extract features.

In [7]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the merged dataset
merged_df = pd.read_csv('data/merged_books_data.csv')
print("Merged dataset loaded. Sample:")
print(merged_df.head())

# Create a new column 'text_features' combining description and review text
merged_df['text_features'] = merged_df['description'].fillna('') + " " + merged_df['review/text'].fillna('')
print("\nCombined text features preview:")
print(merged_df[['title', 'text_features']].head())

# Use TF-IDF to convert text data to vectors
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_matrix = tfidf_vectorizer.fit_transform(merged_df['text_features'])

print("\nTF-IDF matrix shape:", tfidf_matrix.shape)


Merged dataset loaded. Sample:
                       title  \
0          Project Hail Mary   
1    The Talented Mr. Ripley   
2   Tell the Wolves I'm Home   
3      P.S. I Still Love You   
4  The House on Mango Street   

                                       titleComplete  \
0                                  Project Hail Mary   
1               The Talented Mr. Ripley (Ripley, #1)   
2                           Tell the Wolves I'm Home   
3  P.S. I Still Love You (To All the Boys I've Lo...   
4                          The House on Mango Street   

                                         description  \
0  Ryland Grace is the sole survivor on a despera...   
1  Since his debut in 1955, Tom Ripley has evolve...   
2  In this striking literary debut, Carol Rifka B...   
3  Lara Jean didn’t expect to really fall for Pet...   
4  Acclaimed by critics, beloved by readers of al...   

                                              genres        isbn  \
0  ['Science Fiction Fantasy', 'Au

Building the Content-Based Recommender

Using the TF-IDF vectors, we compute cosine similarity between books. This allows us to define a function that, given a book title, recommends similar books.

In [33]:
import numpy as np

# Compute cosine similarity between TF-IDF feature vectors
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Create a mapping from book title to index
indices = pd.Series(merged_df.index, index=merged_df['title']).drop_duplicates()

def recommend_books(title, cosine_sim=cosine_sim, df=merged_df, indices=indices, top_n=5):
    # Get the index of the book that matches the title
    idx = indices.get(title)
    if idx is None:
        return "Book not found in our dataset."
    
    # Get the pairwise similarity scores of all books with that book
    sim_scores = list(enumerate(cosine_sim[idx]))
    # Sort the books based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    # Skip the first one since it is the book itself
    sim_scores = sim_scores[1: top_n+1]
    # Get the indices of the most similar books
    book_indices = [i[0] for i in sim_scores]
    
    # Return the top n most similar books
    return df[['title', 'genre']].iloc[book_indices]




Genre Classification Model using Keras

We now build a simple feed-forward neural network to classify a book’s genre based on its text features. This serves as a training task and evaluation metric for our feature extraction.

In [9]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Assume 'genre' is the target and 'text_features' is the input
texts = merged_df['text_features'].astype(str)
labels = merged_df['genres']

# Encode genres into numerical labels
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Tokenize the texts
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
max_seq_length = 200  # adjust as necessary
X = pad_sequences(sequences, maxlen=max_seq_length)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, encoded_labels, test_size=0.2, random_state=42)

print("Training samples:", X_train.shape[0])
print("Test samples:", X_test.shape[0])


Training samples: 6061
Test samples: 1516


Building and Training the Neural Network

In [10]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense

# Define the model architecture
model = Sequential([
    Embedding(input_dim=10000, output_dim=64, input_length=max_seq_length),
    GlobalAveragePooling1D(),
    Dense(64, activation='relu'),
    Dense(len(label_encoder.classes_), activation='softmax')
])

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

print(model.summary())

# Train the model
history = model.fit(X_train, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_split=0.1)




None
Epoch 1/10
[1m171/171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.0633 - loss: 7.4021 - val_accuracy: 0.0741 - val_loss: 5.9534
Epoch 2/10
[1m171/171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.0721 - loss: 5.7189 - val_accuracy: 0.1285 - val_loss: 5.9591
Epoch 3/10
[1m171/171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.1029 - loss: 5.4080 - val_accuracy: 0.1417 - val_loss: 6.1061
Epoch 4/10
[1m171/171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.1554 - loss: 5.0371 - val_accuracy: 0.2834 - val_loss: 6.0818
Epoch 5/10
[1m171/171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.2503 - loss: 4.6476 - val_accuracy: 0.2916 - val_loss: 6.2306
Epoch 6/10
[1m171/171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.2840 - loss: 4.3891 - val_accuracy: 0.3081 - val_loss: 6.2602
Epoch 7/10
[1m171/171

Evaluating the Genre Classifier

In [11]:
# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test)
print("Test Accuracy:", accuracy)


[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.3700 - loss: 7.2689
Test Accuracy: 0.3746701776981354


Purpose:
We want to convert the text of all 17,000 Amazon reviews into numbers that the computer can work with. We do this using a TF-IDF vectorizer. TF-IDF stands for “Term Frequency-Inverse Document Frequency” and helps us capture which words are most important in each review.

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer

reviews = amazon_df_clean['review/text'].astype(str)

# Create a TF-IDF vectorizer; you can adjust max_features as needed.
tfidf_vectorizer_reviews = TfidfVectorizer(stop_words='english', max_features=5000)

# Fit and transform the review texts into a numerical matrix.
tfidf_reviews_matrix = tfidf_vectorizer_reviews.fit_transform(reviews)

print("TF-IDF matrix for 17k reviews shape:", tfidf_reviews_matrix.shape)


TF-IDF matrix for 17k reviews shape: (9296, 5000)


We want to build a simple web interface where a user can type in a review. We’ll use Gradio—a tool that makes it easy to create user interfaces for machine learning applications.

In [27]:
import gradio as gr

# Assuming merged_df is our merged dataframe with a 'title' column.
book_titles = merged_df['title'].unique().tolist()

def user_review_input(book_title, user_review):
    # Verify the book exists in our dataset
    if book_title not in book_titles:
        return "Book not found in our dataset."
    return f"Review for '{book_title}':\n{user_review}"

# Create a Gradio interface with two inputs: a dropdown for the book title and a textbox for the review.
interface = gr.Interface(
    fn=user_review_input, 
    inputs=[
        gr.Dropdown(choices=book_titles, label="Select Book Title"),
        gr.Textbox(lines=5, placeholder="Enter your review here...", label="Your Review")
    ], 
    outputs="text", 
    title="Book Review Input",
    description="Select a book and enter your review."
)

interface.launch()




* Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.




This section takes a review entered by the user, converts it into numbers using the same TF-IDF model we built for the 17k Amazon reviews, and then finds the most similar reviews from our dataset. In simple terms, it’s like comparing the “fingerprint” of the new review to all existing reviews to see which ones match best.

In [30]:
from sklearn.metrics.pairwise import cosine_similarity

def get_similar_reviews(user_review, top_n=5):
    # Convert the user's review into its numerical (TF-IDF) representation.
    user_review_vector = tfidf_vectorizer_reviews.transform([user_review])
    
    # Compute similarity scores between the user's review and all 17k reviews.
    similarities = cosine_similarity(user_review_vector, tfidf_reviews_matrix)
    
    # Find the indices of the top_n most similar reviews.
    similar_indices = similarities[0].argsort()[-top_n:][::-1]
    
    # Retrieve the matching review texts from the Amazon reviews dataset.
    similar_reviews = amazon_df_clean.iloc[similar_indices][['review/text']].reset_index(drop=True)
    
    # Convert the result to a string for display.
    return similar_reviews.to_string(index=False)


import gradio as gr

review_similarity_interface = gr.Interface(
    fn=get_similar_reviews,
    inputs=gr.Textbox(lines=5, placeholder="Enter your review text...", label="Your Review"),
    outputs="text",
    title="Similar Reviews Finder",
    description="Enter a review and see the top similar reviews from our dataset."
)

review_similarity_interface.launch()



* Running on local URL:  http://127.0.0.1:7862

To create a public link, set `share=True` in `launch()`.




This part uses VADER—a tool that understands the mood of text—to analyze the sentiment of a review. In addition, we use a simple rule: if the review’s rating is above 4, we consider it positive; below 4, negative.

In [31]:
import gradio as gr
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer


nltk.download('vader_lexicon')

# Initialize the VADER sentiment analyzer.
sia = SentimentIntensityAnalyzer()

def analyze_review_sentiment(review_text, rating):
    # Use VADER to compute sentiment scores.
    sentiment_scores = sia.polarity_scores(review_text)
    compound_score = sentiment_scores['compound']
    
    # Use the numeric rating to decide the review's sentiment.
    if rating > 4:
        rating_sentiment = "Positive"
    else:
        rating_sentiment = "Negative"
    
    # Format the result for display.
    result_str = (
        f"Review: {review_text}\n"
        f"Compound Score: {compound_score}\n"
        f"Rating Sentiment (by rule): {rating_sentiment}\n"
        f"Full VADER Scores: {sentiment_scores}"
    )
    return result_str

# Create a Gradio interface that takes a review and a rating as inputs.
sentiment_interface = gr.Interface(
    fn=analyze_review_sentiment,
    inputs=[
        gr.Textbox(lines=5, placeholder="Enter your review text...", label="Your Review"),
        gr.Slider(minimum=1, maximum=5, step=0.1, label="Rating")
    ],
    outputs="text",
    title="VADER Sentiment Analysis",
    description="Enter a review and its rating to see a sentiment analysis."
)

sentiment_interface.launch()


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Mom\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


* Running on local URL:  http://127.0.0.1:7863

To create a public link, set `share=True` in `launch()`.


