In [2]:
import pandas as pd

# --- Amazon Books Reviews ---
# Location of the raw reviews export; change it if your copy lives elsewhere.
amazon_reviews_path = 'data/book_reviews.csv'
amazon_df = pd.read_csv(amazon_reviews_path)
print("Amazon Books Reviews dataset loaded:", amazon_df.head(), sep="\n")

# --- GoodReads Books (carries the description and genre columns) ---
# Location of the GoodReads export; change it if your copy lives elsewhere.
goodreads_path = 'data/goodreads_dataset.csv'
goodreads_df = pd.read_csv(goodreads_path)
print("\nGoodReads Books dataset loaded:", goodreads_df.head(), sep="\n")


Amazon Books Reviews dataset loaded:
           Id                               Title  Price         User_id  \
0  0671551345  Night World: Daughters Of Darkness    NaN   ADB0JID2XRFYR   
1  0671551345  Night World: Daughters Of Darkness    NaN             NaN   
2  0671551345  Night World: Daughters Of Darkness    NaN             NaN   
3  0671551345  Night World: Daughters Of Darkness    NaN  A1V0SFB3AXM8JK   
4  0671551345  Night World: Daughters Of Darkness    NaN             NaN   

                                       profileName review/helpfulness  \
0  Harmony-Faith Charisma Izabela Jazmyn McDonague                1/3   
1                                              NaN                1/3   
2                                              NaN                1/3   
3                        K. Davis "The Rose Bride"                0/2   
4                                              NaN                0/0   

   review/score  review/time  \
0           5.0   1076457600   
1  

Preliminary Data Exploration

Next, we check the structure and basic statistics for both datasets. This helps us identify any issues like missing values or duplicates.

In [4]:
# Column dtypes and non-null counts for each frame. DataFrame.info() writes its
# report itself, so it is called directly rather than wrapped in print().
print("Amazon Reviews DataFrame Info:")
amazon_df.info()
print("\nGoodReads DataFrame Info:")
goodreads_df.info()

# Per-column count of missing entries in each frame.
print("\nMissing values in Amazon Reviews:", amazon_df.isna().sum(), sep="\n")
print("\nMissing values in GoodReads dataset:", goodreads_df.isna().sum(), sep="\n")


Amazon Reviews DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17158 entries, 0 to 17157
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Id                  17158 non-null  object 
 1   Title               17158 non-null  object 
 2   Price               10897 non-null  float64
 3   User_id             14573 non-null  object 
 4   profileName         14572 non-null  object 
 5   review/helpfulness  17158 non-null  object 
 6   review/score        17158 non-null  float64
 7   review/time         17158 non-null  int64  
 8   review/summary      17155 non-null  object 
 9   review/text         17158 non-null  object 
dtypes: float64(2), int64(1), object(7)
memory usage: 1.3+ MB

GoodReads DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14712 entries, 0 to 14711
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           ------

Data Cleaning

We clean the data by removing duplicates and handling missing values. For now, we drop rows with missing data; you can later choose to impute values if necessary.

In [5]:
# Cleaning is the same two-step chain for each source: discard exact duplicate
# rows first, then discard every row that has a missing value in any column.
# New names are used so the raw frames stay available for comparison.
amazon_df_clean = amazon_df.drop_duplicates().dropna()
goodreads_df_clean = goodreads_df.drop_duplicates().dropna()

# Report how many rows and columns survived the cleaning.
print("Cleaned Amazon Reviews shape:", amazon_df_clean.shape)
print("Cleaned GoodReads shape:", goodreads_df_clean.shape)


Cleaned Amazon Reviews shape: (9296, 10)
Cleaned GoodReads shape: (3831, 14)


Merging the Datasets

Using the GoodReads dataset as the base, we merge the Amazon reviews into it by matching the GoodReads `isbn` with the Amazon `Id`. From the Amazon dataset we keep only the `review/text` column.

In [6]:
# Show both column lists so the join keys can be checked by eye.
print("Columns in Amazon Reviews:", amazon_df_clean.columns)
print("Columns in GoodReads dataset:", goodreads_df_clean.columns)

# Only the join key and the review body are needed from the Amazon side.
amazon_review_text = amazon_df_clean[['Id', 'review/text']]

# Left join: every GoodReads row is kept, matched on GoodReads 'isbn' against
# Amazon 'Id'. A book with several matching reviews appears once per review;
# a book with none keeps a single row with a missing 'review/text'.
merged_df = (
    goodreads_df_clean
    .merge(amazon_review_text, how='left', left_on='isbn', right_on='Id')
    .drop(columns=['Id'])  # the Amazon key is redundant next to 'isbn'
)

print("Merged DataFrame preview:")
print(merged_df.head())
print("Merged DataFrame shape:", merged_df.shape)


Columns in Amazon Reviews: Index(['Id', 'Title', 'Price', 'User_id', 'profileName', 'review/helpfulness',
       'review/score', 'review/time', 'review/summary', 'review/text'],
      dtype='object')
Columns in GoodReads dataset: Index(['title', 'titleComplete', 'description', 'genres', 'isbn', 'publisher',
       'author', 'characters', 'places', 'ratingHistogram', 'ratingsCount',
       'reviewsCount', 'numPages', 'language'],
      dtype='object')
Merged DataFrame preview:
                       title  \
0          Project Hail Mary   
1    The Talented Mr. Ripley   
2   Tell the Wolves I'm Home   
3      P.S. I Still Love You   
4  The House on Mango Street   

                                       titleComplete  \
0                                  Project Hail Mary   
1               The Talented Mr. Ripley (Ripley, #1)   
2                           Tell the Wolves I'm Home   
3  P.S. I Still Love You (To All the Boys I've Lo...   
4                          The House on Mango 

Exporting the Cleaned and Merged Data

After merging, we export the cleaned individual datasets and the merged dataset for later use in our model training and evaluation stages.

In [8]:
# Write the two cleaned sources and the merged table to CSV for the later
# modelling stages; index=False keeps the row index out of the files.
amazon_df_clean.to_csv(path_or_buf='data/amazon_reviews_clean.csv', index=False)
goodreads_df_clean.to_csv(path_or_buf='data/goodreads_clean.csv', index=False)
merged_df.to_csv(path_or_buf='data/merged_books_data.csv', index=False)

print("Cleaned datasets and merged data exported successfully!")


Cleaned datasets and merged data exported successfully!


Feature Extraction for Content-Based Recommender

First, we’ll create a combined text field from the book description and review text. Then, we’ll use scikit-learn’s TF-IDF vectorizer to extract features.

In [9]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Re-read the exported merge so this section starts from the file on disk.
merged_df = pd.read_csv('data/merged_books_data.csv')
print("Merged dataset loaded. Sample:", merged_df.head(), sep="\n")

# One document per row: the description followed by the review text. Missing
# pieces become empty strings so the concatenation never produces NaN.
description_text = merged_df['description'].fillna('')
review_text = merged_df['review/text'].fillna('')
merged_df['text_features'] = description_text + " " + review_text
print("\nCombined text features preview:", merged_df[['title', 'text_features']].head(), sep="\n")

# Vectorise the documents with TF-IDF, ignoring English stop words and keeping
# at most 5000 vocabulary terms.
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(merged_df['text_features'])

print("\nTF-IDF matrix shape:", tfidf_matrix.shape)


Merged dataset loaded. Sample:
                       title  \
0          Project Hail Mary   
1    The Talented Mr. Ripley   
2   Tell the Wolves I'm Home   
3      P.S. I Still Love You   
4  The House on Mango Street   

                                       titleComplete  \
0                                  Project Hail Mary   
1               The Talented Mr. Ripley (Ripley, #1)   
2                           Tell the Wolves I'm Home   
3  P.S. I Still Love You (To All the Boys I've Lo...   
4                          The House on Mango Street   

                                         description  \
0  Ryland Grace is the sole survivor on a despera...   
1  Since his debut in 1955, Tom Ripley has evolve...   
2  In this striking literary debut, Carol Rifka B...   
3  Lara Jean didn’t expect to really fall for Pet...   
4  Acclaimed by critics, beloved by readers of al...   

                                              genres        isbn  \
0  ['Science Fiction Fantasy', 'Au

Building the Content-Based Recommender

Using the TF-IDF vectors, we compute cosine similarity between books. This allows us to define a function that, given a book title, recommends similar books.

In [10]:
import numpy as np

# Pairwise cosine similarity between every pair of TF-IDF row vectors.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Map each title to the row number of its first occurrence.
# The merged frame repeats a title once per matching review, so the lookup must
# be de-duplicated on the *index* (the titles). Series.drop_duplicates() would
# compare the values (row numbers, already unique) and remove nothing, leaving
# a title lookup that returns several rows instead of one.
title_to_row = pd.Series(merged_df.index, index=merged_df['title'])
indices = title_to_row[~title_to_row.index.duplicated(keep='first')]

def recommend_books(title, cosine_sim=None, df=None, indices=None, top_n=5):
    """Recommend the books whose text is most similar to the given title.

    Parameters
    ----------
    title : str
        Exact title to look up in ``indices``.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix addressed by row position of ``df``.
        When omitted, the notebook-level ``cosine_sim`` is used.
    df : pandas.DataFrame, optional
        Frame with a ``title`` column and a ``genres`` column (``genre`` is
        accepted as a fallback). When omitted, the notebook-level
        ``merged_df`` is used.
    indices : pandas.Series, optional
        Maps a title to its row position. When omitted, the notebook-level
        ``indices`` is used.
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame or str
        The title and genre columns of the recommended rows, best match
        first, or the string "Book not found in our dataset." when the title
        is not present in ``indices``.
    """
    # Omitted arguments are resolved when the function is called, so the
    # function always sees the current notebook-level objects.
    scope = globals()
    if cosine_sim is None:
        cosine_sim = scope["cosine_sim"]
    if df is None:
        df = scope["merged_df"]
    if indices is None:
        indices = scope["indices"]

    idx = indices.get(title)
    if idx is None:
        return "Book not found in our dataset."
    # A title that occurs on several rows makes the lookup return all of them;
    # collapse scalar and multi-row results alike to the first row position.
    idx = int(np.ravel(idx)[0])

    # Rank every row by its similarity to the query row, highest first.
    ranked = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)

    # Walk the ranking, skipping the query row itself, other rows of the same
    # book, and titles that were already picked.
    titles = df['title']
    seen_titles = {title}
    book_indices = []
    for pos, _score in ranked:
        if len(book_indices) >= top_n:
            break
        if pos == idx:
            continue
        candidate = titles.iloc[pos]
        if candidate in seen_titles:
            continue
        seen_titles.add(candidate)
        book_indices.append(pos)

    # The GoodReads column is named 'genres'; 'genre' is only a fallback for
    # frames that use the singular name.
    genre_col = 'genres' if 'genres' in df.columns else 'genre'
    return df[['title', genre_col]].iloc[book_indices]

# Demo call with a placeholder title; swap in a real title from the dataset.
print("Recommendations for 'Some Book Title':")
example_recommendations = recommend_books("Some Book Title")
print(example_recommendations)


Recommendations for 'Some Book Title':
Book not found in our dataset.


Genre Classification Model using Keras

We now build a simple feed-forward neural network to classify a book’s genre based on its text features. This serves both as a training task and as a way to evaluate how informative our extracted text features are.

In [18]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Inputs are the combined text; targets come from the GoodReads 'genres'
# column. The merged frame's column name is plural, so selecting 'genre'
# would raise a KeyError.
texts = merged_df['text_features'].astype(str)
labels = merged_df['genres']

# Encode the genre labels as integer class ids.
# NOTE(review): the preview of 'genres' looks like a stringified list of
# several genres per book; if so, every distinct list becomes its own class.
# Confirm, and consider extracting a single primary genre first.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Tokenize the texts, keeping the 10000 most frequent words.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
max_seq_length = 200  # adjust as necessary
# Pad or truncate every sequence to the same length.
X = pad_sequences(sequences, maxlen=max_seq_length)

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, encoded_labels, test_size=0.2, random_state=42)

print("Training samples:", X_train.shape[0])
print("Test samples:", X_test.shape[0])


SyntaxError: invalid syntax (3157163696.py, line 2)