In [1]:
# Import pandas
import pandas as pd
# Import the required dependencies from sklearn
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder

#import numpy
import numpy as np

#import from keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense

#import gradio
import gradio as gr

#import VADER
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer


In [4]:

# Load the Amazon Books Reviews dataset
amazon_reviews_path = 'Resources/book_reviews.csv'  # Adjust file path as needed
amazon_df = pd.read_csv(amazon_reviews_path)
print("Amazon Books Reviews dataset loaded:")
print(amazon_df.head())

# Load the GoodReads Books dataset (with Description and Genre)
goodreads_path = 'Resources/goodreads_dataset.csv'  # Adjust file path as needed
goodreads_df = pd.read_csv(goodreads_path)
print("\nGoodReads Books dataset loaded:")
print(goodreads_df.head())

Amazon Books Reviews dataset loaded:
           Id                               Title  Price         User_id  \
0  0671551345  Night World: Daughters Of Darkness    NaN   ADB0JID2XRFYR   
1  0671551345  Night World: Daughters Of Darkness    NaN             NaN   
2  0671551345  Night World: Daughters Of Darkness    NaN             NaN   
3  0671551345  Night World: Daughters Of Darkness    NaN  A1V0SFB3AXM8JK   
4  0671551345  Night World: Daughters Of Darkness    NaN             NaN   

                                       profileName review/helpfulness  \
0  Harmony-Faith Charisma Izabela Jazmyn McDonague                1/3   
1                                              NaN                1/3   
2                                              NaN                1/3   
3                        K. Davis "The Rose Bride"                0/2   
4                                              NaN                0/0   

   review/score  review/time  \
0           5.0   1076457600   
1  

In [5]:
# Display basic info about each dataset
print("Amazon Reviews DataFrame Info:")
amazon_df.info()
print("\nGoodReads DataFrame Info:")
goodreads_df.info()

# Check for missing values in each dataset
print("\nMissing values in Amazon Reviews:")
print(amazon_df.isnull().sum())
print("\nMissing values in GoodReads dataset:")
print(goodreads_df.isnull().sum())

Amazon Reviews DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17158 entries, 0 to 17157
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Id                  17158 non-null  object 
 1   Title               17158 non-null  object 
 2   Price               10897 non-null  float64
 3   User_id             14573 non-null  object 
 4   profileName         14572 non-null  object 
 5   review/helpfulness  17158 non-null  object 
 6   review/score        17158 non-null  float64
 7   review/time         17158 non-null  int64  
 8   review/summary      17155 non-null  object 
 9   review/text         17158 non-null  object 
dtypes: float64(2), int64(1), object(7)
memory usage: 1.3+ MB

GoodReads DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14712 entries, 0 to 14711
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           ------

In [6]:
#  Get the number of reviews by score:
amazon_df['review/score'].value_counts()

review/score
5.0    10556
4.0     3265
3.0     1458
1.0      979
2.0      900
Name: count, dtype: int64

In [9]:
#Group scores using OneHotEncoder:


# # encoded_ratings_df = pd.DataFrame(encoded_ratings.toarray(), columns=encoder.get_feature_names_out(['review/score']))  # Convert to DataFrame
# # final_df = pd.concat([amazon_df, encoded_ratings_df], axis=1)  # Concatenate the original DataFrame with the encoded DataFrame

# encoder = OneHotEncoder()
# encoded_ratings = encoder.fit_transform(amazon_df[['review/score']])
# encoded_ratings_df = pd.DataFrame(encoded_ratings, columns=encoder.get_feature_names_out(['review/score']))
# final_df = pd.concat([amazon_df, encoded_ratings_df], axis=1)

ValueError: Shape of passed values is (17158, 1), indices imply (17158, 5)

In [10]:
# Remove duplicate entries from both datasets
amazon_df_clean = amazon_df.drop_duplicates()
goodreads_df_clean = goodreads_df.drop_duplicates()

# Drop rows with missing values
amazon_df_clean = amazon_df_clean.dropna()
goodreads_df_clean = goodreads_df_clean.dropna()

# Verify cleaning steps
print("Cleaned Amazon Reviews shape:", amazon_df_clean.shape)
print("Cleaned GoodReads shape:", goodreads_df_clean.shape)

Cleaned Amazon Reviews shape: (9296, 10)
Cleaned GoodReads shape: (3831, 14)


In [11]:
# Inspect the common columns to verify merge keys
print("Columns in Amazon Reviews:", amazon_df_clean.columns)
print("Columns in GoodReads dataset:", goodreads_df_clean.columns)

# Merge on GoodReads' 'isbn' and Amazon's 'Id'
# We take only the necessary columns from Amazon (Id and review/text)
merged_df = pd.merge(goodreads_df_clean,
                     amazon_df_clean[['Id', 'review/text']],
                     how='left',
                     left_on='isbn',
                     right_on='Id')


merged_df = merged_df.drop(columns=['Id'])

print("Merged DataFrame preview:")
print(merged_df.head())
print("Merged DataFrame shape:", merged_df.shape)

Columns in Amazon Reviews: Index(['Id', 'Title', 'Price', 'User_id', 'profileName', 'review/helpfulness',
       'review/score', 'review/time', 'review/summary', 'review/text'],
      dtype='object')
Columns in GoodReads dataset: Index(['title', 'titleComplete', 'description', 'genres', 'isbn', 'publisher',
       'author', 'characters', 'places', 'ratingHistogram', 'ratingsCount',
       'reviewsCount', 'numPages', 'language'],
      dtype='object')
Merged DataFrame preview:
                       title  \
0          Project Hail Mary   
1    The Talented Mr. Ripley   
2   Tell the Wolves I'm Home   
3      P.S. I Still Love You   
4  The House on Mango Street   

                                       titleComplete  \
0                                  Project Hail Mary   
1               The Talented Mr. Ripley (Ripley, #1)   
2                           Tell the Wolves I'm Home   
3  P.S. I Still Love You (To All the Boys I've Lo...   
4                          The House on Mango 

In [13]:
# Export the cleaned individual datasets and merged dataset to CSV files
amazon_df_clean.to_csv('Resources/amazon_reviews_clean.csv', index=False)
goodreads_df_clean.to_csv('Resources/goodreads_clean.csv', index=False)
merged_df.to_csv('Resources/merged_books_data.csv', index=False)

print("Cleaned datasets and merged data exported successfully!")

Cleaned datasets and merged data exported successfully!


In [15]:
# Load the merged dataset
merged_df = pd.read_csv('Resources/merged_books_data.csv')
print("Merged dataset loaded. Sample:")
print(merged_df.head())

# Create a new column 'text_features' combining description and review text
merged_df['text_features'] = merged_df['description'].fillna('') + " " + merged_df['review/text'].fillna('')
print("\nCombined text features preview:")
print(merged_df[['title', 'text_features']].head())

# Use TF-IDF to convert text data to vectors
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_matrix = tfidf_vectorizer.fit_transform(merged_df['text_features'])

print("\nTF-IDF matrix shape:", tfidf_matrix.shape)

Merged dataset loaded. Sample:
                       title  \
0          Project Hail Mary   
1    The Talented Mr. Ripley   
2   Tell the Wolves I'm Home   
3      P.S. I Still Love You   
4  The House on Mango Street   

                                       titleComplete  \
0                                  Project Hail Mary   
1               The Talented Mr. Ripley (Ripley, #1)   
2                           Tell the Wolves I'm Home   
3  P.S. I Still Love You (To All the Boys I've Lo...   
4                          The House on Mango Street   

                                         description  \
0  Ryland Grace is the sole survivor on a despera...   
1  Since his debut in 1955, Tom Ripley has evolve...   
2  In this striking literary debut, Carol Rifka B...   
3  Lara Jean didn’t expect to really fall for Pet...   
4  Acclaimed by critics, beloved by readers of al...   

                                              genres        isbn  \
0  ['Science Fiction Fantasy', 'Au

In [20]:
#assign sentiment score to review and description text
text_features = merged_df['text_features']

analyzer = SentimentIntensityAnalyzer()
sentiment_scores = text_features.apply(lambda x: analyzer.polarity_scores(x)['compound'])
for index, score in sentiment_scores.items():
    print(f"User review's sentiment score for review {index} is: {score}")

print(f"User review's sentiment score is : {sentiment_scores}")

LookupError: 
**********************************************************************
  Resource [93mvader_lexicon[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('vader_lexicon')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93msentiment/vader_lexicon.zip/vader_lexicon/vader_lexicon.txt[0m

  Searched in:
    - '/Users/laurenchristiansen/nltk_data'
    - '/opt/anaconda3/envs/dev/nltk_data'
    - '/opt/anaconda3/envs/dev/share/nltk_data'
    - '/opt/anaconda3/envs/dev/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - ''
**********************************************************************


In [21]:
vectorizer = TfidfVectorizer(stop_words="english")
# Vectorize text
vectorized_text = vectorizer.fit_transform(text_features["text_features"])
# Vectorize user's review
tf_user = vectorizer.transform(["text_features"])

KeyError: 'text_features'

In [None]:
# # Calculate the cosine similarity
# similarity = cosine_similarity(tf_user, tf_reviews)

In [None]:
# # Filter the reviews based on ratings according to user review's sentiment
# if sentiment >= 0:
#     reviews_filtered = reviews[reviews['review/score'] >= 4]
# else:
#     reviews_filtered = reviews[reviews['review/score'] < 4]

In [22]:
# Compute cosine similarity between TF-IDF feature vectors
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Create a mapping from book title to index
indices = pd.Series(merged_df.index, index=merged_df['title']).drop_duplicates()

def recommend_books(title, cosine_sim=cosine_sim, df=merged_df, indices=indices, top_n=5):
    # Get the index of the book that matches the title
    idx = indices.get(title)
    if idx is None:
        return "Book not found in our dataset."
    
    # Get the pairwise similarity scores of all books with that book
    sim_scores = list(enumerate(cosine_sim[idx]))
    # Sort the books based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    # Skip the first one since it is the book itself
    sim_scores = sim_scores[1: top_n+1]
    # Get the indices of the most similar books
    book_indices = [i[0] for i in sim_scores]
    
    # Return the top n most similar books
    return df[['title', 'genre']].iloc[book_indices]

In [23]:
# Assume 'genre' is the target and 'text_features' is the input
texts = merged_df['text_features'].astype(str)
labels = merged_df['genres']

# Encode genres into numerical labels
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Tokenize the texts
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
max_seq_length = 200  # adjust as necessary
X = pad_sequences(sequences, maxlen=max_seq_length)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, encoded_labels, test_size=0.2, random_state=42)

print("Training samples:", X_train.shape[0])
print("Test samples:", X_test.shape[0])

Training samples: 6061
Test samples: 1516


In [24]:
# Define the model architecture
model = Sequential([
    Embedding(input_dim=10000, output_dim=64, input_length=max_seq_length),
    GlobalAveragePooling1D(),
    Dense(64, activation='relu'),
    Dense(len(label_encoder.classes_), activation='softmax')
])

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

print(model.summary())

# Train the model
history = model.fit(X_train, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_split=0.1)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 64)           640000    
                                                                 
 global_average_pooling1d (G  (None, 64)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 64)                4160      
                                                                 
 dense_1 (Dense)             (None, 3368)              218920    
                                                                 
Total params: 863,080
Trainable params: 863,080
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/10
  1/171 [..............................] - ETA: 30s - loss: 8.1215 - accuracy: 0.0000e+00

2025-03-08 13:05:05.870116: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [25]:
# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test)
print("Test Accuracy:", accuracy)

Test Accuracy: 0.3588390648365021


In [26]:
reviews = amazon_df_clean['review/text'].astype(str)

# Create a TF-IDF vectorizer; you can adjust max_features as needed.
tfidf_vectorizer_reviews = TfidfVectorizer(stop_words='english', max_features=5000)

# Fit and transform the review texts into a numerical matrix.
tfidf_reviews_matrix = tfidf_vectorizer_reviews.fit_transform(reviews)

print("TF-IDF matrix for 17k reviews shape:", tfidf_reviews_matrix.shape)

TF-IDF matrix for 17k reviews shape: (9296, 5000)


In [27]:
# Assuming merged_df is our merged dataframe with a 'title' column.
book_titles = merged_df['title'].unique().tolist()

def user_review_input(book_title, user_review):
    # Verify the book exists in our dataset
    if book_title not in book_titles:
        return "Book not found in our dataset."
    return f"Review for '{book_title}':\n{user_review}"

# Create a Gradio interface with two inputs: a dropdown for the book title and a textbox for the review.
interface = gr.Interface(
    fn=user_review_input, 
    inputs=[
        gr.Dropdown(choices=book_titles, label="Select Book Title"),
        gr.Textbox(lines=5, placeholder="Enter your review here...", label="Your Review")
    ], 
    outputs="text", 
    title="Book Review Input",
    description="Select a book and enter your review."
)

interface.launch()

* Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




In [None]:
import streamlit as st

#Title, Header, Subheader
st.title("Book Recommender")
st.header("Tell us about a book you like, and we'll recommend something you might like just as much!")
st.subheader("Please follow the prompts below.")

#Set selection box
selection = st.selectbox()
st.write("You've selected: ")

In [28]:
def get_similar_reviews(user_review, top_n=5):
    # Convert the user's review into its numerical (TF-IDF) representation.
    user_review_vector = tfidf_vectorizer_reviews.transform([user_review])
    
    # Compute similarity scores between the user's review and all 17k reviews.
    similarities = cosine_similarity(user_review_vector, tfidf_reviews_matrix)
    
    # Find the indices of the top_n most similar reviews.
    similar_indices = similarities[0].argsort()[-top_n:][::-1]
    
    # Retrieve the matching review texts from the Amazon reviews dataset.
    similar_reviews = amazon_df_clean.iloc[similar_indices][['review/text']].reset_index(drop=True)
    
    # Convert the result to a string for display.
    return similar_reviews.to_string(index=False)



In [29]:
# review_similarity_interface = gr.Interface(
#     fn=get_similar_reviews,
#     inputs=gr.Textbox(lines=5, placeholder="Enter your review text...", label="Your Review"),
#     outputs="text",
#     title="Similar Reviews Finder",
#     description="Enter a review and see the top similar reviews from our dataset."
# )

# review_similarity_interface.launch()

* Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.




In [30]:
nltk.download('vader_lexicon')

# Initialize the VADER sentiment analyzer.
sia = SentimentIntensityAnalyzer()

def analyze_review_sentiment(review_text, rating):
    # Use VADER to compute sentiment scores.
    sentiment_scores = sia.polarity_scores(review_text)
    compound_score = sentiment_scores['compound']
    
    # Use the numeric rating to decide the review's sentiment.
    if rating > 4:
        rating_sentiment = "Positive"
    else:
        rating_sentiment = "Negative"
    
    # Format the result for display.
    result_str = (
        f"Review: {review_text}\n"
        f"Compound Score: {compound_score}\n"
        f"Rating Sentiment (by rule): {rating_sentiment}\n"
        f"Full VADER Scores: {sentiment_scores}"
    )
    return result_str

# Create a Gradio interface that takes a review and a rating as inputs.
sentiment_interface = gr.Interface(
    fn=analyze_review_sentiment,
    inputs=[
        gr.Textbox(lines=5, placeholder="Enter your review text...", label="Your Review"),
        gr.Slider(minimum=1, maximum=5, step=0.1, label="Rating")
    ],
    outputs="text",
    title="VADER Sentiment Analysis",
    description="Enter a review and its rating to see a sentiment analysis."
)


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/laurenchristiansen/nltk_data...


In [None]:
# #histogram encoder
# data = {
#     'ratingHistogram': ['0-1', '1-2', '2-3', '3-4', '4-5']
# }
# df = pd.DataFrame(data)

# # Initialize the OneHotEncoder
# encoder = OneHotEncoder(sparse=False)

# # Fit and transform the data
# encoded_data = encoder.fit_transform(goodreads_df[['ratingHistogram']])

# # Create a DataFrame with the encoded data
# encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(['ratingHistogram']))

# # Concatenate the original DataFrame with the encoded DataFrame
# result_df = pd.concat([df, encoded_df], axis=1)

# # Display the result
# print(result_df)

In [None]:
# # Example DataFrame with ratings
# goodreads_df = {'Film': ['Film A', 'Film B', 'Film C'],
#         'Rating': [4.5, 3.5, 4.0]}
# ratings_df = pd.DataFrame(ratings)

# # Function to classify sentiment based on rating
# def classify_sentiment(rating):
#     if rating > 4:
#         return 'Positive'
#     else:
#         return 'Negative'

# # Apply the function to the Rating column
# goodreads_df['Sentiment'] = df['Rating'].apply(classify_sentiment)

# # Display the DataFrame
# print(df)

In [13]:
# # Import the spaCy library
# import spacy
# # Load the small English language model for spaCy
# nlp = spacy.load("en_core_web_sm")



In [14]:
# # Tokenize the first sentence using token.text
# reviews = reviews_df["review/text"]
# print(reviews)

# spacy_reviews = nlp(reviews)
# [token.text for token in spacy_reviews]

0        This is 1 of da bst books dat i have EVER read! @ my school, we are doing a play on this & im playin Mary-Lynette. i cant wait 2 get to the last chapters when they finally give in 2 each other! Gr...
1        first of all i thought that this was one of lj smith's best books she has written adn also the funniest. i love all the characters but my fave one in the book is Ash. he's really a hottie and a ba...
2        Once started I couldn't put it down, literally. I didn't stop til I'd read it through.Three sisters on the run from the Night Worlds patriachal society, they visit Oregon. Their brother finds out ...
3        This book is probably, in my opinion, one of (if not THE) worst in the Night World Series. It is Ash's story this time, who's soulmate just happens to be a human. (which Ironically was shadowed up...
4                                         The plot and characters are incredible. Everyone that likes the supernatural should read this book, and all the other Nigh

ValueError: [E1041] Expected a string, Doc, or bytes as input, but got: <class 'pandas.core.series.Series'>