In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import NearestNeighbors
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Embedding, Dense, Flatten, Input, concatenate
from sklearn.preprocessing import MultiLabelBinarizer
from ast import literal_eval


In [None]:
# Load the book dataset into the notebook-wide frame `data_df`.
# The path is relative, so it resolves against the kernel's working directory.
data_df = pd.read_csv("../Resources/book_recommender.csv")


In [None]:
# One-hot encode the 'Genres' column (when present) and normalise the column names.
if 'Genres' in data_df.columns:
    # Parse the stringified lists into real lists; non-string cells (e.g. NaN) become [].
    data_df['Genres'] = data_df['Genres'].apply(lambda x: literal_eval(x) if isinstance(x, str) else [])

    # Strip whitespace and lowercase so that spelling variants collapse to one label.
    data_df['Genres'] = data_df['Genres'].apply(lambda genres: [genre.strip().lower() for genre in genres])

    # One 0/1 indicator column per distinct genre label.
    mlb = MultiLabelBinarizer()

    # Build the indicator frame on data_df's own index: concat(axis=1) aligns rows by
    # index label, so a fresh 0..n-1 index would misalign rows (and introduce NaNs)
    # whenever data_df does not carry the default RangeIndex.
    genres_encoded = pd.DataFrame(
        mlb.fit_transform(data_df['Genres']),
        columns=mlb.classes_,
        index=data_df.index,
    )

    # Append the indicator columns to the original frame.
    data_df = pd.concat([data_df, genres_encoded], axis=1)

    # Title-case every column name (underscores become spaces), e.g. "science fiction" -> "Science Fiction".
    data_df.columns = [col.replace("_", " ").title() for col in data_df.columns]

    # The list-valued source column is no longer needed once encoded.
    data_df = data_df.drop(columns=['Genres'])

# Replace missing Characters / Places values with empty strings.
data_df['Characters'] = data_df['Characters'].fillna('')
data_df['Places'] = data_df['Places'].fillna('')

# Preview only the first rows instead of rendering the whole frame.
data_df.head()



In [None]:
# Columns rescaled with MinMaxScaler: the price plus the five star columns.
numerical_features = ["Price", "1 Star", "2 Star", "3 Star", "4 Star", "5 Star"]

# Fit the scaler on the selected columns and overwrite them in place with the scaled values.
scaler = MinMaxScaler()
data_df[numerical_features] = scaler.fit_transform(data_df[numerical_features])

# NOTE(review): other cells in this notebook filter on 'Price' with limits such as
# 10, 30 or 100 and derive a sentiment score from the star columns. After this
# in-place scaling those columns hold scaled values rather than the raw ones —
# confirm that this is intended.

# Hold out 20% of the rows; the fixed random_state keeps the split reproducible.
train_df, test_df = train_test_split(
    data_df,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Identify potential genre columns: those whose only values are 0 and 1.
genre_columns = [col for col in data_df.columns if data_df[col].nunique() == 2 and sorted(data_df[col].unique()) == [0, 1]]

# Group similar genres under a unified category (unified name -> contributing sub-genre columns).
genre_mapping = {
    "Fantasy": ["Fantasy", "Epic Fantasy", "High Fantasy", "Urban Fantasy"],
    "Science Fiction": ["Science Fiction", "Hard Science Fiction", "Military Science Fiction", "Space Opera"],
    "Historical": ["Historical", "Historical Fiction", "Historical Romance"],
    "Mystery & Thriller": ["Mystery", "Cozy Mystery", "Mystery Thriller", "Suspense", "Legal Thriller", "Crime"],
    "Romance": ["Romance", "Contemporary Romance", "Historical Romance", "Regency Romance", "Paranormal Romance"],
    "Horror & Supernatural": ["Horror", "Paranormal", "Supernatural", "Ghosts", "Vampires", "Witches", "Witchcraft"],
    "Comics & Graphic Novels": ["Comic Book", "Comics", "Graphic Novels", "Manga", "Comics Manga", "Shojo", "Superheroes", "Batman", "Superman"],
    "Nonfiction": ["Biography", "Autobiography", "Memoir", "Essays", "History", "Political Science", "Sociology", "Psychology", "Philosophy", "Self Help", "Personal Development", "Reference"],
    "Religion & Spirituality": ["Christian", "Christian Fiction", "Theology", "Paganism", "Islam", "Spirituality"],
    "Arts & Entertainment": ["Art", "Art Design", "Music", "Film", "Theatre", "Drawing", "Design"],
    "Business & Technology": ["Business", "Management", "Economics", "Entrepreneurship", "Finance", "Banking", "Computer Science", "Programming", "Software", "Technology", "Technical"],
}

# Create one indicator column per unified category: 1 when any contributing sub-genre is 1.
for unified_genre, similar_genres in genre_mapping.items():
    # Only use sub-genre columns that actually exist; selecting a missing label raises KeyError.
    present_genres = [genre for genre in similar_genres if genre in data_df.columns]
    if present_genres:
        data_df[unified_genre] = data_df[present_genres].max(axis=1)
    else:
        # None of the sub-genres occur in this dataset, so no book belongs to the category.
        data_df[unified_genre] = 0

# Drop the merged sub-genre columns to avoid redundancy, but keep the unified columns:
# several unified names ("Fantasy", "Romance", ...) are also listed as their own
# sub-genre and must not be removed together with the columns they replace.
unified_names = set(genre_mapping)
merged_columns = [
    genre
    for sublist in genre_mapping.values()
    for genre in sublist
    if genre in data_df.columns and genre not in unified_names
]
df_genres = data_df.drop(columns=merged_columns)

# Recompute the list of 0/1 indicator columns now that the unified categories exist.
new_genre_columns = [col for col in data_df.columns if data_df[col].nunique() == 2 and sorted(data_df[col].unique()) == [0, 1]]

In [None]:
# Ensure 'adjusted_sentiment_score' exists; when it does not, fall back to simulated values.
if 'adjusted_sentiment_score' not in data_df.columns:
    # Make the substitution visible: these are placeholder values, not real sentiment.
    print("No 'adjusted_sentiment_score' column found. Using simulated sentiment scores.")
    # A fixed seed keeps the placeholder scores identical across Restart & Run All.
    data_df['adjusted_sentiment_score'] = np.random.default_rng(42).uniform(-1, 1, size=len(data_df))

# Categorize sentiment: above 0.05 is Positive, below -0.05 is Negative, otherwise Neutral.
data_df['adjusted_sentiment_category'] = data_df['adjusted_sentiment_score'].apply(
    lambda x: 'Positive' if x > 0.05 else ('Negative' if x < -0.05 else 'Neutral')
)

# Verify that both columns now exist.
print(data_df[['adjusted_sentiment_score', 'adjusted_sentiment_category']].head())


# Best book recommendations

In [None]:
# Define function for best recommendations with user input
def _read_number(prompt, cast, default):
    """Prompt for a number; return `default` when the reply is blank or cannot be parsed."""
    raw = input(prompt).strip()
    if not raw:
        return default
    try:
        return cast(raw)
    except ValueError:
        print(f"Invalid input '{raw}'. Using default: {default}.")
        return default


def get_user_recommendations():
    """
    Interactive function to get the best book recommendations based on user input.

    Prompts for a genre, a sentiment category, a minimum ratings count, a maximum
    price and the number of results, filters the notebook-wide `data_df`
    accordingly, ranks the remaining books and prints the top entries.

    The ranking score is 0.6 * adjusted sentiment score + 0.4 * (ratings count
    divided by the largest ratings count among the filtered books).

    NOTE(review): the price limit is compared against `data_df['Price']` as stored;
    if that column was rescaled earlier in the notebook the limit has to be given
    on the same scale — confirm.

    Returns:
    - None. The recommendations (or a "no match" notice) are printed.
    """
    print("Welcome to the Book Recommendation System!")

    # Get user input for filtering criteria
    genre = input("Enter preferred genre from the list: Fantasy, Science Fiction, Historical, Mystery, Romance, Horror & Supernatural, Comics & Graphic Novels, Nonfiction, Religion & Spirituality, Arts & Entertainment, Business & Technology").strip()
    sentiment_category = input("Enter sentiment preference Positive, Neutral, Negative: ").strip().capitalize()
    min_ratings = _read_number("Enter minimum number of ratings for popularity filtering (default: 10): ", int, 10)
    max_price = _read_number("Enter maximum price limit (default: 10): ", float, 10)
    top_n = _read_number("Enter number of top recommendations to display (default: 10): ", int, 10)

    # Ensure valid sentiment category
    if sentiment_category not in ["Positive", "Neutral", "Negative"]:
        print("Invalid sentiment category. Defaulting to Positive.")
        sentiment_category = "Positive"

    # Combine the sentiment, popularity and price criteria into one row mask.
    selected = (
        (data_df['adjusted_sentiment_category'] == sentiment_category)
        & (data_df['Ratings Count'] >= min_ratings)
        & (data_df['Price'] <= max_price)
    )

    # Apply genre-based filtering; tell the user when the genre cannot be applied.
    if genre:
        if genre in data_df.columns:
            selected = selected & (data_df[genre] == 1)
        else:
            print(f"Genre '{genre}' not found. Showing results across all genres.")

    # Work on an independent copy so that adding the score column never touches data_df.
    filtered_books = data_df[selected].copy()

    if filtered_books.empty:
        print("No books match the selected criteria.")
        return

    # Normalise popularity by the largest ratings count; guard against a maximum of 0.
    max_count = filtered_books['Ratings Count'].max()
    popularity = filtered_books['Ratings Count'] / max_count if max_count > 0 else 0.0

    # Rank books based on adjusted sentiment score + ratings count (weighted ranking)
    filtered_books['final_score'] = (filtered_books['adjusted_sentiment_score'] * 0.6) + (popularity * 0.4)

    # Sort books by final score in descending order
    filtered_books = filtered_books.sort_values(by='final_score', ascending=False)

    # Select the top N recommendations
    recommendations = filtered_books[['Title', 'Author', 'Price', 'Ratings Count', 'adjusted_sentiment_score']].head(top_n)

    # Show the rank as a real column; printing with index=False would otherwise drop it.
    recommendations.insert(0, "Rank", range(1, len(recommendations) + 1))

    print(recommendations.to_string(index=False))

    
# Execute user-based recommendation function.
# This blocks on several input() prompts, so the cell needs an interactive kernel.
get_user_recommendations()


In [None]:
# Modify function to return recommendations and plot analysis
def get_ranked_recommendations_with_chart(genre=None, sentiment_category="Positive", min_ratings=100, max_price=50, top_n=10):
    """
    Generate the best book recommendations based on user-specified preferences with ranking and visualization.

    Books from the notebook-wide `data_df` are filtered by genre (when the genre
    column exists), sentiment category, minimum ratings count and maximum price.
    The remaining books are scored as 0.6 * adjusted sentiment score +
    0.4 * (ratings count / largest ratings count among them), sorted by that score
    in descending order and drawn as a horizontal bar chart.

    Parameters:
    - genre (str): Preferred genre (optional). Ignored when `data_df` has no such column.
    - sentiment_category (str): Sentiment preference ('Positive', 'Neutral', 'Negative').
      Any other value falls back to 'Positive'.
    - min_ratings (int): Minimum number of ratings for popularity filtering.
    - max_price (float): Maximum price limit, compared against `data_df['Price']` as stored.
      NOTE(review): if 'Price' was rescaled earlier in the notebook, pass the limit on
      that scale — confirm.
    - top_n (int): Number of top recommendations to display.

    Returns:
    - Dataframe of ranked recommended books (columns: Rank, Title, Author, Price,
      Ratings Count, adjusted_sentiment_score, final_score). It is empty, and no chart
      is drawn, when nothing matches.
    """
    # Ensure valid sentiment category
    if sentiment_category not in ["Positive", "Neutral", "Negative"]:
        sentiment_category = "Positive"

    # Combine the sentiment, popularity and price criteria into one row mask.
    selected = (
        (data_df['adjusted_sentiment_category'] == sentiment_category)
        & (data_df['Ratings Count'] >= min_ratings)
        & (data_df['Price'] <= max_price)
    )

    # Apply genre-based filtering
    if genre and genre in data_df.columns:
        selected = selected & (data_df[genre] == 1)

    # Work on an independent copy so that adding the score column never touches data_df.
    filtered_books = data_df[selected].copy()

    # Normalise popularity by the largest ratings count; an empty selection gives a NaN
    # maximum and a maximum of 0 would divide by zero, so both fall back to 0.
    max_count = filtered_books['Ratings Count'].max()
    popularity = filtered_books['Ratings Count'] / max_count if max_count > 0 else 0.0

    # Rank books based on adjusted sentiment score + ratings count (weighted ranking)
    filtered_books['final_score'] = (filtered_books['adjusted_sentiment_score'] * 0.6) + (popularity * 0.4)

    # Sort books by final score in descending order
    filtered_books = filtered_books.sort_values(by='final_score', ascending=False)

    # Select the top N recommendations
    recommendations = filtered_books[['Title', 'Author', 'Price', 'Ratings Count', 'adjusted_sentiment_score', 'final_score']].head(top_n).copy()

    # Add ranking column
    recommendations.insert(0, "Rank", range(1, len(recommendations) + 1))

    # Nothing to draw: report it and hand back the (empty) frame.
    if recommendations.empty:
        print("No books match the selected criteria.")
        return recommendations

    # Plot bar chart for visualization
    plt.figure(figsize=(10, 6))
    plt.barh(recommendations['Title'], recommendations['final_score'], color='purple')
    plt.xlabel("Final Score")
    plt.ylabel("Book Title")
    plt.title("Top Ranked Book Recommendations")
    plt.gca().invert_yaxis()  # Invert y-axis to show highest-ranked at the top
    plt.show()

    return recommendations

# Generate the analysis chart with ranked recommendations
# NOTE(review): the function skips the genre filter when data_df has no column with
# the given name; "Fiction" is not one of the unified categories defined in this
# notebook, so whether it filters anything depends on the dataset — confirm.
get_ranked_recommendations_with_chart(genre="Fiction", sentiment_category="Positive", min_ratings=500, max_price=30, top_n=20)


In [None]:
# Modify function to return worst recommendations and plot analysis
def get_worst_recommendations_with_chart(genre=None, sentiment_category= "Negative", min_ratings=100, max_price=50, top_n=10):
    """
    Generate the worst book recommendations based on user-specified preferences with ranking and visualization.

    Books from the notebook-wide `data_df` are filtered by genre (when the genre
    column exists), sentiment category, minimum ratings count and maximum price.
    The remaining books are scored as 0.6 * adjusted sentiment score +
    0.4 * (ratings count / largest ratings count among them), sorted by that score
    in ascending order and drawn as a horizontal bar chart.

    Parameters:
    - genre (str): Preferred genre (optional). Ignored when `data_df` has no such column.
    - sentiment_category (str): Sentiment preference ('Positive', 'Neutral', 'Negative').
      Any other value falls back to 'Negative'.
    - min_ratings (int): Minimum number of ratings for popularity filtering.
    - max_price (float): Maximum price limit, compared against `data_df['Price']` as stored.
      NOTE(review): if 'Price' was rescaled earlier in the notebook, pass the limit on
      that scale — confirm.
    - top_n (int): Number of worst recommendations to display.

    Returns:
    - Dataframe of ranked worst recommended books (columns: Rank, Title, Author, Price,
      Ratings Count, adjusted_sentiment_score, final_score). It is empty, and no chart
      is drawn, when nothing matches.
    """
    # Ensure valid sentiment category (the fallback must be a real category name,
    # otherwise the sentiment filter below can never match a row).
    if sentiment_category not in ["Positive", "Neutral", "Negative"]:
        sentiment_category = "Negative"

    # Combine the sentiment, popularity and price criteria into one row mask.
    selected = (
        (data_df['adjusted_sentiment_category'] == sentiment_category)
        & (data_df['Ratings Count'] >= min_ratings)
        & (data_df['Price'] <= max_price)
    )

    # Apply genre-based filtering
    if genre and genre in data_df.columns:
        selected = selected & (data_df[genre] == 1)

    # Work on an independent copy so that adding the score column never touches data_df.
    filtered_books = data_df[selected].copy()

    # Normalise popularity by the largest ratings count; an empty selection gives a NaN
    # maximum and a maximum of 0 would divide by zero, so both fall back to 0.
    max_count = filtered_books['Ratings Count'].max()
    popularity = filtered_books['Ratings Count'] / max_count if max_count > 0 else 0.0

    # Rank books based on adjusted sentiment score + ratings count (weighted ranking)
    filtered_books['final_score'] = (filtered_books['adjusted_sentiment_score'] * 0.6) + (popularity * 0.4)

    # Sort books by final score in ascending order (worst recommendations)
    filtered_books = filtered_books.sort_values(by='final_score', ascending=True)

    # Select the worst N recommendations
    recommendations = filtered_books[['Title', 'Author', 'Price', 'Ratings Count', 'adjusted_sentiment_score', 'final_score']].head(top_n).copy()

    # Add ranking column
    recommendations.insert(0, "Rank", range(1, len(recommendations) + 1))

    # Nothing to draw: report it and hand back the (empty) frame.
    if recommendations.empty:
        print("No books match the selected criteria.")
        return recommendations

    # Plot bar chart for visualization
    plt.figure(figsize=(10, 6))
    plt.barh(recommendations['Title'], recommendations['final_score'], color='purple')
    plt.xlabel("Final Score")
    plt.ylabel("Book Title")
    plt.title("Worst Ranked Book Recommendations")
    plt.gca().invert_yaxis()  # Invert y-axis to show lowest-ranked at the top
    plt.show()

    return recommendations

# Generate the analysis chart with worst ranked recommendations
# NOTE(review): sentiment_category="Positive" overrides the function's "Negative"
# default, so this charts the lowest-scoring books among the Positive ones —
# confirm that this is the intended selection.
get_worst_recommendations_with_chart(genre="Fiction", sentiment_category="Positive", min_ratings=500, max_price=30, top_n=20)


# Genre based filtering

In [None]:
# Collect the columns that behave like 0/1 indicators: exactly two distinct
# non-null values and a sorted set of unique values equal to [0, 1].
genre_columns = [
    column
    for column in data_df.columns
    if data_df[column].nunique() == 2 and sorted(data_df[column].unique()) == [0, 1]
]

def filter_books_by_genre(genre, data_df):
    """Return Title/Author of the first 10 books flagged with `genre`.

    An empty frame with the same two columns is returned when `genre` is falsy
    or is not a column of `data_df`.
    """
    wanted = ["Title", "Author"]

    # Guard clause: nothing to filter on.
    if not genre or genre not in data_df.columns:
        return pd.DataFrame(columns=wanted)

    flagged = data_df[data_df[genre] == 1]
    return flagged[wanted].head(10)

# Ask which genre to browse (surrounding whitespace is stripped from the reply).
user_genre = input("Enter a genre from the list: Fantasy, Science Fiction, Historical, Mystery, Romance, Horror & Supernatural, Comics & Graphic Novels, Nonfiction, Religion & Spirituality, Arts & Entertainment, Business & Technology").strip()

# Look up the matching books.
filtered_books = filter_books_by_genre(user_genre, data_df)

# Report the outcome; matches are printed with a 1-based 'Rank' index.
if filtered_books.empty:
    print("\n❌ No matching books found. Try a different genre!")
else:
    print("Here are the top 10 book recommendations based on your selected genre:")
    print(
        filtered_books
        .reset_index(drop=True)
        .rename_axis('Rank')
        .rename(index=lambda x: x + 1)
    )

# Sentiment based filtering

In [None]:
# Initialize VADER Sentiment Analyzer
# Module-level instance; analyze_user_sentiment() below reads it as a global.
analyzer = SentimentIntensityAnalyzer()

def analyze_user_sentiment(user_input):
    """Label free text as Positive, Negative or Neutral from its VADER compound score.

    Scores above 0.2 are Positive, scores below -0.2 are Negative, and everything
    else (including exactly +/-0.2) is Neutral. Uses the module-level `analyzer`.
    """
    compound = analyzer.polarity_scores(user_input)['compound']

    if compound > 0.2:
        return "Positive"
    if compound < -0.2:
        return "Negative"
    return "Neutral"

def classify_review_score(score):
    """Classifies Review Score into Positive, Neutral, or Negative.

    - score >= 4      -> "Positive"
    - 3 <= score < 4  -> "Neutral"
    - anything else   -> "Negative"

    The Neutral band covers the whole interval [3, 4) so that fractional scores
    such as 3.5 are not labelled Negative while 3.0 is Neutral. A NaN score fails
    both comparisons and therefore falls through to "Negative".
    """
    if score >= 4:
        return "Positive"
    if score >= 3:
        return "Neutral"
    return "Negative"

def filter_books_by_sentiment(sentiment, df):
    """Return the 10 highest-scoring books whose "Sentiment Classification" equals `sentiment`.

    The result keeps the Title, Author, Sentiment Score and Review Score columns,
    ordered by Sentiment Score from highest to lowest.
    """
    shown_columns = ["Title", "Author", "Sentiment Score", "Review Score"]
    is_match = df["Sentiment Classification"] == sentiment

    matches = df[is_match][shown_columns]
    ordered = matches.sort_values(by="Sentiment Score", ascending=False)
    return ordered.head(10)  # top 10 only


# Weighted average of the five star columns: weights run from -1.0 (1 Star) to
# +1.0 (5 Star) and the weighted sum is divided by the plain sum of the columns.
# Rows whose star columns are all zero produce 0/0 = NaN here.
# NOTE(review): the star columns are min-max scaled earlier in this notebook, each
# column independently, so this is an average over scaled values — confirm intended.
data_df["Sentiment Score"] = (
    data_df["1 Star"] * -1.0 +
    data_df["2 Star"] * -0.5 +
    data_df["3 Star"] * 0.0 +
    data_df["4 Star"] * 0.5 +
    data_df["5 Star"] * 1.0
) / (data_df["1 Star"] + data_df["2 Star"] + data_df["3 Star"] + data_df["4 Star"] + data_df["5 Star"])

# Below -0.2 is Negative, above 0.2 is Positive, anything else is Neutral
# (a NaN score fails both comparisons and therefore ends up Neutral).
data_df["Sentiment Classification"] = data_df["Sentiment Score"].apply(lambda score: "Negative" if score < -0.2 else "Positive" if score > 0.2 else "Neutral")
# NOTE(review): this reads a "Review Score" column; the only place this notebook
# creates one when it is missing is a later cell — confirm it exists at this point.
data_df["Review Sentiment"] = data_df["Review Score"].apply(classify_review_score)

# Get user input
# The reply is not matched literally; it is scored by analyze_user_sentiment().
user_input = input("What rating would you like to see? Positive, Neutral, Negative: ")
user_sentiment = analyze_user_sentiment(user_input)

# Get filtered book recommendations
filtered_books = filter_books_by_sentiment(user_sentiment, data_df)

# Display results
# Matches are printed with a 1-based index named 'Rank'.
if not filtered_books.empty:
    print("Here are the top 10 book recommendations based on your sentiment:")
    print(filtered_books.reset_index(drop=True).rename_axis('Rank').rename(index=lambda x: x + 1))
else:
    print("\n❌ No matching books found for your sentiment preference.")

# Popularity based filtering

In [None]:
def filter_books_by_popularity(df, top_n=10):
    """Filters books based on popularity using Ratings Count.

    Parameters:
    - df: DataFrame with at least the "Title", "Author" and "Ratings Count" columns.
    - top_n (int): Number of books to return.

    Returns:
    - DataFrame with the Title, Author and Ratings Count of the `top_n` books with the
      highest Ratings Count, most-rated first.
    """
    # Sort the frame that was passed in, not the notebook-wide data_df, so the
    # function works on any frame the caller supplies.
    most_rated = df.sort_values(by="Ratings Count", ascending=False)
    return most_rated.head(top_n)[["Title", "Author", "Ratings Count"]]

# Ask how many titles to list: a blank reply means 10, and a reply that is not an
# integer falls back to 10 with a notice.
try:
    top_n = int(input("How many popular books would you like to see? (Default: 10): ") or 10)
except ValueError:
    print("Invalid input. Showing top 10 popular books.")
    top_n = 10

# Fetch the most-rated books.
filtered_books = filter_books_by_popularity(data_df, top_n)

# Report the outcome; matches are printed with a 1-based 'Rank' index.
if filtered_books.empty:
    print("\n❌ No matching books found!")
else:
    print(f"Here are the top {top_n} popular book recommendations:")
    print(
        filtered_books
        .reset_index(drop=True)
        .rename_axis('Rank')
        .rename(index=lambda x: x + 1)
    )

# Price based filtering

In [None]:
def filter_books_by_price(data_df, min_price=0, max_price=100):
    """Return Title/Author/Price of the 10 cheapest books priced within [min_price, max_price].

    Both bounds are inclusive and the result is ordered by ascending Price.
    """
    price = data_df["Price"]
    within_range = (price >= min_price) & (price <= max_price)

    candidates = data_df[within_range][["Title", "Author", "Price"]]
    return candidates.sort_values(by="Price").head(10)

# Ask for the price bounds: blank replies mean 0 and 100, and a reply that is not a
# number resets both bounds to the defaults with a notice.
try:
    min_price = float(input("Enter minimum price (Default: 0): ") or 0)
    max_price = float(input("Enter maximum price (Default: 100): ") or 100)
except ValueError:
    print("Invalid input. Showing books in the default price range (0 - 100).")
    min_price, max_price = 0, 100

# Fetch the books inside the requested range.
filtered_books = filter_books_by_price(data_df, min_price, max_price)

# Report the outcome; matches are printed with a 1-based 'Rank' index.
if filtered_books.empty:
    print("No books found within the selected price range.")
else:
    print(f"Here are the books in the price range {min_price} - {max_price}:")
    print(
        filtered_books
        .reset_index(drop=True)
        .rename_axis('Rank')
        .rename(index=lambda x: x + 1)
    )

# NLP-based Sentiment Analysis for Book Recommendation

In [None]:
# Download necessary NLTK data
# NOTE(review): this notebook imports SentimentIntensityAnalyzer from the
# vaderSentiment package rather than from nltk, so this download may be
# unnecessary — confirm before removing. It also needs network access on first run.
nltk.download('vader_lexicon')

## Sentiment Analysis using VADER

In [None]:

# Analyzer instance used for scoring the book descriptions.
sia = SentimentIntensityAnalyzer()

# Score every description with VADER's compound value; without a 'Description'
# column every book gets the placeholder score 0.
if 'Description' not in data_df.columns:
    print("No 'Description' column found. Using a placeholder sentiment score.")
    data_df['sentiment_score'] = 0  # Placeholder if no descriptions exist
else:
    # astype(str) turns non-string cells into text before they are scored.
    data_df['sentiment_score'] = (
        data_df['Description']
        .astype(str)
        .apply(lambda x: sia.polarity_scores(x)['compound'])
    )

# Label each score: above 0.05 is Positive, below -0.05 is Negative, otherwise Neutral.
data_df['sentiment_category'] = data_df['sentiment_score'].apply(
    lambda x: 'Positive' if x > 0.05 else ('Negative' if x < -0.05 else 'Neutral')
)

# Preview the first ten scored books.
data_df[['Title', 'sentiment_score', 'sentiment_category']].head(10)

## Sentiment-Based Book Recommendation

In [None]:

def recommend_books_by_sentiment(user_sentiment, top_n=5):
    '''
    Pick the highest-scoring books of one sentiment category from `data_df`.

    user_sentiment: str ('Positive', 'Neutral', 'Negative'), matched against 'sentiment_category'
    top_n: Number of books to recommend

    Returns a DataFrame (Title, sentiment_score) ordered by descending score, or a
    plain message string when no book has the requested category.
    '''
    in_category = data_df['sentiment_category'] == user_sentiment
    matches = data_df[in_category]

    if matches.empty:
        return "No books match the selected sentiment category."

    ordered = matches[['Title', 'sentiment_score']].sort_values(by='sentiment_score', ascending=False)
    return ordered.head(top_n)

# Example: Recommend books with positive sentiment
# As the last expression of the cell, the returned frame (or message string) is displayed.
recommend_books_by_sentiment('Positive', top_n=10)

## Incorporating Review Scores into Sentiment Analysis

In [None]:
# Check if a 'Review Score' column exists, otherwise create a simulated one (if necessary)
if 'Review Score' not in data_df.columns:
    # Make the substitution visible: these are placeholder values, not real reviews.
    print("No 'Review Score' column found. Using simulated review scores.")
    # Integers from 1 to 5 (upper bound exclusive); the fixed seed keeps the
    # placeholder scores identical across Restart & Run All.
    data_df['Review Score'] = np.random.default_rng(42).integers(1, 6, size=len(data_df))

# Normalize Review Score to align with sentiment analysis (scale 0 to 1).
# MinMaxScaler comes from the notebook's import cell; note that this rebinds the
# notebook-wide name `scaler` to a scaler fitted on 'Review Score' only.
scaler = MinMaxScaler()
data_df['normalized_review_score'] = scaler.fit_transform(data_df[['Review Score']])

# Adjust sentiment score using review scores (weighted approach).
# NOTE(review): the normalized review score lies in [0, 1], so this term can only
# raise the adjusted score, never lower it — confirm that asymmetry is intended.
data_df['adjusted_sentiment_score'] = (data_df['sentiment_score'] * 0.7) + (data_df['normalized_review_score'] * 0.3)

# Reclassify sentiment based on adjusted scores: above 0.05 is Positive,
# below -0.05 is Negative, otherwise Neutral.
data_df['adjusted_sentiment_category'] = data_df['adjusted_sentiment_score'].apply(
    lambda x: 'Positive' if x > 0.05 else ('Negative' if x < -0.05 else 'Neutral'))

# Display updated dataset
data_df[['Title', 'Review Score', 'normalized_review_score', 'adjusted_sentiment_score', 'adjusted_sentiment_category']].head(10)

## Enhanced Sentiment-Based Book Recommendation

In [None]:
# Enhanced recommendation: same idea as the plain sentiment recommender, but driven
# by the review-adjusted score and category columns.
def recommend_books_by_adjusted_sentiment(user_sentiment, top_n=10):
    '''
    Pick the highest-scoring books of one adjusted sentiment category from `data_df`.

    user_sentiment: str ('Positive', 'Neutral', 'Negative'), matched against 'adjusted_sentiment_category'
    top_n: Number of books to recommend

    Returns a DataFrame (Title, adjusted_sentiment_score) ordered by descending score,
    or a plain message string when no book has the requested category.
    '''
    in_category = data_df['adjusted_sentiment_category'] == user_sentiment
    matches = data_df[in_category]

    if matches.empty:
        return "No books match the selected sentiment category."

    ordered = matches[['Title', 'adjusted_sentiment_score']].sort_values(by='adjusted_sentiment_score', ascending=False)
    return ordered.head(top_n)
