Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



Load and Preprocess Data

In [2]:
books_df = pd.read_csv(r'C:\Users\DELL\Desktop\audible_insights\data\cleaned_books.csv')


In [3]:
books_df.head(5)

Unnamed: 0,Book Name,Author,Number of Reviews_x,Price_x,Number of Reviews_y,Price_y,Description,Listening Time,Ranks and Genre,Rating
0,Think Like a Monk: The Secret of How to Harnes...,Jay Shetty,313.0,10080.0,371.0,10080,"Over the past three years, Jay Shetty has beco...",10 hours and 54 minutes,",#1 in Audible Audiobooks & Originals (See Top...",4.9
1,Ikigai: The Japanese Secret to a Long and Happ...,Héctor García,3658.0,615.0,3682.0,615,Brought to you by Penguin.,3 hours and 23 minutes,",#2 in Audible Audiobooks & Originals (See Top...",4.6
2,The Subtle Art of Not Giving a F*ck: A Counter...,Mark Manson,20174.0,10378.0,20306.0,10378,"In this generation-defining self-help guide, a...",5 hours and 17 minutes,",#3 in Audible Audiobooks & Originals (See Top...",4.4
3,Atomic Habits: An Easy and Proven Way to Build...,James Clear,4614.0,888.0,4678.0,888,Brought to you by Penguin.,5 hours and 35 minutes,",#5 in Audible Audiobooks & Originals (See Top...",4.6
4,Life's Amazing Secrets: How to Find Balance an...,Gaur Gopal Das,4302.0,1005.0,4308.0,1005,"Stop going through life, Start growing throug...",6 hours and 25 minutes,",#6 in Audible Audiobooks & Originals (See Top...",4.6


Create Content-Based Filtering Model

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

data = pd.read_csv(r'C:\Users\DELL\Desktop\audible_insights\data\cleaned_books.csv')

data['Description'] = data['Description'].fillna('')  

tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(data['Description'])  

cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Function to get recommendations based on cosine similarity
def get_recommendations(book_name, cosine_sim=cosine_sim):
    
    # Check if the book exists in the dataset
    if book_name not in data['Book Name'].values:
        return f"Book '{book_name}' not found in the dataset. Showing recommendations for the first book instead."

    # Get index of the book that matches the title
    idx = data.index[data['Book Name'] == book_name].tolist()[0]

    # pairwise similarity scores
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Books sort based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get indices most similar books
    sim_scores = sim_scores[1:11]  # To get top 10 similar books
    book_indices = [i[0] for i in sim_scores]

    # Returns top 10 most similar books
    return data['Book Name'].iloc[book_indices]

#To Get recommendations for a book
book_name = 'Think Like a Monk: The Secret of How to Harness the Power of Positivity and Be Happy Now'  
recommended_books = get_recommendations(book_name)

print("Recommended Books:")
print(recommended_books)


Recommended Books:
1765               The Classic Tales Podcast, Season Five
1262    The Facebook Effect: The Inside Story of the C...
2710    Built to Serve: Find Your Purpose and Become t...
3046    The Rise and Fall of the Dinosaurs: The Untold...
27                             Raavan: Enemy of Aryavarta
2279                             On the Origin of Species
2315                   God Save the Hon'ble Supreme Court
925                  Social Media Marketing Workbook 2020
376     50 Self-Help Classics to Guide You to Financia...
927                     10 Essential Pieces of Literature
Name: Book Name, dtype: object


Clustering-Based Recommendation

In [5]:
# Clustering books using KMeans
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(tfidf_matrix)

# Assign cluster labels-books
books_df['cluster'] = kmeans.labels_




 Collaborative Filtering Model

In [6]:
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

data = pd.read_csv(r'C:\Users\DELL\Desktop\audible_insights\data\cleaned_books.csv')

data['Description'] = data['Description'].fillna('')  

tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(data['Description']) 

# Collaborative Filtering using Nearest Neighbors (KNN)
knn = NearestNeighbors(n_neighbors=5, metric='cosine') 
knn.fit(tfidf_matrix)

# For given book, recommend the top 5 similar books,but instead of passing just one row, pass the entire matrix for similarity calculation
distances, indices = knn.kneighbors(tfidf_matrix[0:1])  # passing 2D array with one sample

recommended_books = data.iloc[indices[0]]

print("Recommended Books:")
print(recommended_books[['Book Name', 'Author']])  



Recommended Books:
                                              Book Name             Author
0     Think Like a Monk: The Secret of How to Harnes...         Jay Shetty
1765             The Classic Tales Podcast, Season Five    Agatha Christie
1262  The Facebook Effect: The Inside Story of the C...  David Kirkpatrick
2710  Built to Serve: Find Your Purpose and Become t...    Evan Carmichael
3046  The Rise and Fall of the Dinosaurs: The Untold...     Steve Brusatte


Content-based Filtering Recommendations

In [7]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# content-based recommendations
def get_content_based_recommendations(book_name, cosine_sim):
    # Index of the book that matches the title
    idx = data.index[data['Book Name'] == book_name].tolist()[0]

    # Get pairwise similarity scores 
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sorting books based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get indices of most similar books
    sim_scores = sim_scores[1:11] 
    book_indices = [i[0] for i in sim_scores]

    return data['Book Name'].iloc[book_indices]


Combine Content-based and Collaborative Recommendations

In [8]:
# Function to combine content-based and collaborative filtering recommendations
def combine_recommendations(content_recommendations, collaborative_recommendations):
    # Combine both recommendations and remove duplicates
    combined_recommendations = list(set(content_recommendations) | set(collaborative_recommendations))
    
    # Return the combined recommendations
    return combined_recommendations


In [9]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors

data = pd.read_csv(r'C:\Users\DELL\Desktop\audible_insights\data\cleaned_books.csv')

# Ensure 'Description' column has no NaN values
data['Description'] = data['Description'].fillna('')

# 'Description' column for content-based filtering
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(data['Description'])  # Using 'Description' column

# cosine similarity to calculate similarity between books
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# content-based recommendations
def get_content_based_recommendations(book_name, cosine_sim):
    # chwecking bookname in the dataset
    if book_name not in data['Book Name'].values:
        return f"Error: Book '{book_name}' not found in dataset."
    
    # Get the index of the book that matches the title
    idx = data.index[data['Book Name'] == book_name].tolist()[0]

    # pairwise similarity scores books
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the books based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the indices of the most similar books
    sim_scores = sim_scores[1:11]  # Get top 10 similar books
    book_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar books
    return data['Book Name'].iloc[book_indices]

# Collaborative Filtering Model using Nearest Neighbors (KNN)
knn = NearestNeighbors(n_neighbors=5, metric='cosine')
knn.fit(tfidf_matrix)

# Function for collaborative filtering (example)
def get_collaborative_filtering_recommendations(user_id):
    # collaborative filtering logic,books based on some collaborative filtering logic

    return data['Book Name'].head(5)  # Example placeholder

# combining content-based and collaborative filtering
def combine_recommendations(content_recommendations, collaborative_recommendations):
    # Combine the two lists (you can refine the combination method)
    return list(set(content_recommendations).union(set(collaborative_recommendations)))

# Example usage:
book_name = 'Extraordinary Leadership'  # book name from your dataset
user_id = 0  # actual user ID

# content-based recommendations of existing books
content_recommendations = get_content_based_recommendations(book_name, cosine_sim)

# If book not found, returns error message
if isinstance(content_recommendations, str):  # Check if it's an error message
    print(content_recommendations)
else:
    # collaborative filtering recommendations (assuming you have a user-item matrix)
    collaborative_recommendations = get_collaborative_filtering_recommendations(user_id)

    # Combine results from both approaches
    hybrid_recommendations = combine_recommendations(content_recommendations, collaborative_recommendations)

    # hybrid recommendations
    print("Hybrid Recommended Books:")
    for book in hybrid_recommendations:
        print(book)


Hybrid Recommended Books:
The E-Myth Revisited: Why Most Small Businesses Don't Work and What to Do About It
Atomic Habits: An Easy and Proven Way to Build Good Habits and Break Bad Ones
Losing My Virginity
Life's Amazing Secrets: How to Find Balance and Purpose in Your Life
Crime and Punishment
Common Stocks and Uncommon Profits
Ikigai: The Japanese Secret to a Long and Happy Life
Living with the Himalayan Masters
The Subtle Art of Not Giving a F*ck: A Counterintuitive Approach to Living a Good Life
O Jerusalem: Day by Day and Minute by Minute the Historic Struggle for Jerusalem and the Birth of Israel
The Alchemist: A Fable About Following Your Dream
The Fountainhead
Think and Grow Rich
Think Like a Monk: The Secret of How to Harness the Power of Positivity and Be Happy Now
Surely You're Joking, Mr. Feynman!


Putting Everything Together

In [10]:
book_name = 'Extraordinary Leadership'
user_id = 0  # Replace with actual user ID

# content-based recommendations
content_recommendations = get_content_based_recommendations(book_name, cosine_sim)

# collaborative filtering recommendations (assuming you have a user-item matrix)
collaborative_recommendations = get_collaborative_filtering_recommendations(user_id)

# Combine results both approaches
hybrid_recommendations = combine_recommendations(content_recommendations, collaborative_recommendations)

# hybrid recommendations
print("Hybrid Recommended Books:")
for book in hybrid_recommendations:
    print(book)


Hybrid Recommended Books:
The E-Myth Revisited: Why Most Small Businesses Don't Work and What to Do About It
Atomic Habits: An Easy and Proven Way to Build Good Habits and Break Bad Ones
Losing My Virginity
Life's Amazing Secrets: How to Find Balance and Purpose in Your Life
Crime and Punishment
Common Stocks and Uncommon Profits
Ikigai: The Japanese Secret to a Long and Happy Life
Living with the Himalayan Masters
The Subtle Art of Not Giving a F*ck: A Counterintuitive Approach to Living a Good Life
O Jerusalem: Day by Day and Minute by Minute the Historic Struggle for Jerusalem and the Birth of Israel
The Alchemist: A Fable About Following Your Dream
The Fountainhead
Think and Grow Rich
Think Like a Monk: The Secret of How to Harness the Power of Positivity and Be Happy Now
Surely You're Joking, Mr. Feynman!


 Evaluate the Models

In [11]:
# Evaluate model performance with example
y_true = [1, 0, 1, 0, 1]  # True labels (1 for relevant, 0 for irrelevant)
y_pred = [1, 0, 0, 1, 1]  # Predicted labels

# precision and recall
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)

# RMSE for collaborative filtering 
rmse = np.sqrt(mean_squared_error(y_true, y_pred))


Generate Book Recommendations

In [12]:
def recommend_books(user_preferences):
    content_recommendations = get_content_based_recommendations(user_preferences)
    collaborative_recommendations = get_collaborative_filtering_recommendations(user_preferences)
    
    # Combined final recommendations
    return hybrid_recommendations


Save the Models

In [13]:
import joblib

joblib.dump(kmeans, 'kmeans_model.pkl')
joblib.dump(knn, 'knn_model.pkl')


['knn_model.pkl']