In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
import re

In [2]:
bookData = pd.read_csv('dataset.csv')

In [3]:
print(bookData.columns.tolist())

['description', 'rating-avg', 'rating-count', 'title']


In [4]:
bookData.drop(['bestsellers-rank', 'dimension-x', 'dimension-y', 'dimension-z', 'edition', 'edition-statement', 'for-ages', 'isbn10', 'isbn13', 'lang', 'publication-date', 'publication-place', 'url', 'weight','format','id','illustrations-note','image-checksum','image-path','image-url','imprint','index-date'], axis=1, inplace=True)

KeyError: "['bestsellers-rank', 'dimension-x', 'dimension-y', 'dimension-z', 'edition', 'edition-statement', 'for-ages', 'isbn10', 'isbn13', 'lang', 'publication-date', 'publication-place', 'url', 'weight', 'format', 'id', 'illustrations-note', 'image-checksum', 'image-path', 'image-url', 'imprint', 'index-date'] not found in axis"

In [None]:
print(bookData.columns.tolist())

['authors', 'categories', 'description', 'rating-avg', 'rating-count', 'title']


In [None]:
print(bookData.head(10))

        authors                                     categories  \
0           [1]  [214, 220, 237, 2646, 2647, 2659, 2660, 2679]   
1        [2, 3]                                    [235, 3386]   
2           [4]                         [358, 2630, 360, 2632]   
3  [5, 6, 7, 8]                              [377, 2978, 2980]   
4           [9]                                   [2813, 2980]   
5      [10, 11]                                   [1520, 1532]   
6     [6, 7, 8]                              [377, 2978, 2980]   
7      [12, 13]                                         [2980]   
8          [14]                        [3223, 700, 1521, 2820]   
9          [15]         [37, 46, 2784, 2942, 2980, 2912, 3385]   

                                         description  rating-avg  \
0  SOLDIER FIVE is an elite soldier's explosive m...        4.03   
1  John Moran and Carl Williams were the two bigg...        3.60   
2  Sir Phillip knew that Eloise Bridgerton was a ...        3.88   
3

In [None]:
bookData.isnull().sum()

authors              0
categories           0
description      80087
rating-avg      440130
rating-count    440130
title                0
dtype: int64

In [None]:
bookData_filtered = bookData.dropna()

In [None]:
bookData_filtered.count()

authors         644881
categories      644881
description     644881
rating-avg      644881
rating-count    644881
title           644881
dtype: int64

In [None]:
bookData_filtered.to_excel("bookData.xlsx")

In [None]:
financialBookData = pd.read_excel('bookData.xlsx', 'FinancialBookData')

In [None]:
financialBookData.head(10)

Unnamed: 0,description,rating-avg,rating-count,title
0,"Options traders know all about leverage, and s...",4.0,1,Options for Swing Trading : Leverage and Low R...
1,Chalkboards and projectors are familiar tools ...,4.04,23,Intentional Tech : Principles to Guide the Use...
2,Grant funding has become increasingly crucial ...,4.0,2,Grant Seeking in Higher Education : Strategies...
3,The consensus among educators nationwide is th...,3.5,2,Prioritizing the Common Core : Identifying Spe...
4,One of the most important assets you have is y...,3.87,141,Earn What You're Really Worth : Maximize Your ...
5,Written by a practitioner with years working i...,4.0,3,XVA Desks - A New Era for Risk Management : Un...
6,Businesses are not maximising their scientific...,4.67,9,Scientists in Every Boardroom : Harnessing the...
7,"Money makes the world go round, but it doesn't...",3.41,17,Your Balanced Budget
8,From the author of Real Life Money and the wom...,5.0,1,The Real Life Money Journal : A practical guid...
9,Evidence-Based Technical Analysis examines how...,3.62,105,Evidence-Based Technical Analysis : Applying t...


In [None]:
financialBookData.isnull().sum()

description     0
rating-avg      0
rating-count    0
title           0
dtype: int64

In [None]:
user_title = input("Enter a book title: ")

Enter a book title: Dora the Explorer


In [None]:
#importing stopwords to be removed from description
english_stopwords = stopwords.words('english')

In [None]:
# Data Preprocessing with Stopword Removal
def preprocess_text(text):
  """
  Preprocesses text data for better recommendation accuracy, including stopword removal.
  """
  text = text.lower()  # Convert to lowercase
  text = re.sub(r"[^a-z0-9\s]", "", text)  # Remove non-alphanumeric characters
  words = [word for word in text.split() if word not in english_stopwords]  # Remove stopwords
  return " ".join(words)

In [None]:
financialBookData['preprocessed_description'] = financialBookData['description'].apply(preprocess_text)  # Assuming descriptions are available

In [None]:
financialBookData['preprocessed_description']

0       options traders know leverage swing traders ke...
1       chalkboards projectors familiar tools college ...
2       grant funding become increasingly crucial univ...
3       consensus among educators nationwide indepth i...
4       one important assets earning ability ability s...
                              ...                        
6810    highly prized ability make financial plans cer...
6811    next boom jack w plunkett widely followed anal...
6812    roughly year managing director international m...
6813    man born free everywhere debta declared econom...
6814    inequality charged topic measures income inequ...
Name: preprocessed_description, Length: 6815, dtype: object

In [None]:
# Content-based Filtering using TF-IDF and cosine similarity
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(financialBookData['preprocessed_description'])

user_vector = vectorizer.transform([user_title.lower()])
cosine_similarities = cosine_similarity(user_vector, tfidf_matrix)
content_based_scores = cosine_similarities.flatten()

In [None]:
tfidf_matrix

<6815x32834 sparse matrix of type '<class 'numpy.float64'>'
	with 357327 stored elements in Compressed Sparse Row format>

In [None]:
user_vector

<1x32834 sparse matrix of type '<class 'numpy.float64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [None]:
financialBookData.columns

Index(['description', 'rating-avg', 'rating-count', 'title',
       'preprocessed_description'],
      dtype='object')

In [None]:
financialBookData.shape

(6815, 5)

In [None]:
rating_matrix = financialBookData.pivot_table(index='description', columns='title', values='rating-avg', aggfunc='count').fillna(0)

In [None]:
# Calculate item-to-item similarity using cosine similarity
item_similarities = cosine_similarity(rating_matrix)

In [None]:
def get_item_sim_recommendations(user_title, item_similarities, k=5):
  """
  Recommends books similar to the user-entered title based on item-to-item collaborative filtering.

  Even if the user-entered title is not found, it returns recommendations based on item similarities
  or an empty list if no title is found.
  """

  try:
      # Using get_loc() for potentially duplicate titles
      user_index = financialBookData['title'].eq(user_title).idxmax()
      similar_items = item_similarities[user_index]
      similar_items_sorted = similar_items.argsort()[-k:]  # Sort for top k similar items
      return financialBookData.loc[similar_items_sorted[1:]]['title'].tolist()

  except KeyError:
      print(f"Book '{user_title}' not found in data. Returning empty recommendations.")
      return []


In [None]:
# Hybrid Recommendation
def get_hybrid_recommendations(user_title, alpha=0.6, beta=0.4, k=5):
  """
  Calculates hybrid recommendation scores based on content-based and collaborative filtering (item-to-item).
  """
  content_recommendations = financialBookData.iloc[content_based_scores.argsort()[-k:]][['title']].values.ravel()  # Top k content-based recommendations
  collaborative_recommendations = get_item_sim_recommendations(user_title, item_similarities)  # Top k collaborative recommendations (if book found)

  if collaborative_recommendations is None:
    return content_recommendations  # Use only content-based if book not found
  else:
    merged_recommendations = list(content_recommendations[:4]) + \
                               list(collaborative_recommendations[:4])

  # ... (Rest of your code to display or use recommendations)

  return merged_recommendations

In [None]:
# Get Hybrid Recommendations
hybrid_recommendations = get_hybrid_recommendations(user_title, item_similarities)

# Print Hybrid Recommendations
if hybrid_recommendations:
  print(f"Hybrid Recommendations for '{user_title}':")
  for i, book in enumerate(hybrid_recommendations,start=1):
    print(f"{i}. {book}")
else:
  print("\nTrying Item-to-Item Recommendations (if hybrid no result):")
  collab_recommendations = get_item_sim_recommendations(user_title, item_similarities)

  if collab_recommendations:
    print(f"Item-to-Item Recommendations for '{user_title}':")
    for i, book in enumerate(collab_recommendations, start=1):
        print(f"{i}. {book}")
  else:
    print("Book not found or no item-to-item recommendations available.")

Hybrid Recommendations for 'Dora the Explorer':
1. Fisher Investments on Energy
2. The Basics of Process Improvement
3. Mastering Elliott Wave Principle : Elementary Concepts, Wave Patterns, and Practice Exercises
4. The Great Investors : Lessons on Investing from Master Traders
5. The Complete Guide to Property Investing Success
6. Instant Millionaires : The Secrets of Overnight Success
7. The Richest Man in Babylon : Blueprint for Financial Success - Lesson 1: The Man Who Desired Much Gold & the Richest Man in Babylon Tells His Syste
8. Options for Swing Trading : Leverage and Low Risk to Maximize Short-Term Trading
