<a href="https://colab.research.google.com/github/mukthakaja/book-recommender/blob/main/book_recommender_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
import pandas as pd

# Mount Google Drive into the Colab filesystem so the CSV files below are readable.
drive.mount('/content/drive')

# Personal Goodreads library export (one row per book in the user's library).
muktha_ratings = pd.read_csv('/content/drive/My Drive/goodreads_library_export.csv')
# NOTE: read_csv already returns a DataFrame; df1 is just a DataFrame wrapper
# around the same data, used for display at the end of this cell.
df1 = pd.DataFrame(muktha_ratings)
# Catalogue of popular books that recommendations are later drawn from.
goodreads_dataset_trial = pd.read_csv('/content/drive/My Drive/popular_books.csv')
goodreads_dataset = pd.DataFrame(goodreads_dataset_trial)

# Last expression of the cell: the notebook renders both frames as a tuple.
df1, goodreads_dataset

Mounted at /content/drive


(       Book Id                                              Title  \
 0    198563734                                     Summer Romance   
 1    201145400        Love Unwritten (Lakefront Billionaires, #2)   
 2    127305686             Icon and Inferno (Stars and Smoke, #2)   
 3    197108968                              The Family Experiment   
 4    199261152                                 A Novel Love Story   
 ..         ...                                                ...   
 550      24178                                    Charlotte’s Web   
 551    6547258                    The Final Empire (Mistborn, #1)   
 552   17675462               The Raven Boys (The Raven Cycle, #1)   
 553     249747                    Artemis Fowl (Artemis Fowl, #1)   
 554          3  Harry Potter and the Sorcerer's Stone (Harry P...   
 
                 Author          Author l-f              Additional Authors  \
 0     Annabel Monaghan   Monaghan, Annabel                             NaN   


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd


# Give both frames the same key column name ('common_book_id') so they can be
# joined and compared on it: 'Book Id' in the personal export and
# 'goodreads_book_id' in the catalogue.
muktha_ratings = muktha_ratings.rename(columns={'Book Id': 'common_book_id'}) # renaming columns with the same name to make it easier for merging
goodreads_dataset = goodreads_dataset.rename(columns={'goodreads_book_id': 'common_book_id'})


# Join on the shared id; with merge's default (inner) join only books present
# in both frames are kept.
# NOTE(review): merged_ratings is not referenced again in the code visible
# here - confirm whether a later cell uses it.
merged_ratings = muktha_ratings.merge(goodreads_dataset, on='common_book_id') # merging the datasets w.r.t common book id


# Text used to describe each catalogue book: "<original_title> <authors>".
# Missing values are replaced with '' first because TF-IDF cannot process NaN.
goodreads_dataset['combined_features'] = goodreads_dataset['original_title'].fillna('') + ' ' + goodreads_dataset['authors'].fillna('') # tfidf doesn't work on NaN values, so filling with empty string values


# Fit a TF-IDF vocabulary on the combined text and vectorize every catalogue
# book; row i of tfidf_matrix corresponds to the i-th row of goodreads_dataset.
tfidf_vectorizer = TfidfVectorizer() # initializing tf-idf vectorizer and fit_transform
tfidf_matrix = tfidf_vectorizer.fit_transform(goodreads_dataset['combined_features'])


# Pairwise cosine similarity between every pair of catalogue books;
# cosine_metric[i][j] is the similarity of the i-th and j-th rows.
cosine_metric = cosine_similarity(tfidf_matrix, tfidf_matrix) # using the cosine similarity metric to calculate and recommend


def book_recommender(muktha_ratings, goodreads_dataset, cosine_metric, top_n=50):
    """Recommend catalogue books similar to the books in a user's library.

    Every library book that also appears in the catalogue acts as a "seed".
    For each seed, its ``top_n`` most similar catalogue books (excluding the
    seed itself) are collected; the similarity scores are then averaged per
    candidate book and the ``top_n`` best candidates are returned.

    Args:
        muktha_ratings: DataFrame with a ``'common_book_id'`` column holding
            the ids of the user's books.
        goodreads_dataset: Catalogue DataFrame with a ``'common_book_id'``
            column. Its rows must be in the same order as the rows/columns of
            ``cosine_metric``; its index labels may be arbitrary.
        cosine_metric: Square 2-D array where ``cosine_metric[i][j]`` is the
            similarity between the i-th and j-th catalogue rows.
        top_n: Number of neighbours taken per seed and number of rows returned.

    Returns:
        DataFrame of catalogue rows ordered by descending mean similarity.
        Empty (with the catalogue's columns) when no library book is found in
        the catalogue.
    """
    # NOTE: the user's rating values do not weight the scores; every matched
    # library book contributes equally, so no ordering by rating is needed.
    catalogue_ids = goodreads_dataset['common_book_id']
    neighbour_scores = []

    for book_id in muktha_ratings['common_book_id']:
        # Work with row *positions*, not index labels: cosine_metric is a
        # plain array aligned with the catalogue's row order, which only
        # coincides with the labels for a default RangeIndex.
        positions = (catalogue_ids == book_id).to_numpy().nonzero()[0]
        if positions.size == 0:
            continue  # this library book is not part of the catalogue

        position = int(positions[0])
        similarity_scores = pd.Series(cosine_metric[position])
        # Remove the seed explicitly. Relying on it sorting first is wrong
        # when another book ties with it (e.g. a duplicate edition scoring 1.0).
        similarity_scores = similarity_scores.drop(position)
        similarity_scores = similarity_scores.sort_values(ascending=False)
        neighbour_scores.append(similarity_scores.head(top_n))

    if not neighbour_scores:
        return goodreads_dataset.iloc[0:0]

    # Concatenate once (instead of inside the loop) and average the scores of
    # candidates that were suggested by several seeds.
    book_recommendations = pd.concat(neighbour_scores)
    book_recommendations = book_recommendations.groupby(level=0).mean()
    book_recommendations = book_recommendations.sort_values(ascending=False)
    book_recommendations = book_recommendations.head(top_n)

    # The Series index holds catalogue row positions, hence iloc (not loc).
    recommended_books = goodreads_dataset.iloc[book_recommendations.index.to_numpy()]

    return recommended_books


# Run the recommender over the user's whole library against the catalogue.
book_recommendations = book_recommender(muktha_ratings, goodreads_dataset, cosine_metric) # obtaining book recs


# The full frame has too many columns to read comfortably, so keep only the
# essential ones (and at most 100 rows) for display.
top_books = book_recommendations[['title', 'authors', 'original_publication_year', 'average_rating']].head(100)

# Build one human-readable text block per recommended book.
readable_output = []
for index, row in top_books.iterrows():
    # A missing publication year is stored as NaN and int(NaN) raises
    # ValueError, so fall back to a placeholder instead of crashing.
    publication_year = row['original_publication_year']
    publication_year = int(publication_year) if pd.notna(publication_year) else 'Unknown'
    book_details = (
        f"Title: {row['title']}\n"
        f"Author(s): {row['authors']}\n"
        f"Publication Year: {publication_year}\n"
        f"Average Rating: {row['average_rating']}\n"
        f"{'-'*40}\n"
    )
    readable_output.append(book_details)


readable_output_string = "\n".join(readable_output) # joining & displaying final o/p
print(readable_output_string)


Title: Artemis Fowl Boxed Set, Bks 1-5 (Artemis Fowl, #1-5)
Author(s): Eoin Colfer
Publication Year: 2003
Average Rating: 4.29
----------------------------------------

Title: The Hitchhiker's Guide to the Galaxy: A Trilogy in Four Parts
Author(s): Douglas Adams
Publication Year: 1986
Average Rating: 4.51
----------------------------------------

Title: Found (The Missing, #1)
Author(s): Margaret Peterson Haddix
Publication Year: 2008
Average Rating: 3.97
----------------------------------------

Title: Four: A Divergent Story Collection (Divergent, #0.1 - 0.4)
Author(s): Veronica Roth
Publication Year: 2014
Average Rating: 4.1
----------------------------------------

Title: The Evolution of Mara Dyer (Mara Dyer, #2)
Author(s): Michelle Hodkin
Publication Year: 2012
Average Rating: 4.33
----------------------------------------

Title: The Retribution of Mara Dyer (Mara Dyer, #3)
Author(s): Michelle Hodkin
Publication Year: 2014
Average Rating: 4.14
------------------------------------