In [58]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import joblib

In [59]:
# Load the book catalogue from the CSV file that sits next to this notebook.
df = pd.read_csv(filepath_or_buffer='data.csv')

# Preview the first five rows (the default for `head`) as a quick sanity check.
df.head(n=5)

Unnamed: 0,isbn13,isbn10,title,subtitle,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count
0,9780002005883,2005883,Gilead,,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0
1,9780002261982,2261987,Spider's Web,A Novel,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0
2,9780006163831,6163831,The One Tree,,Stephen R. Donaldson,American fiction,http://books.google.com/books/content?id=OmQaw...,Volume Two of Stephen Donaldson's acclaimed se...,1982.0,3.97,479.0,172.0
3,9780006178736,6178731,Rage of angels,,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0
4,9780006280897,6280897,The Four Loves,,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0


In [60]:
# Remove repeated (isbn13, title) pairs and rows lacking a title or an author.
#
# The index is renumbered 0..n-1 afterwards: the similarity matrix built later
# has one row per *remaining* book, so index labels must equal row positions
# for title -> row lookups to address the right book.
#
# The result is re-assigned instead of mutated with `inplace=True`, so
# re-running this cell is harmless.
df = (
    df
    .drop_duplicates(subset=['isbn13', 'title'])
    .dropna(subset=['title', 'authors'])
    .reset_index(drop=True)
)

In [61]:
# Optional free-text fields: replace missing values with an empty string so
# that concatenating them with other text columns never yields NaN.
text_cols = ['subtitle', 'categories', 'description']
df[text_cols] = df[text_cols].fillna('')

In [62]:
# Numeric metadata: a missing value is replaced by a zero placeholder.
# NOTE(review): 0 is a sentinel here, not a real rating / page count / year.
numeric_defaults = {
    'average_rating': 0.0,
    'num_pages': 0,
    'ratings_count': 0,
    'published_year': 0,
}
for column, default in numeric_defaults.items():
    df[column] = df[column].fillna(default)

In [63]:
# Build one space-separated text blob per book; this is the document that the
# TF-IDF vectoriser sees. Fields are joined left to right in the listed order.
text_fields = ['title', 'subtitle', 'authors', 'categories', 'description']

combined = df[text_fields[0]]
for field in text_fields[1:]:
    combined = combined + ' ' + df[field]

df['combined'] = combined

In [64]:
# Fit a TF-IDF model on the combined text (English stop words removed) and
# keep both the fitted vectoriser and the resulting document-term matrix.
corpus = df['combined']
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(corpus)

In [65]:
# Pairwise dot products between every pair of books. With the second argument
# omitted, `linear_kernel` compares the matrix against itself.
# NOTE(review): this equals cosine similarity only because TfidfVectorizer
# L2-normalises rows by default - revisit if `norm` is ever changed.
cosine_sim = linear_kernel(tfidf_matrix)

In [66]:
# Map each title to the *row position* of its book, i.e. the row to read in
# `cosine_sim` / `tfidf_matrix` and the position to pass to `df.iloc`.
# Positions are used instead of `df.index` labels because rows were dropped
# during cleaning, so labels and positions are not guaranteed to coincide.
indices = pd.Series(np.arange(len(df)), index=df['title'])

# Keep only the first occurrence of every title so that `indices[title]` is
# always a single integer. (`Series.drop_duplicates()` would compare the
# values - which are all distinct - and therefore remove nothing.)
indices = indices[~indices.index.duplicated(keep='first')]

In [67]:
# Persist everything needed to serve recommendations without refitting.
# Dict order is the dump order.
artifacts = {
    'tfidf_vectorizer.pkl': tfidf,
    'tfidf_matrix.pkl': tfidf_matrix,
    'cosine_sim.pkl': cosine_sim,
    'title_indices.pkl': indices,
}
for filename, artifact in artifacts.items():
    joblib.dump(artifact, filename)

# The cleaned catalogue itself is stored with pandas' own pickle writer.
df.to_pickle('books_df.pkl')

print("Model and data saved successfully.")

Model and data saved successfully.


In [68]:

def recommend_books(read_books, top_n=5):
    """Return the top N books most similar to the books already read.

    Relies on three notebook-level objects: `df` (the book catalogue),
    `cosine_sim` (book-by-book similarity matrix) and `indices` (a Series
    keyed by title whose values are used as row positions into `cosine_sim`).

    Parameters
    ----------
    read_books : str or iterable of str
        Title(s) the user has read. A single string is treated as one title.
        Titles that are not keys of `indices` are ignored.
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Up to `top_n` rows of `df` (columns isbn13, title, authors,
        average_rating, published_year), ordered from most to least similar,
        never containing a title listed in `read_books`. An empty DataFrame
        is returned (after printing a message) when no input title is known.
    """
    output_cols = ['isbn13', 'title', 'authors', 'average_rating', 'published_year']

    # A bare string would otherwise be iterated character by character.
    if isinstance(read_books, str):
        read_books = [read_books]
    read_books = list(read_books)

    # `indices[title]` is a scalar for a unique title but a Series when the
    # title occurs more than once; flatten both cases to plain integers.
    book_indices = [
        int(position)
        for book in read_books
        if book in indices
        for position in np.atleast_1d(indices[book])
    ]

    if not book_indices:
        print("None of the input books found in dataset.")
        return pd.DataFrame()

    # Average similarity of every catalogue entry to the books that were read.
    mean_scores = cosine_sim[book_indices].mean(axis=0)

    # Best first; the stable sort keeps catalogue order among equal scores,
    # matching what a stable descending sort of (position, score) pairs gives.
    ranked = np.argsort(-mean_scores, kind='stable')

    # Drop anything whose title was part of the input, in one vectorised pass.
    already_read = df['title'].isin(read_books).to_numpy()
    recommended_indices = ranked[~already_read[ranked]][:top_n]

    return df.iloc[recommended_indices][output_cols]

# Smoke test: ask for recommendations for two titles present in the catalogue.
if __name__ == "__main__":
    test_books = ["Spider's Web", "The Four Loves"]

    # Print the header first so any message emitted by the lookup follows it.
    print("\nTop Recommendations for:", test_books)

    recommendations = recommend_books(test_books, top_n=5)
    print(recommendations)


Top Recommendations for: ["Spider's Web", 'The Four Loves']
             isbn13                                              title  \
224   9780060652852                                         C.S. Lewis   
427   9780061129735                                  The Art of Loving   
5     9780006280934                                The Problem of Pain   
99    9780020442806  The Chronicles of Narnia: Lion, the witch and ...   
1611  9780312970079                                       Black Coffee   

                  authors  average_rating  published_year  
224           C. S. Lewis            4.31          1996.0  
427           Erich Fromm            4.03          2006.0  
5     Clive Staples Lewis            4.09          2002.0  
99    Clive Staples Lewis            4.26          1978.0  
1611      Agatha Christie            3.48          1999.0  
