In [23]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from fuzzywuzzy import process
from sklearn.linear_model import LinearRegression


In [None]:
data = pd.read_csv('/kaggle/input/bookkeywords/keywords.csv')

In [4]:
# Transform author names, publisher into single token
data["author2"] = data["author"].str.strip().str.replace(' ','_')
data["publisher2"] = data["publisher"].str.strip().str.replace(' ','_')

Unnamed: 0,url,title,description,imageUrl,genres,publisher,author,characters,language,series,lemmatized_description_x,sum_x,lemmatized_description_y,sum_y,keywords,author2,publisher2
0,https://www.goodreads.com/book/show/2165.The_O...,the old man and the sea,librarian note alternate cover edition found h...,https://images-na.ssl-images-amazon.com/images...,american classic literature novels 20th centur...,scribner,ernest hemingway,manolin santiago,English,,librarian note alternate cover edition found h...,the old man and the sea ernest_hemingway scrib...,librarian note alternate cover edition found h...,the old man and the sea ernest_hemingway scrib...,marlin novel nobel literature author,ernest_hemingway,scribner
1,https://www.goodreads.com/book/show/10507293-t...,the selection,thirty five girls selection chance lifetime op...,https://images-na.ssl-images-amazon.com/images...,fantasy teen fiction science fiction young adu...,harperteen,kiera cass,america singer prince maxon aspen leger,English,the selection,thirty five girl selection chance lifetime opp...,the selection kiera_cass harperteen thirty fiv...,thirty five girl selection chance lifetime opp...,the selection kiera_cass harperteen thirty fiv...,prince dreamed lifetime life nightmare,kiera_cass,harperteen
2,https://www.goodreads.com/book/show/5148.A_Sep...,a separate peace,american classic great bestseller thirty years...,https://images-na.ssl-images-amazon.com/images...,historical fiction novels fiction high school ...,scribner,john knowles,gene forrester phineas,English,,american classic great bestseller thirty year ...,a separate peace john_knowles scribner america...,american classic great bestseller thirty year ...,a separate peace john_knowles scribner america...,adolescence bestseller knowles peace innocence,john_knowles,scribner


In [5]:
# Merge all textual information to create summary
data["sum"] = data[["title", 'author2', 'publisher2', 'keywords','language','series','characters','genres']].fillna('').agg(' '.join, axis=1)
data = data.drop(columns=['author2', 'publisher2'])

data = data.drop(columns=['sum_x','sum_y','lemmatized_description_y','lemmatized_description_x','keywords','description','genres','publisher','author','characters','language','series'])
data.head(3)

Unnamed: 0,url,title,imageUrl,sum
0,https://www.goodreads.com/book/show/2165.The_O...,the old man and the sea,https://images-na.ssl-images-amazon.com/images...,the old man and the sea ernest_hemingway scrib...
1,https://www.goodreads.com/book/show/10507293-t...,the selection,https://images-na.ssl-images-amazon.com/images...,the selection kiera_cass harperteen prince dre...
2,https://www.goodreads.com/book/show/5148.A_Sep...,a separate peace,https://images-na.ssl-images-amazon.com/images...,a separate peace john_knowles scribner adolesc...


In [6]:
model_data = data.copy()
model_data['index'] = range(1, len(model_data) + 1)
model_data.head(2)

Unnamed: 0,url,title,imageUrl,sum,index
0,https://www.goodreads.com/book/show/2165.The_O...,the old man and the sea,https://images-na.ssl-images-amazon.com/images...,the old man and the sea ernest_hemingway scrib...,1
1,https://www.goodreads.com/book/show/10507293-t...,the selection,https://images-na.ssl-images-amazon.com/images...,the selection kiera_cass harperteen prince dre...,2


In [7]:
tfidf_vectorizer = TfidfVectorizer(analyzer = 'word',
                min_df=3,
                max_df = 0.6,
                stop_words="english",
                encoding = 'utf-8')
tfidf_matrix = tfidf_vectorizer.fit_transform(model_data['sum'])

In [9]:
cosine_sim = cosine_similarity(tfidf_matrix)

In [26]:
def find_closest_match(user_input, available_options):
    
    closest_match, score = process.extractOne(user_input, available_options)

    threshold = 80  
    if score >= threshold:
        return closest_match
    else:
        return user_input

In [10]:
def show_image(val):
    return '<a href="{}"><img src="{}" width=50></img></a>'.format(val, val)
def make_clickable(val):
    return '<a target="_blank" href="{}">Goodreads</a>'.format(val, val)

In [27]:
def content_based_recommender(book_titles):
    valid_titles = []
    invalid_titles = []
    sum_valid_titles = []

    for title in book_titles:
        str_title = str(title).lower()
        str_title = find_closest_match(str_title, model_data['title'].values)
        if str_title in model_data['title'].values:
            valid_titles.append(str_title)
            book_item = model_data[model_data['title']==str_title]
            book_values = book_item['sum'].values
            book_values_str = ', '.join(map(str, book_values))
            sum_valid_titles.append(book_values_str)
        else:
            invalid_titles.append(str_title)

    if not valid_titles:
        print('No valid titles found in the dataset. Please check spelling.')
        return
    query_vecs = tfidf_vectorizer.transform(sum_valid_titles)
    avg_query_vec = np.mean(query_vecs, axis=0)
    avg_query_vec = np.asarray(avg_query_vec)
    
    similarities = cosine_similarity(avg_query_vec, tfidf_matrix)

    indices = np.argpartition(similarities[0], -6)[-6:]
    results = data.iloc[indices]
    results = results.drop(columns=['sum'])
    
#     print(results)
    print("\nRecommendations books:")
    display(results.head(5).style.format({'url': make_clickable, 'imageUrl': show_image}))

    if invalid_titles:
        print(f"\nTitles not found in the dataset: {', '.join(invalid_titles)}")

In [28]:
content_based_recommender(["Harry Potter and the Philosopher's Stone 1"])


Recommendations books:


Unnamed: 0,url,title,imageUrl
1446,Goodreads,harry potter and the prisoner of azkaban,
8315,Goodreads,harry potter and the sorcerer’s stone,
1598,Goodreads,harry potter and the chamber of secrets,
6857,Goodreads,harry potter and the philosopher’s stone,
1463,Goodreads,harry potter and the deathly hallows,


# User

In [13]:
ratings = pd.read_json('/kaggle/input/user-comment/clean_rate_data.jl',lines=True)

In [14]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 443077 entries, 0 to 443076
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   book_url     443077 non-null  object
 1   title        443077 non-null  object
 2   user         443077 non-null  object
 3   user_rate    443077 non-null  int64 
 4   user_review  443077 non-null  object
dtypes: int64(1), object(4)
memory usage: 16.9+ MB


In [15]:
# create ratings df
ratings = ratings.drop(columns = ['title','user_review'])
ratings.head()

Unnamed: 0,book_url,user,user_rate
0,https://www.goodreads.com/book/show/205650.Fle...,7407364-luffy-oda-s-version,2
1,https://www.goodreads.com/book/show/205650.Fle...,8655084-phrynne,5
2,https://www.goodreads.com/book/show/205650.Fle...,3569723-ammar,4
3,https://www.goodreads.com/book/show/205650.Fle...,18202588-anna,5
4,https://www.goodreads.com/book/show/205650.Fle...,13056902,4


In [16]:
# create users df
users = ratings['user']
users = users.drop_duplicates()
users.info()

<class 'pandas.core.series.Series'>
Index: 152536 entries, 0 to 443076
Series name: user
Non-Null Count   Dtype 
--------------   ----- 
152536 non-null  object
dtypes: object(1)
memory usage: 2.3+ MB


In [17]:
ratings_merge = pd.merge(ratings, model_data, left_on = 'book_url', right_on='url', how = 'inner')
ratings_merge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 416997 entries, 0 to 416996
Data columns (total 8 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   book_url   416997 non-null  object
 1   user       416997 non-null  object
 2   user_rate  416997 non-null  int64 
 3   url        416997 non-null  object
 4   title      416997 non-null  object
 5   imageUrl   416548 non-null  object
 6   sum        416997 non-null  object
 7   index      416997 non-null  int64 
dtypes: int64(2), object(6)
memory usage: 25.5+ MB


In [18]:
ratings_merge = ratings_merge.drop(columns = ['book_url'])
ratings_merge = ratings_merge.drop_duplicates()

In [38]:
# all_books = pd.read_json('/kaggle/input/book-detail/clean_book_data.jl',lines=True)
# def popularity_based():
#     sorted_books = all_books.sort_values(by=['avgRating', 'ratingsCount'], ascending=[False, False])
#     top_10_books = sorted_books.head(10)
#     return top_10_books[['url','title','description']]
    

In [39]:
popularity_based()

Unnamed: 0,url,title,description
17225,https://www.goodreads.com/book/show/13554552-s...,Sapphire Reign,opens 10 years after the events in kings amp q...
17044,https://www.goodreads.com/book/show/45062775-b...,Broken Promises: The Suspenseful Sequel To The...,for beauty all she wants is for her life to go...
19166,https://www.goodreads.com/book/show/43798164-i...,I'm a F***ing Star: Get on Reality TV,this short but impressively in depth book will...
27376,https://www.goodreads.com/book/show/196939567-...,Original Twin,a twisty crossover thriller that will have rea...
12315,https://www.goodreads.com/book/show/13036945-m...,MAJI The untold adventure of the men of the East,young zebedeo is between boyhood and manhood w...
28635,https://www.goodreads.com/book/show/30229303-g...,Going It Alone: Why Just Writing Your Book Is ...,going it alone why just writing your book is n...
19146,https://www.goodreads.com/book/show/59772668-s...,Surviving The Second Tier,sicily sis jones is the only undefeated colleg...
22112,https://www.goodreads.com/book/show/36427079-n...,Never Marry a Biker,this book is the follow up to pack your bags a...
19631,https://www.goodreads.com/book/show/27424051-f...,F.I.G.H.T.: Family Inspires Great Hope Togethe...,when it is your child the urge to fight is rel...
4802,https://www.goodreads.com/book/show/37757785-a...,Armée des ombres,arm e des ombres


In [20]:
def book_recommend_with_user(user_id):
#     get list of books that user rated
    user_ratings = ratings_merge[ratings_merge['user']==user_id]
    user_book_index = user_ratings['index'].values
    user_book_rates = user_ratings['user_rate']
#     get tfidf vector
    tfidf_user_book = tfidf_matrix[user_book_index, : ]
    
#     build model to predict rate of unread book
    X = tfidf_user_book.toarray()
    y = user_book_rates
    model = LinearRegression()
    model.fit(X, y)
    
    tfidf_rate = pd.DataFrame(tfidf_matrix.toarray(), columns=[f'tfidf_{i}' for i in range(tfidf_matrix.shape[1])])
    tfidf_rate['predict_rate'] = model.predict(tfidf_matrix)
    
#     unread_books = tfidf_rate[~tfidf_rate.index.isin(user_ratings['index'])]
    top_results = tfidf_rate.nlargest(10, 'predict_rate')
    results = data.iloc[top_results.index]
    results = results.drop(columns = ['sum'])
    print("\nRecommendations books:")
    display(results.head(5).style.format({'url': make_clickable, 'imageUrl': show_image}))


In [41]:
user_id_input = '1305690'
user_id = find_closest_match(user_id_input, users.values)
book_recommend_with_user(user_id)

Index([1825, 28234, 3771, 30788, 1692, 9724, 19646, 10894, 2789, 17313], dtype='int64')

Recommendations books:


Unnamed: 0,url,title,imageUrl
1825,Goodreads,a gracious enemy,
28234,Goodreads,the crooked hinge,
3771,Goodreads,a gracious enemy & after the war volume one,
30788,Goodreads,hag's nook,
1692,Goodreads,princess: a true story of life behind the veil in saudi arabia,
