#  Book Recommendation System with Cosine Similarity, Manhattan, and Euclidean Distances

#  import libraries

In [1]:
import numpy as np
import pandas as pd
import pickle

#  load dataset

In [4]:
#  source : https://www.kaggle.com/datasets/zygmunt/goodbooks-10k

In [5]:
books = pd.read_csv('goodbooks/books.csv')

In [6]:
ratings = pd.read_csv('goodbooks/ratings.csv')

In [7]:
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 23 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         10000 non-null  int64  
 1   book_id                    10000 non-null  int64  
 2   best_book_id               10000 non-null  int64  
 3   work_id                    10000 non-null  int64  
 4   books_count                10000 non-null  int64  
 5   isbn                       9300 non-null   object 
 6   isbn13                     9415 non-null   float64
 7   authors                    10000 non-null  object 
 8   original_publication_year  9979 non-null   float64
 9   original_title             9415 non-null   object 
 10  title                      10000 non-null  object 
 11  language_code              8916 non-null   object 
 12  average_rating             10000 non-null  float64
 13  ratings_count              10000 non-null  int6

In [8]:
books.head()

Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780000000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780000000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780000000000.0,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780000000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780000000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [9]:
# Finding Similarities

In [43]:
table_df = ratings.merge(books, on='book_id', how='right')\
                           .filter(['book_id','user_id','rating','title'])
table_df

Unnamed: 0,book_id,user_id,rating,title
0,2767052,,,"The Hunger Games (The Hunger Games, #1)"
1,3,314.0,3.0,Harry Potter and the Sorcerer's Stone (Harry P...
2,3,588.0,1.0,Harry Potter and the Sorcerer's Stone (Harry P...
3,3,2077.0,2.0,Harry Potter and the Sorcerer's Stone (Harry P...
4,3,2487.0,3.0,Harry Potter and the Sorcerer's Stone (Harry P...
...,...,...,...,...
88884,8914,46715.0,4.0,The First World War
88885,8914,47073.0,4.0,The First World War
88886,8914,49663.0,3.0,The First World War
88887,8914,50305.0,3.0,The First World War


In [18]:
table_df.book_id.nunique()

10000

In [19]:
table_df.user_id.nunique()

28907

In [45]:
table_df = table_df.fillna(0)

In [46]:
table_df = table_df[['user_id','book_id','rating','title']]
table_df

Unnamed: 0,user_id,book_id,rating,title
0,0.0,2767052,0.0,"The Hunger Games (The Hunger Games, #1)"
1,314.0,3,3.0,Harry Potter and the Sorcerer's Stone (Harry P...
2,588.0,3,1.0,Harry Potter and the Sorcerer's Stone (Harry P...
3,2077.0,3,2.0,Harry Potter and the Sorcerer's Stone (Harry P...
4,2487.0,3,3.0,Harry Potter and the Sorcerer's Stone (Harry P...
...,...,...,...,...
88884,46715.0,8914,4.0,The First World War
88885,47073.0,8914,4.0,The First World War
88886,49663.0,8914,3.0,The First World War
88887,50305.0,8914,3.0,The First World War


In [21]:

pivot = pd.pivot_table(table_df, values='rating', 
                        index='title', columns='user_id',
                        fill_value=0
                      )
pivot

user_id,0.0,2.0,3.0,4.0,7.0,9.0,10.0,11.0,14.0,15.0,...,53404.0,53406.0,53408.0,53409.0,53416.0,53419.0,53420.0,53422.0,53423.0,53424.0
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Angels (Walsh Family, #3)",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
#GIRLBOSS,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
'Salem's Lot,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"'Tis (Frank McCourt, #2)",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"1,000 Places to See Before You Die",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
واحة الغروب,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
يوتوبيا,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ڤيرتيجو,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
キスよりも早く1 [Kisu Yorimo Hayaku 1] (Faster than a Kiss #1),0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
# cosine_similarity

from sklearn.metrics.pairwise import cosine_similarity

similarity_scores = cosine_similarity(pivot)
similarity_scores

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [24]:
cosine_sim_df = pd.DataFrame(similarity_scores, index=pivot.index, columns=pivot.index)
cosine_sim_df

title,"Angels (Walsh Family, #3)",#GIRLBOSS,'Salem's Lot,"'Tis (Frank McCourt, #2)","1,000 Places to See Before You Die",1/4 جرام,"10% Happier: How I Tamed the Voice in My Head, Reduced Stress Without Losing My Edge, and Found Self-Help That Actually Works","100 Bullets, Vol. 1: First Shot, Last Call",100 Love Sonnets,100 Selected Poems,...,محال,مخطوطة بن إسحاق: مدينة الموتى,نادي السيارات,هشت کتاب,هيبتا,واحة الغروب,يوتوبيا,ڤيرتيجو,キスよりも早く1 [Kisu Yorimo Hayaku 1] (Faster than a Kiss #1),美少女戦士セーラームーン新装版 1 [Bishōjo Senshi Sailor Moon Shinsōban 1]
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Angels (Walsh Family, #3)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
#GIRLBOSS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Salem's Lot,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"'Tis (Frank McCourt, #2)",0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"1,000 Places to See Before You Die",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
واحة الغروب,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
يوتوبيا,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ڤيرتيجو,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
キスよりも早く1 [Kisu Yorimo Hayaku 1] (Faster than a Kiss #1),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
#  euclidean_distances

from sklearn.metrics.pairwise import euclidean_distances

dist_euc = euclidean_distances(pivot)
dist_euc

array([[ 0.        ,  0.        , 35.34119409, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        , 35.34119409, ...,  0.        ,
         0.        ,  0.        ],
       [35.34119409, 35.34119409,  0.        , ..., 35.34119409,
        35.34119409, 35.34119409],
       ...,
       [ 0.        ,  0.        , 35.34119409, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        , 35.34119409, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        , 35.34119409, ...,  0.        ,
         0.        ,  0.        ]])

In [29]:
dist_euc_df = pd.DataFrame(dist_euc, index=pivot.index, columns=pivot.index)
dist_euc_df.head()

title,"Angels (Walsh Family, #3)",#GIRLBOSS,'Salem's Lot,"'Tis (Frank McCourt, #2)","1,000 Places to See Before You Die",1/4 جرام,"10% Happier: How I Tamed the Voice in My Head, Reduced Stress Without Losing My Edge, and Found Self-Help That Actually Works","100 Bullets, Vol. 1: First Shot, Last Call",100 Love Sonnets,100 Selected Poems,...,محال,مخطوطة بن إسحاق: مدينة الموتى,نادي السيارات,هشت کتاب,هيبتا,واحة الغروب,يوتوبيا,ڤيرتيجو,キスよりも早く1 [Kisu Yorimo Hayaku 1] (Faster than a Kiss #1),美少女戦士セーラームーン新装版 1 [Bishōjo Senshi Sailor Moon Shinsōban 1]
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Angels (Walsh Family, #3)",0.0,0.0,35.341194,40.32369,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
#GIRLBOSS,0.0,0.0,35.341194,40.32369,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Salem's Lot,35.341194,35.341194,0.0,53.619026,35.341194,35.341194,35.341194,35.341194,35.341194,35.341194,...,35.341194,35.341194,35.341194,35.341194,35.341194,35.341194,35.341194,35.341194,35.341194,35.341194
"'Tis (Frank McCourt, #2)",40.32369,40.32369,53.619026,0.0,40.32369,40.32369,40.32369,40.32369,40.32369,40.32369,...,40.32369,40.32369,40.32369,40.32369,40.32369,40.32369,40.32369,40.32369,40.32369,40.32369
"1,000 Places to See Before You Die",0.0,0.0,35.341194,40.32369,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
# manhattan_distances

from sklearn.metrics.pairwise import manhattan_distances

dist_mht = manhattan_distances(pivot)
dist_mht

array([[  0.,   0., 291., ...,   0.,   0.,   0.],
       [  0.,   0., 291., ...,   0.,   0.,   0.],
       [291., 291.,   0., ..., 291., 291., 291.],
       ...,
       [  0.,   0., 291., ...,   0.,   0.,   0.],
       [  0.,   0., 291., ...,   0.,   0.,   0.],
       [  0.,   0., 291., ...,   0.,   0.,   0.]])

In [32]:
dist_mht_df = pd.DataFrame(dist_mht, index=pivot.index, columns=pivot.index)
dist_mht_df.head()

title,"Angels (Walsh Family, #3)",#GIRLBOSS,'Salem's Lot,"'Tis (Frank McCourt, #2)","1,000 Places to See Before You Die",1/4 جرام,"10% Happier: How I Tamed the Voice in My Head, Reduced Stress Without Losing My Edge, and Found Self-Help That Actually Works","100 Bullets, Vol. 1: First Shot, Last Call",100 Love Sonnets,100 Selected Poems,...,محال,مخطوطة بن إسحاق: مدينة الموتى,نادي السيارات,هشت کتاب,هيبتا,واحة الغروب,يوتوبيا,ڤيرتيجو,キスよりも早く1 [Kisu Yorimo Hayaku 1] (Faster than a Kiss #1),美少女戦士セーラームーン新装版 1 [Bishōjo Senshi Sailor Moon Shinsōban 1]
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Angels (Walsh Family, #3)",0.0,0.0,291.0,392.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
#GIRLBOSS,0.0,0.0,291.0,392.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Salem's Lot,291.0,291.0,0.0,683.0,291.0,291.0,291.0,291.0,291.0,291.0,...,291.0,291.0,291.0,291.0,291.0,291.0,291.0,291.0,291.0,291.0
"'Tis (Frank McCourt, #2)",392.0,392.0,683.0,0.0,392.0,392.0,392.0,392.0,392.0,392.0,...,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0
"1,000 Places to See Before You Die",0.0,0.0,291.0,392.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [79]:
def recommend(book_name):
    
    # index fetch
    index = np.where(pivot.index==book_name)[0][0]
    similar_cosine = sorted(list(enumerate(similarity_scores[index])),key=lambda x:x[1],reverse=True)[1:6]
    similar_euc = sorted(list(enumerate(dist_euc[index])),key=lambda x:x[1],reverse=True)[1:6]
    similar_manh = sorted(list(enumerate(dist_mht[index])),key=lambda x:x[1],reverse=True)[1:6]
    
    cosine_data = []
    j = 1
    for i in similar_cosine:
        data = str(j) + "- "+ pivot.index[i[0]]
        cosine_data.append(data)
        j += 1

    euc_data = []
    j = 1
    for i in similar_euc:
        data = str(j) + "- "+ pivot.index[i[0]]
        euc_data.append(data)
        j += 1
    
    manh_data = []
    j = 1
    for i in similar_manh:
        data = str(j) + "- "+ pivot.index[i[0]]
        manh_data.append(data)
        j += 1
        
    
    return  print("Cosine Similarity Results:\n",*cosine_data, sep = "\n"), print("\n\n"),\
            print("Euclidean Distances Results:\n",*euc_data, sep = "\n"),\
            print("\n\n"),\
            print("Manhattan Distances Results:\n",*manh_data, sep = "\n")

In [80]:
# get similarities

recommend("Deception Point")

Cosine Similarity Results:

1- Men Are from Mars, Women Are from Venus
2- The Universe in a Nutshell
3- The Millionaire Next Door: The Surprising Secrets of Americas Wealthy
4- The Canterbury Tales
5- The Quiet American



Euclidean Distances Results:

1- The Beautiful and Damned
2- The Taste of Home Cookbook
3- A People's History of the United States
4- Hard Times
5- Villa Incognito



Manhattan Distances Results:

1- Still Life with Woodpecker
2- The Taste of Home Cookbook
3- A People's History of the United States
4- Hard Times
5- The Curious Incident of the Dog in the Night-Time


(None, None, None, None, None)

In [70]:
import pickle

# cosine_scores
pickle.dump(similarity_scores,open('cosine.pkl','wb'))

# euc_scores
pickle.dump(dist_euc,open('euc.pkl','wb'))

# mht_scores
pickle.dump(dist_mht,open('mht.pkl','wb'))

# pivot table
pickle.dump(pivot,open('pivot.pkl','wb'))