# Project : Build a recommendation system using cosine similarity which recommends top 5 books to the user based on their choice

In [1]:
import warnings
warnings.filterwarnings('ignore')  # use to avoid warning in the code

In [2]:
import pandas as pd # import pandas lib. using it's short form pd

In [3]:
# Read the three input tables (users, books, ratings) from CSV files in the working directory
users, books, ratings = (
    pd.read_csv(csv_name) for csv_name in ('Users.csv', 'Books.csv', 'Ratings.csv')
)

In [4]:
users.head()  # top 5 rows

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [5]:
books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [6]:
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [7]:
print(users.shape)
print(books.shape)
print(ratings.shape)

(278858, 3)
(271360, 8)
(1149780, 3)


In [8]:
users.info() # Get info

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278858 entries, 0 to 278857
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   User-ID   278858 non-null  int64  
 1   Location  278858 non-null  object 
 2   Age       168096 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 6.4+ MB


In [9]:
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271360 entries, 0 to 271359
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ISBN                 271360 non-null  object
 1   Book-Title           271360 non-null  object
 2   Book-Author          271359 non-null  object
 3   Year-Of-Publication  271360 non-null  object
 4   Publisher            271358 non-null  object
 5   Image-URL-S          271360 non-null  object
 6   Image-URL-M          271360 non-null  object
 7   Image-URL-L          271357 non-null  object
dtypes: object(8)
memory usage: 16.6+ MB


In [10]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   User-ID      1149780 non-null  int64 
 1   ISBN         1149780 non-null  object
 2   Book-Rating  1149780 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 26.3+ MB


This system has to recommend the top 5 books, so let's first check whether the 'Book-Title' column contains any duplicate titles.

In [11]:
books[books.duplicated('Book-Title')]  # print duplicated rows in Book-Title column

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
239,0743486226,Angels &amp; Demons,Dan Brown,2003,Atria,http://images.amazon.com/images/P/0743486226.0...,http://images.amazon.com/images/P/0743486226.0...,http://images.amazon.com/images/P/0743486226.0...
355,0140374248,James and the Giant Peach,Roald Dahl,2000,Penguin USA (Paper),http://images.amazon.com/images/P/0140374248.0...,http://images.amazon.com/images/P/0140374248.0...,http://images.amazon.com/images/P/0140374248.0...
405,006101351X,The Perfect Storm : A True Story of Men Agains...,Sebastian Junger,1998,HarperTorch,http://images.amazon.com/images/P/006101351X.0...,http://images.amazon.com/images/P/006101351X.0...,http://images.amazon.com/images/P/006101351X.0...
456,044021145X,The Firm,John Grisham,1992,Bantam Dell Publishing Group,http://images.amazon.com/images/P/044021145X.0...,http://images.amazon.com/images/P/044021145X.0...,http://images.amazon.com/images/P/044021145X.0...
475,014023828X,The Tortilla Curtain,T. Coraghessan Boyle,1996,Penguin Books,http://images.amazon.com/images/P/014023828X.0...,http://images.amazon.com/images/P/014023828X.0...,http://images.amazon.com/images/P/014023828X.0...
...,...,...,...,...,...,...,...,...
271246,0195124995,For Cause and Comrades: Why Men Fought in the ...,James M. McPherson,1998,Oxford University Press,http://images.amazon.com/images/P/0195124995.0...,http://images.amazon.com/images/P/0195124995.0...,http://images.amazon.com/images/P/0195124995.0...
271315,1902852036,All the Queen's Men,Nick Elwood,1999,Gay Men's Press,http://images.amazon.com/images/P/1902852036.0...,http://images.amazon.com/images/P/1902852036.0...,http://images.amazon.com/images/P/1902852036.0...
271316,0441011799,Singularity Sky,Charles Stross,2004,Ace,http://images.amazon.com/images/P/0441011799.0...,http://images.amazon.com/images/P/0441011799.0...,http://images.amazon.com/images/P/0441011799.0...
271346,0395264707,Dreamsnake,Vonda N. McIntyre,1978,Houghton Mifflin,http://images.amazon.com/images/P/0395264707.0...,http://images.amazon.com/images/P/0395264707.0...,http://images.amazon.com/images/P/0395264707.0...


In [12]:
# Keep only the first row seen for each distinct 'Book-Title'; later rows with the same title are removed
new_books = books.drop_duplicates(subset=['Book-Title'], keep='first')

In [13]:
new_books[new_books.duplicated('Book-Title')]  # now there is no duplicated rows in Book-Title column

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L


In [14]:
new_books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


Since our system has to recommend the top 5 books, we start from new_books and then bring in the other datasets that share a column with it. new_books and ratings share the column 'ISBN', so let's merge these two datasets on 'ISBN'.

In [15]:
print(ratings.shape)
print(new_books.shape)

(1149780, 3)
(242135, 8)


In [16]:
# Inner-join every rating with the metadata of its book, matching rows on 'ISBN'.
# NOTE(review): new_books keeps a single ISBN per title, so ratings recorded under
# another ISBN of the same title find no match here and are dropped — confirm this is intended.
ratings_books = pd.merge(ratings, new_books, on='ISBN', how='inner')

In [17]:
ratings_books.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
1,2313,034545104X,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
2,6543,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
3,8680,034545104X,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
4,10314,034545104X,9,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...


In [18]:
print(ratings_books.shape)

(883079, 10)


In [19]:
# Discard the ISBN and cover-image URL columns; the remaining steps do not read them from this frame
irrelevant_columns = ['ISBN', 'Image-URL-S', 'Image-URL-M', 'Image-URL-L']
ratings_books = ratings_books.drop(columns=irrelevant_columns)

In [20]:
ratings_books.head()

Unnamed: 0,User-ID,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,276725,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
1,2313,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
2,6543,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
3,8680,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
4,10314,9,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books


Now, we can merge ratings_books with users because they share a common column 'User-ID'

In [21]:
print(ratings_books.shape)
print(users.shape)

(883079, 6)
(278858, 3)


In [22]:
# Attach each rater's profile columns by inner-joining the rated books with 'users' on 'User-ID'
users_ratings_books = pd.merge(ratings_books, users, on='User-ID', how='inner')

In [23]:
users_ratings_books.head()

Unnamed: 0,User-ID,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Location,Age
0,276725,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,"tyler, texas, usa",
1,2313,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,"cincinnati, ohio, usa",23.0
2,2313,8,In Cold Blood (Vintage International),TRUMAN CAPOTE,1994,Vintage,"cincinnati, ohio, usa",23.0
3,2313,9,Divine Secrets of the Ya-Ya Sisterhood : A Novel,Rebecca Wells,1996,HarperCollins,"cincinnati, ohio, usa",23.0
4,2313,5,The Mistress of Spices,Chitra Banerjee Divakaruni,1998,Anchor Books/Doubleday,"cincinnati, ohio, usa",23.0


In [24]:
print(users_ratings_books.shape)

(883079, 8)


In [25]:
# Remove the user profile columns, which are not needed for the recommendation steps below
profile_columns = ['Location', 'Age']
users_ratings_books = users_ratings_books.drop(columns=profile_columns)

In [26]:
users_ratings_books.head() # it is our combined and final dataset for this project

Unnamed: 0,User-ID,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,276725,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
1,2313,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
2,2313,8,In Cold Blood (Vintage International),TRUMAN CAPOTE,1994,Vintage
3,2313,9,Divine Secrets of the Ya-Ya Sisterhood : A Novel,Rebecca Wells,1996,HarperCollins
4,2313,5,The Mistress of Spices,Chitra Banerjee Divakaruni,1998,Anchor Books/Doubleday


In [27]:
users_ratings_books.isnull().sum() # check null values

User-ID                0
Book-Rating            0
Book-Title             0
Book-Author            1
Year-Of-Publication    0
Publisher              1
dtype: int64

Since there are only 2 null values in the entire dataset, we can simply drop the rows that contain them.

In [28]:
# Keep only the rows in which every column has a value
users_ratings_books = users_ratings_books.dropna(axis=0, how='any')

In [29]:
users_ratings_books.isnull().sum() # now there is no null value 

User-ID                0
Book-Rating            0
Book-Title             0
Book-Author            0
Year-Of-Publication    0
Publisher              0
dtype: int64

In [30]:
print(users_ratings_books.shape)

(883077, 6)


In [31]:
users_ratings_books.head()

Unnamed: 0,User-ID,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,276725,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
1,2313,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
2,2313,8,In Cold Blood (Vintage International),TRUMAN CAPOTE,1994,Vintage
3,2313,9,Divine Secrets of the Ya-Ya Sisterhood : A Novel,Rebecca Wells,1996,HarperCollins
4,2313,5,The Mistress of Spices,Chitra Banerjee Divakaruni,1998,Anchor Books/Doubleday


Since users_ratings_books has too many entries, we will filter it down to the users who have rated many books.

In [32]:
# Count the ratings submitted by each User-ID, then flag the users with more than 100 of them
ratings_per_user = users_ratings_books.groupby('User-ID')['Book-Rating'].count()
x = ratings_per_user > 100

In [33]:
x  # if user has given more than 100 ratings, then True else False

User-ID
2         False
8         False
9         False
10        False
12        False
          ...  
278844    False
278846    False
278849    False
278851    False
278854    False
Name: Book-Rating, Length: 83910, dtype: bool

Length: 83910 means there are 83910 unique User-IDs 

In [34]:
x[x]  # display only True values of Book-Rating in x 

User-ID
254       True
507       True
882       True
1424      True
1435      True
          ... 
277478    True
277639    True
278137    True
278188    True
278418    True
Name: Book-Rating, Length: 1418, dtype: bool

In [35]:
# Collect the User-IDs flagged True in x, i.e. the users who gave more than 100 ratings
knowledgeable_users = x.index[x.to_numpy()]

In [36]:
knowledgeable_users 

Int64Index([   254,    507,    882,   1424,   1435,   1733,   1903,   2033,
              2110,   2276,
            ...
            274808, 275020, 275970, 276680, 277427, 277478, 277639, 278137,
            278188, 278418],
           dtype='int64', name='User-ID', length=1418)

In [37]:
knowledgeable_users.shape

(1418,)

The isin() method checks, element by element, whether each value is one of the specified values.
Here it is called on the 'User-ID' column (a Series), so it returns a boolean Series of the same length: True where the User-ID is one of the specified values, otherwise False.

In [38]:
users_ratings_books['User-ID']

0         276725
1           2313
2           2313
3           2313
4           2313
           ...  
883074    276618
883075    276647
883076    276647
883077    276660
883078    276672
Name: User-ID, Length: 883077, dtype: int64

In [39]:
users_ratings_books['User-ID'].isin(knowledgeable_users)

0         False
1         False
2         False
3         False
4         False
          ...  
883074    False
883075    False
883076    False
883077    False
883078    False
Name: User-ID, Length: 883077, dtype: bool

In [40]:
#users_ratings_books[users_ratings_books['User-ID'].isin(knowledgeable_users)]

In [41]:
# Boolean mask: True for the rows whose User-ID belongs to a knowledgeable user
is_knowledgeable = users_ratings_books['User-ID'].isin(knowledgeable_users)
# Keep only the rows selected by the mask
filtered_users_ratings = users_ratings_books.loc[is_knowledgeable]

In [42]:
filtered_users_ratings # rows where the mask was False have been dropped (not replaced by NaN); the remaining rows keep their original values and index labels

Unnamed: 0,User-ID,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher
36,6543,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
37,6543,0,The Lovely Bones: A Novel,Alice Sebold,2002,"Little, Brown"
38,6543,0,The Da Vinci Code,Dan Brown,2003,Doubleday
39,6543,0,Wild Animus,Rich Shapero,2004,Too Far
40,6543,0,Four To Score (A Stephanie Plum Novel),Janet Evanovich,1999,St. Martin's Paperbacks
...,...,...,...,...,...,...
795067,240543,10,Guidelines of faith,Satoru Izumi,1980,Nichiren Shoshu International Center
795068,240543,10,Treasures of the heart,Daisaku Ikeda,1982,Nichiren Shoshu International Center
795069,240543,10,Songs of victory: Poems and photographs,Daisaku Ikeda,1988,Nichiren Shoshu International Center
795070,240543,10,The creative family,Daisaku Ikeda,1992,Nichiren Shoshu International Center


Now, the number of entries has been reduced from 883077 (in users_ratings_books) to 488357 (in filtered_users_ratings).

Let's further filter down to the books that have received at least 50 ratings.

In [43]:
filtered_users_ratings.groupby('Book-Title').count()['Book-Rating']

Book-Title
 A Light in the Storm: The Civil War Diary of Amelia Martin, Fenwick Island, Delaware, 1861 (Dear America)    3
 Always Have Popsicles                                                                                        1
 Apple Magic (The Collector's series)                                                                         1
 Beyond IBM: Leadership Marketing and Finance for the 1990s                                                   1
 Clifford Visita El Hospital (Clifford El Gran Perro Colorado)                                                1
                                                                                                             ..
Ã?Â?berraschung am Valentinstag.                                                                              1
Ã?Â?lpiraten.                                                                                                 1
Ã?Â?rger mit Produkt X. Roman.                                                               

In [44]:
# Count the ratings each Book-Title received from the knowledgeable users,
# then flag the titles with at least 50 ratings
ratings_per_book = filtered_users_ratings.groupby('Book-Title')['Book-Rating'].count()
y = ratings_per_book >= 50

In [45]:
y  # True if the book has received at least 50 ratings (>= 50), otherwise False

Book-Title
 A Light in the Storm: The Civil War Diary of Amelia Martin, Fenwick Island, Delaware, 1861 (Dear America)    False
 Always Have Popsicles                                                                                        False
 Apple Magic (The Collector's series)                                                                         False
 Beyond IBM: Leadership Marketing and Finance for the 1990s                                                   False
 Clifford Visita El Hospital (Clifford El Gran Perro Colorado)                                                False
                                                                                                              ...  
Ã?Â?berraschung am Valentinstag.                                                                              False
Ã?Â?lpiraten.                                                                                                 False
Ã?Â?rger mit Produkt X. Roman.                               

In [46]:
y[y]   # display only True values of Book-Rating in y 

Book-Title
1984                                                                 True
1st to Die: A Novel                                                  True
2010: Odyssey Two                                                    True
2nd Chance                                                           True
4 Blondes                                                            True
                                                                     ... 
Year of Wonders                                                      True
You Belong To Me                                                     True
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values    True
Zoya                                                                 True
\O\" Is for Outlaw"                                                  True
Name: Book-Rating, Length: 620, dtype: bool

In [47]:
# Collect the Book-Titles flagged True in y, i.e. the books that received at least 50 ratings
famous_books = y.index[y.to_numpy()]

In [48]:
famous_books

Index(['1984', '1st to Die: A Novel', '2010: Odyssey Two', '2nd Chance',
       '4 Blondes',
       'A Beautiful Mind: The Life of Mathematical Genius and Nobel Laureate John Nash',
       'A Case of Need',
       'A Child Called \It\": One Child's Courage to Survive"',
       'A Civil Action', 'A Cry In The Night',
       ...
       'Wicked: The Life and Times of the Wicked Witch of the West', 'Wifey',
       'Wild Animus', 'Without Remorse',
       'Word Freak: Heartbreak, Triumph, Genius, and Obsession in the World of Competitive Scrabble Players',
       'Year of Wonders', 'You Belong To Me',
       'Zen and the Art of Motorcycle Maintenance: An Inquiry into Values',
       'Zoya', '\O\" Is for Outlaw"'],
      dtype='object', name='Book-Title', length=620)

In [49]:
 filtered_users_ratings['Book-Title']

36                           Flesh Tones: A Novel
37                      The Lovely Bones: A Novel
38                              The Da Vinci Code
39                                    Wild Animus
40         Four To Score (A Stephanie Plum Novel)
                           ...                   
795067                        Guidelines of faith
795068                     Treasures of the heart
795069    Songs of victory: Poems and photographs
795070                        The creative family
795071                                 Still life
Name: Book-Title, Length: 488357, dtype: object

In [50]:
 filtered_users_ratings['Book-Title'].isin(famous_books)

36        False
37         True
38         True
39         True
40         True
          ...  
795067    False
795068    False
795069    False
795070    False
795071    False
Name: Book-Title, Length: 488357, dtype: bool

In [51]:

# Boolean mask: True for the rows whose Book-Title is one of the famous books
is_famous = filtered_users_ratings['Book-Title'].isin(famous_books)
# Keep only the rows selected by the mask
final_users = filtered_users_ratings.loc[is_famous]

In [52]:
final_users  # rows where the mask was False have been dropped (not replaced by NaN); the remaining rows keep their original values and index labels

Unnamed: 0,User-ID,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher
37,6543,0,The Lovely Bones: A Novel,Alice Sebold,2002,"Little, Brown"
38,6543,0,The Da Vinci Code,Dan Brown,2003,Doubleday
39,6543,0,Wild Animus,Rich Shapero,2004,Too Far
40,6543,0,Four To Score (A Stephanie Plum Novel),Janet Evanovich,1999,St. Martin's Paperbacks
42,6543,10,Roses Are Red (Alex Cross Novels),James Patterson,2001,Warner Vision
...,...,...,...,...,...,...
733862,163759,5,Macgregor Brides (Macgregors),Nora Roberts,1997,Silhouette
734071,27169,8,"Whirlwind (Tyler, Book 1)",Nancy Martin,1992,Harlequin
756016,133868,0,Nerd in Shining Armor,Vicki Lewis Thompson,2003,Dell Publishing Company
771060,155463,0,2010: Odyssey Two,Arthur C. Clarke,1984,Del Rey Books


The number of entries in this dataframe is further reduced to 51,997. Now we can create a pivot table from it.

In [53]:
# Reshape 'final_users' into a book-by-user ratings table:
# one row per Book-Title, one column per User-ID, cells hold the Book-Rating
# (pandas' default aggregation, the mean, applies if a title/user pair occurs more than once)
pivot_table = pd.pivot_table(
    final_users,
    values='Book-Rating',
    index='Book-Title',
    columns='User-ID',
)

In [54]:
pivot_table 

User-ID,254,507,882,1424,1435,1733,1903,2033,2110,2276,...,274808,275020,275970,276680,277427,277478,277639,278137,278188,278418
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,,,,,,,,,,...,,,0.0,,,,,,,
1st to Die: A Novel,,,,,,,,,,,...,,,,,,,,,,
2010: Odyssey Two,,,,,,,,,,0.0,...,,,,,,,,,,
2nd Chance,,,,,,,,,,,...,,,,,,,,,,
4 Blondes,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Year of Wonders,,,,7.0,,,,,,,...,,,0.0,,,,,,,
You Belong To Me,,,,,,,,,,,...,,,,,,,,,,
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,,,,,,,,,,,...,,,0.0,,,,,,,
Zoya,,,,,,,,,,,...,,,,,,,,,,


In [55]:
# Treat every missing book/user combination as a rating of 0
pivot_table = pivot_table.fillna(0)

In [56]:
pivot_table.head()

User-ID,254,507,882,1424,1435,1733,1903,2033,2110,2276,...,274808,275020,275970,276680,277427,277478,277639,278137,278188,278418
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2010: Odyssey Two,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4 Blondes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [57]:
# Standardize the pivot table with 'StandardScaler': each column (one per User-ID)
# is centred on its mean and scaled to unit variance. The result is a plain NumPy
# array whose rows are in the same order as pivot_table.index.

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()  # with_mean=True and with_std=True are the defaults
scaler.fit(pivot_table)
pivot_table_normalized = scaler.transform(pivot_table)

In [58]:
pivot_table_normalized

array([[ 7.64872154, -0.07946654, -0.11651892, ..., -0.08988424,
        -0.08753254, -0.0977649 ],
       [-0.13389447, -0.07946654, -0.11651892, ..., -0.08988424,
        -0.08753254, -0.0977649 ],
       [-0.13389447, -0.07946654, -0.11651892, ..., -0.08988424,
        -0.08753254, -0.0977649 ],
       ...,
       [-0.13389447, -0.07946654, -0.11651892, ..., -0.08988424,
        -0.08753254, -0.0977649 ],
       [-0.13389447, -0.07946654, -0.11651892, ..., -0.08988424,
        -0.08753254, -0.0977649 ],
       [-0.13389447, -0.07946654, -0.11651892, ..., -0.08988424,
        -0.08753254, -0.0977649 ]])

In [59]:
# Model building: compute the cosine similarity between every pair of rows of the
# standardized array. Each row corresponds to one book (same order as pivot_table.index),
# so similarity_score[i][j] is the similarity between book i and book j and the
# matrix is square, with one row and one column per book.

from sklearn.metrics.pairwise import cosine_similarity
similarity_score = cosine_similarity(pivot_table_normalized)

In [60]:
similarity_score

array([[ 1.        , -0.01859849,  0.08733311, ...,  0.01350255,
        -0.02201299, -0.02261981],
       [-0.01859849,  1.        , -0.0294832 , ..., -0.02238366,
        -0.03070822,  0.10549892],
       [ 0.08733311, -0.0294832 ,  1.        , ...,  0.00231518,
         0.00643198, -0.01337545],
       ...,
       [ 0.01350255, -0.02238366,  0.00231518, ...,  1.        ,
        -0.00500822, -0.01371862],
       [-0.02201299, -0.03070822,  0.00643198, ..., -0.00500822,
         1.        ,  0.00914464],
       [-0.02261981,  0.10549892, -0.01337545, ..., -0.01371862,
         0.00914464,  1.        ]])

Now, we create a function called 'recommend()' which recommends top 5 books to the user based on their choice.


In [61]:
import numpy as np

In [62]:
def recommend(book_name, top_n=5):
    """Return the books most similar to ``book_name``.

    Looks up ``book_name`` in ``pivot_table.index``, ranks every *other* book
    by its entry in the matching row of ``similarity_score`` (highest first)
    and collects the display details of the best ``top_n`` matches from
    ``new_books``.

    Parameters
    ----------
    book_name : str
        Title exactly as it appears in ``pivot_table.index``.
    top_n : int, optional
        Number of recommendations to return (default 5).

    Returns
    -------
    list of list
        One ``[title, author, medium image URL]`` entry per recommended book,
        best match first.

    Raises
    ------
    IndexError
        If ``book_name`` is not present in ``pivot_table.index``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    # Position of the requested book among the rows of the pivot table
    matches = np.where(pivot_table.index == book_name)[0]
    if len(matches) == 0:
        # Same exception type as the bare [0][0] lookup, but with a readable message
        raise IndexError(f"{book_name!r} was not found in pivot_table.index")
    book_pos = matches[0]

    # Rank all other books by similarity, highest first. The requested book is
    # excluded by position rather than by assuming it sorts first, so a tie at
    # the top score can no longer make a book recommend itself.
    scores = similarity_score[book_pos]
    candidates = [pos for pos in range(len(scores)) if pos != book_pos]
    candidates.sort(key=lambda pos: scores[pos], reverse=True)

    # To return result in list format
    data = []
    for pos in candidates[:top_n]:
        # Get the book details from new_books by title
        details = new_books[new_books['Book-Title'] == pivot_table.index[pos]]

        # Only add the title, author, and image-url to the result
        item = []
        item.extend(details['Book-Title'].values)
        item.extend(details['Book-Author'].values)
        item.extend(details['Image-URL-M'].values)

        data.append(item)
    return data    # return info as a list

When we pass the title of any book in the pivot table to the recommend() function, it returns the top 5 most similar books.

In [63]:
# Model Validating
# Call the recommend method
recommend('1984') # retrieves the top 5 similar books excluding the given book '1984'

[["Foucault's Pendulum",
  'Umberto Eco',
  'http://images.amazon.com/images/P/0345368754.01.MZZZZZZZ.jpg'],
 ['Tis : A Memoir',
  'Frank McCourt',
  'http://images.amazon.com/images/P/0684848783.01.MZZZZZZZ.jpg'],
 ['Animal Farm',
  'George Orwell',
  'http://images.amazon.com/images/P/0451526341.01.MZZZZZZZ.jpg'],
 ['The Glass Lake',
  'Maeve Binchy',
  'http://images.amazon.com/images/P/0440221595.01.MZZZZZZZ.jpg'],
 ['Summer Pleasures',
  'Nora Roberts',
  'http://images.amazon.com/images/P/0373218397.01.MZZZZZZZ.jpg']]

In [64]:
recommend('1st to Die: A Novel') # retrieves the top 5 similar books excluding the book '1st to Die: A Novel'

[['The Beach House',
  'James Patterson',
  'http://images.amazon.com/images/P/0446612545.01.MZZZZZZZ.jpg'],
 ['The Sum of All Fears (Jack Ryan Novels)',
  'Tom Clancy',
  'http://images.amazon.com/images/P/0425133540.01.MZZZZZZZ.jpg'],
 ['\\O\\" Is for Outlaw"',
  'SUE GRAFTON',
  'http://images.amazon.com/images/P/0449003787.01.MZZZZZZZ.jpg'],
 ['Roses Are Red (Alex Cross Novels)',
  'James Patterson',
  'http://images.amazon.com/images/P/0446605484.01.MZZZZZZZ.jpg'],
 ["Left Behind: A Novel of the Earth's Last Days (Left Behind No. 1)",
  'Tim Lahaye',
  'http://images.amazon.com/images/P/0842329129.01.MZZZZZZZ.jpg']]