# Book Recommender System

In [1]:
import numpy as np
import pandas as pd

In [2]:
# load data
def load_file(csv_file_list, i): 
  csv_file = csv_file_list[i]
  data = pd.read_csv(csv_file)
  return data 

csv_files = ["Books.csv", "Users.csv", "Ratings.csv"]

In [3]:
# load files: books, users, ratings
books = load_file(csv_files, 0)
users = load_file(csv_files, 1)
ratings = load_file(csv_files, 2)

  data = pd.read_csv(csv_file)


## Descriptive Analysis

In [4]:
books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [5]:
users.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [6]:
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [7]:
# shape of books, users, ratings
print(f"Books: {books.shape}")
print(f"Users: {users.shape}")
print(f"Ratings: {ratings.shape}")

Books: (271360, 8)
Users: (278858, 3)
Ratings: (1149780, 3)


Identify Nullvalues

In [8]:
books.isnull().sum()

ISBN                   0
Book-Title             0
Book-Author            2
Year-Of-Publication    0
Publisher              2
Image-URL-S            0
Image-URL-M            0
Image-URL-L            3
dtype: int64

In [9]:
users.isnull().sum()

User-ID          0
Location         0
Age         110762
dtype: int64

In [10]:
ratings.isnull().sum()

User-ID        0
ISBN           0
Book-Rating    0
dtype: int64

Identify Duplicated Rows

In [11]:
books.duplicated().sum()
ratings.duplicated().sum()
users.duplicated().sum()

np.int64(0)

## Building Recommender Systems: 4 Approaches

+ Popular-based Recommender System
+ Collaborative-Filtering System
+ Content-based Filtering System

## Model Building: Popularity Based Recommender System

Model Type: Popularity Based Recommendation System

In [12]:
ratings_w_name = ratings.merge(books, on="ISBN")

In [13]:
num_ratings_df = ratings_w_name.groupby("Book-Title")[["Book-Rating"]].count().reset_index()
num_ratings_df.rename(columns={"Book-Rating": "Num-Ratings"}, inplace=True)
num_ratings_df.head()

Unnamed: 0,Book-Title,Num-Ratings
0,A Light in the Storm: The Civil War Diary of ...,4
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1
4,Beyond IBM: Leadership Marketing and Finance ...,1


In [14]:
avg_ratings_df = ratings_w_name.groupby("Book-Title")[["Book-Rating"]].mean().reset_index()
avg_ratings_df.rename(columns={"Book-Rating": "Avg-Ratings"}, inplace=True)
avg_ratings_df

Unnamed: 0,Book-Title,Avg-Ratings
0,A Light in the Storm: The Civil War Diary of ...,2.250000
1,Always Have Popsicles,0.000000
2,Apple Magic (The Collector's series),0.000000
3,"Ask Lily (Young Women of Faith: Lily Series, ...",8.000000
4,Beyond IBM: Leadership Marketing and Finance ...,0.000000
...,...,...
241066,Ã?Â?lpiraten.,0.000000
241067,Ã?Â?rger mit Produkt X. Roman.,5.250000
241068,Ã?Â?sterlich leben.,7.000000
241069,Ã?Â?stlich der Berge.,2.666667


In [15]:
popular_df = num_ratings_df.merge(avg_ratings_df, on="Book-Title")
popular_df

Unnamed: 0,Book-Title,Num-Ratings,Avg-Ratings
0,A Light in the Storm: The Civil War Diary of ...,4,2.250000
1,Always Have Popsicles,1,0.000000
2,Apple Magic (The Collector's series),1,0.000000
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1,8.000000
4,Beyond IBM: Leadership Marketing and Finance ...,1,0.000000
...,...,...,...
241066,Ã?Â?lpiraten.,2,0.000000
241067,Ã?Â?rger mit Produkt X. Roman.,4,5.250000
241068,Ã?Â?sterlich leben.,1,7.000000
241069,Ã?Â?stlich der Berge.,3,2.666667


In [16]:
popular_df = popular_df[popular_df["Num-Ratings"] >= 250].sort_values('Avg-Ratings', ascending=False).head(50)

In [17]:
popular_df = popular_df.merge(books, on="Book-Title")
popular_df = popular_df.drop_duplicates('Book-Title')[["Book-Title", "Book-Author", "Image-URL-M", "Num-Ratings", "Avg-Ratings"]]

In [18]:
popular_df.head()

Unnamed: 0,Book-Title,Book-Author,Image-URL-M,Num-Ratings,Avg-Ratings
0,Harry Potter and the Prisoner of Azkaban (Book 3),J. K. Rowling,http://images.amazon.com/images/P/0439136350.0...,428,5.852804
3,Harry Potter and the Goblet of Fire (Book 4),J. K. Rowling,http://images.amazon.com/images/P/0439139597.0...,387,5.824289
5,Harry Potter and the Sorcerer's Stone (Book 1),J. K. Rowling,http://images.amazon.com/images/P/0590353403.0...,278,5.73741
9,Harry Potter and the Order of the Phoenix (Boo...,J. K. Rowling,http://images.amazon.com/images/P/043935806X.0...,347,5.501441
13,Harry Potter and the Chamber of Secrets (Book 2),J. K. Rowling,http://images.amazon.com/images/P/0439064872.0...,556,5.183453


## Collaborative-Filtering Based Recommender System

In [19]:
x = ratings_w_name.groupby("User-ID").count()["Book-Rating"] > 200
rw_users = x[x].index # user_ids written and read

+ Number of Users: 92 106 users
+ Book-rating: how many ratings each user has given

In [20]:
filtered_rating = ratings_w_name[ratings_w_name["User-ID"].isin(rw_users)]
y = filtered_rating.groupby("Book-Title")["Book-Rating"].count() >= 50
famous_books = y[y].index

+ Number of ratings in rw users: 474007 ratings

In [21]:
famous_books

Index(['1984', '1st to Die: A Novel', '2nd Chance', '4 Blondes',
       'A Bend in the Road', 'A Case of Need',
       'A Child Called \It\": One Child's Courage to Survive"',
       'A Civil Action', 'A Day Late and a Dollar Short', 'A Fine Balance',
       ...
       'Winter Solstice', 'Wish You Well', 'Without Remorse',
       'Wizard and Glass (The Dark Tower, Book 4)', 'Wuthering Heights',
       'Year of Wonders', 'You Belong To Me',
       'Zen and the Art of Motorcycle Maintenance: An Inquiry into Values',
       'Zoya', '\O\" Is for Outlaw"'],
      dtype='object', name='Book-Title', length=706)

In [22]:
final_ratings = filtered_rating[filtered_rating["Book-Title"].isin(famous_books)]

In [23]:
# find duplicates in final ratings data
final_ratings.duplicated().sum()

np.int64(0)

In [24]:
# pivot final ratings data
final_book_pt = final_ratings.pivot_table(index="Book-Title", columns="User-ID", values="Book-Rating")
final_book_pt

User-ID,254,2276,2766,2977,3363,4017,4385,6251,6323,6543,...,271705,273979,274004,274061,274301,274308,275970,277427,277639,278418
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,,,,,,,,,,...,10.0,,,,,,0.0,,,
1st to Die: A Novel,,,,,,,,,,9.0,...,,,,,,,,,,
2nd Chance,,10.0,,,,,,,,0.0,...,,,,,,0.0,,,0.0,
4 Blondes,,,,,,,,0.0,,,...,,,,,,,,,,
A Bend in the Road,0.0,,7.0,,,,,,,,...,,0.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Year of Wonders,,,,7.0,,,,,,0.0,...,,9.0,,,,,0.0,,,
You Belong To Me,,,,,,,,,0.0,,...,,,,,,,,,,
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,,,,,0.0,,,0.0,,,...,,,,,,,0.0,,,
Zoya,,,,,,,,,,,...,,0.0,,,,,,,,


In [25]:
final_book_pt.fillna(0, inplace=True)

In [26]:
final_book_pt

User-ID,254,2276,2766,2977,3363,4017,4385,6251,6323,6543,...,271705,273979,274004,274061,274301,274308,275970,277427,277639,278418
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4 Blondes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Bend in the Road,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Year of Wonders,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
You Belong To Me,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zoya,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
num_users, num_books = final_book_pt.shape
print(f"Number of Users: {num_users}")
print(f"Number of Books: {num_books}")

Number of Users: 706
Number of Books: 810


#### Cosine Similiarity: Measuring distance between datapoints

In [28]:
from sklearn.metrics.pairwise import cosine_similarity

In [29]:
similarity_scores = cosine_similarity(final_book_pt)
similarity_scores[0]
similarity_scores.shape

(706, 706)

number of items: 706

Using the recommend()-function, you can select a book based on your preference. After you give in the book-name, the recommend()-function will generate new recommended books that are highly similar to your input book.

the recommend()-function used in the recommend.html file

In [68]:
# create function --> suggest 5 books
def recommend(book_name): 
  # index fetch
  index = np.where(final_book_pt.index==book_name)[0][0]
  similar_items = sorted(list(enumerate(similarity_scores[index])), key=lambda x: x[1], reverse=True)[1:6]
  
  data = []
  # iterate items
  for items in similar_items: 
    item = []
    # print(items[0])
    # print(final_book_pt.index[items[0]])
    # print(f"{items[0]} - {final_book_pt.index[items[0]]}")
    temp_df = books[books['Book-Title']== final_book_pt.index[items[0]]]
    item.extend(list(temp_df.drop_duplicates("Book-Title")['Book-Title'].values))
    item.extend(list(temp_df.drop_duplicates("Book-Title")['Book-Author'].values))
    item.extend(list(temp_df.drop_duplicates("Book-Title")["Image-URL-M"].values))

    data.append(item)

  return data

In [69]:
# recommend("1984")
recommend("Harry Potter and the Prisoner of Azkaban (Book 3)")

[['Harry Potter and the Goblet of Fire (Book 4)',
  'J. K. Rowling',
  'http://images.amazon.com/images/P/0439139597.01.MZZZZZZZ.jpg'],
 ['Harry Potter and the Chamber of Secrets (Book 2)',
  'J. K. Rowling',
  'http://images.amazon.com/images/P/0439064872.01.MZZZZZZZ.jpg'],
 ['Harry Potter and the Order of the Phoenix (Book 5)',
  'J. K. Rowling',
  'http://images.amazon.com/images/P/043935806X.01.MZZZZZZZ.jpg'],
 ["Harry Potter and the Sorcerer's Stone (Book 1)",
  'J. K. Rowling',
  'http://images.amazon.com/images/P/0590353403.01.MZZZZZZZ.jpg'],
 ["Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))",
  'J. K. Rowling',
  'http://images.amazon.com/images/P/059035342X.01.MZZZZZZZ.jpg']]

In [32]:
final_book_pt.index[545]

"The Handmaid's Tale"

In [33]:
sorted(list(enumerate(similarity_scores[0])))
sorted(list(enumerate(similarity_scores[0])), key=lambda x: x[1])
sorted(list(enumerate(similarity_scores[0])), key=lambda x: x[1], reverse=True)
sorted(list(enumerate(similarity_scores[0])), key=lambda x: x[1], reverse=True)[1:6]

[(47, np.float64(0.2702651417103732)),
 (545, np.float64(0.2639619371123497)),
 (82, np.float64(0.23669374347400993)),
 (634, np.float64(0.23299389358170394)),
 (551, np.float64(0.2262639743141286))]

In [34]:
# retrieve index place of item
np.where(final_book_pt.index=='1984')[0][0]             # np.int64(0)
np.where(final_book_pt.index=='4 Blondes')[0][0]        # np.int64(3)
np.where(final_book_pt.index=='Year of Wonders')[0][0]  # np.int64(701)

np.int64(701)

### Finding elements from Recommendation System Data for Web Design

In [35]:
popular_df["Image-URL-M"][0]

'http://images.amazon.com/images/P/0439136350.01.MZZZZZZZ.jpg'

### Export notebook for model deployment

In [36]:
import pickle


In [None]:
# pickle file 1
pickle.dump(popular_df, open('popular.pkl', 'wb'))

In [38]:
popular_df.head()
popular_df["Image-URL-M"][0]

'http://images.amazon.com/images/P/0439136350.01.MZZZZZZZ.jpg'

In [None]:
# pickle file 2: 
pickle.dump(final_book_pt, open('book_pt.pkl', 'wb'))
pickle.dump(books, open('books.pkl', 'wb'))
pickle.dump(similarity_scores, open("similarity_score.pkl", 'wb'))