In [3]:
#importing the packages
import pandas as pd
import sys
from scipy import sparse 
from sklearn.metrics.pairwise import pairwise_distances, cosine_similarity

#importing warnings to turn off future warnings
import warnings
warnings.simplefilter(action='ignore')

In [10]:
#reading in the data
explicit_ratings = pd.read_csv('explicit_ratings.csv')
explicit_ratings.head()

Unnamed: 0.1,Unnamed: 0,user_id,book_id,rating,goodreads_book_id,tag_id,count,tag_name.x,genre.x,tag_name.y,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,0,2886,1,5,2767052,11305,10836,fantasy,fantasy,fantasy,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,1,4,2,5,3,11305,47478,fantasy,fantasy,fantasy,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,2,451,3,5,41865,11305,14288,fantasy,fantasy,fantasy,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,3,15,4,3,2657,14487,4735,historical-fiction,historical,historical-fiction,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,4,4,5,4,4671,14487,2394,historical-fiction,historical,historical-fiction,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [6]:
for col in explicit_ratings.columns: 
    print(col)

Unnamed: 0
user_id
book_id
rating
goodreads_book_id
tag_id
count
tag_name.x
genre.x
tag_name.y
genre.y
best_book_id
work_id
books_count
isbn
isbn13
authors
original_publication_year
original_title
title
language_code
average_rating
ratings_count
work_ratings_count
work_text_reviews_count
ratings_1
ratings_2
ratings_3
ratings_4
ratings_5
image_url
small_image_url


In [11]:
#Dropping unneeded columns
explicit_ratings.drop(['Unnamed: 0','goodreads_book_id', 'tag_id', 'count', 'tag_name.x','tag_name.y','genre.y', 'best_book_id', 'work_id', 'books_count','isbn','isbn13', 'original_publication_year', 'original_title', 'ratings_count','work_ratings_count','work_text_reviews_count','ratings_1','ratings_2','ratings_3','ratings_4','ratings_5','small_image_url','image_url','language_code'], 1,  inplace=True)

In [12]:
explicit_ratings.head()

Unnamed: 0,user_id,book_id,rating,genre.x,authors,title,average_rating
0,2886,1,5,fantasy,Suzanne Collins,"The Hunger Games (The Hunger Games, #1)",4.34
1,4,2,5,fantasy,"J.K. Rowling, Mary GrandPré",Harry Potter and the Sorcerer's Stone (Harry P...,4.44
2,451,3,5,fantasy,Stephenie Meyer,"Twilight (Twilight, #1)",3.57
3,15,4,3,historical,Harper Lee,To Kill a Mockingbird,4.25
4,4,5,4,historical,F. Scott Fitzgerald,The Great Gatsby,3.89


In [53]:
# # #data for pivot table
# #Creating a sample of users -who have more than 200 ratings
# user_counts = explicit_ratings['user_id'].value_counts()
# sample_ratings = explicit_ratings[explicit_ratings['user_id'].isin(user_counts[user_counts >= 3].index)]

In [54]:
#check the size and shape
sample_ratings.shape

(6027, 7)

## Pivot table

In [32]:
#seeing if my sample size is small enough to create a pivot table
pivot = explicit_ratings.pivot_table(index='title', columns='user_id', values='rating')

In [33]:
pivot.shape

(9940, 4149)

## Sparse matrix

In [34]:
#getting the size of the pivot file
sys.getsizeof(pivot)

330826830

In [35]:
#preprocessing step of converting nans to zeros.
pivot_sparse = sparse.csr_matrix(pivot.fillna(0))

In [36]:
#getting the updated file size
sys.getsizeof(pivot_sparse)

64

## Recommender

In [37]:
#setting up the recommender 
recommender = cosine_similarity(pivot_sparse)

In [38]:
#verifying the shape of the engine to make sure the numbers are the same
recommender.shape

(9940, 9940)

In [40]:
#creating a dataframe to bring the title names back into view
recommender_df = pd.DataFrame(recommender, columns=pivot.index, index=pivot.index)
recommender_df.head(10)

title,"Angels (Walsh Family, #3)","""حكايات فرغلي المستكاوي ""حكايتى مع كفر السحلاوية",#GIRLBOSS,'Salem's Lot,"'Tis (Frank McCourt, #2)","1,000 Places to See Before You Die",1/4 جرام,"10% Happier: How I Tamed the Voice in My Head, Reduced Stress Without Losing My Edge, and Found Self-Help That Actually Works","100 Bullets, Vol. 1: First Shot, Last Call",100 Love Sonnets,...,ماهی سیاه کوچولو,مخطوطة بن إسحاق: مدينة الموتى,نادي السيارات,هشت کتاب,هيبتا,واحة الغروب,يوتوبيا,ڤيرتيجو,キスよりも早く1 [Kisu Yorimo Hayaku 1] (Faster than a Kiss #1),美少女戦士セーラームーン新装版 1 [Bishōjo Senshi Sailor Moon Shinsōban 1]
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Angels (Walsh Family, #3)",1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""حكايات فرغلي المستكاوي ""حكايتى مع كفر السحلاوية",0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
#GIRLBOSS,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Salem's Lot,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"'Tis (Frank McCourt, #2)",0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"1,000 Places to See Before You Die",0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1/4 جرام,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"10% Happier: How I Tamed the Voice in My Head, Reduced Stress Without Losing My Edge, and Found Self-Help That Actually Works",0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"100 Bullets, Vol. 1: First Shot, Last Call",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100 Love Sonnets,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Evaluation of the Recommender Engine

In [41]:
#setting the columns so we can see the full titles
pd.set_option('display.max_colwidth', -1)

In [49]:
#Code to search for titles 
q = 'Interview with the Vampire'
explicit_ratings[explicit_ratings['title'].str.contains(q)]['title'].head()

203     Interview with the Vampire (The Vampire Chronicles, #1)                                                
9021    Vampire Chronicles: Interview with the Vampire, The Vampire Lestat, The Queen of the Damned (Anne Rice)
Name: title, dtype: object

In [47]:
#Looking up recommendations for those who liked HP:
recommender_df["Harry Potter and the Sorcerer's Stone (Harry Potter, #1)"].sort_values(ascending=False)[1:11]

title
The Sun Also Rises                                             1.0
Life of Pi                                                     1.0
Charlie and the Chocolate Factory (Charlie Bucket, #1)         1.0
Alexander and the Terrible, Horrible, No Good, Very Bad Day    1.0
Alas, Babylon                                                  1.0
The Catcher in the Rye                                         1.0
The Ultimate Hitchhiker's Guide to the Galaxy                  1.0
The Count of Monte Cristo                                      1.0
The Giver (The Giver, #1)                                      1.0
A Confederacy of Dunces                                        1.0
Name: Harry Potter and the Sorcerer's Stone (Harry Potter, #1), dtype: float64

In [50]:
#Looking up recommendations for those who liked Interview with the Vampire:
recommender_df['Interview with the Vampire (The Vampire Chronicles, #1)'].sort_values(ascending=False)[1:11]

title
Thinner                                                       1.0
The Door to December                                          1.0
From a Buick 8                                                1.0
Pandora / Vittorio the Vampire (New Tales of the Vampires)    1.0
Dirk Gently's Holistic Detective Agency (Dirk Gently #1)      1.0
Pandora (New Tales of the Vampires, #1)                       1.0
Clear and Present Danger (Jack Ryan Universe, #6)             1.0
The Velveteen Rabbit                                          1.0
The Sum of All Fears (Jack Ryan Universe, #7)                 1.0
The Lost World (Professor Challenger, #1)                     1.0
Name: Interview with the Vampire (The Vampire Chronicles, #1), dtype: float64