## Implementation of User-Item model of Collaborative filtering Recommenders
### Notice: The Item-Item and User-User models are implemented in "Book_Recommender_Original" file

In [1]:
#! pip install scikit-surprise

In [2]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.metrics import pairwise_distances

### Reading datasets

In [3]:
ratings = pd.read_csv('BX-CSV-Dump/BX-Book-Ratings.csv', sep=';', error_bad_lines=False, encoding='latin-1')
items = pd.read_csv('BX-CSV-Dump/BX-Books.csv', sep=';', error_bad_lines=False, encoding='latin-1' ,warn_bad_lines=False)
users = pd.read_csv('BX-CSV-Dump/BX-Users.csv', sep=';', error_bad_lines=False, encoding='latin-1' ,warn_bad_lines=False)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


### Merging the datasets into one table

In [4]:
ratings = pd.merge(ratings, items, on='ISBN')
ratings = pd.merge(ratings, users, on='User_ID')
ratings = ratings.drop(['Image_URL-S', 'Image_URL_M', 'Image_URL_L'], axis=1)

ratings.head()

Unnamed: 0,User_ID,ISBN,Book_Rating,Book_Title,Book_Author,Year_Of_Publication,Publisher,Location,Age
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,"tyler, texas, usa",
1,2313,034545104X,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,"cincinnati, ohio, usa",23.0
2,2313,0812533550,9,Ender's Game (Ender Wiggins Saga (Paperback)),Orson Scott Card,1986,Tor Books,"cincinnati, ohio, usa",23.0
3,2313,0679745580,8,In Cold Blood (Vintage International),TRUMAN CAPOTE,1994,Vintage,"cincinnati, ohio, usa",23.0
4,2313,0060173289,9,Divine Secrets of the Ya-Ya Sisterhood : A Novel,Rebecca Wells,1996,HarperCollins,"cincinnati, ohio, usa",23.0


In [5]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1031136 entries, 0 to 1031135
Data columns (total 9 columns):
 #   Column               Non-Null Count    Dtype  
---  ------               --------------    -----  
 0   User_ID              1031136 non-null  int64  
 1   ISBN                 1031136 non-null  object 
 2   Book_Rating          1031136 non-null  int64  
 3   Book_Title           1031136 non-null  object 
 4   Book_Author          1031135 non-null  object 
 5   Year_Of_Publication  1031136 non-null  object 
 6   Publisher            1031134 non-null  object 
 7   Location             1031136 non-null  object 
 8   Age                  753301 non-null   float64
dtypes: float64(1), int64(2), object(6)
memory usage: 78.7+ MB


### To reduce the dimensionality of the data set, and avoid running into “memory error”, try to downsample it following rules

In [6]:
min_book_ratings = 50
filter_books = ratings['ISBN'].value_counts() > min_book_ratings
filter_books = filter_books[filter_books].index.tolist()

min_user_ratings = 50
filter_users = ratings['User_ID'].value_counts() > min_user_ratings
filter_users = filter_users[filter_users].index.tolist()

print('The original data frame shape:\t{}'.format(ratings.shape))
ratings = ratings[(ratings['ISBN'].isin(filter_books)) & (ratings['User_ID'].isin(filter_users))]
print('The new data frame shape:\t{}'.format(ratings.shape))

The original data frame shape:	(1031136, 9)
The new data frame shape:	(137573, 9)


### Do label encoding in order to works with numberic indexs

In [7]:
label_encoder = preprocessing.LabelEncoder()
ratings['ISBN']= label_encoder.fit_transform(ratings['ISBN'])
ratings['User_ID']= label_encoder.fit_transform(ratings['User_ID'])

ratings.head()

Unnamed: 0,User_ID,ISBN,Book_Rating,Book_Title,Book_Author,Year_Of_Publication,Publisher,Location,Age
37,53,451,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,"strafford, missouri, usa",34.0
38,53,280,0,The Lovely Bones: A Novel,Alice Sebold,2002,"Little, Brown","strafford, missouri, usa",34.0
39,53,647,0,The Da Vinci Code,Dan Brown,2003,Doubleday,"strafford, missouri, usa",34.0
40,53,2028,0,Wild Animus,Rich Shapero,2004,Too Far,"strafford, missouri, usa",34.0
41,53,241,0,Four To Score (A Stephanie Plum Novel),Janet Evanovich,1999,St. Martin's Paperbacks,"strafford, missouri, usa",34.0


### Now extract the uniques of the users and items for making the similarity matrix 

In [8]:
n_users = ratings['User_ID'].unique().shape[0]
n_items = ratings['ISBN'].unique().shape[0]

n_users, n_items

(2954, 2101)

### Ok now do group by on 'User_ID' in order to get the books which each of users have read

In [9]:
user_books = ratings.groupby(by=['User_ID'])
user_books.first()

Unnamed: 0_level_0,ISBN,Book_Rating,Book_Title,Book_Author,Year_Of_Publication,Publisher,Location,Age
User_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,1174,6,Manhattan Hunt Club,JOHN SAUL,2002,Ballantine Books,"arden hills, minnesota, usa",
1,1237,8,The Dark Half,Stephen King,1994,Signet Book,"minneapolis, minnesota, usa",24.0
2,1932,0,"Artemis Fowl (Artemis Fowl, Book 1)",Eoin Colfer,2002,Miramax Kids,"dumas, arkansas, usa",
3,1056,0,The Notebook,Nicholas Sparks,1996,Warner Books,"san diego, california, usa",20.0
4,459,0,The Wasp Factory,Iain Banks,0,Abacus,"grenoble, rhone-alpes, france",23.0
...,...,...,...,...,...,...,...,...
2949,957,9,The Street Lawyer,JOHN GRISHAM,1999,Dell,"lake george, new york, usa",34.0
2950,1056,0,The Notebook,Nicholas Sparks,1996,Warner Books,"omaha, nebraska, usa",
2951,241,0,Four To Score (A Stephanie Plum Novel),Janet Evanovich,1999,St. Martin's Paperbacks,"slidell, louisiana, usa",
2952,669,0,The Second Summer of the Sisterhood,ANN BRASHARES,2003,Delacorte Books for Young Readers,"sandy, utah, usa",


### The similarity matrix for

In [10]:
items_matrix = np.zeros((n_users, n_items))

for grp, pdf in user_books:
    book_cage = pdf['ISBN'].tolist()
    for book in book_cage:
        items_matrix[grp, book] = 1
        
items_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

### Calculate the distance of items using 'Cosine' and 'Pearson' methods

In [11]:
item_similarity_cosine = pairwise_distances(items_matrix.T, metric='cosine')
item_similarity_pearson = pairwise_distances(items_matrix.T, metric='correlation') #TODO: check it!

item_similarity_cosine

array([[0.        , 0.97763932, 0.95      , ..., 0.95232687, 0.95435645,
        0.95232687],
       [0.97763932, 0.        , 0.95527864, ..., 1.        , 1.        ,
        1.        ],
       [0.95      , 0.95527864, 0.        , ..., 1.        , 1.        ,
        1.        ],
       ...,
       [0.95232687, 1.        , 1.        , ..., 0.        , 0.65184469,
        0.81818182],
       [0.95435645, 1.        , 1.        , ..., 0.65184469, 0.        ,
        0.82592234],
       [0.95232687, 1.        , 1.        , ..., 0.81818182, 0.82592234,
        0.        ]])

## Let's test what we did

### Now define a function to get a book name and returns the most similar ones 

In [12]:
def book_recommender(book_name, similarity_matrix):
    
    book_id = ratings[ratings['Book_Title'] == book_name]['ISBN'].tolist()[0]
    
    score = similarity_matrix[book_id]
    
    sorted_score = sorted(score, key=lambda x : x, reverse=True)
    
    book_indexes = [list(score).index(x) for x in sorted_score]
    
    return ratings['Book_Title'].iloc[book_indexes]

### Here we go :) 
### The top 10 books which have most rate similarity to the input book

In [13]:
book_recommender('The Lovely Bones: A Novel', item_similarity_cosine).unique().tolist()[1:11]

['Eyes of the Dragon',
 'The Green Mile',
 'The Return of the King (The Lord of the Rings, Part 3)',
 "Open House (Oprah's Book Club (Paperback))",
 "Bridget Jones's Diary",
 'A Is for Alibi (Kinsey Millhone Mysteries (Paperback))',
 'Notes from a Small Island',
 'Diary of a Mad Bride (Summer Display Opportunity)',
 'She Walks These Hills',
 "Left Behind: A Novel of the Earth's Last Days (Left Behind No. 1)"]