## Implementation of two kinds of systems in Collaborative filtering
### - Item-Item
### - User-User
### Notice: The implementation of User-Item model is in the "Book_Recommender_User_Item_Base" file as an extra score :)

In [1]:
#! pip install scikit-surprise

In [2]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.metrics import pairwise_distances

### Reading datasets

In [3]:
ratings = pd.read_csv('BX-CSV-Dump/BX-Book-Ratings.csv', sep=';', error_bad_lines=False, encoding='latin-1')
items = pd.read_csv('BX-CSV-Dump/BX-Books.csv', sep=';', error_bad_lines=False, encoding='latin-1' ,warn_bad_lines=False)
users = pd.read_csv('BX-CSV-Dump/BX-Users.csv', sep=';', error_bad_lines=False, encoding='latin-1' ,warn_bad_lines=False)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


### Merging the datasets into one table

In [4]:
ratings = pd.merge(ratings, items, on='ISBN')
ratings = pd.merge(ratings, users, on='User_ID')
ratings = ratings.drop(['Image_URL-S', 'Image_URL_M', 'Image_URL_L'], axis=1)

ratings.head()

Unnamed: 0,User_ID,ISBN,Book_Rating,Book_Title,Book_Author,Year_Of_Publication,Publisher,Location,Age
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,"tyler, texas, usa",
1,2313,034545104X,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,"cincinnati, ohio, usa",23.0
2,2313,0812533550,9,Ender's Game (Ender Wiggins Saga (Paperback)),Orson Scott Card,1986,Tor Books,"cincinnati, ohio, usa",23.0
3,2313,0679745580,8,In Cold Blood (Vintage International),TRUMAN CAPOTE,1994,Vintage,"cincinnati, ohio, usa",23.0
4,2313,0060173289,9,Divine Secrets of the Ya-Ya Sisterhood : A Novel,Rebecca Wells,1996,HarperCollins,"cincinnati, ohio, usa",23.0


In [5]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1031136 entries, 0 to 1031135
Data columns (total 9 columns):
 #   Column               Non-Null Count    Dtype  
---  ------               --------------    -----  
 0   User_ID              1031136 non-null  int64  
 1   ISBN                 1031136 non-null  object 
 2   Book_Rating          1031136 non-null  int64  
 3   Book_Title           1031136 non-null  object 
 4   Book_Author          1031135 non-null  object 
 5   Year_Of_Publication  1031136 non-null  object 
 6   Publisher            1031134 non-null  object 
 7   Location             1031136 non-null  object 
 8   Age                  753301 non-null   float64
dtypes: float64(1), int64(2), object(6)
memory usage: 78.7+ MB


### To reduce the dimensionality of the data set, and avoid running into “memory error”, try to downsample it following rules

In [6]:
min_book_ratings = 50
filter_books = ratings['ISBN'].value_counts() > min_book_ratings
filter_books = filter_books[filter_books].index.tolist()

min_user_ratings = 50
filter_users = ratings['User_ID'].value_counts() > min_user_ratings
filter_users = filter_users[filter_users].index.tolist()

print('The original data frame shape:\t{}'.format(ratings.shape))
ratings = ratings[(ratings['ISBN'].isin(filter_books)) & (ratings['User_ID'].isin(filter_users))]
print('The new data frame shape:\t{}'.format(ratings.shape))

The original data frame shape:	(1031136, 9)
The new data frame shape:	(137573, 9)


### Do label encoding in order to works with numberic indexs

In [7]:
label_encoder = preprocessing.LabelEncoder()
ratings['ISBN']= label_encoder.fit_transform(ratings['ISBN'])
ratings['User_ID']= label_encoder.fit_transform(ratings['User_ID'])

ratings.head()

Unnamed: 0,User_ID,ISBN,Book_Rating,Book_Title,Book_Author,Year_Of_Publication,Publisher,Location,Age
37,53,451,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,"strafford, missouri, usa",34.0
38,53,280,0,The Lovely Bones: A Novel,Alice Sebold,2002,"Little, Brown","strafford, missouri, usa",34.0
39,53,647,0,The Da Vinci Code,Dan Brown,2003,Doubleday,"strafford, missouri, usa",34.0
40,53,2028,0,Wild Animus,Rich Shapero,2004,Too Far,"strafford, missouri, usa",34.0
41,53,241,0,Four To Score (A Stephanie Plum Novel),Janet Evanovich,1999,St. Martin's Paperbacks,"strafford, missouri, usa",34.0


### Now extract the uniques of the users and items for making the similarity matrix 

In [8]:
n_users = ratings['User_ID'].unique().shape[0]
n_items = ratings['ISBN'].unique().shape[0]

n_users, n_items

(2954, 2101)

## Collaborative filtering
### Item-Item---User-User

### The rating matrix for items and users

In [9]:
data_matrix = np.zeros((n_users, n_items))

for line in ratings.itertuples():
    data_matrix[line[1] -1, line[2] -1] = line[3]
    
data_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

### Calculate the distance of items using 'Cosine' and 'Pearson' methods

In [10]:
item_similarity_cosine = pairwise_distances(data_matrix.T, metric='cosine')
item_similarity_pearson = pairwise_distances(data_matrix.T, metric='correlation') #TODO: check it!

item_similarity_cosine

array([[0.        , 1.        , 1.        , ..., 1.        , 1.        ,
        1.        ],
       [1.        , 0.        , 0.86679994, ..., 1.        , 1.        ,
        0.9254838 ],
       [1.        , 0.86679994, 0.        , ..., 0.95020725, 0.92281476,
        0.84484368],
       ...,
       [1.        , 1.        , 0.95020725, ..., 0.        , 0.73065115,
        0.87855551],
       [1.        , 1.        , 0.92281476, ..., 0.73065115, 0.        ,
        0.81174524],
       [1.        , 0.9254838 , 0.84484368, ..., 0.87855551, 0.81174524,
        0.        ]])

### Calculate the distance of users using 'Cosine' and 'Pearson' methods

In [11]:
user_similarity_cosine = pairwise_distances(data_matrix, metric='cosine')
user_similarity_pearson = pairwise_distances(data_matrix, metric='correlation')

user_similarity_cosine

array([[0.        , 0.8846225 , 1.        , ..., 1.        , 0.9169177 ,
        1.        ],
       [0.8846225 , 0.        , 1.        , ..., 1.        , 1.        ,
        1.        ],
       [1.        , 1.        , 0.        , ..., 0.9447847 , 1.        ,
        0.93628812],
       ...,
       [1.        , 1.        , 0.9447847 , ..., 0.        , 0.91532092,
        0.95179866],
       [0.9169177 , 1.        , 1.        , ..., 0.91532092, 0.        ,
        1.        ],
       [1.        , 1.        , 0.93628812, ..., 0.95179866, 1.        ,
        0.        ]])

## Let's test what we did

### Now define a function to get a book name and returns the most similar ones 

In [12]:
def book_recommender(book_name, similarity_matrix):
    
    book_id = ratings[ratings['Book_Title'] == book_name]['ISBN'].tolist()[0]
    
    score = similarity_matrix[book_id]
    
    sorted_score = sorted(score, key=lambda x : x, reverse=True)
    
    book_indexes = [list(score).index(x) for x in sorted_score]
    
    return ratings['Book_Title'].iloc[book_indexes]

### Then define a function to get a user ID and returns the most similar users

In [13]:
def user_recommender(user_id, similarity_matrix):
    
    score = similarity_matrix[user_id]
    score = list(filter(lambda x: str(x) != 'nan', score))

    sorted_score = sorted(score, key=lambda x : x, reverse=True)
    user_indexes = [list(score).index(x) for x in sorted_score]
    
    return ratings['User_ID'].iloc[user_indexes]

### Here we go :) 

### The top 10 books which have most rate similarity to the input book

#### Based on cosine distance

In [14]:
book_recommender('The Lovely Bones: A Novel', item_similarity_cosine).unique().tolist()[1:11]

['The Last Juror',
 'American Gods: A Novel',
 'A Patchwork Planet',
 'Now You See Me',
 'Gone with the Wind',
 'Isle of Dogs',
 'The Five People You Meet in Heaven',
 'Must Love Dogs',
 'Beach Music',
 'Pearl in the Mist (Landry)']

#### Based on pearson distance

In [15]:
book_recommender('The Lovely Bones: A Novel', item_similarity_pearson).unique().tolist()[1:11]

['Split Second',
 'Ruby (Landry)',
 'Coraline',
 'Blow Fly: A Scarpetta Novel',
 '2nd Chance',
 'The Nanny Diaries: A Novel',
 'Five Quarters of the Orange',
 'The Last Juror',
 'A Day Late and a Dollar Short',
 "The Pilot's Wife : A Novel"]

### The top 10 users which have most rate similarity to the input user

#### Based on cosine distance

In [16]:
user_recommender(53, user_similarity_cosine).unique().tolist()[1:11]

[2184, 273, 812, 1288, 860, 1414, 2778, 2047, 2396, 1028]

#### Based on pearson distance

In [17]:
user_recommender(273, user_similarity_pearson).unique().tolist()[1:11]

[1414, 2397, 1288, 812, 860, 1343, 217, 1028, 53, 2396]

### Yohaa :)