This is a toy example of how we can apply a simple filtering approach to item meta-data in order to generate recommendations.

We will load the file `books_n_description.csv`.
Then, we will filter out all records that are missing either a publication year or a category.
Next, we will generate a random recommendation based on a naive approach.

# Imports

In [None]:
import pandas as pd
import random
import os

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive, the root of the dataset PATH used by this notebook
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Load data

In [None]:
# Make sure you have downloaded the dataset and created this folder; alternatively, change PATH to point at your own copy
PATH = '/content/drive/My Drive/Recommender Systems/Content Base/Datasets/Book-Crossing'
# Switch the working directory so the CSV files can be opened by bare file name
os.chdir(PATH)

In [None]:
# Load the book meta-data from the csv file.
# skiprows=1 drops the first line of the file and `names` supplies the column labels instead;
# the text is decoded as latin-1.
# NOTE(review): judging by the displayed output, the file seems to carry one more column than
# `names` lists (a leading row counter) which index_col=0 turns into the index - confirm.
df_item_info = pd.read_csv('books_n_description.csv',index_col=0, names=['ISBN', 'title', 'author', 'pub_year', 'publisher', 'category','description'],encoding='latin-1', skiprows=1)

# print the first 5 rows of the data
df_item_info.head()



Unnamed: 0,ISBN,title,author,pub_year,publisher,category,description
0,2005018,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,Actresses,"In a small town in Canada, Clara Callan reluct..."
1,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999.0,Farrar Straus Giroux,Medical,"Describes the great flu epidemic of 1918, an o..."
2,399135782,The Kitchen God's Wife,Amy Tan,1991.0,Putnam Pub Group,Fiction,A Chinese immigrant who is convinced she is dy...
3,440234743,The Testament,John Grisham,1999.0,Dell,Fiction,"A suicidal billionaire, a burnt-out Washington..."
4,452264464,Beloved (Plume Contemporary Fiction),Toni Morrison,1994.0,Plume,Fiction,Staring unflinchingly into the abyss of slaver...


In [None]:
# Keep only the books that have both a category and a publication year
# (a record missing either one is dropped)
df_item_filtered = df_item_info.dropna(subset=['category', 'pub_year']).copy()


In [None]:
# Naive approach: the recommender only looks at each book's category and
# publication year, so keep just those two columns, keyed by ISBN
df_item = df_item_filtered[['ISBN', 'category', 'pub_year']].set_index('ISBN')
df_item.head()


Unnamed: 0_level_0,category,pub_year
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1
2005018,Actresses,2001.0
374157065,Medical,1999.0
399135782,Fiction,1991.0
440234743,Fiction,1999.0
452264464,Fiction,1994.0


# Load rating data

In [None]:
# Read the raw ratings file (semicolon-separated, ISO-8859-1 encoded)
df_rating = pd.read_csv('BX-Book-Ratings.csv', sep=';', encoding="ISO-8859-1")

# Keep only the ratings of books that are present in df_item, and key them by ISBN
rated_known_book = df_rating['ISBN'].isin(df_item.index)
df_rating_filtered = df_rating.loc[rated_known_book].set_index('ISBN')
df_rating_filtered.head()




Unnamed: 0_level_0,User-ID,Book-Rating
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1
055356451X,276746,0
1853262404,276925,0
8408011200,276925,0
1883473004,277031,8
3442435838,277048,0


In [None]:
# Join the item data with the rating data: both frames are indexed by ISBN,
# so every rating row gains the category and pub_year of the rated book
df_merge = df_rating_filtered.join(df_item)
# print 5 sample rows
df_merge.head()

Unnamed: 0_level_0,User-ID,Book-Rating,category,pub_year
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
000648302X,11676,8,End of the world,1999.0
000648302X,37950,9,End of the world,1999.0
000648302X,89624,9,End of the world,1999.0
000648302X,98544,0,End of the world,1999.0
000648302X,99328,3,End of the world,1999.0


In [None]:
def recommand(userID, rating_threshold):
    """Return one randomly chosen book to recommend to ``userID``.

    Naive filtering approach. The candidate pool is built from the rating
    rows of ``df_merge`` that satisfy all of the following:

    1. the book's category is one of the categories the user has already read;
    2. the individual rating in that row is at or above ``rating_threshold``;
    3. the book's pub_year lies between the oldest and the newest pub_year
       the user has read (both bounds included);
    4. the book has not been read by the user already.

    One row of the pool is then drawn at random. The pool holds one entry per
    rating row, so a book with several qualifying ratings is more likely to be
    drawn than a book with only one.

        Parameters
        ----------
        userID : integer
            The user ID
        rating_threshold : integer
            A threshold number, ratings at or above this value are considered
            as possible candidates.

        Returns
        -------
        pandas.Series or None
            The row of ``df_item_info`` describing the chosen book (named by
            its ISBN) extended with a 'Book-Rating' entry, or None when no
            book passes the filters.

        Raises
        ------
        ValueError
            If ``userID`` has no rows in ``df_merge``; without a reading
            history there is nothing to derive the filters from.

        """
    # Compute the user's history once instead of re-evaluating the mask.
    user_rows = df_merge[df_merge['User-ID'] == userID]
    if user_rows.empty:
        raise ValueError('userID {} has no ratings in df_merge'.format(userID))

    user_categories = user_rows['category'].unique()
    min_year = user_rows['pub_year'].min()
    max_year = user_rows['pub_year'].max()

    candidates = df_merge[df_merge['category'].isin(user_categories)]
    # Naive: a single high rating from any one reader is enough to qualify a book.
    candidates = candidates[candidates['Book-Rating'] >= rating_threshold]
    # between() includes both bounds by default; the boolean form
    # inclusive=True is rejected by recent pandas releases.
    candidates = candidates[candidates['pub_year'].between(min_year, max_year)]
    candidates = candidates[~candidates.index.isin(user_rows.index)]

    if candidates.empty:
        print('we found zero books over the rating threshold. Lower your threshold to generate a recommendation')
        return None

    # A plain list keeps random.choice independent of how the interpreter
    # tests a NumPy array for emptiness.
    random_isbn = random.choice(candidates.index.tolist())
    book_data = df_item_info[df_item_info['ISBN'] == random_isbn].set_index('ISBN')
    book_data = book_data.join(candidates[['Book-Rating']], lsuffix='_caller', rsuffix='_other')
    # The join yields one row per qualifying rating of the book; report the first.
    return book_data.iloc[0]
    
    
    

In [None]:
# Show the first 5 rows of the full book table again for reference
df_item_info.head()

Unnamed: 0,ISBN,title,author,pub_year,publisher,category,description
0,2005018,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,Actresses,"In a small town in Canada, Clara Callan reluct..."
1,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999.0,Farrar Straus Giroux,Medical,"Describes the great flu epidemic of 1918, an o..."
2,399135782,The Kitchen God's Wife,Amy Tan,1991.0,Putnam Pub Group,Fiction,A Chinese immigrant who is convinced she is dy...
3,440234743,The Testament,John Grisham,1999.0,Dell,Fiction,"A suicidal billionaire, a burnt-out Washington..."
4,452264464,Beloved (Plume Contemporary Fiction),Toni Morrison,1994.0,Plume,Fiction,Staring unflinchingly into the abyss of slaver...


In [None]:
# Example: one random recommendation for user 276746, with a rating threshold of 6
recommand(userID = 276746, rating_threshold = 6)

title                                    Puerto Vallarta Squeeze
author                                       Robert James Waller
pub_year                                                    1995
publisher                                           Warner Books
category                                                 Fiction
description    The author of the blockbuster The Bridges of M...
Book-Rating                                                    8
Name: 044651747X, dtype: object

In [None]:
def recommand_rating_above_average(userID, rating_threshold_average):
    """Return one randomly chosen book to recommend to ``userID``.

    Naive filtering approach. The candidate pool is built from the rating
    rows of ``df_merge`` that satisfy all of the following:

    1. the book's category is one of the categories the user has already read;
    2. the book's pub_year lies between the oldest and the newest pub_year
       the user has read (both bounds included);
    3. the book has not been read by the user already.

    Since a single rating at or above a threshold is not enough, the remaining
    ratings are averaged per book and only books whose average rating is at or
    above ``rating_threshold_average`` stay in the pool. One of those books is
    then drawn at random, every book being equally likely.

    NOTE(review): every rating row counts towards the average, including
    ratings of 0 - confirm whether 0 is a real score in this dataset or
    should be excluded.

        Parameters
        ----------
        userID : integer
            The user ID
        rating_threshold_average : integer
            A threshold number, books whose average rating is at or above this
            value are considered as possible candidates.

        Returns
        -------
        pandas.Series or None
            The row of ``df_item_info`` describing the chosen book (named by
            its ISBN) extended with a 'Book-Rating' entry holding the book's
            average rating, or None when no book passes the filters.

        Raises
        ------
        ValueError
            If ``userID`` has no rows in ``df_merge``; without a reading
            history there is nothing to derive the filters from.

        """
    # Compute the user's history once instead of re-evaluating the mask.
    user_rows = df_merge[df_merge['User-ID'] == userID]
    if user_rows.empty:
        raise ValueError('userID {} has no ratings in df_merge'.format(userID))

    user_categories = user_rows['category'].unique()
    min_year = user_rows['pub_year'].min()
    max_year = user_rows['pub_year'].max()

    candidates = df_merge[df_merge['category'].isin(user_categories)]
    # between() includes both bounds by default; the boolean form
    # inclusive=True is rejected by recent pandas releases.
    candidates = candidates[candidates['pub_year'].between(min_year, max_year)]
    candidates = candidates[~candidates.index.isin(user_rows.index)]

    # Collapse to one row per book (the index level is ISBN) holding its mean rating.
    mean_ratings = candidates[['Book-Rating']].groupby(level='ISBN').mean()
    mean_ratings = mean_ratings[mean_ratings['Book-Rating'] >= rating_threshold_average]

    # Explicit check instead of a catch-all except, so that genuine errors
    # are no longer reported as "nothing to recommend".
    if mean_ratings.empty:
        print('we found zero books over the rating treshold. Lower you treshold for generate recommendation')
        return None

    # A plain list keeps random.choice independent of how the interpreter
    # tests a NumPy array for emptiness.
    random_isbn = random.choice(mean_ratings.index.tolist())
    book_data = df_item_info[df_item_info['ISBN'] == random_isbn].set_index('ISBN')
    book_data = book_data.join(mean_ratings[['Book-Rating']], lsuffix='_caller', rsuffix='_other')
    return book_data.iloc[0]
    
    

In [None]:
# Example: one random recommendation for user 276746 among books whose average rating is at least 6
recommand_rating_above_average(userID = 276746, rating_threshold_average = 6)

title                                                      Congo
author                                          Michael Crichton
pub_year                                                    1995
publisher                                       Ballantine Books
category                                                 Fiction
description    Armed with the latest gifts of advanced techno...
Book-Rating                                                  7.5
Name: 345378490, dtype: object

# Code Task
Write a different naive approach for recommending based on content data.
For example, build a profile for each user by aggregating their historical data.
Then, look for items that are similar to each user profile.