# Project 4 - Books Recommendation using cosine similarity
Approach: collaborative filtering, item-based.

In [19]:
# Dependencies
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
import sklearn as sk
import pickle as pk

### Data preparation

In [9]:
# Read the raw CSV resources into dataframes: the book catalogue first, then the user ratings
books_df_original, ratings_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./Resources/Books.csv', './Resources/Ratings.csv')
)

  books_df_original = pd.read_csv('./Resources/Books.csv')


In [10]:
# Filter out data with no publication year.
# NOTE(review): the output of the load cell looks like a pandas warning raised while reading
# Books.csv; if that is a mixed-types warning for 'Year-Of-Publication', the column holds both
# numbers and strings, and a plain `!= 0` comparison lets the string '0' through — confirm
# against the data.
# The comparison is therefore done on a numeric copy of the column: 0 and '0' are treated alike
# and values that are not numbers at all count as "no year" too. The stored values are unchanged.
books_df = books_df_original.loc[
    lambda books: pd.to_numeric(books['Year-Of-Publication'], errors='coerce').fillna(0) != 0
]

In [11]:
# Keep a single catalogue record per ISBN (the first occurrence wins)
books_df = books_df.drop_duplicates(subset='ISBN', keep='first')

In [12]:
# Make sure 'Book-Rating' holds numbers; values that cannot be parsed become NaN
ratings_df['Book-Rating'] = ratings_df['Book-Rating'].pipe(pd.to_numeric, errors='coerce')

In [13]:
# Build the title/user/rating table in one chain:
#   1. inner-join the catalogue with the ratings on ISBN, so only rated books with title info remain
#   2. drop the catalogue columns that are not needed, leaving title, user and rating data
#   3. drop rows with missing values
#   4. drop rows that are identical in every remaining column (e.g. the same user giving the
#      same rating to several editions that share a title)
ratings_df = (
    books_df
    .merge(ratings_df, on='ISBN', how='inner')
    .drop(columns=[
        'ISBN',
        'Book-Author',
        'Year-Of-Publication',
        'Publisher',
        'Image-URL-S',
        'Image-URL-M',
        'Image-URL-L',
    ])
    .dropna()
    .drop_duplicates()
)

### Filter the data so that only users and books with enough ratings are left

In [14]:
# Thresholds for the filtering step and the size of the recommendation list
min_books_rated_by_user = 50      # a user is kept only when they rated more books than this
min_rates_received_by_book = 25   # a title is kept only when it received more ratings than this
top_X_recommendations = 5         # how many similar books are looked up per request

In [15]:
# number of ratings each user has given
groupped_r_users = ratings_df.groupby('User-ID')['Book-Rating'].count()

# number of ratings each book title has received
groupped_r_books = ratings_df.groupby('Book-Title')['User-ID'].count()

# titles that received strictly more than min_rates_received_by_book ratings
titles_with_acceptable_rates_count = list(
    groupped_r_books.loc[lambda counts: counts > min_rates_received_by_book].index
)

# users (User-ID) who rated strictly more than min_books_rated_by_user books
user_ids_with_acceptable_books_count_rated = list(
    groupped_r_users.loc[lambda counts: counts > min_books_rated_by_user].index
)

# keep only the ratings where both the title and the user passed the thresholds above
rating_input_df = ratings_df.loc[
    lambda ratings: ratings['Book-Title'].isin(titles_with_acceptable_rates_count)
    & ratings['User-ID'].isin(user_ids_with_acceptable_books_count_rated)
]



In [16]:
def recommend_me_books_cos_sim(book_title, rating_input_df, top_n=None, books_info_df=None):
    """Recommend the books whose rating patterns are most similar to ``book_title``.

    Item-based collaborative filtering: every title is represented by the vector of
    ratings it received from each user, and titles are compared by cosine similarity.

    Parameters
    ----------
    book_title
        Title to find recommendations for; must occur in ``rating_input_df['Book-Title']``.
    rating_input_df : pandas.DataFrame
        Ratings with the columns 'Book-Title', 'User-ID' and 'Book-Rating'.
    top_n : int, optional
        Number of most similar titles to look up. Defaults to the notebook-level
        ``top_X_recommendations``.
    books_info_df : pandas.DataFrame, optional
        Book catalogue with (at least) the columns 'Book-Title' and
        'Year-Of-Publication', used to attach book details to the result.
        Defaults to the notebook-level ``books_df``.

    Returns
    -------
    pandas.DataFrame
        One row per recommended title, most similar first, with the columns
        'similarity rate' and 'Book-Title' followed by the remaining catalogue
        columns of the title's most recently published edition. Titles without a
        usable (numeric, non-zero) publication year in the catalogue are left out,
        so fewer than ``top_n`` rows can be returned.

    Raises
    ------
    KeyError
        If ``book_title`` does not occur in ``rating_input_df``.
    """
    # fall back to the notebook-level settings when the caller does not override them
    if top_n is None:
        top_n = top_X_recommendations
    if books_info_df is None:
        books_info_df = books_df

    # rows = titles, columns = users, values = rating (pivot_table averages repeated ratings);
    # a missing rating is filled with 0, assuming it means the user has no interest in the book
    title_user_ratings = rating_input_df.pivot_table(
        index='Book-Title', columns='User-ID', values='Book-Rating'
    ).fillna(0)

    if book_title not in title_user_ratings.index:
        raise KeyError(f"'{book_title}' is not among the titles in rating_input_df")

    # only the similarities of the requested title are needed, so compare that single
    # row against all rows instead of building the full title-by-title matrix
    similarity_to_title = pd.Series(
        cosine_similarity(
            title_user_ratings.loc[[book_title]].to_numpy(),
            title_user_ratings.to_numpy(),
        )[0],
        index=title_user_ratings.index,
        name='similarity rate',
    )

    # most similar titles first, excluding the requested title itself;
    # a stable sort keeps the outcome deterministic when similarities tie
    top_recommendations = (
        similarity_to_title
        .drop(book_title)
        .sort_values(ascending=False, kind='stable')
        .iloc[:top_n]
        .rename_axis('Book-Title')
        .reset_index()[['similarity rate', 'Book-Title']]
    )

    # attach catalogue details; a title can match several editions (one row each)
    recommendations_full_info = pd.merge(
        top_recommendations, books_info_df, on='Book-Title', how='left'
    )

    # keep only the most recently published edition of every title. Years are compared
    # on a numeric copy so that a column mixing numbers and strings cannot break the
    # comparison; the values in the returned column are left as they are.
    publication_year = pd.to_numeric(
        recommendations_full_info['Year-Of-Publication'], errors='coerce'
    )
    latest_year = publication_year.groupby(
        recommendations_full_info['Book-Title']
    ).transform('max')
    is_latest_edition = publication_year.eq(latest_year) & publication_year.ne(0)

    # editions sharing the latest year are collapsed to the first one
    return recommendations_full_info[is_latest_edition].drop_duplicates(subset=['Book-Title'])


In [17]:
# Example: look up the books most similar to '1st to Die: A Novel'
recommend_me_books_cos_sim(book_title='1st to Die: A Novel', rating_input_df=rating_input_df)

Unnamed: 0,similarity rate,Book-Title,ISBN,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
1,0.249912,Pop Goes the Weasel,446608815,James Patterson,2000,Warner Vision,http://images.amazon.com/images/P/0446608815.0...,http://images.amazon.com/images/P/0446608815.0...,http://images.amazon.com/images/P/0446608815.0...
5,0.22593,Along Came a Spider (Alex Cross Novels),446692638,James Patterson,2003,Warner Books,http://images.amazon.com/images/P/0446692638.0...,http://images.amazon.com/images/P/0446692638.0...,http://images.amazon.com/images/P/0446692638.0...
9,0.215181,Kiss the Girls,446677388,James Patterson,2000,Warner Books,http://images.amazon.com/images/P/0446677388.0...,http://images.amazon.com/images/P/0446677388.0...,http://images.amazon.com/images/P/0446677388.0...
11,0.212501,Roses Are Red (Alex Cross Novels),446605484,James Patterson,2001,Warner Vision,http://images.amazon.com/images/P/0446605484.0...,http://images.amazon.com/images/P/0446605484.0...,http://images.amazon.com/images/P/0446605484.0...
12,0.211548,Blood Test (Alex Delaware Novels (Paperback)),553569635,Jonathan Kellerman,1995,Bantam Books,http://images.amazon.com/images/P/0553569635.0...,http://images.amazon.com/images/P/0553569635.0...,http://images.amazon.com/images/P/0553569635.0...


In [22]:
# Persist the filtered ratings; the `with` block closes the file
# as soon as the dump is done instead of leaving the handle open
with open('rating_input.pkl', 'wb') as rating_input_file:
    pk.dump(rating_input_df, rating_input_file)

In [23]:
# Persist the cleaned book catalogue; the `with` block closes the file
# as soon as the dump is done instead of leaving the handle open
with open('books_df.pkl', 'wb') as books_df_file:
    pk.dump(books_df, books_df_file)