In [14]:
import numpy as np
import pandas as pd

In [15]:
# Import data
df = pd.read_csv('ratings_with_features.csv')

df.head()

Unnamed: 0,location,user_id,age,isbn,rating,book_title,book_author,year_of_publication,publisher,img_m,Summary,Category,num of ratings,num of ratings user
0,"n/a, n/a, n/a",11676,34.7439,399135782,9,The Kitchen God's Wife,Amy Tan,1991.0,Putnam Pub Group,http://images.amazon.com/images/P/0399135782.0...,A Chinese immigrant who is convinced she is dy...,['Fiction'],116.0,2191.0
1,"knoxville, tennessee, usa",29526,26.0,399135782,9,The Kitchen God's Wife,Amy Tan,1991.0,Putnam Pub Group,http://images.amazon.com/images/P/0399135782.0...,A Chinese immigrant who is convinced she is dy...,['Fiction'],116.0,62.0
2,"san antonio, texas, usa",46398,37.0,399135782,9,The Kitchen God's Wife,Amy Tan,1991.0,Putnam Pub Group,http://images.amazon.com/images/P/0399135782.0...,A Chinese immigrant who is convinced she is dy...,['Fiction'],116.0,132.0
3,"homer, alaska, usa",148712,34.7439,399135782,10,The Kitchen God's Wife,Amy Tan,1991.0,Putnam Pub Group,http://images.amazon.com/images/P/0399135782.0...,A Chinese immigrant who is convinced she is dy...,['Fiction'],116.0,19.0
4,"colorado springs, colorado, usa",230522,52.0,399135782,7,The Kitchen God's Wife,Amy Tan,1991.0,Putnam Pub Group,http://images.amazon.com/images/P/0399135782.0...,A Chinese immigrant who is convinced she is dy...,['Fiction'],116.0,102.0


In [16]:
# Extract the summary column for content based model
summary_non_duplicates = df[['Summary']].drop_duplicates()
summary_non_duplicates = pd.merge(df['book_title'].drop_duplicates(), summary_non_duplicates, left_index=True, right_index=True).reset_index(drop=True)

In [17]:
# Assign number of users and books to variables
n_users = (df['user_id'].nunique())
n_books = (df['book_title'].nunique())

print("{} users".format(n_users))
print("{} books".format(n_books))

2000 users
5007 books


In [18]:
# We need to zip the user_id and book_id as some of the indexes are much bigger than the shape of our matrix
users = dict(zip(df['user_id'].unique(), np.arange(0, n_users),))
books = dict(zip(df['book_title'].unique(), np.arange(0, n_books)))

In [19]:
# Create user-book_rating matrix with users as rows and books as columns
ratings = np.zeros((n_users, n_books))

for row in df.itertuples():    
    ratings[users[row[2]], books[row[6]]] = row[5]

ratings.shape

(2000, 5007)

In [20]:
# Lets check what is the sparsity of our matrix
sparsity = float(len(ratings.nonzero()[0]))
sparsity /= (ratings.shape[0] * ratings.shape[1])
sparsity *= 100
print('Sparsity: {:4.2f}%'.format(sparsity))
del sparsity
# The sparsity is low because the data set is huge
# and the average number of ratings per user is relatively low

Sparsity: 0.68%


# Collaborative filtering

In [34]:
from sklearn.model_selection import train_test_split
# Split data into train and test 
np.random.seed(42) 

train_data, test_data = train_test_split(ratings, test_size=0.25, random_state=42)

In [36]:
# Cosine similarity func
def similarity(ratings, kind='user', epsilon=1e-9):
    # epsilon -> small number for handling dived-by-zero errors
    if kind == 'user':
        sim = ratings.dot(ratings.T) + epsilon
    elif kind == 'item':
        sim = ratings.T.dot(ratings) + epsilon
    norms = np.array([np.sqrt(np.diagonal(sim))])
    return (sim / norms / norms.T)

In [37]:
from sklearn.metrics.pairwise import pairwise_distances

# Lets find user and item similarity by using cosine similarity
user_similarity = similarity(train_data, kind='user')
item_similarity = similarity(train_data, kind='item')

In [38]:
# Predict function
def predict(ratings, similarity, type='user'):
    if type =='user':
        # Here we need to normalize the weight of each user's rating
        # That is because each user may have different 'scale' of rating the book
        # e.g. Someone really liked the book so gave it 4, another person gave 5 as he kind of liked it
        
        # That is why we will take mean of each user's ratings
        mean_user_rating = ratings.mean(axis=1) # axis = 1 so it will take user's rating instead of mean rating of a book
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        # dot function - matrix multiplications
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    
    # Here we do not need to normalize 
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)]) 
    return pred

In [39]:
item_prediction = predict(train_data, item_similarity, type='item')
user_prediction = predict(train_data, user_similarity, type='user')

# Evaluation

In [42]:
# We will use RMSE as it seems to be most used in the industry for that kind of predictions
from sklearn.metrics import mean_squared_error
from math import sqrt

# Remember that we want to evaluate the non zero values - these are the actual ratings from user
def get_rmse(preds, actual):
    preds = preds[actual.nonzero()].flatten() # flatten function reduces the dimensionality of array to 1d
    actual = actual[actual.nonzero()].flatten()

    return sqrt(mean_squared_error(preds, actual))

In [43]:
print('User-based CF RMSE: {}'.format(get_rmse(user_prediction, test_data)))
print('Item-based CF RMSE: {}'.format(get_rmse(item_prediction, test_data)))

# Clean the kernel from this data as we will not need this anymore
del user_prediction
del item_prediction
del user_similarity
del item_similarity

User-based CF RMSE: 8.010185676597473
Item-based CF RMSE: 8.08024101974398


In [44]:
# Let's now retrain the data on the whole dataset
user_similarity_all = similarity(ratings,)
item_similarity_all = similarity(ratings.T)

item_prediction_all = predict(ratings, item_similarity_all, type='item')
user_prediction_all = predict(ratings, user_similarity_all, type='user')


print('User-based CF RMSE (All data): {}'.format(get_rmse(user_prediction_all, ratings)))
print('Item-based CF RMSE (All data): {}'.format(get_rmse(item_prediction_all, ratings)))

User-based CF RMSE (All data): 7.258342718896441
Item-based CF RMSE (All data): 7.33069003330777


In [45]:
# Build Content-Based models based on the summary of each movie in the daaset
# We will use TFIDF technique with ngrams 1 and 2
from sklearn.feature_extraction.text import TfidfVectorizer

# We are using this technique instead of countVectorizer because:
# TFIDF will give importance to the rare words, that we will need in finding similarities in the book's summary

tf_transformer = TfidfVectorizer(ngram_range=(1, 2)).fit(summary_non_duplicates['Summary'])
X_train_tf = tf_transformer.transform(summary_non_duplicates['Summary'])

In [46]:
# Get the cosine similarity
from sklearn.metrics.pairwise import cosine_similarity
summary_similarity = pd.DataFrame(cosine_similarity(X_train_tf))

In [47]:
# The pearson correlation does not depend on the scale of data
# It means that very popular books will not be favored
item_correlation = 1 - pairwise_distances(ratings.T, metric='correlation')
item_correlation[np.isnan(item_correlation)] = 0

print('The pearson correlation RMSE: {}'.format(get_rmse(item_correlation, ratings)))

The pearson correlation RMSE: 8.06885197254186


In [48]:
# SUMMARY SIMILARITY
# example
bookname = 'The Testament'

# map keys
bookskey = list(books)
userkey = list(users)

# store the book and its similarity
book_similarity_by_summary = {}

# Get the index of the book in the summary similarity metrix
target_index_summary = summary_non_duplicates[summary_non_duplicates['book_title'] == bookname].index[0]

# Get the top 5 similar books to given book
top5_books_id_summary = summary_similarity.loc[target_index_summary].sort_values(ascending=False)[1:6]

# Map it to the dictionary
for index in top5_books_id_summary.index:
    book_similarity_by_summary[summary_non_duplicates.iloc[index]['book_title']] = top5_books_id_summary[index]

book_similarity_by_summary

{'Into the Wild': 0.07713094551124931,
 'She Went All the Way (Avon Light Contemporary Romances)': 0.07426472063308022,
 'No Sanctuary': 0.07407931519820433,
 'Bellwether': 0.07221418881836919,
 'The English Patient': 0.0601836994314694}

In [49]:
# returns the top ids with largest similarity to given book 
def top_k_movies(similarity, book_idx, k=10):
    return [x for x in np.argsort(similarity[book_idx,:])[:-k-1:-1]]

In [56]:
# Item SIMILARITY
book_id = books[bookname]

# map keys
bookskey = list(books)
userkey = list(users)

book_similarity_by_item = {}

top5_books_id_item = top_k_movies(item_similarity_all, book_id, 6)

# map the values of the most similar books with its title
for id_ in top5_books_id_item[1:]:
    book_similarity_by_item[bookskey[id_]] = item_similarity_all[id_, book_id]

book_similarity_by_item

{'The Runaway Jury': 0.28458417625889515,
 'The Street Lawyer': 0.27400003713158927,
 'The Chamber': 0.2556271962622344,
 'Critical Mass': 0.24025795103115202,
 'The Brethren': 0.23786493340704284}

In [57]:
# Correlation
book_similarity_by_corr = {}

top5_books_id_corr = top_k_movies(item_correlation, book_id, 6)

for id_ in top5_books_id_corr[1:]:
#     print(df.iloc[id_]['book_title'])
    book_similarity_by_corr[df.iloc[id_]['book_title']] = item_correlation[book_id, id_]

book_similarity_by_corr

{'The Five People You Meet in Heaven': 0.2542364042251464,
 'The Testament': 0.2432376175927512,
 'All That Remains (Kay Scarpetta Mysteries (Paperback))': 0.22982788912366536,
 'To Kill a Mockingbird': 0.22189866861575036,
 "Our Dumb Century: The Onion Presents 100 Years of Headlines from America's Finest News Source": 0.2063009070382933}