In [14]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances
from sklearn.feature_selection import r_regression
from scipy.stats import pearsonr

In [15]:
# Read the merged, pre-filtered books table; the path is relative to the notebook's directory.
book_df = pd.read_csv(
    filepath_or_buffer="../datasets/clean/filtered_datasets/books_merged.csv",
)

In [16]:
# Missing descriptions become empty strings so the vectorizer only ever sees text
# (few rows are expected to be affected).
book_df['description'] = book_df['description'].fillna('')

# TF-IDF vectorizer that ignores common English stop words such as 'the' or 'a'.
tfidf = TfidfVectorizer(stop_words='english')

# Fit the vectorizer on the descriptions and encode them in one step;
# the trailing expression displays the shape of the resulting matrix.
tfidf_matrix = tfidf.fit_transform(book_df['description'])
tfidf_matrix.shape

(2426, 16448)

In [21]:
# Calculate Pearson correlation coefficient
def pearson_correlation(x, y):
    """Return the Pearson correlation of ``x`` and ``y``, ignoring NaN pairs.

    Parameters
    ----------
    x, y : array-like
        Two equally long sequences of numbers. Lists, NumPy arrays and objects
        exposing ``toarray()`` (such as SciPy sparse rows) are accepted; each is
        flattened to a 1-D float vector before use.

    Returns
    -------
    float
        The correlation coefficient computed over the positions where neither
        input is NaN. ``nan`` is returned when fewer than two such positions
        remain, because the coefficient is undefined there and raising would
        abort a whole pairwise computation.
    """
    def _as_vector(values):
        # Sparse containers cannot be passed to np.isnan directly; densify them first.
        if hasattr(values, "toarray"):
            values = values.toarray()
        return np.asarray(values, dtype=float).ravel()

    x = _as_vector(x)
    y = _as_vector(y)

    # Keep only the positions where both vectors hold a real number.
    mask = ~(np.isnan(x) | np.isnan(y))
    if mask.sum() < 2:
        return np.nan
    return pearsonr(x[mask], y[mask])[0]

# Calculate item-item similarity matrix using Pearson correlation
def calculate_similarity_matrix(data_matrix):
    """Return the Pearson correlation between every pair of columns of ``data_matrix``.

    Each column is treated as one item and each row as one observation, so the
    result is a square ``(n_columns, n_columns)`` array whose entry ``[i, j]`` is
    the correlation of column ``i`` with column ``j``. To compare the rows of a
    matrix instead, pass its transpose.

    Parameters
    ----------
    data_matrix : array-like or sparse matrix, shape (n_observations, n_items)
        Dense input or any object exposing ``toarray()`` (e.g. a SciPy sparse
        matrix). Sparse input is converted to a dense array, so it must fit in
        memory.

    Returns
    -------
    numpy.ndarray
        Symmetric correlation matrix. A column with zero variance, or one that
        contains NaN, yields NaN in its row and column.

    Raises
    ------
    ValueError
        If the input is not two-dimensional.
    """
    # np.corrcoef needs an ndarray, so sparse input is densified first.
    if hasattr(data_matrix, "toarray"):
        data_matrix = data_matrix.toarray()
    dense = np.asarray(data_matrix, dtype=float)
    if dense.ndim != 2:
        raise ValueError(
            "Expected a 2D array, got %d dimension(s) instead." % dense.ndim
        )

    # rowvar=False: columns are the variables being correlated. One vectorised call
    # replaces a Python-level metric callback evaluated once per pair of columns.
    # atleast_2d keeps a (1, 1) result for single-column input.
    return np.atleast_2d(np.corrcoef(dense, rowvar=False))

In [22]:
# Lookup table so a book's row index can be found by its title.
# Series.drop_duplicates() compares the *values* (the row indices, which are already
# unique), so it never removed repeated titles; a repeated title then makes
# indices[title] return a Series instead of a single index. Keep only the first
# row for each title instead.
indices = pd.Series(book_df.index, index=book_df['Book-Title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [25]:
# Book-to-book similarity matrix using Pearson correlation.
# tfidf_matrix has one row per book and one column per term, while
# calculate_similarity_matrix correlates the *columns* of its argument. The matrix
# is therefore transposed so that books become the columns; without this the result
# is term-by-term and its rows do not line up with the book positions that
# get_recommendations looks up. It is densified first so the correlation code
# receives plain NumPy data rather than sparse rows.
pearson_sim = calculate_similarity_matrix(tfidf_matrix.toarray().T)

In [26]:
# replaced cosine_sim with pearson_sim in get_recommendations function
def get_recommendations(title, amount=10, similarity_matrix=pearson_sim):
    """Return the titles of the books most similar to ``title``.

    Parameters
    ----------
    title : str
        Key looked up in the module-level ``indices`` Series to find the row
        position of the query book.
    amount : int, default 10
        Maximum number of recommendations; values below zero are treated as zero.
    similarity_matrix : array-like, default ``pearson_sim``
        Square matrix whose row ``i`` holds the similarity of book ``i`` to every
        book. The default is bound when this function is defined, so re-run this
        definition after recomputing ``pearson_sim``.

    Returns
    -------
    pandas.Series
        ``book_df['title']`` at the selected row positions, most similar first.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # A title that occurs more than once in the lookup yields several positions;
    # use the first one so a single row of the matrix is selected.
    if not np.isscalar(idx):
        idx = np.asarray(idx).ravel()[0]
    idx = int(idx)

    scores = np.asarray(similarity_matrix[idx], dtype=float).ravel().copy()
    # Undefined (NaN) similarities cannot be ordered reliably; rank them last.
    scores[np.isnan(scores)] = -np.inf

    # Most similar first; the stable sort keeps tied books in row order.
    ranked = np.argsort(-scores, kind='stable')
    # Remove the query book explicitly instead of assuming it sorts first: with
    # tied or undefined scores another book can occupy the top position.
    ranked = ranked[ranked != idx]

    book_indices = ranked[:max(int(amount), 0)]

    # NOTE(review): titles are looked up through `indices` but returned from
    # book_df['title']; confirm both refer to the same naming of each book.
    return book_df['title'].iloc[book_indices]

In [7]:
# Two closest matches for 'Gilead'.
get_recommendations(title='Gilead', amount=2)

2252           The Handmaid's Tale
905     Go Tell it on the Mountain
Name: title, dtype: object

In [8]:
# Twenty closest matches for the Return of the King visual companion.
get_recommendations(
    title='The Lord of the Rings, the Return of the King - Visual Companion',
    amount=20,
)


3640      The Lord of the Rings Complete Visual Companion
3620    The Lord of the Rings - The Making of the Movi...
1998                           The Poetry of Pablo Neruda
3650                     The Lord of the Rings Sketchbook
88                       The Art of The Lord of the Rings
3571              The Illustrated A Brief History of Time
3239              The Cambridge Companion to Schopenhauer
3624          The Lord of the Rings - Weapons and Warfare
1889    The Hobbit / The Lord of the Rings - The Hobbi...
3619    The Lord of the Rings - The Two Towers : Visua...
4461           Star Wars - The Complete Visual Dictionary
80                                       The Silmarillion
3599                         J.R.R. Tolkien - A Biography
63      The Lord of the Rings - The Art of The Return ...
2117    Tsunamis and Other Natural Disasters - A Nonfi...
1407                      Zondervan Handbook to the Bible
3602                 The History of the Lord of the Rings
4535          

In [9]:
# Ten closest matches for 'The Lord of the Rings'.
get_recommendations(title='The Lord of the Rings', amount=10)

1735    The Fellowship of the Ring - Being the First P...
6173    The Fellowship of the Ring - Being the First P...
79      The Return of the King - Being the Third Part ...
1316                 The Hobbit, Or, There and Back Again
3590    The Two Towers - Being the Second Part of The ...
3603    The Return of the Shadow - The History of The ...
1889    The Hobbit / The Lord of the Rings - The Hobbi...
3624          The Lord of the Rings - Weapons and Warfare
3167                                The Tolkien Companion
80                                       The Silmarillion
Name: title, dtype: object