<a href="https://colab.research.google.com/github/murtagh97/book_recommender/blob/main/recommender_ntb_main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
### set up the directories & mount google drive ###
# change directories according to your drive #

import os
import re
from google.colab import drive

drive.mount('/content/drive', force_remount=True)

path_to_project = ['drive','My Drive','ds_ml_task']
path_root = os.path.join(*path_to_project)

os.chdir(path_root)
os.getcwd()

Mounted at /content/drive


'/content/drive/My Drive/ds_ml_task'

# 1 Data Read

In [3]:
import pandas as pd
import numpy as np

import string

Data avalilable [here](http://www2.informatik.uni-freiburg.de/~cziegler/BX/).

In [4]:
### create path to the data according to your drive ###
path_to_data = '/content/drive/My Drive/ds_ml_task/BX-CSV-Dump/'

### read csv files ###
ratings = pd.read_csv(path_to_data + 'BX-Book-Ratings.csv', delimiter=";", encoding="latin1")
books = pd.read_csv(path_to_data + 'BX-Books.csv', delimiter=";", encoding="latin1", error_bad_lines=False)
users = pd.read_csv(path_to_data + 'BX-Users.csv', delimiter=";", encoding="latin1")

b'Skipping line 6452: expected 8 fields, saw 9\nSkipping line 43667: expected 8 fields, saw 10\nSkipping line 51751: expected 8 fields, saw 9\n'
b'Skipping line 92038: expected 8 fields, saw 9\nSkipping line 104319: expected 8 fields, saw 9\nSkipping line 121768: expected 8 fields, saw 9\n'
b'Skipping line 144058: expected 8 fields, saw 9\nSkipping line 150789: expected 8 fields, saw 9\nSkipping line 157128: expected 8 fields, saw 9\nSkipping line 180189: expected 8 fields, saw 9\nSkipping line 185738: expected 8 fields, saw 9\n'
b'Skipping line 209388: expected 8 fields, saw 9\nSkipping line 220626: expected 8 fields, saw 9\nSkipping line 227933: expected 8 fields, saw 11\nSkipping line 228957: expected 8 fields, saw 10\nSkipping line 245933: expected 8 fields, saw 9\nSkipping line 251296: expected 8 fields, saw 9\nSkipping line 259941: expected 8 fields, saw 9\nSkipping line 261529: expected 8 fields, saw 9\n'
  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
ratings.rename(
    columns = {
        'User-ID': 'user_id',
        'ISBN': 'isbn',
        'Book-Rating': 'book_rating'
        }, 
    inplace=True
    )
ratings.head(2)

Unnamed: 0,user_id,isbn,book_rating
0,276725,034545104X,0
1,276726,0155061224,5


In [6]:
#ratings.to_csv( 'rating_info.csv', index = False)

In [7]:
books.rename(
    columns = {
        'ISBN': 'isbn',
        'Book-Title': 'book_title',
        'Book-Author': 'book_author',
        'Year-Of-Publication': 'publication_year',
        'Publisher': 'publisher'
        }, 
    inplace=True
    )
books.head(2)

Unnamed: 0,isbn,book_title,book_author,publication_year,publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...


In [8]:
users.rename(
    columns = {
        'User-ID': 'user_id',
        'Location': 'location',
        'Age': 'age'
        }, 
    inplace=True
    )
users.head(2)

Unnamed: 0,user_id,location,age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0


# 2 Data Cleansing

In [9]:
import warnings
warnings.filterwarnings('ignore')

In [10]:
def get_missing_values(df):

    new_df = pd.DataFrame(columns = ['Total', 'Percent'])
    new_df['Total'] = df.isnull().sum(axis = 0)
    new_df['Percent'] = new_df['Total'] / df.shape[0] * 100

    new_df.sort_values(by=['Total'], inplace=True, ascending=False)
    return new_df

## 2.1 Users Info

In [11]:
# split location into City, State, Country columns
users[['city', 'state', 'country']] = users['location'].str.split(',', n=2, expand=True)
users_info = (
        users.copy()
        .drop(['location'], axis='columns')
        )

# strip Country column of additional punctuation
users_info['country'] = users_info['country'].str.replace('[{}]'.format(string.punctuation), '')

# convert user ID column to string
# users_info['user_id'] = users_info['user_id'].astype(str)

users_info.head(2)

Unnamed: 0,user_id,age,city,state,country
0,1,,nyc,new york,usa
1,2,18.0,stockton,california,usa


In [12]:
get_missing_values(users_info)

Unnamed: 0,Total,Percent
age,110762,39.719857
country,2,0.000717
state,1,0.000359
user_id,0,0.0
city,0,0.0


In [13]:
# replace nonsense age values with NaNs
users_info.loc[(users_info['age'] > 115) | (users_info['age'] < 5), 'age'] = np.NaN
get_missing_values(users_info)

Unnamed: 0,Total,Percent
age,111729,40.066629
country,2,0.000717
state,1,0.000359
user_id,0,0.0
city,0,0.0


In [14]:
#users_info.to_csv( 'users_info.csv', index = False)

In [15]:
mean_age = users_info['age'].mean()
users_info['age'] = users_info['age'].fillna(mean_age)
get_missing_values(users_info)

Unnamed: 0,Total,Percent
country,2,0.000717
state,1,0.000359
user_id,0,0.0
age,0,0.0
city,0,0.0


In [16]:
users_info['age'].describe()

count    278858.000000
mean         34.863441
std          10.773322
min           5.000000
25%          29.000000
50%          34.863441
75%          35.000000
max         115.000000
Name: age, dtype: float64

In [17]:
users_info[['city', 'state', 'country']].describe()

Unnamed: 0,city,state,country
count,278858,278857,278856
unique,32770,6334,1145
top,london,california,usa
freq,4105,19898,139190


## 2.2 Books Info

In [18]:
# drop images for now
books_info = (
        books.copy()
        .drop(['Image-URL-S', 'Image-URL-M'], axis='columns')
        )
books_info

Unnamed: 0,isbn,book_title,book_author,publication_year,publisher,Image-URL-L
0,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...
2,0060973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...
4,0393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...
...,...,...,...,...,...,...
271355,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm),http://images.amazon.com/images/P/0440400988.0...
271356,0525447644,From One to One Hundred,Teri Sloat,1991,Dutton Books,http://images.amazon.com/images/P/0525447644.0...
271357,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,HarperSanFrancisco,http://images.amazon.com/images/P/006008667X.0...
271358,0192126040,Republic (World's Classics),Plato,1996,Oxford University Press,http://images.amazon.com/images/P/0192126040.0...


In [19]:
# some entries are integer, some are strings, needs to be cleaned
books_info['publication_year'].unique()

array([2002, 2001, 1991, 1999, 2000, 1993, 1996, 1988, 2004, 1998, 1994,
       2003, 1997, 1983, 1979, 1995, 1982, 1985, 1992, 1986, 1978, 1980,
       1952, 1987, 1990, 1981, 1989, 1984, 0, 1968, 1961, 1958, 1974,
       1976, 1971, 1977, 1975, 1965, 1941, 1970, 1962, 1973, 1972, 1960,
       1966, 1920, 1956, 1959, 1953, 1951, 1942, 1963, 1964, 1969, 1954,
       1950, 1967, 2005, 1957, 1940, 1937, 1955, 1946, 1936, 1930, 2011,
       1925, 1948, 1943, 1947, 1945, 1923, 2020, 1939, 1926, 1938, 2030,
       1911, 1904, 1949, 1932, 1928, 1929, 1927, 1931, 1914, 2050, 1934,
       1910, 1933, 1902, 1924, 1921, 1900, 2038, 2026, 1944, 1917, 1901,
       2010, 1908, 1906, 1935, 1806, 2021, '2000', '1995', '1999', '2004',
       '2003', '1990', '1994', '1986', '1989', '2002', '1981', '1993',
       '1983', '1982', '1976', '1991', '1977', '1998', '1992', '1996',
       '0', '1997', '2001', '1974', '1968', '1987', '1984', '1988',
       '1963', '1956', '1970', '1985', '1978', '1973', '1980'

In [20]:
# drop rows with entry errors
books_info = books_info[ (books_info['publication_year'] != 'DK Publishing Inc') & (books_info['publication_year'] != 'Gallimard') ]

# change dtype to numeric
books_info['publication_year'] = pd.to_numeric(books_info['publication_year'])

# still entries with year of publication == 0, which does not make much sense
# change entries with year of publication < 1800 & year of publication > 2004 to 0 
books_info.loc[(books_info['publication_year'] < 1800) | (books_info['publication_year'] > 2004), 'publication_year'] = 0

np.sort(
    books_info['publication_year'].unique()
)

array([   0, 1806, 1897, 1900, 1901, 1902, 1904, 1906, 1908, 1909, 1910,
       1911, 1914, 1917, 1919, 1920, 1921, 1922, 1923, 1924, 1925, 1926,
       1927, 1928, 1929, 1930, 1931, 1932, 1933, 1934, 1935, 1936, 1937,
       1938, 1939, 1940, 1941, 1942, 1943, 1944, 1945, 1946, 1947, 1948,
       1949, 1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1959,
       1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970,
       1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981,
       1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992,
       1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
       2004])

In [21]:
# clean the book_title column- remove text within the brackets
# usually represent either the edition of the book, name of the publisher, or the name of the book series -> in the end, it is the same book
books_info[books_info["book_title"].str.contains("Anna Karenina")]

Unnamed: 0,isbn,book_title,book_author,publication_year,publisher,Image-URL-L
6887,1853262714,Anna Karenina (Wordsworth Classics),Leo Tolstoy,1997,NTC/Contemporary Publishing Company,http://images.amazon.com/images/P/1853262714.0...
8230,0553211714,Anna Karenina,Leo Tolstoy,1984,Bantam,http://images.amazon.com/images/P/0553211714.0...
14451,0451524497,Anna Karenina (Signet Classics (Paperback)),Leo Tolstoy,1988,Penguin Books,http://images.amazon.com/images/P/0451524497.0...
22923,0713994606,Anna Karenina (Penguin Classics),Leo Tolstoy,2000,Viking,http://images.amazon.com/images/P/0713994606.0...
28499,0143035002,Anna Karenina (Oprah's Book Club),Leo Tolstoy,2004,Penguin Books,http://images.amazon.com/images/P/0143035002.0...
34418,067978330X,Anna Karenina (Modern Library Classics),Leo Tolstoy,2000,Modern Library,http://images.amazon.com/images/P/067978330X.0...
36076,0553213466,Anna Karenina,Leo Tolstoy,1984,Bantam,http://images.amazon.com/images/P/0553213466.0...
42930,1404360700,Anna Karenina,Leo Tolstoy,2003,IndyPublish.com,http://images.amazon.com/images/P/1404360700.0...
46540,0192833812,Anna Karenina (Oxford World's Classics),Leo Tolstoy,1998,Oxford University Press,http://images.amazon.com/images/P/0192833812.0...
50877,157815118X,Anna Karenina,Leo Tolstoy,2004,Media Books Audio Publishing,http://images.amazon.com/images/P/157815118X.0...


In [22]:
# remove everything between round brackets, nested brackets, and the brackets themself
books_info['book_title'] = books_info['book_title'].str.replace(r"\(.*\)","")
books_info['book_title'] = books_info['book_title'].str.replace(r"[ \t]+$","")

books_info[books_info["book_title"].str.contains("Anna Karenina")]

Unnamed: 0,isbn,book_title,book_author,publication_year,publisher,Image-URL-L
6887,1853262714,Anna Karenina,Leo Tolstoy,1997,NTC/Contemporary Publishing Company,http://images.amazon.com/images/P/1853262714.0...
8230,0553211714,Anna Karenina,Leo Tolstoy,1984,Bantam,http://images.amazon.com/images/P/0553211714.0...
14451,0451524497,Anna Karenina,Leo Tolstoy,1988,Penguin Books,http://images.amazon.com/images/P/0451524497.0...
22923,0713994606,Anna Karenina,Leo Tolstoy,2000,Viking,http://images.amazon.com/images/P/0713994606.0...
28499,0143035002,Anna Karenina,Leo Tolstoy,2004,Penguin Books,http://images.amazon.com/images/P/0143035002.0...
34418,067978330X,Anna Karenina,Leo Tolstoy,2000,Modern Library,http://images.amazon.com/images/P/067978330X.0...
36076,0553213466,Anna Karenina,Leo Tolstoy,1984,Bantam,http://images.amazon.com/images/P/0553213466.0...
42930,1404360700,Anna Karenina,Leo Tolstoy,2003,IndyPublish.com,http://images.amazon.com/images/P/1404360700.0...
46540,0192833812,Anna Karenina,Leo Tolstoy,1998,Oxford University Press,http://images.amazon.com/images/P/0192833812.0...
50877,157815118X,Anna Karenina,Leo Tolstoy,2004,Media Books Audio Publishing,http://images.amazon.com/images/P/157815118X.0...


In [23]:
books_info['book_title'][books_info['book_title'].str.contains(r'&amp;(?!$)')]

23                   Mary-Kate &amp; Ashley Switching Goals
118                                     Angels &amp; Demons
203       The James Dean Affair: A Neil Gulliver &amp; S...
239                                     Angels &amp; Demons
291                     Angels &amp; Insects : Two Novellas
                                ...                        
271084    Flashman &amp; the Angel of the Lord: From the...
271113    Magic... Naturally!: Science Entertainments &a...
271235                                   Tycho &amp; Kepler
271266                              Rough &amp; Rugged Lily
271283    The Feelings Book: The Care &amp; Keeping of Y...
Name: book_title, Length: 3786, dtype: object

In [24]:
# replace html tag for ampersand &amp; by 'and'
books_info['book_title'] = books_info['book_title'].str.replace(r'&amp;(?!$)','and')
books_info['book_title'][books_info['book_title'].str.contains(r'&amp;(?!$)')]

Series([], Name: book_title, dtype: object)

In [25]:
# there are still books with typos in their book titles and more
books_info[books_info["book_title"].str.contains("For Whom")]

Unnamed: 0,isbn,book_title,book_author,publication_year,publisher,Image-URL-L
18563,684803356,For Whom the Bell Tolls,Ernest Hemingway,1995,Scribner,http://images.amazon.com/images/P/0684803356.0...
29667,20518501,For Whome the Bell Tolls,Ernest Hemingway,1987,Scribner Paper Fiction,http://images.amazon.com/images/P/0020518501.0...
34375,684830485,For Whom the Bell Tolls,Ernest Hemingway,1996,Scribner,http://images.amazon.com/images/P/0684830485.0...
62116,425183866,For Whom Death Tolls,Kate Kingsbury,2002,Berkley Publishing Group,http://images.amazon.com/images/P/0425183866.0...
128660,1890862185,For Whom the Minivan Rolls,Jeffrey Cohen,2002,Bancroft Press,http://images.amazon.com/images/P/1890862185.0...
132040,684717980,For Whom the Bell Tolls,Ernest Hemingway,1940,Scribner Paper Fiction,http://images.amazon.com/images/P/0684717980.0...
140264,684176602,For Whome the Bell Tolls,Ernest Hemingway,1982,Scribner Paper Fiction,http://images.amazon.com/images/P/0684176602.0...
199290,413690709,For Whom Bell Tolls,Steve Bell,1994,Methuen Publishing Ltd,http://images.amazon.com/images/P/0413690709.0...
252385,822004976,For Whom The Bell Tolls,LaRocque DuBose,1965,Cliffs Notes,http://images.amazon.com/images/P/0822004976.0...


In [26]:
# there are books, published in different years, listed under different ISBN -> in the end, the same book
books_info[books_info.book_title=='Anna Karenina'].head(5)

Unnamed: 0,isbn,book_title,book_author,publication_year,publisher,Image-URL-L
6887,1853262714,Anna Karenina,Leo Tolstoy,1997,NTC/Contemporary Publishing Company,http://images.amazon.com/images/P/1853262714.0...
8230,553211714,Anna Karenina,Leo Tolstoy,1984,Bantam,http://images.amazon.com/images/P/0553211714.0...
14451,451524497,Anna Karenina,Leo Tolstoy,1988,Penguin Books,http://images.amazon.com/images/P/0451524497.0...
22923,713994606,Anna Karenina,Leo Tolstoy,2000,Viking,http://images.amazon.com/images/P/0713994606.0...
28499,143035002,Anna Karenina,Leo Tolstoy,2004,Penguin Books,http://images.amazon.com/images/P/0143035002.0...


In [27]:
# count ISBN codes, given the book_title
multiple_isbn = books_info.groupby('book_title').isbn.nunique()
print(multiple_isbn.value_counts())

1     199407
2      19202
3       4270
4       1605
5        754
6        421
7        239
8        150
9         97
10        55
11        40
12        38
13        28
14        22
15        15
18        11
16        10
17         8
19         7
26         6
20         4
22         3
30         3
23         2
24         2
25         2
48         2
44         2
34         2
21         1
37         1
47         1
46         1
41         1
38         1
33         1
35         1
50         1
31         1
29         1
28         1
27         1
53         1
Name: isbn, dtype: int64


In [28]:
books_w_multiple_isbn = multiple_isbn.where(
    multiple_isbn > 1
    )

books_w_multiple_isbn.dropna(
    inplace = True
    )

print(f'{len(books_w_multiple_isbn)} book titles have multiple ISBN codes -> assign a unique identifier.')

27014 book titles have multiple ISBN codes -> assign a unique identifier.


In [29]:
# Create dictionary for books with multiple isbns
def multiple_isbn_dict(book_df, isbn_df):
    title_isbn_dict = {}
    for title in isbn_df.index:
        isbn_series = book_df.loc[book_df.book_title == title].isbn.unique()
        title_isbn_dict[title] = isbn_series.tolist()
    return title_isbn_dict

%time dict_unique_isbn = multiple_isbn_dict(books_info, books_w_multiple_isbn)

CPU times: user 8min 38s, sys: 1.94 s, total: 8min 40s
Wall time: 8min 40s


In [30]:
print(f'''Book 'Anna Karenina' has {len(dict_unique_isbn["Anna Karenina"])} different ISBN codes.''')

Book 'Anna Karenina' has 26 different ISBN codes.


In [31]:
# add unique isbn column to the dataframe
def add_unique_isbn_col(df, isbn_dict):
    new_df = df.copy()
    new_df['isbn_unique'] = df.apply(
        lambda row: isbn_dict[row.book_title][0] if row.book_title in isbn_dict.keys() else row.isbn, 
        axis=1
        )
    return new_df

books_info_unique = add_unique_isbn_col(books_info, dict_unique_isbn)

In [32]:
books_info_unique[books_info_unique.book_title=='Anna Karenina'].head(5)

Unnamed: 0,isbn,book_title,book_author,publication_year,publisher,Image-URL-L,isbn_unique
6887,1853262714,Anna Karenina,Leo Tolstoy,1997,NTC/Contemporary Publishing Company,http://images.amazon.com/images/P/1853262714.0...,1853262714
8230,553211714,Anna Karenina,Leo Tolstoy,1984,Bantam,http://images.amazon.com/images/P/0553211714.0...,1853262714
14451,451524497,Anna Karenina,Leo Tolstoy,1988,Penguin Books,http://images.amazon.com/images/P/0451524497.0...,1853262714
22923,713994606,Anna Karenina,Leo Tolstoy,2000,Viking,http://images.amazon.com/images/P/0713994606.0...,1853262714
28499,143035002,Anna Karenina,Leo Tolstoy,2004,Penguin Books,http://images.amazon.com/images/P/0143035002.0...,1853262714


In [33]:
#books_info_unique.to_csv( 'book_info_unique.csv', index = False)

## 2.3 Create Main Df

### 2.3.1 main_df_all

In [34]:
main_df_all = pd.merge(ratings, books_info_unique, on=['isbn'])
main_df_all = pd.merge(main_df_all, users_info, on=['user_id'])
print(f'Df all size: {main_df_all.shape[0]} entries.')
main_df_all.head(5)

Df all size: 1031132 entries.


Unnamed: 0,user_id,isbn,book_rating,book_title,book_author,publication_year,publisher,Image-URL-L,isbn_unique,age,city,state,country
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,034545104X,34.863441,tyler,texas,usa
1,2313,034545104X,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,034545104X,23.0,cincinnati,ohio,usa
2,2313,0812533550,9,Ender's Game,Orson Scott Card,1986,Tor Books,http://images.amazon.com/images/P/0812533550.0...,0312853238,23.0,cincinnati,ohio,usa
3,2313,0679745580,8,In Cold Blood,TRUMAN CAPOTE,1994,Vintage,http://images.amazon.com/images/P/0679745580.0...,0679745580,23.0,cincinnati,ohio,usa
4,2313,0060173289,9,Divine Secrets of the Ya-Ya Sisterhood : A Novel,Rebecca Wells,1996,HarperCollins,http://images.amazon.com/images/P/0060173289.0...,0060173289,23.0,cincinnati,ohio,usa


In [35]:
get_missing_values(main_df_all)

Unnamed: 0,Total,Percent
publisher,2,0.000194
book_author,1,9.7e-05
user_id,0,0.0
isbn,0,0.0
book_rating,0,0.0
book_title,0,0.0
publication_year,0,0.0
Image-URL-L,0,0.0
isbn_unique,0,0.0
age,0,0.0


In [36]:
main_df_all[main_df_all.loc[:,['book_author','publisher']].isnull().any(axis='columns')]

Unnamed: 0,user_id,isbn,book_rating,book_title,book_author,publication_year,publisher,Image-URL-L,isbn_unique,age,city,state,country
9777,98391,193169656X,9,Tyrant Moon,Elaine Corvidae,2002,,http://images.amazon.com/images/P/193169656X.0...,193169656X,52.0,morrow,georgia,usa
9786,98391,1931696993,9,Finders Keepers,Linnea Sinclair,2001,,http://images.amazon.com/images/P/1931696993.0...,082177364X,52.0,morrow,georgia,usa
486483,98647,9627982032,8,The Credit Suisse Guide to Managing Your Perso...,,1995,Edinburgh Financial Publishing,http://images.amazon.com/images/P/9627982032.0...,9627982032,26.0,selayang,selangor,malaysia


In [37]:
main_df_all['book_author'] = main_df_all['book_author'].fillna('Other')
main_df_all['publisher'] = main_df_all['publisher'].fillna('Other')

get_missing_values(main_df_all)

Unnamed: 0,Total,Percent
user_id,0,0.0
isbn,0,0.0
book_rating,0,0.0
book_title,0,0.0
book_author,0,0.0
publication_year,0,0.0
publisher,0,0.0
Image-URL-L,0,0.0
isbn_unique,0,0.0
age,0,0.0


### 2.3.2 main_df_explicit

In [38]:
main_df_explicit = main_df_all[ (main_df_all['book_rating'] != 0) ]
print(f'Df explicit size: {main_df_explicit.shape[0]} entries.')
main_df_explicit.head(5)

Df explicit size: 383841 entries.


Unnamed: 0,user_id,isbn,book_rating,book_title,book_author,publication_year,publisher,Image-URL-L,isbn_unique,age,city,state,country
1,2313,034545104X,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,034545104X,23.0,cincinnati,ohio,usa
2,2313,0812533550,9,Ender's Game,Orson Scott Card,1986,Tor Books,http://images.amazon.com/images/P/0812533550.0...,0312853238,23.0,cincinnati,ohio,usa
3,2313,0679745580,8,In Cold Blood,TRUMAN CAPOTE,1994,Vintage,http://images.amazon.com/images/P/0679745580.0...,0679745580,23.0,cincinnati,ohio,usa
4,2313,0060173289,9,Divine Secrets of the Ya-Ya Sisterhood : A Novel,Rebecca Wells,1996,HarperCollins,http://images.amazon.com/images/P/0060173289.0...,0060173289,23.0,cincinnati,ohio,usa
5,2313,0385482388,5,The Mistress of Spices,Chitra Banerjee Divakaruni,1998,Anchor Books/Doubleday,http://images.amazon.com/images/P/0385482388.0...,0385482388,23.0,cincinnati,ohio,usa


In [39]:
main_df_explicit.describe()

Unnamed: 0,user_id,book_rating,publication_year,age
count,383841.0,383841.0,383841.0,383841.0
mean,136031.222256,7.626702,1965.183459,36.167518
std,80482.268021,1.841341,245.009725,10.48299
min,8.0,1.0,0.0,5.0
25%,67591.0,7.0,1992.0,31.0
50%,133788.0,8.0,1997.0,34.863441
75%,206219.0,9.0,2001.0,40.0
max,278854.0,10.0,2004.0,114.0


In [40]:
# add column with the average rating of each book given its unique ISBN code
average_ratings = main_df_explicit.groupby('isbn_unique')['book_rating'].mean().round(2).reset_index()
average_ratings.rename(columns={'book_rating': 'average_rating'}, inplace=True)

main_df_explicit = main_df_explicit.merge(average_ratings, on='isbn_unique')
main_df_explicit.head(5)

Unnamed: 0,user_id,isbn,book_rating,book_title,book_author,publication_year,publisher,Image-URL-L,isbn_unique,age,city,state,country,average_rating
0,2313,034545104X,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,034545104X,23.0,cincinnati,ohio,usa,6.29
1,8680,034545104X,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,034545104X,34.863441,st. charles county,missouri,usa,6.29
2,10314,034545104X,9,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,034545104X,34.863441,beaverton,oregon,usa,6.29
3,50403,034545104X,9,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,034545104X,34.863441,conway,arkansas,usa,6.29
4,63970,034545104X,8,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,034545104X,34.863441,springfield,missouri,usa,6.29


In [41]:
# add column with the number of ratings of each book given its unique ISBN code
n_ratings_explicit = main_df_explicit.groupby('isbn_unique')['book_rating'].count().reset_index()
n_ratings_explicit.rename(columns={'book_rating': 'n_book_ratings'}, inplace=True)

main_df_explicit = main_df_explicit.merge(n_ratings_explicit, on='isbn_unique')
main_df_explicit.head(5)

Unnamed: 0,user_id,isbn,book_rating,book_title,book_author,publication_year,publisher,Image-URL-L,isbn_unique,age,city,state,country,average_rating,n_book_ratings
0,2313,034545104X,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,034545104X,23.0,cincinnati,ohio,usa,6.29,28
1,8680,034545104X,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,034545104X,34.863441,st. charles county,missouri,usa,6.29,28
2,10314,034545104X,9,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,034545104X,34.863441,beaverton,oregon,usa,6.29,28
3,50403,034545104X,9,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,034545104X,34.863441,conway,arkansas,usa,6.29,28
4,63970,034545104X,8,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,034545104X,34.863441,springfield,missouri,usa,6.29,28


In [42]:
main_df_explicit['n_book_ratings'].describe().round(3)

count    383841.000
mean         36.041
std          75.334
min           1.000
25%           2.000
50%           7.000
75%          32.000
max         707.000
Name: n_book_ratings, dtype: float64

In [43]:
# keep books that have been rated more than the mean / third quartile number of ratings
main_df_explicit = main_df_explicit[main_df_explicit['n_book_ratings'] >= 32]
main_df_explicit

Unnamed: 0,user_id,isbn,book_rating,book_title,book_author,publication_year,publisher,Image-URL-L,isbn_unique,age,city,state,country,average_rating,n_book_ratings
28,2313,0812533550,9,Ender's Game,Orson Scott Card,1986,Tor Books,http://images.amazon.com/images/P/0812533550.0...,0312853238,23.000000,cincinnati,ohio,usa,8.86,168
29,81977,0765342294,9,Ender's Game,Orson Scott Card,2002,Starscape Books,http://images.amazon.com/images/P/0765342294.0...,0312853238,34.000000,minneapolis,minnesota,usa,8.86,168
30,163202,0812550706,10,Ender's Game,Orson Scott Card,1994,Tor Books,http://images.amazon.com/images/P/0812550706.0...,0312853238,26.000000,los angeles,california,usa,8.86,168
31,11676,0812550706,9,Ender's Game,Orson Scott Card,1994,Tor Books,http://images.amazon.com/images/P/0812550706.0...,0312853238,34.863441,,,na,8.86,168
32,11676,0812589041,7,Ender's Game,Orson Scott Card,1999,Tor Books,http://images.amazon.com/images/P/0812589041.0...,0312853238,34.863441,,,na,8.86,168
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265139,128624,3257229534,9,Der Vorleser,Bernhard Schlink,1999,Diogenes Verlag AG,http://images.amazon.com/images/P/3257229534.0...,3257229534,20.000000,oberhausen,nordrhein-westfalen,germany,7.88,32
265140,136037,3257229534,8,Der Vorleser,Bernhard Schlink,1999,Diogenes Verlag AG,http://images.amazon.com/images/P/3257229534.0...,3257229534,19.000000,möhlin,aargau,switzerland,7.88,32
265141,157669,3257229534,7,Der Vorleser,Bernhard Schlink,1999,Diogenes Verlag AG,http://images.amazon.com/images/P/3257229534.0...,3257229534,34.863441,hamburg,hamburg,germany,7.88,32
265142,163288,3257229534,7,Der Vorleser,Bernhard Schlink,1999,Diogenes Verlag AG,http://images.amazon.com/images/P/3257229534.0...,3257229534,22.000000,baunatal,hessen,germany,7.88,32


In [44]:
#main_df_explicit.to_csv( 'main_df_explicit.csv', index = False)
#pd.read_csv(os.getcwd() + 'main_df_explicit.csv').head()

### 2.3.3 main_df_implicit

In [45]:
main_df_implicit = main_df_all.copy()
main_df_implicit.loc[ main_df_implicit['book_rating'] >= 0, 'book_rating'] = 1
print(f'Df implicit size: {main_df_implicit.shape[0]} entries.')
main_df_implicit.head(5)

Df implicit size: 1031132 entries.


Unnamed: 0,user_id,isbn,book_rating,book_title,book_author,publication_year,publisher,Image-URL-L,isbn_unique,age,city,state,country
0,276725,034545104X,1,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,034545104X,34.863441,tyler,texas,usa
1,2313,034545104X,1,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,034545104X,23.0,cincinnati,ohio,usa
2,2313,0812533550,1,Ender's Game,Orson Scott Card,1986,Tor Books,http://images.amazon.com/images/P/0812533550.0...,0312853238,23.0,cincinnati,ohio,usa
3,2313,0679745580,1,In Cold Blood,TRUMAN CAPOTE,1994,Vintage,http://images.amazon.com/images/P/0679745580.0...,0679745580,23.0,cincinnati,ohio,usa
4,2313,0060173289,1,Divine Secrets of the Ya-Ya Sisterhood : A Novel,Rebecca Wells,1996,HarperCollins,http://images.amazon.com/images/P/0060173289.0...,0060173289,23.0,cincinnati,ohio,usa


In [46]:
# add column with the average rating of each book given its unique ISBN code (not necessary)
average_ratings = main_df_implicit.groupby('isbn_unique')['book_rating'].mean().round(2).reset_index()
average_ratings.rename(columns={'book_rating': 'average_rating'}, inplace=True)

main_df_implicit = main_df_implicit.merge(average_ratings, on='isbn_unique')
main_df_implicit.head(5)

Unnamed: 0,user_id,isbn,book_rating,book_title,book_author,publication_year,publisher,Image-URL-L,isbn_unique,age,city,state,country,average_rating
0,276725,034545104X,1,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,034545104X,34.863441,tyler,texas,usa,1
1,2313,034545104X,1,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,034545104X,23.0,cincinnati,ohio,usa,1
2,6543,034545104X,1,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,034545104X,34.0,strafford,missouri,usa,1
3,8680,034545104X,1,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,034545104X,34.863441,st. charles county,missouri,usa,1
4,10314,034545104X,1,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,034545104X,34.863441,beaverton,oregon,usa,1


In [47]:
# add column with the number of interactions of each book given its unique ISBN code
n_ratings_implicit = main_df_implicit.groupby('isbn_unique')['book_rating'].count().reset_index()
n_ratings_implicit.rename(columns={'book_rating': 'n_book_ratings'}, inplace=True)

main_df_implicit = main_df_implicit.merge(n_ratings_implicit, on='isbn_unique')
main_df_implicit.head(5)

Unnamed: 0,user_id,isbn,book_rating,book_title,book_author,publication_year,publisher,Image-URL-L,isbn_unique,age,city,state,country,average_rating,n_book_ratings
0,276725,034545104X,1,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,034545104X,34.863441,tyler,texas,usa,1,60
1,2313,034545104X,1,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,034545104X,23.0,cincinnati,ohio,usa,1,60
2,6543,034545104X,1,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,034545104X,34.0,strafford,missouri,usa,1,60
3,8680,034545104X,1,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,034545104X,34.863441,st. charles county,missouri,usa,1,60
4,10314,034545104X,1,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,034545104X,34.863441,beaverton,oregon,usa,1,60


In [48]:
main_df_implicit['n_book_ratings'].describe().round(3)

count    1031132.000
mean          77.450
std          182.233
min            1.000
25%            4.000
50%           15.000
75%           73.000
max         2502.000
Name: n_book_ratings, dtype: float64

In [49]:
main_df_implicit = main_df_implicit[main_df_implicit['n_book_ratings'] >= 77]
print(f'New df explicit size: {main_df_implicit.shape[0]} entries.')
main_df_implicit.head(5)

New df explicit size: 248549 entries.


Unnamed: 0,user_id,isbn,book_rating,book_title,book_author,publication_year,publisher,Image-URL-L,isbn_unique,age,city,state,country,average_rating,n_book_ratings
60,2313,812533550,1,Ender's Game,Orson Scott Card,1986,Tor Books,http://images.amazon.com/images/P/0812533550.0...,312853238,23.0,cincinnati,ohio,usa,1,278
61,81977,765342294,1,Ender's Game,Orson Scott Card,2002,Starscape Books,http://images.amazon.com/images/P/0765342294.0...,312853238,34.0,minneapolis,minnesota,usa,1,278
62,115435,812550706,1,Ender's Game,Orson Scott Card,1994,Tor Books,http://images.amazon.com/images/P/0812550706.0...,312853238,30.0,cincinnati,ohio,usa,1,278
63,163202,812550706,1,Ender's Game,Orson Scott Card,1994,Tor Books,http://images.amazon.com/images/P/0812550706.0...,312853238,26.0,los angeles,california,usa,1,278
64,227520,812550706,1,Ender's Game,Orson Scott Card,1994,Tor Books,http://images.amazon.com/images/P/0812550706.0...,312853238,33.0,san jose,california,usa,1,278


In [50]:
#main_df_implicit.to_csv( 'main_df_implicit.csv', index = False)
#pd.read_csv(os.getcwd() + 'main_df_implicit.csv').head()

# 3 EDA and Visualisation

In the web app.

# 4 Model Building

Using explicit ratings.

## 4.1 Nearest Neighbours

### 4.1.1 Model

In [51]:
from scipy.sparse import csr_matrix 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors

In [52]:
wide_df = main_df_explicit.pivot_table(
    columns='user_id', 
    index='isbn_unique', 
    values='book_rating'
    )

wide_df.fillna(0, inplace=True)

wide_sparse = csr_matrix(wide_df)

In [53]:
model_knn = NearestNeighbors(
    algorithm='brute',
    metric = 'cosine'
)

model_knn.fit(
    wide_sparse
    )

NearestNeighbors(algorithm='brute', metric='cosine')

In [54]:
print( main_df_explicit.isbn_unique.nunique() == wide_df.shape[0] )
print( main_df_explicit.user_id.nunique() == wide_df.shape[1] )

True
True


### 4.1.2 Custom Evaluation Metrics

In [55]:
n_rec = 5

# create dictionary with every book from the df and its corresponding top5 recommendations
hr_dict = {}
for book_index in range(wide_df.shape[0]):
    
    book_isbn = wide_df.axes[0].tolist()[book_index]

    _, recommendations = model_knn.kneighbors(
        wide_df.iloc[book_index, :].values.reshape(1,-1), 
        n_neighbors = n_rec + 1
        )
    
    recommendations_isbn = []
    for rec in recommendations.flatten():
        rec_isbn = wide_df.axes[0].tolist()[rec] 
        if rec_isbn != book_isbn:
            recommendations_isbn.append( rec_isbn )

    hr_dict.update( { book_isbn : recommendations_isbn } )

In [56]:
# convert dict keys and values to lists
books_list = list(hr_dict.keys())
top_5_list = list(hr_dict.values())

In [57]:
# hit = min 1 recommended book has the same author as the input book
hits = 0
for index in range(len(books_list)):

    sub_df_book = books_info_unique[books_info_unique['isbn_unique'] == books_list[index] ]
    book_author = sub_df_book.iloc[0]['book_author'].lower()

    sub_df_author = books_info_unique[ books_info_unique['book_author'].str.lower() == book_author ].drop_duplicates(subset=['isbn_unique'], keep = 'first')

    count = 0
    for rec_book in top_5_list[index]:
        count += sub_df_author[sub_df_author['isbn_unique'] == rec_book].shape[0]

    if count > 0:
        hits += 1

hr = hits / len(books_list)

print(f'Number of hits is {hits} (hit = book from the same author recommended).')
print(f'Hit ratio is {round(hr, 4)}.')

Number of hits is 780 (hit = book from the same author recommended).
Hit ratio is 0.5878.


In [58]:
# hit = min 1 recommended book has the same author as the input book
# hit ratio adjusted = valid only if author wrote min 2 books
hits = 0
author_count = 0
for index in range(len(books_list)):

    sub_df_book = books_info_unique[books_info_unique['isbn_unique'] == books_list[index] ]
    book_author = sub_df_book.iloc[0]['book_author'].lower()

    sub_df_author = books_info_unique[ books_info_unique['book_author'].str.lower() == book_author ].drop_duplicates(subset=['isbn_unique'], keep = 'first')
    if sub_df_author.shape[0] > 1:
        author_count += 1

    count = 0
    for rec_book in top_5_list[index]:
        count += sub_df_author[sub_df_author['isbn_unique'] == rec_book].shape[0]

    if count > 0:
        hits += 1

hr_adj = hits / author_count

print(f'Number of hits is {hits} (hit = book from the same author recommended).')
print(f'Hit ratio is {round(hr_adj, 4)}.')

Number of hits is 780 (hit = book from the same author recommended).
Hit ratio is 0.595.


In [59]:
# hit = min 1 recommended book was read by min 1 user that has also read the input book
hits = 0
for index in range(len(books_list)):

    sub_df_user = main_df_explicit[main_df_explicit['isbn_unique'] == books_list[index] ]

    for _, row in sub_df_user.iterrows():

        sub_df = main_df_explicit[ main_df_explicit['user_id'] == row['user_id'] ]

        count = 0
        for rec_book in top_5_list[index]:
            count += sub_df[sub_df['isbn_unique'] == rec_book].shape[0]
                
        if count > 0:
            hits += 1
            break

hr = hits / len(books_list)

print(f'Number of hits is {hits} (hit = min 1 recommended book was read by min 1 user that has also read the input book).')
print(f'Hit ratio is {hr}.')

Number of hits is 1327 (hit = min 1 recommended book was read by min 1 user that has also read the input book).
Hit ratio is 1.0.


In [60]:
# hit = each of the recommended book was read by the users that have also read the input book
hits = 0
for index in range(len(books_list)):

    sub_df_user = main_df_explicit[main_df_explicit['isbn_unique'] == books_list[index] ]

    count = 0
    for rec_book in top_5_list[index]:

        for _, row in sub_df_user.iterrows():

            sub_df = main_df_explicit[ main_df_explicit['user_id'] == row['user_id'] ]

            if sub_df[sub_df['isbn_unique'] == rec_book].shape[0] > 0:
                count += 1
                break
                
    if count == n_rec:
        hits += 1

hr = hits / len(books_list)

print(f'Number of hits is {hits} (hit = each of the recommended book was read by the users that have also read the input book).')
print(f'Hit ratio is {hr}.')

Number of hits is 1327 (hit = each of the recommended book was read by the users that have also read the input book).
Hit ratio is 1.0.


In [61]:
# ratio of the users, that have read the input book and also min 1 of its recommendations
perc_hit = []
for index in range(len(books_list)):

    sub_df_user = main_df_explicit[main_df_explicit['isbn_unique'] == books_list[index] ]

    count = 0
    for _, row in sub_df_user.iterrows():

        sub_df = main_df_explicit[ main_df_explicit['user_id'] == row['user_id'] ]

        for rec_book in top_5_list[index]:

            if sub_df[sub_df['isbn_unique'] == rec_book].shape[0] > 0:
                count += 1
                break
    
    perc_hit.append(
        count / sub_df_user.shape[0]
    )

print(f'Ratio of the users, that have read the input book and also min 1 of its recommendations {round(np.mean(perc_hit), 4)}.')

Ratio of the users, that have read the input book and also min 1 of its recommendations 0.3305.


In [62]:
# sanity check
all(i <= 1 for i in perc_hit)

True

## 4.2 TF model

In [63]:
import tensorflow as tf

In [64]:
def build_MF_model(n_users, n_books, emb_size):

      input_user = tf.keras.layers.Input(shape=(1,), name='input_user')
      input_book = tf.keras.layers.Input(shape=(1,), name='input_book')

      emb_user = tf.keras.layers.Embedding(
            input_dim= n_users + 1,
            output_dim= emb_size,
            embeddings_initializer='uniform',
            name='emb_user'
          )(input_user)
      
      emb_book = tf.keras.layers.Embedding(
            input_dim= n_books + 1,
            output_dim= emb_size,
            name= 'emb_book'
          )(input_book)

      X = tf.keras.layers.multiply([emb_user, emb_book])
      X = tf.keras.layers.Dense(units = 1)(X)
      
      model = tf.keras.Model(inputs = [input_user, input_book], outputs = X)
      model.compile(
              optimizer =tf.keras.optimizers.Adam(lr = 0.01), 
              loss = 'mae',
              metrics = 'mae'
            )
      
      return model

In [65]:
n_users = len(main_df_explicit.user_id.unique())
n_books = len(main_df_explicit.isbn_unique.unique())

In [66]:
mf_model = build_MF_model(n_users, n_books, 16)
mf_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_user (InputLayer)        [(None, 1)]          0           []                               
                                                                                                  
 input_book (InputLayer)        [(None, 1)]          0           []                               
                                                                                                  
 emb_user (Embedding)           (None, 1, 16)        505424      ['input_user[0][0]']             
                                                                                                  
 emb_book (Embedding)           (None, 1, 16)        21248       ['input_book[0][0]']             
                                                                                              

In [67]:
  from sklearn.preprocessing import LabelEncoder

  user_encoder = LabelEncoder()
  book_encoder = LabelEncoder()

  user_ids_encoded = user_encoder.fit_transform(main_df_explicit.user_id.values)
  book_ids_encoded = book_encoder.fit_transform(main_df_explicit.isbn_unique.values)

In [68]:
mf_model.fit(
        x=[user_ids_encoded, book_ids_encoded],
        y=main_df_explicit.book_rating.values,
        validation_split=0.1,
        #callbacks=[early_stopping],
        epochs=5,
        batch_size=128,
        #verbose=1
    )

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fb13d0bb690>

In [69]:
mf_model.predict([user_ids_encoded, book_ids_encoded])

array([[[8.018869 ]],

       [[8.024233 ]],

       [[8.03032  ]],

       ...,

       [[7.6810064]],

       [[7.6815453]],

       [[7.6822515]]], dtype=float32)

# 5 Model Deployment

## 5.1 Fuzzy String Matching

In [70]:
pip install fuzzywuzzy

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [71]:
from fuzzywuzzy import fuzz
from PIL import Image
import matplotlib.pyplot as plt
import requests

In [72]:
def book_title_matching(book_title, book_df):

    book_df_unique = book_df.drop_duplicates(
        subset = 'book_title', keep="first"
        )

    best_ratio = 0

    for title in book_df_unique.book_title:

        ratio = fuzz.ratio(
            title.lower(), 
            book_title.lower()
            )
        
        if ratio >= best_ratio:
            best_ratio = ratio
            best_match = title

    return [ best_match, book_df_unique.loc[book_df_unique['book_title'] == best_match, 'isbn_unique'].iloc[0] ]

In [73]:
book_title = 'The lord of the rings'
match = book_title_matching(book_title, main_df_explicit)
match

['The Lord of the Rings', '0618129022']

In [74]:
def print_book_recommemndation(best_match, wide_df, book_df, model, n_rec):
    
    best_match_index = wide_df.index.tolist().index(best_match[1])

    distances, recommendations = model.kneighbors(
        wide_df.iloc[best_match_index, :].values.reshape(1,-1), 
        n_neighbors = n_rec + 1
        )

    for i in range(0, n_rec + 1):
        if i == 0:
            print('Recommendations for {0}:'.format(best_match[0]))
        else:
            print('{0}: {1}, with distance of {2}:'.format(
                i, 
                book_df.loc[book_df['isbn_unique'] == wide_df.index[recommendations.flatten()[i]], 'book_title'].iloc[0],
                distances.flatten()[i]
                ))

In [75]:
def book_recommemndation_img(best_match, wide_df, book_df, model, n_rec):
    best_match_index = wide_df.index.tolist().index(best_match[1])

    distances, recommendations = model.kneighbors(
        wide_df.iloc[best_match_index, :].values.reshape(1,-1), 
        n_neighbors = n_rec + 1
        )

    plt.figure(figsize = (20,50))
    for i in range(2, n_rec):
        title = book_df.loc[book_df['isbn_unique'] == wide_df.index[recommendations.flatten()[i]], 'book_title'].iloc[0]
        distance = distances.flatten()[i]
        url = book_df.loc[book_df['isbn_unique'] == wide_df.index[recommendations.flatten()[i]], 'Image-URL-L'].iloc[0]

        plt.subplot(1, n_rec, i+1)
        im = Image.open(requests.get(url, stream=True).raw)
        plt.imshow(im)
        plt.axis("off")

## 5.2 Streamlit App

Streamlit script.