In [1]:
import pandas as pd
import glob
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

In [2]:
 from sklearn.metrics.pairwise import cosine_similarity

In [3]:
from pathlib import Path
import os

In [4]:
path = os.path.join(Path.cwd() , 'data')
all_files = glob.glob(path + "/*.csv")
files = []

for filename in all_files:
    df = pd.read_csv(filename, header=0, encoding='unicode_escape')
    files.append(df)

data = pd.concat(files, axis=0, ignore_index=True)

In [5]:
data.isnull().sum()

Id                    0
Title                 0
Author                0
Average Rating        0
Number of ratings     0
Year of publishing    0
Url of image          0
Genre                 0
dtype: int64

In [6]:
duplicate = data[data.duplicated(keep=False)]
duplicate.sort_values(['Title'])

Unnamed: 0,Id,Title,Author,Average Rating,Number of ratings,Year of publishing,Url of image,Genre
2403,40961427-1984,1984 (Kindle Edition),George Orwell,4.19,3610028,1949,https://i.gr-assets.com/images/S/compressed.ph...,classics
3079,40961427-1984,1984 (Kindle Edition),George Orwell,4.19,3610028,1949,https://i.gr-assets.com/images/S/compressed.ph...,classics
19011,778581.A_History_of_Religious_Ideas_1,A History of Religious Ideas 1: From the Stone...,Mircea Eliade,4.44,1385,1975,https://i.gr-assets.com/images/S/compressed.ph...,religion
19968,778581.A_History_of_Religious_Ideas_1,A History of Religious Ideas 1: From the Stone...,Mircea Eliade,4.44,1385,1975,https://i.gr-assets.com/images/S/compressed.ph...,religion
2571,5148.A_Separate_Peace,A Separate Peace (Paperback),John Knowles,3.59,200133,1959,https://i.gr-assets.com/images/S/compressed.ph...,classics
...,...,...,...,...,...,...,...,...
20666,50160619-you-should-see-me-in-a-crown,You Should See Me in a Crown (Hardcover),Leah Johnson,4.17,29982,2020,https://i.gr-assets.com/images/S/compressed.ph...,romance
30438,50160619-you-should-see-me-in-a-crown,You Should See Me in a Crown (Hardcover),Leah Johnson,4.17,29982,2020,https://i.gr-assets.com/images/S/compressed.ph...,young-adult
30437,50160619-you-should-see-me-in-a-crown,You Should See Me in a Crown (Hardcover),Leah Johnson,4.17,29982,2020,https://i.gr-assets.com/images/S/compressed.ph...,young-adult
20667,50160619-you-should-see-me-in-a-crown,You Should See Me in a Crown (Hardcover),Leah Johnson,4.17,29982,2020,https://i.gr-assets.com/images/S/compressed.ph...,romance


In [7]:
data = data.drop_duplicates()

In [8]:
data['Year of publishing'].unique()

array(['2011', '1947', '2018', '2010', '2015', '2001', '2004', '2007',
       '2009', '2016', '1996', '1965', '2012', '2003', '2017', '1985',
       '1956', '2005', '1969', '2014', '1791', '1979', '2020', '1971',
       '1994', '1995', '1998', '1946', '1845', '2006', '1997', '2000',
       '1903', '2008', '1927', '1991', '2013', '1974', '1981', '1992',
       '1967', '2002', '1982', '2019', '1853', '1980', '1983', '1955',
       '1999', '1986', '1977', '1972', '400', '1988', '1984', '1959',
       '1900', '1993', '1964', '1990', '1925', '1973', 'Null', '1942',
       '2021', '1978', '1970', '1861', '1933', '1950', '1854', '1790',
       '1958', '121', '1957', '1937', '1929', '1938', '1961', '1987',
       '1857', '1948', '1963', '1934', '1919', '1966', '1885', '1989',
       '1962', '1945', '1976', '1926', '1949', '1953', '1932', '1550',
       '1898', '1922', '1855', '1918', '1914', '1960', '100', '1935',
       '1782', '1954', '1975', '1968', '1930', '1789', '1951', '1923',
       '1

In [9]:
data['Year of publishing'] = data['Year of publishing'].astype('str')

In [10]:
for i in range(data.shape[0]):
    if len(data.iloc[i, 5]) != 4:
        data.iloc[i, 5] = "Null"
    elif '-' in data.iloc[i, 5]:
        data.iloc[i, 5] = "Null"

In [11]:
data[data['Year of publishing']=="Null"]

Unnamed: 0,Id,Title,Author,Average Rating,Number of ratings,Year of publishing,Url of image,Genre
143,27037.Confession,Confessions (Paperback),Augustine of Hippo,3.89,57612,Null,https://i.gr-assets.com/images/S/compressed.ph...,biography
203,639864.Autobiography_of_a_Yogi,Autobiography of a Yogi (Paperback),Paramahansa Yogananda,4.18,57218,Null,https://i.gr-assets.com/images/S/compressed.ph...,biography
299,29022.The_Twelve_Caesar,The Twelve Caesars (Paperback),Suetonius,3.94,20208,Null,https://i.gr-assets.com/images/S/compressed.ph...,biography
681,415634.Plutarch_s_Live,Plutarch's Lives: Volume I (Paperback),Plutarch,4.10,5477,Null,https://i.gr-assets.com/images/S/compressed.ph...,biography
1097,36664343-tiger-wood,Tiger Woods (Hardcover),Jeff Benedict,4.35,6642,Null,https://i.gr-assets.com/images/S/compressed.ph...,biography
...,...,...,...,...,...,...,...,...
29864,6551734-sweden,Sweden (Paperback),Becky Ohlsen,3.85,183,Null,https://i.gr-assets.com/images/S/compressed.ph...,travel
29899,43783430-a-month-in-siena,A Month in Siena (Hardcover),Hisham Matar,4.06,1334,Null,https://i.gr-assets.com/images/S/compressed.ph...,travel
29935,1362.The_Historie,The Histories (Paperback),Herodotus,3.96,47156,Null,https://i.gr-assets.com/images/S/compressed.ph...,travel
29937,41181492-booked,Booked: A Traveler's Guide to Literary Locatio...,Richard Kreitner,3.67,235,Null,https://i.gr-assets.com/images/S/compressed.ph...,travel


In [12]:
data.drop(data[data['Year of publishing']=="Null"].index, inplace = True)

In [13]:
data['Average Rating'].unique()

array([4.15, 4.17, 4.5 , 4.35, 4.21, 4.04, 4.46, 4.12, 4.07, 4.45, 3.95,
       3.99, 4.32, 4.01, 4.14, 4.24, 4.34, 4.22, 3.7 , 4.36, 4.25, 3.83,
       4.28, 4.08, 3.9 , 4.18, 3.84, 4.37, 4.41, 4.02, 3.91, 4.1 , 4.09,
       4.39, 3.59, 3.85, 3.96, 4.31, 4.06, 3.8 , 3.82, 4.23, 3.69, 3.79,
       3.92, 4.11, 3.86, 3.81, 3.78, 3.77, 4.33, 3.88, 4.03, 3.93, 4.19,
       4.13, 3.73, 3.97, 3.65, 3.71, 4.2 , 4.05, 3.72, 3.89, 4.38, 3.87,
       4.51, 4.29, 3.62, 3.94, 4.  , 3.98, 4.16, 3.19, 3.29, 3.66, 3.75,
       4.58, 3.6 , 3.76, 3.67, 4.26, 4.71, 3.68, 3.64, 3.74, 3.63, 3.52,
       3.58, 3.48, 3.53, 4.57, 4.63, 4.66, 4.42, 3.32, 4.48, 3.57, 3.4 ,
       3.34, 4.49, 3.44, 4.3 , 4.27, 4.44, 4.54, 3.61, 4.6 , 4.56, 3.41,
       4.52, 4.43, 3.28, 3.56, 3.1 , 3.46, 3.51, 4.53, 4.47, 3.55, 4.62,
       3.45, 3.39, 4.4 , 3.54, 3.18, 3.42, 4.55, 3.43, 4.59, 3.38, 3.33,
       3.36, 3.47, 3.5 , 4.61, 3.15, 2.79, 3.27, 3.31, 3.3 , 3.17, 3.03,
       3.35, 3.49, 3.21, 4.77, 4.65, 4.64, 4.68, 4.

In [14]:
data.loc[data['Average Rating'] == 0.]

Unnamed: 0,Id,Title,Author,Average Rating,Number of ratings,Year of publishing,Url of image,Genre
14638,9430476-in-your-40s-judy-valon,In Your 40s. Judy Valon (Hardcover),Valon,0.0,0,2009,https://i.gr-assets.com/images/S/compressed.ph...,humour-and-comedy


In [15]:
data = data[data['Average Rating'] != 0.]

In [16]:
for i in range(data.shape[0]):
    data.iloc[i, 7] = ' '.join(data.iloc[i,7].split('-'))

In [17]:
df1 = data.groupby('Title')['Genre'].apply(lambda x: "%s" % ','.join(x)).reset_index()
df1

Unnamed: 0,Title,Genre
0,"""A Problem from Hell"": America and the Age of ...",history
1,"""Believing Women"" in Islam: Unreading Patriarc...",religion
2,"""Surely You're Joking, Mr. Feynman!"": Adventur...","biography,humour and comedy,nonfiction"
3,"""What Do You Care What Other People Think?"": F...",biography
4,#AskGaryVee: One Entrepreneur's Take on Leader...,business
...,...,...
20323,æ±äº¬å°ç¨®ãã¼ã­ã§ã¼ã°ã¼ã« 2 (Tokyo ...,horror
20324,æ±äº¬å°ç¨®ãã¼ã­ã§ã¼ã°ã¼ã« 3 (Tokyo ...,horror
20325,æ±äº¬å°ç¨®ãã¼ã­ã§ã¼ã°ã¼ã« 4 [Tokyo ...,horror
20326,æ¥µä¸»å¤«é 3 (Gokushufudou: The Way of the H...,graphic_novels


In [18]:
data = pd.merge(df1, data, on='Title')

In [19]:
data

Unnamed: 0,Title,Genre_x,Id,Author,Average Rating,Number of ratings,Year of publishing,Url of image,Genre_y
0,"""A Problem from Hell"": America and the Age of ...",history,368731._A_Problem_from_Hell_,Samantha Power,4.25,9582,2002,https://i.gr-assets.com/images/S/compressed.ph...,history
1,"""Believing Women"" in Islam: Unreading Patriarc...",religion,530114._Believing_Women_in_Islam,Asma Barlas,4.28,1267,2002,https://i.gr-assets.com/images/S/compressed.ph...,religion
2,"""Surely You're Joking, Mr. Feynman!"": Adventur...","biography,humour and comedy,nonfiction",35167685-surely-you-re-joking-mr-feynman,Richard P. Feynman,4.24,180244,1985,https://i.gr-assets.com/images/S/compressed.ph...,biography
3,"""Surely You're Joking, Mr. Feynman!"": Adventur...","biography,humour and comedy,nonfiction",35167685-surely-you-re-joking-mr-feynman,Richard P. Feynman,4.24,180238,1985,https://i.gr-assets.com/images/S/compressed.ph...,humour and comedy
4,"""Surely You're Joking, Mr. Feynman!"": Adventur...","biography,humour and comedy,nonfiction",35167685-surely-you-re-joking-mr-feynman,Richard P. Feynman,4.24,180238,1985,https://i.gr-assets.com/images/S/compressed.ph...,nonfiction
...,...,...,...,...,...,...,...,...,...
30599,æ±äº¬å°ç¨®ãã¼ã­ã§ã¼ã°ã¼ã« 2 (Tokyo ...,horror,20743809-2,Sui Ishida,4.45,27820,2012,https://i.gr-assets.com/images/S/compressed.ph...,horror
30600,æ±äº¬å°ç¨®ãã¼ã­ã§ã¼ã°ã¼ã« 3 (Tokyo ...,horror,20748818-3,Sui Ishida,4.41,15523,2012,https://i.gr-assets.com/images/S/compressed.ph...,horror
30601,æ±äº¬å°ç¨®ãã¼ã­ã§ã¼ã°ã¼ã« 4 [Tokyo ...,horror,22447379-4-tokyo-guru-4,Sui Ishida,4.39,13287,2012,https://i.gr-assets.com/images/S/compressed.ph...,horror
30602,æ¥µä¸»å¤«é 3 (Gokushufudou: The Way of the H...,graphic_novels,43619964-3,Kousuke Oono,4.37,3356,2019,https://i.gr-assets.com/images/S/compressed.ph...,graphic_novels


In [20]:
duplicate = data[data.duplicated(keep=False)]
duplicate.shape

(12, 9)

In [21]:
duplicate = data[data['Title'].duplicated(keep=False)]
duplicate.sort_values(['Title'])

Unnamed: 0,Title,Genre_x,Id,Author,Average Rating,Number of ratings,Year of publishing,Url of image,Genre_y
2,"""Surely You're Joking, Mr. Feynman!"": Adventur...","biography,humour and comedy,nonfiction",35167685-surely-you-re-joking-mr-feynman,Richard P. Feynman,4.24,180244,1985,https://i.gr-assets.com/images/S/compressed.ph...,biography
3,"""Surely You're Joking, Mr. Feynman!"": Adventur...","biography,humour and comedy,nonfiction",35167685-surely-you-re-joking-mr-feynman,Richard P. Feynman,4.24,180238,1985,https://i.gr-assets.com/images/S/compressed.ph...,humour and comedy
4,"""Surely You're Joking, Mr. Feynman!"": Adventur...","biography,humour and comedy,nonfiction",35167685-surely-you-re-joking-mr-feynman,Richard P. Feynman,4.24,180238,1985,https://i.gr-assets.com/images/S/compressed.ph...,nonfiction
7,#Girlboss (Hardcover),"biography,business,nonfiction,self help",18667945-girl,Sophia Amoruso,3.66,73905,2014,https://i.gr-assets.com/images/S/compressed.ph...,biography
8,#Girlboss (Hardcover),"biography,business,nonfiction,self help",18667945-girl,Sophia Amoruso,3.66,73904,2014,https://i.gr-assets.com/images/S/compressed.ph...,business
...,...,...,...,...,...,...,...,...,...
30576,Ø±Ø¨Ø§Ø¹ÙØ§Øª Ø®ÙØ§Ù (Hardcover),"classics,poetry,spirituality",716696._,Omar KhayyÃ¡m,4.18,19823,1120,https://i.gr-assets.com/images/S/compressed.ph...,poetry
30575,Ø±Ø¨Ø§Ø¹ÙØ§Øª Ø®ÙØ§Ù (Hardcover),"classics,poetry,spirituality",716696._,Omar KhayyÃ¡m,4.18,19823,1120,https://i.gr-assets.com/images/S/compressed.ph...,classics
30577,Ø±Ø¨Ø§Ø¹ÙØ§Øª Ø®ÙØ§Ù (Hardcover),"classics,poetry,spirituality",716696._,Omar KhayyÃ¡m,4.18,19823,1120,https://i.gr-assets.com/images/S/compressed.ph...,spirituality
30585,ÙØ§ ØªØ­Ø²Ù (Paperback),"religion,self help",2750180,Ø¹Ø§Ø¦Ø¶ Ø§ÙÙØ±ÙÙ,4.13,29098,2003,https://i.gr-assets.com/images/S/compressed.ph...,religion


In [22]:
data = data.drop_duplicates(subset=['Title']).reset_index()

In [23]:
x = data.Genre_x
a = list()
for i in x:
    abc = i
    a.append(abc.split(','))
a = pd.DataFrame(a)   
b = a[0].unique()
for i in b:
    data[i] = 0
for i in b:
    data.loc[data['Genre_x'].str.contains(i), i] = 1


In [24]:
data.drop(['index'], axis = 1, inplace = True)


In [25]:
df = data.drop(['Id', 'Author', 'Average Rating',
       'Number of ratings', 'Year of publishing', 'Url of image', 'Genre_y', 'Genre_x'], axis = 1)

In [26]:
df.head()

Unnamed: 0,Title,history,religion,biography,business,sports,poetry,nonfiction,horror,fiction,...,thriller,science fiction,classics,fantasy,spirituality,graphic_novels,mystery,historical_fiction,young adult,suspense
0,"""A Problem from Hell"": America and the Age of ...",1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"""Believing Women"" in Islam: Unreading Patriarc...",0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"""Surely You're Joking, Mr. Feynman!"": Adventur...",0,0,1,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
3,"""What Do You Care What Other People Think?"": F...",0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,#AskGaryVee: One Entrepreneur's Take on Leader...,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
df2 = df.drop(['Title'], axis=1)
df2

Unnamed: 0,history,religion,biography,business,sports,poetry,nonfiction,horror,fiction,travel,...,thriller,science fiction,classics,fantasy,spirituality,graphic_novels,mystery,historical_fiction,young adult,suspense
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20323,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
20324,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
20325,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
20326,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [28]:
df.head()

Unnamed: 0,Title,history,religion,biography,business,sports,poetry,nonfiction,horror,fiction,...,thriller,science fiction,classics,fantasy,spirituality,graphic_novels,mystery,historical_fiction,young adult,suspense
0,"""A Problem from Hell"": America and the Age of ...",1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"""Believing Women"" in Islam: Unreading Patriarc...",0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"""Surely You're Joking, Mr. Feynman!"": Adventur...",0,0,1,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
3,"""What Do You Care What Other People Think?"": F...",0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,#AskGaryVee: One Entrepreneur's Take on Leader...,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
data.shape

(20328, 34)

In [30]:
data.columns

Index(['Title', 'Genre_x', 'Id', 'Author', 'Average Rating',
       'Number of ratings', 'Year of publishing', 'Url of image', 'Genre_y',
       'history', 'religion', 'biography', 'business', 'sports', 'poetry',
       'nonfiction', 'horror', 'fiction', 'travel', 'romance', 'self help',
       'humour and comedy', 'comics', 'crime', 'thriller', 'science fiction',
       'classics', 'fantasy', 'spirituality', 'graphic_novels', 'mystery',
       'historical_fiction', 'young adult', 'suspense'],
      dtype='object')

In [31]:
data = data[['Title', 'Genre_x', 'Id', 'Author', 'Average Rating',
       'Number of ratings', 'Year of publishing', 'Url of image']]

In [32]:
def combine_features(row):
      return row['Author']+" "+row['Genre_x']
data["combined_features"] = data.apply(combine_features,axis=1) 

In [33]:
tfidf = TfidfVectorizer(stop_words='english', strip_accents='ascii')
vec = tfidf.fit_transform(data['combined_features'])

In [34]:
df1 = pd.DataFrame(vec.toarray(), columns=tfidf.get_feature_names())


In [35]:
df1.head()

Unnamed: 0,1uu,33,37,50,aapek,aaron,aaronovitch,aarons,abagnale,abaka,...,zora,zoraida,zsuzsanna,zub,zuboff,zuckerman,zuckoff,zukav,zusak,zweig
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
data.drop(['Genre_x', 'Author', 'Url of image', 'combined_features'], axis = 1, inplace = True)

In [37]:
for i in range(data.shape[0]):
    data["Title"][i] = ''.join(data["Title"][i].split("(Paperback)"))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["Title"][i] = ''.join(data["Title"][i].split("(Paperback)"))


In [38]:
data = pd.concat([data, df1], axis=1)


In [39]:
df = data.drop(['Title', 'Id'], axis = 1)


In [40]:
df.shape

(20328, 8521)

In [41]:
df.head()

Unnamed: 0,Average Rating,Number of ratings,Year of publishing,1uu,33,37,50,aapek,aaron,aaronovitch,...,zora,zoraida,zsuzsanna,zub,zuboff,zuckerman,zuckoff,zukav,zusak,zweig
0,4.25,9582,2002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4.28,1267,2002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.24,180244,1985,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.11,28974,1988,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4.0,4193,2016,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
# cosine_sim = cosine_similarity(df) The file after executing this step is provided in the README.md
