In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_colwidth', -1)

  


## Begin

In [2]:
books = pd.read_csv('Books.csv')
books.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0195153448.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0195153448.01.LZZZZZZZ.jpg
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0002005018.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0002005018.01.LZZZZZZZ.jpg
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0060973129.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0060973129.01.LZZZZZZZ.jpg
3,374157065,Flu: The Story of the Great Influenza Pandemic of 1918 and the Search for the Virus That Caused It,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0374157065.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0374157065.01.LZZZZZZZ.jpg
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0393045218.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0393045218.01.LZZZZZZZ.jpg


In [3]:
ratings = pd.read_csv('Ratings.csv')
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [4]:
print(books.shape)
print(ratings.shape)

(271360, 8)
(1149780, 3)


In [5]:
# rename columns
books.columns = ['ISBN', 'bookTitle', 'bookAuthor', 'yearOfPublication', 'publisher', 'imageUrlS', 'imageUrlM', 'imageUrlL']
ratings.columns = ['userID', 'ISBN', 'rating']

In [6]:
# drop columns with image link
books.drop(['imageUrlS', 'imageUrlM', 'imageUrlL'],axis=1,inplace=True)

## Data Clearing

In [7]:
# null rows count
books.isnull().sum()

ISBN                 0
bookTitle            0
bookAuthor           1
yearOfPublication    0
publisher            2
dtype: int64

### Book Title

In [8]:
books.bookTitle.describe()

count     271360        
unique    242135        
top       Selected Poems
freq      27            
Name: bookTitle, dtype: object

In [9]:
# all rows are string
[i for i in books.bookTitle if isinstance(i, str) == False]

[]

### Author

In [10]:
books.bookAuthor.describe()

count     271359         
unique    102023         
top       Agatha Christie
freq      632            
Name: bookAuthor, dtype: object

In [11]:
# row with null author
books[pd.isna(books.bookAuthor) == True]

Unnamed: 0,ISBN,bookTitle,bookAuthor,yearOfPublication,publisher
187689,9627982032,The Credit Suisse Guide to Managing Your Personal Wealth,,1995,Edinburgh Financial Publishing


In [12]:
# replacement of null author with unknown
books.loc[pd.isna(books.bookAuthor) == True, 'bookAuthor'] = "unknown"
books[pd.isna(books.bookAuthor) == True]

Unnamed: 0,ISBN,bookTitle,bookAuthor,yearOfPublication,publisher


In [13]:
books[books.bookAuthor == "unknown"]

Unnamed: 0,ISBN,bookTitle,bookAuthor,yearOfPublication,publisher
96703,1561449164,Pound It (Popular Mechanics for Kids),unknown,1997,Modern Publishing
187689,9627982032,The Credit Suisse Guide to Managing Your Personal Wealth,unknown,1995,Edinburgh Financial Publishing
267861,960638822,A Course In Miracles Workbook for students Manual for Teachers (Vol I II III) (volume I II III),unknown,1985,Foundation For Inner Peace


### Year of Publication

In [14]:
books.yearOfPublication.describe()

count     271360
unique    202   
top       2002  
freq      13903 
Name: yearOfPublication, dtype: int64

In [15]:
# part of the years are text, some are words
books.yearOfPublication.unique()

array([2002, 2001, 1991, 1999, 2000, 1993, 1996, 1988, 2004, 1998, 1994,
       2003, 1997, 1983, 1979, 1995, 1982, 1985, 1992, 1986, 1978, 1980,
       1952, 1987, 1990, 1981, 1989, 1984, 0, 1968, 1961, 1958, 1974,
       1976, 1971, 1977, 1975, 1965, 1941, 1970, 1962, 1973, 1972, 1960,
       1966, 1920, 1956, 1959, 1953, 1951, 1942, 1963, 1964, 1969, 1954,
       1950, 1967, 2005, 1957, 1940, 1937, 1955, 1946, 1936, 1930, 2011,
       1925, 1948, 1943, 1947, 1945, 1923, 2020, 1939, 1926, 1938, 2030,
       1911, 1904, 1949, 1932, 1928, 1929, 1927, 1931, 1914, 2050, 1934,
       1910, 1933, 1902, 1924, 1921, 1900, 2038, 2026, 1944, 1917, 1901,
       2010, 1908, 1906, 1935, 1806, 2021, '2000', '1995', '1999', '2004',
       '2003', '1990', '1994', '1986', '1989', '2002', '1981', '1993',
       '1983', '1982', '1976', '1991', '1977', '1998', '1992', '1996',
       '0', '1997', '2001', '1974', '1968', '1987', '1984', '1988',
       '1963', '1956', '1970', '1985', '1978', '1973', '1980'

In [16]:
# years that are words
[i for i in books.yearOfPublication if isinstance(i, int) == False and i.isnumeric() == False]

['DK Publishing Inc', 'Gallimard', 'DK Publishing Inc']

In [17]:
# cause of problem: semicolon between double quotes prevents correct separation of title and author
for idx, row in enumerate(books.itertuples()):
  year = books.loc[row[0], 'yearOfPublication']
  if isinstance(year, int) == False and year.isnumeric() == False:
    cod = books.loc[row[0], 'ISBN']
    title = books.loc[row[0], 'bookTitle']
    print("ISBN: {cod}\n\t Title: {title}\n\t Year of Publication: {year}\n".format(cod = cod, title = title, year = year))

ISBN: 078946697X
	 Title: DK Readers: Creating the X-Men, How It All Began (Level 4: Proficient Readers)\";Michael Teitelbaum"
	 Year of Publication: DK Publishing Inc

ISBN: 2070426769
	 Title: Peuple du ciel, suivi de 'Les Bergers\";Jean-Marie Gustave Le ClÃ?Â©zio"
	 Year of Publication: Gallimard

ISBN: 0789466953
	 Title: DK Readers: Creating the X-Men, How Comic Books Come to Life (Level 4: Proficient Readers)\";James Buckley"
	 Year of Publication: DK Publishing Inc



In [18]:
#ISBN '078946697X'
books.loc[books.ISBN == '078946697X','yearOfPublication'] = 2000
books.loc[books.ISBN == '078946697X','bookAuthor'] = "Michael Teitelbaum"
books.loc[books.ISBN == '078946697X','publisher'] = "DK Publishing Inc"
books.loc[books.ISBN == '078946697X','bookTitle'] = "DK Readers: Creating the X-Men, How It All Began (Level 4: Proficient Readers)"

#ISBN '0789466953'
books.loc[books.ISBN == '0789466953','yearOfPublication'] = 2000
books.loc[books.ISBN == '0789466953','bookAuthor'] = "James Buckley"
books.loc[books.ISBN == '0789466953','publisher'] = "DK Publishing Inc"
books.loc[books.ISBN == '0789466953','bookTitle'] = "DK Readers: Creating the X-Men, How Comic Books Come to Life (Level 4: Proficient Readers)"

#ISBN '0789466953'
books.loc[books.ISBN == '2070426769','yearOfPublication'] = 2003
books.loc[books.ISBN == '2070426769','bookAuthor'] = "Jean-Marie Gustave Le ClÃ?Â©zio"
books.loc[books.ISBN == '2070426769','publisher'] = "Gallimard"
books.loc[books.ISBN == '2070426769','bookTitle'] = "Peuple du ciel, suivi de 'Les Bergers"

In [19]:
books.loc[books.ISBN == '078946697X']

Unnamed: 0,ISBN,bookTitle,bookAuthor,yearOfPublication,publisher
209538,078946697X,"DK Readers: Creating the X-Men, How It All Began (Level 4: Proficient Readers)",Michael Teitelbaum,2000,DK Publishing Inc


In [20]:
# change data type to integer
books.yearOfPublication = books.yearOfPublication.astype(np.int32)

In [21]:
print(sorted(books['yearOfPublication'].unique()))

[0, 1376, 1378, 1806, 1897, 1900, 1901, 1902, 1904, 1906, 1908, 1909, 1910, 1911, 1914, 1917, 1919, 1920, 1921, 1922, 1923, 1924, 1925, 1926, 1927, 1928, 1929, 1930, 1931, 1932, 1933, 1934, 1935, 1936, 1937, 1938, 1939, 1940, 1941, 1942, 1943, 1944, 1945, 1946, 1947, 1948, 1949, 1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2008, 2010, 2011, 2012, 2020, 2021, 2024, 2026, 2030, 2037, 2038, 2050]


In [22]:
# Replaced value 0 and greater than 2006 (dataset was published in 2004 with margin for updates) by NaN
books.loc[(books.yearOfPublication > 2006) | (books.yearOfPublication == 0),'yearOfPublication'] = np.NAN
yearMean = round(books.yearOfPublication.mean())
books.yearOfPublication.fillna(yearMean, inplace=True)

In [23]:
books.yearOfPublication.isnull().sum()

0

### Publisher

In [24]:
# rows with publisher null
books[pd.isna(books.publisher) == True]

Unnamed: 0,ISBN,bookTitle,bookAuthor,yearOfPublication,publisher
128890,193169656X,Tyrant Moon,Elaine Corvidae,2002.0,
129037,1931696993,Finders Keepers,Linnea Sinclair,2001.0,


In [25]:
# replaced null publisher for "other"
books.loc[(books.ISBN == '193169656X'),'publisher'] = 'other'
books.loc[(books.ISBN == '1931696993'),'publisher'] = 'other'

In [26]:
books[books.publisher == 'other']

Unnamed: 0,ISBN,bookTitle,bookAuthor,yearOfPublication,publisher
128890,193169656X,Tyrant Moon,Elaine Corvidae,2002.0,other
129037,1931696993,Finders Keepers,Linnea Sinclair,2001.0,other


In [27]:
# dataset without null values
books.isnull().sum()

ISBN                 0
bookTitle            0
bookAuthor           0
yearOfPublication    0
publisher            0
dtype: int64

In [28]:
#books.yearOfPublication = books.yearOfPublication.astype(np.int32)

## Filter Books

In [29]:
# recommendation only of books registered in the system
ratings_books = ratings[ratings.ISBN.isin(books.ISBN)]
print(ratings.shape)
print(ratings_books.shape)

(1149780, 3)
(1031136, 3)


In [30]:
# separation of evaluated data (<> 0)
ratings_books_implicit = ratings_books[ratings_books.rating == 0]
ratings_books_explicit = ratings_books[ratings_books.rating != 0]

In [31]:
# definition of minimum number of ratings and selection of books with the minimum number of ratings received
MIN_RATINGS = 5
counts_books = ratings_books_explicit.ISBN.value_counts()
books_more_ratings = books[books['ISBN'].isin(counts_books[counts_books >= MIN_RATINGS].index)]

In [32]:
print(books.shape)
print(books_more_ratings.shape)

(271360, 5)
(13787, 5)


## Model (based recommender)

In [33]:
# removing white space from texts
def clean_data(data):
  if isinstance(data, str):
    return str.lower(data.replace(" ", ""))

In [34]:
# cleaning the author and editor columns
features = ['bookAuthor', 'publisher']
for feature in features:
  books_more_ratings[feature] = books_more_ratings[feature].apply(clean_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [35]:
books_more_ratings.head()

Unnamed: 0,ISBN,bookTitle,bookAuthor,yearOfPublication,publisher
1,2005018,Clara Callan,richardbrucewright,2001.0,harperflamingocanada
3,374157065,Flu: The Story of the Great Influenza Pandemic of 1918 and the Search for the Virus That Caused It,ginabarikolata,1999.0,farrarstrausgiroux
5,399135782,The Kitchen God's Wife,amytan,1991.0,putnampubgroup
18,440234743,The Testament,johngrisham,1999.0,dell
19,452264464,Beloved (Plume Contemporary Fiction),tonimorrison,1994.0,plume


In [36]:
# joining the required columns by a space. This is the final pre-processing step and the output of this function will be fed into the word vector model
books_more_ratings['soup'] = books_more_ratings.bookTitle + ' ' + books_more_ratings.bookAuthor + ' '  + books_more_ratings.publisher

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [37]:
books_more_ratings.head()

Unnamed: 0,ISBN,bookTitle,bookAuthor,yearOfPublication,publisher,soup
1,2005018,Clara Callan,richardbrucewright,2001.0,harperflamingocanada,Clara Callan richardbrucewright harperflamingocanada
3,374157065,Flu: The Story of the Great Influenza Pandemic of 1918 and the Search for the Virus That Caused It,ginabarikolata,1999.0,farrarstrausgiroux,Flu: The Story of the Great Influenza Pandemic of 1918 and the Search for the Virus That Caused It ginabarikolata farrarstrausgiroux
5,399135782,The Kitchen God's Wife,amytan,1991.0,putnampubgroup,The Kitchen God's Wife amytan putnampubgroup
18,440234743,The Testament,johngrisham,1999.0,dell,The Testament johngrisham dell
19,452264464,Beloved (Plume Contemporary Fiction),tonimorrison,1994.0,plume,Beloved (Plume Contemporary Fiction) tonimorrison plume


In [38]:
from sklearn.feature_extraction.text import CountVectorizer

# create the count matrix
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(books_more_ratings.soup)

In [39]:
count_matrix.shape

(13787, 16153)

In [40]:
from sklearn.metrics.pairwise import cosine_similarity

# ompute the Cosine Similarity matrix based on the count_matrix
cos_sim = cosine_similarity(count_matrix, count_matrix)

In [41]:
# resetting the DataFrame Index and Creating Reverse Mapping
metadata = books_more_ratings.reset_index()
#indices = pd.Series(metadata.index, index=metadata['bookTitle'])
indices = pd.Series(metadata.index, index=metadata['ISBN'])

In [46]:
def get_recommendations(book, cosine_sim=cos_sim, topn=10):
  try:
    # get the index of the book 
    idx = indices[book]

    # get the pairwise similarity scores of all books with that book
    sim_scores = list(enumerate(cosine_sim[idx]))

    # sort the books based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # get the scores of the 10 most similar books
    sim_scores = sim_scores[1:topn+1]

    # get the books indices
    book_indices = [i[0] for i in sim_scores]
    scores_final = [i[1] for i in sim_scores]
    average_score = sum(scores_final) / len(scores_final)

    # Return the top 10 most similar books
    #return indices.iloc[book_indices].index, average_score
    return metadata['bookTitle'].iloc[book_indices]

  except:
    #return [0, -1]
    return 0

In [43]:
books.loc[books.ISBN == '0553294385']

Unnamed: 0,ISBN,bookTitle,bookAuthor,yearOfPublication,publisher
3417,553294385,"I, Robot",ISAAC ASIMOV,1991.0,Spectra


In [47]:
get_recommendations('0553294385', topn=15)

12495    Robot Visions                                        
13686    The Complete Robot                                   
6952     Robot Dreams (Remembering Tomorrow)                  
13066    Nightfall (Bantam Spectra Book)                      
7222     Foundation (Foundation Novels (Paperback))           
3334     Prelude to Foundation (Foundation Novels (Paperback))
7401     Second Foundation (Foundation Novels (Paperback))    
6570     The Gods Themselves                                  
8156     Nemesis                                              
91       The Martian Chronicles                               
1322     The Difference Engine                                
2307     The Caves of Steel                                   
5379     Foundation and Earth                                 
6571     Robots and Empire                                    
6584     Prince of the Blood                                  
Name: bookTitle, dtype: object