In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

from collections import Counter
import nltk
import re, string
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import contractions

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Dealing with Duplicates

In [2]:
full_books = pd.read_csv('data/full_df.csv.bz2', compression='bz2')

# One row per (title, authors) pair: after sorting by ratings_count (descending),
# keep='first' retains the row with the highest ratings_count for each pair
books = (
    full_books
    .sort_values('ratings_count', ascending=False)
    .drop_duplicates(subset=['title', 'authors'], keep='first', ignore_index=True)
)

# Drop duplicates based on description
# Descriptions shared by more than one book, most frequent first. The counts Series
# is filtered directly instead of querying a column of `.to_frame()`, because the
# name of that column differs between pandas versions.
desc_counts = books['description'].value_counts()
dd = desc_counts[desc_counts > 1].rename_axis('description').reset_index(name='count')

# Hand-picked row positions in `dd`: for these descriptions every matching book is removed.
# The positions depend on the ordering of `dd`, so they must be re-checked if the data changes.
# NOTE(review): the sequence runs 0-9, then 19, then 11, ... - confirm 19 was not meant to be 10.
dd_indexes = [0,1,2,3,4,5,6,7,8,9,19,11,12,13,15,16,17,20,25,28,32,33,41]
descriptions_to_drop = dd.iloc[dd_indexes, 0]
books_to_drop = list(books.index[books['description'].isin(descriptions_to_drop)])
books = books.drop(books_to_drop).reset_index(drop=True)

# For every other repeated description keep only the row with the highest ratings_count
books = books.sort_values(by='ratings_count', ascending=False).drop_duplicates(subset=['description'], keep='first', ignore_index=True)

# Remove box sets, collections and poetry titles.
# The pattern is a case-sensitive regex alternation; titles that are missing count as "no match".
key_words = 'Boxset|boxset|boxed set|Boxed Set|Book Collection|Poems'
mask = books['title'].str.contains(key_words, na=False)
books = books.loc[~mask].reset_index(drop=True)

# Reducing the size of the Data

The pairwise cosine-similarity matrix grows quadratically with the number of books, so we cannot compute it for all 800,000 books. We will limit this to around 70,000 of the most popular books in our dataset:

In [13]:
# Popularity cut-off: keep only books with more than this many ratings
MIN_RATINGS_COUNT = 800

# reset_index guarantees labels 0..n-1, so label-based lookups and positional
# lookups (`.iloc`, rows of a similarity matrix) always refer to the same book
books_popular = books.query('ratings_count > @MIN_RATINGS_COUNT').reset_index(drop=True)
books_popular.head(3)

Unnamed: 0,book_id,isbn,author_id,authors,title,description,publisher,genres,avg_rating,ratings_count,num_pages,pub_year,language_code,similar_books,url,cover_image
0,2767052,439023483,153394,Suzanne Collins,"The Hunger Games (The Hunger Games, #1)",Winning will make you famous.\nLosing means ce...,Scholastic Press,"['favorites', 'currently-reading', 'to-read', ...",4.34,4899965,374,2008.0,eng,"['1902241', '146499', '954674', '9917938', '10...",https://www.goodreads.com/book/show/2767052-th...,https://images.gr-assets.com/books/1447303603m...
1,3,439554934,1077326,J.K. Rowling,Harry Potter and the Sorcerer's Stone (Harry P...,Harry Potter's life is miserable. His parents ...,Scholastic Inc,"['to-read', 'favorites', 'fantasy', 'young-adu...",4.45,4765497,320,1997.0,eng,"['13830', '127586', '121822', '37586', '616435...",https://www.goodreads.com/book/show/3.Harry_Po...,https://images.gr-assets.com/books/1474154022m...
2,2657,61120081,1825,Harper Lee,To Kill a Mockingbird,The unforgettable novel of a childhood in a sl...,Harper Perennial Modern Classics,"['to-read', 'favorites', 'classics', 'classic'...",4.26,3255518,324,2006.0,eng,"['1934', '2156', '15638', '53835', '77142', '5...",https://www.goodreads.com/book/show/2657.To_Ki...,https://images.gr-assets.com/books/1361975680m...


In [14]:
# (rows, columns) remaining after the ratings_count filter
books_popular.shape

(68346, 16)

In [15]:
# Work on an independent copy: the description column is overwritten during cleaning,
# and assigning into a slice of `books_popular` triggers pandas' chained-assignment
# warning (hidden here because warnings are globally ignored)
desc = books_popular[['title','description']].copy()
desc.head()

Unnamed: 0,title,description
0,"The Hunger Games (The Hunger Games, #1)",Winning will make you famous.\nLosing means ce...
1,Harry Potter and the Sorcerer's Stone (Harry P...,Harry Potter's life is miserable. His parents ...
2,To Kill a Mockingbird,The unforgettable novel of a childhood in a sl...
3,The Great Gatsby,"THE GREAT GATSBY, F. Scott Fitzgerald's third ..."
4,The Fault in Our Stars,"There is an alternate cover edition .\n""I fel..."


# Cleaning Description Column

In [16]:
# English stop words; the set copy gives constant-time membership tests
# (the list form made every token lookup a linear scan)
stop = stopwords.words("english")
stop_words = set(stop)

NON_ALPHA = re.compile('[^A-Za-z]+')


def clean_description(text):
    """Turn one raw description into space-separated, lowercase, alphabetic tokens.

    Steps, in order: lowercase and collapse whitespace, expand contractions,
    split hyphenated words, tokenize, strip every non-letter character from each
    token, then drop tokens of length <= 1 and English stop words.

    Parameters
    ----------
    text : object
        Raw description. It is passed through ``str()``, so a missing value
        becomes the literal text ``'nan'``.

    Returns
    -------
    str
        The cleaned description (possibly empty).
    """
    # Make everything lowercase and collapse runs of whitespace
    text = ' '.join(word.lower() for word in str(text).split())

    # Expand contractions into their full form
    text = contractions.fix(text)

    # Split hyphenated words into separate words
    text = text.replace('-', ' ')

    # Strip every non-alphabetic character (digits and punctuation) from each token
    tokens = (NON_ALPHA.sub('', token) for token in nltk.word_tokenize(text))

    # Drop empty/single-letter leftovers and stop words
    return ' '.join(token for token in tokens if len(token) > 1 and token not in stop_words)


desc['description'] = desc['description'].apply(clean_description)

In [17]:
# Raw description of the row labelled 0, before cleaning
books_popular.loc[0, 'description']

"Winning will make you famous.\nLosing means certain death.\nThe nation of Panem, formed from a post-apocalyptic North America, is a country that consists of a wealthy Capitol region surrounded by 12 poorer districts. Early in its history, a rebellion led by a 13th district against the Capitol resulted in its destruction and the creation of an annual televised event known as the Hunger Games. In punishment, and as a reminder of the power and grace of the Capitol, each district must yield one boy and one girl between the ages of 12 and 18 through a lottery system to participate in the games. The 'tributes' are chosen during the annual Reaping and are forced to fight to the death, leaving only one survivor to claim victory.\nWhen 16-year-old Katniss's young sister, Prim, is selected as District 12's female representative, Katniss volunteers to take her place. She and her male counterpart Peeta, are pitted against bigger, stronger representatives, some of whom have trained for this their 

In [18]:
# Description of the row labelled 0, after cleaning
desc.loc[0, 'description']

'winning make famous losing means certain death nation panem formed post apocalyptic north america country consists wealthy capitol region surrounded poorer districts early history rebellion led th district capitol resulted destruction creation annual televised event known hunger games punishment reminder power grace capitol district must yield one boy one girl ages lottery system participate games tributes chosen annual reaping forced fight death leaving one survivor claim victory year old katniss young sister prim selected district female representative katniss volunteers take place male counterpart peeta pitted bigger stronger representatives trained whole lives sees death sentence katniss close death survival second nature'

In [19]:
# (rows, columns) of the cleaned title/description frame
desc.shape

(68346, 2)

In [None]:
# Persist the cleaned titles and descriptions; the index is not written
desc.to_csv(path_or_buf='data/reduced_books_desc.csv', index=False)

In [None]:
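# Optional: reload the cleaned descriptions saved above instead of re-running the cleaning step.
# Empty fields in the CSV are read back as NaN, hence the dropna.
#desc = pd.read_csv('data/reduced_books_desc.csv')
#desc.dropna(inplace=True)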

# Vectorizing Description

In [20]:
# TF-IDF over the cleaned descriptions.
# min_df=20 ignores terms found in fewer than 20 descriptions;
# max_df=0.50 ignores terms found in more than half of them.
tfidf = TfidfVectorizer(min_df=20, max_df=0.50)

# Fit and transform in one step, then store the weights as 32-bit floats
tfidf_matrix = tfidf.fit_transform(desc['description']).astype(np.float32)
tfidf_matrix.shape

(68346, 18774)

# Baseline Recommendation System: Content Based Filtering using Cosine Similarity

In [21]:
# Pairwise cosine similarity between all description vectors.
# cosine_similarity accepts the sparse TF-IDF matrix directly, so it is not densified
# first (a dense float32 copy of the (68346, 18774) matrix would take roughly 5 GB).
# `tfidf_matrix` is also left untouched, so this cell can be re-run safely.
cos = cosine_similarity(tfidf_matrix)

In [82]:
# Map each book title to its row position; this is how a title is turned into
# a row of the similarity matrix and back into a book.
# Positions are generated explicitly rather than taken from `desc.index`, so the
# mapping stays correct even if the index labels are not 0..n-1.
indices = pd.Series(np.arange(len(desc)), index=desc['title'])

# A title can appear more than once (de-duplication was on title AND authors);
# keep the first occurrence so that a title lookup always returns a single position.
# NOTE(review): rows appear to be in descending ratings_count order upstream,
# which would make "first" the most-rated edition - confirm.
indices = indices[~indices.index.duplicated(keep='first')]

# Fixed seed so the displayed sample is reproducible
indices.sample(5, random_state=42)

title
Under Milk Wood                               19261
2 A.M. at The Cat's Pajamas                   18275
The Heart Goes Last                            2882
The Special Power of Restoring Lost Things    46861
Sophie's Heart                                 9267
dtype: int64

In [83]:
# Try the recommender on one sample book: take its row of the similarity
# matrix and keep the 11 highest-scoring rows
sample_book = indices['The Shining']
scores_for_sample = pd.DataFrame({"score": cos[sample_book]})
similarity_scores = scores_for_sample.sort_values('score', ascending=False).iloc[:11]
similarity_scores

Unnamed: 0,score
61,1.0
36167,0.265128
21237,0.192767
43526,0.184216
7223,0.156825
28956,0.153965
68016,0.151153
43183,0.14688
24214,0.141233
39789,0.141217


In [84]:
# Use the row positions of the top matches to pull their titles and descriptions

similar_movies_indices = similarity_scores.index.tolist()
books_popular.iloc[similar_movies_indices][['title', 'description']]

Unnamed: 0,title,description
61,The Shining,Jack Torrance's new job at the Overlook Hotel ...
36167,"The Shining (The Shining, #1)",This tale of a troubled man hired to care for ...
21237,Hotel Vendome,"When Swiss-born Hugues Martin sees a small, ru..."
43526,Hotel Babylon,'Something strange occurs to guests as soon as...
7223,Carrie / 'Salem's Lot / The Shining,Stephen King is a unique and powerful writer w...
28956,The Haunted Hotel: A Mystery of Modern Venice,Is there no explanation of the mystery of The ...
68016,Room with a Clue (Pennyfoot Hotel #1),Beginning a new series set in a seaside hotel ...
43183,Hotel Honolulu,Newly married and having recently taken over t...
24214,"A Rule Against Murder (Armand Gamache, #4)","At the Manoir Bellechasse, when a gentile fami..."
39789,Bonechiller,WELCOME TO NOWHERE.\nDanny's dad takes a job a...


# Checking the Recommendations:

In [86]:
# Full description of the recommendation ranked third in `similarity_scores`
# (taken from the ranking itself rather than a row number copied from an earlier output)
books_popular['description'].iloc[similarity_scores.index[2]]

"When Swiss-born Hugues Martin sees a small, run-down hotel in New York for the rough diamond it is, he transforms it into a beautiful boutique hotel of impeccable elegance, run with the precision and attention to detail he learned through his hotelier training in Europe.\nRenowned for its unparalleled service, the Hotel Vendome soon becomes the ideal New York refuge for the rich and famous, as well as a perfect home for Hugues' beautiful wife and their young daughter. But when his wife tires of his obsession for the hotel, she walks out on him for a notorious rock star, leaving Hugues a single parent to four-year-old Heloise.\nHeloise and her family live happily amid a colourful, exciting and sometimes mysterious milieu of celebrities, socialites, politicians, world travellers and hotel employees--and their inevitable intrigues.\nAs unexpected challenges arise, the hotel is the centre of their world. And when Heloise grows us, she longs to follow in her father's footsteps and one day 

In [87]:
# Full description of the recommendation ranked fourth in `similarity_scores`
# (taken from the ranking itself rather than a row number copied from an earlier output)
books_popular['description'].iloc[similarity_scores.index[3]]

"'Something strange occurs to guests as soon as they check in. Even if in real life they are perfectly well-mannered, decent people with proper balanced relationships, as soon as they spin through the revolving hotel doors the normal rules of behaviour no longer seem to apply.'\nAll of the following is true.Only the names have been changed to protect the guilty. All the anecdotes, the stories, the characters, the situations, the highs, the lows, the scams, the drugs, the misery, the love, the death and the insanity are exactly as was told by Anonymous - someone who has spent his whole career working in hotels at the heart of London's luxury hotel industry. However, for legal reasons, the stories now take place in a fictitious hotel known as Hotel Babylon. More than a decade is compressed into a day. Everything else is as it should be. The rich spend money, the hotel makes money and the chambermaids still fight the bellboys over a two-pound coin.It's just another twenty-four hours in an

In [88]:
# Full description of the recommendation ranked tenth in `similarity_scores`
# (taken from the ranking itself rather than a row number copied from an earlier output)
books_popular['description'].iloc[similarity_scores.index[9]]

"WELCOME TO NOWHERE.\nDanny's dad takes a job as caretaker at a marina on the shore of a vast, frozen lake in Harvest Cove, a tiny town tucked away in Canada's Big Empty. If you're looking for somewhere to hide, this is it.\nIt's the worst winter in years. One night, running in the dark, Danny is attacked by a creature so strange and terrifying he tries to convince himself he was hallucinating. Then he learns about Native American legends of a monster that's haunted the lake for a thousand years. And that every generation, in the coldest winters, kids have disappeared into the night. People think they ran away.\nDanny knows better. Because now the beast is after him."

We can see that the recommender is working, but it is giving very naive recommendations. The sample book I chose was 'The Shining', and it is mostly recommending books that have the word 'hotel' in their descriptions. This is to be expected, since we are only working with descriptions and not including other features like genre or user interactions. But this is a solid baseline model!