In [1]:
import pandas as pd
import numpy as np
import gzip
import json
import warnings
warnings.filterwarnings('ignore')

from collections import Counter
import nltk
import re, string
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import contractions

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# 1. Getting the Data

In [2]:
# Function to extract author name (if needed)
def get_author(data):
    for i in range(len(data['authors'])):
        author = data['authors'][i]['author_id']
        return author

# Function to extract genre keywords from ppopular shelves
def get_genre(data):
    genres = []
    for i in range(len(data['popular_shelves'])):
        genre = list(data['popular_shelves'][i].values())[1]
        genres.append(genre)
    return genres

# Function to extract fields that we want in our dataframe
def parse_fields(line):
    data = json.loads(line)
    return {
        "book_id": data["book_id"],
        'author_id' : get_author(data),
        'title': data['title_without_series'],
        'description': data['description'],
        'reviews_count': data['text_reviews_count'],
        'avg_rating' :data['average_rating'],
        'ratings_count' : data['ratings_count'],
        "popular_shelves" : data['popular_shelves'],
        'keywords' : get_genre(data),
        'language':data['language_code'],
    }


In [3]:
# Loop through json and use parse_fields() function for each book
books_list = []

with gzip.open("data/goodreads_books_fantasy_paranormal.json.gz") as f:
    while True:
        line = f.readline()
        if not line:
            break
        fields = parse_fields(line)
        books_list.append(fields)

In [4]:
# Save books_list as a dataframe
books = pd.DataFrame.from_dict(books_list)
print(books.shape)
books.head()

(258585, 10)


Unnamed: 0,book_id,author_id,title,description,reviews_count,avg_rating,ratings_count,popular_shelves,keywords,language
0,7327624,10333,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",Omnibus book club edition containing the Ladie...,7,4.03,140,"[{'count': '58', 'name': 'to-read'}, {'count':...","[to-read, fantasy, fiction, owned, hardcover, ...",eng
1,6066812,19158,All's Fairy in Love and War (Avalon: Web of Ma...,"To Kara's astonishment, she discovers that a p...",6,4.22,98,"[{'count': '515', 'name': 'to-read'}, {'count'...","[to-read, fantasy, owned, books-i-own, current...",
2,33394837,242185,The House of Memory (Pluto's Snitch #2),,60,4.33,269,"[{'count': '54', 'name': 'currently-reading'},...","[currently-reading, netgalley, kindle, read-20...",eng
3,12182387,50873,"The Passion (Dark Visions, #3)",This is the final tale in the bestselling auth...,1,4.04,4,"[{'count': '1057', 'name': 'to-read'}, {'count...","[to-read, young-adult, ya, paranormal, fantasy...",
4,29074693,5360266,"Prowled Darkness (Dante's Circle, #7)",,21,4.23,149,"[{'count': '598', 'name': 'to-read'}, {'count'...","[to-read, currently-reading, paranormal, shift...",en-US


# 2. Reducing and Cleaning the data
* Only include english books
* Remove null descriptions
* Remove duplicates
* Change data types

In [5]:
# Only include books that are in english (or unknown, as these may be english as well)
language_list = ['eng', 'en', 'en-US','en-GB','']

books = books[books['language'].isin(language_list)]
books.shape

(216385, 10)

In [6]:
# Replace empty descriptions with NaN
books['description'] = books['description'].replace('', np.nan)
books.isna().sum()

book_id                0
author_id              2
title                  0
description        19060
reviews_count          0
avg_rating             0
ratings_count          0
popular_shelves        0
keywords               0
language               0
dtype: int64

In [7]:
# Drop rows with missing values
books = books.dropna()

In [8]:
# Check for duplicates
books.duplicated(subset=['title', 'author_id']).value_counts()

False    130139
True      67186
dtype: int64

In [9]:
# Sort by highest rating count
# Drop duplicates by title and author

books_no_dup = books.sort_values(by='ratings_count', ascending=False)\
                    .drop_duplicates(subset=['title', 'author_id'], keep='first')\
                    .reset_index(drop=True)

books_no_dup.shape

(130139, 10)

In [10]:
# Check for duplicated descriptions
books_no_dup.duplicated(subset=['description']).value_counts()

False    120157
True       9982
dtype: int64

In [11]:
# Analyze duplcated descriptions
books_no_dup['description'].value_counts().to_frame().query('description > 1').head()

Unnamed: 0,description
This book was converted from its physical edition to the digital format by a community of volunteers. You may find it for free on the web. Purchase of the Kindle edition includes wireless delivery.,28
"By falling down a rabbit hole and stepping through a mirror, Alice experiences unusual adventures with a variety of nonsensical characters.",21
"Source of legend and lyric, reference and conjecture, Alice's Adventures in Wonderland is for most children pure pleasure in prose. While adults try to decipher Lewis Carroll's putative use of complex mathematical codes in the text, or debate his alleged use of opium, young readers simply dive with Alice through the rabbit hole, pursuing ""The dream-child moving through a land / Of wonders wild and new."" There they encounter the White Rabbit, the Queen of Hearts, the Mock Turtle, and the Mad Hatter, among a multitude of other characters--extinct, fantastical, and commonplace creatures. Alice journeys through this Wonderland, trying to fathom the meaning of her strange experiences. But they turn out to be ""curiouser and curiouser,"" seemingly without moral or sense.\nFor more than 130 years, children have reveled in the delightfully non-moralistic, non-educational virtues of this classic. In fact, at every turn, Alice's new companions scoff at her traditional education. The Mock Turtle, for example, remarks that he took the ""regular course"" in school: Reeling, Writhing, and branches of Arithmetic-Ambition, Distraction, Uglification, and Derision. Carroll believed John Tenniel's illustrations were as important as his text. Naturally, Carroll's instincts were good; the masterful drawings are inextricably tied to the well-loved story.",17
H. P. Lovecraft was one of the greatest horror writers of all time. His seminal work appeared in the pages of legendary Weird Tales and has influenced countless writer of the macabre. This is one of those stories.,16
"They open a door and enter a world\nNARNIA...the land beyond the wardrobe, the secret country known only to Peter, Susan, Edmund, and Lucy...the place where the adventure begins.\nLucy is the first to find the secret of the wardrobe in the professor's mysterious old house. At first, no one believes her when she tells of her adventures in the land of Narnia. But soon Edmund and then Peter and Susan discover the Magic and meet Aslan, the Great Lion, for themselves. In the blink of an eye, their lives are changed forever.",16


In [12]:
# We will remove these duplicates in the same way as the title/author duplicates
books_no_dup = books_no_dup.sort_values(by='ratings_count', ascending=False)\
                    .drop_duplicates(subset=['description'], keep='first')\
                    .reset_index(drop=True)

books_no_dup.shape

(120157, 10)

In [13]:
# Change data types
books_no_dup = books_no_dup.astype({'reviews_count':int,
                                    'avg_rating':float,
                                    'ratings_count':int,
                                    'author_id':int,
                                    'book_id':int})

In [14]:
# Subset books to include only popular books (rating count > 200)

pop_books = books_no_dup.query('ratings_count > 200')
pop_books.shape

(26686, 10)

# 3. Dealing with Popular Shelves and Genre Keywords

In [15]:
# Save popular shelves to new df
shelves = pop_books['popular_shelves'].to_frame()
shelves

Unnamed: 0,popular_shelves
0,"[{'count': '15963', 'name': 'to-read'}, {'coun..."
1,"[{'count': '1085', 'name': 'to-read'}, {'count..."
2,"[{'count': '6558', 'name': 'to-read'}, {'count..."
3,"[{'count': '2081', 'name': 'to-read'}, {'count..."
4,"[{'count': '976', 'name': 'to-read'}, {'count'..."
...,...
116924,"[{'count': '49', 'name': 'to-read'}, {'count':..."
116925,"[{'count': '2736', 'name': 'to-read'}, {'count..."
116926,"[{'count': '643', 'name': 'to-read'}, {'count'..."
116927,"[{'count': '170', 'name': 'to-read'}, {'count'..."


In [16]:
# Function to extract each shelf into a dataframe
def dict_list_to_df(df, col):
    """Return a Pandas dataframe based on a column that contains a list of JSON objects or dictionaries.
    Args:
        df (Pandas dataframe): The dataframe to be flattened.
        col (str): The name of the column that contains the JSON objects or dictionaries.
    Returns:
        Pandas dataframe: A new dataframe with the JSON objects or dictionaries expanded into columns.
    """

    rows = []
    for index, row in df[col].iteritems():
        for item in row:
            rows.append(item)
    df = pd.DataFrame(rows)
    return df

In [17]:
# shelves_df will have repeated shelves, so we will do a value_counts() to get only the unique ones
shelves_df = dict_list_to_df(shelves, 'popular_shelves')
shelves_df.columns = ['count', 'shelf']
shelves_df.head()

Unnamed: 0,count,shelf
0,15963,to-read
1,1090,fantasy
2,147,currently-reading
3,91,favorites
4,67,epic-fantasy


In [119]:
# df of all the unique shelves
keywords_counts = shelves_df['shelf'].value_counts().to_frame().reset_index()
keywords_counts.columns = ['shelf','count']
keywords_counts.head()

Unnamed: 0,shelf,count
0,to-read,26592
1,currently-reading,25742
2,fantasy,25735
3,favorites,22128
4,owned,21658


In [120]:
# Initial filtering
# Dropping shelves that don't provide any useful keywords
# Not the most efficient way to filter out the garbage, but it's a good start
drop = ['fantasy','to-read', 'favorites','owned','ebooks', 'currently-reading','books-i-own','kindle','ebook',
       'default','library','to-buy','owned-books','favourites','wish-list','my-books','e-book','my-library',
       'ya','audiobook','audiobooks','read-in-2014','read-in-2015','m-m','i-own','pnr','read-in-2016','read-in-2013',
       'middle-grade','novels','read-in-2012','mm','re-read','e-books','audible','dnf','maybe','have','favorite','5-stars',
       'read-in-2017','own-it','books','m-m-romance','read-in-2011','did-not-finish','owned-to-read','diana-wynne-jones',
       'to-read-fantasy','read-2012','mm-romance','read-2015','read-2016','favorite-series','part-of-a-series','audio-book',
       'read-2014','read-in-2010','need-to-buy','borrowed','book-club','read-2017','shelfari-favorites','on-my-shelf',
       'read-in-english','4-stars','kindle-books','first-in-series','audio-books','favorite-books','netgalley','favorite-books',
       'reviewed','fae','tbr','english','reread','sff','nook','read-2013','sf','menage','free','kindle-lending-library',
       'x-read_2005-2015','8th-grade','amazo-unlimited','a-shade-of-vampire','own-it-but-have-not-started-to-read',
       'library-doesn-t-have-it','audio-only','warriors-books','male-protag','september-2017','books-for-review',
       'books-on-my-shelf','might-read-1-day','web-novels','books-for-2016','a-chercher','pnr-scifi-fantasy','c-feehan',
       '2-z01','blauer-punkt','owned-irl','i-own-these-books','fantasy-w-shelf','englisch','books-i-read-in-2016','released-in-2015',
       'pre-goodreads-unhaul','bibliothèque','juv-fiction','m-m-bdsm','own-manga','riordan','first-in-series-spa',
       'anthologies-collections','canadian-author','russian','uncategorized','series-part-of','catalog','save-for-october',
       'loose-id','aventura','available-for-taking','graham-heather','pararom','series-first','1-fiction','easy-reading','tie-in-series',
       'shelf-d-authors','first-reads','m-read','harem','books-i-have-ii','title_same-title','on-my-bookshelf','may-read',
       'own-hardcover','epub','own-on-nook','to-read-library','nook-lendable','stand-alone-books','9p','what-mm-read','might-want-to-read',
       'to-read-short-stories','scan','temp-shelf-sorting-queue','meaghan','standalones','my-favs','maybe-books','started',
       'current-series','kindle-book','_-soft-copy','02-words-sb','purchased-but-not-read','genre','0-my-library','not-available-on-kindle',
       'signed-books','2013-release','your-library','001-library','serie-volledig','2-series','01-kindle-books-that-i-own','hoarded-books',
       'books-read-in-2014','kindle-unlimited-queue','can-t-wait-to-read','library-borrowed','print-books','read-first','read-for-school',
       'good-books','zz-lost-interest-in','tbr-pile-owned','not-owned','blom-check','dl-pendientes','to-read-kindle-books',
       'series-in-progress','2016-publication','our-books','to-read-on-kindle']

keywords_counts = keywords_counts[~np.isin(keywords_counts['shelf'], drop)]

In [155]:
# We will subset the shelves by count
# Brings us to 4071 shelves
k = keywords_counts.query('count > 50')
k.shape

(4071, 2)

In [156]:
k = k.reset_index(drop=True)

In [157]:
#Filter out shelves making specific words
genre_regex = re.compile(r'^.*(book|to-|read|my-|favou?rite|own|audio|author|library|fiction|epub|authors|series|kindle|english|default|calibre|next|want|[0-9]|best).*$')
filtered_list = [genre for genre in list(k['shelf']) if not re.search(genre_regex, genre)]
k['shelf'] = k['shelf'].apply(lambda x: " ".join([x for x in x.split() if x in filtered_list]))
k['shelf'] = k['shelf'].replace('', np.nan).dropna()

In [160]:
# Make everything lowercase
k['shelf'] = k['shelf'].apply(lambda x:' '.join(x.lower() for x in str(x).split()))

# Remove dashes from words
k['shelf'] = k['shelf'].apply(lambda x: x.replace('-',' '))

# Remove any non-alphanumeric characters
k['shelf'] = k['shelf'].apply(lambda x: '  '.join([re.sub('[^A-Za-z]+','', x) for x in nltk.word_tokenize(x)]))

# Remove any single letters remaining
k['shelf'] = k['shelf'].apply(lambda x: ' '.join([x for x in str(x).split() if len(x)>3]))

# Remove stopwords
stop = stopwords.words("english")
k['shelf'] = k['shelf'].apply(lambda x: " ".join([x for x in x.split() if x not in stop]))

In [171]:
# Drop nan shelves
k['shelf'] = k['shelf'].replace('',np.nan)
k = k.dropna()
k

Unnamed: 0,shelf,count
2,paranormal,18138
3,romance,16511
4,magic,13945
5,supernatural,13245
6,urban fantasy,12023
...,...,...
4065,abilities,51
4066,shifters dragons,51
4067,supernatural,51
4068,shifter werewolf dragon,51


In [172]:
# More filtering
drop2 = ['', 'series', 'warriors series','read','smexy','limbo unread','anthony piers','faves','books','check',
         'hardcopy','buddy reads','nook books','paper books','owned read','book','home','could finish','authors',
         'amazon kindle book','ebooks kindle','books read','read later','owned','favourite books','author','reader',
         'reminder','goodreads','long series','great reads','feehan','release','found','type book','want read kindle',
         'meine bcher','ebook owned','junk','ebook','cant find','owned books read','stand alone','english books','shelf authors',
         'misc','fiction']
k = k[~np.isin(k['shelf'], drop2)]

In [174]:
# Filtering our keywords created new duplicates, so we will drop those
k = k.drop_duplicates(subset=['shelf'], keep='first').reset_index(drop=True)

In [175]:
# Removing keywords from double/triple keywords (i.e paranormal fantasy, we would want to remove fantasy)
k['shelf'] = k['shelf'].apply(lambda x: x.replace('fantasy',''))
k['shelf'] = k['shelf'].apply(lambda x: x.replace('fiction',''))
k['shelf'] = k['shelf'].apply(lambda x: x.replace('read',''))
k['shelf'] = k['shelf'].apply(lambda x: x.replace('books',''))
k['shelf'] = k['shelf'].apply(lambda x: x.replace('genre',''))
k['shelf'] = k['shelf'].apply(lambda x: x.replace('series',''))
k['shelf'] = k['shelf'].apply(lambda x: x.replace('owned',''))
k['shelf'] = k['shelf'].apply(lambda x: " ".join(x.split()))
k['shelf'].value_counts()

urban              7
high               5
romance            5
paranormal         4
scifi              4
                  ..
otherworld         1
blog tour          1
talking animals    1
didnt finish       1
cthulhu            1
Name: shelf, Length: 1710, dtype: int64

In [176]:
# More duplicates to get rid of
k = k.drop_duplicates(subset=['shelf'], keep='first').reset_index(drop=True)
k

Unnamed: 0,shelf,count
0,paranormal,18138
1,romance,16511
2,magic,13945
3,supernatural,13245
4,urban,12023
...,...,...
1705,tokyo,51
1706,abilities,51
1707,shifters dragons,51
1708,shifter werewolf dragon,51


In [177]:
# Now we will create a column for the length of the string, to further filter the data
k['length'] = k['shelf'].str.len()

# Filter out keywords that are 3 or less characters
k = k.query('length > 3')

In [178]:
# We are down to 2245 keywords
k.shape

(1707, 3)

In [179]:
# Let's look at keywords of length 4
k.query('length == 4')

Unnamed: 0,shelf,count,length
23,high,4194,4
24,teen,4049,4
32,epic,3178,4
40,need,2814,4
50,mine,2550,4
...,...,...,...
1645,lost,53,4
1668,mols,52,4
1671,dani,52,4
1680,bwwm,52,4


In [180]:
# 4 letter keywords to drop
drop3 = ['high','want','need','mine','must','helf',
                'kobo','glbt','hold','star','gave','nope',
                'pile','favs','next','done','arcs','vamp',
                'free','para','copy','cute','male','sftr',
                'sort','dual','deck','good','find','spec',
                'play','ipad','disk','made','best','sold',
                'mmpb','eeee','cozy','nook','preg','boys',
                'e un','yaoi','sure','soon','long','temp','used',
                'dont','plus','sets','hand','syfy','skip',
                'epub','mobi','eown','safe','botb','okay',
                'york','time','rick','poly''sale','mini',
                'rvrs','like','mang','kick','asap','chic',
                'sifi','list','wish','book','xlam','paul','edit','film','sell',
                'spin','year','ones','real','btvs','wise','filt','comp','rate','easy',
                'test','wait','back','gone','fave','king','look','fant','date','gift',
                'fsrc','alex','illo','ebay','ccpl','july','carl','keep','june','owns',
                'hugo','less','lion','lire','pern','blue','hero','pets','mate','home',
                'food','scfi','lost','mols','dani','bwwm','shot']

k = k[~np.isin(k['shelf'], drop3)]
k.shape

(1609, 3)

In [181]:
# More manual filtering
drop4 = ['audio','paperback','kindle unlimited','kindle','finished','freebies','freebie','amazon','stars','interested',
        'shelfari wishlist','owned','purchased','part series','favorite authors','short','library book','completed series',
        'kindle freebie','digital','kindle freebies','kindle lendable','series','maybe','recommended',' bought','finish',
        'female author','series finish','kindle owned','next series','owned kindle','ebook wishlist','unowned want',
        'favourite','woody want','barnes noble','nook book','checked','recent e','mott nypl','download','save later',
        'absolute favorites','pending','someday','bella forrest','daddy dark baby warrior list','pending kindle free ebook',
        'personal collection','process review','serie','softcover','goods authors','general','pratchett terry','kenyon sherrilyn',
        'kindle trove','fantastyka','punya','book form','fantastik kurgu','language english','kindle lending',
        'full pile','january','downloaded sample','reviews','incomplete','bcherregal','temporary storage','downloaded',
        'highly recommended', 'wishing list','priscilla','terry bolryder','collections','published','free book','syfi horr',
        'bookcrossing','maybes','interesting','previously','lackey mercedes','gooood','kindle un','kindle lendables']

k = k[~np.isin(k['shelf'], drop2)]

In [182]:
k.shape

(1562, 3)

In [191]:
# We will not get all the unique words and save it as a new data frame
# Shelves with multiple words (i.e. horror supernatural fantasy) will be put into separate rows
# This will help reduce the size of the keywords
unique_words = k['shelf'].str.split(' ').explode().value_counts().rename_axis('genre').reset_index(name='count')

In [212]:
#We have 1254 unique words
len(unique_words)

1254

In [203]:
# We will manually go through the rest of the keywords, and delete the ones we don't want
k1 = list(unique_words['genre'][:500])
k2 = list(unique_words['genre'][500:1000])
k3 = list(unique_words['genre'][1000:])

In [204]:
keep1 = ['romance','paranormal','manga','shifters','shifter','love','adult','mystery','supernatural',
 'comics','suspense','magic','horror','thriller','short','female','fairy','hero','witches',
 'young','urban','werewolf','erotica','vampire','tales','time','heroine','alternate','angels','world','dragon',
 'school','male','vampires','dragons','werewolves','alpha','scifi','women','strong','teen','children',
 'action','contemporary','mates','angel','demons','creatures','shapeshifters','mangas',
 'game','japanese','powers','apocalyptic','wizards','light','space','retellings','cozy',
 'games','weres','historical','wolf','dystopian','later','dark','witch','super','bear',
 'kickass','forgotten','ghost','fairytales','comic','physical','magical','epic','childhood','lust',
 'shapeshifter','anthology','erotic','crime','mythology','novellas','novella','retold',
 'travel','badass','omnibus','apocalypse','buffy','faerie','feline','shonen','psychic',
 'retelling','guilty','shoujo','abilities','mccaffrey','yaoi','greek','star',
 'stars','worlds','sexy','realms','dragonlance','folklore','animal','drive',
 'dungeons','life','virgin','siren','heroines','shojo','alien','armstrong','high',
 'sword','aliens','history','american','gods','ghosts','fairytale','shorts','family',
 'mysteries','childrens','romantic','shape','futuristic','funny','anime','wolves','demon',
 'sorcery','tale','vamps','shounen','mythical','anthologies','dirty','fantasia','classics','alternative',
 'cthulhu','mythos','universe','superpowers','german','heroes','royalty','canada','america',
 'holy','warrior','nostalgia','mermaids','fantacy','interracial','hunters','military','elementary','arthurian',
 'myths','swords','ultimate','roleplaying','lesbian','omega','haunted','lgbt','laugh',
 'marriage','drama','bdsm','paranomal','dreamspinner','comedy','emotional','relationships','hunter','dead','mage',
 'sagas','faery','girl','cave','arthur','king','reality','thrillers','murder','british','animals','sweet',
 'opera','modern','beautiful','alice','teens','cheating','pregnancy','zombie','adventure',
 'legends','gothic','protagonist','classic','immortals','pleasure','furry','gender','detective','japan','realism',
 'elemental','romances','cats','future','possessive','wizard','political','saucy',
 'threesome','brothers','warhammer','epics','grief','coverblue','flights','mars','guardians',
 'bearshifters','disaster','mental','illness','girls','howlin','superheros','warewolves','nobility','dystopias','metaphysical','buffyverse','crossbreeding',
 'russia','captive','timetravel','multicultural','laser','worldbuilding','slayer','journey','couples','hanger','scorching',
 'ladies','bender','younger','distopia','teenage','disney','duology','djinn','tolkien','cowboy',
 'tracking','romanceparanormal','predictable','otherworldly','afterlife','slaves','spicy','swashbuckling','native','werebears','inspirational',
 'adolescent','couple','shorty','rejected','irish','torture','california','druids','gelesen','protective','cloud',
 'spirit','korean','chinese','bikers','wicca','dress','rulers','bound','nephilim','parody',
 'biography','earth','conspiracy','christianity','mangashoujo','portadasqueamo','unique','limbo','intense','bareback','kingdoms']

In [206]:
keep2 = ['samhain','orleans','france','forgottenrealms','virtual','taboo','television','tear','psychology',
 'dogs','polyamory','surreal','changeling','tokyopop','species','portal','dresden','parallel',
 'dragonlace','gargoyle','rowling','adaptation','menage','abused','heather','graham','omegaverse',
 'immortality','sports','banker','gaiman','foreign','steaming','hell','spirits','dwarves','dreams','shapeshifting',
 'thief','protector','hate','enjoyable','germany','orange','spanking','supers','lawyer','professional','curses',
 'indies','necromancy','alphas','scottish','fire','spies','wings','culture','forthcoming','ages',
 'pasts','breed','apocolypse','painful','egypt','wereanimal','penguin','china','criticism','garage','nature',
 'traditional','protagonists','suspence','issues','wereanimals','true','sassy','computer','leisure','companions','alchemy',
 'chills','thrills','detectives','cinderella','inmortales','lord','rings','succubus','loud','spanish',
 'wonderland','historic','norse','sirens','sorcerers','language','mature','pararomance','australia','bullying','bodyguard','antihero','bisexual',
 'essays','diskworld','mate','mount','romane','videogames','britain','humans','graphics',
 'faith','dimension','vikings','goblins','diverse','rich','folks','goddesses','holiday','bears',
 'spooky','fantastique','grimdark','politics','princesses','boarding','challenge','diversity','psychological',
 'litrpg','fantastic','vaginal','electronic','forbidden','hotness','translation','satire','pirates',
 'halloween','fantastical','hoarding','soulmates','christmas','superhero','kidnapped','werewolfs','superheroes','soul',
 'natural','scary','hilarious','philosophy','asia','otherworld','power','rpgs','western',
 'werebear','reincarnation','brain','immortal','angsty','noir','london','movies','twins','tree','romantica','candy','slash',
 'abduction','unnaturally','princess','mages','holding','sounds','basement','music','vampiros','mythological','canadian','poetry',
 'pleasures','males','steamy','death','england','elves','speculative','monsters','religion','angst','survival','lgbtq',
 'trilogy','trilogies','faeries','humor','abandoned','kids','science','dystopia','juvenile','humour','chick','fairies','friendship','youth',
 'steampunk','abuse','humorous','triangle','lost','zombies','smut','medieval','queer','assassins','heroic','steam','witchy',
 'victorian','tween','psychics','occult','reverse','harem','punk','illustrated','adults','damaged','partners','christian','tortured','enforcement',
 'amazing','kidnapping','pulp','college','warriors','violence','considering','gaming','assassin']

In [208]:
keep3 = ['adventures','allegory','amnesia','anthropomorphic','arts','asian','baby','beast','beauty','betrayal','billionaire','black','boulder','celtic','cheese','chicklit','child',
 'comfort','cops','cowboys','cozies','creepy','cultural','curious','cyberpunk','dangerzoned','destined','deutsch',
 'difference','disability','discworld','distopian','elementals','espionage','europe','evil','feminism','feminist',
 'fighter','fighting','fluffy','french','friends','gang','gargoyles','gayrom','grit','gritty','harlequin','highlander','highlanders','holidays',
 'hoopla','horses','humble','hurt','imaginary','indonesian','insanity','ireland','kink','kinky','knights','legend','living',
 'lovecraft','lovecraftian','magicians','magick','martial','memories','mercedes','mermaid','movie','mystical','myth','necromancer',
 'novela','orphans','pagan','period','police','poly','pregnant','present','preternatural','pretty','private','protectors','quest','quirky',
 'rape','reapers','regency','religious','revenge','roman','romans','room','royals','safety','scotland',
 'secrets','sequential','sexual','siblings','sisters','slave','slavery','southern','spiritual','spirituality','states','tattoos',
 'telepathy','terror','thieves','tokyo','tragedy','triangles','undead','unicorns','united','violent','warlocks','werecats','witchcraft']

In [210]:
final_keywords = keep1 + keep2 + keep3
len(final_keywords)

688

In [358]:
books = pop_books[['book_id','title','description','keywords']]

In [359]:
# Make everything lowercase
books['keywords'] = books['keywords'].apply(lambda x:' '.join(x.lower() for x in str(x).split()))

# Remove dashes from words
books['keywords'] = books['keywords'].apply(lambda x: x.replace('-',' '))

# Remove any non-alphanumeric characters
books['keywords'] = books['keywords'].apply(lambda x: '  '.join([re.sub('[^A-Za-z]+','', x) for x in nltk.word_tokenize(x)]))

# Remove any single letters remaining
books['keywords'] = books['keywords'].apply(lambda x: ' '.join([x for x in str(x).split() if len(x)>3]))

# Remove stopwords
stop = stopwords.words("english")
books['keywords'] = books['keywords'].apply(lambda x: " ".join([x for x in x.split() if x not in stop]))

# FINALLY remove words not in our final keywords list
books['keywords'] = books['keywords'].apply(lambda x: " ".join([x for x in x.split() if x in final_keywords]))

# Duplicate words from keywords
from collections import OrderedDict
books['keywords'] = books['keywords'].str.split().apply(lambda x: OrderedDict.fromkeys(x).keys()).str.join(' ')

# 4. Cleaning Description Column

In [360]:
# Make everything lowercase
books['description'] = books['description'].apply(lambda x:' '.join(x.lower() for x in str(x).split()))

# Remove contractions
books['description'] = books['description'].apply(lambda x: contractions.fix(x))

# Remove dashes from words
books['description'] = books['description'].replace('-',' ')

# Remove any non-alphanumeric characters
books['description']=books['description'].apply(lambda x: ' '.join([re.sub('[^A-Za-z]+','', x) for x in nltk.word_tokenize(x)]))

# Remove any single letters remaining
books['description']=books['description'].apply(lambda x: ' '.join([x for x in str(x).split() if len(x)>1]))

# Remove stopwords
stop = stopwords.words("english")
books['description']=books['description'].apply(lambda x: " ".join([x for x in x.split() if x not in stop]))

In [363]:
books

Unnamed: 0,book_id,title,description,keywords
0,8801543,"among thieves (tales of the kin, #1)",drothe member kin years rubbing elbows thieves...,epic dark thieves adventure high urban magic a...
1,17457124,"red hill (red hill, #1)",world ends love survive scarlet raising two da...,zombies horror romance adult paranormal dystop...
2,24999067,bohemian gospel (bohemian gospel #1),set historical reign golden iron king bohemian...,historical mystery medieval magic adult romanc...
3,22717558,"kodiak's claim (kodiak point, #1)",might growl afraid bite hands full taking care...,paranormal shifters romance shifter shapeshift...
4,450203,the evil seed,something inside remembers never easy face fac...,horror vampires gothic contemporary urban myst...
...,...,...,...,...
116924,8029972,"our world (the dresden files roleplaying game,...",volume dresden files rpg gives extensive detai...,rpgs gaming roleplaying dresden games game vam...
116925,11215073,"all hallows (the morganville vampires, #6.6)",eve michael shane claire go eek party one year...,vampires young adult short vampire paranormal ...
116926,16080833,"fighter (outsider, #3)",first outsider became insider going fighter no...,romance werewolves young adult paranormal supe...
116927,13438641,"the sorcerer's vengeance (the sorcerer's path,...",book sorcerer path azerick thought finally fou...,magic epic high science challenge considering ...


# 5. Cleaning Title Columns

In [362]:
# Make everything lowercase
books['title'] = books['title'].apply(lambda x:' '.join(x.lower() for x in str(x).split()))

In [398]:
# Drop boxsets and collections
mask = books['title'].str.contains('boxset|box set|boxed set|collection|edition|volumes set|companion')
books = books[~mask].reset_index(drop=True)

In [399]:
# Remove paraentheses from book titles
books['title'] = books['title'].str.replace(r' \(.*\)','',  regex=True)
books.head()

Unnamed: 0,book_id,title,description,keywords,features
0,8801543,among thieves,drothe member kin years rubbing elbows thieves...,epic dark thieves adventure high urban magic a...,drothe member kin years rubbing elbows thieves...
1,17457124,red hill,world ends love survive scarlet raising two da...,zombies horror romance adult paranormal dystop...,world ends love survive scarlet raising two da...
2,24999067,bohemian gospel,set historical reign golden iron king bohemian...,historical mystery medieval magic adult romanc...,set historical reign golden iron king bohemian...
3,22717558,kodiak's claim,might growl afraid bite hands full taking care...,paranormal shifters romance shifter shapeshift...,might growl afraid bite hands full taking care...
4,450203,the evil seed,something inside remembers never easy face fac...,horror vampires gothic contemporary urban myst...,something inside remembers never easy face fac...


# 6. Cosine Similarity

In [400]:
books['features'] = books['description'] + books['keywords']

In [401]:
# There are still duplicate titles
books = books.drop_duplicates(subset=['title'], keep='first').reset_index(drop=True)

In [408]:
tfidf = TfidfVectorizer(min_df=25, max_df=0.90)
tfidf_matrix = tfidf.fit_transform(books['features'])
tfidf_matrix = tfidf_matrix.astype(np.float32)
tfidf_matrix.shape

(23682, 7679)

In [409]:
tfidf_matrix = tfidf_matrix.toarray()
cos = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [410]:
# Saving the indices of book titles, we will need this to retrieve book titles
indices = pd.Series(books.index, index=books['title'])
indices.sample(5)

title
death in the beginning    15241
everlasting desire         4382
nick of time               8555
lessons                   13501
a spell for chameleon     15294
dtype: int64

In [443]:
# Let's try it out on a sample book
sample_book = indices['the hobbit']
similarity_scores = pd.DataFrame(cos[sample_book], columns=["score"]).sort_values(by='score', ascending=False)
recs = similarity_scores.query('score < 0.40').head(10)

In [444]:
# Now we can retrieve the title and description of the books similar to our sample book

similar_movies_indices = list(recs.index)
books[['title','features']].iloc[similar_movies_indices]

Unnamed: 0,title,features
13042,the silmarillion: the epic history of the elve...,alternate cover art isbnclassics tolkien high ...
16960,"the hobbit, or, there and back again",hole ground lived hobbit nasty dirty wet hole ...
21455,the magical worlds of lord of the rings: the a...,nightmare key tolkien mythology gandalf really...
20584,the history of the lord of the rings,jrr tolkien lord rings firmly acknowledged one...
1060,splintered light: logos and language in tolkie...,tolkien perhaps best known hobbit lord rings s...
21869,the lost road and other writings,editor christopher tolkien satisfies hunger fa...
907,the lord of the rings,continuing story hobbit threevolume boxed set ...
3457,"the hobbit, or there and back again",bilbo baggins reasonably typical hobbit fond s...
21146,the shaping of middle-earth,fourth volume contains early myths legends led...
7788,"reader and educator guide to ""the hobbit"" and ...",hobbitand lord ringsare widely read beloved bo...
