In [None]:
import json
import gzip
import pandas as pd

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Review Data

In [1]:
with gzip.open("data/goodreads_reviews_fantasy_paranormal.json.gz") as f:
    line = f.readline()
    
data = json.loads(line)

In [2]:
def parse_fields(line):
    data = json.loads(line)
    return {
        "book_id": data["book_id"],
        "user_id" : data['user_id'],
        'rating' : data['rating']}

parse_fields(line)

{'book_id': '18245960',
 'user_id': '8842281e1d1347389f2ab93d60773d4d',
 'rating': 5}

In [3]:
books_reviews = []

with gzip.open("data/goodreads_reviews_fantasy_paranormal.json.gz") as f:
    while True:
        line = f.readline()
        if not line:
            break
        fields = parse_fields(line)
        books_reviews.append(fields)

In [5]:
reviews = pd.DataFrame.from_dict(books_reviews)

print(reviews.shape)
reviews.head()

(3424641, 3)


Unnamed: 0,book_id,user_id,rating
0,18245960,8842281e1d1347389f2ab93d60773d4d,5
1,5577844,8842281e1d1347389f2ab93d60773d4d,5
2,17315048,8842281e1d1347389f2ab93d60773d4d,5
3,13453029,8842281e1d1347389f2ab93d60773d4d,4
4,13239822,8842281e1d1347389f2ab93d60773d4d,3


### We have over 3.4 million users, we will have to reduce this down at some point since this will be computationally expensive without cloud computing

# Book Data

In [141]:
with gzip.open("data/goodreads_books_fantasy_paranormal.json.gz") as f:
    line = f.readline()
    
data = json.loads(line)

In [142]:
def get_author(data):
    for i in range(len(data['authors'])):
        author = data['authors'][i]['author_id']
        return author



def parse_fields(line):
    data = json.loads(line)
    return {
        "book_id": data["book_id"],
        'title': data['title_without_series'],
        'language':data['language_code'],
        'description': data['description'],
        'reviews_count': data['text_reviews_count'],
        'avg_rating' :data['average_rating'],
        'ratings_count' : data['ratings_count'],
        'author_id' : get_author(data)
    }

parse_fields(line)

{'book_id': '7327624',
 'title': 'The Unschooled Wizard (Sun Wolf and Starhawk, #1-2)',
 'language': 'eng',
 'description': 'Omnibus book club edition containing the Ladies of Madrigyn and the Witches of Wenshar.',
 'reviews_count': '7',
 'avg_rating': '4.03',
 'ratings_count': '140',
 'author_id': '10333'}

In [143]:
books_list = []

with gzip.open("data/goodreads_books_fantasy_paranormal.json.gz") as f:
    while True:
        line = f.readline()
        if not line:
            break
        fields = parse_fields(line)
        books_list.append(fields)

In [160]:
books = pd.DataFrame.from_dict(books_list)
print(books.shape)
books.head()

(258585, 8)


Unnamed: 0,book_id,title,language,description,reviews_count,avg_rating,ratings_count,author_id
0,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",eng,Omnibus book club edition containing the Ladie...,7,4.03,140,10333
1,6066812,All's Fairy in Love and War (Avalon: Web of Ma...,,"To Kara's astonishment, she discovers that a p...",6,4.22,98,19158
2,33394837,The House of Memory (Pluto's Snitch #2),eng,,60,4.33,269,242185
3,12182387,"The Passion (Dark Visions, #3)",,This is the final tale in the bestselling auth...,1,4.04,4,50873
4,29074693,"Prowled Darkness (Dante's Circle, #7)",en-US,,21,4.23,149,5360266


In [161]:
# We will only keep books that are labeled as english (or unknown)
books['language'].unique()

array(['eng', '', 'en-US', 'en-GB', 'pol', 'en-CA', 'tur', 'rus', 'spa',
       'ita', 'swe', 'per', 'por', 'gre', 'jpn', 'bul', 'fre', 'ind',
       'ger', 'nl', 'rum', 'dan', 'cze', 'fin', 'ben', 'kat', 'hun',
       'lit', 'tha', 'en', 'ara', 'srp', 'heb', 'scr', 'est', 'nor',
       'cat', 'slo', 'vie', 'pes', 'lav', 'ukr', 'zho', 'kor', 'nno',
       'msa', '--', 'nob', 'fil', 'slv', 'glg', 'isl', 'hin', 'urd',
       'mon', 'enm', 'mul', 'ast', 'gle', 'bos', 'che', 'nld', 'mal',
       'frs', 'grc', 'epo', 'pt-BR', 'es-MX', 'bug', 'jav', 'sqi', 'tgl',
       'abk', 'hye', 'div', 'mkd', 'tam', 'ota', 'ang', 'guj', 'ale',
       'aze', 'bel', 'lat', 'gla', 'vls', 'ira', 'aus', 'aar'],
      dtype=object)

In [162]:
language_list = ['eng', 'en', 'en-US','en-GB','']

books = books[books['language'].isin(language_list)]
books.shape

(216385, 8)

In [163]:
# Check for null values
books.isna().sum()

book_id          0
title            0
language         0
description      0
reviews_count    0
avg_rating       0
ratings_count    0
author_id        2
dtype: int64

In [164]:
# Drop nulls
books = books.dropna()

In [165]:
books.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 216383 entries, 0 to 258584
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   book_id        216383 non-null  object
 1   title          216383 non-null  object
 2   language       216383 non-null  object
 3   description    216383 non-null  object
 4   reviews_count  216383 non-null  object
 5   avg_rating     216383 non-null  object
 6   ratings_count  216383 non-null  object
 7   author_id      216383 non-null  object
dtypes: object(8)
memory usage: 14.9+ MB


In [166]:
# Change data types
books = books.astype({'reviews_count':int,
                      'avg_rating':float,
                      'ratings_count':int,
                     'author_id':int})

In [167]:
# Check for duplicates
books.duplicated(subset=['title', 'author_id']).value_counts()

False    144519
True      71864
dtype: int64

In [168]:
# Remove duplicates
books_no_dup = books.sort_values(by='ratings_count', ascending=False)\
                    .drop_duplicates(subset=['title', 'author_id'], keep='first')\
                    .reset_index(drop=True)

books_no_dup

Unnamed: 0,book_id,title,language,description,reviews_count,avg_rating,ratings_count,author_id
0,3,Harry Potter and the Sorcerer's Stone (Harry P...,eng,Harry Potter's life is miserable. His parents ...,59856,4.45,4765497,1077326
1,41865,"Twilight (Twilight, #1)",en-US,About three things I was absolutely positive.\...,90766,3.57,3941381,941441
2,5907,The Hobbit,en-US,In a hole in the ground there lived a hobbit. ...,28665,4.25,2099680,656983
3,15881,Harry Potter and the Chamber of Secrets (Harry...,eng,The Dursleys were so mean and hideous that sum...,27159,4.38,1821802,1077326
4,34,The Fellowship of the Ring (The Lord of the Ri...,eng,"One Ring to rule them all, One Ring to find th...",11044,4.34,1813229,656983
...,...,...,...,...,...,...,...,...
144514,1964664,Stonehenge,,Three brave warriors--a prince from an ancient...,1,3.28,0,16147
144515,20406171,Tales of the Bodhisattva: the Khurran,,This is an ACE for ASIN:B00HJ6NEEC \nUpon the...,1,4.50,0,7727698
144516,18471107,Sweet Escape (Sweet Series) (Volume 2),,Sometimes the hardest war to fight is the one ...,1,4.40,0,6876493
144517,16122112,Pack and Coven,,"Harry Smith is a lone wolf, and he likes it th...",1,3.44,0,1321322


In [169]:
# Explore duplicated titles
books_no_dup['title'].value_counts().to_frame().query('title > 1')

Unnamed: 0,title
Haunted,16
Spellbound,14
Cursed,14
Fallen,13
Awakening,12
...,...
"Circles in the Stream (Avalon: Web of Magic, #1)",2
The Obsidian Mirror,2
Beloved Beast,2
Shadows of the New Sun: Stories in Honor of Gene Wolfe,2


In [170]:
books_no_dup.query('title == "Spellbound"')

Unnamed: 0,book_id,title,language,description,reviews_count,avg_rating,ratings_count,author_id
7348,17887350,Spellbound,en-GB,Her desire becomes his pleasure...\nMax Westin...,284,3.12,2618,19823
38763,2436547,Spellbound,,How far would you go to get rid of an annoying...,25,3.68,216,432899
39275,23340907,Spellbound,eng,"In the realm of Hesian there is Stavros, a det...",34,3.84,212,8955046
53154,11737399,Spellbound,,Logan Daniels has always led a sheltered life ...,36,3.82,118,4953744
53536,6470726,Spellbound,,From award-winning author Patricia Simpson com...,17,3.53,117,55706
55066,13540999,Spellbound,en-US,"Wylde Debraux is 16: Sassy, fiercely independe...",6,3.65,110,5353730
62207,562792,Spellbound,,Includes The Changeling Prince and The Conjure...,6,3.82,85,13014
71561,11376771,Spellbound,,Francesca DeVega is a successful healer in the...,11,3.82,62,1916427
77288,34144611,Spellbound,,The Spellbound Boxed Set is a compilation of 2...,16,4.13,52,6429917
77358,22375476,Spellbound,eng,The day after his arrival in the small town of...,4,2.65,52,4726063


# Author Data

In [171]:
authors = pd.read_csv('data/authors.csv')
authors.drop('Unnamed: 0',inplace=True, axis=1)
print(authors.shape)
authors.head()

(829529, 2)


Unnamed: 0,author_id,authors
0,604031,Ronald J. Fields
1,626222,Anita Diamant
2,10333,Barbara Hambly
3,9212,Jennifer Weiner
4,149918,Nigel Pennick


# Merging the Data

In [172]:
# We don't necessarily need to get the author names, but we will anyway
books_with_author = books_no_dup.merge(authors, on='author_id', how='left')
books_with_author.head()

Unnamed: 0,book_id,title,language,description,reviews_count,avg_rating,ratings_count,author_id,authors
0,3,Harry Potter and the Sorcerer's Stone (Harry P...,eng,Harry Potter's life is miserable. His parents ...,59856,4.45,4765497,1077326,J.K. Rowling
1,41865,"Twilight (Twilight, #1)",en-US,About three things I was absolutely positive.\...,90766,3.57,3941381,941441,Stephenie Meyer
2,5907,The Hobbit,en-US,In a hole in the ground there lived a hobbit. ...,28665,4.25,2099680,656983,J.R.R. Tolkien
3,15881,Harry Potter and the Chamber of Secrets (Harry...,eng,The Dursleys were so mean and hideous that sum...,27159,4.38,1821802,1077326,J.K. Rowling
4,34,The Fellowship of the Ring (The Lord of the Ri...,eng,"One Ring to rule them all, One Ring to find th...",11044,4.34,1813229,656983,J.R.R. Tolkien


In [173]:
# Subset our book data to only include popular books with a higher rating and review count
popular_books = books_with_author.query('ratings_count > 300 and reviews_count > 50')
popular_books.shape

(19268, 9)

In [174]:
# Get list of user for users who have reviewed more than 300 books
user_ids = list(reviews['user_id'].value_counts().to_frame().query('user_id > 300').index)

In [175]:
# Subset review data 
popular_users = reviews[reviews['user_id'].isin(user_ids)].reset_index(drop=True)
popular_users.shape

(419543, 3)

In [259]:
# Reduce columns of book data to only include what we need
pop_books = popular_books[['book_id', 'title','avg_rating','ratings_count']]
pop_books.head()

Unnamed: 0,book_id,title,avg_rating,ratings_count
0,3,Harry Potter and the Sorcerer's Stone (Harry P...,4.45,4765497
1,41865,"Twilight (Twilight, #1)",3.57,3941381
2,5907,The Hobbit,4.25,2099680
3,15881,Harry Potter and the Chamber of Secrets (Harry...,4.38,1821802
4,34,The Fellowship of the Ring (The Lord of the Ri...,4.34,1813229


In [261]:
# Remove paraentheses from book titles
pop_books['title'] = pop_books['title'].str.replace(r' \(.*\)','',  regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pop_books['title'] = pop_books['title'].str.replace(r' \(.*\)','',  regex=True)


In [262]:
pop_books.head()

Unnamed: 0,book_id,title,avg_rating,ratings_count
0,3,Harry Potter and the Sorcerer's Stone,4.45,4765497
1,41865,Twilight,3.57,3941381
2,5907,The Hobbit,4.25,2099680
3,15881,Harry Potter and the Chamber of Secrets,4.38,1821802
4,34,The Fellowship of the Ring,4.34,1813229


In [264]:
# Merge books and users
books_with_users = pop_books.merge(popular_users, on='book_id', how='inner')
print(books_with_users.shape)
books_with_users.head()

(233093, 6)


Unnamed: 0,book_id,title,avg_rating,ratings_count,user_id,rating
0,3,Harry Potter and the Sorcerer's Stone,4.45,4765497,a2f38c2c607f53e5c46b9b4271c4c5a9,5
1,3,Harry Potter and the Sorcerer's Stone,4.45,4765497,08d805375530cc208801531ca7fdefbc,4
2,3,Harry Potter and the Sorcerer's Stone,4.45,4765497,880dc69f3de6d55a0623d61dedd49868,5
3,3,Harry Potter and the Sorcerer's Stone,4.45,4765497,256a2d57f9629ba5bebdad5978958f5d,5
4,3,Harry Potter and the Sorcerer's Stone,4.45,4765497,8e7e5b546a63cb9add8431ee6914cf59,5


In [265]:
# Use pivot_table to create a user-item matrix
matrix = books_with_users.pivot_table(index='title', columns='user_id', values='rating').fillna(0)
print(matrix.shape)
matrix.head()

(16261, 831)


user_id,002eff40d3de8ff36174a48d26d93da7,009a47e49c0dc6e84d1c5e0eb4cdf7f6,00d40ea21d1c012c796a7f913e290457,00f430253f528f841dc91aa3f9498457,025c89955638692ff9ce15528e626ef5,025dd54f7581ba79329a6c39c1f4fcb9,02651037306c57971dea2f24ec9be70f,02a765c7f623e37bb80f0fb230ef773d,02f6c91ef77b301085298cb15c38f1ce,03dced7d30e4674f185c46afa3e9e8ce,...,fdc4c95ff61ed6f9b80992502e8fdc8a,fe42a5a8c82474a362ab80a624b1b619,fe7d2f1ecc244521ef709c1e7e8cfadd,fe99ac93a6a5b7807562530024e3784f,fefdf0c3ab5e41f109d312bf7f676357,ff3c1ccdad00142e4e3b6e856108bbf3,ffa5094acb2bca8fc8655538e60c400e,ffb72f9c157638ddc3a7c65d4f7a7209,ffe85ffe7b49f673b859add8cf50ea0f,fffc34d137f5c5c5e1ca1d6f325a4dcf
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""All You Zombies...""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Salem's Lot,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Til Death,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Til Kingdom Come,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Til the World Ends,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Recommendation System with Collaboritve Filtering 

In [266]:
similarity_scores = cosine_similarity(matrix)

In [267]:
similarity_scores.shape

(16261, 16261)

In [268]:
def recommend(book_name):
    # fetch index of book
    index = np.where(matrix.index == book_name)[0][0]
    
    # Get similar books
    similar_books = sorted(list(enumerate(similarity_scores[index])), key=lambda x:x[1], reverse=True)[1:11]
    
    for i in similar_books:
        print(matrix.index[i[0]])
    

In [269]:
recommend("The Hobbit")

The Two Towers
The Return of the King
The Fellowship of the Ring
The Lightning Thief
A Wizard of Earthsea
The Graveyard Book
The Truth is a Cave in the Black Mountains
The Ship Who Sang
A Wrinkle in Time
A Game of Thrones


In [295]:
recommend('The Gunslinger')

The Waste Lands
The Drawing of the Three
The Talisman
The Stand
Time Windows
'Salem's Lot
The Dracula Tape
Black House
The Passage
Wizard and Glass


In [317]:
recommend("Harry Potter and the Sorcerer's Stone")

Harry Potter and the Chamber of Secrets
Harry Potter and the Goblet of Fire
Harry Potter and the Order of the Phoenix
Harry Potter and the Deathly Hallows
Harry Potter and the Half-Blood Prince
New Moon
The Lightning Thief
Twilight
The Host
The Martian
