In [306]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import sparse

from sklearn.metrics.pairwise import pairwise_distances, cosine_similarity, cosine_distances
from sklearn.neighbors import NearestNeighbors

In [319]:
def file_read_authors():
    df = pd.read_json('./Dataset/goodreads_book_authors.json', lines=True)
    df.rename(columns={'average_rating': 'author_avg_rating', 'author_id': 'authors', 'text_reviews_count': 'aut_txt_rev_count','name': 'author_name', 'ratings_count': 'author_ratings_count'}, inplace=True)
    return df


def file_read(genre):
    df = pd.read_json('./Dataset/goodreads_books_'+ genre +'.json', lines=True)
    df.drop(columns=['isbn', 'series', 'kindle_asin', 'popular_shelves','similar_books','asin', 'is_ebook', 'edition_information', 'publication_year', 'url', 'image_url', 'format', 'link', 'publisher', 'publication_day', 'isbn13', 'publication_month', 'work_id'], inplace=True)
    return df

def fix_author(df):
    '''Extracts the author id from the disctionary of authors'''
    for index in df.index:
        author_id = df['authors'][index][0]['author_id']
        df['authors'][index] = int(author_id)

# add the authors names to df:
def add_author_name(df, aut):
    df = df.merge(aut[['author_name','authors']], on = 'authors', how = 'left')

In [326]:
# program starts here

# init authors df
df_authors = file_read_authors()

# init individual genre dfs
df_poetry = file_read('poetry')
df_children = file_read('children')

dfs = [df_poetry]

# process each df
for d in dfs:
    fix_author(d)
    add_author_name(d, df_authors)


    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['authors'][index] = int(author_id)


In [328]:
df_poetry.head(3)

Unnamed: 0,text_reviews_count,country_code,language_code,average_rating,description,authors,num_pages,book_id,ratings_count,title,title_without_series
0,1,US,eng,3.83,Number 30 in a series of literary pamphlets pu...,15585,80.0,16037549,3,Vision of Sir Launfal and Other Poems,Vision of Sir Launfal and Other Poems
1,2,US,,3.83,Fairy Tales gathers the unconventional verse d...,16073,128.0,22466716,37,Fairy Tales: Dramolettes,Fairy Tales: Dramolettes
2,7,US,,4.38,Three poems describe the nighttime adventures ...,18540,,926662,45,Growltiger's Last Stand and Other Poems,Growltiger's Last Stand and Other Poems


In [329]:
df_poetry = df_poetry.merge(df_authors[['author_name','authors']], on = 'authors', how = 'left')
df_poetry.head(3)

Unnamed: 0,text_reviews_count,country_code,language_code,average_rating,description,authors,num_pages,book_id,ratings_count,title,title_without_series,author_name
0,1,US,eng,3.83,Number 30 in a series of literary pamphlets pu...,15585,80.0,16037549,3,Vision of Sir Launfal and Other Poems,Vision of Sir Launfal and Other Poems,James Russell Lowell
1,2,US,,3.83,Fairy Tales gathers the unconventional verse d...,16073,128.0,22466716,37,Fairy Tales: Dramolettes,Fairy Tales: Dramolettes,Robert Walser
2,7,US,,4.38,Three poems describe the nighttime adventures ...,18540,,926662,45,Growltiger's Last Stand and Other Poems,Growltiger's Last Stand and Other Poems,T.S. Eliot


In [322]:
# See if the authors in our poetry df are in the authors df:
df_authors['authors'].isin(df_poetry['authors']).value_counts(), df_poetry['authors'].nunique()

(False    817364
 True      12165
 Name: authors, dtype: int64,
 12165)

---

#### REVIEWS and RATINGS

In [42]:
#Review dataframe: 
poetry_rev = pd.read_json('./Dataset/goodreads_reviews_poetry.json', lines=True)
poetry_rev.drop(columns=['date_added', 'date_updated', 'read_at', 'started_at', 'n_votes', 'n_comments'], inplace=True)

poetry_rev.head()

Unnamed: 0,user_id,book_id,review_id,rating,review_text
0,3ca7375dba942a760e53b726c472a7dd,402128,28423ff309bc896c071a8d9df4a10e8a,5,I have three younger siblings and we grew up w...
1,0ef32090550901ead25cb0ea21c4d36b,92270,2db1180992e2b0b1631a3ac5644bde84,5,This is my favorite collection of poetry.
2,0ef32090550901ead25cb0ea21c4d36b,908708,bca57fa40e92c9261b00b03dbebd96fe,4,"He's so disturbing. So very, very disturbing."
3,d37b46b2190ed7c518259f29b47a9b36,253264,cb1ebc02d8b2aff15735d513877463ce,5,I just reread this play for a class I am takin...
4,af157d0205b8a901dee6d4a2aed7e6ad,70885,8dca128b8e869048a7442c18659dbece,5,"Cuanto mas leo, mas me gusta. Su poesia es env..."


In [43]:
#Create pivot table

data = poetry_rev.pivot_table(index = 'book_id', values = 'rating', columns = 'user_id')
data.head()

user_id,000157a6f8331e9c9a21252e1fee91d1,000192962b87d560f00b06fdcbd71681,0004ae25e3cf5f5a44b6f1ccfdd3d343,0006260f85929db85eddee3a0bd0e504,0006a5b8cda1ba6d7b911dc575f6547b,0008e72711d978c23e3aa3bc7ba3871c,00098165da6612b8b26d453fd19bb754,000a1016fda6008d1edbba720ca00851,000adba19f8f49c25017d68c2a55d90f,000c08303ab122db5baf96607bd55054,...,fff7f660fc5277b7c544dd57dbbf95f3,fffba5600c5d01693b75964e7fbe193f,fffbd84c1c89d775d3568a1362db7ed6,fffbdf4175e907f5a0338d7a4a7a8e4e,fffc34d137f5c5c5e1ca1d6f325a4dcf,fffc475c53c2c59e160a4274aec002cf,fffcf6da0f39d7ab624e2a8da054d2c3,fffe3fca0160bd78ae5828b44fbeb72d,fffe68bb5313dd3733d25e3097beedf3,ffff601c0ffa34bd5ffbbf2caee30644
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
234,,,,,,,,,,,...,,,,,,,,,,
236,,,,,,,,,,,...,,,,,,,,,,
241,,,,,,,,,,,...,,,,,,,,,,
244,,,,,,,,,,,...,,,,,,,,,,
254,,,,,,,,,,,...,,,,,,,,,,


In [93]:
sparse_data = sparse.csr_matrix(data.fillna(0).values)

In [94]:
import sys

sys.getsizeof(data), sys.getsizeof(sparse_data)

(13808778520, 48)

## Cosine similarity

In [95]:
cosine_distances(sparse_data)

array([[0., 1., 1., ..., 1., 1., 1.],
       [1., 0., 1., ..., 1., 1., 1.],
       [1., 1., 0., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 0., 1., 1.],
       [1., 1., 1., ..., 1., 0., 1.],
       [1., 1., 1., ..., 1., 1., 0.]])

In [96]:
pairwise_distances(sparse_data, metric='cosine')

array([[0., 1., 1., ..., 1., 1., 1.],
       [1., 0., 1., ..., 1., 1., 1.],
       [1., 1., 0., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 0., 1., 1.],
       [1., 1., 1., ..., 1., 0., 1.],
       [1., 1., 1., ..., 1., 1., 0.]])

In [97]:
similarity_matrix = cosine_similarity(sparse_data)

In [98]:
sim_sparse = cosine_similarity(sparse_data, dense_output = False)

In [99]:
pd.DataFrame(
    similarity_matrix,
    index = data.index,
    columns = data.index
)

book_id,234,236,241,244,254,284,285,286,289,290,...,36393749,36403877,36407928,36410118,36417323,36418610,36441514,36443173,36447192,36485479
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
234,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
236,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
241,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
244,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
254,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36418610,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
36441514,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
36443173,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
36447192,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [100]:
rec_df = pd.DataFrame.sparse.from_spmatrix(
    sim_sparse,
    index = data.index,
    columns = data.index
)

In [101]:
rec_df.reset_index()

book_id,book_id.1,234,236,241,244,254,284,285,286,289,...,36393749,36403877,36407928,36410118,36417323,36418610,36441514,36443173,36447192,36485479
0,234,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,236,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,241,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,244,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,254,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36407,36418610,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
36408,36441514,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
36409,36443173,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
36410,36447192,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [102]:
rec_df[234].sort_values(ascending = False).head(10)

book_id
234         1.000000
601717      0.492366
333100      0.492366
3096023     0.492366
1184908     0.492366
3223649     0.492366
13573337    0.492366
4372479     0.492366
9477587     0.492366
409657      0.492366
Name: 234, dtype: Sparse[float64, 0]

In [185]:
def get_cosine_similarities(book_title, df): 
    if poetry_1['title'].str.contains(book_title).any():
        found_book = poetry_1[poetry_1['title'].str.contains(book_title)]
        found_book_id = found_book['book_id'].iloc[0]
        found_book_title = found_book['title'].iloc[0]

        print(f"My book is {found_book_title}")

        matched_books = []
        recs = rec_df[found_book_id].sort_values(ascending = False).head(20)

        for i, v in recs.items():
            if i != found_book_id:
                match_book = poetry_1[poetry_1['book_id'] == i]    
                match_book_title = match_book['title'].iloc[0]

                matched_books.append({
                    'book_id': i,
                    'title': match_book_title,
                    'value': v
                })
        
        matches_df = pd.DataFrame(matched_books)
        return matches_df


    else:
        return False

f = get_cosine_similarities('Dark Sons', rec_df) 
f.head(20)



My book is Dark Sons


Unnamed: 0,book_id,title,value
0,234,The Complete Verse and Other Nonsense,0
1,14578651,Stepping Over Seasons,0
2,14553489,When Thunder Comes: Poems for Civil Rights Lea...,0
3,14553491,A Strange Place to Call Home: The World's Most...,0
4,14553840,I Could Pee on This: And Other Poems by Cats,0
5,14562809,The Conference of Birds,0
6,14563663,Gods of Babel,0
7,14571182,Adventures in Form,0
8,14580479,Hummingbird,0
9,14546605,Floating Life,0


## Nearest neighbour

In [188]:
from fuzzywuzzy import process

def get_knn(book_title, df):
    if poetry_1['title'].str.contains(book_title).any():
        model = NearestNeighbors(algorithm='brute', n_neighbors = 40)
        model.fit(df)

        found_book = poetry_1[poetry_1['title'].str.contains(book_title)]
        found_book_id = found_book['book_id'].iloc[0]
        found_book_title = found_book['title'].iloc[0]
        idx=process.extractOne(found_book_title, poetry_1['title'])[2]
        
        print(f"My book is {found_book_title}")

        matched_books = []
        
        distances, indices = model.kneighbors(df[idx], n_neighbors=40)

        for i in indices[0]:
            if i != idx:
                match_book_id = poetry_1['book_id'][i]
                match_book_title = poetry_1['title'][i]

                # print(match_book_title)

                matched_books.append({
                    'book_id': match_book_id,
                    'title': match_book_title,
                })
        
        matches_df = pd.DataFrame(matched_books)
        return matches_df
    else:
        return False    


k = get_knn('Dark Sons', sparse_data) 
k.head()

My book is Dark Sons


Unnamed: 0,book_id,title
0,3137572,O Holy Cow!: The Selected Verse of Phil Rizzuto
1,17720226,Pop Corpse!
2,17675212,King Me
3,982469,The Land
4,29214079,Skeena


In [189]:
poetry_1[poetry_1['title'] == 'Dark Sons']

Unnamed: 0,text_reviews_count,country_code,language_code,popular_shelves,average_rating,similar_books,description,authors,num_pages,book_id,ratings_count,title,title_without_series
7,1,US,,"[{'count': '504', 'name': 'to-read'}, {'count'...",3.83,"[2078239, 178478, 709979, 824499, 4417990, 387...",,"[{'author_id': '25492', 'role': ''}]",,9495428,4,Dark Sons,Dark Sons
12242,3,US,,"[{'count': '503', 'name': 'to-read'}, {'count'...",3.83,"[2078239, 178478, 709979, 824499, 4417990, 387...",,"[{'author_id': '25492', 'role': ''}]",,8355975,11,Dark Sons,Dark Sons
15882,3,US,,"[{'count': '503', 'name': 'to-read'}, {'count'...",3.83,"[2078239, 178478, 709979, 824499, 4417990, 387...",A guy whose father ripped his heart out too.\n...,"[{'author_id': '25492', 'role': ''}]",208.0,30351254,4,Dark Sons,Dark Sons
16912,82,US,,"[{'count': '503', 'name': 'to-read'}, {'count'...",3.83,"[2078239, 178478, 709979, 824499, 4417990, 387...",Sam can't believe it when his father leaves th...,"[{'author_id': '25492', 'role': ''}]",224.0,848295,378,Dark Sons,Dark Sons
23597,9,US,,"[{'count': '503', 'name': 'to-read'}, {'count'...",3.83,"[2078239, 178478, 709979, 824499, 4417990, 387...",Sam can't believe it when his father leaves th...,"[{'author_id': '25492', 'role': ''}]",224.0,857001,24,Dark Sons,Dark Sons


In [186]:
# Commpare the similarities:

similar_books = poetry_1[poetry_1['title'] == 'Dark Sons']['similar_books'].iloc[0]
similar_books

['2078239',
 '178478',
 '709979',
 '824499',
 '4417990',
 '3873511',
 '1851603',
 '4215733',
 '702855',
 '2058371',
 '619833',
 '11248976',
 '342068',
 '645144',
 '3198720',
 '862675',
 '6238740',
 '2219479']

In [187]:
f[f['book_id'].isin(similar_books)]

Unnamed: 0,book_id,title,value


In [191]:
k[k['book_id'].isin(similar_books)]

Unnamed: 0,book_id,title
