In [109]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from scipy import sparse
import requests
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import Normalizer
import re

### Part 0: Load Data

In [71]:
# Load df_books and genres
wd = os.getcwd()
df_books = pd.read_csv(wd + "/goodreads_books.csv")
df_inferred_genres = pd.read_csv(wd + "/inferred_genres.csv")

In [72]:
# Load sparse_reviews from file
df_reviews = sparse.load_npz(wd + "/user_reviews.npz")

# Load user (rows) and book (cols) indices
user_index = pd.read_csv(wd + "/user_index_for_sparse_matrix.csv").rename(columns={"0":"user_id"})
book_index = pd.read_csv(wd + "/book_index_for_sparse_matrix.csv").rename(columns={"0":"book_id"})

In [73]:
# Load my goodreads history
my_books = pd.read_csv(wd + "/goodreads_library_export.csv")

# Filter to books I've rated
my_books = my_books[["Book Id", "My Rating"]].rename(columns={"Book Id":"book_id", "My Rating":"rating"}).query("rating > 0")

# Reformat to fit sparse matrix
my_books = pd.merge(book_index, my_books, how="left", on="book_id").fillna(0.)
my_books = np.array(my_books["rating"]).reshape(1,-1)

# Add to sparse matrix
df_reviews = sparse.vstack([df_reviews, my_books])

# Replace 0 with 0.1 to prevent cosine similarities from zeroing
sparse_fill = 0.001
df_reviews.data[df_reviews.data == 0] = sparse_fill

### Part 1: Find Similar Users

In [74]:
### NOT WORKING AS EXPECTED -- NEED TO FIX

# Normalize reviews within users
norm = Normalizer()
df_reviews = norm.fit_transform(df_reviews) 

In [99]:
# Instantiate KNN
n_neighbors = 75
target_user = df_reviews.shape[0] - 1

nn_model = NearestNeighbors(
    metric="cosine",
    algorithm="auto",
    n_neighbors=n_neighbors,
    n_jobs=-1
)

# Fit to sparse matrix
nn_model.fit(df_reviews)

# Feed in user and get neighbors and distances
reader = df_reviews[target_user,:].toarray()
dists, neighbors = nn_model.kneighbors(reader, return_distance=True)

similar_users = pd.DataFrame(
    [pd.Series(neighbors.reshape(-1)), pd.Series(dists.reshape(-1))]).T.rename(columns={0:"user", 1:"distance"}
)

similar_users.head()

Unnamed: 0,user,distance
0,245119.0,0.0
1,68871.0,0.804018
2,169157.0,0.809443
3,242451.0,0.82769
4,147443.0,0.83146


In [137]:
# Get all books read by similar users
book_ind = []
book_rat = []
uid = []
target_user_books = []
target_user_book_rat = []
for nt in similar_users.itertuples():
    user = df_reviews[int(nt.user),:].toarray()
    book_inds = np.where(user[0] > 0)[0]
    ratings = user[0][np.where(user[0] > 0)[0]]
    for i in range(len(book_inds)):        
        book_ind.append(book_inds[i])
        book_rat.append(ratings[i])
        uid.append(nt.user)    
        if nt.distance < 0.000000001:
            target_user_books.append(book_inds[i])
            target_user_book_rat.append(ratings[i])

neighbor_user_ratings = pd.DataFrame([uid, book_ind, book_rat]).T.rename(columns={0:"uid",1:"book_index",2:"user_rating"})

# Join overall rating for each book
neighbor_user_ratings = pd.merge(book_index.reset_index(), neighbor_user_ratings, how="inner", left_on="index", right_on="book_index")
neighbor_user_ratings = pd.merge(neighbor_user_ratings, df_books, how="inner", on="book_id")

# Filter out books target reader has already read
neighbor_user_ratings = neighbor_user_ratings[~neighbor_user_ratings["book_index"].isin(target_user_books)]
neighbor_user_ratings.drop(["index", "book_index"], axis=1, inplace=True)

# Filter out later volumes in series using regex pattern
regex = r"#([2-9]|[1-9]\d+)"
neighbor_user_ratings = neighbor_user_ratings[~neighbor_user_ratings["title"].str.contains(regex)]

# Weight user book rating by cosine similarity

# Average weighted user book rating and overall avg rating

  neighbor_user_ratings = neighbor_user_ratings[~neighbor_user_ratings["title"].str.contains(regex)]


In [138]:
# View neighbors rating table
neighbor_user_ratings.head()

Unnamed: 0,book_id,uid,user_rating,title,avg_rating,ratings_count,year,language_code,image_url,url,...,non_fiction,children,YA,comic,none,5_star,4_star,3_star,2_star,1_star
0,100915,83359.0,0.26786,"The Lion, the Witch, and the Wardrobe (Chronic...",4.19,1575387.0,2005,eng,https://images.gr-assets.com/books/1353029077m...,https://www.goodreads.com/book/show/100915.The...,...,0,1,1,0,0,758045.0,526304.0,268039.0,56894.0,20019.0
1,10127019,149314.0,0.114992,The Lean Startup: How Today's Entrepreneurs Us...,4.04,95622.0,2011,en-US,https://images.gr-assets.com/books/1333576876m...,https://www.goodreads.com/book/show/10127019-t...,...,1,0,0,0,0,40364.0,34790.0,17280.0,4638.0,3206.0
2,10193060,122168.0,0.248452,Miserere: An Autumn Tale,3.79,747.0,2011,eng,https://images.gr-assets.com/books/1301084223m...,https://www.goodreads.com/book/show/10193060-m...,...,0,0,0,0,0,219.0,348.0,201.0,55.0,28.0
3,10245518,133863.0,0.158986,The Checklist Manifesto How to Get Things Right,4.03,268.0,2011,eng,https://images.gr-assets.com/books/1356488549m...,https://www.goodreads.com/book/show/10245518-t...,...,1,0,0,0,0,,,,,
4,10321016,204297.0,0.36823,Pitch Anything: An Innovative Method for Prese...,4.09,4346.0,2011,,https://s.gr-assets.com/assets/nophoto/book/11...,https://www.goodreads.com/book/show/10321016-p...,...,1,0,0,0,0,2059.0,1640.0,932.0,223.0,81.0


In [139]:
# View target user's ratings
target_user_ratings = pd.DataFrame([target_user_books, target_user_book_rat]).T.rename(columns={0:"book_index",1:"user_rating"})
target_user_ratings = pd.merge(book_index.reset_index(), target_user_ratings, how="inner", left_on="index", right_on="book_index")
target_user_ratings = pd.merge(target_user_ratings, df_books, how="inner", on="book_id")
target_user_ratings.head(10)

Unnamed: 0,index,book_id,book_index,user_rating,title,avg_rating,ratings_count,year,language_code,image_url,...,non_fiction,children,YA,comic,none,5_star,4_star,3_star,2_star,1_star
0,159556,12352452,159556.0,0.084853,"Season of the Witch: Enchantment, Terror and D...",4.22,2481.0,2012,,https://images.gr-assets.com/books/1329324459m...,...,1,0,0,0,0,1289.0,1244.0,422.0,68.0,22.0
1,222333,13166586,222333.0,0.084853,The Fish That Ate the Whale: The Life and Time...,3.97,1497.0,2012,eng,https://images.gr-assets.com/books/1327473440m...,...,1,0,0,1,0,589.0,735.0,389.0,99.0,21.0
2,270440,13554058,270440.0,0.084853,The Wandering Earth,4.01,894.0,2012,eng,https://images.gr-assets.com/books/1332444998m...,...,0,0,0,0,0,352.0,362.0,194.0,56.0,13.0
3,340694,15195,340694.0,0.113137,"The Complete Maus (Maus, #1-2)",4.53,79457.0,2003,en-GB,https://images.gr-assets.com/books/1327354180m...,...,1,0,1,1,0,55545.0,24026.0,6377.0,1015.0,348.0
4,442607,16256798,442607.0,0.056569,The One Thing: The Surprisingly Simple Truth B...,4.15,15237.0,2013,eng,https://images.gr-assets.com/books/1362177469m...,...,1,0,0,0,0,8374.0,6354.0,2949.0,807.0,352.0
5,562115,17987621,562115.0,0.084853,The Entrepreneurial State: Debunking Public vs...,3.89,358.0,2013,,https://images.gr-assets.com/books/1369818314m...,...,1,0,0,0,0,149.0,224.0,126.0,28.0,9.0
6,568445,18050143,568445.0,0.056569,"Zero to One: Notes on Startups, or How to Buil...",4.17,52400.0,2014,eng,https://images.gr-assets.com/books/1414347376m...,...,1,0,0,0,0,26363.0,21573.0,8747.0,2000.0,1140.0
7,623534,18490568,623534.0,0.084853,"Age of Ambition: Chasing Fortune, Truth, and F...",4.23,3432.0,2014,eng,https://images.gr-assets.com/books/1418113377m...,...,1,0,0,0,0,1703.0,1786.0,586.0,60.0,18.0
8,657155,18778874,657155.0,0.056569,The Second Amendment,4.01,430.0,2014,eng,https://images.gr-assets.com/books/1397767878m...,...,1,0,0,0,0,166.0,257.0,96.0,25.0,5.0
9,754447,20588662,754447.0,0.084853,An Indigenous Peoples' History of the United S...,4.27,1225.0,2014,,https://images.gr-assets.com/books/1395003842m...,...,1,0,0,0,0,763.0,512.0,182.0,51.0,21.0


In [140]:
# 20 most popular books among similar readers
popular_recs = neighbor_user_ratings.groupby(["title", "avg_rating", "ratings_count", "year"])["book_id"]\
    .count().reset_index().sort_values(by=["book_id", "avg_rating"], ascending=False)\
    .nlargest(20, "book_id").rename(columns={"book_id":"percent_similar_users_read"})

popular_recs["percent_similar_users_read"] = (popular_recs["percent_similar_users_read"] / n_neighbors).map('{:.1%}'.format)

popular_recs

Unnamed: 0,title,avg_rating,ratings_count,year,percent_similar_users_read
226,The Gunslinger,3.98,346978.0,1982,5.3%
186,"Surely You're Joking, Mr. Feynman!: Adventures...",4.29,95353.0,1997,4.0%
131,Ninefox Gambit (The Machineries of Empire #1),3.91,3425.0,2016,4.0%
100,Judas Unchained,4.3,21306.0,2005,2.7%
13,"Age of Myth (The Legends of the First Empire, #1)",4.27,9328.0,2016,2.7%
300,"Wool Omnibus (Silo, #1)",4.24,83601.0,2012,2.7%
221,The Golem and the Jinni (The Golem and the Jin...,4.1,61387.0,2013,2.7%
77,"Gateway (Heechee Saga, #1)",4.07,30282.0,2004,2.7%
157,"Ringworld (Ringworld, #1)",3.96,78483.0,2005,2.7%
231,The Hike,3.92,5603.0,2016,2.7%


In [141]:
# 10 highest rated books among similar readers
highest_rated_recs = neighbor_user_ratings.groupby(["title", "avg_rating", "ratings_count", "year"])["book_id"]\
    .count().reset_index().sort_values(by=["avg_rating", "book_id"], ascending=False)\
    .nlargest(20, "avg_rating").rename(columns={"book_id":"percent_similar_users_read"})

highest_rated_recs["percent_similar_users_read"] = highest_rated_recs["percent_similar_users_read"] / n_neighbors

for i in range(n_neighbors,-1,-1):
    results = highest_rated_recs.query("percent_similar_users_read > (@i / @n_neighbors)")    
    if len(results) >= 10:
        print(i)
        break

results

0


Unnamed: 0,title,avg_rating,ratings_count,year,percent_similar_users_read
264,The Story of Human Language,4.44,394.0,2004,0.013333
273,"The Walking Dead, Compendium 1",4.43,58434.0,2009,0.013333
104,"Kings of the Wyld (The Band, #1)",4.42,2285.0,2017,0.013333
10,Adventure Time With Fionna and Cake,4.41,2542.0,2013,0.013333
54,David Attenborough's Life on Air: Memoirs of a...,4.39,1836.0,2003,0.013333
28,"Berserk, Vol. 1 (Berserk, #1)",4.38,10778.0,2003,0.013333
242,The Looming Tower: Al-Qaeda and the Road to 9/11,4.38,13019.0,2006,0.013333
85,Homicide: A Year on the Killing Streets,4.37,10487.0,1991,0.013333
164,"Senlin Ascends (The Books of Babel, #1)",4.37,1410.0,2013,0.013333
202,The Complete Works of H.P. Lovecraft,4.34,9733.0,2011,0.013333


In [142]:
# Best recommendations based on rating, popularity, and user similarity


In [143]:
# Filter recommendations by genre, year, popularity

In [None]:
# Recs filtered by genre