In [1]:
from IPython.core.display import display, HTML
from collections import Counter
from scipy import spatial
from numpy import dot
from numpy.linalg import norm

import pandas as pd
import numpy as np
import json
import gzip
import re
import warnings

In [2]:
display(HTML("<style>.container { width:100% !important; }</style>"))
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
# Load book data
df_books_raw = pd.read_csv("https://book-recommender-goodreads.s3.us-east-2.amazonaws.com/booksdatagenresfull/books_full.csv", 
                           error_bad_lines=False, warn_bad_lines=False)

In [4]:
# Filter out non-English languages (need to leave nans)
language_regex = "afr|aka|ang|ara|arw|aus|aze|bel|ben|bos|bul|cat|cze|dan|dum|egy|elx|enm|epo|es-MX|est|eus|fil|fin|fre|frs|ger|glg|grc|gre|guj|heb|her|hin|hun|hye|ind|isl|ita|jav|jpn|kan|kat|kor|kur|lat|lav|lit|mal|mar|mkd|mon|msa|mul|nan|nep|nl|nld|nno|nob|nor|oci|ota|peo|per|pes|pol|por|pra|pt-BR|raj|rum|run|rus|scr|sin|slo|slv|spa|sqi|srp|swe|tam|tel|tha|tlh|tur|ukr|urd|vie|zho"
df_books_eng = df_books_raw[~df_books_raw["language_code"].str.contains(language_regex, na=False)]


In [5]:
# Check the shape of the dataframe
df_books_eng.shape

(426534, 7)

In [6]:
# Remove fiction, self-help, and finance books
df_books_eng = df_books_eng[~df_books_eng["popular_shelves"].str.contains("'fiction|thiller|science fiction|novels|comics|Self Help|Finance")]


In [7]:
def trunc_popular_shelves(cell):
  """Removes any genres with counts of 1"""
  trunc_cell = cell[0:cell.find("count='1'")]
  return trunc_cell 


In [8]:
# Removes any genres with counts of 1
df_books_eng['popular_shelves_trun'] = df_books_eng['popular_shelves'].apply(trunc_popular_shelves)


In [9]:
def parse_genre_str(cell_str):
    """Saves only genres of interest and only keep the top two"""
    genre_list = cell_str.split("),")
    regex_genres = "economics|politics|sociology|society|political-science|government|history|philosophy|theory"
    regex_compile = re.compile(regex_genres)
    regex_count = "(?<=count=').*?(?=',)"
    tuple_list = []
    for genre in genre_list:
        if regex_compile.search(genre):
            extracted_genre = re.findall(regex_genres, genre, flags=re.IGNORECASE)
            count_genre = re.findall(regex_count, genre, flags=re.IGNORECASE)
            tuple_list.append((extracted_genre[0], int(count_genre[0])))
    return str(tuple_list[0:2]) if len(tuple_list) > 0 else tuple_list

In [10]:
# Save only genres of interest and only keep the top two
df_books_eng['popular_shelves_clean'] = df_books_eng['popular_shelves_trun'].apply(parse_genre_str)


In [11]:
# Create dummy variables for genres of interest
df_books_eng["economics"] = df_books_eng.apply(lambda x: 1 if "economics" in x["popular_shelves_clean"] else 0, axis=1)
df_books_eng["politics"] = df_books_eng.apply(lambda x: 1 if "politics" in x["popular_shelves_clean"] else 0, axis=1)
df_books_eng["sociology"] = df_books_eng.apply(lambda x: 1 if "sociology" in x["popular_shelves_clean"] else 0, axis=1)
df_books_eng["society"] = df_books_eng.apply(lambda x: 1 if "society" in x["popular_shelves_clean"] else 0, axis=1)
df_books_eng["political-science"] = df_books_eng.apply(lambda x: 1 if "political-science" in x["popular_shelves_clean"] else 0, axis=1)
df_books_eng["government"] = df_books_eng.apply(lambda x: 1 if "government" in x["popular_shelves_clean"] else 0, axis=1)
df_books_eng["history"] = df_books_eng.apply(lambda x: 1 if "history" in x["popular_shelves_clean"] else 0, axis=1)
df_books_eng["philosophy"] = df_books_eng.apply(lambda x: 1 if "philosophy" in x["popular_shelves_clean"] else 0, axis=1)
df_books_eng["theory"] = df_books_eng.apply(lambda x: 1 if "theory" in x["popular_shelves_clean"] else 0, axis=1)


In [12]:
# Trim the dataframe for columns of interest
df_books = df_books_eng[['title', 'book_id', 'publication_year', 'ratings_count', 'language_code', 'average_rating', 'economics', 'politics', 'sociology', 'society', 'political-science', 'government', 'history', 'philosophy', 'theory']]



In [13]:
# Filter for only books that match at least one of the below genres
df_books = df_books.loc[(df_books["economics"]==1) | \
                 (df_books["politics"]==1) | \
                 (df_books["political-science"]==1) | \
                 (df_books["government"]==1) | \
                 (df_books["history"]==1) | \
                 (df_books["philosophy"]==1) | \
                 (df_books["theory"]==1)]


In [14]:
# Filter for books with at least 100 ratings
df_books = df_books.loc[df_books["ratings_count"]>=100]

In [15]:
# See how many books are in each genre (there is overlap)
print("economics:", df_books.loc[(df_books["economics"]==1)].shape[0])
print("politics:", df_books.loc[(df_books["politics"]==1)].shape)
print("political-science:", df_books.loc[(df_books["political-science"]==1)].shape[0])
print("government:", df_books.loc[(df_books["government"]==1)].shape[0])
print("history:", df_books.loc[(df_books["history"]==1)].shape[0])
print("philosophy:", df_books.loc[(df_books["philosophy"]==1)].shape[0])
print("theory:", df_books.loc[(df_books["theory"]==1)].shape[0])

economics: 3081
politics: (7476, 15)
political-science: 141
government: 26
history: 23305
philosophy: 9792
theory: 1401


In [16]:
# Preview the data
df_books.tail(5)

Unnamed: 0,title,book_id,publication_year,ratings_count,language_code,average_rating,economics,politics,sociology,society,political-science,government,history,philosophy,theory
523632,Eiger Dreams,10849,2011.0,103,,3.97,0,0,0,0,0,0,1,0,0
523641,Great Masters: Mozart: His Life and Music (Gre...,3523380,,114,,4.34,0,0,0,0,0,0,1,0,0
523663,The Earth Is Weeping: The Epic Story of the In...,31678077,,147,,4.21,0,0,0,0,0,0,1,0,0
523677,The Muslim Jesus: Sayings and Stories in Islam...,282244,2003.0,137,,3.82,0,0,0,0,0,0,1,1,0
523678,The Lonely Soldier: The Private War of Women S...,6093345,2009.0,207,,4.09,0,1,0,0,0,0,1,0,0


In [17]:
# Create a set of all the book ids (after filtering)
genre_book_ids = set(df_books["book_id"])


## Interactions

In [18]:
df_inter_raw = pd.read_csv("https://book-recommender-goodreads.s3.us-east-2.amazonaws.com/bookinteractionselectread10m/book-interactions-10m.csv", 
                           error_bad_lines=False)


In [19]:
df_inter_filtered = df_inter_raw[df_inter_raw["book_id"].isin(genre_book_ids)]

In [20]:
df_inter_filtered.shape

(907463, 5)

In [21]:
# Count number of times each book was read
book_counts = Counter(df_inter_filtered["book_id"])

In [22]:
# Create sorted list of book read counts
book_counts_sorted = sorted(book_counts.items(), key=lambda item: (-item[1], item[0]))

# Preview top 5 most read
print(book_counts_sorted[0:10])

[(1000, 43323), (1116, 25855), (586, 23755), (998, 14386), (5239, 10513), (531, 7736), (5238, 7128), (621, 6942), (1590, 5619), (13318, 5371)]


In [23]:
# Keep only books with more than one reader
top_books = [book_tuple[0] for book_tuple in book_counts_sorted if (book_tuple[1] > 1)]

In [24]:
len(top_books)

13783

In [25]:
df_inter = df_inter_filtered[['user_id', 'book_id', 'rating']]

In [26]:
df_inter.head()

Unnamed: 0,user_id,book_id,rating
9,0,914,4
12,0,908,4
13,0,905,4
26,0,842,3
27,0,841,4


In [28]:
# Count number of books each user has read 
user_book_count = Counter(df_inter["user_id"])
low_count_users = [x for x, count in user_book_count.items() if count <= 1]

In [29]:
# Keep only users with more than one read
df_inter = df_inter[~df_inter["user_id"].isin(low_count_users)]

In [30]:
df_inter.shape

(897717, 3)

# Create Book-User Matrices

In [33]:
def create_book_user_matrix(df):
    '''
    Return a matrix with book ids as rows and user ids on the columns
    '''
    user_item_nans = df.pivot(index='book_id', columns='user_id', values='rating')
    user_item_counts = user_item_nans.fillna(0)
    user_item_bool = user_item_counts > 0
    user_item = user_item_bool.astype(int)

    return user_item # return the user_item matrix 

In [34]:
book_user = create_book_user_matrix(df_inter)

In [36]:
book_user.head()

user_id,0,1,2,3,4,5,6,8,9,11,...,172067,172068,172069,172070,172071,172072,172073,172075,172076,172078
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
22,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
71,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
76,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
80,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [37]:
def create_book_user_ratings_matrix(df):
    '''
    Return a matrix with book ids as rows and user ids on the columns
    '''
    book_user_nans = df.pivot(index='book_id', columns='user_id', values='rating')
    book_user_counts = book_user_nans.fillna(0)

    return book_user_counts # return the user_item matrix 

In [38]:
book_user_ratings = create_book_user_ratings_matrix(df_inter)

In [39]:
book_user_ratings.head()

user_id,0,1,2,3,4,5,6,8,9,11,...,172067,172068,172069,172070,172071,172072,172073,172075,172076,172078
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
22,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
71,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
76,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
80,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Collaborative Filtering with Pearson Correlation

## With Binary Matrix

In [117]:
def find_similar_books_pearson(book_id, user_item=book_user):
    """For each book, calculate cosine simularity for book_id"""
    # Initiate dataframe with fake row
    df_simularity = pd.DataFrame([[9999,0]], columns = ['book_id', 'sim'])
    for book in user_item.index:
        df_simularity = df_simularity.append(pd.DataFrame([[book, np.corrcoef(user_item.loc[book], user_item.loc[book_id])[1][0]]], columns=df_simularity.columns)) 
       
    return df_simularity # return a list of the users in order from most to least similar


In [118]:
# Basic Economics 3023
df_book_similarity_be = find_similar_books_pearson(3023)

  c /= stddev[:, None]
  c /= stddev[None, :]


In [119]:
# Globalization and its Discontents
df_book_similarity_gd = find_similar_books_pearson(87661)

In [120]:
# The Shock Doctrine: The Rise of Disaster Capitalism 1237300
df_book_similarity_sd = find_similar_books_pearson(1237300)

In [121]:
# Modern Times: The World from the Twenties to the Nineties 55304
df_book_similarity_md = find_similar_books_pearson(55304)

In [122]:
def match_sim_book_title(df_book_similarity, genre_1, genre_2):
    df_book_similarity = df_book_similarity[1:]
    df_books_filterd_sim = df_books[df_books["book_id"].isin(df_book_similarity["book_id"])]
    df_sim_final = pd.merge(df_books_filterd_sim, df_book_similarity, how="left", on="book_id")
    df_sim_final = df_sim_final.sort_values(by='sim', ascending=True)
    df_sim_same_genres = df_sim_final.loc[(df_sim_final[genre_1]==1) & \
                     (df_sim_final[genre_2]==1)]
    return df_sim_same_genres[["title", "book_id", "sim"]].sort_values(by=['sim'])

In [334]:
# Basic Economics 3023
match_sim_book_title(df_book_similarity_be, "economics", "politics").head()

Unnamed: 0,title,book_id,sim
8486,"Pathologies of Power: Health, Human Rights and the New War on the Poor",10232,-0.002563
10948,The Hydrogen Economy: The Creation of the Worldwide Energy Web and the Redis...,23164,-0.001538
9216,The Starfish and the Spider: The Unstoppable Power of Leaderless Organizations,21314,-0.001397
11609,In Our Hands: A Plan to Replace the Welfare State,78092,-0.001291
10786,Applied Economics: Thinking Beyond Stage One,3041,-0.001278


In [153]:
# Globalization and its Discontents
match_sim_book_title(df_book_similarity_gd, "economics", "politics").head()

Unnamed: 0,title,book_id,sim
10948,The Hydrogen Economy: The Creation of the Worl...,23164,-0.00181
9216,The Starfish and the Spider: The Unstoppable P...,21314,-0.001644
5889,The Good Society: The Humane Agenda,55267,-0.001481
572,Natural Capitalism,683,-0.001347
10789,The Vision of the Anointed: Self-Congratulatio...,3044,-0.00133


In [330]:
# The Shock Doctrine: The Rise of Disaster Capitalism 1237300
match_sim_book_title(df_book_similarity_sd, "economics", "politics").head()

Unnamed: 0,title,book_id,sim
8484,"Pathologies of Power: Health, Human Rights and the New War on the Poor",10232,-0.000503
10946,The Hydrogen Economy: The Creation of the Worldwide Energy Web and the Redis...,23164,-0.000302
10786,Knowledge And Decisions,3042,-0.0003
9214,The Starfish and the Spider: The Unstoppable Power of Leaderless Organizations,21314,-0.000274
11607,In Our Hands: A Plan to Replace the Welfare State,78092,-0.000253


In [331]:
# Modern Times: The World from the Twenties to the Nineties 55304
match_sim_book_title(df_book_similarity_md, "economics", "politics").head()

Unnamed: 0,title,book_id,sim
10948,The Hydrogen Economy: The Creation of the Worldwide Energy Web and the Redis...,23164,-0.001415
10788,Knowledge And Decisions,3042,-0.001409
10786,Applied Economics: Thinking Beyond Stage One,3041,-0.001175
5889,The Good Society: The Humane Agenda,55267,-0.001158
572,Natural Capitalism,683,-0.001053


## With Ratings Matrix

In [124]:
def find_similar_books_ratings_pearson(book_id, user_item=book_user_ratings):
    """For each book, calculate cosine simularity for book_id"""
    # Initiate dataframe with fake row
    df_simularity = pd.DataFrame([[9999,0]], columns = ['book_id', 'sim'])
    for book in user_item.index:
        df_simularity = df_simularity.append(pd.DataFrame([[book, np.corrcoef(user_item.loc[book], user_item.loc[book_id])[1][0]]], columns=df_simularity.columns)) 
       
    return df_simularity # return a list of the users in order from most to least similar


In [317]:
# Basic Economics 3023
df_book_similarity_ratings_be = find_similar_books_ratings_pearson(3023)

  c /= stddev[:, None]
  c /= stddev[None, :]


In [335]:
match_sim_book_title(df_book_similarity_ratings_be, "economics", "politics").head()

Unnamed: 0,title,book_id,sim
8486,"Pathologies of Power: Health, Human Rights and the New War on the Poor",10232,-0.002427
10948,The Hydrogen Economy: The Creation of the Worldwide Energy Web and the Redis...,23164,-0.001445
9216,The Starfish and the Spider: The Unstoppable Power of Leaderless Organizations,21314,-0.001328
10786,Applied Economics: Thinking Beyond Stage One,3041,-0.001214
11609,In Our Hands: A Plan to Replace the Welfare State,78092,-0.001212


In [323]:
# The Shock Doctrine: The Rise of Disaster Capitalism 1237300
df_book_similarity_ratings_sd = find_similar_books_ratings_pearson(1237300)

In [339]:
match_sim_book_title(df_book_similarity_ratings_sd, "economics", "politics").head()

Unnamed: 0,title,book_id,sim
8486,"Pathologies of Power: Health, Human Rights and the New War on the Poor",10232,-0.000491
10788,Knowledge And Decisions,3042,-0.000296
10948,The Hydrogen Economy: The Creation of the Worldwide Energy Web and the Redis...,23164,-0.000293
9216,The Starfish and the Spider: The Unstoppable Power of Leaderless Organizations,21314,-0.000269
10786,Applied Economics: Thinking Beyond Stage One,3041,-0.000246


# Collaborative Filtering with Surprise

In [40]:
from surprise import KNNBasic, KNNWithMeans, KNNWithMeansDifferViews
from surprise import Reader
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate

In [99]:
#df_inter_small = df_inter[1:50000]

In [89]:
reader = Reader(rating_scale=(1, 5))
dataset_inter = Dataset.load_from_df(df_inter, reader)

In [90]:
trainset, testset = train_test_split(dataset_inter, test_size=.01)

In [91]:
sim_options = {'name': 'cosine',
               'user_based': False,  # compute  similarities between items,
               'min_support': 5
               }
algo = KNNWithMeansDifferViews(sim_options=sim_options)

In [92]:
trainset.n_items

16364

In [93]:
# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)
predictions = algo.test(testset)

# Then compute RMSE
accuracy.rmse(predictions)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.3261


1.326136152741505

In [137]:
book_id_n = 4201 # Modern Times (inner id)
algo.get_neighbors(book_id_n, k=6)

[1294, 295, 57, 1305, 3842, 448]

In [150]:
# Get 10 least similar books
top_n_neighbors = algo.get_neighbors(book_id_n, k=trainset.n_items)[-50:]

In [151]:
def convert_to_titles(top_n_neighbors, genre_1, genre_2):
    raw_ids_list = []
    for num in top_n_neighbors:
        try:
            raw_id = algo.trainset.to_raw_iid(num)
            raw_ids_list.append(raw_id)
        except Exception:
            pass
    df_books_sim = df_books[df_books["book_id"].isin(raw_ids_list)][['title', 'book_id', 'economics', 'politics', 'sociology',
                                                   'society', 'political-science', 'government', 'history', 'philosophy', 'theory']] 
    df_books_sim = df_books_sim.loc[(df_books_sim[genre_1]==1) & \
                     (df_books_sim[genre_2]==1)]
    return df_books_sim


In [152]:
convert_to_titles(top_n_neighbors, "politics", "history")

Unnamed: 0,title,book_id,economics,politics,sociology,society,political-science,government,history,philosophy,theory
91742,"Curveball: Spies, Lies, and the Man Behind The...",646563,0,1,0,0,0,0,1,0,0
118101,"Beyond the White House: Waging Peace, Fighting...",1852087,0,1,0,0,0,0,1,0,0
245624,The Vanishing Newspaper: Saving Journalism in ...,572268,0,1,0,0,0,0,1,0,0
281903,Modern Times: The World from the 20s to the 90s,616692,0,1,0,0,0,0,1,0,0
298376,They Thought They Were Free: The Germans 1933-45,978689,0,1,0,0,0,0,1,0,0
315046,Public Power in the Age of Empire,9775,0,1,0,0,0,0,1,0,0


In [113]:
algo.trainset.to_inner_iid(55304)

4201

In [133]:
algo.trainset.to_raw_iid(4201)

55304

# Conclusions

There are examples of recommendations that are of differing opinions supplied via the Pearson negative correlation method. However, the results are too inconsistent to rule out their appearance being due to chance. Even if their appearance is not due to chance, we still tend to see recommendations for books at are not of conflicting views, since after all most books will have little shared interactions between them. Due to the recommender system not passing manual review quality checks, an evaluation metric was not chosen.

To improve upon the results, there are two promising approaches I would try next. First, user-generated "progressive" and "conservative" labels would be used in the modeling process. Additionally automated machine learning would be used to apply labels to books not already labeled progressive or conservative. Then, genre filters would be applied. Secondly, more research would be put into users that tend to only read progressive and conservative books and then modify a similarity formula to work with these types of users.