In [1]:
import pandas as pd
import numpy as np
from numpy import dot
from numpy.linalg import norm 

# Don't collapse Pandas Dataframes:
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_colwidth', None)
# pd.set_option('display.precision', 10)

In [2]:
## Given an author, recommend similar book ids based on the following features:
## Genre
## Publisher
## Normalized User Rating

In [3]:
def normalize(data):
    """Scale a sequence of numbers into the range 0..1 by dividing by its maximum.

    If the smallest value is negative, every value is first shifted up by
    ``abs(min)`` so that the minimum becomes 0. The (possibly shifted) values
    are then divided by their maximum.

    Parameters
    ----------
    data : iterable of numbers
        Values to scale (a list or a 1-D array, for example).

    Returns
    -------
    list
        One scaled value per input value, in the original order. An empty
        input yields an empty list. When the maximum after shifting is 0
        (all zeros, or a constant negative value) there is nothing to scale
        by, so a list of ``0.0`` is returned instead of dividing by zero.
    """
    values = list(data)
    if not values:
        # min()/max() raise on an empty sequence; nothing to normalize.
        return []

    min_val = min(values)
    if min_val < 0:
        # Shift so the smallest value sits at 0 before scaling.
        offset = abs(min_val)
        values = [x + offset for x in values]

    max_val = max(values)
    if max_val == 0:
        # Every value is 0 at this point; avoid a division by zero.
        return [0.0 for _ in values]

    return [x / max_val for x in values]
  
def one_hot_encoding_column(df, column):
    """Return ``df`` with one indicator column appended per distinct value of ``column``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column : label
        Name of the column to one-hot encode. The original column is kept.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns of ``df`` followed by the indicator
        columns produced by ``pd.get_dummies``. The new columns are named
        after the encoded values themselves (no prefix is added).
    """
    # get_dummies keeps the row index of df[column], so the indicator frame is
    # already aligned with df. Resetting only the indicator frame's index (as
    # was done previously) misaligns the rows whenever df does not carry a
    # default 0..n-1 index, and concat then returns a union padded with NaN.
    ohe_df = pd.get_dummies(df[column])
    return pd.concat([df, ohe_df], axis=1)

def cosine_sim(v1, v2):
    """Cosine similarity between ``v1`` and ``v2``.

    Computes ``dot(v1, v2) / (norm(v1) * norm(v2))`` and sums the result.

    * When both inputs are 1-D this is the ordinary cosine similarity.
    * When ``v1`` is 2-D (several rows) and ``v2`` is 1-D, ``dot`` yields one
      value per row of ``v1`` and ``norm(v1)`` is the norm of the whole
      matrix; the per-row values are summed into a single number.

    Parameters
    ----------
    v1, v2 : array-like of numbers
        Inputs are converted to float arrays before use.

    Returns
    -------
    float
        The (summed) similarity, or ``0.0`` when either input has zero norm,
        in which case the ratio is undefined.
    """
    a = np.asarray(v1, dtype=float)
    b = np.asarray(v2, dtype=float)

    denominator = norm(a) * norm(b)
    if denominator == 0:
        # A zero vector has no direction; avoid 0/0 producing nan.
        return 0.0

    # np.sum accepts scalars as well as arrays; the builtin sum() raises
    # TypeError when dot() returns a scalar (two 1-D inputs).
    return float(np.sum(dot(a, b) / denominator))

def recommend(df, author_id, number_of_recommendations):
    """Score every row of ``df`` by cosine similarity to the row(s) labelled ``author_id``.

    The score is written to ``df['Cosine Similarity']`` (``df`` is modified in
    place) and ``df`` itself is returned, with all rows in their original
    order. Sorting and truncating are left to the caller.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix: every column except 'Cosine Similarity' is treated as
        a numeric feature.
    author_id : label
        Index label of the query row(s), looked up with ``.loc``. If the label
        matches several rows they are used together: the numerator is the sum
        of the per-row dot products and the denominator uses the norm of the
        whole block of query rows.
    number_of_recommendations : int
        Accepted for interface compatibility; not used here.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the 'Cosine Similarity' column set. Rows
        whose similarity is undefined (zero norm) receive 0.0.

    Raises
    ------
    KeyError
        If ``author_id`` is not present in the index of ``df``.
    """
    similarity_column = 'Cosine Similarity'

    # A score column left behind by an earlier call must not be used as a
    # feature, otherwise repeated calls on the same frame change the result.
    feature_frame = df.loc[:, df.columns != similarity_column]
    features = feature_frame.to_numpy(dtype=float)

    # .loc gives a Series for a unique label and a DataFrame for a repeated
    # one; normalise both to a 2-D (rows, features) array.
    query = np.atleast_2d(np.asarray(feature_frame.loc[author_id], dtype=float))

    # Sum over query rows of (query_row . row) == (sum of query rows) . row
    numerators = features @ query.sum(axis=0)
    # Row-wise Euclidean norms without materialising a squared copy.
    row_norms = np.sqrt(np.einsum('ij,ij->i', features, features))
    denominators = norm(query) * row_norms

    similarities = np.zeros(len(features), dtype=float)
    np.divide(numerators, denominators, out=similarities, where=denominators > 0)

    df[similarity_column] = similarities
    return df


In [7]:
## Load the dataset and add normalized copies of the numeric columns
df = pd.read_csv('dataset.csv')

# target column -> source column, in the order the new columns are appended
normalized_columns = {
    'num_pages_norm': 'num_pages',
    'book_rating_norm': 'book_rating',
    'book_price_norm': 'book_price',
}
for target_column, source_column in normalized_columns.items():
    df[target_column] = normalize(df[source_column].values)

print('Dataset Shape: {0}'.format(df.shape))

# Preview the 20 rows with the largest author_id values.
df.sort_values(['author_id'], ascending=False).head(20)

Dataset Shape: (100000, 13)


Unnamed: 0,book_id,author_id,book_genre,user_id,num_pages,book_rating,publisher_id,publish_year,book_price,text_lang,num_pages_norm,book_rating_norm,book_price_norm
78619,746,450,4,7903,109,8,14,2019,170,3,0.155714,0.8,0.85
80940,1030,450,10,10686,203,3,28,2021,114,6,0.29,0.3,0.57
70280,90,450,4,154,123,6,37,2012,111,1,0.175714,0.6,0.555
21205,1059,450,5,12844,608,1,6,2010,61,5,0.868571,0.1,0.305
59330,1976,450,5,13232,625,1,33,2006,16,5,0.892857,0.1,0.08
62361,2720,450,3,15014,126,3,14,2008,125,2,0.18,0.3,0.625
12534,701,450,1,2844,587,6,42,2009,196,4,0.838571,0.6,0.98
13346,2029,450,3,9652,518,5,30,2006,154,7,0.74,0.5,0.77
49980,2155,450,3,10451,280,4,49,2003,122,3,0.4,0.4,0.61
85668,1751,450,3,23773,205,1,14,2019,78,1,0.292857,0.1,0.39


In [8]:
# One-hot encode three columns; each call appends the indicator columns to df.
# NOTE(review): the notebook header lists Genre / Publisher / Normalized User
# Rating as the features, while the columns encoded here are book_price,
# book_id and book_rating_norm - confirm this is the intended feature set.
for encoded_column in ('book_price', 'book_id', 'book_rating_norm'):
    df = one_hot_encoding_column(df=df, column=encoded_column)

# Remove the columns that are not used as features. author_id and book_genre
# are among them, so they are not available for selection after this cell.
non_feature_columns = ['publish_year', 'num_pages', 'book_rating', 'book_price', 'text_lang', 'author_id', 'book_genre']
df = df.drop(columns=non_feature_columns)

# Use book_id as the row index so rows can be looked up by book id.
df = df.set_index('book_id')
df.sort_values(['book_id'], ascending=False).head()

Unnamed: 0_level_0,user_id,publisher_id,num_pages_norm,book_rating_norm,book_price_norm,1,2,3,4,5,...,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3000,29561,33,0.684286,0.8,0.46,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3000,1954,42,0.227143,0.5,0.795,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3000,4738,50,0.538571,0.4,0.13,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3000,1794,2,0.207143,0.5,0.705,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3000,10960,48,0.5,0.6,0.39,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [9]:
# ran on a sample as an example
book_id = 300

number_of_recommendations = 20
recommendation_df = recommend(df, book_id, number_of_recommendations)
recommendation_df = recommendation_df[['author_id','book_genre','book_rating_norm','cosine_similarity']]
recommendation_df.sort_values(['Cosine Similarity'], ascending=False).head(number_of_recommendations) # As the cosine similarity measurement gets closer to 1, then the angle between the two vectors A and B is smaller

KeyError: "['author_id', 'book_genre', 'cosine_similarity'] not in index"

In [None]:
# Ran on a sample as an example.
# NOTE(review): df is indexed by book_id, so this value is looked up as a book
# id despite the variable name - confirm which id is intended.
author_id = 250
number_of_recommendations = 20
recommendation_df = recommend(df, author_id, number_of_recommendations)
# 'book_genre' was dropped when the feature frame was built, so selecting it
# raises a KeyError; keep only columns that are still present.
recommendation_df = recommendation_df[['book_rating_norm', 'Cosine Similarity']]
# The closer the cosine similarity is to 1, the smaller the angle between the two vectors.
recommendation_df.sort_values(['Cosine Similarity'], ascending=False).head(number_of_recommendations)

In [None]:
# Ran on a sample as an example.
# NOTE(review): df is indexed by book_id, so this value is looked up as a book
# id despite the variable name - confirm which id is intended.
author_id = 100
number_of_recommendations = 20
recommendation_df = recommend(df, author_id, number_of_recommendations)
# 'book_genre' was dropped when the feature frame was built, so selecting it
# raises a KeyError; keep only columns that are still present.
recommendation_df = recommendation_df[['book_rating_norm', 'Cosine Similarity']]
# The closer the cosine similarity is to 1, the smaller the angle between the two vectors.
recommendation_df.sort_values(['Cosine Similarity'], ascending=False).head(number_of_recommendations)

In [None]:
# Ran on a sample as an example.
# NOTE(review): df is indexed by book_id, so this value is looked up as a book
# id despite the variable name - confirm which id is intended.
author_id = 72
number_of_recommendations = 20
recommendation_df = recommend(df, author_id, number_of_recommendations)
# 'book_genre' was dropped when the feature frame was built, so selecting it
# raises a KeyError; keep only columns that are still present.
recommendation_df = recommendation_df[['book_rating_norm', 'Cosine Similarity']]
# The closer the cosine similarity is to 1, the smaller the angle between the two vectors.
recommendation_df.sort_values(['Cosine Similarity'], ascending=False).head(number_of_recommendations)