In [1]:
import pandas as pd
import numpy as np
from numpy import dot
from numpy.linalg import norm 

# Don't collapse Pandas Dataframes:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.precision', 10)

In [2]:
## Given a Book ID recommend similar Book IDs based on Features:
## Genre
## Publisher
## Normalized User Rating

In [3]:
def normalize(data): ## Normalize Data between 0 and 1
    min_val = min(data)
    if min_val < 0:
        data = [x + abs(min_val) for x in data]
    max_val = max(data)
    normalized_data = [x/max_val for x in data]
    return normalized_data
  
def one_hot_encoding_column(df, column):
    ohe_df = pd.get_dummies(df[column])
    ohe_df.reset_index(drop = True, inplace = True)
    df = pd.concat([df, ohe_df], axis = 1)
    return df

def cosine_sim(v1,v2):
    cosine_similarity = sum((dot(v1,v2)/(norm(v1)*norm(v2))))
    # print(cosine_similarity)
    return cosine_similarity

def recommend(df, book_id, number_of_recommendations):
    vector = df.loc[book_id].values
    df['Cosine Similarity'] = df.apply(lambda x: cosine_sim(vector, x.values), axis=1)
    return df


In [4]:
## Import Dataset
df = pd.read_csv('dataset.csv')
df['num_pages_norm'] = normalize(df['num_pages'].values)
df['book_rating_norm'] = normalize(df['book_rating'].values)
df['book_price_norm'] = normalize(df['book_price'].values)
print('Dataset Shape: {0}'.format(df.shape))
df.sort_values(['book_id'], ascending=False).head(20)

Dataset Shape: (100000, 13)


Unnamed: 0,book_id,author_id,book_genre,user_id,num_pages,book_rating,publisher_id,publish_year,book_price,text_lang,num_pages_norm,book_rating_norm,book_price_norm
85340,3000,139,1,29561,479,8,33,2016,92,7,0.6842857143,0.8,0.46
22544,3000,213,5,1954,159,5,42,2004,159,1,0.2271428571,0.5,0.795
63678,3000,402,1,4738,377,4,50,2004,26,6,0.5385714286,0.4,0.13
65210,3000,7,7,1794,145,5,2,2014,141,4,0.2071428571,0.5,0.705
94237,3000,313,5,10960,350,6,48,2010,78,2,0.5,0.6,0.39
43063,3000,187,5,12893,664,6,2,2014,72,1,0.9485714286,0.6,0.36
77475,3000,39,3,12174,691,2,16,2020,81,5,0.9871428571,0.2,0.405
68483,3000,376,3,1928,329,8,27,2001,85,4,0.47,0.8,0.425
25087,3000,282,3,1627,87,10,40,2003,79,5,0.1242857143,1.0,0.395
58198,3000,69,2,16670,123,7,45,2002,163,7,0.1757142857,0.7,0.815


In [5]:
df = one_hot_encoding_column(df = df, column = 'book_genre') # Feature & Labels
df = one_hot_encoding_column(df = df, column = 'publisher_id') # Feature & Labels
df = one_hot_encoding_column(df = df, column = 'book_rating_norm') # Feature & Labels
df.drop(columns = ['publish_year', 'num_pages', 'book_rating', 'book_price', 'text_lang'], inplace = True) # Remove Columns that aren't Features
df.set_index('book_id', inplace=True) # Set Instance as Index
df.sort_values(['book_id'], ascending=False).head()

Unnamed: 0_level_0,author_id,book_genre,user_id,publisher_id,num_pages_norm,book_rating_norm,book_price_norm,1,2,3,4,5,6,7,8,9,10,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1
3000,139,1,29561,33,0.6842857143,0.8,0.46,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3000,213,5,1954,42,0.2271428571,0.5,0.795,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3000,402,1,4738,50,0.5385714286,0.4,0.13,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0
3000,7,7,1794,2,0.2071428571,0.5,0.705,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3000,313,5,10960,48,0.5,0.6,0.39,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0


In [6]:
# ran on a sample as an example
book_id = 1001
number_of_recommendations = 20
recommendation_df = recommend(df, book_id, number_of_recommendations)
recommendation_df = recommendation_df[['book_genre','publisher_id','book_rating_norm','Cosine Similarity']]
recommendation_df.sort_values(['Cosine Similarity'], ascending=True).head(number_of_recommendations) # As the cosine similarity measurement gets closer to 1, then the angle between the two vectors A and B is smaller

Unnamed: 0_level_0,book_genre,publisher_id,book_rating_norm,Cosine Similarity
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
604,9,10,0.8,0.0969753622
439,7,37,0.2,0.1057412977
472,3,16,0.2,0.109763365
1565,6,40,0.1,0.1126908825
1884,6,26,0.8,0.1140892633
96,9,23,0.2,0.1179486165
1316,1,30,0.1,0.1311805653
1287,1,42,0.6,0.1755700205
147,1,2,0.6,0.1855355105
2207,4,50,0.9,0.2100611255


In [7]:
# ran on a sample as an example
book_id = 1318
number_of_recommendations = 20
recommendation_df = recommend(df, book_id, number_of_recommendations)
recommendation_df = recommendation_df[['book_genre','publisher_id','book_rating_norm','Cosine Similarity']]
recommendation_df.sort_values(['Cosine Similarity'], ascending=True).head(number_of_recommendations) # As the cosine similarity measurement gets closer to 1, then the angle between the two vectors A and B is smaller

Unnamed: 0_level_0,book_genre,publisher_id,book_rating_norm,Cosine Similarity
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
604,9,10,0.8,0.1106821333
439,7,37,0.2,0.1209843698
472,3,16,0.2,0.1257210454
1565,6,40,0.1,0.1291558936
1884,6,26,0.8,0.1308032695
96,9,23,0.2,0.1353378346
1316,1,30,0.1,0.1508989293
1287,1,42,0.6,0.2030889574
147,1,2,0.6,0.2148199329
2207,4,50,0.9,0.2447819414


In [8]:
# ran on a sample as an example
book_id = 700
number_of_recommendations = 20
recommendation_df = recommend(df, book_id, number_of_recommendations)
recommendation_df = recommendation_df[['book_genre','publisher_id','book_rating_norm','Cosine Similarity']]
recommendation_df.sort_values(['Cosine Similarity'], ascending=True).head(number_of_recommendations) # As the cosine similarity measurement gets closer to 1, then the angle between the two vectors A and B is smaller

Unnamed: 0_level_0,book_genre,publisher_id,book_rating_norm,Cosine Similarity
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
604,9,10,0.8,0.0984989967
439,7,37,0.2,0.1068544636
472,3,16,0.2,0.1106630457
1565,6,40,0.1,0.113459939
1884,6,26,0.8,0.114777085
96,9,23,0.2,0.1184411224
1316,1,30,0.1,0.1310302299
1287,1,42,0.6,0.1732065552
147,1,2,0.6,0.1826309534
2207,4,50,0.9,0.2038436753


In [9]:
# ran on a sample as an example
book_id = 302
number_of_recommendations = 20
recommendation_df = recommend(df, book_id, number_of_recommendations)
recommendation_df = recommendation_df[['book_genre','publisher_id','book_rating_norm','Cosine Similarity']]
recommendation_df.sort_values(['Cosine Similarity'], ascending=True).head(number_of_recommendations) # As the cosine similarity measurement gets closer to 1, then the angle between the two vectors A and B is smaller

Unnamed: 0_level_0,book_genre,publisher_id,book_rating_norm,Cosine Similarity
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
604,9,10,0.8,0.1155832131
439,7,37,0.2,0.1257963722
472,3,16,0.2,0.1304207549
1565,6,40,0.1,0.1338765689
1884,6,26,0.8,0.1354437055
96,9,23,0.2,0.1399143951
1316,1,30,0.1,0.155288699
1287,1,42,0.6,0.2067419293
147,1,2,0.6,0.2181920991
2207,4,50,0.9,0.2461256027
