In [2]:
import os

import numpy as np
import pandas as pd
from numpy import dot
from numpy.linalg import norm

# Location of the books CSV. The default is the original author's local path;
# set the LIA_DATA_PATH environment variable to run the notebook elsewhere.
DATA_PATH = os.environ.get('LIA_DATA_PATH', r'd:\naomy\LIA-FastAPI-MySQL\data\data.csv')

df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,id,title,author,description,genre,classification,pages,owner_id
0,1,Matadouro cinco,Kurt Vonnegut,Um livro que zomba da estupidez humana com um ...,Science Fiction,5,288,1
1,2,Cama de gato,Kurt Vonnegut,Mais um livro incrível que zomba da condição h...,Science Fiction,5,280,1
2,3,Cem anos de solidão,Gabriel Garcia Marquez,Um realismo fantástico lindo e cativante,Fiction Novel,5,448,1
3,4,Cem anos de solidão,Gabriel Garcia Marquez,,Fiction Novel,5,448,2
4,5,Sobre os ossos dos mortos,Olga Tockarzuck,Uma história sobre nossa relação com a naturez...,Fiction Novel,5,256,1


In [3]:
# Rename 'id' to 'book_id'. Rebinding df (rather than inplace=True) keeps the
# step chainable and leaves the previously displayed frame object untouched.
df = df.rename(columns={'id': 'book_id'})
df.head()

Unnamed: 0,book_id,title,author,description,genre,classification,pages,owner_id
0,1,Matadouro cinco,Kurt Vonnegut,Um livro que zomba da estupidez humana com um ...,Science Fiction,5,288,1
1,2,Cama de gato,Kurt Vonnegut,Mais um livro incrível que zomba da condição h...,Science Fiction,5,280,1
2,3,Cem anos de solidão,Gabriel Garcia Marquez,Um realismo fantástico lindo e cativante,Fiction Novel,5,448,1
3,4,Cem anos de solidão,Gabriel Garcia Marquez,,Fiction Novel,5,448,2
4,5,Sobre os ossos dos mortos,Olga Tockarzuck,Uma história sobre nossa relação com a naturez...,Fiction Novel,5,256,1


In [4]:
def normalize(data):
    '''
    Scale the input values into the range 0 to 1.

    If the smallest value is negative, every value is first shifted up by the
    absolute value of that minimum so the smallest value becomes 0. The
    (possibly shifted) values are then divided by their maximum. Input that is
    already non-negative is not shifted, so its minimum does not necessarily
    map to 0.

    params:
        data: iterable of numbers you want to normalize (list, numpy array, ...)

    returns:
        A list with one scaled value per input value. An empty input gives an
        empty list; if the maximum after shifting is 0 (all values equal and
        <= 0) every value is returned as 0.0 instead of dividing by zero.
    '''
    values = list(data)
    if not values:
        return []

    min_val = min(values)
    if min_val < 0:
        values = [x + abs(min_val) for x in values]

    max_val = max(values)
    if max_val == 0:
        # Nothing to scale by: every (shifted) value is 0.
        return [0.0 for _ in values]
    return [x/max_val for x in values]

In [5]:
# Add a normalized copy of each numeric column: (source column, new column).
for source_col, scaled_col in (('pages', 'pages_norm'), ('classification', 'book_rating_norm')):
    df[scaled_col] = normalize(df[source_col].values)

In [6]:
def ohe(df, enc_col):
    '''
    One-hot encode the specified column and append the indicator columns to
    the input dataframe.

    The original column is kept; one new column is added per distinct value of
    enc_col, named after that value (no prefix is added).

    params:
        df (DataFrame) : The dataframe you wish for the results to be appended to
        enc_col (String) : The column you want to OHE

    returns:
        A new dataframe: the input columns followed by the OHE columns, with
        the same index as the input.
    '''
    # get_dummies keeps df's index, so concatenating along axis=1 pairs every
    # indicator row with the row it was derived from. Resetting the index here
    # would misalign the rows whenever df does not have a default 0..n-1 index.
    ohe_df = pd.get_dummies(df[enc_col])
    return pd.concat([df, ohe_df], axis = 1)

In [7]:
# One-hot encode each categorical column in turn; ohe returns a new frame,
# so df is rebound after every pass.
for categorical_col in ('genre', 'author'):
    df = ohe(df=df, enc_col=categorical_col)


In [8]:
# Columns that are not used as features: 'pages', 'genre' and 'author' have
# been replaced by their normalized / one-hot encoded versions above, and the
# free-text 'description' and 'title' columns are dropped as well.
cols = ['pages', 'genre', 'description', 'title', 'author']

# Drop them and index the frame by book_id so rows can be looked up by book id.
# Chained reassignment replaces the two inplace=True mutations.
df = df.drop(columns = cols).set_index('book_id')


In [9]:
class CBRecommend():
    '''
    Content-based recommender: ranks the rows of a feature dataframe by their
    cosine similarity to one chosen row.
    '''

    def __init__(self, df):
        '''
        params:
            df (DataFrame) : One row of features per book. recommend() looks
                rows up with .loc, so the index must hold the book ids, and
                every column is used in the similarity calculation, so the
                columns are expected to be numeric. The frame is stored by
                reference, not copied.
        '''
        self.df = df
        
    def cosine_sim(self, v1,v2):
        '''
        This function will calculate the cosine similarity between two vectors

        returns:
            dot(v1, v2) / (norm(v1) * norm(v2)), or 0.0 when either vector has
            zero length (the ratio is undefined there).
        '''
        denominator = norm(v1)*norm(v2)
        if denominator == 0:
            return 0.0
        return dot(v1,v2)/denominator
    
    def recommend(self, book_id, n_rec):
        """
        Return the n_rec rows most similar to the row of the given book.

        params:
            book_id : Index label of the book to compare against
            n_rec (int) : Number of rows to return

        returns:
            A new dataframe holding the n_rec rows with the highest cosine
            similarity, with the score in an added 'sim' column, sorted from
            most to least similar. The queried book is part of the ranking
            and is not filtered out.

        raises:
            KeyError if book_id is not in the index.

        self.df is left unmodified, so repeated calls give the same scores.
        """
        # 'sim' is reserved for the score: leave out any such column already
        # present so an earlier result can never leak into the feature vectors.
        features = self.df.drop(columns='sim', errors='ignore')

        # calculate similarity of input book_id vector w.r.t all other vectors
        inputVec = features.loc[book_id].values
        scored = features.copy()
        scored['sim'] = features.apply(lambda x: self.cosine_sim(inputVec,x.values), axis=1)
        
        # returns top n user specified books
        return scored.nlargest(columns='sim',n=n_rec)


In [12]:
# Give the recommender its own copy of the prepared feature frame.
t = df.copy()
cbr = CBRecommend(t)

# Query with the book at row position 76 and ask for the 10 closest matches.
query_book_id = t.index[76]
cbr.recommend(book_id=query_book_id, n_rec=10)

Unnamed: 0_level_0,classification,owner_id,pages_norm,book_rating_norm,Dystopian,"Fantasy, Adventure",Fiction Novel,Graphic Novel,"Mystery, Suspense, Horror",Nonfiction,...,Scott Smith,Stephen King,Stephenie Meyer,Ted Chiang,Ursula K LeGuin,Victor Bonini,Victor LaValle,William Peter Blatty,William Shakespeare,sim
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
77,3,1,0.35,0.6,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1.0
107,5,2,0.5,1.0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0.93576
4,5,2,0.466667,1.0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0.935646
84,5,2,0.21875,1.0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0.933785
78,5,2,0.166667,1.0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0.933165
27,5,1,0.541667,1.0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0.930315
3,5,1,0.466667,1.0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0.930144
40,4,1,0.466667,0.8,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0.930088
106,4,1,0.5,0.8,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0.930074
39,4,1,0.383333,0.8,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0.929895
