In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# A. Load The Datasets

In [None]:
# Read book_data.csv into a DataFrame and preview its first five rows.
df_books = pd.read_csv(filepath_or_buffer='book_data.csv')
df_books.head(n=5)

# B. Cleaning Datasets

## 1. Check null values

In [None]:
# Number of missing values in each column.
df_books.isna().sum()

## 2. Remove Null Values

I need the book_pages and image_url features so they can be shown in the app. <br>
So I'll remove every row that has a null value in the book_pages, genres, or image_url feature.

In [None]:
# Drop rows missing any of the listed fields, then renumber the rows 0..n-1.
# Without reset_index the index labels keep gaps left by the dropped rows, so
# label-based lookups (e.g. `.index[0]`) stop matching row positions, which is
# what the similarity matrix rows and `.iloc` rely on further down.
df_books = (
    df_books
    .dropna(subset=['book_pages', 'genres', 'image_url'])
    .reset_index(drop=True)
)
# Re-check the per-column null counts after the drop.
df_books.isnull().sum()

# C. Convert authors into Vectors

- Convert the authors in the dataset into vectors
- I split the authors on '|' because '|' is the separator used in the dataset
- I only use the first 8500 rows for the model because of memory constraints
- Check the total number of distinct authors after splitting

In [None]:
# One column per distinct author: each 'a|b|c' string is split on '|'.
# token_pattern=None because a custom tokenizer makes the default pattern
# unused; leaving it set triggers a UserWarning in recent scikit-learn.
ext = CountVectorizer(tokenizer=lambda x: x.split('|'), token_pattern=None)
# Only the first 8500 rows are vectorised (memory limit noted above).
mauthor = ext.fit_transform(df_books['book_authors'].head(8500))
# Vocabulary size == number of distinct author tokens. `vocabulary_` exists in
# every scikit-learn release, whereas `get_feature_names()` was removed in 1.2.
len(ext.vocabulary_)

Insert the vector into an array and check the shape

In [None]:
# Show the dense form of the author matrix.
print(mauthor.toarray())
# The sparse matrix reports the same (rows, columns) shape as its dense form,
# so there is no need to allocate the full dense array a second time.
print(mauthor.shape)

# D. Measuring Cosine Similarity

Measure the similarity between the books' author vectors using cosine similarity

Cosine Similarity Formula = $ \displaystyle \frac {\sum_{i=1}^{n} A_i B_i} {\sqrt {\sum_{i=1}^{n} A_i^2} \times \sqrt {\sum_{i=1}^{n} B_i^2}} $

In [None]:
# Pairwise cosine similarity between all rows of the author matrix;
# row i holds the scores of row i against every row.
cos_score = cosine_similarity(X=mauthor)
# Peek at the scores of the first row.
cos_score[0, :]

# E. Find Similar Books

#### 1. Find the index of the user's favorite book. In this case we use 'The Hobbit' as the title of the user's favorite book

In [None]:
buku_suka = 'The Hobbit'
# Look up the row *position* rather than the index label: cos_score rows and
# the later `.iloc` access are positional, and labels differ from positions
# whenever rows were dropped without resetting the index.
posisi_cocok = np.flatnonzero((df_books['book_title'] == buku_suka).to_numpy())
if posisi_cocok.size == 0:
    # Same exception type the bare `.index[0]` raised, with a usable message.
    raise IndexError(f"book title {buku_suka!r} not found in df_books")
index_suka = int(posisi_cocok[0])
if index_suka >= cos_score.shape[0]:
    # The similarity matrix only covers the rows that were vectorised.
    raise IndexError(
        f"book {buku_suka!r} is at row {index_suka}, outside the "
        f"{cos_score.shape[0]} rows covered by cos_score"
    )
index_suka

#### 2. Check whether the favorite book's own index has 100% similarity with itself

In [None]:
# Pair every score in the favourite book's similarity row with its position.
buku_sama = [(posisi, skor) for posisi, skor in enumerate(cos_score[index_suka])]
# The entry at the favourite book's own position is its self-similarity.
buku_sama[index_suka]

#### 3. Rank the indices from the highest to the lowest similarity score and show the top 10

In [None]:
# Ranking: order the (position, score) pairs by score, highest first,
# and leave out the favourite book's own entry.
buku_mirip = [
    pasangan
    for pasangan in sorted(buku_sama, key=lambda pasangan: pasangan[1], reverse=True)
    if pasangan[0] != index_suka
]
buku_mirip[:10]

#### 4. Show titles of ranked similar books

In [None]:
# Recommendations for readers who like "The Hobbit"

# Each entry is a (row position, score) pair; only the position is needed
# to look up the title.
for posisi, _skor in buku_mirip[:10]:
    print(df_books['book_title'].iloc[posisi])