In [14]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer #histogram(array) representation of words
from sklearn.metrics.pairwise import cosine_similarity #to calculate the cosine_similarity
from sklearn import datasets
from scipy import sparse
#data from goodbooks-10k dataset
df = pd.read_csv('books.csv')
print(df.keys())

Index(['book_id', 'goodreads_book_id', 'best_book_id', 'work_id',
       'books_count', 'isbn', 'isbn13', 'authors', 'original_publication_year',
       'original_title', 'title', 'language_code', 'average_rating',
       'ratings_count', 'work_ratings_count', 'work_text_reviews_count',
       'ratings_1', 'ratings_2', 'ratings_3', 'ratings_4', 'ratings_5',
       'image_url', 'small_image_url'],
      dtype='object')


In [2]:
features = ['authors', 'original_publication_year','average_rating']
for feature in features:
    df[feature] = df[feature].fillna(' ') #filling all NaNs with blank string
# print(df[['artist_mbtags']])

In [4]:
df["original_publication_year"] = df["original_publication_year"].astype(str) #changes the year column from int to str
df["average_rating"] = df["average_rating"].astype(str) #changes the year column from int to str


def combine_features(row):
    '''combines the values of the columns into 1 string'''
    return row['authors']+' '+row['original_publication_year']+' '+row['average_rating']

#applying combined_features() method over each rows of dataframe 
#and storing the combined string in “combined_features” column
df["combined_features"] = df.apply(combine_features,axis=1)

In [8]:
df['combined_features']
df.head()

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url,combined_features,index
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,Suzanne Collins 2008.0 4.34,0
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...,"J.K. Rowling, Mary GrandPré 1997.0 4.44",1
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...,Stephenie Meyer 2005.0 3.57,2
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...,Harper Lee 1960.0 4.25,3
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...,F. Scott Fitzgerald 1925.0 3.89,4


In [6]:
cv = CountVectorizer() #creating new CountVectorizer() object
count_matrix = cv.fit_transform(df['combined_features']) #feeding combined strings(book contents) to CountVectorizer() object
cosine_sim = cosine_similarity(count_matrix)
print(cosine_sim)

[[1. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 1.]]


In [9]:
df['index'] = df.index #make a new column that holds the index 

def get_title_from_index(index):
    return df[df.index == index]["title"].values[0]

def get_index_from_title(title):
    return df[df.title == title]["index"].values[0]


In [13]:
def get_five_similar_books(book_user_likes):
    # movie_user_likes = 'Avatar'
    book_index = get_index_from_title(book_user_likes)

    #accessing the row corresponding to given movie to find
    # all the similarity scores for that movie and then enumerating over it
    similar_books = list(enumerate(cosine_sim[book_index])) #tuple (index,similarity)

    #sorting the books by their similarity (second index in the tuple) cosine
    sorted_similar_books = sorted(similar_books,key=lambda x:x[1],reverse=True)[1:]

    i = 0
    book_array = [] #create an empty list
    for element in sorted_similar_books:
            book_array.append(get_title_from_index(element[0])) #element[0] gives us index of the movie
            i += 1
            if i >= 5:
                break

    return book_array



if __name__ == '__main__':
    print(get_five_similar_books('The Notebook (The Notebook, #1)'))

['The Lucky One', 'The Rescue', 'True Believer (Jeremy Marsh & Lexie Darnell, #1)', 'Two By Two', 'Three Weeks With My Brother']
