In [2]:
# import spacy

In [1]:
# !python -m spacy download en_core_web_lg
import spacy
import pandas as pd
import numpy as np
import spacy
import json
from spacy.lang.en import English

In [2]:
#load the csv
%time
filename = '/Users/mattmastin/Desktop/trimmed_20k.csv'
df = pd.read_csv(filename).drop('Unnamed: 0', axis=1)

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 8.11 µs


In [24]:
df.head()

Unnamed: 0,book_title,author,avg_rating,ISBN,genre,description
0,"The Hunger Games (The Hunger Games, #1)",Suzanne Collins,4.33,439023483,"['Young Adult', 'Fiction', 'Science Fiction', ...","Could you survive on your own, in the wild, wi..."
1,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling,4.49,439358078,"['Fantasy', 'Young Adult', 'Fiction']",There is a door at the end of a silent corrido...
2,"Twilight (Twilight, #1)",Stephenie Meyer,3.59,316015849,"['Young Adult', 'Fantasy', 'Romance', 'Paranor...",About three things I was absolutely positive.F...
3,The Book Thief,Markus Zusak (Goodreads Author),4.37,375831002,"['Historical', 'Historical Fiction', 'Fiction'...",It is 1939. Nazi Germany. The country is holdi...
4,Animal Farm,George Orwell,3.92,452284244,"['Classics', 'Fiction', 'Science Fiction', 'Dy...",George Orwell's timeless and timely allegorica...


In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19900 entries, 0 to 19899
Data columns (total 6 columns):
book_title     19900 non-null object
author         19900 non-null object
avg_rating     19900 non-null float64
ISBN           19900 non-null int64
genre          19900 non-null object
description    19900 non-null object
dtypes: float64(1), int64(1), object(4)
memory usage: 932.9+ KB


In [3]:
df['avg_rating'] = float(df['avg_rating'].iloc)
df['ISBN'] = int(df['ISBN'].iloc)

TypeError: float() argument must be a string or a number, not '_iLocIndexer'

In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19900 entries, 0 to 19899
Data columns (total 6 columns):
book_title     19900 non-null object
author         19900 non-null object
avg_rating     19900 non-null float64
ISBN           19900 non-null int64
genre          19900 non-null object
description    19900 non-null object
dtypes: float64(1), int64(1), object(4)
memory usage: 932.9+ KB


In [31]:
df.head()

Unnamed: 0,book_title,author,avg_rating,ISBN,genre,description
0,"The Hunger Games (The Hunger Games, #1)",Suzanne Collins,4.33,439023483,"['Young Adult', 'Fiction', 'Science Fiction', ...","Could you survive on your own, in the wild, wi..."
1,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling,4.33,439023483,"['Fantasy', 'Young Adult', 'Fiction']",There is a door at the end of a silent corrido...
2,"Twilight (Twilight, #1)",Stephenie Meyer,4.33,439023483,"['Young Adult', 'Fantasy', 'Romance', 'Paranor...",About three things I was absolutely positive.F...
3,The Book Thief,Markus Zusak (Goodreads Author),4.33,439023483,"['Historical', 'Historical Fiction', 'Fiction'...",It is 1939. Nazi Germany. The country is holdi...
4,Animal Farm,George Orwell,4.33,439023483,"['Classics', 'Fiction', 'Science Fiction', 'Dy...",George Orwell's timeless and timely allegorica...


In [4]:
#create the spacy docs column of the book descriptions
%time
nlp = spacy.load("en_core_web_sm")
docs = list(nlp.pipe(df.description))
df['docs'] = docs

CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 5.96 µs


In [None]:
#pickle the df for fast loading after server restart:
df.to_pickle('df_pickle.pkl', compression='infer')

In [8]:
#dependencies: pandas, numpy, json, spacy, and en_core_web_lg (python -m spacy download en_core_web_lg)

# import pandas as pd
# import numpy as np
# import spacy
# import json
# from spacy.lang.en import English

# #load the csv
# filename = '/Users/mattmastin/Desktop/trimmed_20k.csv'
# df = pd.read_csv(filename).drop('Unnamed: 0', axis=1)

# #create the spacy docs column of the book descriptions
# nlp = spacy.load("en_core_web_lg")
# docs = list(nlp.pipe(df.description))
# df['docs'] = docs



#to load the pickled df back in the future:
df = pd.read_pickle('df_pickle.pkl')


#Functions:

def get_recs_from_desc(input_string, from_isbn=False):
    '''Takes a book description, converts to a spacy doc object and 
    calculates the similarity score for all other books in the dataframe 
    (variable called df), sorts and returns the top 10 as a json object
    containing title, author, avg rating and ISBN'''
    
    #convert input string of hypthetical book description into spacy doc object
    test_doc = nlp(input_string)
    
    #instantiate empty list of similarity scores:
    sims = []
    
    #iterate over the doc object for each book in the df to get the similarity score and append to list
    for doc in df.docs:
        sim = test_doc.similarity(doc)
        sims.append(sim)
    
    #sort the list and grab the top 10:
    if from_isbn:
        #skip the 0th ranked book which will be the bookused to get the input_string of the description:
        top10 = pd.Series(sims).sort_values(ascending=False).iloc[1:11]
    else:
        top10 = pd.Series(sims).sort_values(ascending=False).iloc[:10]
    
    #instantiate empty list to store the python dicts of each book
    books = []
    
    #iterate thru the top 10 ranked simlilar books and populate the book list w/ dictionaries for each book
    for i in top10.index:
        book = {}
        book['title'] = df.iloc[i]['book_title']
        book['author'] = df.iloc[i]['author']
        book['avg_rating'] = df.iloc[i]['avg_rating']
        book['ISBN'] = df.iloc[i]['ISBN']
        books.append(book)
    return json.dumps(books)


def get_books_by_author(author):
    '''Takes an authors name string input and returns json object of the top 10 highest rated books 
    by that author'''
    
    #limited to top 10- can return all books by author if we want (or fewer)
    books_df = df[df.author == author].sort_values('avg_rating', ascending=False).reset_index().head(10)
    
    #if no books by that author are found, returns error message (string)
    if len(books_df) == 0:
        return 'Author not found in database- check for correct spelling'
    
    #instantiate empty list to store the python dicts of each book
    books = []
    
    #iterate thru the authors books and populate the book list with dictionaries for each book
    for i in range(len(books_df)):
        book = {}
        book['title'] = books_df.iloc[i]['book_title']
        book['author'] = books_df.iloc[i]['author']
        book['avg_rating'] = books_df.iloc[i]['avg_rating']
        book['ISBN'] = books_df.iloc[i]['ISBN']
        books.append(book)
    
    #return the list of dictionaries (books) as json object:
    return json.dumps(books)

#user clicks on a book by author, that should send us the isbn and we will get the book 
#description from df and return recommendations based on that

def get_recs_from_isbn(ISBN):
    #get the book description from df:
    description = df[df['ISBN']== ISBN].description
    
    #pass in the description to the get recommendations function and set from_isbn=True
    return get_recs_from_desc(description, from_isbn=True)


In [28]:
def get_recs_from_desc(input_string, from_isbn=False):
    '''Takes a book description, converts to a spacy doc object and 
    calculates the similarity score for all other books in the dataframe 
    (variable called df), sorts and returns the top 10 as a json object
    containing title, author, avg rating and ISBN'''
    
    #convert input string of hypthetical book description into spacy doc object
    test_doc = nlp(input_string)
    
    #instantiate empty list of similarity scores:
    sims = []
    
    #iterate over the doc object for each book in the df to get the similarity score and append to list
    for doc in df.docs:
        sim = test_doc.similarity(doc)
        sims.append(sim)
    
    #sort the list and grab the top 10:
    if from_isbn:
        #skip the 0th ranked book which will be the bookused to get the input_string of the description:
        top10 = pd.Series(sims).sort_values(ascending=False).iloc[1:11]
    else:
        top10 = pd.Series(sims).sort_values(ascending=False).iloc[:10]
    
    #instantiate empty list to store the python dicts of each book
    books = []
    
    #iterate thru the top 10 ranked simlilar books and populate the book list w/ dictionaries for each book
    for i in top10.index:
        book = {}
        book['title'] = df.iloc[i]['book_title']
        book['author'] = df.iloc[i]['author']
        book['avg_rating'] = df.iloc[i]['avg_rating']
        book['ISBN'] = df.iloc[i]['ISBN']
        books.append(book)
    return books

In [29]:
example = 'thrilling spy novel set in post war Russia'

In [30]:
%time
get_recs_from_desc(example)

CPU times: user 30 µs, sys: 0 ns, total: 30 µs
Wall time: 34.1 µs


AttributeError: 'DataFrame' object has no attribute 'docs'

In [16]:
%time
example2 = 'dog and cat lover gets into trouble'
output = get_recs_from_desc(example2)

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 6.91 µs


In [17]:
json.dumps(output)

TypeError: Object of type int64 is not JSON serializable

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19900 entries, 0 to 19899
Data columns (total 7 columns):
book_title     19900 non-null object
author         19900 non-null object
avg_rating     19900 non-null float64
ISBN           19900 non-null int64
genre          19900 non-null object
description    19900 non-null object
docs           19900 non-null object
dtypes: float64(1), int64(1), object(5)
memory usage: 1.1+ MB


In [20]:
df['avg_rating'] = int(df['avg_rating'].iloc[0])

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19900 entries, 0 to 19899
Data columns (total 7 columns):
book_title     19900 non-null object
author         19900 non-null object
avg_rating     19900 non-null int64
ISBN           19900 non-null int64
genre          19900 non-null object
description    19900 non-null object
docs           19900 non-null object
dtypes: int64(2), object(5)
memory usage: 1.1+ MB
