In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics.pairwise import cosine_distances, cosine_similarity

%matplotlib inline
%config InlineBackend.feature_format = 'retina'

### Load the DataFrame that I made, which includes data about all the books. The text of the books has been removed now that the data about the words, syllables, reading levels, and subjects has been extracted. I also make the subjects easier to parse, and load the tf-idf vectorized books DataFrame and the count vectorized books DataFrame.

In [10]:
# Read the per-book metadata file as UTF-8; its 'Unnamed: 0' column becomes the index.
meta_stats = pd.read_csv('meta_stats_all_notext', encoding = 'utf8', index_col = 'Unnamed: 0')

In [11]:
def _split_subjects(raw):
    """Remove every '[' and ']' from ``raw`` and split what is left on commas."""
    without_brackets = raw.replace('[', '').replace(']','')
    return without_brackets.split(',')

# Turn each stored subjects string into a list of (still untrimmed) subject strings.
meta_stats['subjects2'] = meta_stats['subjects2'].map(_split_subjects)

def clean_sub2(sublist):
    """Return a new list holding each entry of ``sublist`` with surrounding whitespace stripped."""
    return [entry.strip() for entry in sublist]

# Strip surrounding whitespace from every subject in each book's subject list.
meta_stats['subjects2'] = meta_stats['subjects2'].map(clean_sub2)

In [12]:
def unique_words(subjects):
    """Return the distinct single-space-separated tokens found across ``subjects``.

    Every string in ``subjects`` is split on ' ' and the pooled tokens are
    passed through ``np.unique``, so the result is a NumPy array.
    """
    tokens = [word for phrase in subjects for word in phrase.split(' ')]
    return np.unique(tokens)

# subjects3 holds, per book, the unique words that appear across its subject strings.
meta_stats['subjects3'] = meta_stats['subjects2'].map(unique_words)

In [13]:
# Read the tf-idf vectorized books table as UTF-8; its 'Unnamed: 0' column becomes the index.
tvec_all = pd.read_csv('tvec_all', encoding = 'utf8', index_col = 'Unnamed: 0')

In [14]:
# Read the count vectorized books table as UTF-8; its 'Unnamed: 0' column becomes the index.
cvec_all = pd.read_csv('cvec_all', encoding = 'utf8', index_col = 'Unnamed: 0')

In [15]:
def jaccard(list1, list2):
    """Return the Jaccard similarity of two collections, rounded to 5 decimals.

    Both arguments are converted to sets, so duplicates and ordering are
    ignored. The result is ``|A & B| / |A | B|``.

    Parameters
    ----------
    list1, list2 : iterables of hashable items

    Returns
    -------
    float
        A value between 0.0 and 1.0. When both inputs are empty the union is
        empty and the ratio is undefined; 0.0 is returned in that case so that
        two items with no information are not reported as a perfect match.
    """
    a = set(list1)
    b = set(list2)
    union = a | b

    # Guard against 0 / 0 when neither input has any elements.
    if not union:
        return 0.0

    return round(len(a & b) / float(len(union)), 5)

### The user selects a book, and the DataFrame is narrowed down to only the books whose reading level falls within an appropriate range around that book's reading level.

In [25]:
def _distances_to_chosen(subset_bids, vectors, chosen_id, column):
    """Cosine distance from the chosen book to every book of the subset.

    Parameters
    ----------
    subset_bids : DataFrame
        Frame with a 'book_id' column listing the books to compare against.
    vectors : DataFrame
        Frame with a 'book_id' column; every other column is passed to
        ``cosine_distances`` as a feature.
    chosen_id
        The book_id of the chosen book.
    column : str
        Name of the distance column in the returned frame.

    Returns
    -------
    DataFrame
        Columns 'book_id' and ``column``, one row per subset book that has a
        row in ``vectors``.

    Raises
    ------
    KeyError
        If the chosen book has no row in ``vectors`` within the subset.
    """
    subset_vectors = subset_bids.merge(vectors, how='inner', on='book_id').reset_index(drop=True)
    ids = subset_vectors['book_id']
    features = subset_vectors.drop('book_id', axis=1)

    chosen_rows = features.loc[(ids == chosen_id).values]
    if chosen_rows.empty:
        raise KeyError('No vector found for book_id %s' % chosen_id)

    # Only distances to the chosen book are needed, so compare every row with
    # that single row instead of building the full pairwise matrix.
    distances = cosine_distances(features, chosen_rows.iloc[[0]])[:, 0]
    return pd.DataFrame({'book_id': ids.values, column: distances})


def recommend_books(title, grade_below=1.00, grade_above=0.25, n_recs=10):
    """Recommend books similar to ``title`` at a comparable reading level.

    Uses the module-level ``meta_stats``, ``tvec_all`` and ``cvec_all`` frames
    and the ``jaccard`` helper. Candidates are the books whose
    'flesch_kincaid_grade' lies between the chosen book's grade minus
    ``grade_below`` and plus ``grade_above`` (inclusive). They are ranked three
    ways: by Jaccard similarity of 'subjects3', by cosine distance of the
    tf-idf vectors, and by cosine distance of the count vectors. Books whose
    title equals ``title`` are excluded from the results.

    Parameters
    ----------
    title : str
        Exact title to look up in ``meta_stats['title']``. If several rows
        match, the first one is used.
    grade_below, grade_above : float, optional
        Width of the reading-level window below / above the chosen book.
    n_recs : int, optional
        Maximum number of recommendations per ranking.

    Returns
    -------
    DataFrame or None
        One column per ranking with entries formatted '<title> by <author>',
        or None (after printing a message) when ``title`` is not found.

    Raises
    ------
    KeyError
        If the chosen book is missing from ``tvec_all`` or ``cvec_all``.
    """
    chosen_book = meta_stats[meta_stats['title'] == title]
    if chosen_book.empty:
        print('This book is not in the dataset. Try another book!')
        return None

    chosen_id = chosen_book['book_id'].values[0]
    print('Looking up books that are like %s' % title)

    # Determining reading level of the recommended books.
    # Work with the scalar grade, not the one-row Series, so it prints cleanly.
    chosen_grade = chosen_book['flesch_kincaid_grade'].values[0]
    flesch_k_max = chosen_grade + grade_above
    flesch_k_min = chosen_grade - grade_below
    print('%s has a reading level of %s' % (title, str(round(chosen_grade, 2))))
    print("The reading level of your recommended books will be between " + str(round(flesch_k_min, 2)) + " and " + str(round(flesch_k_max, 2)))

    in_range = meta_stats['flesch_kincaid_grade'].between(flesch_k_min, flesch_k_max)
    flesch_k_subset = meta_stats[in_range].reset_index(drop=True)
    print("There are " + str(flesch_k_subset.shape[0]) + " books that are within your reading level range.")

    subset_bids = flesch_k_subset[['book_id']]

    # Add one distance column per vectorizer (tf-idf, then count vectorizer).
    for vectors, column in ((tvec_all, 'tfidf_distances'), (cvec_all, 'cvec_distances')):
        chosen_distances = _distances_to_chosen(subset_bids, vectors, chosen_id, column)
        flesch_k_subset = flesch_k_subset.merge(chosen_distances, how='inner', on='book_id')

    # Subject similarity between every candidate and the chosen book.
    chosen_subjects = chosen_book['subjects3'].values[0]
    flesch_k_subset['subject_jaccard'] = [
        jaccard(subjects, chosen_subjects) for subjects in flesch_k_subset['subjects3']
    ]

    candidates = flesch_k_subset[flesch_k_subset['title'] != title]

    def top_recs(column, ascending):
        # Sort once and build the '<title> by <author>' labels from the same rows.
        top = candidates.sort_values(column, ascending=ascending).head(n_recs)
        return top['title'].values + ' by ' + top['author'].values

    # Higher Jaccard similarity is better; smaller cosine distance is better.
    recommended_books = pd.DataFrame({
        'Subject Based Recommendation' : top_recs('subject_jaccard', False),
        'tf-idf Based Recommendation' : top_recs('tfidf_distances', True),
        'Count Vectorizer Based Recommendation' : top_recs('cvec_distances', True)
    })

    return recommended_books

In [28]:
# Example lookup; the returned DataFrame of recommendations is displayed as the cell output.
recommend_books("Metamorphosis")

Looking up books that are like Metamorphosis
Metamorphosis has a reading level of 12.26
The reading level of your recommended books will be between 11.26 and 12.51
There are 1162 books that are within your reading level range.


Unnamed: 0,Count Vectorizer Based Recommendation,Subject Based Recommendation,tf-idf Based Recommendation
0,"Nightmare Tales by Blavatsky, H. P. (Helena Pe...",The Private Memoirs and Confessions of a Justi...,Lippincott's Magazine of Popular Literature an...
1,Old Gorgon Graham: More Letters from a Self-Ma...,Maggie Miller: The Story of Old Hagar's Secret...,Lippincott's Magazine of Popular Literature an...
2,"A Girl Among the Anarchists by Meredith, Isabel","The House on the Moor, v. 2/3 by Oliphant, Mrs...","Chantry House by Yonge, Charlotte M. (Charlott..."
3,A Picture-book of Merry Tales by Anonymous,"Mabel: A Novel. Vol. 2 (of 3) by Newby, C. J.,...","At the Mercy of Tiberius by Evans, Augusta J. ..."
4,"The Altar Fire by Benson, Arthur Christopher","Falkland, Complete by Lytton, Edward Bulwer Ly...",The Bed-Book of Happiness: Being a colligation...
5,"Spiritual Adventures by Symons, Arthur","Falkland, Book 4. by Lytton, Edward Bulwer Lyt...",The Historical Romances of Georg Ebers by Eber...
6,"Mark Rutherford's Deliverance by White, Willia...","Albrecht by Bates, Arlo","The Tenant of Wildfell Hall by Brontë, Anne"
7,The Complete Short Works of Georg Ebers by Ebe...,"Poems and Tales from Romania by Sumanaru, Simona",Lippincott's Magazine of Popular Literature an...
8,"The Serapion Brethren, Vol. II by Hoffmann, E....","A Woman's Love by Opie, Amelia",The Complete Short Works of Georg Ebers by Ebe...
9,The Bed-Book of Happiness: Being a colligation...,"The Bride of Dreams by Eeden, Frederik van","How to Fail in Literature: A Lecture by Lang, ..."
