# ReadIt (Book Recommendation App)

## Data Processing

In [1]:
import pandas as pd
import numpy as np
import re

In [214]:
# Read the raw Goodreads export (comma-separated, UTF-8) and preview the first rows
df = pd.read_csv(
    "data/goodreads_data.csv",
    sep=",",
    encoding="UTF-8",
)
df.head()

Unnamed: 0.1,Unnamed: 0,Book,Author,Description,Genres,Avg_Rating,Num_Ratings,URL
0,0,To Kill a Mockingbird,Harper Lee,The unforgettable novel of a childhood in a sl...,"['Classics', 'Fiction', 'Historical Fiction', ...",4.27,5691311,https://www.goodreads.com/book/show/2657.To_Ki...
1,1,Harry Potter and the Philosopher’s Stone (Harr...,J.K. Rowling,Harry Potter thinks he is an ordinary boy - un...,"['Fantasy', 'Fiction', 'Young Adult', 'Magic',...",4.47,9278135,https://www.goodreads.com/book/show/72193.Harr...
2,2,Pride and Prejudice,Jane Austen,"Since its immediate success in 1813, Pride and...","['Classics', 'Fiction', 'Romance', 'Historical...",4.28,3944155,https://www.goodreads.com/book/show/1885.Pride...
3,3,The Diary of a Young Girl,Anne Frank,Discovered in the attic in which she spent the...,"['Classics', 'Nonfiction', 'History', 'Biograp...",4.18,3488438,https://www.goodreads.com/book/show/48855.The_...
4,4,Animal Farm,George Orwell,Librarian's note: There is an Alternate Cover ...,"['Classics', 'Fiction', 'Dystopia', 'Fantasy',...",3.98,3575172,https://www.goodreads.com/book/show/170448.Ani...


In [215]:
# Keep only the columns used downstream; the two leftover "Unnamed" index
# columns and the URL column are not selected.
wanted = [
    'Book',
    'Author',
    'Description',
    'Genres',
    'Avg_Rating',
    'Num_Ratings',
]
df_edit = df.loc[:, wanted]
df_edit.head()


Unnamed: 0,Book,Author,Description,Genres,Avg_Rating,Num_Ratings
0,To Kill a Mockingbird,Harper Lee,The unforgettable novel of a childhood in a sl...,"['Classics', 'Fiction', 'Historical Fiction', ...",4.27,5691311
1,Harry Potter and the Philosopher’s Stone (Harr...,J.K. Rowling,Harry Potter thinks he is an ordinary boy - un...,"['Fantasy', 'Fiction', 'Young Adult', 'Magic',...",4.47,9278135
2,Pride and Prejudice,Jane Austen,"Since its immediate success in 1813, Pride and...","['Classics', 'Fiction', 'Romance', 'Historical...",4.28,3944155
3,The Diary of a Young Girl,Anne Frank,Discovered in the attic in which she spent the...,"['Classics', 'Nonfiction', 'History', 'Biograp...",4.18,3488438
4,Animal Farm,George Orwell,Librarian's note: There is an Alternate Cover ...,"['Classics', 'Fiction', 'Dystopia', 'Fantasy',...",3.98,3575172


In [216]:
# Write the trimmed table to a ';'-separated CSV (without the index), then
# read that file back in as the working frame `df`.
df_edit.to_csv('data/books_edit.csv', index=False, sep=';', encoding='UTF-8')
df = pd.read_csv('data/books_edit.csv', encoding='UTF-8', sep=';')
df.head()

Unnamed: 0,Book,Author,Description,Genres,Avg_Rating,Num_Ratings
0,To Kill a Mockingbird,Harper Lee,The unforgettable novel of a childhood in a sl...,"['Classics', 'Fiction', 'Historical Fiction', ...",4.27,5691311
1,Harry Potter and the Philosopher’s Stone (Harr...,J.K. Rowling,Harry Potter thinks he is an ordinary boy - un...,"['Fantasy', 'Fiction', 'Young Adult', 'Magic',...",4.47,9278135
2,Pride and Prejudice,Jane Austen,"Since its immediate success in 1813, Pride and...","['Classics', 'Fiction', 'Romance', 'Historical...",4.28,3944155
3,The Diary of a Young Girl,Anne Frank,Discovered in the attic in which she spent the...,"['Classics', 'Nonfiction', 'History', 'Biograp...",4.18,3488438
4,Animal Farm,George Orwell,Librarian's note: There is an Alternate Cover ...,"['Classics', 'Fiction', 'Dystopia', 'Fantasy',...",3.98,3575172


In [217]:
# Inspect the column dtypes of the frame that was just read back from CSV
df.dtypes

Book            object
Author          object
Description     object
Genres          object
Avg_Rating     float64
Num_Ratings     object
dtype: object

In [218]:
# 'Num_Ratings' is read as text with ',' thousands separators: remove the
# commas (literal match, not a regex) and cast the result to 64-bit integers.
df['Num_Ratings'] = (
    df['Num_Ratings']
    .str.replace(',', '', regex=False)
    .astype(np.int64)
)

In [219]:
# Let pandas pick its nullable extension dtypes for every column, then show the result
df = df.convert_dtypes()
df.dtypes

Book           string[python]
Author         string[python]
Description    string[python]
Genres         string[python]
Avg_Rating            Float64
Num_Ratings             Int64
dtype: object

In [220]:
# Give the columns their display names. The names are applied by position,
# so this list must stay in the same order as the current columns.
df = df.set_axis(
    ['Title', 'Author', 'Description', 'Genres', 'Avg Rating', 'Num of Ratings'],
    axis='columns',
)
df.head()

Unnamed: 0,Title,Author,Description,Genres,Avg Rating,Num of Ratings
0,To Kill a Mockingbird,Harper Lee,The unforgettable novel of a childhood in a sl...,"['Classics', 'Fiction', 'Historical Fiction', ...",4.27,5691311
1,Harry Potter and the Philosopher’s Stone (Harr...,J.K. Rowling,Harry Potter thinks he is an ordinary boy - un...,"['Fantasy', 'Fiction', 'Young Adult', 'Magic',...",4.47,9278135
2,Pride and Prejudice,Jane Austen,"Since its immediate success in 1813, Pride and...","['Classics', 'Fiction', 'Romance', 'Historical...",4.28,3944155
3,The Diary of a Young Girl,Anne Frank,Discovered in the attic in which she spent the...,"['Classics', 'Nonfiction', 'History', 'Biograp...",4.18,3488438
4,Animal Farm,George Orwell,Librarian's note: There is an Alternate Cover ...,"['Classics', 'Fiction', 'Dystopia', 'Fantasy',...",3.98,3575172


In [221]:
# cleaning genres
# 'Genres' holds a stringified list such as "['Classics', 'Fiction', ...]".
# Split it on commas into at most six pieces: the first five become the genre
# columns, the sixth ('Genre6') collects everything that is left and is dropped.
df[['Genre1', 'Genre2', 'Genre3', 'Genre4', 'Genre5', 'Genre6']] = df['Genres'].str.split(',', n=5, expand=True)

unwanted = ['Genres', 'Genre6']
df = df.drop(columns=unwanted)

patt = r"[a-zA-Z]+"
gen_cols = ['Genre1', 'Genre2', 'Genre3', 'Genre4', 'Genre5']


def _clean_genre(raw):
    """Return the alphabetic words of one split piece joined by single spaces.

    A missing piece (the book lists fewer genres than there are columns)
    becomes an empty string. Passing it through str() instead would produce
    literal text such as '<NA>' or 'None', which the regex would then keep
    as a bogus genre name.
    """
    if pd.isna(raw):
        return ""
    return " ".join(re.findall(patt, str(raw)))


for g in gen_cols:
    df[g] = df[g].apply(_clean_genre)


# drop NULLs & reset index
# (the genre columns are never null at this point, so only rows with a missing
#  value in one of the other columns are removed)
df = df.dropna().reset_index(drop=True)

df.head()

Unnamed: 0,Title,Author,Description,Avg Rating,Num of Ratings,Genre1,Genre2,Genre3,Genre4,Genre5
0,To Kill a Mockingbird,Harper Lee,The unforgettable novel of a childhood in a sl...,4.27,5691311,Classics,Fiction,Historical Fiction,School,Literature
1,Harry Potter and the Philosopher’s Stone (Harr...,J.K. Rowling,Harry Potter thinks he is an ordinary boy - un...,4.47,9278135,Fantasy,Fiction,Young Adult,Magic,Childrens
2,Pride and Prejudice,Jane Austen,"Since its immediate success in 1813, Pride and...",4.28,3944155,Classics,Fiction,Romance,Historical Fiction,Literature
3,The Diary of a Young Girl,Anne Frank,Discovered in the attic in which she spent the...,4.18,3488438,Classics,Nonfiction,History,Biography,Memoir
4,Animal Farm,George Orwell,Librarian's note: There is an Alternate Cover ...,3.98,3575172,Classics,Fiction,Dystopia,Fantasy,Politics


In [222]:
def move_columns_behind(df, cols_to_move, behind_col):
    """Return `df` with `cols_to_move` placed directly after `behind_col`.

    The moved columns keep the order given in `cols_to_move`; all other
    columns keep their relative order. The input frame is not modified.
    Raises ValueError if a column to move, or `behind_col` once those are
    taken out, is not among the frame's columns.
    """
    ordered = df.columns.tolist()
    # Take the moving columns out first so the anchor position is computed
    # against the remaining columns only.
    for name in cols_to_move:
        ordered.remove(name)
    anchor = ordered.index(behind_col) + 1
    # Splice the whole group in right after the anchor column.
    ordered[anchor:anchor] = cols_to_move
    return df[ordered]

# Place the five genre columns directly after 'Description'
df = move_columns_behind(
    df,
    cols_to_move=[f'Genre{i}' for i in range(1, 6)],
    behind_col='Description',
)
df.head()

Unnamed: 0,Title,Author,Description,Genre1,Genre2,Genre3,Genre4,Genre5,Avg Rating,Num of Ratings
0,To Kill a Mockingbird,Harper Lee,The unforgettable novel of a childhood in a sl...,Classics,Fiction,Historical Fiction,School,Literature,4.27,5691311
1,Harry Potter and the Philosopher’s Stone (Harr...,J.K. Rowling,Harry Potter thinks he is an ordinary boy - un...,Fantasy,Fiction,Young Adult,Magic,Childrens,4.47,9278135
2,Pride and Prejudice,Jane Austen,"Since its immediate success in 1813, Pride and...",Classics,Fiction,Romance,Historical Fiction,Literature,4.28,3944155
3,The Diary of a Young Girl,Anne Frank,Discovered in the attic in which she spent the...,Classics,Nonfiction,History,Biography,Memoir,4.18,3488438
4,Animal Farm,George Orwell,Librarian's note: There is an Alternate Cover ...,Classics,Fiction,Dystopia,Fantasy,Politics,3.98,3575172


In [223]:
# removing duplicate items

# For every (Title, Author) pair keep the entry with the most ratings.
# 1. Sort by number of ratings, highest first. The sort is stable, so rows
#    with equal counts stay in their current relative order.
# 2. Drop duplicates while that ranking is still in effect, keeping the first
#    (best) row of each pair.
# 3. Only then sort by index to restore the original row order. Sorting by
#    index before dropping would cancel step 1 and keep the lowest-index row.
df = (
    df.sort_values(by='Num of Ratings', ascending=False, kind='stable')
      .drop_duplicates(subset=['Title', 'Author'], keep='first')
      .sort_index()
)

## Weighted Rating Column

In [224]:
# clean the data: a missing average rating becomes 0.0, a missing rating
# count becomes 0 and the counts are cast to plain integers
df = df.assign(**{
    'Avg Rating': df['Avg Rating'].fillna(0.0),
    'Num of Ratings': df['Num of Ratings'].fillna(0).astype(int),
})

In [225]:
# Add a zero-filled float column named "Weighted Rating" immediately to the
# right of "Num of Ratings"; its values are computed in the next cell.
df.insert(
    loc=df.columns.get_loc("Num of Ratings") + 1,
    column="Weighted Rating",
    value=np.zeros(len(df)),
)

In [226]:
ratings = df['Avg Rating']
votes = df['Num of Ratings']

# C: average rating over the whole table, weighted by each book's number of ratings
C = ratings.mul(votes).sum() / votes.sum()

# m: ratings-count threshold, taken as the 85th percentile of the counts
m = votes.quantile(0.85)

# Blend each book's own average with C: the share given to the book's own
# rating is votes / (votes + m), the remainder m / (votes + m) goes to C.
total = votes + m
df['Weighted Rating'] = (votes / total) * ratings + (m / total) * C

In [227]:
# Display the 20 rows with the highest weighted rating (df itself is not reassigned)
df.sort_values(by="Weighted Rating" , ascending=False).head(20)

Unnamed: 0,Title,Author,Description,Genre1,Genre2,Genre3,Genre4,Genre5,Avg Rating,Num of Ratings,Weighted Rating
15,Harry Potter and the Deathly Hallows (Harry Po...,J.K. Rowling,"Harry has been burdened with a dark, dangerous...",Fantasy,Young Adult,Fiction,Magic,Childrens,4.62,3468276,4.602601
2602,A Court of Mist and Fury (A Court of Thorns an...,Sarah J. Maas,Feyre survived Amarantha's clutches to return ...,Fantasy,Romance,Young Adult,New Adult,Fiction,4.63,1080733,4.576948
4861,"Words of Radiance (The Stormlight Archive, #2)",Brandon Sanderson,"Words of Radiance, Book Two of the Stormlight ...",Fantasy,Fiction,Epic Fantasy,High Fantasy,Audiobook,4.76,299887,4.5711
26,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling,"Harry Potter, along with his best friends, Ron...",Fantasy,Fiction,Young Adult,Magic,Childrens,4.58,3808160,4.565303
36,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling,"It is the middle of the summer, but there is a...",Fantasy,Young Adult,Fiction,Magic,Childrens,4.58,3048651,4.561777
670,The Nightingale,Kristin Hannah,In love we find out who we want to be.In war w...,Historical Fiction,Fiction,Historical,World War II,War,4.6,1113296,4.551212
33,Harry Potter and the Goblet of Fire (Harry Pot...,J.K. Rowling,It is the summer holidays and soon Harry Potte...,Fantasy,Young Adult,Fiction,Magic,Childrens,4.56,3397378,4.544253
720,"The Way of Kings (The Stormlight Archive, #1)",Brandon Sanderson,From #1 New York Times bestselling author Bran...,Fantasy,Fiction,Epic Fantasy,High Fantasy,Audiobook,4.65,427106,4.528945
2925,"Crooked Kingdom (Six of Crows, #2)",Leigh Bardugo,Welcome to the world of the Grisha.Kaz Brekker...,Fantasy,Young Adult,Fiction,Romance,Young Adult Fantasy,4.61,540985,4.517015
135,"The Return of the King (The Lord of the Rings,...",J.R.R. Tolkien,In the third volume of The Lord of the Rings t...,Fantasy,Fiction,Classics,Adventure,High Fantasy,4.56,830217,4.501589


In [228]:
def has_garbage(text):
    """Return True if str(text) contains a control character.

    A character counts as a control character when its code point lies in
    0x00-0x1F or 0x7F-0x9F. Non-string input is converted with str() first.
    """
    for ch in str(text):
        code = ord(ch)
        if code <= 0x1F or 0x7F <= code <= 0x9F:
            return True
    return False

# Filter out rows with control characters in the 'Title' column, then renumber the index
garbage_mask = df['Title'].apply(has_garbage)
df = df[~garbage_mask].reset_index(drop=True)

In [229]:
# Save the processed table to disk as a pickle file
df.to_pickle('data/books.pkl')

In [2]:
import pandas as pd

# Load the processed table from data/books.pkl and preview the first ten rows.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle files you trust.
df = pd.read_pickle("data/books.pkl")
df.head(10)

Unnamed: 0,Title,Author,Description,Genre1,Genre2,Genre3,Genre4,Genre5,Avg Rating,Num of Ratings,Weighted Rating
0,To Kill a Mockingbird,Harper Lee,The unforgettable novel of a childhood in a sl...,Classics,Fiction,Historical Fiction,School,Literature,4.27,5691311,4.266329
1,Harry Potter and the Philosopher’s Stone (Harr...,J.K. Rowling,Harry Potter thinks he is an ordinary boy - un...,Fantasy,Fiction,Young Adult,Magic,Childrens,4.47,9278135,4.465233
2,Pride and Prejudice,Jane Austen,"Since its immediate success in 1813, Pride and...",Classics,Fiction,Romance,Historical Fiction,Literature,4.28,3944155,4.274461
3,The Diary of a Young Girl,Anne Frank,Discovered in the attic in which she spent the...,Classics,Nonfiction,History,Biography,Memoir,4.18,3488438,4.177014
4,Animal Farm,George Orwell,Librarian's note: There is an Alternate Cover ...,Classics,Fiction,Dystopia,Fantasy,Politics,3.98,3575172,3.983439
5,The Little Prince,Antoine de Saint-Exupéry,A pilot stranded in the desert awakes one morn...,Classics,Fiction,Fantasy,Childrens,France,4.32,1924063,4.30668
6,1984,George Orwell,The new novel by George Orwell is the major wo...,Classics,Fiction,Science Fiction,Dystopia,Literature,4.19,4201429,4.187235
7,The Great Gatsby,F. Scott Fitzgerald,Alternate Cover Edition ISBN: 0743273567 (ISBN...,Classics,Fiction,School,Historical Fiction,Literature,3.93,4839642,3.933745
8,The Catcher in the Rye,J.D. Salinger,It's Christmas time and Holden Caulfield has j...,Classics,Fiction,Young Adult,Literature,School,3.81,3315881,3.819507
9,The Lord of the Rings,J.R.R. Tolkien,"One Ring to rule them all, One Ring to find th...",Fantasy,Classics,Fiction,Adventure,Science Fiction Fantasy,4.52,644766,4.453532


In [231]:
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

# Function for removing NonAscii characters
def _removeNonAscii(s):
    return "".join(i for i in s if  ord(i)<128)

# Built once when this cell runs, so applying cleanString to every row does
# not rebuild the stopword set and the tokenizer on each call.
_ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
_WORD_TOKENIZER = RegexpTokenizer(r'\w+')


def cleanString(text):
    """Lowercase `text`, drop English stopwords, and strip punctuation.

    Steps, in order:
      1. lowercase and split on whitespace;
      2. drop every piece that is exactly an NLTK English stopword
         (a stopword with punctuation attached, e.g. "the,", is kept);
      3. re-tokenize with the pattern \\w+, which discards punctuation.

    Returns the remaining tokens joined by single spaces.
    """
    words = [w for w in text.lower().split() if w not in _ENGLISH_STOPWORDS]
    tokens = _WORD_TOKENIZER.tokenize(" ".join(words))
    return " ".join(tokens)

# Function for removing the html tags
def remove_html(text):
    """Return `text` with every '<...>' span deleted.

    Each span is the shortest run from a '<' to the next '>' on the same
    line; '.' does not match a newline, so a span broken across lines is
    left untouched.
    """
    return re.sub(r'<.*?>', '', text)

# Applying all the functions in description and storing as a cleaned_desc.
# Order matters: remove_html must run before cleanString. cleanString strips
# the '<' and '>' characters, so running remove_html after it could never
# match a tag and the tag names would stay in the text as ordinary words.
df['cleaned_Desc'] = (
    df['Description']
    .apply(_removeNonAscii)
    .apply(remove_html)
    .apply(cleanString)
)

# view
df.sample(5)

Unnamed: 0,Title,Author,Description,Genre1,Genre2,Genre3,Genre4,Genre5,Avg Rating,Num of Ratings,Weighted Rating,cleaned_Desc
4929,The River Why,David James Duncan,"Flyfishing genius Gus Orviston, seeking refuge...",Fiction,Nature,Philosophy,Novels,Literature,4.23,11155,4.100538,flyfishing genius gus orviston seeking refuge ...
2909,"Illuminae (The Illuminae Files, #1)",Amie Kaufman,"This morning, Kady thought breaking up with Ez...",Science Fiction,Young Adult,Romance,Fantasy,Fiction,4.25,145814,4.177874,morning kady thought breaking ezra hardest thi...
5526,Runaway: Stories,Alice Munro,The incomparable Alice Munro’s bestselling and...,Short Stories,Fiction,Canada,Literary Fiction,Contemporary,3.98,23600,4.070102,incomparable alice munros bestselling rapturou...
9130,"Captive in the Dark (The Dark Duet, #1)",C.J. Roberts,Caleb is a man with a singular interest in rev...,Dark,Romance,Erotica,BDSM,Adult,3.95,70891,4.036162,caleb man singular interest revenge kidnapped ...
8277,"Guardians of the West (The Malloreon, #1)",David Eddings,Garion has slain the evil God Torak and been c...,Fantasy,Fiction,Epic Fantasy,Science Fiction Fantasy,High Fantasy,4.1,52623,4.091873,garion slain evil god torak crowned king riva ...


In [3]:
# Spot-check five random rows; no random_state is passed, so the rows differ between runs
df.sample(5)

Unnamed: 0,Title,Author,Description,Genre1,Genre2,Genre3,Genre4,Genre5,Avg Rating,Num of Ratings,Weighted Rating
4542,Warping Minds & Other Misdemeanors (The Guild ...,Annette Marie,"My name is Kit Morris, and welcome to my warpe...",Urban Fantasy,Fantasy,Paranormal,Magic,Romance,4.25,5311,4.095234
1519,Escape from Camp 14: One Man's Remarkable Odys...,Blaine Harden,"A New York Times bestseller, the shocking stor...",Nonfiction,Biography,History,Memoir,Politics,4.0,65543,4.056604
2451,The DUFF: Designated Ugly Fat Friend (Hamilton...,Kody Keplinger,Seventeen-year-old Bianca Piper is cynical and...,Young Adult,Romance,Contemporary,Fiction,Chick Lit,3.8,191408,3.909528
8714,Without Alice,D.J. Kirkby,Have you ever had a secret? One so important t...,,,,,,3.45,102,4.087674
27,Wuthering Heights,Emily Brontë,You can find the redesigned cover of this edit...,Classics,Fiction,Romance,Gothic,Historical Fiction,3.88,1684275,3.893559


## Recommendation Algorithm

- Gather a list of all the books that match with the ``genre`` the user inputted
- See if the inputted ``title`` matches with any of the books within the list
    - If not contained within the subset, expand the search to the larger dataset
    - If the book is found, take into account all of its genres
- Find all books that share at least one genre with the input genre (and with the matched book’s genres, if the search had to be expanded)
- Rank the books based on how similar their descriptions are
- If one of the books in the ranking is the input ``title``, remove it from the list

In [4]:
import numpy as np

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def _exact_title_position(frame, query):
    """Return the position of the first row whose lowercased 'Title' equals `query`, or None."""
    hits = frame['Title'].str.lower().eq(query).fillna(False).to_numpy(dtype=bool)
    positions = np.flatnonzero(hits)
    return int(positions[0]) if positions.size else None


def _closest_title_position(frame, query, threshold):
    """Return the position of the row whose title is most similar to `query`, or None.

    Similarity is the cosine between TF-IDF vectors built from word bigrams
    of the titles (English stop words removed). None is returned when `frame`
    is empty, when no bigram vocabulary can be built from its titles, or when
    the best score is not strictly above `threshold`.
    """
    if frame.empty:
        return None
    vectorizer = TfidfVectorizer(ngram_range=(2, 2), stop_words='english')
    try:
        title_matrix = vectorizer.fit_transform(frame['Title'])
    except ValueError:
        # scikit-learn raises ValueError when the vocabulary is empty,
        # e.g. when every title is a single word.
        return None
    scores = cosine_similarity(vectorizer.transform([query]), title_matrix).flatten()
    best = int(np.argmax(scores))
    return best if scores[best] > threshold else None


def recommend(title="", genre=""):
    """Print book recommendations based on a title and/or a genre.

    Reads the module-level DataFrame `df` (columns 'Title', 'Description',
    'Genre1'..'Genre5', 'Weighted Rating'). Nothing is returned; results and
    messages are printed.

    Parameters
    ----------
    title : str
        Title the user has read. Matching is case-insensitive and ignores
        surrounding whitespace. If empty, the top books by 'Weighted Rating'
        are printed instead.
    genre : str
        Optional genre used to narrow the candidate pool (case-insensitive
        exact match against any of the five genre columns).

    Behaviour
    ---------
    1. Look for the title inside the genre-filtered books: exact match first,
       then the closest title by bigram TF-IDF similarity.
    2. If that fails, repeat the search on the full table and rebuild the
       candidate pool from every book sharing a genre with the matched book.
    3. Rank the pool by cosine similarity of trigram TF-IDF vectors of the
       descriptions and print the best matches, excluding the book itself.
    """
    # constants
    GENRE_SIMILARITY_THRESHOLD = 0.2
    FULL_SIMILARITY_THRESHOLD = 0.1
    MAX_RECOMMENDATIONS = 5
    GENRE_COLS = [f'Genre{i}' for i in range(1, 6)]

    title = title.strip().lower()
    genre = genre.strip().lower()

    # Work on a positionally indexed copy so row positions and index labels agree.
    books = df.reset_index(drop=True)

    # Filter by genre
    if genre:
        in_genre = np.zeros(len(books), dtype=bool)
        for col in GENRE_COLS:
            in_genre |= books[col].str.lower().eq(genre).fillna(False).to_numpy(dtype=bool)
        temp = books[in_genre].reset_index(drop=True)
    else:
        temp = books

    # Recommend the top books by weighted rating if title is empty
    if not title:
        if temp.empty:
            print(f"No books found in genre '{genre}'.")
            return
        recs = temp.sort_values(by='Weighted Rating', ascending=False).head(MAX_RECOMMENDATIONS)
        print(f"Top books in '{genre}': {', '.join(recs['Title'])}")
        return

    # The matched book is tracked by its row position inside `temp`, not by
    # title, so two books that share a title cannot make the lookup ambiguous.
    matchPos = _exact_title_position(temp, title)
    if matchPos is None:
        matchPos = _closest_title_position(temp, title, GENRE_SIMILARITY_THRESHOLD)
        if matchPos is not None:
            print("Assuming you meant: " + temp.iloc[matchPos]['Title'])

    # if genre-filtered match fails, expand search to larger dataset
    if matchPos is None:
        fullPos = _exact_title_position(books, title)
        if fullPos is None:
            fullPos = _closest_title_position(books, title, FULL_SIMILARITY_THRESHOLD)
            if fullPos is None:
                print("No books found similar to " + title + ".")
                return
            print("Assuming you meant: " + books.iloc[fullPos]['Title'])

        # Rebuild the pool from the matched book's genres for BOTH the exact
        # and the fuzzy match: the matched book is by definition not in the
        # genre-filtered pool, so it could not be looked up there.
        matchedRow = books.iloc[fullPos]
        matchedGenres = [
            value for value in (matchedRow[col] for col in GENRE_COLS)
            if isinstance(value, str) and value.strip()  # skip blank genre slots
        ]
        in_pool = books[GENRE_COLS].isin(matchedGenres).any(axis=1).to_numpy(dtype=bool)
        in_pool[fullPos] = True  # the matched book must always be part of its own pool
        temp = books[in_pool].reset_index(drop=True)
        # Position of the matched book inside the filtered pool:
        # the number of pool members that precede it.
        matchPos = int(in_pool[:fullPos].sum())

    matchTitle = temp.iloc[matchPos]['Title']
    if len(temp) < 2:
        print(f"No recommendations found for '{matchTitle}'.")
        return

    # use trigrams to analyze book similarity from description
    tf = TfidfVectorizer(ngram_range=(3, 3), stop_words='english')
    try:
        tfidf_matrix = tf.fit_transform(temp['Description'])
    except ValueError:
        # empty vocabulary: no description in the pool yields a trigram
        print(f"No recommendations found for '{matchTitle}'.")
        return

    # Only the matched book's row of the similarity matrix is needed, so the
    # full N x N matrix is never built.
    scores = cosine_similarity(tfidf_matrix[matchPos], tfidf_matrix).flatten()

    # Highest score first; the stable sort keeps equal scores in pool order.
    ranked = [int(i) for i in np.argsort(-scores, kind='stable') if i != matchPos]
    rec = temp['Title'].iloc[ranked[:MAX_RECOMMENDATIONS]]

    print(f"People who read '{matchTitle}' might also like: {', '.join(rec)}")


In [7]:
# No title given: prints the top books by weighted rating within the requested genre
recommend("", "Fantasy")

Top books in 'fantasy': Harry Potter and the Deathly Hallows (Harry Potter, #7), A Court of Mist and Fury (A Court of Thorns and Roses, #2), Words of Radiance (The Stormlight Archive, #2), Harry Potter and the Prisoner of Azkaban (Harry Potter, #3), Harry Potter and the Half-Blood Prince (Harry Potter, #6)
