In [61]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, ComplementNB
# import warnings

# from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
def plot_confusion(model, predicted, actual, label = []):
    y_pred = model.predict(predicted)
    y_act = actual
    cm = confusion_matrix(y_act, y_pred, labels = label)
    cm_display = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = label)
    cm_display.plot()
    plt.xticks(rotation='vertical')

# Part 1: Book Recommendation

Determine a method of identifying books that are similar to each other. You can use any data available in the “books_data.csv” file, or you can write code to gather more information about each book.  

## a)

Write a function that allows a user to search for books similar to one that they specify. Your function should take the name of the book as a parameter and return a list of at least 5 books that are most similar (according to your metric) to the specified book.

In [None]:
df = pd.read_csv('../data/books_data.csv')

In [None]:
df = df.dropna(subset=['Title', 'description', 'authors', 'categories']).reset_index(drop=True)
# Convert to strings and clean 'authors' and 'categories' columns
df['authors'] = df['authors'].astype(str).str.replace(r"[\[\]']", "", regex=True)
df['categories'] = df['categories'].astype(str).str.replace(r"[\[\]']", "", regex=True)
df = df.reset_index(drop=True)
# Create 'combination' column
df['combination'] = df['description'].astype(str) + ' ' + df['authors'] + ' ' + df['categories']
# df.head()

Unnamed: 0,Title,description,authors,image,previewLink,publisher,publishedDate,infoLink,categories,ratingsCount,combination
0,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,Philip Nel,http://books.google.com/books/content?id=IjvHQ...,http://books.google.nl/books?id=IjvHQsCn_pgC&p...,A&C Black,2005-01-01,http://books.google.nl/books?id=IjvHQsCn_pgC&d...,Biography & Autobiography,,Philip Nel takes a fascinating look into the k...
1,Wonderful Worship in Smaller Churches,This resource includes twelve principles in un...,David R. Ray,http://books.google.com/books/content?id=2tsDA...,http://books.google.nl/books?id=2tsDAAAACAAJ&d...,,2000,http://books.google.nl/books?id=2tsDAAAACAAJ&d...,Religion,,This resource includes twelve principles in un...
2,Whispers of the Wicked Saints,Julia Thomas finds her life spinning out of co...,Veronica Haddon,http://books.google.com/books/content?id=aRSIg...,http://books.google.nl/books?id=aRSIgJlq6JwC&d...,iUniverse,2005-02,http://books.google.nl/books?id=aRSIgJlq6JwC&d...,Fiction,,Julia Thomas finds her life spinning out of co...
3,The Church of Christ: A Biblical Ecclesiology ...,In The Church of Christ: A Biblical Ecclesiolo...,Everett Ferguson,http://books.google.com/books/content?id=kVqRa...,http://books.google.nl/books?id=kVqRaiPlx88C&p...,Wm. B. Eerdmans Publishing,1996,http://books.google.nl/books?id=kVqRaiPlx88C&d...,Religion,5.0,In The Church of Christ: A Biblical Ecclesiolo...
4,Saint Hyacinth of Poland,The story for children 10 and up of St. Hyacin...,Mary Fabyan Windeatt,http://books.google.com/books/content?id=lmLqA...,http://books.google.nl/books?id=lmLqAAAACAAJ&d...,Tan Books & Pub,2009-01-01,http://books.google.nl/books?id=lmLqAAAACAAJ&d...,Biography & Autobiography,,The story for children 10 and up of St. Hyacin...


In [64]:
v = CountVectorizer(stop_words = 'english')

X = v.fit_transform(df['combination']) #creates a vocabulary and creates the data

#How toget a movie out of the matrix
df['Title'] == 'Dr. Seuss: American Icon'
idx = df[df['Title'] == 'Dr. Seuss: American Icon'].index[0]
# print(idx)
X[idx]

# Here's a slicker way to do it.
# Generate a mapping from movie title to index in df.
# the pandas Series command works a lot like a built-in python dictionary.
movie2idx = pd.Series(df.index, index=df['Title'])

In [87]:
from sklearn.metrics.pairwise import cosine_similarity
def get_most_similar(book_title):
    if book_title in movie2idx.keys():
        idx = movie2idx[book_title]
        scores = cosine_similarity(X, X[idx])
        # Now we want to find the best matches. We're not actually interested in the scores themselves, but the relative rankings.
        # For that, we can use the argsort command.
        # scores.argsort() would sort the scores in ascending order and then return the indices that correspond to those scores.
        # Since we want the highest scores, I'll use (-scores).argsort() instead.
        scores = scores.flatten()
        recommended = (-scores).argsort()

        # We should ignore the first entry in the list (since that'll be the movie itself).
        # Let's take the next 5 movies.
        recommended = recommended[1:6]

        # Now we should return the titles of the corresponding movies.
        return df['Title'].iloc[recommended]
    else:
        return "Book not found."

get_most_similar("Wonderful Worship in Smaller Churches")

11793    A Matter of Honor (G K Hall Large Print Book S...
710                     Man of La Mancha;: A musical play,
1847     Object Relational Dbms: Tracking the Next Grea...
4439     Rape of the Masses: The Psychology of Totalita...
1178                                The Call of the Canyon
Name: Title, dtype: object

In this part, to find five most similar books to a given book. So using cosine similarity because it helps calculating angles between the vectors which means that it finds the most similar. 

## b)

Devise a method to quantify the quality of selection of books you return.  This could include external validation (e.g. comparing your recommendations against another type of recommendation), robustness testing (e.g. comparing the recommendations of two or more of your own different models), or other types of testing.


In order to test the model, using chatGPT and asking most similar books to Wonderful Worship in Smaller Churches. According to the ChatGPT, The Small Church Advantage, Shepherding the Small Church, Help for the Small-Church Pastor, No Little Places: The Untapped Potential of the Small-Town Church and The Purpose Driven Church. So according to the ChatGPT, it provided more church and worship realted books. The answers that the algithm provided are also kind of about churches and faith. So, it kind of correct to say the alrithm provided above works.

# Part 2: Composite Book Ratings

## 1)

Determine a method to rate each book based on its user reviews. You can use any data available in the “Books_ratings.csv” file.

In [67]:
df2 = pd.read_csv("../data/Books_rating.csv")
df2 = df2.dropna().reset_index(drop=True)

In [68]:
df2 = df2.sample(n=50000, random_state=1)
df2.head()

Unnamed: 0,Id,Title,Price,User_id,profileName,review/helpfulness,review/score,review/time,review/summary,review/text
264037,310579503,Zondervan NIV Nave's Topical Bible,19.79,A2AW8SZYZTK86B,"Paul Powell ""moloch16""",1/1,5.0,1303344000,Great Resource,"This is an great topical reference, I use it q..."
41275,1850896429,Mrs. Miniver (Isis Series),46.75,A3DNSXDSJRJY6T,J. A. Spilker,0/0,5.0,1196294400,A perennial classic,I re-read this book every year at the start of...
123666,1931599351,The Great Iowa Touring Book: 27 Spectacular Au...,21.95,A3FE8FQPQPONAW,Michelle Pettit,21/21,5.0,1096329600,Wonderful information and accuracy,I don't know how I stumbled on this book but I...
290427,1421808803,Night And Day,41.95,A1H6VI6CMTOX3C,Book Lover,0/0,2.0,1249862400,Night and Day,I was very disappointed in this book. The plot...
388818,486438767,Fashions of the Old South Coloring Book (Dover...,3.99,A2T87CHZR3TE5E,Konrad Trope,0/0,5.0,1201046400,Another beautiful Dover coloring book,Beautiufl coloring book. I bought these for my...


### a)

Perform your own analysis on the sentiment of the reviews in the data set.
Compare your sentiment scores against the reviewer’s personal ratings.


In [None]:
df2['review/text'] = df2['review/text'].dropna()
df2['Title'] = df2['Title'].dropna()
df2.reset_index(drop=True)
# df2.head()

Unnamed: 0,Id,Title,Price,User_id,profileName,review/helpfulness,review/score,review/time,review/summary,review/text
264037,310579503,Zondervan NIV Nave's Topical Bible,19.79,A2AW8SZYZTK86B,"Paul Powell ""moloch16""",1/1,5.0,1303344000,Great Resource,"This is an great topical reference, I use it q..."
41275,1850896429,Mrs. Miniver (Isis Series),46.75,A3DNSXDSJRJY6T,J. A. Spilker,0/0,5.0,1196294400,A perennial classic,I re-read this book every year at the start of...
123666,1931599351,The Great Iowa Touring Book: 27 Spectacular Au...,21.95,A3FE8FQPQPONAW,Michelle Pettit,21/21,5.0,1096329600,Wonderful information and accuracy,I don't know how I stumbled on this book but I...
290427,1421808803,Night And Day,41.95,A1H6VI6CMTOX3C,Book Lover,0/0,2.0,1249862400,Night and Day,I was very disappointed in this book. The plot...
388818,486438767,Fashions of the Old South Coloring Book (Dover...,3.99,A2T87CHZR3TE5E,Konrad Trope,0/0,5.0,1201046400,Another beautiful Dover coloring book,Beautiufl coloring book. I bought these for my...


In [70]:
df2 = df2.dropna(subset=['Title', 'review/text']).reset_index(drop=True)
df2['review/text'] = df2['review/text'] + ' ' + df2['review/summary']


In [71]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')

sid = SentimentIntensityAnalyzer()
df2['scores'] = df2['review/text'].astype(str).apply(lambda text: sid.polarity_scores(text))
df2['compound'] = df2['scores'].apply(lambda d: d['compound'])
df2['compound'] = df2['compound'].astype(float)
df2['sentiment'] = df2['compound'].apply(lambda score: 1 if score > 0.2 else -1 if score < -0.6 else 0)
df2[:10]

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/muratguzelocak/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Unnamed: 0,Id,Title,Price,User_id,profileName,review/helpfulness,review/score,review/time,review/summary,review/text,scores,compound,sentiment
0,0310579503,Zondervan NIV Nave's Topical Bible,19.79,A2AW8SZYZTK86B,"Paul Powell ""moloch16""",1/1,5.0,1303344000,Great Resource,"This is an great topical reference, I use it q...","{'neg': 0.0, 'neu': 0.691, 'pos': 0.309, 'comp...",0.8834,1
1,1850896429,Mrs. Miniver (Isis Series),46.75,A3DNSXDSJRJY6T,J. A. Spilker,0/0,5.0,1196294400,A perennial classic,I re-read this book every year at the start of...,"{'neg': 0.0, 'neu': 0.702, 'pos': 0.298, 'comp...",0.967,1
2,1931599351,The Great Iowa Touring Book: 27 Spectacular Au...,21.95,A3FE8FQPQPONAW,Michelle Pettit,21/21,5.0,1096329600,Wonderful information and accuracy,I don't know how I stumbled on this book but I...,"{'neg': 0.0, 'neu': 0.733, 'pos': 0.267, 'comp...",0.9939,1
3,1421808803,Night And Day,41.95,A1H6VI6CMTOX3C,Book Lover,0/0,2.0,1249862400,Night and Day,I was very disappointed in this book. The plot...,"{'neg': 0.173, 'neu': 0.746, 'pos': 0.081, 'co...",-0.4765,0
4,0486438767,Fashions of the Old South Coloring Book (Dover...,3.99,A2T87CHZR3TE5E,Konrad Trope,0/0,5.0,1201046400,Another beautiful Dover coloring book,Beautiufl coloring book. I bought these for my...,"{'neg': 0.0, 'neu': 0.655, 'pos': 0.345, 'comp...",0.9022,1
5,B000ETQQ3C,The Center of Everything : A Novel,5.6,A1AYJ89PH08UH8,Amanda,3/3,5.0,1074038400,Excellent read,"I am always looking for a good book to read, a...","{'neg': 0.03, 'neu': 0.726, 'pos': 0.243, 'com...",0.9142,1
6,0764535463,Flash 5 Weekend Crash Course,24.99,AGNEM2S9P40YV,"John K. Gowrie ""jgowrie""",2/2,1.0,1007856000,"Typos, Missing Steps and bad writing plague th...",After getting through the first 50 pages of th...,"{'neg': 0.143, 'neu': 0.821, 'pos': 0.036, 'co...",-0.885,-1
7,158297327X,Description & Setting: Techniques and Exercise...,11.35,A3E2GGJSQINKA3,"Mike Klaassen ""Author""",2/7,4.0,1163980800,WELL WORTH THE PURCHASE PRICE,The Description & Setting volume for the Write...,"{'neg': 0.0, 'neu': 0.698, 'pos': 0.302, 'comp...",0.9559,1
8,0764200720,A Bride Most Begrudging,10.19,A2XL0GC44S709S,Chris Balan,1/1,5.0,1269302400,Loved this book,"I was not sure I would like this book, even af...","{'neg': 0.09, 'neu': 0.682, 'pos': 0.228, 'com...",0.9238,1
9,0833585568,Maniac Magee (Turtleback School & Library Bind...,14.35,A228JPM662A9OL,Luke skywalker,0/0,5.0,1352332800,Manic Magee,Manic Magee Diesel Hayes#8Manic Magee AKA: Jef...,"{'neg': 0.083, 'neu': 0.847, 'pos': 0.07, 'com...",0.128,0


In [89]:
df2['sentiment'].value_counts()

sentiment
 1    42006
-1     4102
 0     3892
Name: count, dtype: int64

In [90]:
df2['review/score'].value_counts()

review/score
5.0    30692
4.0     9525
3.0     4034
1.0     3268
2.0     2481
Name: count, dtype: int64

In this part, using Sentiment analysis to understand which books got positive comments and which books got negative comments. In sentiment analysis, I created three groups: 1, 0, and -1. ! stands for possitive comments, 0 for neutral, and -1 for negative comments. According to the books data provided, sentiment analysis found 42006 posstive reviews, 3892 neutral comments, and 4102 negative comments.

### b)

In [72]:
wrong = df2[((df2['review/score'] >= 4) & (df2['sentiment'] == -1)) | ((df2['review/score'] <= 2) &
         (df2['sentiment'] == 1)) | ((df2['review/score'] == 3) & (df2['sentiment'] != 0))]

accuracy = 1 - (len(wrong) / len(df2))
print(f"Accuracy Rate: {accuracy:.2%}")

Accuracy Rate: 83.26%


In this part in order to test our results from Sentiment Analysis, I assumed that review scores 5 and 4 are posstive comments, and 2 and below are negative comments. 3 comments' scores are neutral. According to this assumtion, this models' succes rate is $83.26\%$. Becuase it is above $80\%$, I think our model is succesfull.

## 2)

### a)

In [None]:
average = df2.groupby('Title')['review/score'].mean().reset_index() 
average = average.rename(columns={'review/score': 'averageScore'})
# average

Unnamed: 0,Title,averageScore
0,"""Cool Stuff"" They Should Teach in School: Crui...",4.500000
1,"""Forget Not Love"": The Passion of Maximilian K...",5.000000
2,"""I Want to Be Jesus!"": Over 150 Easy-To-Use Go...",5.000000
3,"""Let's Face it, Men are @$#%"": What Women Can ...",1.000000
4,"""Life Was Never Meant to Be a Struggle""",5.000000
...,...,...
19277,sendmail Desktop Reference (Pocket Reference),5.000000
19278,st*rf*ck*ng,5.000000
19279,the Picture of Dorian Gray,4.151515
19280,"tick, tick ... BOOM!",4.000000


In [74]:
df = df.reset_index(drop=True)
df2 = df2.reset_index(drop=True)
review_dict = {}
for i in range(len(df2)):
    if df2['Title'][i] not in review_dict.keys():
        review_dict[str(df2['Title'][i])] = df2['review/text'][i]
    else:
        review_dict[str(df2['Title'][i])] += " " 
        review_dict[str(df2['Title'][i])] += df2['review/text'][i]

In [75]:
df3 = pd.DataFrame(list(review_dict.items()), columns=['book_title', 'full_review'])
df3 = pd.merge(df3, average, right_on='Title' ,left_on='book_title', how='inner')
df3 = df3.drop(columns=['Title'])
df3

Unnamed: 0,book_title,full_review,averageScore
0,Zondervan NIV Nave's Topical Bible,"This is an great topical reference, I use it q...",4.2500
1,Mrs. Miniver (Isis Series),I re-read this book every year at the start of...,5.0000
2,The Great Iowa Touring Book: 27 Spectacular Au...,I don't know how I stumbled on this book but I...,5.0000
3,Night And Day,I was very disappointed in this book. The plot...,3.4375
4,Fashions of the Old South Coloring Book (Dover...,Beautiufl coloring book. I bought these for my...,5.0000
...,...,...,...
19277,Mazes (Shire Albums),This is one of the most interesting Christmas ...,4.0000
19278,The Stones Cry Out,"""and his face resembled a skeleton of wires co...",5.0000
19279,The Unanswered Question: Six Talks at Harvard ...,I respect Bernstein even more as a scholar of ...,5.0000
19280,Doubt!! Vol. 1,"'Doubt!!' revolves around 15-year-old, Ai Maek...",4.0000


In [76]:
sid = SentimentIntensityAnalyzer()  # Create SentimentIntensityAnalyzer object
df3['scores'] = df3['full_review'].astype(str).apply(lambda text: sid.polarity_scores(text))
df3['compound'] = df3['scores'].apply(lambda d: d['compound'])
df3['compound'] = df3['compound'].astype(float)
df3['sentiment'] = df3['compound'].apply(lambda score: 1 if score > 0.2 else -1 if score < -0.6 else 0)
df3[:16]

Unnamed: 0,book_title,full_review,averageScore,scores,compound,sentiment
0,Zondervan NIV Nave's Topical Bible,"This is an great topical reference, I use it q...",4.25,"{'neg': 0.012, 'neu': 0.791, 'pos': 0.196, 'co...",0.988,1
1,Mrs. Miniver (Isis Series),I re-read this book every year at the start of...,5.0,"{'neg': 0.0, 'neu': 0.702, 'pos': 0.298, 'comp...",0.967,1
2,The Great Iowa Touring Book: 27 Spectacular Au...,I don't know how I stumbled on this book but I...,5.0,"{'neg': 0.0, 'neu': 0.733, 'pos': 0.267, 'comp...",0.9939,1
3,Night And Day,I was very disappointed in this book. The plot...,3.4375,"{'neg': 0.078, 'neu': 0.73, 'pos': 0.192, 'com...",0.9999,1
4,Fashions of the Old South Coloring Book (Dover...,Beautiufl coloring book. I bought these for my...,5.0,"{'neg': 0.009, 'neu': 0.667, 'pos': 0.324, 'co...",0.9952,1
5,The Center of Everything : A Novel,"I am always looking for a good book to read, a...",3.896552,"{'neg': 0.09, 'neu': 0.716, 'pos': 0.194, 'com...",0.9999,1
6,Flash 5 Weekend Crash Course,After getting through the first 50 pages of th...,1.0,"{'neg': 0.143, 'neu': 0.821, 'pos': 0.036, 'co...",-0.885,-1
7,Description & Setting: Techniques and Exercise...,The Description & Setting volume for the Write...,3.75,"{'neg': 0.057, 'neu': 0.765, 'pos': 0.178, 'co...",0.9965,1
8,A Bride Most Begrudging,"I was not sure I would like this book, even af...",3.882353,"{'neg': 0.081, 'neu': 0.686, 'pos': 0.233, 'co...",0.9999,1
9,Maniac Magee (Turtleback School & Library Bind...,Manic Magee Diesel Hayes#8Manic Magee AKA: Jef...,3.944444,"{'neg': 0.112, 'neu': 0.717, 'pos': 0.17, 'com...",0.9997,1


In [77]:
df3.sort_values(by='compound', ascending=False)[:16]

Unnamed: 0,book_title,full_review,averageScore,scores,compound,sentiment
830,Bringing Down the House: The Inside Story of S...,I liked this book because it is true. Often ti...,4.309524,"{'neg': 0.061, 'neu': 0.743, 'pos': 0.196, 'co...",1.0,1
523,Jane Eyre (Large Print),Best book I've had the pleasure of reading in ...,4.608696,"{'neg': 0.08, 'neu': 0.685, 'pos': 0.235, 'com...",1.0,1
1536,Attila: A Barbarian's Love Story,ATTILA: A BARBARIAN'S LOVE STORYThe book's str...,4.6,"{'neg': 0.067, 'neu': 0.685, 'pos': 0.248, 'co...",1.0,1
832,Year of Wonders (Turtleback School & Library B...,I could not put this book down. Very well writ...,4.038462,"{'neg': 0.089, 'neu': 0.727, 'pos': 0.184, 'co...",1.0,1
835,Plainsong,I found Plainsong to be a wonderfully evocativ...,3.82,"{'neg': 0.078, 'neu': 0.745, 'pos': 0.177, 'co...",1.0,1
1523,Before the Season Ends,Linore Rose Burkard has created an entertainin...,4.428571,"{'neg': 0.052, 'neu': 0.656, 'pos': 0.292, 'co...",1.0,1
309,Gods and Kings (Chronicles of the Kings #1),I greatly enjoy reading the books of history f...,4.685714,"{'neg': 0.047, 'neu': 0.725, 'pos': 0.228, 'co...",1.0,1
94,Persuasion,I had been very Curiousabout is book after see...,4.295455,"{'neg': 0.074, 'neu': 0.724, 'pos': 0.201, 'co...",1.0,1
1313,Left to Tell: Discovering God Amidst The Rwand...,"I was assigned to read this book for class, an...",4.792683,"{'neg': 0.105, 'neu': 0.667, 'pos': 0.228, 'co...",1.0,1
2030,The Bread Lover's Bread Machine Cookbook: A Ma...,I love this book and highly recommend it! It i...,4.296296,"{'neg': 0.051, 'neu': 0.772, 'pos': 0.177, 'co...",1.0,1


In this part, I combined the all the `review/text` under one book `title`. I, also, averaged the `review/score` under the name of 'averageScore'. In this part, because we generilize the data more I expect higher accuracy score.

### b)

In [92]:
wrong = df3[((df3['averageScore'] >= 4) & (df3['sentiment'] == -1)) | ((df3['averageScore'] <= 2) &
         (df3['sentiment'] == 1)) | ((df3['averageScore'] == 3) & (df3['sentiment'] != 0))]

accuracy = 1 - (len(wrong) / len(df3))
print(f"Accuracy Rate: {accuracy:.2%}")

Accuracy Rate: 86.70%


In this part I used the same technique from the previous section to calcualte the accuracy rate. Combinning reviews under one `book_title`, and getting the averages of the `review/score` helped more than $3\%$. So, new accuracy rate is $86.7\%$, which is significant.

# Part 3: A comprehensive recommender system

## a)

In [79]:
finalDF = pd.merge(df, df3, how='inner', left_on='Title', right_on='book_title')
finalDF = finalDF.drop(columns=['book_title', 'scores', 'image', 'previewLink', 'authors', 'categories',
                                 'description', 'publisher', 'publishedDate', 'infoLink'])
finalDF = finalDF[finalDF['sentiment'] != -1]
finalDF = finalDF.reset_index(drop=True)
finalDF

Unnamed: 0,Title,ratingsCount,combination,full_review,averageScore,compound,sentiment
0,Whispers of the Wicked Saints,,Julia Thomas finds her life spinning out of co...,Just as predicted the first chapter of the boo...,4.75,0.9984,1
1,Dramatica for Screenwriters,,Dramatica for Screenwriters by Armando Saldana...,Beauty is in the bones. This book gives your s...,5.00,0.9972,1
2,The Ultimate Guide to Law School Admission: In...,,This collection brings together a distinguishe...,If you have even casually looked into applying...,1.00,0.4434,1
3,Beginner's Yoruba (Hippocrene Beginner's Series),1.0,"""Beginner's Yoruba"" is now available with two ...",This is my first encounter with Yoruba and I h...,4.00,0.9653,1
4,How to Discipline Kids without Losing Their Lo...,1.0,Imagine... No More Arguing. Imagine... No More...,I purchased this book not knowing much about i...,5.00,0.9963,1
...,...,...,...,...,...,...,...
13799,El color del verano,,Es pleno verano y en la isla de Cuba comienza ...,Since I read the first chapter of the book I w...,5.00,0.4836,1
13800,In the First Line of Battle: The 12th Illinois...,,From its first major engagement at Harpers Fer...,It had been traditional that a member of a mil...,5.00,-0.1635,0
13801,Is Your Retirement Heading in the Right Direct...,,So many seniors today are living so much longe...,this book was very unrewarding in terms of sol...,1.00,0.5994,1
13802,The Awakening and Selected Stories (Modern Lib...,2.0,"WHEN IT FIRST APPEARED IN 1899, THE AWAKENING ...",I actually haven't reached the Awakening yet. ...,3.50,0.9206,1


In [80]:
get_most_similar('Whispers of the Wicked Saints')

29778                           Women in the Wind
38466                        An Independent Woman
119844                 Miss Julia Speaks Her Mind
37156                        At Mrs. Lippincote's
94218     When a Texan Gambles (The Wife Lottery)
Name: Title, dtype: object

In [81]:
v = CountVectorizer(stop_words = 'english')

X = v.fit_transform(finalDF['combination']) #creates a vocabulary and creates the data
book2idx = pd.Series(finalDF.index, index=finalDF['Title'])
def get_best_book(book_title):
    if book_title in book2idx.keys():
        idx = book2idx[book_title]
        scores = cosine_similarity(X, X[idx])
        # Now we want to find the best matches. We're not actually interested in the scores themselves, but the relative rankings.
        # For that, we can use the argsort command.
        # scores.argsort() would sort the scores in ascending order and then return the indices that correspond to those scores.
        # Since we want the highest scores, I'll use (-scores).argsort() instead.
        scores = scores.flatten()
        recommended = (-scores).argsort()

        # We should ignore the first entry in the list (since that'll be the movie itself).
        # Let's take the next 5 movies.
        recommended = recommended[1:6]

        # Now we should return the titles of the corresponding movies.
        return finalDF['Title'].iloc[recommended]
    else:
        return "Book not found"

In [82]:
get_best_book('Whispers of the Wicked Saints')

9529               When a Texan Gambles (The Wife Lottery)
10647                                       Healing H'Arts
4754                                     My Life in France
13570    It Will Live Forever: Traditional Yosemite Ind...
2670     Inconceivable: A Woman's Triumph over Despair ...
Name: Title, dtype: object

In this part, to combine two different analysis techniques to make better book recomandations, I decided to remove all the books with negative sentiment analysis book score from the data so our users don't get recommendations of unliked books. Than I just kept everything else same.

## b)

In [88]:
while True:
  book_title = input("Enter a book title: or hit 'q' to quit")
  if book_title == 'q':
    print("Goodbye!")
    break
  print(f"{book_title} is similar to:\n")
  print(get_best_book(book_title))
  print("\n")

Wonderful Worship in Smaller Churches is similar to:

Book not found


Healing H'Arts is similar to:

4754                                     My Life in France
10025                                       Power Mandalas
13570    It Will Live Forever: Traditional Yosemite Ind...
0                            Whispers of the Wicked Saints
6469                                         Inconceivable
Name: Title, dtype: object


Goodbye!


In this part, I provided users with simple console based book finder.