In [3]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, ComplementNB
# import warnings

# from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
def plot_confusion(model, predicted, actual, label=None):
    """Plot the confusion matrix of ``model`` on a labelled data set.

    Parameters
    ----------
    model : fitted estimator
        Any object exposing ``predict``.
    predicted : array-like
        Feature matrix handed to ``model.predict`` (despite the name, these
        are the model inputs, not predictions).
    actual : array-like
        True labels for the rows of ``predicted``.
    label : sequence, optional
        Class labels to show and their order. When omitted or empty, the
        sorted set of labels found in ``actual`` and the predictions is used.

    Returns
    -------
    None
        The matrix is drawn on the current matplotlib figure.
    """
    y_pred = model.predict(predicted)

    # An empty label list is rejected by confusion_matrix, so fall back to
    # the labels that actually occur in the data.
    if label is None or len(label) == 0:
        labels = np.unique(np.concatenate([np.ravel(actual), np.ravel(y_pred)]))
    else:
        labels = list(label)

    cm = confusion_matrix(actual, y_pred, labels=labels)
    cm_display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
    cm_display.plot()
    # Long class names overlap when horizontal.
    plt.xticks(rotation='vertical')

# 1)

Determine a method of identifying books that are similar to each other. You can use any data available in the “books_data.csv” file, or you can write code to gather more information about each book.  

## a)

Write a function that allows a user to search for books similar to one that they specify. Your function should take the name of the book as a parameter and return a list of at least 5 books that are most similar (according to your metric) to the specified book.

In [4]:
# Load the book metadata table (titles, descriptions, authors, categories, ...).
df = pd.read_csv('../data/books_data.csv')


In [5]:
# Keep only books that have every text field we need, then build the
# free-text 'combination' column the similarity search is based on.
df = (
    df.dropna(subset=['Title', 'description', 'authors', 'categories'])
    .reset_index(drop=True)
    .assign(
        # Stored as list-like strings such as "['Philip Nel']"; strip the
        # brackets and quote characters.
        authors=lambda frame: frame['authors'].astype(str).str.replace(r"[\[\]']", "", regex=True),
        categories=lambda frame: frame['categories'].astype(str).str.replace(r"[\[\]']", "", regex=True),
    )
    .assign(
        combination=lambda frame: (
            frame['description'].astype(str) + ' ' + frame['authors'] + ' ' + frame['categories']
        )
    )
)
df.head()

Unnamed: 0,Title,description,authors,image,previewLink,publisher,publishedDate,infoLink,categories,ratingsCount,combination
0,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,Philip Nel,http://books.google.com/books/content?id=IjvHQ...,http://books.google.nl/books?id=IjvHQsCn_pgC&p...,A&C Black,2005-01-01,http://books.google.nl/books?id=IjvHQsCn_pgC&d...,Biography & Autobiography,,Philip Nel takes a fascinating look into the k...
1,Wonderful Worship in Smaller Churches,This resource includes twelve principles in un...,David R. Ray,http://books.google.com/books/content?id=2tsDA...,http://books.google.nl/books?id=2tsDAAAACAAJ&d...,,2000,http://books.google.nl/books?id=2tsDAAAACAAJ&d...,Religion,,This resource includes twelve principles in un...
2,Whispers of the Wicked Saints,Julia Thomas finds her life spinning out of co...,Veronica Haddon,http://books.google.com/books/content?id=aRSIg...,http://books.google.nl/books?id=aRSIgJlq6JwC&d...,iUniverse,2005-02,http://books.google.nl/books?id=aRSIgJlq6JwC&d...,Fiction,,Julia Thomas finds her life spinning out of co...
3,The Church of Christ: A Biblical Ecclesiology ...,In The Church of Christ: A Biblical Ecclesiolo...,Everett Ferguson,http://books.google.com/books/content?id=kVqRa...,http://books.google.nl/books?id=kVqRaiPlx88C&p...,Wm. B. Eerdmans Publishing,1996,http://books.google.nl/books?id=kVqRaiPlx88C&d...,Religion,5.0,In The Church of Christ: A Biblical Ecclesiolo...
4,Saint Hyacinth of Poland,The story for children 10 and up of St. Hyacin...,Mary Fabyan Windeatt,http://books.google.com/books/content?id=lmLqA...,http://books.google.nl/books?id=lmLqAAAACAAJ&d...,Tan Books & Pub,2009-01-01,http://books.google.nl/books?id=lmLqAAAACAAJ&d...,Biography & Autobiography,,The story for children 10 and up of St. Hyacin...


In [6]:
v = CountVectorizer(stop_words='english')

# Learn the vocabulary from the combined text and build the sparse
# document-term matrix: one row per book, in the same order as df.
X = v.fit_transform(df['combination'])

# Example: find the row position of a single book by its title.
idx = df[df['Title'] == 'Dr. Seuss: American Icon'].index[0]

# A slicker way to do the same lookup for any book: a Series mapping
# title -> row index, which can be used much like a dictionary.
# (The name ``movie2idx`` is kept because get_most_similar refers to it.)
movie2idx = pd.Series(df.index, index=df['Title'])

In [7]:
from sklearn.metrics.pairwise import cosine_similarity
def get_most_similar(book_title, n=5):
    """Return the titles of the books most similar to ``book_title``.

    Similarity is the cosine similarity between rows of the module-level
    bag-of-words matrix ``X`` (built from ``df['combination']``).

    Parameters
    ----------
    book_title : str
        Exact title to look up in ``movie2idx``.
    n : int, default 5
        Number of similar books to return.

    Returns
    -------
    pandas.Series or str
        The ``Title`` entries of the ``n`` closest books, best match first,
        or the string "Book not found." when the title is unknown.
    """
    if book_title not in movie2idx.index:
        return "Book not found."

    # A title that occurs more than once makes the lookup return several
    # positions; use the first so X[idx] is always a single row.
    idx = np.atleast_1d(movie2idx[book_title])[0]

    # Similarity of every book to the query book, one score per book.
    scores = cosine_similarity(X, X[idx]).flatten()

    # argsort is ascending, so negate the scores to rank best-first.
    ranked = (-scores).argsort()

    # Remove the query book by position rather than assuming it is ranked
    # first: a book with an identical vector ties with it, and a book whose
    # vector is all zeros does not even score highest against itself.
    ranked = ranked[ranked != idx][:n]

    return df['Title'].iloc[ranked]


get_most_similar("Dr. Seuss: American Icon")

118570               Pretend You're a Cat (Picture Puffins)
44733                  One Fish Two Fish Red Fish Blue Fish
20261     One Fish two fish red fish blue fish (Beginner...
70685               The Nose Book (Bright & Early Books(R))
67266                                  My Many Colored Days
Name: Title, dtype: object

## b)

In [8]:
# # Drop NaNs and reset index
# df = df.dropna(subset=['Title', 'description', 'authors', 'categories']).reset_index(drop=True)

# # Convert to strings and clean 'authors' and 'categories' columns
# df['authors'] = df['authors'].astype(str).str.replace(r"[\[\]']", "", regex=True)
# df['categories'] = df['categories'].astype(str).str.replace(r"[\[\]']", "", regex=True)
# df = df.reset_index(drop=True)
# # Create 'combination' column
# df['combination'] = df['description'].astype(str) + ' ' + df['authors'] + ' ' + df['categories']

# # Keep only required columns
# df = df[['Title', 'combination']]

# # Vectorize text data
# inputsTrain = df['combination']
# Ytrain = df['Title']

# v = CountVectorizer(stop_words='english', max_features=5000)
# Xtrain = v.fit_transform(inputsTrain)

# # Train the Naive Bayes model
# model = MultinomialNB()
# model.fit(Xtrain, Ytrain)

# # Compute and print training score
# nb_count_train_score = model.score(Xtrain, Ytrain)
# print("Train score:", nb_count_train_score)

# 2)

## a)

In [9]:
# Read the raw review table, keep only rows with no missing values in any
# column, and renumber the index from 0.
df2 = (
    pd.read_csv("../data/Books_rating.csv")
    .dropna()
    .reset_index(drop=True)
)

In [10]:
# Work on a random subset of 50,000 review rows; the fixed random_state
# makes the draw repeatable across runs.
df2 = df2.sample(n=50000, random_state=1)
df2.head()

Unnamed: 0,Id,Title,Price,User_id,profileName,review/helpfulness,review/score,review/time,review/summary,review/text
264037,310579503,Zondervan NIV Nave's Topical Bible,19.79,A2AW8SZYZTK86B,"Paul Powell ""moloch16""",1/1,5.0,1303344000,Great Resource,"This is an great topical reference, I use it q..."
41275,1850896429,Mrs. Miniver (Isis Series),46.75,A3DNSXDSJRJY6T,J. A. Spilker,0/0,5.0,1196294400,A perennial classic,I re-read this book every year at the start of...
123666,1931599351,The Great Iowa Touring Book: 27 Spectacular Au...,21.95,A3FE8FQPQPONAW,Michelle Pettit,21/21,5.0,1096329600,Wonderful information and accuracy,I don't know how I stumbled on this book but I...
290427,1421808803,Night And Day,41.95,A1H6VI6CMTOX3C,Book Lover,0/0,2.0,1249862400,Night and Day,I was very disappointed in this book. The plot...
388818,486438767,Fashions of the Old South Coloring Book (Dover...,3.99,A2T87CHZR3TE5E,Konrad Trope,0/0,5.0,1201046400,Another beautiful Dover coloring book,Beautiufl coloring book. I bought these for my...


In [11]:
# Drop rows missing a title or review text and renumber the index.
# (Assigning ``column.dropna()`` back to the column re-aligns on the index
# and removes nothing, and ``reset_index`` returns a new frame, so both
# have to be applied to the frame and the result re-bound.)
df2 = df2.dropna(subset=['Title', 'review/text']).reset_index(drop=True)
df2.head()

Unnamed: 0,Id,Title,Price,User_id,profileName,review/helpfulness,review/score,review/time,review/summary,review/text
264037,310579503,Zondervan NIV Nave's Topical Bible,19.79,A2AW8SZYZTK86B,"Paul Powell ""moloch16""",1/1,5.0,1303344000,Great Resource,"This is an great topical reference, I use it q..."
41275,1850896429,Mrs. Miniver (Isis Series),46.75,A3DNSXDSJRJY6T,J. A. Spilker,0/0,5.0,1196294400,A perennial classic,I re-read this book every year at the start of...
123666,1931599351,The Great Iowa Touring Book: 27 Spectacular Au...,21.95,A3FE8FQPQPONAW,Michelle Pettit,21/21,5.0,1096329600,Wonderful information and accuracy,I don't know how I stumbled on this book but I...
290427,1421808803,Night And Day,41.95,A1H6VI6CMTOX3C,Book Lover,0/0,2.0,1249862400,Night and Day,I was very disappointed in this book. The plot...
388818,486438767,Fashions of the Old South Coloring Book (Dover...,3.99,A2T87CHZR3TE5E,Konrad Trope,0/0,5.0,1201046400,Another beautiful Dover coloring book,Beautiufl coloring book. I bought these for my...


In [12]:
# Keep only rows that have both a title and review text, then renumber the
# index from 0.
df2 = df2.dropna(subset=['Title', 'review/text']).reset_index(drop=True)

In [13]:
df = df.reset_index(drop=True)
df2 = df2.reset_index(drop=True)

# Map each book title to all of its review texts joined by single spaces.
# Grouping on the string form of the title keeps the lookup key and the
# stored key identical, and sort=False keeps titles in order of first
# appearance. One groupby/join replaces the per-row Python loop with its
# repeated string concatenation.
review_dict = (
    df2['review/text']
    .astype(str)
    .groupby(df2['Title'].astype(str), sort=False)
    .agg(' '.join)
    .to_dict()
)

In [14]:
# One row per book: its title and the concatenation of all its reviews.
df3 = pd.DataFrame(
    {
        'book_title': list(review_dict.keys()),
        'full_review': list(review_dict.values()),
    }
)
df3

Unnamed: 0,book_title,full_review
0,Zondervan NIV Nave's Topical Bible,"This is an great topical reference, I use it q..."
1,Mrs. Miniver (Isis Series),I re-read this book every year at the start of...
2,The Great Iowa Touring Book: 27 Spectacular Au...,I don't know how I stumbled on this book but I...
3,Night And Day,I was very disappointed in this book. The plot...
4,Fashions of the Old South Coloring Book (Dover...,Beautiufl coloring book. I bought these for my...
...,...,...
19277,Mazes (Shire Albums),This is one of the most interesting Christmas ...
19278,The Stones Cry Out,"""and his face resembled a skeleton of wires co..."
19279,The Unanswered Question: Six Talks at Harvard ...,I respect Bernstein even more as a scholar of ...
19280,Doubt!! Vol. 1,"'Doubt!!' revolves around 15-year-old, Ai Maek..."


In [15]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk.download('vader_lexicon')

sid = SentimentIntensityAnalyzer()  # VADER analyzer used to score every book's reviews


def _sentiment_label(score):
    """Bucket a compound score: 1 above 0.2, -1 below -0.6, otherwise 0."""
    if score > 0.2:
        return 1
    if score < -0.6:
        return -1
    return 0


# Full VADER result (neg / neu / pos / compound) for each book's combined reviews.
df3['scores'] = df3['full_review'].astype(str).apply(sid.polarity_scores)
# Keep the compound value as a float column of its own.
df3['compound'] = df3['scores'].apply(lambda result: result['compound']).astype(float)
df3['sentiment'] = df3['compound'].apply(_sentiment_label)
df3[:16]

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/muratguzelocak/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Unnamed: 0,book_title,full_review,scores,compound,sentiment
0,Zondervan NIV Nave's Topical Bible,"This is an great topical reference, I use it q...","{'neg': 0.014, 'neu': 0.834, 'pos': 0.152, 'co...",0.9738,1
1,Mrs. Miniver (Isis Series),I re-read this book every year at the start of...,"{'neg': 0.0, 'neu': 0.693, 'pos': 0.307, 'comp...",0.967,1
2,The Great Iowa Touring Book: 27 Spectacular Au...,I don't know how I stumbled on this book but I...,"{'neg': 0.0, 'neu': 0.751, 'pos': 0.249, 'comp...",0.9922,1
3,Night And Day,I was very disappointed in this book. The plot...,"{'neg': 0.078, 'neu': 0.729, 'pos': 0.193, 'co...",0.9999,1
4,Fashions of the Old South Coloring Book (Dover...,Beautiufl coloring book. I bought these for my...,"{'neg': 0.011, 'neu': 0.717, 'pos': 0.272, 'co...",0.9893,1
5,The Center of Everything : A Novel,"I am always looking for a good book to read, a...","{'neg': 0.092, 'neu': 0.722, 'pos': 0.186, 'co...",0.9999,1
6,Flash 5 Weekend Crash Course,After getting through the first 50 pages of th...,"{'neg': 0.097, 'neu': 0.862, 'pos': 0.042, 'co...",-0.6868,-1
7,Description & Setting: Techniques and Exercise...,The Description & Setting volume for the Write...,"{'neg': 0.06, 'neu': 0.782, 'pos': 0.157, 'com...",0.9941,1
8,A Bride Most Begrudging,"I was not sure I would like this book, even af...","{'neg': 0.081, 'neu': 0.697, 'pos': 0.222, 'co...",0.9998,1
9,Maniac Magee (Turtleback School & Library Bind...,Manic Magee Diesel Hayes#8Manic Magee AKA: Jef...,"{'neg': 0.107, 'neu': 0.724, 'pos': 0.169, 'co...",0.9998,1


In [16]:
# Books ordered from highest to lowest compound score; show the first 16.
df3.sort_values('compound', ascending=False).head(16)

Unnamed: 0,book_title,full_review,scores,compound,sentiment
412,The China Study: The Most Comprehensive Study ...,I just finished this and I will probably read ...,"{'neg': 0.079, 'neu': 0.78, 'pos': 0.141, 'com...",1.0,1
2361,The Amazing Power of Deliberate Intent 4-CD: P...,"If you know the ""secret"" The Law of Attraction...","{'neg': 0.035, 'neu': 0.738, 'pos': 0.227, 'co...",1.0,1
406,The Tao of Pooh,Telling us how great your philosophy is? Aweso...,"{'neg': 0.054, 'neu': 0.772, 'pos': 0.175, 'co...",1.0,1
423,Good to Great,"Lana VilmainPHR, M EdHuman Resource Generalist...","{'neg': 0.05, 'neu': 0.734, 'pos': 0.215, 'com...",1.0,1
139,Anyone But You,"Ok, I'm not usually a fan of an older woman/yo...","{'neg': 0.06, 'neu': 0.666, 'pos': 0.274, 'com...",1.0,1
1079,Phantom,This is an amazing book. Ms. Kay has successfu...,"{'neg': 0.08, 'neu': 0.73, 'pos': 0.19, 'compo...",1.0,1
435,Love & Respect: The Love She Most Desires; The...,I've often felt that Christian books on the su...,"{'neg': 0.056, 'neu': 0.705, 'pos': 0.239, 'co...",1.0,1
437,Save The Cat! The Last Book on Screenwriting Y...,"I'm always asking, &#34;How do you do it?&#34;...","{'neg': 0.047, 'neu': 0.757, 'pos': 0.196, 'co...",1.0,1
1075,Stranger In A Strange Land: Library Edition,"Michael Smith is a child born in space, becaus...","{'neg': 0.077, 'neu': 0.761, 'pos': 0.162, 'co...",1.0,1
442,Lucky Man: A Memoir,Michael J. Fox has been one of my favorite cel...,"{'neg': 0.057, 'neu': 0.74, 'pos': 0.203, 'com...",1.0,1


## b)

# 3)

## a)

In [17]:
# Join book metadata with the per-book sentiment, drop the columns that are
# not needed for recommending, and discard books labelled negative (-1).
finalDF = (
    df.merge(df3, how='inner', left_on='Title', right_on='book_title')
    .drop(columns=['book_title', 'scores', 'image', 'previewLink', 'authors', 'categories',
                   'description', 'publisher', 'publishedDate', 'infoLink'])
    .loc[lambda merged: merged['sentiment'] != -1]
    .reset_index(drop=True)
)
finalDF

Unnamed: 0,Title,ratingsCount,combination,full_review,compound,sentiment
0,Whispers of the Wicked Saints,,Julia Thomas finds her life spinning out of co...,Just as predicted the first chapter of the boo...,0.9980,1
1,Dramatica for Screenwriters,,Dramatica for Screenwriters by Armando Saldana...,Beauty is in the bones. This book gives your s...,0.9936,1
2,The Ultimate Guide to Law School Admission: In...,,This collection brings together a distinguishe...,If you have even casually looked into applying...,0.4434,1
3,Beginner's Yoruba (Hippocrene Beginner's Series),1.0,"""Beginner's Yoruba"" is now available with two ...",This is my first encounter with Yoruba and I h...,0.9653,1
4,How to Discipline Kids without Losing Their Lo...,1.0,Imagine... No More Arguing. Imagine... No More...,I purchased this book not knowing much about i...,0.9961,1
...,...,...,...,...,...,...
13810,El color del verano,,Es pleno verano y en la isla de Cuba comienza ...,Since I read the first chapter of the book I w...,0.4836,1
13811,In the First Line of Battle: The 12th Illinois...,,From its first major engagement at Harpers Fer...,It had been traditional that a member of a mil...,-0.0900,0
13812,Is Your Retirement Heading in the Right Direct...,,So many seniors today are living so much longe...,this book was very unrewarding in terms of sol...,0.5994,1
13813,The Awakening and Selected Stories (Modern Lib...,2.0,"WHEN IT FIRST APPEARED IN 1899, THE AWAKENING ...",I actually haven't reached the Awakening yet. ...,0.7338,1


In [18]:
# Baseline for comparison: nearest neighbours drawn from the cleaned
# catalogue in df, with no sentiment filtering.
get_most_similar('Whispers of the Wicked Saints')

29778                           Women in the Wind
38466                        An Independent Woman
119844                 Miss Julia Speaks Her Mind
37156                        At Mrs. Lippincote's
94218     When a Texan Gambles (The Wife Lottery)
Name: Title, dtype: object

In [19]:
# Build a separate vectoriser and matrix for the sentiment-filtered books.
# They get their own names so the module-level ``v`` and ``X`` that
# get_most_similar depends on are not overwritten by this cell.
final_vectorizer = CountVectorizer(stop_words='english')
X_final = final_vectorizer.fit_transform(finalDF['combination'])

# Title -> row index lookup for finalDF.
book2idx = pd.Series(finalDF.index, index=finalDF['Title'])


def get_best_book(book_title, n=5):
    """Return the most similar books among those kept in ``finalDF``.

    ``finalDF`` only contains books whose review sentiment label is not -1,
    so every recommendation comes from that filtered set. Similarity is the
    cosine similarity between rows of ``X_final``.

    Parameters
    ----------
    book_title : str
        Exact title to look up in ``book2idx``.
    n : int, default 5
        Number of similar books to return.

    Returns
    -------
    pandas.Series or str
        The ``Title`` entries of the ``n`` closest books, best match first,
        or the string "Book not found" when the title is unknown.
    """
    if book_title not in book2idx.index:
        return "Book not found"

    # A title that occurs more than once makes the lookup return several
    # positions; use the first so X_final[idx] is always a single row.
    idx = np.atleast_1d(book2idx[book_title])[0]

    # Similarity of every book to the query book, one score per book.
    scores = cosine_similarity(X_final, X_final[idx]).flatten()

    # argsort is ascending, so negate the scores to rank best-first.
    ranked = (-scores).argsort()

    # Remove the query book by position rather than assuming it is ranked
    # first (ties and all-zero vectors break that assumption).
    ranked = ranked[ranked != idx][:n]

    return finalDF['Title'].iloc[ranked]

In [20]:
# Same query, answered from finalDF (books labelled negative are excluded).
get_best_book('Whispers of the Wicked Saints')

9531               When a Texan Gambles (The Wife Lottery)
10647                                       Healing H'Arts
4758                                     My Life in France
13580    It Will Live Forever: Traditional Yosemite Ind...
2669     Inconceivable: A Woman's Triumph over Despair ...
Name: Title, dtype: object

## b)

In [21]:
# Simple interactive prompt: keep recommending until the user enters 'q'.
while True:
    book_title = input("Enter a book title: or hit 'q' to quit")
    if book_title.strip() == 'q':
        print("Goodbye!")
        break

    result = get_best_book(book_title)
    if isinstance(result, str):
        # get_best_book reports an unknown title with a plain message string;
        # show it on its own instead of under an "is similar to" heading.
        print(result)
    else:
        print(f"{book_title} is similar to:\n")
        print(result)
    print("\n")

When a Texan Gambles (The Wife Lottery) is similar to:

0                            Whispers of the Wicked Saints
819                       Night Play (Dark-Hunter, Book 6)
13715    Time Enough for Love: The Lives of Lazarus Lon...
12747                                The Fisherman's Quilt
6797                                       Sister Got Game
Name: Title, dtype: object


The Fisherman's Quilt is similar to:

12283    Stealing Some Time:Volume 1 (Parts 1 and 2)
12296                     Stealing Some Time: Vol. 2
10945            Luba and the Wren (Picture Puffins)
11712                               The Sign of Four
8431             The Red Book (Caldecott Honor Book)
Name: Title, dtype: object


The Sign of Four is similar to:

3112              The Sherlock Holmes Theatre [UNABRIDGED]
12747                                The Fisherman's Quilt
3008     The Rivals (Classic Books on Cassettes Collect...
1633     The Einstein Paradox: And Other Science Myster...
11705              