# SCI-FI BOOK RECOMMENDATION ENGINE: PART III

Now that we have cleaned the dataset and run EDA on it, we can finally implement the recommendation engine.

In [49]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under /kaggle/input.
# os.walk yields nothing for a missing directory, so the loop is then a no-op.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
# Load the book table used by every later cell. The path is relative to the
# working directory, not to the /kaggle/input tree listed above.
df = pd.read_csv("data/scifi_with_cover.csv")

To determine similarity between two books, we generate a new combined feature which includes author name, genres and description.

In [50]:
def combine_features(record):
    """Build the text feature used to compare two books.

    Parameters
    ----------
    record : row-like object
        Must expose ``author_name``, ``genres`` and ``book_description``
        attributes (e.g. a DataFrame row passed by ``df.apply(..., axis=1)``).

    Returns
    -------
    str or None
        ``"<author> <genres> <description>"``, where the author name is
        lower-cased and its spaces are replaced by underscores so the full
        name forms one whitespace-free word. ``None`` when a field is missing
        or the author name is not a string; the offending record is printed.
    """
    try:
        # e.g. "William Gibson" -> "william_gibson"
        author_name = record.author_name.lower().replace(" ", "_")
        return f"{author_name} {record.genres} {record.book_description}"
    except AttributeError:
        # Report the record that could not be processed. The previous handler
        # printed `temp`, a name that is not defined in this function, so a bad
        # record ended in NameError instead of a diagnostic.
        print(record)
        return None

# axis=1 hands each row to combine_features; the result is stored as a new text column.
df["combined_features"] = df.apply(combine_features, axis=1)

In [96]:
# URL of the row(s) whose stripped, lower-cased title equals "snow crash".
df.loc[df.book_title.apply(lambda title: title.strip().lower()) == "snow crash", "url"]

1    https://www.goodreads.com/book/show/40651883-s...
Name: url, dtype: object

In [95]:
# Stripped, lower-cased title of the row labelled 1.
df.loc[1, "book_title"].strip().lower()

'snow crash'

Afterwards, we tokenize the combined features and calculate the cosine similarity between the resulting vectors.

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Bag-of-words vectoriser that drops scikit-learn's built-in English stop words.
cv = CountVectorizer(stop_words="english")

# One row of token counts per book, then the pairwise cosine similarity between those rows.
count_matrix= cv.fit_transform(df["combined_features"])
cosine_similarity_matrix = cosine_similarity(count_matrix)

As an example, we choose the classic cyberpunk book Neuromancer.

In [22]:
# The surrounding text and the recorded outputs use Neuromancer as the example.
book_user_like = "Neuromancer"

# Compare stripped, lower-cased titles, as the notebook's other lookups do.
title_key = book_user_like.strip().lower()
title_mask = (df.book_title.str.strip().str.lower() == title_key).to_numpy()
matching_rows = np.flatnonzero(title_mask)
if matching_rows.size == 0:
    raise ValueError(f"No book titled {book_user_like!r} found in the dataset")

# book_ID must be a single row position: it indexes the similarity matrix, whose
# rows follow the row order of df. The previous `.id.values` produced an array,
# which turned book_record into a DataFrame and made `.genres.split` fail.
book_ID = int(matching_rows[0])
book_record = df.iloc[book_ID]
book_genres = set(book_record.genres.split(" "))

In [38]:
# Header of the markdown table: a leading blank line, the column titles, the separator row.
src = "\n".join(
    [
        "",
        "| Cover | Book Title | Author | Description | Rating |",
        "| --- | --- | --- | --- | --- |",
    ]
)

In [39]:
# Fields of the row labelled 0, in table-column order.
temp = df.loc[0, ["cover", "book_title", "author_name", "book_description", "rating_score"]].values

# Markdown image syntax is ![alt](url). The closing parenthesis was missing, so the
# rest of the row was swallowed into the image target instead of forming table cells.
str_ = f"| ![]({temp[0]}) | {temp[1]} | {temp[2]} | {temp[3]} | {temp[4]} |"
# NOTE: this appends to `src`, so re-running the cell adds the row again.
src = f"{src}\n{str_}"
src

'\n| Cover | Book Title | Author | Description | Rating |\n| --- | --- | --- | --- | --- |\n| ![](https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1554437249l/6088007._SY475_.jpg | Neuromancer | William Gibson | Hotwired to the leading edges of art and technology, Neuromancer is a cyberpunk, science fiction masterpiece—a classic that ranks with 1984 and Brave New World as one of the twentieth century’s most potent visions of the future.The Matrix is a world within the world, a global consensus-hallucination, the representation of every byte of data in cyberspace...Henry Dorsett Case was the sharpest data-thief in the business, until vengeful former employees crippled his nervous system. But now a new and very mysterious employer recruits him for a last-chance run. The target: an unthinkably powerful artificial intelligence orbiting Earth in service of the sinister Tessier-Ashpool business clan. With a dead man riding shotgun and Molly, mirror-eyed street-samurai, t

In [5]:
# Pair every row position with its similarity to the chosen book, then rank best-first.
similar_books = [
    (position, score)
    for position, score in enumerate(cosine_similarity_matrix[book_ID])
]
sorted_similar_books = sorted(similar_books, key=lambda pair: pair[1], reverse=True)
sorted_similar_books[:20]

[(0, 0.9999999999999983),
 (2769, 0.3002781641765874),
 (2419, 0.29497166232283906),
 (309, 0.25279945218534405),
 (6873, 0.2507132682112035),
 (8262, 0.24842360136324754),
 (398, 0.24806946917841688),
 (306, 0.243599382882345),
 (1681, 0.23717082451262847),
 (389, 0.23669053416557545),
 (5111, 0.23669053416557545),
 (1095, 0.2344036154692477),
 (300, 0.23080703932272442),
 (8219, 0.23002185311411805),
 (216, 0.2271720556255607),
 (8980, 0.22594340036404523),
 (5444, 0.22554842722407936),
 (3054, 0.22537446792760438),
 (10, 0.22427130678626514),
 (359, 0.2231122741860899)]

In [6]:
limit = 20
print(book_record.book_title, book_record.author_name, book_record.genres)
# Rank 0 is expected to be the queried book itself, so it is skipped;
# the loop therefore prints limit - 1 recommendations.
for book_index, _score in sorted_similar_books[1:limit]:
    # The first element of each pair is a row position in the similarity matrix,
    # i.e. a row position in df. Fetch that row once by position rather than
    # filtering the whole frame on the `id` column three times per iteration.
    candidate = df.iloc[book_index]
    recommendation = candidate["book_title"]
    author = candidate["author_name"]
    genres = set(candidate["genres"].split(" "))
    common_genres = book_genres.intersection(genres)
    print(f"Book: {recommendation.upper()} Author:{author.upper()} and Genres: {common_genres}")

Neuromancer William Gibson fiction cyberpunk classics dystopia novels audiobook
Book: BRAVE NEW WORLD REVISITED Author:ALDOUS HUXLEY and Genres: {'dystopia', 'fiction', 'classics'}
Book: BRAVE NEW WORLD / BRAVE NEW WORLD REVISITED Author:ALDOUS HUXLEY and Genres: {'dystopia', 'novels', 'fiction', 'classics'}
Book: LAST TANGO IN CYBERSPACE Author:STEVEN KOTLER and Genres: {'novels', 'cyberpunk', 'fiction'}
Book: THRICE UPON A TIME Author:JAMES P. HOGAN and Genres: {'novels', 'audiobook', 'fiction'}
Book: CHILDREN OF THE NEW WORLD Author:ALEXANDER        WEINSTEIN and Genres: {'dystopia', 'audiobook', 'fiction'}
Book: NULL STATES Author:MALKA ANN OLDER and Genres: {'dystopia', 'audiobook', 'cyberpunk', 'fiction'}
Book: SHADOWRUN 14: NOSFERATU Author:CARL SARGENT and Genres: {'cyberpunk', 'dystopia', 'fiction'}
Book: THE STAND Author:STEPHEN KING and Genres: {'dystopia', 'audiobook', 'fiction', 'classics'}
Book: WILLIAM GIBSON'S NEUROMANCER: THE GRAPHIC NOVEL Author:WILLIAM GIBSON and Gen

We can implement a weighted similarity matrix to increase the importance of certain terms, such as the author name and the genres. However, finding the corresponding terms in `count_matrix` and modifying their values for every record would take too much time. An example implementation is as follows:

```python
for i in range(df.shape[0]):
    weighted_terms = df.loc[i].genres
    weighted_terms = weighted_terms.split(" ")
    index = [ cv.get_feature_names().index(item.lower()) for item in weighted_terms]
    for k in index:
        count_matrix[i,k] *= 100
```

There are two loops nested inside another loop, so running this would be a nightmare in terms of time. For this reason, we preallocate a matrix whose columns are all the authors and genres and whose rows are the weighted indicator vectors of each record. Using this strategy, we can construct the weight matrix and combine it with the count matrix of the descriptions.

In [97]:
from scipy.sparse import csr_matrix, hstack

def combine_name(author_name):
    """Return the author name lower-cased, with every space turned into an underscore."""
    lowered = author_name.lower()
    return "_".join(lowered.split(" "))


# Value written for every author/genre a book has; arbitrary and can be adjusted.
AUTHOR_GENRE_WEIGHT = 100

# Collect every distinct genre token and every underscore-joined author name.
uniq_genres_authors = set()
for genre_list in df.genres:
    uniq_genres_authors.update(genre_list.split(" "))
uniq_genres_authors.update(combine_name(author) for author in df.author_name)

# Number the features in sorted order: enumerating a set of strings directly
# gives a column order that can differ from one kernel session to the next.
feature_dict = {item: index for index, item in enumerate(sorted(uniq_genres_authors))}

# One row per book, one column per author/genre feature.
weight_matrix = np.zeros((df.shape[0], len(feature_dict)))

# Walk the rows by position so matrix row k always belongs to the k-th row of df;
# the previous `df.loc[index]` lookup only worked with a 0..n-1 index.
for index, (author, genre_list) in enumerate(zip(df.author_name, df.genres)):
    columns = [feature_dict[combine_name(author)]]
    columns.extend(feature_dict[item] for item in genre_list.split(" "))
    weight_matrix[index, columns] = AUTHOR_GENRE_WEIGHT



In [135]:
book_user_like = "Count Zero"
book_user_like = book_user_like.strip().lower()
# NOTE(review): rebinding book_user_like here does not refresh book_ID,
# book_record or book_genres; this cell only displays the matching rows.
# The former `type(...)` line was evaluated and discarded, so it has been dropped.
df[df.book_title.apply(lambda x: x.strip().lower()) == book_user_like]

Unnamed: 0,book_title,author_name,edition_language,rating_score,rating_votes,review_number,book_description,year_published,genres,url,id,cover,combined_features
2,Count Zero,William Gibson,English,4.01,46114,1219,A corporate mercenary wakes in a reconstructed...,1986,cyberpunk fiction dystopia novels canada american,https://www.goodreads.com/book/show/22200.Coun...,2,https://i.gr-assets.com/images/S/compressed.ph...,william_gibson cyberpunk fiction dystopia nove...


In [8]:
# A fresh vectoriser fitted on the descriptions only (this rebinds the earlier `cv`).
cv = CountVectorizer(stop_words="english")
desc_count_matrix= cv.fit_transform(df["book_description"])

In [9]:
# Convert the author/genre weights to CSR form (rebinding the same name), then place
# them side by side with the description counts: weight columns first.
weight_matrix = csr_matrix(weight_matrix)
combined_matrix = hstack((weight_matrix, desc_count_matrix))
type(combined_matrix)

scipy.sparse.coo.coo_matrix

In [10]:
# Pairwise similarities on the combined matrix; this overwrites the earlier matrix of the same name.
cosine_similarity_matrix = cosine_similarity(combined_matrix)

In [11]:
# (row position, similarity to the chosen book) pairs, in row order.
similar_books = list(enumerate(cosine_similarity_matrix[book_ID]))
# Rank a copy best-first so `similar_books` keeps its original order.
sorted_similar_books = list(similar_books)
sorted_similar_books.sort(key=lambda entry: entry[1], reverse=True)
sorted_similar_books[:20]

[(0, 0.9999999999999992),
 (1, 0.8561360847556276),
 (84, 0.8009772968393042),
 (18, 0.8003460503559313),
 (9, 0.7135356941374834),
 (2, 0.7135112341121717),
 (5, 0.7134959505354028),
 (3070, 0.713449137621607),
 (4967, 0.713449094744709),
 (21, 0.7134175285504063),
 (293, 0.7134023058751567),
 (2599, 0.7133941375578393),
 (51, 0.7133180280561168),
 (1885, 0.7133119432491127),
 (180, 0.7133036769849712),
 (4, 0.713236553045603),
 (5386, 0.7131407115420931),
 (13, 0.7131387199706523),
 (197, 0.7128968689411321),
 (130, 0.7127726553731508)]

In [12]:
limit = 20
print(book_record.book_title, book_record.author_name, book_record.genres)
# The top-ranked entry is expected to be the queried book itself and is skipped,
# so limit - 1 recommendations are printed.
for book_index, _score in sorted_similar_books[1:limit]:
    # Each ranked pair carries a row position of the similarity matrix, which is
    # also a row position of df. One positional lookup replaces three full-frame
    # filters on the `id` column.
    candidate = df.iloc[book_index]
    recommendation = candidate["book_title"]
    author = candidate["author_name"]
    genres = set(candidate["genres"].split(" "))
    common_genres = book_genres.intersection(genres)
    print(f"Book: {recommendation.upper()} Author:{author.upper()} and Genres: {common_genres}")

Neuromancer William Gibson fiction cyberpunk classics dystopia novels audiobook
Book: SNOW CRASH Author:NEAL STEPHENSON and Genres: {'audiobook', 'novels', 'cyberpunk', 'fiction', 'dystopia', 'classics'}
Book: AGENCY Author:WILLIAM GIBSON and Genres: {'audiobook', 'novels', 'cyberpunk', 'fiction', 'dystopia'}
Book: THE PERIPHERAL Author:WILLIAM GIBSON and Genres: {'audiobook', 'novels', 'cyberpunk', 'fiction', 'dystopia'}
Book: VIRTUAL LIGHT Author:WILLIAM GIBSON and Genres: {'dystopia', 'novels', 'cyberpunk', 'fiction'}
Book: COUNT ZERO Author:WILLIAM GIBSON and Genres: {'dystopia', 'novels', 'cyberpunk', 'fiction'}
Book: DO ANDROIDS DREAM OF ELECTRIC SHEEP Author:PHILIP K. DICK and Genres: {'novels', 'cyberpunk', 'fiction', 'dystopia', 'classics'}
Book: THE CITY AND THE STARS Author:ARTHUR C. CLARKE and Genres: {'audiobook', 'novels', 'fiction', 'dystopia', 'classics'}
Book: THE END OF ETERNITY Author:ISAAC ASIMOV and Genres: {'audiobook', 'novels', 'fiction', 'dystopia', 'classics'}

In [34]:
# Cover URLs of up to 10 randomly chosen books.
# A seeded generator makes the sample reproducible across runs, and
# replace=False prevents the same book from being drawn twice.
rng = np.random.default_rng(42)
random_books = rng.choice(df.shape[0], size=min(10, df.shape[0]), replace=False)
df.iloc[random_books].cover.values

array(['https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1348917674l/2128197.jpg',
       'https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1386924475l/216215.jpg',
       'https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1630637257l/960._SX318_.jpg',
       'https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1470040819l/29776927._SY475_.jpg',
       'https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1330759845l/13507967.jpg',
       'https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1388210968l/84119.jpg',
       'https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1360807066l/16102412.jpg',
       'https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1388183880l/72989.jpg',
       'https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1438813494l/7193226._SY475_.jpg',
       'https://i.gr-assets.com/images/S/compr

In [48]:
from time import perf_counter

# Timing scaffold: nothing runs between the two perf_counter() calls, so the
# displayed value is only the gap between two consecutive timer reads.
start_time = perf_counter()
finish_time = perf_counter()
passed_time = finish_time - start_time
# Elapsed seconds, rounded to 4 decimal places.
round(passed_time,4)

0.0001