# 1. Data

In [None]:
import pandas as pd
import numpy as np
import json
from surprise import Dataset
from surprise import Reader
from surprise import KNNWithMeans
from surprise import accuracy
from surprise.model_selection import train_test_split as surprise_split
from surprise.model_selection import GridSearchCV


from sklearn.metrics.pairwise import cosine_similarity
from keras.utils.np_utils import to_categorical
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error
from keras.models import Model, Sequential
from keras.layers import Embedding, Flatten, Input, Dropout, Dense, BatchNormalization, dot, concatenate
from keras.optimizers import Adam, SGD
from keras.utils.vis_utils import model_to_dot
from IPython.display import SVG
from tqdm import tqdm

In [None]:
# Locations of the filtered user-rating table and the per-book detail table
ratings_path = '../data/user_rating_book_filter_per_30.csv'
info_path = '../data/info_book_detail_per_30.csv'

# Read both CSV files into DataFrames
df_ratings, df_info = (pd.read_csv(csv_path) for csv_path in (ratings_path, info_path))

In [None]:
# Display the raw ratings table as the cell output
df_ratings

Unnamed: 0,url_book,book_title,username,user_url,user_rating,book_id,user_id
0,https://www.goodreads.com/book/show/2784.Ways_...,Ways of Seeing,Trevor,https://www.goodreads.com/review/list/175635-t...,5.0,0,0
1,https://www.goodreads.com/book/show/4214.Life_...,Life of Pi,Trevor,https://www.goodreads.com/review/list/175635-t...,2.0,1,0
2,https://www.goodreads.com/book/show/1168341.Ph...,Physics of the Impossible,Trevor,https://www.goodreads.com/review/list/175635-t...,4.0,2,0
3,https://www.goodreads.com/book/show/46223297-p...,Permanent Record,Trevor,https://www.goodreads.com/review/list/175635-t...,5.0,3,0
4,https://www.goodreads.com/book/show/1362.The_H...,The Histories,Trevor,https://www.goodreads.com/review/list/175635-t...,5.0,4,0
...,...,...,...,...,...,...,...
66128,https://www.goodreads.com/book/show/2493.The_T...,The Time Machine,Whanthatapril,https://www.goodreads.com/user/show/18086226-w...,3.0,79,1181
66129,https://www.goodreads.com/book/show/52357.Beowulf,Beowulf,Whanthatapril,https://www.goodreads.com/user/show/18086226-w...,4.0,78,1181
66130,https://www.goodreads.com/book/show/2612.The_T...,The Tipping Point: How Little Things Can Make ...,Whanthatapril,https://www.goodreads.com/user/show/18086226-w...,3.0,6,1181
66131,https://www.goodreads.com/book/show/139855.Kno...,Knowing God,Whanthatapril,https://www.goodreads.com/user/show/18086226-w...,5.0,2153,1181


In [None]:
# Keep one rating per (book, user) pair, then discard the URL columns that are not needed
df_ratings = (
    df_ratings
    .drop_duplicates(subset=['book_id', 'user_id'])
    .drop(columns=['url_book', 'user_url'])
)

# Count the distinct books and users that remain
num_books = df_ratings['book_id'].unique().size
num_users = df_ratings['user_id'].unique().size
print('Number of books:', num_books)
print('Number of users:', num_users)

Number of books: 2196
Number of users: 1182


# 2. Rating Prediction

## 2.1. KNN-based Collaborative Filtering



In [None]:
# Re-index book_id and user_id onto the contiguous range [0, n - 1] to simplify later computations
sorted_book_ids = df_ratings['book_id'].sort_values().unique()
sorted_user_ids = df_ratings['user_id'].sort_values().unique()
map_book = dict(zip(sorted_book_ids, range(num_books)))
map_user = dict(zip(sorted_user_ids, range(num_users)))

# Attach the re-indexed ids next to the original ones
df_ratings['book_id_mapped'] = df_ratings['book_id'].map(map_book)
df_ratings['user_id_mapped'] = df_ratings['user_id'].map(map_user)
df_ratings.head()

Unnamed: 0,book_title,username,user_rating,book_id,user_id,book_id_mapped,user_id_mapped
0,Ways of Seeing,Trevor,5.0,0,0,0,0
1,Life of Pi,Trevor,2.0,1,0,1,0
2,Physics of the Impossible,Trevor,4.0,2,0,2,0
3,Permanent Record,Trevor,5.0,3,0,3,0
4,The Histories,Trevor,5.0,4,0,4,0


In [None]:
# Load the (user, item, rating) triples into a Surprise Dataset and split into train / test sets
rating_columns = ['user_id_mapped', 'book_id_mapped', 'user_rating']
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df_ratings[rating_columns], reader)

# 15% of the ratings are held out; the seed is fixed so the split is reproducible
train_set, test_set = surprise_split(data, test_size=0.15, random_state=10)

- Ta thấy trong thực tế, sẽ có những user "dễ tính" thường cho rating cao và ngược lại những user "khó tính" thường cho rating thấp. Do đó cần phải đưa tất cả user về cùng một mức độ bằng cách loại bỏ bias của từng user. Để làm được điều này, chỉ cần trừ rating của user đó đối với từng book, cho trung bình rating mà user đó đánh giá với tất cả book. Với thư viện `surprise`, dùng model `KNNWithMeans` cho việc này.

- Bên cạnh đó, có 2 hướng để dự đoán rating là `user-based` và `item-based`. Ở đây chúng ta sẽ dùng `item-based` để tiện cho việc recommend bên dưới: Để tính rating R của user U cho item I, ta tìm tất cả những item đã được rate bởi U, và chọn ra N item tương đồng nhất với I để tính toán rating (sự tương đồng dựa vào rating vector)

In [None]:
# Cosine similarity, computed between items (item-based) rather than between users
options = dict(name="cosine", user_based=False)
model_knn = KNNWithMeans(sim_options=options)

# Fit on the training split; the fitted model is the cell's displayed value
model_knn.fit(train_set)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7fb00ae0abe0>

In [None]:
# Predict ratings for the held-out test set
predictions = model_knn.test(test_set)

# Report MAE and MSE, then RMSE last so its value is the cell's displayed result
for report_metric in (accuracy.mae, accuracy.mse):
    report_metric(predictions)
accuracy.rmse(predictions)

MAE:  0.7039
MSE: 0.8145
RMSE: 0.9025


0.9024956937228628

In [None]:
# Put the true test-set ratings side by side with the model's estimates
actual = [true_rating for _uid, _iid, true_rating in test_set]
predicted = [prediction.est for prediction in predictions]

df_pred = pd.DataFrame({'Actual': actual, 'Predicted': predicted})
df_pred.head(10)

Unnamed: 0,Actual,Predicted
0,3.0,3.664673
1,4.0,4.038851
2,5.0,3.887711
3,4.0,3.999845
4,5.0,3.676571
5,4.0,3.508675
6,5.0,3.646334
7,3.0,3.859623
8,4.0,3.852288
9,5.0,3.542794


## 2.2. Matrix Factorization and Deep Learning

In [None]:
# Separate the low ratings (2 stars or fewer) from the remaining ratings
low_rating_mask = df_ratings['user_rating'] <= 2
high_rating_mask = df_ratings['user_rating'] > 2
df_ratings_smaller_2 = df_ratings[low_rating_mask]
df_ratings_greater_2 = df_ratings[high_rating_mask]

Vì dữ liệu về những quyển sách bị người dùng đánh giá kém (từ 2 sao trở xuống) rất ít nên ta sẽ giữ lại nhiều dữ liệu loại này hơn trong tập train: chỉ tách 5% nhóm này sang tập test, so với 20% ở nhóm còn lại.

In [None]:
# Same seed and shuffling for both groups; only the held-out fraction differs
split_options = dict(random_state=42, shuffle=True)
train_1, test_1 = train_test_split(df_ratings_smaller_2, test_size=0.05, **split_options)
train_2, test_2 = train_test_split(df_ratings_greater_2, test_size=0.2, **split_options)

# Stack the two groups back together with a fresh 0..n-1 index
train = pd.concat([train_1, train_2], ignore_index=True)
test = pd.concat([test_1, test_2], ignore_index=True)

In [None]:
# Report the (rows, columns) of each split
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

Train shape: (53569, 5)
Test shape: (12308, 5)


In [None]:
# Two-branch rating model: an MLP branch over concatenated user/book embeddings and a
# matrix-factorisation (dot-product) branch, merged by a final Dense layer.
# Size of every embedding vector (used by both branches)
latent_dim = 10

# Inputs
# Each input is a single integer id per sample
book_input = Input(shape=[1],name='book-input')
user_input = Input(shape=[1], name='user-input')

# MLP Embeddings
# Embedding tables have one more row than the number of distinct ids
book_embedding_mlp = Embedding(num_books + 1, latent_dim, name='book-embedding-mlp')(book_input)
book_vec_mlp = Flatten(name='flatten-book-mlp')(book_embedding_mlp)

user_embedding_mlp = Embedding(num_users + 1, latent_dim, name='user-embedding-mlp')(user_input)
user_vec_mlp = Flatten(name='flatten-user-mlp')(user_embedding_mlp)

# MF Embeddings
# Separate embedding tables from the MLP branch, so the two branches do not share weights
book_embedding_mf = Embedding(num_books + 1, latent_dim, name='book-embedding-mf')(book_input)
book_vec_mf = Flatten(name='flatten-book-mf')(book_embedding_mf)

user_embedding_mf = Embedding(num_users + 1, latent_dim, name='user-embedding-mf')(user_input)
user_vec_mf = Flatten(name='flatten-user-mf')(user_embedding_mf)

# MLP layers
# Book and user vectors are concatenated, then passed through two
# Dense -> BatchNormalization -> Dropout stages (100 and 50 units)
concat = concatenate([book_vec_mlp, user_vec_mlp], axis = 1, name='concat')
concat_dropout = Dropout(0.2)(concat)
fc_1 = Dense(100, name='fc-1', activation='relu')(concat_dropout)
fc_1_bn = BatchNormalization(name='batch-norm-1')(fc_1)
fc_1_dropout = Dropout(0.2)(fc_1_bn)
fc_2 = Dense(50, name='fc-2', activation='relu')(fc_1_dropout)
fc_2_bn = BatchNormalization(name='batch-norm-2')(fc_2)
fc_2_dropout = Dropout(0.2)(fc_2_bn)

# Prediction from both layers
# The MLP branch ends in a 10-unit layer; the MF branch is the dot product of the two MF vectors
pred_mlp = Dense(10, name='pred-mlp', activation='relu')(fc_2_dropout)
pred_mf = dot([book_vec_mf, user_vec_mf], axes = 1, name='pred-mf')
combine_mlp_mf = concatenate([pred_mf, pred_mlp], axis = 1, name='combine-mlp-mf')

# Final prediction
# Single output unit; the ReLU activation keeps the predicted rating non-negative
result = Dense(1, name='result', activation='relu')(combine_mlp_mf)

# NOTE: the model's inputs are ordered [user, book], although the book input is declared first above
model = Model([user_input, book_input], result)
# Plain SGD optimising mean absolute error
model.compile(optimizer=SGD(lr=0.003), loss='mean_absolute_error')
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
book-input (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
user-input (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
book-embedding-mlp (Embedding)  (None, 1, 10)        21970       book-input[0][0]                 
__________________________________________________________________________________________________
user-embedding-mlp (Embedding)  (None, 1, 10)        11830       user-input[0][0]                 
____________________________________________________________________________________________

In [None]:
# Either train from scratch or load the weights of a model already trained for 400 epochs.
# To train instead of loading, uncomment the following line (inputs are ordered [user ids, book ids]):
#history = model.fit([train.user_id, train.book_id], train.user_rating, epochs=400)
model.load_weights("../model/model_MF.h5")

In [None]:
# Predict ratings for the held-out split (rounded to 2 decimals) and evaluate them
y_hat = np.round(model.predict([test.user_id, test.book_id]), decimals=2)
y_true = test.user_rating

# RMSE is derived from the MSE directly: the `squared=False` argument of
# mean_squared_error is deprecated and absent from newer scikit-learn releases.
mse = mean_squared_error(y_true, y_hat)
print("MAE:", mean_absolute_error(y_true, y_hat))
print("MSE:", mse)
print("RMSE:", np.sqrt(mse))

MAE: 0.6196100126430126
MSE: 0.6222541102835032
RMSE: 0.788830850235653


# 3. Book Recommendation

In [None]:
import nltk
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import RegexpTokenizer
from textblob import TextBlob
import re
import string
import random
import json
from collections import Counter
%matplotlib inline

In [None]:
# Normalise the column names to lower case
df_info.columns = df_info.columns.str.lower()

# Keep only the columns used below. The explicit .copy() makes df_info an independent
# frame, so later column assignments (e.g. df_info['genres'] = ...) do not trigger
# pandas' SettingWithCopyWarning about writing to a slice of another DataFrame.
df_info = df_info[["book_id", "book_title", "author_name", "description", "rating", "genres"]].copy()
df_info.head()

Unnamed: 0,book_id,book_title,author_name,description,rating,genres
0,0,Ways of Seeing,John Berger,John Berger’s Classic Text on ArtJohn Berger's...,3.86,Nonfiction
1,1,Life of Pi,Yann Martel,Life of Pi is a fantasy adventure novel by Yan...,3.92,Cultural|Literature|Classics|Contemporary|Fict...
2,2,Physics of the Impossible,Michio Kaku,A fascinating exploration of the science of th...,4.07,Nonfiction|Audiobook
3,3,Permanent Record,Edward Snowden,"Edward Snowden, the man who risked everything ...",4.31,Nonfiction|Audiobook
4,4,The Histories,Herodotus,One of the masterpieces of classical literatur...,3.98,Cultural|Literature|Classics|Nonfiction|Histor...


In [None]:
# Decode the JSON-encoded genres column
genres = df_info['genres'].apply(json.loads)


def get_top_genres(x):
    """Return the (at most) 10 most common entries of ``x``, most common first."""
    return [entry for entry, _count in Counter(x).most_common(10)]


# Keep the 10 top genres of every book
genres = genres.apply(get_top_genres)
genres

0       [Nonfiction, Writing, Art, Design, Philosophy,...
1       [Fiction, Fantasy, Classics, Adventure, Contem...
2       [Nonfiction, Science, Space, Science Fiction, ...
3       [Nonfiction, Politics, Autobiography, History,...
4       [Classics, Nonfiction, History, Cultural, Hist...
                              ...                        
2191    [Science Fiction, Fiction, Thriller, Fantasy, ...
2192    [Nonfiction, Reference, Cultural, European Lit...
2193    [Leadership, Nonfiction, Business, Buisness, S...
2194    [Nonfiction, History, Economics, Business, Phi...
2195    [Nonfiction, Business, Self Help, Buisness, Cu...
Name: genres, Length: 2196, dtype: object

In [None]:
# The 15 genres that occur most often across the whole genres column
top_genres = set(genres.explode().value_counts().head(15).index)


def intersect(x):
    """Return the members of the set ``x`` that also belong to ``top_genres``."""
    return x & top_genres


# For every book, keep only the genres that are among the global top 15
genres = genres.apply(set).apply(intersect).apply(list)
df_info['genres'] = genres
df_info.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


Unnamed: 0,book_id,book_title,author_name,description,rating,genres
0,0,Ways of Seeing,John Berger,John Berger’s Classic Text on ArtJohn Berger's...,3.86,[Nonfiction]
1,1,Life of Pi,Yann Martel,Life of Pi is a fantasy adventure novel by Yan...,3.92,"[Cultural, Literature, Classics, Contemporary,..."
2,2,Physics of the Impossible,Michio Kaku,A fascinating exploration of the science of th...,4.07,"[Nonfiction, Audiobook]"
3,3,Permanent Record,Edward Snowden,"Edward Snowden, the man who risked everything ...",4.31,"[Nonfiction, Audiobook]"
4,4,The Histories,Herodotus,One of the masterpieces of classical literatur...,3.98,"[Cultural, Literature, Classics, Nonfiction, H..."


## 3.1. KNN-based Collaborative Filtering

Sử dụng model_knn ở mục 2.1, ta có thể gợi ý sách cho người dùng bằng cách chọn ra N sách có `vector rating` tương đồng nhất với sách đang tìm. `Vector rating` của một sách là một vector tổng hợp rating của tất cả các user cho sách đó và đã được chuẩn hoá.

In [None]:
# One row per book: its re-indexed id and its title
df_books = df_ratings[['book_id_mapped', 'book_title']].drop_duplicates(subset=['book_id_mapped'])
book_ids = df_books['book_id_mapped'].tolist()
book_titles = df_books['book_title'].tolist()

# title -> id. When several books share a title, the last one listed wins.
hashmap = dict(zip(book_titles, book_ids))

# id -> title, built straight from the table rather than by inverting `hashmap`:
# inverting would drop the ids of books whose title is shared with another book,
# and looking such an id up later would raise KeyError.
reverse_hashmap = dict(zip(book_ids, book_titles))
print(hashmap)



In [None]:
def recommend(model, title, k):
    """Return the titles of the ``k`` nearest neighbours of a book in a fitted Surprise KNN model.

    Args:
        model: A fitted item-based Surprise KNN algorithm (it must expose
            ``trainset`` and ``get_neighbors``).
        title: Title of the query book, looked up in the module-level ``hashmap``.
        k: Number of neighbours to return.

    Returns:
        A list of book titles, or an empty list when ``title`` is unknown or
        the book does not occur in the model's training set.
    """
    if title not in hashmap:
        return []
    raw_iid = hashmap[title]

    # Surprise's get_neighbors works with *inner* ids (assigned by the trainset),
    # not with the raw ids stored in the DataFrame, so translate in both directions.
    trainset = model.trainset
    try:
        inner_iid = trainset.to_inner_iid(raw_iid)
    except ValueError:
        # The book only appears in held-out ratings, so the model knows no neighbours for it.
        return []

    neighbor_inner_ids = model.get_neighbors(inner_iid, k)
    return [reverse_hashmap[trainset.to_raw_iid(inner)] for inner in neighbor_inner_ids]

In [None]:
# Example query: the 20 books closest to the given title according to the KNN model
book_title = 'A Brief History of Time'
rec_books = recommend(model=model_knn, title=book_title, k=20)
rec_books

['Permanent Record',
 'Seven Brief Lessons on Physics',
 'At Home: A Short History of Private Life',
 'Caste: The Origins of Our Discontents',
 'Frankenstein: The 1818 Text',
 'David and Goliath: Underdogs, Misfits, and the Art of Battling Giants',
 'Nudge: Improving Decisions About Health, Wealth, and Happiness',
 'The Happiness Hypothesis: Finding Modern Truth in Ancient Wisdom',
 'The Curious Incident of the Dog in the Night-Time',
 'Talking to Strangers: What We Should Know About the People We Don’t Know',
 'The Rise and Fall of the Third Reich: A History of Nazi Germany',
 'Made to Stick: Why Some Ideas Survive and Others Die',
 'Why We Sleep: Unlocking the Power of Sleep and Dreams',
 'The Professor and the Madman: A Tale of Murder, Insanity and the Making of the Oxford English Dictionary',
 'The Uninhabitable Earth: Life After Warming',
 "Man's Search for Meaning",
 'Northanger Abbey',
 'Persuasion',
 'Humankind: A Hopeful History',
 "The Omnivore's Dilemma: A Natural History of

In [None]:
# Details of the query book first, followed by the details of every recommended book
is_query_book = df_info['book_title'] == book_title
is_recommended = df_info['book_title'].isin(rec_books)
rec_df = pd.concat([df_info[is_query_book], df_info[is_recommended]])

# Show the full text of every column instead of pandas' truncated view
with pd.option_context('display.max_colwidth', None):
    display(rec_df)

Unnamed: 0,book_id,book_title,author_name,description,rating,genres
64,64,A Brief History of Time,Stephen Hawking,"In the ten years since its publication in 1988, Stephen Hawking's classic work has become a landmark volume in scientific writing, with more than nine million copies in forty languages sold worldwide. That edition was on the cutting edge of what was then known about the origins and nature of the universe. But the intervening years have seen extraordinary advances in the technology of observing both the micro- and the macrocosmic worlds. These observations have confirmed many of Professor Hawking's theoretical predictions in the first edition of his book, including the recent discoveries of the Cosmic Background Explorer satellite (COBE), which probed back in time to within 300,000 years of the universe's beginning and revealed wrinkles in the fabric of space-time that he had projected. Eager to bring to his original text the new knowledge revealed by these observations, as well as his own recent research, Professor Hawking has prepared a new introduction to the book, written an entirely new chapter on wormholes and time travel, and updated the chapters throughout.",4.18,"[Nonfiction, Audiobook, Classics]"
3,3,Permanent Record,Edward Snowden,"Edward Snowden, the man who risked everything to expose the US government’s system of mass surveillance, reveals for the first time the story of his life, including how he helped to build that system and what motivated him to try to bring it down.In 2013, twenty-nine-year-old Edward Snowden shocked the world when he broke with the American intelligence establishment and revealed that the United States government was secretly pursuing the means to collect every single phone call, text message, and email. The result would be an unprecedented system of mass surveillance with the ability to pry into the private lives of every person on earth. Six years later, Snowden reveals for the very first time how he helped to build this system and why he was moved to expose it.Spanning the bucolic Beltway suburbs of his childhood and the clandestine CIA and NSA postings of his adulthood, Permanent Record is the extraordinary account of a bright young man who grew up online—a man who became a spy, a whistleblower, and, in exile, the Internet’s conscience. Written with wit, grace, passion, and an unflinching candor, Permanent Record is a crucial memoir of our digital age and destined to be a classic.",4.31,"[Nonfiction, Audiobook]"
8,8,Seven Brief Lessons on Physics,Carlo Rovelli,"All the beauty of modern physics in fewer than a hundred pages.This is a book about the joy of discovery. A playful, entertaining, and mind-bending introduction to modern physics, it's already a major bestseller in Italy and the United Kingdom. Carlo Rovelli offers surprising—and surprisingly easy to grasp—explanations of general relativity, quantum mechanics, elementary particles, gravity, black holes, the complex architecture of the universe, and the role humans play in this weird and wonderful world. He takes us to the frontiers of our knowledge: to the most minute reaches of the fabric of space, back to the origins of the cosmos, and into the workings of our minds. “Here, on the edge of what we know, in contact with the ocean of the unknown, shines the mystery and the beauty of the world,” Rovelli writes. “And it’s breathtaking.”",3.96,"[Nonfiction, Audiobook]"
14,14,At Home: A Short History of Private Life,Bill Bryson,"“Houses aren’t refuges from history. They are where history ends up.”Bill Bryson and his family live in a Victorian parsonage in a part of England where nothing of any great significance has happened since the Romans decamped. Yet one day, he began to consider how very little he knew about the ordinary things of life as he found it in that comfortable home. To remedy this, he formed the idea of journeying about his house from room to room to “write a history of the world without leaving home.” The bathroom provides the occasion for a history of hygiene; the bedroom, sex, death, and sleep; the kitchen, nutrition and the spice trade; and so on, as Bryson shows how each has figured in the evolution of private life. Whatever happens in the world, he demonstrates, ends up in our house, in the paint and the pipes and the pillows and every item of furniture.(front flap)",3.97,"[Nonfiction, Audiobook, Historical, Cultural]"
16,16,Caste: The Origins of Our Discontents,Isabel Wilkerson,"The Pulitzer Prize–winning, bestselling author of The Warmth of Other Suns examines the unspoken caste system that has shaped America and shows how our lives today are still defined by a hierarchy of human divisions.“As we go about our daily lives, caste is the wordless usher in a darkened theater, flashlight cast down in the aisles, guiding us to our assigned seats for a performance. The hierarchy of caste is not about feelings or morality. It is about power—which groups have it and which do not.”In this brilliant book, Isabel Wilkerson gives us a masterful portrait of an unseen phenomenon in America as she explores, through an immersive, deeply researched narrative and stories about real people, how America today and throughout its history has been shaped by a hidden caste system, a rigid hierarchy of human rankings.Beyond race, class, or other factors, there is a powerful caste system that influences people’s lives and behavior and the nation’s fate. Linking the caste systems of America, India, and Nazi Germany, Wilkerson explores eight pillars that underlie caste systems across civilizations, including divine will, bloodlines, stigma, and more. Using riveting stories about people—including Martin Luther King, Jr., baseball’s Satchel Paige, a single father and his toddler son, Wilkerson herself, and many others—she shows the ways that the insidious undertow of caste is experienced every day. She documents how the Nazis studied the racial systems in America to plan their out-cast of the Jews; she discusses why the cruel logic of caste requires that there be a bottom rung for those in the middle to measure themselves against; she writes about the surprising health costs of caste, in depression and life expectancy, and the effects of this hierarchy on our culture and politics. 
Finally, she points forward to ways America can move beyond the artificial and destructive separations of human divisions, toward hope in our common humanity.",4.59,"[Nonfiction, Audiobook, Cultural]"
17,17,Frankenstein: The 1818 Text,Mary Wollstonecraft Shelley,"Mary Shelley's seminal novel of the scientist whose creation becomes a monsterThis edition is the original 1818 text, which preserves the hard-hitting and politically charged aspects of Shelley's original writing, as well as her unflinching wit and strong female voice. This edition also includes a new introduction and suggestions for further reading by author and Shelley expert Charlotte Gordon, literary excerpts and reviews selected by Gordon and a chronology and essay by preeminent Shelley scholar Charles E. Robinson.",3.81,"[Literature, Classics, Fiction, Fantasy, Novels]"
21,21,"David and Goliath: Underdogs, Misfits, and the Art of Battling Giants",Malcolm Gladwell,"In his #1 bestselling books The Tipping Point, Blink, and Outliers, Malcolm Gladwell has explored the ways we understand and change our world. Now he looks at the complex and surprising ways the weak can defeat the strong, the small can match up against the giant, and how our goals (often culturally determined) can make a huge difference in our ultimate sense of success. Drawing upon examples from the world of business, sports, culture, cutting-edge psychology, and an array of unforgettable characters around the world, David and Goliath is in many ways the most practical and provocative book Malcolm Gladwell has ever written.",3.95,"[Nonfiction, Audiobook]"
22,22,"Nudge: Improving Decisions About Health, Wealth, and Happiness",Richard H. Thaler,"From the winner of the 2017 Nobel Prize in Economics, Richard H. Thaler, and Cass R. Sunstein: a revelatory look at how we make decisionsNew York Times bestsellerNamed a Best Book of the Year by The Economist and the Financial Times Every day we make choices—about what to buy or eat, about financial investments or our children’s health and education, even about the causes we champion or the planet itself. Unfortunately, we often choose poorly. Nudge is about how we make these choices and how we can make better ones. Using dozens of eye-opening examples and drawing on decades of behavioral science research, Nobel Prize winner Richard H. Thaler and Harvard Law School professor Cass R. Sunstein show that no choice is ever presented to us in a neutral way, and that we are all susceptible to biases that can lead us to make bad decisions. But by knowing how people think, we can use sensible “choice architecture” to nudge people toward the best decisions for ourselves, our families, and our society, without restricting our freedom of choice.",3.84,[Nonfiction]
23,23,The Happiness Hypothesis: Finding Modern Truth in Ancient Wisdom,Jonathan Haidt,"In his widely praised book, award-winning psychologist Jonathan Haidt examines the world’s philosophical wisdom through the lens of psychological science, showing how a deeper understanding of enduring maxims-like Do unto others as you would have others do unto you, or What doesn’t kill you makes you stronger-can enrich and even transform our lives.",4.11,"[Nonfiction, Audiobook]"
27,27,The Curious Incident of the Dog in the Night-Time,Mark Haddon,"Christopher John Francis Boone knows all the countries of the world and their capitals and every prime number up to 7,057. He relates well to animals but has no understanding of human emotions. He cannot stand to be touched. And he detests the color yellow.Although gifted with a superbly logical brain, for fifteen-year-old Christopher everyday interactions and admonishments have little meaning. He lives on patterns, rules, and a diagram kept in his pocket. Then one day, a neighbor's dog, Wellington, is killed and his carefully constructive universe is threatened. Christopher sets out to solve the murder in the style of his favourite (logical) detective, Sherlock Holmes. What follows makes for a novel that is funny, poignant and fascinating in its portrayal of a person whose curse and blessing are a mind that perceives the world entirely literally.",3.88,"[Adult Fiction, Mystery, Literature, Classics, Contemporary, Fiction, Novels]"


## 3.2. Content-based Recommendation

- Ý tưởng thực hiện:
    + Việc gợi ý sách cho người đọc được dựa trên 4 tiêu chuẩn chính: Mô tả sách (Description), Thể loại sách (Genres), đánh giá rating từ phía người dùng (user_rating), đánh giá rating trung bình của sách (book_rating)

- Các bước thực hiện:
    + Bước 1: Lấy thông tin những quyển sách mà người dùng đã đánh giá (chỉ lấy những quyển người dùng đánh giá 5 sao, nếu không có thì lấy 4 sao). 
    + Bước 2: Với mỗi quyển sách, ta sẽ gợi ý thêm 10 quyển tương đồng (dựa vào xử lý mô tả sách và thể loại sách) và gom lại thành 1 danh sách
    + Bước 3: Dựa vào thuật toán, ta dự đoán số sao (rating) mà người dùng sẽ đánh giá cho danh sách đó và chọn ra 10 quyển được đánh giá cao nhất cho người dùng

### 3.2.1. Recommend Similar Book

In [None]:
# Fetch the NLTK resources used in this section; remove_stop_words below reads the 'stopwords' corpus
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
def _removeNonAscii(s):
    return "".join(i for i in s if  ord(i)<128)
    
def make_lower_case(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered
    
# English stop words, loaded on first use and then reused: stopwords.words() would
# otherwise be re-read and the set rebuilt for every description passed through .apply().
_ENGLISH_STOP_WORDS = None


def remove_stop_words(text):
    """Return ``text`` with English stop words removed.

    The text is split on whitespace, every token that exactly matches an NLTK
    English stop word is dropped, and the remaining tokens are re-joined with
    single spaces.
    """
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        # Loaded lazily so the NLTK 'stopwords' corpus only needs to exist at first call.
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words("english"))
    return " ".join(word for word in text.split() if word not in _ENGLISH_STOP_WORDS)
    
def remove_punctuation(text):
    """Return the runs of word characters found in ``text``, joined by single spaces."""
    word_tokens = RegexpTokenizer(r'\w+').tokenize(text)
    return " ".join(word_tokens)
    
def remove_html(text):
    """Return ``text`` with every ``<...>`` span (shortest match, within one line) removed."""
    return re.sub('<.*?>', r'', text)
    

# Build the cleaned description used for the TF-IDF features.
# HTML tags are stripped FIRST: remove_punctuation discards '<' and '>', so running
# remove_html after it can never match anything and tag names would stay in the text.
df_info['cleaned_desc'] = (
    df_info['description']
    .apply(remove_html)
    .apply(_removeNonAscii)
    .apply(make_lower_case)
    .apply(remove_stop_words)
    .apply(remove_punctuation)
)

In [None]:
# Preview the first rows, now including the cleaned_desc column
df_info.head()

Unnamed: 0,book_id,book_title,author_name,description,rating,genres,word_count,cleaned_desc
0,0,Ways of Seeing,John Berger,John Berger’s Classic Text on ArtJohn Berger's...,3.86,[Nonfiction],207,john bergers classic text artjohn berger s way...
1,1,Life of Pi,Yann Martel,Life of Pi is a fantasy adventure novel by Yan...,3.92,"[Cultural, Literature, Classics, Contemporary,...",58,life pi fantasy adventure novel yann martel pu...
2,2,Physics of the Impossible,Michio Kaku,A fascinating exploration of the science of th...,4.07,"[Nonfiction, Audiobook]",282,fascinating exploration science impossiblefrom...
3,3,Permanent Record,Edward Snowden,"Edward Snowden, the man who risked everything ...",4.31,"[Nonfiction, Audiobook]",200,edward snowden man risked everything expose us...
4,4,The Histories,Herodotus,One of the masterpieces of classical literatur...,3.98,"[Cultural, Literature, Classics, Nonfiction, H...",95,one masterpieces classical literature historie...


In [None]:
# Look up the df_info rows that belong to a given list of genres
def get_genres_index(genres):
    """Return the df_info index labels of the books tagged with at least one of ``genres``.

    Each label appears once in the returned list, whose order is that of a
    Python ``set``.
    """
    exploded = df_info['genres'].explode()
    is_wanted = exploded.isin(genres).to_numpy()
    matching_labels = exploded.index[is_wanted].to_list()
    return list(set(matching_labels))

def recommend(title, genres, top_n=10):
    """Recommend the books whose descriptions are most similar to that of ``title``.

    Pipeline: keep the books tagged with at least one of ``genres`` -> TF-IDF
    (1- to 3-grams) over ``cleaned_desc`` -> cosine similarity between the
    query book and every candidate -> keep the ``top_n`` best other books.

    NOTE: this definition rebinds the name ``recommend``, replacing the
    KNN-based ``recommend(model, title, k)`` defined in section 3.1.

    Args:
        title: Title of the query book. It must be among the books selected
            by ``genres``; if several books share the title, the first is used.
        genres: Collection of genre names used to pre-filter the candidates.
        top_n: Number of recommendations to return (defaults to 10).

    Returns:
        A tuple ``(rec, sig)``. ``rec`` is a DataFrame (columns book_id,
        book_title, author_name, description, rating, genres) of the
        recommended books, best first. ``sig`` is the matching list of
        ``(row position in the genre-filtered table, similarity score)`` pairs.

    Raises:
        KeyError: If ``title`` is not among the genre-filtered books.

    Side effects:
        The recommendation table is also stored in the module-level global ``rec``.
    """
    global rec

    # Restrict the candidates to the requested genres and renumber the rows 0..n-1
    data = df_info.loc[get_genres_index(genres)].reset_index(drop=True)

    # TF-IDF with n-grams of length 1 to 3
    tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=1, stop_words='english')
    tfidf_matrix = tf.fit_transform(data['cleaned_desc'])

    # Row position of the query book. Taking the first match handles duplicated titles
    # without depending on the integer type pandas happens to return.
    positions = np.flatnonzero((data['book_title'] == title).to_numpy())
    if positions.size == 0:
        raise KeyError(title)
    idx = int(positions[0])

    # Cosine similarity of the query row against every row. Only this one row is
    # needed, so the full n x n similarity matrix is not materialised.
    similarities = cosine_similarity(tfidf_matrix[idx:idx + 1], tfidf_matrix)[0]

    # Rank all books by score, then drop the query book explicitly: it is not
    # guaranteed to be ranked first (e.g. when another book has the same description).
    ranked = sorted(enumerate(similarities), key=lambda pair: pair[1], reverse=True)
    sig = [pair for pair in ranked if pair[0] != idx][:top_n]
    book_indices = [position for position, _score in sig]

    # Details of the recommended books, best first
    rec = data[["book_id", "book_title", "author_name", "description", "rating", "genres"]].iloc[book_indices]

    return rec, sig


In [None]:
# Example: recommend books similar to one title, restricted to that book's own genres
book_example = 'A Brief History of Time'
is_example_book = df_info['book_title'] == book_example
genres_example = df_info[is_example_book]['genres'].values[0]

recommend_books, scores = recommend(title=book_example, genres=genres_example)
recommend_books


Unnamed: 0,book_id,book_title,author_name,description,rating,genres
360,411,Brief Answers to the Big Questions,Stephen Hawking,Stephen Hawking was recognized as one of the g...,4.3,"[Nonfiction, Audiobook, Adult]"
118,122,The Grand Design,Stephen Hawking,THE FIRST MAJOR WORK IN NEARLY A DECADE BY ONE...,4.04,"[Nonfiction, Audiobook]"
10,10,"The Fabric of the Cosmos: Space, Time, and the...",Brian Greene,"From Brian Greene, one of the world’s leading ...",4.11,"[Nonfiction, Audiobook]"
24,24,Chaos: Making a New Science,James Gleick,A work of popular science in the tradition of ...,4.02,"[Nonfiction, Audiobook]"
1658,1953,Eat That Frog!: 21 Great Ways to Stop Procrast...,Brian Tracy,"The legendary Eat That Frog! (more than 450,00...",3.88,"[Nonfiction, Audiobook]"
151,161,The Universe in a Nutshell,Stephen Hawking,"Stephen Hawking’s phenomenal, multimillion-cop...",4.17,"[Nonfiction, Audiobook]"
391,444,Astrophysics for People in a Hurry,Neil deGrasse Tyson,What is the nature of space and time? How do w...,4.08,"[Nonfiction, Audiobook, Adult]"
334,384,The Artist's Way: A Spiritual Path to Higher C...,Julia Cameron,The Artist’s Way is the seminal book on the su...,3.91,[Nonfiction]
1392,1635,Things Fall Apart,Chinua Achebe,More than two million copies of Things Fall Ap...,3.69,"[Cultural, Literature, Novels, Classics, Ficti..."
385,438,A Universe from Nothing: Why There Is Somethin...,Lawrence M. Krauss,Bestselling author and acclaimed physicist Law...,3.93,"[Nonfiction, Audiobook]"


In [None]:
# Similarity scores returned alongside the recommendations, as (row position, score) pairs
scores

[(360, 0.06701252782849218),
 (118, 0.05682015363646507),
 (10, 0.037838194566628056),
 (24, 0.03228998082215758),
 (1658, 0.031731812408608825),
 (151, 0.03145663935814988),
 (391, 0.030002697319774653),
 (334, 0.02901000913592123),
 (1392, 0.028890814445230298),
 (385, 0.027174002477004893)]

### 3.2.2. Recommend Books for Specific User

In [None]:
# Determine a user's favourite genres
def get_user_favorite_genres(user_id):
    """Return up to five genres that occur most often among the books the user rated 4 or higher.

    Args:
        user_id: Value matched against the ``user_id`` column of ``df_ratings``.

    Returns:
        A list of genre names, most frequent first. It is empty when the user
        has no rating of 4 or more, or when none of those books has a genre.
    """
    # Combine both conditions into one mask. Chaining two boolean selections applies a
    # mask built on the full frame to an already filtered frame, which makes pandas
    # reindex the mask and emit a warning.
    liked = (df_ratings.user_id == user_id) & (df_ratings.user_rating >= 4)
    liked_book_ids = df_ratings.loc[liked, 'book_id'].to_list()

    # explode() turns an empty genre list into NaN; drop those so NaN is never
    # counted (and returned) as if it were a genre.
    liked_genres = df_info[df_info.book_id.isin(liked_book_ids)].genres.explode().dropna()
    return [genre for genre, _count in Counter(liked_genres).most_common(5)]

In [None]:
def recommend_for_user(user_id, max_seed_books=20, top_n=10):
    """
    Recommend books for a user.

    Process:
    1. Select the books the user rated 5 (or, if there are none, the books
       rated 4) as seed books.
    2. Sort the seed books by their average rating (``rating`` column of
       ``df_info``) and keep at most ``max_seed_books`` of them.
    3. Get the user's favourite genres via ``get_user_favorite_genres``.
    4. For every seed book that shares at least one genre with the user's
       favourites, call ``recommend`` with the shared genres and pool the
       returned books; seed books without a shared genre are skipped.
    5. Use the previously built MF model (global ``model``) to predict the
       user's rating for each pooled book and keep the ``top_n`` titles with
       the highest predicted rating.

    Args:
        user_id: Value matched against ``df_ratings.user_id``.
        max_seed_books: Maximum number of seed books to expand (default 20).
        top_n: Maximum number of titles to return (default 10).

    Returns:
        ``(titles, favorite_genres)`` where ``titles`` is a list of book
        titles ordered by predicted rating (highest first) and
        ``favorite_genres`` is a set of genres.
        ``(None, None)`` when the user has no rating of 5 or 4.
        ``([], favorite_genres)`` when no seed book produced a candidate.
    """
    # Filter the user's rows once and apply the rating mask on that same
    # frame; chaining df[mask_a][mask_b] on the full frame makes pandas
    # reindex the second mask and emit a UserWarning.
    user_ratings = df_ratings[df_ratings.user_id == user_id]
    liked_book_ids = user_ratings.loc[user_ratings.user_rating == 5, "book_id"].to_list()
    if not liked_book_ids:
        liked_book_ids = user_ratings.loc[user_ratings.user_rating == 4, "book_id"].to_list()
    if not liked_book_ids:
        return None, None

    seed_books = (
        df_info[df_info.book_id.isin(liked_book_ids)]
        .sort_values(by="rating", ascending=False)
        .head(max_seed_books)
    )

    genres_favorite_user = set(get_user_favorite_genres(user_id))

    # Collect the per-seed results and concatenate once after the loop:
    # repeated pd.concat is quadratic, and starting from an empty frame can
    # change the dtype of book_id.
    recommended_frames = []
    seed_pairs = zip(seed_books.book_title, seed_books.genres)
    for title, genres in tqdm(seed_pairs, total=len(seed_books)):
        shared_genres = genres_favorite_user & set(genres)
        if not shared_genres:
            continue
        similar_books, _ = recommend(title, list(shared_genres))
        recommended_frames.append(similar_books)

    # Nothing to score: avoid calling the model on empty input.
    if not recommended_frames:
        return [], genres_favorite_user

    candidates = pd.concat(recommended_frames, ignore_index=True)
    candidates = candidates.drop_duplicates(subset=["book_id", "book_title"])
    if candidates.empty:
        return [], genres_favorite_user

    users = pd.Series([user_id] * len(candidates))
    book_ids = pd.Series(candidates.book_id.to_list())
    predict_ratings = np.round(model.predict([users, book_ids]), decimals=2)
    # NOTE(review): presumably the model output has shape (n, 1); .T[0] takes
    # the single prediction column -- confirm against the model definition.
    predict_ratings = predict_ratings.T[0]

    # Several editions (different book_id) can share one title. Keep the best
    # prediction per title instead of letting a later edition overwrite it.
    best_rating_by_title = {}
    for title, rating in zip(candidates.book_title.values, predict_ratings):
        if title not in best_rating_by_title or rating > best_rating_by_title[title]:
            best_rating_by_title[title] = rating

    ranked = sorted(best_rating_by_title.items(), key=lambda kv: kv[1], reverse=True)
    books_recommend_for_user = [title for title, _ in ranked[:top_n]]

    return books_recommend_for_user, genres_favorite_user
    

In [None]:
# Example: generate recommendations for the user with id 300.
# NOTE: recommend_for_user returns (None, None) when the user has no rating
# of 5 or 4, so both names may be None after this call.
user_id = 300
list_recommend_book, genres_user_read = recommend_for_user(user_id)

  # This is added back by InteractiveShellApp.init_path()
  This is separate from the ipykernel package so we can avoid doing imports until
100%|██████████| 5/5 [00:11<00:00,  2.32s/it]


In [None]:
# Look up the full metadata rows of the recommended titles. Matching is done
# on book_title, so every df_info row that carries a recommended title is shown.
rec_df = df_info[df_info['book_title'].isin(list_recommend_book)]
# Lift the column-width limit so long text columns are displayed in full.
with pd.option_context('display.max_colwidth', None):
    display(rec_df)

Unnamed: 0,book_id,book_title,author_name,description,rating,genres,word_count,cleaned_desc
117,117,How to Stop Worrying and Start Living,Dale Carnegie,"The book 'How to stop worrying &amp; start living' suggest many ways to conquer worry and lead a wonderful life.The book mentions fundamental facts to know about worry and magic formula for solving worry-some situations.Psychologists &amp; Doctors' view:-Worry can make even the most stolid person ill.-Worry may cause nervous breakdown.-Worry can even cause tooth decay-Worry is one of the factors for High Blood Pressure.-Worry makes you tense and nervous and affect the nerves of your stomach.The book suggests basic techniques in analysing worry, step by step, in order to cope up with them.A very interesting feature of the book is 'How to eliminate 50% of your business worries'.The book offers 7 ways to cultivate a mental attitude that will bring you peace and happiness. Also, the golden rule for conquering worry, keeping your energy &amp; spirits high.The book consists of some True Stories which will help the readers in conquering worry to lead you to success in life.The book is full of similar incidences and narrations which will make our readers to understand the situation in an easy way and lead a happy life. 
A must read book for everyone.",4.14,"[Nonfiction, Audiobook]",190,book how stop worrying amp start living suggest many ways conquer worry lead wonderful life the book mentions fundamental facts know worry magic formula solving worry some situations psychologists amp doctors view worry make even stolid person ill worry may cause nervous breakdown worry even cause tooth decay worry one factors high blood pressure worry makes tense nervous affect nerves stomach the book suggests basic techniques analysing worry step step order cope them a interesting feature book how eliminate 50 business worries the book offers 7 ways cultivate mental attitude bring peace happiness also golden rule conquering worry keeping energy amp spirits high the book consists true stories help readers conquering worry lead success life the book full similar incidences narrations make readers understand situation easy way lead happy life must read book everyone
195,195,Wuthering Heights,Emily Brontë,"You can find the redesigned cover of this edition HERE.This best-selling Norton Critical Edition is based on the 1847 first edition of the novel. For the Fourth Edition, the editor has collated the 1847 text with several modern editions and has corrected a number of variants, including accidentals. The text is accompanied by entirely new explanatory annotations.New to the fourth Edition are twelve of Emily Bronte's letters regarding the publication of the 1847 edition of Wuthering Heights as well as the evolution of the 1850 edition, prose and poetry selections by the author, four reviews of the novel, and poetry selections by the author, four reviews of the novel, and Edward Chitham's insightful and informative chronology of the creative process behind the beloved work.Five major critical interpretations of Wuthering Heights are included, three of them new to the Fourth Edition. A Stuart Daley considers the importance of chronology in the novel. J. Hillis Miller examines Wuthering Heights's problems of genre and critical reputation. Sandra M. Gilbert assesses the role of Victorian Christianity plays in the novel, while Martha Nussbaum traces the novel's romanticism. Finally, Lin Haire-Sargeant scrutinizes the role of Heathcliff in film adaptations of Wuthering Heights. 
A Chronology and updated Selected Bibliography are also included.",3.86,"[Romance, Literature, Novels, Classics, Fiction, Historical]",206,find redesigned cover edition here this best selling norton critical edition based 1847 first edition novel fourth edition editor collated 1847 text several modern editions corrected number variants including accidentals text accompanied entirely new explanatory annotations new fourth edition twelve emily bronte s letters regarding publication 1847 edition wuthering heights well evolution 1850 edition prose poetry selections author four reviews novel poetry selections author four reviews novel edward chitham s insightful informative chronology creative process behind beloved work five major critical interpretations wuthering heights included three new fourth edition stuart daley considers importance chronology novel j hillis miller examines wuthering heights s problems genre critical reputation sandra m gilbert assesses role victorian christianity plays novel martha nussbaum traces novel s romanticism finally lin haire sargeant scrutinizes role heathcliff film adaptations wuthering heights chronology updated selected bibliography also included
404,404,Exhalation,Ted Chiang,"alternate cover for this ISBN can be found hereThe universe began as an enormous breath being held.From the acclaimed author of Stories of Your Life and Others — the basis for the Academy Award-nominated film Arrival — comes a ground-breaking new collection of short fiction: nine stunningly original, provocative, and poignant stories. These are tales that tackle some of humanity's oldest questions along with new quandaries only Ted Chiang could imagine.In ""The Merchant and the Alchemist's Gate"", a portal through time forces a fabric seller in ancient Baghdad to grapple with past mistakes and second chances. In ""Exhalation"", an alien scientist makes a shocking discovery with ramifications that are literally universal. In ""Anxiety Is the Dizziness of Freedom"" the ability to glimpse into alternate universes necessitates a radically new examination of the concepts of choice and free will.Including stories being published for the first time as well as some of his rare and classic uncollected work, Exhalation is Ted Chiang at his best: profound, sympathetic — revelatory.Contents:- The Merchant and the Alchemist's Gate (2007)- Exhalation (2008)- What's Expected of Us (2005)- The Lifecycle of Software Objects (2010)- Dacey's Patent Automatic Nanny (2011)- The Truth of Fact, the Truth of Feeling (2013)- The Great Silence (2015)- Omphalos (2018)- Anxiety Is the Dizziness of Freedom (2018)",4.27,"[Fiction, Audiobook, Fantasy]",214,alternate cover isbn found herethe universe began enormous breath held from acclaimed author stories life others basis academy award nominated film arrival comes ground breaking new collection short fiction nine stunningly original provocative poignant stories tales tackle humanity s oldest questions along new quandaries ted chiang could imagine in the merchant alchemist s gate portal time forces fabric seller ancient baghdad grapple past mistakes second chances exhalation alien scientist makes shocking discovery 
ramifications literally universal anxiety dizziness freedom ability glimpse alternate universes necessitates radically new examination concepts choice free will including stories published first time well rare classic uncollected work exhalation ted chiang best profound sympathetic revelatory contents merchant alchemist s gate 2007 exhalation 2008 what s expected us 2005 lifecycle software objects 2010 dacey s patent automatic nanny 2011 truth fact truth feeling 2013 great silence 2015 omphalos 2018 anxiety dizziness freedom 2018
406,406,Exhalation,Ted Chiang,"alternate cover for ISBN 9781101947883From an award-winning science fiction writer (whose short story ""The Story of Your Life"" was the basis for the Academy Award-nominated movie Arrival), the long-awaited new collection of stunningly original, humane, and already celebrated short storiesThis much-anticipated second collection of stories is signature Ted Chiang, full of revelatory ideas and deeply sympathetic characters. In ""The Merchant and the Alchemist's Gate,"" a portal through time forces a fabric seller in ancient Baghdad to grapple with past mistakes and the temptation of second chances. In the epistolary ""Exhalation,"" an alien scientist makes a shocking discovery with ramifications not just for his own people, but for all of reality. And in ""The Lifecycle of Software Objects,"" a woman cares for an artificial intelligence over twenty years, elevating a faddish digital pet into what might be a true living being. Also included are two brand-new stories: ""Omphalos"" and ""Anxiety Is the Dizziness of Freedom.""In this fantastical and elegant collection, Ted Chiang wrestles with the oldest questions on earth--What is the nature of the universe? What does it mean to be human?--and ones that no one else has even imagined. 
And, each in its own way, the stories prove that complex and thoughtful science fiction can rise to new heights of beauty, meaning, and compassion.",4.27,"[Fiction, Audiobook, Fantasy]",214,alternate cover isbn 9781101947883from award winning science fiction writer whose short story the story life basis academy award nominated movie arrival long awaited new collection stunningly original humane already celebrated short storiesthis much anticipated second collection stories signature ted chiang full revelatory ideas deeply sympathetic characters the merchant alchemist s gate portal time forces fabric seller ancient baghdad grapple past mistakes temptation second chances epistolary exhalation alien scientist makes shocking discovery ramifications people reality the lifecycle software objects woman cares artificial intelligence twenty years elevating faddish digital pet might true living being also included two brand new stories omphalos anxiety dizziness freedom in fantastical elegant collection ted chiang wrestles oldest questions earth what nature universe mean human and ones one else even imagined and way stories prove complex thoughtful science fiction rise new heights beauty meaning compassion
407,407,Exhalation,Ted Chiang,"An alternate cover edition for this book can be found here.The universe began as an enormous breath being held.From the acclaimed author of Stories of Your Life and Others—the basis for the Academy Award–nominated film Arrival—comes a ground-breaking new collection of short fiction: nine stunningly original, provocative, and poignant stories. These are the tales that tackle some of humanity's oldest questions along with new quandaries only Ted Chiang could imagine.In ""The Merchant and the Alchemist's Gate,"" a portal through time forces a fabric seller in ancient Baghdad to grapple with past mistakes and second chances. In ""Exhalation,"" an alien scientist makes a shocking discovery with ramifications that are literally universal. In ""Anxiety Is the Dizziness of Freedom,"" the ability to glimpse into alternate universes necessitates a radically new examination of the concepts of choice and free will.Including stories being published for the first time as well as some of his rare and classic uncollected work, Exhalation is Ted Chiang at his best: profound, sympathetic—revelatory.",4.27,"[Fiction, Audiobook, Fantasy]",164,alternate cover edition book found here the universe began enormous breath held from acclaimed author stories life othersthe basis academy awardnominated film arrivalcomes ground breaking new collection short fiction nine stunningly original provocative poignant stories tales tackle humanity s oldest questions along new quandaries ted chiang could imagine in the merchant alchemist s gate portal time forces fabric seller ancient baghdad grapple past mistakes second chances exhalation alien scientist makes shocking discovery ramifications literally universal anxiety dizziness freedom ability glimpse alternate universes necessitates radically new examination concepts choice free will including stories published first time well rare classic uncollected work exhalation ted chiang best profound sympatheticrevelatory
621,621,Hamnet,Maggie O'Farrell,"A New York Times Notable Book (2020)Best Book of 2020: Guardian, Financial Times, Literary Hub, and NPRDrawing on Maggie O'Farrell's long-term fascination with the little-known story behind Shakespeare's most enigmatic play, HAMNET is a luminous portrait of a marriage, at its heart the loss of a beloved child. Warwickshire in the 1580s. Agnes is a woman as feared as she is sought after for her unusual gifts. She settles with her husband in Henley street, Stratford, and has three children: a daughter, Susanna, and then twins, Hamnet and Judith. The boy, Hamnet, dies in 1596, aged eleven. Four years or so later, the husband writes a play called Hamlet. Award-winning author Maggie O'Farrell's new novel breathes full-blooded life into the story of a loss usually consigned to literary footnotes, and provides an unforgettable vindication of Agnes, a woman intriguingly absent from history.",4.35,"[Adult Fiction, Historical, Adult, Fiction, Audiobook, Novels]",142,new york times notable book 2020 best book 2020 guardian financial times literary hub nprdrawing maggie o farrell s long term fascination little known story behind shakespeare s enigmatic play hamnet luminous portrait marriage heart loss beloved child warwickshire 1580s agnes woman feared sought unusual gifts settles husband henley street stratford three children daughter susanna twins hamnet judith boy hamnet dies 1596 aged eleven four years later husband writes play called hamlet award winning author maggie o farrell s new novel breathes full blooded life story loss usually consigned literary footnotes provides unforgettable vindication agnes woman intriguingly absent history
639,639,Daisy Jones & The Six,Taylor Jenkins Reid,"A gripping novel about the whirlwind rise of an iconic 1970s rock group and their beautiful lead singer, revealing the mystery behind their infamous break up.Everyone knows Daisy Jones &amp; The Six, but nobody knows the real reason why they split at the absolute height of their popularity...until now.Daisy is a girl coming of age in L.A. in the late sixties, sneaking into clubs on the Sunset Strip, sleeping with rock stars, and dreaming of singing at the Whisky a Go-Go. The sex and drugs are thrilling, but it's the rock and roll she loves most. By the time she's twenty, her voice is getting noticed, and she has the kind of heedless beauty that makes people do crazy things.Another band getting noticed is The Six, led by the brooding Billy Dunne. On the eve of their first tour, his girlfriend Camila finds out she's pregnant, and with the pressure of impending fatherhood and fame, Billy goes a little wild on the road.Daisy and Billy cross paths when a producer realizes the key to supercharged success is to put the two together. What happens next will become the stuff of legend.The making of that legend is chronicled in this riveting and unforgettable novel, written as an oral history of one of the biggest bands of the seventies. 
Taylor Jenkins Reid is a talented writer who takes her work to a new level with Daisy Jones &amp; The Six, brilliantly capturing a place and time in an utterly distinctive voice.",4.19,"[Romance, Adult Fiction, Contemporary, Adult, Fiction, Audiobook, Historical]",249,gripping novel whirlwind rise iconic 1970s rock group beautiful lead singer revealing mystery behind infamous break up everyone knows daisy jones amp six nobody knows real reason split absolute height popularity until now daisy girl coming age l a late sixties sneaking clubs sunset strip sleeping rock stars dreaming singing whisky go go sex drugs thrilling rock roll loves most time twenty voice getting noticed kind heedless beauty makes people crazy things another band getting noticed six led brooding billy dunne eve first tour girlfriend camila finds pregnant pressure impending fatherhood fame billy goes little wild road daisy billy cross paths producer realizes key supercharged success put two together happens next become stuff legend the making legend chronicled riveting unforgettable novel written oral history one biggest bands seventies taylor jenkins reid talented writer takes work new level daisy jones amp six brilliantly capturing place time utterly distinctive voice
700,700,Daisy Jones & The Six,Taylor Jenkins Reid,"A gripping novel about the whirlwind rise of an iconic 1970s rock group and their beautiful lead singer, revealing the mystery behind their infamous break up.Everyone knows Daisy Jones &amp; The Six, but nobody knows the real reason why they split at the absolute height of their popularity…until now.Daisy is a girl coming of age in L.A. in the late sixties, sneaking into clubs on the Sunset Strip, sleeping with rock stars, and dreaming of singing at the Whisky a Go-Go. The sex and drugs are thrilling, but it’s the rock and roll she loves most. By the time she’s twenty, her voice is getting noticed, and she has the kind of heedless beauty that makes people do crazy things.Another band getting noticed is The Six, led by the brooding Billy Dunne. On the eve of their first tour, his girlfriend Camila finds out she’s pregnant, and with the pressure of impending fatherhood and fame, Billy goes a little wild on the road.Daisy and Billy cross paths when a producer realizes the key to supercharged success is to put the two together. What happens next will become the stuff of legend.",4.19,"[Romance, Adult Fiction, Contemporary, Adult, Fiction, Audiobook, Historical]",191,gripping novel whirlwind rise iconic 1970s rock group beautiful lead singer revealing mystery behind infamous break up everyone knows daisy jones amp six nobody knows real reason split absolute height popularityuntil now daisy girl coming age l a late sixties sneaking clubs sunset strip sleeping rock stars dreaming singing whisky go go sex drugs thrilling rock roll loves most time shes twenty voice getting noticed kind heedless beauty makes people crazy things another band getting noticed six led brooding billy dunne eve first tour girlfriend camila finds shes pregnant pressure impending fatherhood fame billy goes little wild road daisy billy cross paths producer realizes key supercharged success put two together happens next become stuff legend
722,722,Hamnet,Maggie O'Farrell,"A New York Times Notable Book (2020)Best Book of 2020: Guardian, Financial Times, Literary Hub, and NPRA thrilling departure: A short, piercing, deeply moving new novel from the acclaimed author of I Am, I Am, I Am, about the death of Shakespeare's eleven-year-old son Hamnet--a name interchangeable with Hamlet in fifteenth-century Britain--and the years leading up to the production of his great play.England, 1580. A young Latin tutor--penniless, bullied by a violent father--falls in love with an extraordinary, eccentric young woman: a wild creature who walks her family's estate with a falcon on her shoulder and is known throughout the countryside for her unusual gifts as a healer. Agnes understands plants and potions better than she does people, but once she settles with her husband on Henley Street in Stratford she becomes a fiercely protective mother and a steadfast, centrifugal force in the life of her young husband, whose gifts as a writer are just beginning to awaken when his beloved young son succumbs to bubonic plague.A luminous portrait of a marriage, a shattering evocation of a family ravaged by grief and loss, and a hypnotic recreation of the story that inspired one of the greatest literary masterpieces of all time, Hamnet is mesmerizing and seductive, an impossible-to-put-down novel from one of our most gifted writers.",4.35,"[Adult Fiction, Historical, Adult, Fiction, Audiobook, Novels]",216,new york times notable book 2020 best book 2020 guardian financial times literary hub npra thrilling departure short piercing deeply moving new novel acclaimed author am am am death shakespeare s eleven year old son hamnet a name interchangeable hamlet fifteenth century britain and years leading production great play england 1580 young latin tutor penniless bullied violent father falls love extraordinary eccentric young woman wild creature walks family s estate falcon shoulder known throughout countryside unusual gifts healer agnes 
understands plants potions better people settles husband henley street stratford becomes fiercely protective mother steadfast centrifugal force life young husband whose gifts writer beginning awaken beloved young son succumbs bubonic plague a luminous portrait marriage shattering evocation family ravaged grief loss hypnotic recreation story inspired one greatest literary masterpieces time hamnet mesmerizing seductive impossible to put down novel one gifted writers
747,747,Hamnet,Maggie O'Farrell,"A thrilling departure: A short, piercing, deeply moving new novel from the acclaimed author of I Am, I Am, I Am, about the death of Shakespeare's eleven-year-old son Hamnet--a name interchangeable with Hamlet in fifteenth-century Britain--and the years leading up to the production of his great play.England, 1580. A young Latin tutor--penniless, bullied by a violent father--falls in love with an extraordinary, eccentric young woman: a wild creature who walks her family's estate with a falcon on her shoulder and is known throughout the countryside for her unusual gifts as a healer. Agnes understands plants and potions better than she does people, but once she settles with her husband on Henley Street in Stratford she becomes a fiercely protective mother and a steadfast, centrifugal force in the life of her young husband, whose gifts as a writer are just beginning to awaken when his beloved young son succumbs to bubonic plague.A luminous portrait of a marriage, a shattering evocation of a family ravaged by grief and loss, and a hypnotic recreation of the story that inspired one of the greatest literary masterpieces of all time, Hamnet is mesmerizing and seductive, an impossible-to-put-down novel from one of our most gifted writers.",4.35,"[Adult Fiction, Historical, Adult, Fiction, Audiobook, Novels]",200,thrilling departure short piercing deeply moving new novel acclaimed author am am am death shakespeare s eleven year old son hamnet a name interchangeable hamlet fifteenth century britain and years leading production great play england 1580 young latin tutor penniless bullied violent father falls love extraordinary eccentric young woman wild creature walks family s estate falcon shoulder known throughout countryside unusual gifts healer agnes understands plants potions better people settles husband henley street stratford becomes fiercely protective mother steadfast centrifugal force life young husband whose gifts writer beginning 
awaken beloved young son succumbs bubonic plague a luminous portrait marriage shattering evocation family ravaged grief loss hypnotic recreation story inspired one greatest literary masterpieces time hamnet mesmerizing seductive impossible to put down novel one gifted writers


# 4. Đánh giá

## 4.1. Rating Prediction

Kết quả đạt được:
- KNN-based collaborative filtering (CF): 0.9025 (RMSE)
- Matrix factorization & Deep learning (MF): 0.7888 (RMSE)

Nhìn chung kết quả này là chấp nhận được đối với một tập dữ liệu khá khiêm tốn cũng như chưa mang tính tổng quát mà nhóm tự crawl.

Bên cạnh đó, ta thấy MF tỏ ra vượt trội hơn so với CF về độ chính xác. Tuy nhiên, chi phí tính toán của MF cũng cao hơn so với CF, do đó tuỳ tình huống mà ta có thể lựa chọn giải pháp phù hợp.

## 4.2. Book Recommendation

Ở đây, nhóm cũng sử dụng hai giải pháp tương ứng với hai giải pháp dự đoán rating. Cả hai đều trả về một danh sách gợi ý tương đối tốt, tuy nhiên chúng cũng có những ưu, nhược điểm sau:
- Collaborative filtering (CF) based:
    + CF đưa ra gợi ý dựa vào sự tương đồng về mặt rating của mỗi cuốn sách, nên CF có thể gợi ý cho người đọc những cuốn sách mới nằm ngoài những chủ đề, nội dung mà họ hay đọc.
    + Tuy nhiên điều này cũng mang lại một nhược điểm: những cuốn sách mới, có ít lượt rating sẽ gần như hiếm khi được gợi ý do không đủ dữ liệu (cold start problem).

- Content based:
    + Trái với CF, giải pháp này gợi ý cho người đọc dựa trên sự tương đồng về mặt chủ đề, thể loại, nội dung của sách. Do đó, các cuốn sách mới có ít lượt rating vẫn có thể được gợi ý bình thường.
    + Tuy nhiên, đây cũng là một nhược điểm khi hệ thống chỉ gợi ý các cuốn sách tương đồng mà không giới thiệu các chủ đề hay thể loại khác. Mặt khác, việc gán nhãn chủ đề, nội dung cho sách cũng ảnh hưởng rất lớn đến độ hiệu quả của giải pháp này.

## 4.3. Thiếu sót và hướng phát triển

- Bộ dữ liệu chưa thật sự tốt, còn khá ít cũng như chưa mang tính bao quát

&#8594;  Dành nhiều thời gian hơn cho việc thu thập và xử lý dữ liệu, kết hợp tìm thêm các nguồn chính thống

- Cả 2 giải pháp đều có những ưu, nhược điểm riêng, để áp dụng thực tế vẫn cần cải tiến nhiều hơn

&#8594; Sử dụng các giải pháp hybrid, kết hợp cả content-based và collaborative filtering để đạt hiệu quả cao hơn cho hệ thống gợi ý


# 5. Tham khảo

[1] https://realpython.com/build-recommendation-engine-collaborative-filtering/#what-is-collaborative-filtering

[2] https://towardsdatascience.com/prototyping-a-recommender-system-step-by-step-part-1-knn-item-based-collaborative-filtering-637969614ea

[3] https://towardsdatascience.com/creating-a-hybrid-content-collaborative-movie-recommender-using-deep-learning-cc8b431618af

[4] https://calvinfeng.gitbook.io/machine-learning-notebook/supervised-learning/recommender/neural_collaborative_filtering

[5] https://calvinfeng.gitbook.io/machine-learning-notebook/supervised-learning/recommender/neural_collaborative_filtering

[6] https://towardsdatascience.com/building-a-content-based-book-recommendation-engine-9fd4d57a4da