# 1. Bộ dữ liệu

> * Bộ dữ liệu được sử dụng trong dự án này là ***một phần*** của một bộ dữ liệu gốc chứa các đánh giá sản phẩm và metadata từ Amazon, bao gồm **142.8 triệu** đánh giá từ tháng 5 năm 1996 đến tháng 7 năm 2014. [Amazon product data](https://cseweb.ucsd.edu/~jmcauley/datasets/amazon/links.html)
>   
> * Bộ dữ liệu này bao gồm các đánh giá (điểm đánh giá, văn bản, số phiếu bầu hữu ích), metadata sản phẩm (mô tả, thông tin danh mục, giá cả, thương hiệu, và đặc điểm hình ảnh), và liên kết (đồ thị sản phẩm được xem và được mua cùng).
>
> * Chúng ta làm việc với dữ liệu về các sản phẩm làm đẹp (Beauty Products), với 2 dataset như sau:



**1. meta_Beauty.json.gz**: chứa 259,204 thông tin về các sản phẩm.

**2. ratings_Beauty.csv**: chứa 2,023,070 đánh giá từ người dùng cho các sản phẩm.

### Mục tiêu: xây dựng Hệ gợi ý sản phẩm cho người dùng theo 2 phương pháp:
* Collaborative Filtering

* Content-based Filtering

# 2. Tiền xử lý dữ liệu

In [31]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

plt.style.use("ggplot")


## 2.1. Đọc dữ liệu thô

In [81]:
file_path = './dataset/ratings_Beauty.csv'
amazon_ratings = pd.read_csv('./dataset/ratings_Beauty.csv')
amazon_ratings = amazon_ratings.dropna()
amazon_ratings.head()


FileNotFoundError: [Errno 2] No such file or directory: '/mnt/data/ratings_Beauty.csv'

In [79]:
# Chuẩn hóa dữ liệu bằng cách loại bỏ khoảng trắng thừa
amazon_ratings['ProductId'] = amazon_ratings['ProductId'].str.strip()
amazon_ratings['UserId'] = amazon_ratings['UserId'].str.strip()

In [33]:
amazon_ratings.shape

(2023070, 4)

## 2.2. Tạo Utility Matrix

In [34]:
amazon_ratings1 = amazon_ratings.head(100000)

In [69]:
# Tạo utility matrix
utility_matrix = amazon_ratings1.pivot(index='ProductId', columns='UserId', values='Rating')

In [80]:
# Kiểm tra sự tồn tại của item_id và user_id trong utility_matrix
user_id = 'A2UDN3GAOQK1JE'
item_id = 'B000HRZLQE'

item_exists = item_id in utility_matrix.index
user_exists = user_id in utility_matrix.columns

print(f"Item '{item_id}' tồn tại trong utility matrix: {item_exists}")
print(f"User '{user_id}' tồn tại trong utility matrix: {user_exists}")

Item 'B000HRZLQE' tồn tại trong utility matrix: False
User 'A2UDN3GAOQK1JE' tồn tại trong utility matrix: False


In [70]:
utility_matrix

UserId,A00205921JHJK5X9LNP42,A00473363TJ8YSZ3YAGG9,A00700212KB3K0MVESPIY,A0081289HG0BXFQJQUWW,A01247753D6GFZD87MUV8,A01379141PEJ6FIH7UH38,A0143622X8ZC66HZXLUP,A01437583CZ7V02UKZQ5S,A01456542S5QPYUEGJXR8,A01907982I6OHXDYN5HD6,...,AZZOFVMQC0BJG,AZZQXL8VDCFTV,AZZS7W015QH2L,AZZSAMMJPJKJ1,AZZSNN1LC94VF,AZZTJQ7CQZUD8,AZZVCBG5G4EV8,AZZWJ3LICUEKJ,AZZWPNME0GQZ2,AZZZLM1E5JJ8C
ProductId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0205616461,,,,,,,,,,,...,,,,,,,,,,
0558925278,,,,,,,,,,,...,,,,,,,,,,
0733001998,,,,,,,,,,,...,,,,,,,,,,
0737104473,,,,,,,,,,,...,,,,,,,,,,
0762451459,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
B0002VNKRM,,,,,,,,,,,...,,,,,,,,,,
B0002VNKSG,,,,,,,,,,,...,,,,,,,,,,
B0002VNLG2,,,,,,,,,,,...,,,,,,,,,,
B0002VNZS6,,,,,,,,,,,...,,,,,,,,,,


In [71]:
utility_matrix.shape

(6384, 91656)

In [72]:
# Tính trung bình chỉ từ các giá trị đã có
mean_ratings = utility_matrix.mean(axis=1, skipna=True)

# Chuẩn hóa utility matrix bằng cách trừ đi trung bình
normalized_matrix = utility_matrix.sub(mean_ratings, axis=0)

# Fill các giá trị missing bằng 0 sau khi chuẩn hóa
normalized_matrix = normalized_matrix.fillna(0)

In [73]:
# Hiển thị normalized matrix
normalized_matrix.head(100000)

UserId,A00205921JHJK5X9LNP42,A00473363TJ8YSZ3YAGG9,A00700212KB3K0MVESPIY,A0081289HG0BXFQJQUWW,A01247753D6GFZD87MUV8,A01379141PEJ6FIH7UH38,A0143622X8ZC66HZXLUP,A01437583CZ7V02UKZQ5S,A01456542S5QPYUEGJXR8,A01907982I6OHXDYN5HD6,...,AZZOFVMQC0BJG,AZZQXL8VDCFTV,AZZS7W015QH2L,AZZSAMMJPJKJ1,AZZSNN1LC94VF,AZZTJQ7CQZUD8,AZZVCBG5G4EV8,AZZWJ3LICUEKJ,AZZWPNME0GQZ2,AZZZLM1E5JJ8C
ProductId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0205616461,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0558925278,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0733001998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0737104473,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0762451459,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
B0002VNKRM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B0002VNKSG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B0002VNLG2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B0002VNZS6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [74]:
# Chuyển đổi sang định dạng sparse matrix để tiết kiệm bộ nhớ
sparse_matrix = csr_matrix(normalized_matrix.values)



In [75]:
# Tính toán độ đo tương đồng cosine giữa các sản phẩm
item_similarity = cosine_similarity(sparse_matrix)
# Đặt tất cả các giá trị trên đường chéo bằng 0
np.fill_diagonal(item_similarity, 1)


In [76]:
# Chuyển đổi matrix tương đồng sang DataFrame để dễ đọc
item_similarity_df = pd.DataFrame(item_similarity, index=utility_matrix.index, columns=utility_matrix.index)

# Hiển thị item similarity matrix
item_similarity_df.head(50)

ProductId,0205616461,0558925278,0733001998,0737104473,0762451459,1304139212,1304139220,130414089X,130414643X,1304146537,...,B0002VJIH8,B0002VJTGS,B0002VNKPO,B0002VNKPY,B0002VNKQS,B0002VNKRM,B0002VNKSG,B0002VNLG2,B0002VNZS6,B0002VQ0WO
ProductId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0205616461,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0558925278,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0733001998,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0737104473,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0762451459,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1304139212,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1304139220,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
130414089X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
130414643X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1304146537,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [78]:
# Kiểm tra xem giá trị có tồn tại trong tệp CSV không
user_id = 'A2UDN3GAOQK1JE'
item_id = 'B000HRZLQE'

item_in_csv = amazon_ratings['ProductId'].str.contains(item_id).any()
user_in_csv = amazon_ratings['UserId'].str.contains(user_id).any()

print(f"Item '{item_id}' tồn tại trong CSV: {item_in_csv}")
print(f"User '{user_id}' tồn tại trong CSV: {user_in_csv}")

Item 'B000HRZLQE' tồn tại trong CSV: True
User 'A2UDN3GAOQK1JE' tồn tại trong CSV: True


In [77]:
value = utility_matrix.loc['B000HRZLQE', 'A2UDN3GAOQK1JE']
print(f"Giá trị tại hàng 'B000HRZLQE' và cột 'A2UDN3GAOQK1JE': {value}")

KeyError: 'A2UDN3GAOQK1JE'

In [59]:
# Ví dụ cụ thể: Dự đoán rating của user 'A3LDPF5FMB782Z' cho sản phẩm '0616806692'
user_id = 'A1N77FI5BN8L78'
if user_id in utility_matrix.columns:
    user_ratings = utility_matrix[user_id].dropna()
    print(f"Ratings by user {user_id}:")
    print(user_ratings)
else:
    print(f"User {user_id} not found in utility matrix.")




User A1N77FI5BN8L78 not found in utility matrix.


# 3. Collaborative Filtering Model

## 3.1.1 Item-item Collaborative Filtering.

### Tìm chỉ số của sản phẩm có ProductId = i

In [12]:
ratings_utility_matrix.index[10]

'130414674X'

In [13]:
i = "130414674X"

product_names = list(ratings_utility_matrix.index)
product_ID = product_names.index(ratings_utility_matrix.index[10])

product_ID

10

### Tính toán độ tương quan của các sản phẩm còn lại với sản phẩm ProductId = i

In [14]:
correlation_product_ID = correlation_matrix[product_ID]
correlation_product_ID.shape

(886,)

### Gợi ý 10 sản phẩm có độ tương quan cao nhất và lớn hơn 0.95

In [15]:
Recommend = list(ratings_utility_matrix.index[correlation_product_ID > 0.95])

# Removes the item already bought by the customer
Recommend.remove(i) 

Recommend[0:9]

['0205616461',
 '0558925278',
 '1304139220',
 '130414089X',
 '1304174778',
 '1304196046',
 '1304196062',
 '1304196135',
 '1304482634']

## 3.1.2. Đánh giá mô hình.

In [16]:
# Chú ý: Mã này là mã giả, vì ta không có thông tin chi tiết về sở thích cá nhân cụ thể của từng người dùng
important_products = amazon_ratings[amazon_ratings['Rating'] >= 4].groupby('UserId')['ProductId'].apply(set).to_dict()


In [17]:
# Mã giả, cần thông tin cụ thể về người dùng mục tiêu
user_target = 'A39HTATAQ9V7YF'

# Lấy sản phẩm quan trọng cho người dùng mục tiêu
true_labels = important_products.get(user_target, set())

# Tính precision và recall
true_positives = set(Recommend) & true_labels
precision = len(true_positives) / len(Recommend) if Recommend else 0
recall = len(true_positives) / len(true_labels) if true_labels else 0

print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')


Precision: 0.0043
Recall: 0.3333


# Khó khăn cần tiếp tục giải quyết
### Precision và Recall thấp, cần tìm hiểu nguyên nhân

## 3.2. User-User Collaborative Filtering. (*incomplete*)

# 4. Collaborative Filtering Model (*incomplete*)

In [18]:
import pandas as pd
import gzip

def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield eval(l)

def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')

df = getDF("../dataset/meta_Beauty.json.gz")



FileNotFoundError: [Errno 2] No such file or directory: '../dataset/meta_Beauty.json.gz'

In [None]:
df

Unnamed: 0,asin,description,title,imUrl,salesRank,categories,price,related,brand
0,0205616461,"As we age, our once youthful, healthy skin suc...",Bio-Active Anti-Aging Serum (Firming Ultra-Hyd...,http://ecx.images-amazon.com/images/I/41DecrGO...,{'Health & Personal Care': 461765},"[[Beauty, Skin Care, Face, Creams & Moisturize...",,,
1,0558925278,Mineral Powder Brush--Apply powder or mineral ...,Eco Friendly Ecotools Quality Natural Bamboo C...,http://ecx.images-amazon.com/images/I/51L%2BzY...,{'Beauty': 402875},"[[Beauty, Tools & Accessories, Makeup Brushes ...",,,
2,0733001998,"From the Greek island of Chios, this Mastiha b...",Mastiha Body Lotion,http://ecx.images-amazon.com/images/I/311WK5y1...,{'Beauty': 540255},"[[Beauty, Skin Care, Body, Moisturizers, Lotio...",,,
3,0737104473,Limited edition Hello Kitty Lipstick featuring...,Hello Kitty Lustre Lipstick (See sellers comme...,http://ecx.images-amazon.com/images/I/31u6Hrzk...,{'Beauty': 931125},"[[Beauty, Makeup, Lips, Lipstick]]",,,
4,0762451459,"The mermaid is an elusive (okay, mythical) cre...",Stephanie Johnson Mermaid Round Snap Mirror,http://ecx.images-amazon.com/images/I/41y2%2BF...,,"[[Beauty, Tools & Accessories, Mirrors, Makeup...",19.98,,
...,...,...,...,...,...,...,...,...,...
259199,B00LP2YB8E,Color: White\nFullness72 inches\nCenter Gather...,2t 2t Edge Crystal Rhinestones Bridal Wedding ...,http://ecx.images-amazon.com/images/I/41E630m-...,,"[[Beauty, Hair Care, Styling Tools, Styling Ac...",,,
259200,B00LOS7MEE,"The secret to long lasting colors, healthy nai...",French Manicure Gel Nail Polish Set - &quot;Se...,http://ecx.images-amazon.com/images/I/41skHL1O...,{'Beauty': 108820},"[[Beauty, Makeup, Nails, Nail Polish]]",,"{'also_viewed': ['B0057JCYYE', 'B00LMXHR1Y', '...",
259201,B00LPVG6V0,ResQ Organics Face & Body Wash - With Aloe Ver...,ResQ Organics Face &amp; Body Wash - Aloe Vera...,http://ecx.images-amazon.com/images/I/31C1w4Ku...,,"[[Beauty, Skin Care, Face, Creams & Moisturize...",,,
259202,B00LTDUHJQ,Color: White\n2 Tier \nFullness 72 inches\nSew...,2 Tier Tulle Elbow Wedding Veil with Ribbon Ed...,http://ecx.images-amazon.com/images/I/51%2B%2B...,,"[[Beauty, Hair Care, Styling Tools, Styling Ac...",,,


In [None]:
import pandas as pd

# Giả sử 'df' là DataFrame của bạn

# Bước 1 & 2: Gộp và phẳng hóa list của các categories
all_categories = [category for sublist in df['categories'] for category in sublist[0]]

# Bước 3: Loại bỏ các giá trị trùng lặp và đếm
unique_categories = set(all_categories)  # Sử dụng set để loại bỏ trùng lặp
num_unique_categories = len(unique_categories)  # Đếm số lượng phần tử duy nhất

print(f"Số lượng categories: {num_unique_categories}")
print("Danh sách các categories:")
for category in unique_categories:
    print(category)


Số lượng categories: 275
Danh sách các categories:
Nail Whitening
Hair Rollers
Sports & Outdoors
Children's
Retinol
Glitter & Shimmer
Powder
Wigs
Irons
Shower Caps
Massage & Relaxation
Masks & Pillows
Moisturizing Gloves
Eau de Toilette
Nail Brushes
Sunscreens & Self Tanners
Women's
Fluids & Lotions
Snow Sports
Diffusers
Hair Styling Serums
Sets
Cuticle Care
Fillers
Eyeliner
Cosmetic Bags
Creams & Moisturizers
Hair Regrowth Treatments
Hair Coloring Tools
Nail Strengthening
Foundation
Hair Color Removers
Hardware
Hair Color Mixing Bowls
Makeup Sets
Salicylic Acid
Skin Protection
Body Scrubs
Lip Plumpers
Curling Irons
Caps, Foils & Wraps
Hair Drying Towels
Travel Cases & Holders
Skiing
Hair Color
Styling Tools
Nail Polish
Nail Treatments
Combs
Crimping Irons
Top & Base Coats
Texturizers
Lipstick Primers
Bags, Packs & Accessories
Moisturizers
Nail Repair
Nail Files & Buffers
Bubble Bath
Hair Extensions
Fan Shop
Health & Personal Care
Fake Eyelashes & Adhesives
Blush
Color Refreshers
Sun
C

In [None]:
# Làm sạch dữ liệu văn bản và xử lý giá trị thiếu
df['description'] = df['description'].fillna('').map(lambda x: x.lower())
df['title'] = df['title'].fillna('').map(lambda x: x.lower())
df['price'] = df['price'].fillna(df['price'].mean())

# Đối với `categories`, giả sử mỗi item chỉ có một danh sách category duy nhất
df['categories'] = df['categories'].map(lambda x: ','.join(x[0]).lower())


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Khởi tạo TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # Giới hạn số lượng features

# Kết hợp văn bản từ description và title
text_features = tfidf_vectorizer.fit_transform(df['description'] + " " + df['title'])

# Có thể chuyển đổi `text_features` thành array để dễ dàng kết hợp với các feature khác nếu cần


In [None]:
from sklearn.preprocessing import OneHotEncoder

# Khởi tạo One-Hot Encoder
onehot_encoder = OneHotEncoder(handle_unknown='ignore')

# Encode `categories`
categories_features = onehot_encoder.fit_transform(df[['categories']])


In [None]:
import numpy as np

# Chuyển đổi `price` thành array để kết hợp với các feature khác
price_features = np.array(df['price']).reshape(-1, 1)


In [None]:
from scipy.sparse import hstack

# Kết hợp các feature
final_features = hstack([text_features, categories_features, price_features])


In [None]:
final_features_dense = final_features.toarray()

# Xem kích thước của final_features
print("Kích thước của final_features:", final_features_dense.shape)



Kích thước của final_features: (259204, 1285)


In [26]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

# Tạo một ma trận dữ liệu ví dụ
data = np.array([[1, 0, 3],
                 [4, 0, 0],
                 [1, 2, 0],
                 [0, 0, 0]])

# Chuyển đổi sang định dạng sparse matrix
sparse_matrix = csr_matrix(data)

# Tính toán độ đo tương đồng cosine giữa các hàng của ma trận
item_similarity = cosine_similarity(sparse_matrix)

# Chuyển đổi matrix tương đồng sang DataFrame để dễ đọc
item_similarity_df = pd.DataFrame(item_similarity, index=[f'item_{i}' for i in range(data.shape[0])], 
                                  columns=[f'item_{i}' for i in range(data.shape[0])])

# In ra ma trận tương đồng cosine
print("Ma trận dữ liệu:")
print(data)
print("\nMa trận tương đồng cosine:")
print(item_similarity_df)

# Kiểm tra các giá trị trên đường chéo của ma trận tương đồng
diagonal_values = np.diag(item_similarity_df.values)
print("\nGiá trị trên đường chéo của ma trận tương đồng:")
print(diagonal_values)


Ma trận dữ liệu:
[[1 0 3]
 [4 0 0]
 [1 2 0]
 [0 0 0]]

Ma trận tương đồng cosine:
          item_0    item_1    item_2  item_3
item_0  1.000000  0.316228  0.141421     0.0
item_1  0.316228  1.000000  0.447214     0.0
item_2  0.141421  0.447214  1.000000     0.0
item_3  0.000000  0.000000  0.000000     0.0

Giá trị trên đường chéo của ma trận tương đồng:
[1. 1. 1. 0.]


In [None]:
# Xem một số giá trị mẫu từ final_features
print("Một số giá trị mẫu từ final_features:\n", final_features_dense[:1000])

Một số giá trị mẫu từ final_features:
 [[ 0.          0.          0.         ...  0.          0.
  24.87816496]
 [ 0.          0.          0.         ...  0.          0.
  24.87816496]
 [ 0.          0.          0.         ...  0.          0.
  24.87816496]
 ...
 [ 0.          0.          0.         ...  0.          0.
   3.07      ]
 [ 0.          0.          0.         ...  0.          0.
  24.87816496]
 [ 0.          0.          0.         ...  0.          0.
   9.74      ]]
