In [12]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
from sklearn.preprocessing import StandardScaler

import nltk
# nltk.download('wordnet')
#from nltk.stem import PorterStemmer, WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer  # Thay đổi ở đây
from sklearn.neighbors import KNeighborsRegressor
from math import sqrt

import re
import string
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.metrics import mean_squared_error

In [13]:
# Load the item, rating and user tables from the CSV files next to the notebook.
items_df, ratings_df, users_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./items.csv', './ratings.csv', './users.csv')
)

In [14]:
# Lower-case the free-text item description. Missing descriptions are replaced
# by an empty string first: `.str.lower()` leaves NaN untouched, and the regex /
# translate based cleaners applied afterwards only accept real strings.
items_df["feature"] = items_df["feature"].fillna("").str.lower()

def remove_tags(text):
    """Return `text` with every `<...>` span removed.

    Matching is non-greedy, so each tag is removed on its own and the text
    between two tags is kept. `.` does not match a newline here, so a tag
    that is split across lines is left in place.
    """
    return re.sub("<.*?>", "", text)

# ASCII punctuation characters, as defined by the standard `string` module.
punctuation = string.punctuation
# Drop HTML-like tags from each description.
items_df['feature'] = items_df['feature'].map(remove_tags)

# Translation table that deletes every ASCII punctuation character. It is built
# once at definition time instead of once per cleaned string.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def remove_punc(text):
    """Return `text` with all characters in `string.punctuation` removed.

    Args:
        text: The string to clean.

    Returns:
        A new string without ASCII punctuation; every other character
        (letters, digits, whitespace, non-ASCII symbols) is kept unchanged.
    """
    return text.translate(_PUNCT_TABLE)

# Strip punctuation characters from each description.
items_df['feature'] = items_df['feature'].map(remove_punc)

In [15]:
# Turn the cleaned 'feature' text into TF-IDF vectors (English stop words
# dropped, at most 5000 vocabulary terms), densify them, and compute the
# item-to-item cosine similarity matrix.
tfidf_vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
vectors = tfidf_vectorizer.fit_transform(items_df["feature"])
vectors = vectors.toarray()
similarity = cosine_similarity(vectors)

In [34]:
# Write the dense TF-IDF matrix to 'vt.csv' without an index column.
vectors_df = pd.DataFrame(data=vectors)
vectors_df.to_csv(path_or_buf='vt.csv', index=False)

In [31]:
# Vocabulary terms kept by the fitted vectorizer.
# NOTE(review): presumably in the same order as the columns of `vectors` — confirm.
feature_names = tfidf_vectorizer.get_feature_names_out()
#print("Feature names:", feature_names)

Feature names: ['03mm' '05' '05mm' ... 'zippers' 'zone' 'zones']


In [17]:
# Rating records as a NumPy array, one row per rating.
# The lookup helper defined next reads column 0 as the user id, column 1 as the
# item id and column 2 as the score — this relies on the column order of
# ratings.csv (TODO confirm).
ratings_matrix = ratings_df.to_numpy()

def get_items_rated_by_user(rate_matrix, user_id):
    """Return the items rated by one user together with the scores given.

    Args:
        rate_matrix: 2-D array with one rating per row; column 0 holds the
            1-based user id, column 1 the 1-based item id, column 2 the score.
        user_id: 0-based user index (the rows matched are those whose user
            id equals `user_id + 1`).

    Returns:
        (item_ids, scores): `item_ids` is an integer array of 0-based item
        indices and `scores` the matching ratings, in row order. Both are
        empty when the user has no ratings; `item_ids` keeps an integer dtype
        in that case too, so it can always be used as an index array.
    """
    # Boolean mask over rating rows that belong to the requested user.
    user_rows = rate_matrix[:, 0] == user_id + 1
    # Shift the 1-based item ids to 0-based indices.
    item_ids = (rate_matrix[user_rows, 1] - 1).astype(int)
    scores = rate_matrix[user_rows, 2]
    return item_ids, scores

In [18]:
# Sizes of the user and item tables (number of rows in each).
n_users = len(users_df)
n_items = len(items_df)

In [19]:
# Fit one KNN regressor per user on the TF-IDF vectors of the items that user
# rated; users without any rating get None instead of a model.
knn_models = []

for user_idx in range(n_users):
    rated_ids, rated_scores = get_items_rated_by_user(ratings_matrix, user_idx)

    if len(rated_ids) > 0:
        # Never request more neighbours than there are training items.
        user_model = KNeighborsRegressor(
            n_neighbors=min(5, len(rated_ids)),
            weights='uniform',
            metric='cosine',
        )
        user_model.fit(vectors[rated_ids.astype(int), :], rated_scores)
    else:
        user_model = None

    knn_models.append(user_model)

In [20]:
# Predict a score for every (item, user) pair the user has already rated;
# all other entries of Yhat keep their initial value 0.
Yhat = np.zeros(shape=(n_items, n_users))

for user_idx in range(n_users):
    user_model = knn_models[user_idx]
    if user_model is not None:
        rated_ids = get_items_rated_by_user(ratings_matrix, user_idx)[0]
        if len(rated_ids) > 0:
            Yhat[rated_ids, user_idx] = user_model.predict(vectors[rated_ids, :])

In [21]:
def evaluate(Yhat, rates):
    """Compute MAE and RMSE of predicted scores against the known ratings.

    Args:
        Yhat: 2-D array of predicted scores indexed as [item index, user index].
        rates: Rating records, one per row, in the layout expected by
            `get_items_rated_by_user`.

    Returns:
        (mae, rmse) over every rating found in `rates`; both are NaN when
        `rates` yields no rating for any user.
    """
    y_true = []
    y_pred = []

    # The user axis of Yhat defines how many users are evaluated, so the result
    # does not depend on a notebook-level variable.
    for n in range(Yhat.shape[1]):
        ids, scores_truth = get_items_rated_by_user(rates, n)
        if len(ids) == 0:
            continue
        y_true.extend(scores_truth)
        y_pred.extend(Yhat[ids, n])

    if not y_true:
        return float('nan'), float('nan')

    errors = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
    mae = np.mean(np.abs(errors))
    # RMSE is computed directly: the `squared=False` switch of scikit-learn's
    # mean_squared_error was deprecated and later removed, so relying on it
    # fails on current releases.
    rmse = np.sqrt(np.mean(errors ** 2))
    return mae, rmse

# Score the predictions against the same ratings the models were fitted on.
mae, rmse = evaluate(Yhat, ratings_matrix)
for metric_label, metric_value in (("MAE for training:", mae), ("RMSE for training:", rmse)):
    print(metric_label, metric_value)

MAE for training: 0.5754489526513736
RMSE for training: 0.9042204573458988


In [29]:
def get_top_similar_items(item_id, similarity_matrix, items_df, top_n=10):
    """Return the items most similar to the item with the given `itemId`.

    Args:
        item_id: Value of the `itemId` column identifying the query item.
        similarity_matrix: Square array whose row/column order follows the
            row order of `items_df`.
        items_df: DataFrame with `itemId` and `itemName` columns.
        top_n: Maximum number of similar items to return.

    Returns:
        (item_ids, item_names): two arrays of at most `top_n` entries, ordered
        from most to least similar, never containing the query item itself.

    Raises:
        KeyError: If `item_id` does not occur in `items_df['itemId']`.
    """
    # Translate the catalogue id into a row position: the similarity matrix is
    # indexed by position, not by itemId.
    matches = np.flatnonzero(items_df['itemId'].to_numpy() == item_id)
    if matches.size == 0:
        raise KeyError(f"itemId {item_id} not found in items_df")
    row_pos = matches[0]

    # Positions sorted by descending similarity, with the query row removed.
    ranked = np.argsort(similarity_matrix[row_pos])[::-1]
    ranked = ranked[ranked != row_pos][:top_n]

    top_items = items_df.iloc[ranked]
    return top_items['itemId'].values, top_items['itemName'].values

# Ask for a catalogue ItemId and list the most similar items.
item_id = int(input("Nhập ItemId: "))
top_similar_item_ids, top_similar_item_names = get_top_similar_items(item_id, similarity, items_df)

print("Top 10 sản phẩm tương tự:")
# Distinct loop names keep the queried `item_id` intact after the loop.
for rank, (similar_id, similar_name) in enumerate(
    zip(top_similar_item_ids, top_similar_item_names), start=1
):
    print(f"{rank}. itemId: {similar_id}, {similar_name}")


Top 10 sản phẩm tương tự:
1. itemId: 1285, PetSafe Easysport Harness
2. itemId: 3595, DEXDOG EZHarness, Dog Harness | On/Off Quick | Easy Step in | Walk Vest
3. itemId: 12757, 2 Hounds Design Freedom No-Pull Dog Harness, Adjustable Comfortable Control for Dog Walking, Made in USA (Leash Sold Separately) (5/8&quot;)
4. itemId: 11607, 2 Hounds Design Freedom No-Pull Dog Harness and Leash, Adjustable Comfortable Control for Dog Walking, Made in USA (1&quot;)
5. itemId: 1128, Kurgo Tru-Fit No Pull Dog Harness, Easy Walking Harness, Quick On and Off Harness With Pet Seat Belt Tether for Car
6. itemId: 181, Easy to Put on and Take Off Small Dog Harnesses Our Small Dog Harness Vest has Padded Interior and Exterior Cushioning Ensuring Your Dog is Snug and Comfortable !
7. itemId: 2012, Lifepul No Pull Dog Vest Harness - Dog Body Padded Vest - Comfort Control for Large Dogs in Training Walking - No More Pulling, Tugging or Choking
8. itemId: 5534, Pawtitas Pet Reflective Step in Dog Harness Ref