In [2]:
def split_by_space(value):
    """Turn a whitespace-separated cell value into a list of tokens.

    Args:
        value: A raw cell value, typically a string such as ``"a.jpg b.jpg"``.
            Missing values (``None``, ``NaN``, ``pd.NA``), values that are
            already a list/tuple, and non-string scalars are accepted too.

    Returns:
        list: The tokens of ``value``. Missing values give ``[]``; a list or
        tuple is returned as a new list, so applying the function twice to
        the same column is harmless; any other scalar is converted with
        ``str`` before being split.
    """
    if isinstance(value, str):
        return value.split()
    # Already tokenised: pd.isna() would return an array for a list, and the
    # truth test on that array would raise, so handle sequences up front.
    if isinstance(value, (list, tuple)):
        return list(value)
    if pd.isna(value):
        return []
    # Non-string scalars (e.g. from a column pandas inferred as numeric) have
    # no .split(); fall back to their string form.
    return str(value).split()


def get_top_k_results(df, query_image, k):
    """Return the first ``k`` retrieved images for one query image.

    Args:
        df: DataFrame with an ``'id'`` column and an ``'images'`` column whose
            cells support slicing.
        query_image: Value to look up in the ``'id'`` column.
        k: Number of leading entries to keep.

    Returns:
        The ``[:k]`` slice of the ``'images'`` cell of the first row whose
        ``'id'`` equals ``query_image``, or ``[]`` when no row matches.
    """
    matching_rows = df[df['id'] == query_image]
    if matching_rows.empty:
        return []
    # Only the first matching row is used, even if the id appears repeatedly.
    first_match = matching_rows.iloc[0]
    return first_match['images'][:k]

def calculate_recall_at_k(top_k_results, query_image, total_relevant):
    """Compute Recall@k for a single query.

    Args:
        top_k_results: Iterable of retrieved images (already cut to k items).
        query_image: Key used to look up the relevant images in the
            module-level ``ground_truth_mapping``.
        total_relevant: Denominator of the recall, supplied by the caller.

    Returns:
        ``0`` when ``total_relevant`` is 0; otherwise the number of distinct
        relevant images found in ``top_k_results`` divided by
        ``total_relevant``.
    """
    if total_relevant == 0:
        return 0
    # Queries missing from the mapping are treated as having no relevant images.
    relevant = set(ground_truth_mapping.get(query_image, []))
    hits = relevant & set(top_k_results)
    return len(hits) / total_relevant

In [3]:
import pandas as pd

# Input files: the ground-truth relevance lists and the retrieval submission
# that is being evaluated.
ground_truth_path = '/kaggle/input/deepfashion1/ground_truth.csv'
test_submission_path = '/kaggle/input/deepfashion1/test_submission_Resnet152_Flat.csv'

ground_truth_df = pd.read_csv(ground_truth_path)
test_submission_df = pd.read_csv(test_submission_path)

# Convert the whitespace-separated cells of both files into Python lists.
ground_truth_df['relevant_images'] = ground_truth_df['relevant_images'].apply(split_by_space)
test_submission_df['images'] = test_submission_df['images'].apply(split_by_space)

# Lookup table: query -> list of its relevant images.
# If a query appears more than once, the last row wins.
ground_truth_mapping = dict(
    zip(ground_truth_df['query'], ground_truth_df['relevant_images'])
)

In [4]:
# Accumulate the Recall@k sums over all evaluated queries.
K_VALUES = (1, 5, 10, 50)
num_queries = 0
recall_totals = {k: 0 for k in K_VALUES}

# Index the submission once instead of scanning the whole 'id' column for every
# query and every k. setdefault keeps the FIRST row of a repeated id, which is
# the row get_top_k_results would pick via iloc[0].
submission_images_by_id = {}
for image_id, images in zip(test_submission_df['id'], test_submission_df['images']):
    submission_images_by_id.setdefault(image_id, images)

# Walk over every query listed in the ground truth.
for query_image in ground_truth_df['query']:
    # Queries that are absent from the submission are not evaluated at all.
    if query_image not in submission_images_by_id:
        continue

    ranked_images = submission_images_by_id[query_image]
    total_relevant = len(ground_truth_mapping.get(query_image, []))

    # NOTE: a submission row with fewer than k images yields the same slice
    # for every larger k, so Recall@k stops growing beyond that length.
    for k in K_VALUES:
        recall_totals[k] += calculate_recall_at_k(
            ranked_images[:k], query_image, total_relevant
        )

    # NOTE(review): queries with zero relevant images contribute a recall of 0
    # and are still counted here — confirm that this is intended.
    num_queries += 1

# Publish the sums under the names used by the reporting cell.
total_recall_at_k_1 = recall_totals[1]
total_recall_at_k_5 = recall_totals[5]
total_recall_at_k_10 = recall_totals[10]
total_recall_at_k_50 = recall_totals[50]

In [5]:
# Report the mean Recall@k over all evaluated queries.
if num_queries <= 0:
    # Nothing was evaluated, so there is no average to compute.
    print("Không có ảnh query có ảnh liên quan để tính Recall.")
else:
    average_recall_at_k_1 = total_recall_at_k_1 / num_queries
    average_recall_at_k_5 = total_recall_at_k_5 / num_queries
    average_recall_at_k_10 = total_recall_at_k_10 / num_queries
    average_recall_at_k_50 = total_recall_at_k_50 / num_queries

    # One line per k, each formatted to four decimals.
    print(
        f"Recall@1 trung bình: {average_recall_at_k_1:.4f}",
        f"Recall@5 trung bình: {average_recall_at_k_5:.4f}",
        f"Recall@10 trung bình: {average_recall_at_k_10:.4f}",
        f"Recall@50 trung bình: {average_recall_at_k_50:.4f}",
        sep="\n",
    )

Recall@1 trung bình: 0.2365
Recall@5 trung bình: 0.4492
Recall@10 trung bình: 0.5026
Recall@50 trung bình: 0.5026
