In [1]:
import pandas as pd

def load_data(submission_file, ground_truth_file):
    """
    Load the submission and ground-truth CSV files.

    Column names of both files are stripped of surrounding whitespace. Missing
    values in the ground-truth 'relevant_images' column are replaced by an
    empty string so that every row can be split on whitespace.

    Args:
        submission_file: Path or file-like object accepted by ``pd.read_csv``.
        ground_truth_file: Path or file-like object accepted by
            ``pd.read_csv``; must provide the columns 'query' and
            'relevant_images'.

    Returns:
        tuple: ``(submission, ground_truth_dict)`` where ``submission`` is the
        submission DataFrame and ``ground_truth_dict`` maps each 'query' value
        to the set of whitespace-separated tokens of its 'relevant_images'.
    """
    def _read_with_clean_header(source):
        # Read one CSV and trim whitespace around its column names.
        frame = pd.read_csv(source)
        frame.columns = frame.columns.str.strip()
        return frame

    submission = _read_with_clean_header(submission_file)
    ground_truth = _read_with_clean_header(ground_truth_file)

    # NaN cannot be split, so a missing list of relevant images becomes "".
    ground_truth['relevant_images'] = ground_truth['relevant_images'].fillna("")

    # When a query appears on several rows, the last row wins.
    ground_truth_dict = {
        record['query']: set(record['relevant_images'].split())
        for _, record in ground_truth.iterrows()
    }

    return submission, ground_truth_dict


def average_precision_at_k(relevant_images, predicted_images, k):
    """
    Compute Average Precision (AP) over the top-k predictions of one query.

    For every rank at which a relevant image is retrieved for the first time,
    the precision at that rank is accumulated; the sum is then divided by the
    total number of relevant images (not by ``min(k, len(relevant_images))``).

    Args:
        relevant_images: Collection of ground-truth image ids for the query.
        predicted_images: Ranked sequence of predicted image ids, best first.
        k: Number of leading predictions to evaluate. ``None`` evaluates all
            predictions; values <= 0 evaluate none.

    Returns:
        float: AP in the range [0, 1]; 0.0 when there are no relevant images.
    """
    if not relevant_images:
        return 0.0  # No relevant image exists, so AP is defined as 0.

    # A non-positive k means "look at nothing"; without this guard a negative
    # k would silently slice from the end of the ranking.
    if k is not None and k <= 0:
        return 0.0

    already_hit = set()
    num_relevant = 0
    precision_sum = 0.0

    for rank, image in enumerate(predicted_images[:k], start=1):
        # Count each relevant image once: a duplicated prediction must not be
        # rewarded again, otherwise AP could exceed 1.
        if image in relevant_images and image not in already_hit:
            already_hit.add(image)
            num_relevant += 1
            precision_sum += num_relevant / rank

    return precision_sum / len(relevant_images)


def mean_average_precision(submission, ground_truth_dict, top_k_list):
    """
    Compute Mean Average Precision (mAP) at several cut-offs.

    Only submission rows whose 'id' is a key of ``ground_truth_dict`` are
    evaluated; other rows are skipped and do not count towards the mean.

    Args:
        submission: DataFrame with an 'id' column (query id) and an 'images'
            column holding the ranked predictions as one whitespace-separated
            string (NaN means no predictions).
        ground_truth_dict: Mapping from query id to its relevant image ids.
        top_k_list: Iterable of cut-off values k.

    Returns:
        dict: ``{k: mAP@k}`` for every distinct k. All values are 0.0 when no
        submission row matches the ground truth.
    """
    map_scores = {k: 0.0 for k in top_k_list}
    num_queries = 0

    # Column-wise iteration avoids building a Series per row as iterrows does.
    for query, images in zip(submission['id'], submission['images']):
        if query not in ground_truth_dict:
            continue

        # A missing 'images' cell means the query has no predictions.
        predicted_images = str(images).split() if pd.notna(images) else []
        relevant_images = ground_truth_dict[query]
        num_queries += 1

        # Iterate over the dict keys so a cut-off listed twice in top_k_list
        # is accumulated (and later averaged) only once.
        for k in map_scores:
            map_scores[k] += average_precision_at_k(relevant_images, predicted_images, k)

    # Nothing was evaluated: return the zero-initialised scores instead of
    # dividing by zero.
    if num_queries == 0:
        return map_scores

    for k in map_scores:
        map_scores[k] /= num_queries

    return map_scores



def main(
    submission_file='/kaggle/input/deepfashion1/test_submission_Resnet152_IVF.csv',
    ground_truth_file='/kaggle/input/deepfashion1/ground_truth.csv',
    top_k_list=(1, 5, 10),
):
    """
    Load a submission and its ground truth, then print mAP at each cut-off.

    Args:
        submission_file: Path of the submission CSV. Defaults to the file used
            by this notebook; pass another path to score a different run.
        ground_truth_file: Path of the ground-truth CSV.
        top_k_list: Cut-off values k for which mAP@k is printed.
    """
    # Load the data.
    submission, ground_truth_dict = load_data(submission_file, ground_truth_file)

    # Compute mAP for every requested cut-off.
    map_scores = mean_average_precision(submission, ground_truth_dict, top_k_list)

    # Report the results.
    for k in top_k_list:
        print(f"Mean Average Precision for top-{k}: {map_scores[k]:.6f}")


# Run the evaluation only when this code is executed as the main module.
if __name__ == "__main__":
    main()


Mean Average Precision for top-1: 0.217288
Mean Average Precision for top-5: 0.352205
Mean Average Precision for top-10: 0.354768


In [2]:
# Hàm chuyển đổi chuỗi thành danh sách dựa trên khoảng trắng
def split_by_space(value):
    """
    Convert a whitespace-separated cell value into a list of tokens.

    Args:
        value: Cell value. A missing value (NaN/None) yields an empty list, a
            list or tuple is returned as a new list, and anything else is
            converted with ``str`` before splitting.

    Returns:
        list: The tokens of ``value``.
    """
    # Already tokenised input: pd.isna would return an array for it, which
    # cannot be used as an `if` condition.
    if isinstance(value, (list, tuple)):
        return list(value)
    if pd.isna(value):
        return []
    # str() keeps a cell that pandas parsed as a number (e.g. a single numeric
    # id) from failing on the missing .split attribute.
    return str(value).split()


# Hàm lấy top_k ảnh cho một ảnh query
def get_top_k_results(df, query_image, k):
    """
    Return the first ``k`` entries of the 'images' value stored for a query.

    Args:
        df: DataFrame with the columns 'id' and 'images'.
        query_image: Value to look up in the 'id' column.
        k: Number of leading entries to keep.

    Returns:
        The 'images' value of the first row whose 'id' equals ``query_image``,
        sliced to ``[:k]``; an empty list when no row matches.
    """
    matching_rows = df.loc[df['id'] == query_image]
    if matching_rows.empty:
        return []

    # Only the first matching row is used when an id occurs more than once.
    first_match = matching_rows.iloc[0]
    return first_match['images'][:k]

# Hàm tính Recall@k
def calculate_recall_at_k(top_k_results, query_image, total_relevant, ground_truth=None):
    """
    Compute Recall@k for one query.

    Args:
        top_k_results: Iterable with the top-k retrieved image ids.
        query_image: Query id used to look up the relevant images.
        total_relevant: Number of relevant images for the query, used as the
            denominator.
        ground_truth: Optional mapping from query id to its relevant image
            ids. When omitted, the module-level ``ground_truth_mapping`` is
            used, which must then exist before this function is called.

    Returns:
        0 when ``total_relevant`` is 0; otherwise the number of distinct
        relevant images found in ``top_k_results`` divided by
        ``total_relevant``.
    """
    if total_relevant == 0:
        return 0

    # Prefer the explicitly supplied mapping so the function does not have to
    # rely on a global that is created in a different notebook cell.
    mapping = ground_truth_mapping if ground_truth is None else ground_truth
    relevant_images = set(mapping.get(query_image, []))
    retrieved_relevant = len(relevant_images.intersection(top_k_results))
    return retrieved_relevant / total_relevant


In [3]:
import pandas as pd

# Đọc dữ liệu từ file ground_truth.csv và test_submission_Resnet152_IVF.csv
ground_truth_path = '/kaggle/input/deepfashion1/ground_truth.csv'
test_submission_path = '/kaggle/input/deepfashion1/test_submission_Resnet152_IVF.csv'

# Load both CSV files.
ground_truth_df = pd.read_csv(ground_truth_path)
test_submission_df = pd.read_csv(test_submission_path)

# Replace the whitespace-separated strings of both frames with lists.
ground_truth_df['relevant_images'] = ground_truth_df['relevant_images'].apply(split_by_space)
test_submission_df['images'] = test_submission_df['images'].apply(split_by_space)

# Query id -> list of relevant image ids; the last row wins for repeated queries.
ground_truth_mapping = dict(
    zip(ground_truth_df['query'], ground_truth_df['relevant_images'])
)

In [4]:
# Biến lưu tổng Recall@k và số lượng query
# Cut-offs evaluated below; each has a matching total_recall_at_k_<k> variable.
RECALL_K_VALUES = (1, 5, 10, 50)

# Build the id -> ranked images lookup once. setdefault keeps the first row of
# a repeated id, and the dict replaces a scan of the whole frame per query.
images_by_id = {}
for submitted_id, ranked_images in zip(test_submission_df['id'], test_submission_df['images']):
    images_by_id.setdefault(submitted_id, ranked_images)

# Number of evaluated queries and accumulated Recall@k per cut-off.
num_queries = 0
recall_totals = {k: 0 for k in RECALL_K_VALUES}

# Walk over every query of the ground truth.
for query_image in ground_truth_df['query']:
    # Queries without a submission row are not evaluated at all.
    if query_image not in images_by_id:
        continue

    total_relevant = len(ground_truth_mapping.get(query_image, []))
    ranked_images = images_by_id[query_image]

    for k in RECALL_K_VALUES:
        recall_totals[k] += calculate_recall_at_k(ranked_images[:k], query_image, total_relevant)

    num_queries += 1

# Expose the totals under the names read when the averages are computed.
total_recall_at_k_1 = recall_totals[1]
total_recall_at_k_5 = recall_totals[5]
total_recall_at_k_10 = recall_totals[10]
total_recall_at_k_50 = recall_totals[50]

In [5]:
# Tính Recall@k trung bình
if num_queries <= 0:
    # No query was evaluated, so there is nothing to average.
    print("Không có ảnh query có ảnh liên quan để tính Recall.")
else:
    # Mean Recall@k: accumulated recall divided by the number of evaluated queries.
    (
        average_recall_at_k_1,
        average_recall_at_k_5,
        average_recall_at_k_10,
        average_recall_at_k_50,
    ) = (
        accumulated / num_queries
        for accumulated in (
            total_recall_at_k_1,
            total_recall_at_k_5,
            total_recall_at_k_10,
            total_recall_at_k_50,
        )
    )

    print(f"Recall@1 trung bình: {average_recall_at_k_1:.4f}")
    print(f"Recall@5 trung bình: {average_recall_at_k_5:.4f}")
    print(f"Recall@10 trung bình: {average_recall_at_k_10:.4f}")
    print(f"Recall@50 trung bình: {average_recall_at_k_50:.4f}")

Recall@1 trung bình: 0.2173
Recall@5 trung bình: 0.3687
Recall@10 trung bình: 0.3763
Recall@50 trung bình: 0.3763
