# Normalize dataset categories

This notebook loads the dataset from Hugging Face, normalizes category labels with the provided merge rules, prints category stats per split, and saves the normalized dataset to `data_vi_normalized`.


In [None]:
from datasets import load_dataset
from collections import Counter

# 1) Load the dataset from Hugging Face (expected to contain all 3 splits)
dataset = load_dataset(path="vohuutridung/3190-data")
print(dataset)

# 2) Merge rules: raw category name -> category it is merged into.
#    Any category that is not a key here is left as-is by normalize_labels.
merge_rules = {
    "CAMERA": "MÁY_ẢNH",
    "DỊCH_VỤ": "DỊCH_VỤ&PHỤ_KIỆN",
    "LOA": "TÍNH_NĂNG",
    "LOA_THOẠI": "TÍNH_NĂNG",
    "VÂN_TAY": "TÍNH_NĂNG",
    "LƯU TRỮ": "LƯU_TRỮ",
}

def normalize_labels(labels):
    """Return a new list of labels with each category remapped via ``merge_rules``.

    Args:
        labels: Iterable of 4-item entries ``[aspect, category, polarity, opinion]``.

    Returns:
        A new list of ``[aspect, category, polarity, opinion]`` lists in the
        same order. A category that is a key of ``merge_rules`` is replaced by
        its mapped value; every other category passes through unchanged.
        The input entries are not modified.
    """
    return [
        [aspect, merge_rules.get(raw_category, raw_category), polarity, opinion]
        for aspect, raw_category, polarity, opinion in labels
    ]

def map_fn(example):
    """Build the updated ``labels`` field for a single example.

    Args:
        example: Mapping with a ``"labels"`` entry accepted by
            ``normalize_labels``.

    Returns:
        A dict with the single key ``"labels"`` holding the normalized labels.
    """
    normalized = normalize_labels(example["labels"])
    return {"labels": normalized}

# 3) Apply the normalization to every split currently in the dataset
#    (iterate over a snapshot of the items because entries are reassigned)
for split_name, split_ds in list(dataset.items()):
    dataset[split_name] = split_ds.map(map_fn)

print("Done normalize splits:", list(dataset.keys()))

# 4) (Optional) Recompute category statistics for each split
def count_categories(ds_split):
    """Count how often each category occurs across all labels of a split.

    Args:
        ds_split: Iterable of examples; each example has a ``"labels"``
            sequence whose entries hold the category at index 1.

    Returns:
        A ``Counter`` mapping each category to its number of occurrences.
    """
    return Counter(
        quad[1]
        for example in ds_split
        for quad in example["labels"]
    )

# Print the number of distinct categories and the 10 most frequent ones per split
for split, split_ds in dataset.items():
    category_counts = count_categories(split_ds)
    print(f"\n[{split}] num_categories =", len(category_counts))
    print("top10:", category_counts.most_common(10))

# 5) Save the normalized dataset
# The output directory is bound once so the save call and the confirmation
# message cannot drift apart if the path is changed later.
output_dir = "data_vi_normalized"
dataset.save_to_disk(output_dir)
print(f"Saved to {output_dir}")
