# Data

In [71]:
# NOTE: you CAN change this cell
# If you want to use your own database, download it here
# !gdown ...
# Every file is saved in the working directory under the exact name that
# Solution.__init__ later opens.
# list_address.json: nested mapping province -> district -> list of wards.
!gdown --fuzzy https://drive.google.com/file/d/168XttTc8_UlYXuPHHkCKUh2bp_h3L9o8/view?usp=sharing -O list_address.json
# abbreviations.json: per-level ("province" / "district" / "ward") lists of abbreviations keyed by name.
!gdown --fuzzy https://drive.google.com/file/d/176heW3Yco-kyXJl25byYDchTFa5Yso9x/view?usp=sharing -O abbreviations.json
# list_district.txt / list_province.txt / list_ward.txt: flat lists, one name per line.
!gdown --fuzzy https://drive.google.com/file/d/1UFvugSSmVgruhK27CgcJy6kJtZNVG1Mw/view?usp=sharing -O list_district.txt
!gdown --fuzzy https://drive.google.com/file/d/1dLQOYTQ4fRTTnd-OMc7W70oRw31KtvBo/view?usp=sharing -O list_province.txt
!gdown --fuzzy https://drive.google.com/file/d/1Ex7v0TgE18xTF2z_wfhDrLPw6BPqHtZG/view?usp=sharing -O list_ward.txt

Downloading...
From: https://drive.google.com/uc?id=168XttTc8_UlYXuPHHkCKUh2bp_h3L9o8
To: /content/list_address.json
100% 256k/256k [00:00<00:00, 79.6MB/s]
Downloading...
From: https://drive.google.com/uc?id=176heW3Yco-kyXJl25byYDchTFa5Yso9x
To: /content/abbreviations.json
100% 118/118 [00:00<00:00, 418kB/s]
Downloading...
From: https://drive.google.com/uc?id=1UFvugSSmVgruhK27CgcJy6kJtZNVG1Mw
To: /content/list_district.txt
100% 4.80k/4.80k [00:00<00:00, 16.3MB/s]
Downloading...
From: https://drive.google.com/uc?id=1dLQOYTQ4fRTTnd-OMc7W70oRw31KtvBo
To: /content/list_province.txt
100% 638/638 [00:00<00:00, 2.24MB/s]
Downloading...
From: https://drive.google.com/uc?id=1Ex7v0TgE18xTF2z_wfhDrLPw6BPqHtZG
To: /content/list_ward.txt
100% 76.2k/76.2k [00:00<00:00, 72.2MB/s]


In [72]:
# NOTE: you CAN change this cell
# Add more packages as needed.
# ALL pip installs must be placed in this cell.
#!pip install editdistance
# NOTE(review): the data-download cell above already calls gdown, so on an
# environment where gdown is not preinstalled this cell has to run first.
!pip install gdown



In [73]:
# NOTE: you CAN change this cell
# import your library here
import time
import re
import json
from functools import lru_cache

# Algorithm

In [88]:
# NOTE: you MUST change this cell
# New methods / functions must be written under class Solution.

# Base similarity a fuzzy match must reach in Solution.find_closest_string;
# that method lowers it by len(word) / 100, so longer queries tolerate more typos.
THRESHOLD = 0.83
# Starting upper bound on the edit distance explored while walking a trie.
MAX_DISTANCE = 3
# Time budget in seconds for the fallback searches in Solution.process
# (district lookup across all provinces, ward lookup across a whole province).
_WARD_FALLBACK_TIME_LIMIT_SEC = 0.03
# (legacy, modern) replacement pairs applied in order by
# Solution.normalize_vn_legacy_to_modern to move the tone mark from the second
# vowel of "oa" / "oe" / "uy" onto the first one.
_MAPPING = [
    # oa: o + (a with tone mark) -> (o with tone mark) + a
    ("oà", "òa"), ("oá", "óa"), ("oả", "ỏa"), ("oã", "õa"), ("oạ", "ọa"),
    ("Oà", "Òa"), ("Oá", "Óa"), ("Oả", "Ỏa"), ("Oã", "Õa"), ("Oạ", "Ọa"),
    ("oÀ", "òa"), ("oÁ", "óa"), ("oẢ", "ỏa"), ("oÃ", "õa"), ("oẠ", "ọa"),
    ("OÀ", "ÒA"), ("OÁ", "ÓA"), ("OẢ", "ỎA"), ("OÃ", "ÕA"), ("OẠ", "ỌA"),

    # oe: o + (e with tone mark) -> (o with tone mark) + e
    ("oè", "òe"), ("oé", "óe"), ("oẻ", "ỏe"), ("oẽ", "õe"), ("oẹ", "ọe"),
    ("Oè", "Òe"), ("Oé", "Óe"), ("Oẻ", "Ỏe"), ("Oẽ", "Õe"), ("Oẹ", "Ọe"),
    ("oÈ", "òe"), ("oÉ", "óe"), ("oẺ", "ỏe"), ("oẼ", "õe"), ("oẸ", "ọe"),
    ("OÈ", "ÒE"), ("OÉ", "ÓE"), ("OẺ", "ỎE"), ("OẼ", "ÕE"), ("OẸ", "ỌE"),

    # uy: u + (y with tone mark) -> (u with tone mark) + y
    ("uỳ", "ùy"), ("uý", "úy"), ("uỷ", "ủy"), ("uỹ", "ũy"), ("uỵ", "ụy"),
    ("Uỳ", "Ùy"), ("Uý", "Úy"), ("Uỷ", "Ủy"), ("Uỹ", "Ũy"), ("Uỵ", "Ụy"),
    ("uỲ", "ùy"), ("uÝ", "úy"), ("uỶ", "ủy"), ("uỸ", "ũy"), ("uỴ", "ụy"),
    ("UỲ", "ÙY"), ("UÝ", "ÚY"), ("UỶ", "ỦY"), ("UỸ", "ŨY"), ("UỴ", "ỤY"),
]
# When True, Solution.load_hierarchical_data skips every province / district /
# ward whose name is not listed in the flat list_*.txt files.
# (The identifier misspells "TEMPLATE"; it is kept because the class refers to it.)
_REMOVE_DATA_BY_TEAMPLTE_FILE = False
class Solution:
    def __init__(self):
        self.province_path = 'list_province.txt'
        self.district_path = 'list_district.txt'
        self.ward_path = 'list_ward.txt'

        # address data
        self.address_path = 'list_address.json'
        self.abbreviate_path = 'abbreviations.json'
        self.province_data = self.TrieNode()
        self.district_data = {}
        self.ward_data = {}
        self.all_districts_data = self.TrieNode()
        self.province_wards_data = {}

        # Load data
        self.load_hierarchical_data()

    class TrieNode():
        """One node of a character trie; a node with ``key_word`` set ends a stored name."""

        def __init__(self):
            # Both stay None until Solution.insert() marks this node as the end
            # of a key: key_word is the cleaned key, exact_word the display name.
            self.exact_word = self.key_word = None
            # Outgoing edges: single character -> child TrieNode.
            self.childs = dict()

    def remove_prefixes(self, text, level):
        """Lower-case ``text`` and strip one leading administrative prefix.

        Args:
            text: Raw name or query fragment, e.g. ``"Tỉnh Nghệ An"`` or ``"p1"``.
            level: ``'province'``, ``'district'`` or ``'ward'``; selects which
                prefix spellings (including common typos) are recognised.
                Any other value only trims and lower-cases the text.

        Returns:
            The trimmed, lower-cased text without its leading prefix.

        Only a prefix at the very start is removed. The previous pattern was
        unanchored, so a later word that merely began with a prefix token was
        mangled too ("tân phú" -> "tân hú" because of the ward prefix "p"),
        which made trie keys built from spaced names differ from the
        space-free queries they are compared with.
        """
        text = text.strip().lower()

        # All alternatives are lower case because the text has just been
        # lower-cased (the former "Thi trấ" / "F\." entries could never match).
        if level == 'province':
            prefixes = r'tỉnh|tp|tnh|tỉnhc|tỉnhv|tpho|thanhpho|thànhphô|thànhphố|thphố|tphố|thpho|tp\.'
        elif level == 'district':
            prefixes = r'huyện|hyen|huyen|quận|quan|qận|qan|qun|tp|tpho|thanhpho|thànhphô|thànhphố|thphố|tphố|thpho|q\.'
        elif level == 'ward':
            prefixes = r'xã|thịxã|thxã|phường|phưòng|phung|tt|tx|p|thi trấn?|f\.'
        else:
            return text

        # The optional "\." swallows the dot of abbreviations such as "tp.hcm",
        # which the shorter alternative "tp" would otherwise leave behind.
        return re.sub(r'^(?:' + prefixes + r')\.?\s*', '', text, count=1)

    def insert(self, root_node, key, word, level):
        # Remove prefixes
        cleaned_key = self.remove_prefixes(key, level)
        cleaned_key = cleaned_key.replace(" ", "").strip().lower()

        node = root_node
        for char in cleaned_key:
            if char not in node.childs:
                node.childs[char] = self.TrieNode()
            node = node.childs[char]
        node.exact_word = word.strip()
        node.key_word = cleaned_key

    def load_hierarchical_data(self):
        """Load data from the hierarchical JSON file"""
        def remove_prefix(name):
            if not name:
                return ""
            name = re.sub(r'^(tỉnh|thành phố|huyện|quận|thị xã|xã|phường|thị trấn)\s+', '', name, flags=re.IGNORECASE)
            return name.strip()

        with open(self.province_path, 'r', encoding='utf-8') as f:
            flat_provinces = [line.strip() for line in f if line.strip()]

        with open(self.district_path, 'r', encoding='utf-8') as f:
            flat_districts = [line.strip() for line in f if line.strip()]

        with open(self.ward_path, 'r', encoding='utf-8') as f:
            flat_wards = [line.strip() for line in f if line.strip()]

        with open(self.address_path, "r", encoding='utf-8') as file:
            data = json.load(file)

        try:
            with open(self.abbreviate_path, 'r', encoding='utf-8') as f:
                self.abbreviations = json.load(f)
        except FileNotFoundError:
            self.abbreviations = {"province": {}, "district": {}, "ward": {}}

        # Load provinces
        for province_name in data.keys():
            province_name = remove_prefix(province_name)
            if _REMOVE_DATA_BY_TEAMPLTE_FILE and province_name not in flat_provinces:
                continue
            self.insert(self.province_data, province_name, province_name, 'province')
            for abbreviation in self.abbreviations.get("province", {}).get(province_name, []):
                self.insert(self.province_data, abbreviation, province_name, 'province')

        # Load districts and wards
        for province_name, districts in data.items():
            province_name = remove_prefix(province_name)
            if _REMOVE_DATA_BY_TEAMPLTE_FILE and province_name not in flat_provinces:
                continue
            # Create a trie for districts in province
            self.district_data[province_name] = self.TrieNode()

            # Create a trie for wards in this province (skip district level)
            self.province_wards_data[province_name] = self.TrieNode()

            for district_name, wards in districts.items():
                district_name = remove_prefix(district_name)
                if _REMOVE_DATA_BY_TEAMPLTE_FILE and district_name not in flat_districts:
                    continue
                # Insert district into province's district trie
                self.insert(self.district_data[province_name], district_name, district_name, 'district')
                # Insert district into all districts trie (for skip province)
                self.insert(self.all_districts_data, district_name, district_name, 'district')

                for abbreviation in self.abbreviations.get("district", {}).get(district_name, []):
                    self.insert(self.province_data, abbreviation, district_name, 'district')

                # Create a trie for wards in this district
                ward_key = (province_name, district_name)
                self.ward_data[ward_key] = self.TrieNode()

                # Insert wards into district's ward trie
                for ward_name in wards:
                    ward_name = remove_prefix(ward_name)
                    if _REMOVE_DATA_BY_TEAMPLTE_FILE and ward_name not in flat_wards:
                        continue
                    self.insert(self.ward_data[ward_key], ward_name, ward_name, 'ward')
                    # Insert ward into province's ward trie (skip district level)
                    self.insert(self.province_wards_data[province_name], ward_name, ward_name, 'ward')

                    for abbreviation in self.abbreviations.get("ward", {}).get(ward_name, []):
                        self.insert(self.province_data, abbreviation, ward_name, 'ward')

    def find_closest_string(self, root_node, word, level, deadline=None):
        # Remove prefixes from search word
        word = self.remove_prefixes(word, level).strip().lower()

        results = []
        max_distance = MAX_DISTANCE
        initial_row = list(range(len(word) + 1))

        def search_recursive(node, prefix, previous_row):
            nonlocal max_distance

            # early stop when timeout
            if deadline is not None and time.perf_counter() > deadline:
                return

            current_word = prefix

            if node.key_word and previous_row[-1] <= max_distance:
                results.append((node.key_word, node.exact_word, previous_row[-1]))
                max_distance = previous_row[-1]

            for char, child_node in node.childs.items():
                current_row = [previous_row[0] + 1]

                for i in range(1, len(word) + 1):
                    insert_cost = current_row[i - 1] + 1
                    delete_cost = previous_row[i] + 1
                    replace_cost = previous_row[i - 1] + (0 if word[i - 1] == char else 1)
                    current_row.append(min(insert_cost, delete_cost, replace_cost))

                if min(current_row) <= max_distance:
                    search_recursive(child_node, current_word + char, current_row)

        search_recursive(root_node, "", initial_row)
        results.sort(key=lambda x: x[2])

        dynamic_threshold = THRESHOLD - (len(word) / 100)
        if results:
            return results[0][1] if dynamic_threshold <= 1 - (results[0][2]/max(len(word), len(results[0][0]))) else ""
        return ""

    def normalize_vn_legacy_to_modern(self, text: str) -> str:
        if not text:
            return text
        for old, new in _MAPPING:
            text = text.replace(old, new)
        return text

    def sanitize_string(self, input_word):
        input_word = input_word.strip().lower()
        input_word = self.normalize_vn_legacy_to_modern(input_word)
        input_word = re.sub(r'[\s,.~-]', ' ', input_word)
        return input_word.strip()

    def slide_and_combine_with_indices(self, words):
        """List candidate phrases made of up to three adjacent words, right to left.

        For each end position, from the last word to the first, the words
        ending there are concatenated without a separator and paired with
        the index of the first word used. With three words available the
        order is: last two, first two, last one alone, all three.

        Returns:
            A list of ``(candidate, start_index)`` tuples.
        """
        combos = []

        for end in reversed(range(len(words))):
            last = words[end]
            if end >= 2:
                first, middle = words[end - 2], words[end - 1]
                combos += [
                    (f"{middle}{last}", end - 1),
                    (f"{first}{middle}", end - 2),
                    (last, end),
                    (f"{first}{middle}{last}", end - 2),
                ]
            elif end == 1:
                head = words[0]
                combos += [(f"{head}{last}", 0), (last, 1)]
            else:
                combos.append((last, 0))

        return combos

    def process(self, s: str):
        # write your process string here
        result = {
            "province": "",
            "district": "",
            "ward": ""
        }

        start_time = time.perf_counter()
        cleaned_input = self.sanitize_string(s).split()

        # Step 1: Find province
        province_candidates = self.slide_and_combine_with_indices(cleaned_input)
        matched_start_index = -1
        processed_province_candidates = set()

        for i, (candidate, start_index) in enumerate(province_candidates):
            if candidate in processed_province_candidates:
                continue
            processed_province_candidates.add(candidate)

            result["province"] = self.find_closest_string(self.province_data, candidate, 'province')
            if result["province"]:
                matched_start_index = start_index
                cleaned_input = cleaned_input[:matched_start_index]
                break

        # Step 2: Find district
        district_found = False
        if result["province"] and result["province"] in self.district_data:
            district_candidates = self.slide_and_combine_with_indices(cleaned_input)
            matched_start_index = -1
            processed_district_candidates = set()

            for i, (candidate, start_index) in enumerate(district_candidates):
                if candidate in processed_district_candidates:
                    continue
                processed_district_candidates.add(candidate)

                result["district"] = self.find_closest_string(
                    self.district_data[result["province"]],
                    candidate,
                    'district'
                )
                if result["district"]:
                    matched_start_index = start_index
                    cleaned_input = cleaned_input[:matched_start_index]
                    district_found = True
                    break

        if not result["province"]:
            district_candidates = self.slide_and_combine_with_indices(cleaned_input)
            matched_start_index = -1
            processed_district_candidates = set()
            deadline = time.perf_counter() + _WARD_FALLBACK_TIME_LIMIT_SEC

            for i, (candidate, start_index) in enumerate(district_candidates):
                if time.perf_counter() > deadline:
                    break
                if candidate in processed_district_candidates:
                    continue
                processed_district_candidates.add(candidate)

                result["district"] = self.find_closest_string(
                    self.all_districts_data,
                    candidate,
                    'district',
                    deadline=deadline
                )
                if result["district"]:
                    matched_start_index = start_index
                    cleaned_input = cleaned_input[:matched_start_index]
                    district_found = True
                    break

        # Step 3: Find ward
        if result["province"] and result["district"]:
            ward_key = (result["province"], result["district"])
            if ward_key in self.ward_data:
                ward_candidates = self.slide_and_combine_with_indices(cleaned_input)
                processed_ward_candidates = set()

                for i, (candidate, start_index) in enumerate(ward_candidates):
                    if candidate in processed_ward_candidates:
                        continue
                    processed_ward_candidates.add(candidate)

                    result["ward"] = self.find_closest_string(
                        self.ward_data[ward_key],
                        candidate,
                        'ward'
                    )
                    if result["ward"]:
                        break

        elif result["province"] and not result["district"]:
            if result["province"] in self.province_wards_data:
                ward_candidates = self.slide_and_combine_with_indices(cleaned_input)
                processed_ward_candidates = set()
                deadline = time.perf_counter() + _WARD_FALLBACK_TIME_LIMIT_SEC

                for i, (candidate, start_index) in enumerate(ward_candidates):
                    if time.perf_counter() > deadline:
                        break
                    if candidate in processed_ward_candidates:
                        continue
                    processed_ward_candidates.add(candidate)

                    result["ward"] = self.find_closest_string(
                        self.province_wards_data[result["province"]],
                        candidate,
                        'ward',
                        deadline=deadline
                    )
                    if result["ward"]:
                        break

        return result

# Run

In [None]:
# NOTE: DO NOT change this cell
!rm -rf test.json
# this link is public test
!gdown --fuzzy https://drive.google.com/file/d/1PBt3U9I3EH885CDhcXspebyKI5Vw6uLB/view?usp=sharing -O test.json

Downloading...
From: https://drive.google.com/uc?id=1PBt3U9I3EH885CDhcXspebyKI5Vw6uLB
To: /content/test.json
  0% 0.00/79.4k [00:00<?, ?B/s]100% 79.4k/79.4k [00:00<00:00, 58.4MB/s]


In [90]:
# CORRECT TESTS
groups_province = {}
groups_district = {'hòa bình': ['Hoà Bình', 'Hòa Bình'], 'kbang': ['Kbang', 'KBang'], 'quy nhơn': ['Qui Nhơn', 'Quy Nhơn']}
groups_ward = {'ái nghĩa': ['ái Nghĩa', 'Ái Nghĩa'], 'ái quốc': ['ái Quốc', 'Ái Quốc'], 'ái thượng': ['ái Thượng', 'Ái Thượng'], 'ái tử': ['ái Tử', 'Ái Tử'], 'ấm hạ': ['ấm Hạ', 'Ấm Hạ'], 'an ấp': ['An ấp', 'An Ấp'], 'ẳng cang': ['ẳng Cang', 'Ẳng Cang'], 'ẳng nưa': ['ẳng Nưa', 'Ẳng Nưa'], 'ẳng tở': ['ẳng Tở', 'Ẳng Tở'], 'an hòa': ['An Hoà', 'An Hòa'], 'ayun': ['Ayun', 'AYun'], 'bắc ái': ['Bắc ái', 'Bắc Ái'], 'bảo ái': ['Bảo ái', 'Bảo Ái'], 'bình hòa': ['Bình Hoà', 'Bình Hòa'], 'châu ổ': ['Châu ổ', 'Châu Ổ'], 'chư á': ['Chư á', 'Chư Á'], 'chư rcăm': ['Chư Rcăm', 'Chư RCăm'], 'cộng hòa': ['Cộng Hoà', 'Cộng Hòa'], 'cò nòi': ['Cò  Nòi', 'Cò Nòi'], 'đại ân 2': ['Đại Ân  2', 'Đại Ân 2'], 'đak ơ': ['Đak ơ', 'Đak Ơ'], "đạ m'ri": ["Đạ M'ri", "Đạ M'Ri"], 'đông hòa': ['Đông Hoà', 'Đông Hòa'], 'đồng ích': ['Đồng ích', 'Đồng Ích'], 'hải châu i': ['Hải Châu  I', 'Hải Châu I'], 'hải hòa': ['Hải Hoà', 'Hải Hòa'], 'hành tín đông': ['Hành Tín  Đông', 'Hành Tín Đông'], 'hiệp hòa': ['Hiệp Hoà', 'Hiệp Hòa'], 'hòa bắc': ['Hoà Bắc', 'Hòa Bắc'], 'hòa bình': ['Hoà Bình', 'Hòa Bình'], 'hòa châu': ['Hoà Châu', 'Hòa Châu'], 'hòa hải': ['Hoà Hải', 'Hòa Hải'], 'hòa hiệp trung': ['Hoà Hiệp Trung', 'Hòa Hiệp Trung'], 'hòa liên': ['Hoà Liên', 'Hòa Liên'], 'hòa lộc': ['Hoà Lộc', 'Hòa Lộc'], 'hòa lợi': ['Hoà Lợi', 'Hòa Lợi'], 'hòa long': ['Hoà Long', 'Hòa Long'], 'hòa mạc': ['Hoà Mạc', 'Hòa Mạc'], 'hòa minh': ['Hoà Minh', 'Hòa Minh'], 'hòa mỹ': ['Hoà Mỹ', 'Hòa Mỹ'], 'hòa phát': ['Hoà Phát', 'Hòa Phát'], 'hòa phong': ['Hoà Phong', 'Hòa Phong'], 'hòa phú': ['Hoà Phú', 'Hòa Phú'], 'hòa phước': ['Hoà Phước', 'Hòa Phước'], 'hòa sơn': ['Hoà Sơn', 'Hòa Sơn'], 'hòa tân': ['Hoà Tân', 'Hòa Tân'], 'hòa thuận': ['Hoà Thuận', 'Hòa Thuận'], 'hòa tiến': ['Hoà Tiến', 'Hòa Tiến'], 'hòa trạch': ['Hoà Trạch', 'Hòa Trạch'], 'hòa vinh': ['Hoà Vinh', 'Hòa Vinh'], 'hương hòa': ['Hương Hoà', 'Hương Hòa'], 'ích hậu': ['ích Hậu', 'Ích Hậu'], 'ít ong': ['ít Ong', 'Ít Ong'], 'khánh hòa': ['Khánh Hoà', 'Khánh Hòa'], 'krông á': 
['Krông Á', 'KRông á'], 'lộc hòa': ['Lộc Hoà', 'Lộc Hòa'], 'minh hòa': ['Minh Hoà', 'Minh Hòa'], 'mường ải': ['Mường ải', 'Mường Ải'], 'mường ẳng': ['Mường ẳng', 'Mường Ẳng'], 'nậm ét': ['Nậm ét', 'Nậm Ét'], 'nam hòa': ['Nam Hoà', 'Nam Hòa'], 'na ư': ['Na ư', 'Na Ư'], 'ngã sáu': ['Ngã sáu', 'Ngã Sáu'], 'nghi hòa': ['Nghi Hoà', 'Nghi Hòa'], 'nguyễn úy': ['Nguyễn Uý', 'Nguyễn úy', 'Nguyễn Úy'], 'nhân hòa': ['Nhân Hoà', 'Nhân Hòa'], 'nhơn hòa': ['Nhơn Hoà', 'Nhơn Hòa'], 'nhơn nghĩa a': ['Nhơn nghĩa A', 'Nhơn Nghĩa A'], 'phúc ứng': ['Phúc ứng', 'Phúc Ứng'], 'phước hòa': ['Phước Hoà', 'Phước Hòa'], 'sơn hóa': ['Sơn Hoá', 'Sơn Hóa'], 'tạ an khương đông': ['Tạ An Khương  Đông', 'Tạ An Khương Đông'], 'tạ an khương nam': ['Tạ An Khương  Nam', 'Tạ An Khương Nam'], 'tăng hòa': ['Tăng Hoà', 'Tăng Hòa'], 'tân hòa': ['Tân Hoà', 'Tân Hòa'], 'tân hòa thành': ['Tân Hòa  Thành', 'Tân Hòa Thành'], 'tân khánh trung': ['Tân  Khánh Trung', 'Tân Khánh Trung'], 'tân lợi': ['Tân lợi', 'Tân Lợi'], 'thái hòa': ['Thái Hoà', 'Thái Hòa'], 'thiết ống': ['Thiết ống', 'Thiết Ống'], 'thuận hòa': ['Thuận Hoà', 'Thuận Hòa'], 'thượng ấm': ['Thượng ấm', 'Thượng Ấm'], 'thụy hương': ['Thuỵ Hương', 'Thụy Hương'], 'thủy xuân': ['Thuỷ Xuân', 'Thủy Xuân'], 'tịnh ấn đông': ['Tịnh ấn Đông', 'Tịnh Ấn Đông'], 'tịnh ấn tây': ['Tịnh ấn Tây', 'Tịnh Ấn Tây'], 'triệu ái': ['Triệu ái', 'Triệu Ái'], 'triệu ẩu': ['Triệu ẩu', 'Triệu Ẩu'], 'trung hòa': ['Trung Hoà', 'Trung Hòa'], 'trung ý': ['Trung ý', 'Trung Ý'], 'tùng ảnh': ['Tùng ảnh', 'Tùng Ảnh'], 'úc kỳ': ['úc Kỳ', 'Úc Kỳ'], 'ứng hòe': ['ứng Hoè', 'Ứng Hoè'], 'vĩnh hòa': ['Vĩnh Hoà', 'Vĩnh Hòa'], 'vũ hòa': ['Vũ Hoà', 'Vũ Hòa'], 'xuân ái': ['Xuân ái', 'Xuân Ái'], 'xuân áng': ['Xuân áng', 'Xuân Áng'], 'xuân hòa': ['Xuân Hoà', 'Xuân Hòa'], 'xuất hóa': ['Xuất Hoá', 'Xuất Hóa'], 'ỷ la': ['ỷ La', 'Ỷ La']}
groups_ward.update({1: ['1', '01'], 2: ['2', '02'], 3: ['3', '03'], 4: ['4', '04'], 5: ['5', '05'], 6: ['6', '06'], 7: ['7', '07'], 8: ['8', '08'], 9: ['9', '09']})
def to_same(groups):
    """Invert ``{canonical: [variant, ...]}`` into ``{variant: canonical}``.

    If a variant is listed under several canonical keys, the last one wins.
    """
    lookup = {}
    for canonical, variants in groups.items():
        for variant in variants:
            lookup[variant] = canonical
    return lookup
same_province = to_same(groups_province)
same_district = to_same(groups_district)
same_ward = to_same(groups_ward)
def normalize(text, same_dict):
    """Return the canonical form of ``text`` from ``same_dict``, or ``text`` itself if unlisted."""
    if text in same_dict:
        return same_dict[text]
    return text

In [86]:
TEAM_NAME = 'HK251'
EXCEL_FILE = f'{TEAM_NAME}.xlsx'

import json
import time
with open('test.json') as f:
    data = json.load(f)

summary_only = False
df = []
solution = Solution()
timer = []
correct = 0
for test_idx, data_point in enumerate(data):
    address = data_point["text"]

    ok = 0
    try:
        answer = data_point["result"]
        answer["province_normalized"] = normalize(answer["province"], same_province)
        answer["district_normalized"] = normalize(answer["district"], same_district)
        answer["ward_normalized"] = normalize(answer["ward"], same_ward)

        start = time.perf_counter_ns()
        result = solution.process(address)
        finish = time.perf_counter_ns()
        timer.append(finish - start)
        result["province_normalized"] = normalize(result["province"], same_province)
        result["district_normalized"] = normalize(result["district"], same_district)
        result["ward_normalized"] = normalize(result["ward"], same_ward)

        province_correct = int(answer["province_normalized"] == result["province_normalized"])
        district_correct = int(answer["district_normalized"] == result["district_normalized"])
        ward_correct = int(answer["ward_normalized"] == result["ward_normalized"])
        ok = province_correct + district_correct + ward_correct

        #Print kqua
        if not summary_only and ok !=3:
            print(f"Case #{test_idx+1} ---")
            print(f"Input    : '{address}'")
            print(f"Expected : {answer}")
            print(f"Actual   : {result}")
            print(f"Correct  : {ok}/3")
            print(f"Time     : {timer[-1] / 1_000_000_000:.6f}s")
            print("-" * 30)

        df.append([
            test_idx,
            address,
            answer["province"],
            result["province"],
            answer["province_normalized"],
            result["province_normalized"],
            province_correct,
            answer["district"],
            result["district"],
            answer["district_normalized"],
            result["district_normalized"],
            district_correct,
            answer["ward"],
            result["ward"],
            answer["ward_normalized"],
            result["ward_normalized"],
            ward_correct,
            ok,
            timer[-1] / 1_000_000_000,
        ])
    except Exception as e:
        print(f"Case #{test_idx+1} with input: '{address}'")
        print(f"Exception: {e}")
        df.append([
            test_idx,
            address,
            answer.get("province", "N/A"), "EXCEPTION", answer.get("province_normalized", "N/A"), "EXCEPTION", 0,
            answer.get("district", "N/A"), "EXCEPTION", answer.get("district_normalized", "N/A"), "EXCEPTION", 0,
            answer.get("ward", "N/A"), "EXCEPTION", answer.get("ward_normalized", "N/A"), "EXCEPTION", 0,
            0, 0,
        ])
        pass
    correct += ok

print(f"Summary")
total = len(data) * 3
score_scale_10 = round(correct / total * 10, 2)
if len(timer) == 0:
    timer = [0]
max_time_sec = round(max(timer) / 1_000_000_000, 4)
avg_time_sec = round((sum(timer) / len(timer)) / 1_000_000_000, 4)

import pandas as pd

df2 = pd.DataFrame(
    [[correct, total, score_scale_10, max_time_sec, avg_time_sec]],
    columns=['correct', 'total', 'score / 10', 'max_time_sec', 'avg_time_sec',],
)


columns = [
    'ID', 'text',
    'province', 'province_student', 'province_normalized', 'province_student_normalized', 'province_correct',
    'district', 'district_student', 'district_normalized', 'district_student_normalized', 'district_correct',
    'ward', 'ward_student', 'ward_normalized', 'ward_student_normalized', 'ward_correct',
    'total_correct', 'time_sec',
]

df = pd.DataFrame(df)
df.columns = columns

print(f'{TEAM_NAME = }')
print(f'{EXCEL_FILE = }')
print(df2)

!pip install -q xlsxwriter
writer = pd.ExcelWriter(EXCEL_FILE, engine='xlsxwriter')
df2.to_excel(writer, index=False, sheet_name='summary')
df.to_excel(writer, index=False, sheet_name='details')
writer.close()

Case #1 ---
Input    : 'TT Tân Bình Huyện Yên Sơn, Tuyên Quang'
Expected : {'province': 'Tuyên Quang', 'district': 'Yên Sơn', 'ward': 'Tân Bình', 'province_normalized': 'Tuyên Quang', 'district_normalized': 'Yên Sơn', 'ward_normalized': 'Tân Bình'}
Actual   : {'province': 'Tuyên Quang', 'district': 'Yên Sơn', 'ward': '', 'province_normalized': 'Tuyên Quang', 'district_normalized': 'Yên Sơn', 'ward_normalized': ''}
Correct  : 2/3
Time     : 0.003079s
------------------------------
Case #2 ---
Input    : '357/28,Ng-T- Thuật,P1,Q3,TP.HồChíMinh.'
Expected : {'province': 'Hồ Chí Minh', 'district': '', 'ward': '', 'province_normalized': 'Hồ Chí Minh', 'district_normalized': '', 'ward_normalized': ''}
Actual   : {'province': 'Hồ Chí Minh', 'district': '', 'ward': '1', 'province_normalized': 'Hồ Chí Minh', 'district_normalized': '', 'ward_normalized': 1}
Correct  : 2/3
Time     : 0.006154s
------------------------------
Case #3 ---
Input    : '284DBis Ng Văn Giáo, P3, Mỹ Tho, T.Giang.'
Expecte