In [None]:
# NOTE: you CAN change this cell
# If you want to use your own database, download it here
!pip install -q gdown

import gdown

# (Google Drive URL, local file name) for every reference list, in download order.
reference_files = [
    # province
    ("https://drive.google.com/uc?id=1TVG3CmtPnpPfFA8O_EMOgZnDQNzH9wxH", "list_province.txt"),
    # district
    ("https://drive.google.com/uc?id=16LnTv_Ruybrxw8MfbEaWB7GPrj7y4pzA", "list_district.txt"),
    # ward
    ("https://drive.google.com/uc?id=1Zn3FTe8OAMQqlOZsF1xcy2Jy2cQkRpyU", "list_ward.txt"),
    # database
    ("https://drive.google.com/uc?id=1aI19MkiUM2CZ9XbU7ERffo2uG--lBr0v", "list_full_hierarchy.txt"),
]

for drive_url, local_name in reference_files:
    last_saved = gdown.download(drive_url, local_name, quiet=False)

# Keep the value of the final download as the cell result, as before.
last_saved

Downloading...
From: https://drive.google.com/uc?id=1TVG3CmtPnpPfFA8O_EMOgZnDQNzH9wxH
To: /content/list_province.txt
100%|██████████| 749/749 [00:00<00:00, 2.60MB/s]
Downloading...
From: https://drive.google.com/uc?id=16LnTv_Ruybrxw8MfbEaWB7GPrj7y4pzA
To: /content/list_district.txt
100%|██████████| 7.75k/7.75k [00:00<00:00, 17.1MB/s]
Downloading...
From: https://drive.google.com/uc?id=1Zn3FTe8OAMQqlOZsF1xcy2Jy2cQkRpyU
To: /content/list_ward.txt
100%|██████████| 88.3k/88.3k [00:00<00:00, 41.7MB/s]
Downloading...
From: https://drive.google.com/uc?id=1aI19MkiUM2CZ9XbU7ERffo2uG--lBr0v
To: /content/list_full_hierarchy.txt
100%|██████████| 571k/571k [00:00<00:00, 102MB/s]


'list_full_hierarchy.txt'

In [None]:
# NOTE: you CAN change this cell
# Add more to your needs
# you must place ALL pip install here
!pip install editdistance



In [None]:
# NOTE: you CAN change this cell
# import your library here
import re, csv, unicodedata, time
from collections import defaultdict

In [None]:
# NOTE: you MUST change this cell
# New methods / functions must be written under class Solution.
# ================================================================
class Solution:
    # =========================
    # ======= PUBLIC API ======
    # =========================
    def __init__(self):
        # fixed file names (provided by the grader)
        self.province_path  = 'list_province.txt'
        self.district_path  = 'list_district.txt'
        self.ward_path      = 'list_ward.txt'
        self.hierarchy_path = 'list_full_hierarchy.txt'

        # 1) load hierarchy (province -> district -> [wards])
        self.address_hierarchy = self._build_address_hierarchy(self.hierarchy_path)

        # 2) auto-generate abbreviations (no JSON needed)
        self.auto_abbr = self._auto_generate_abbreviations(self.address_hierarchy)

        # 3) load base lists for output normalization
        self.provinces_raw = self._load_list(self.province_path)
        self.districts_raw = self._load_list(self.district_path)
        self.wards_raw     = self._load_list(self.ward_path)

        # 4) OCR pairs
        self._build_ocr_maps()

        # 5) build context tries + norm→raw maps
        (self.trie_prov, self.prov_norm2raw,
         self.trie_dist_by_prov, self.dist_norm2raw_by_prov,
         self.trie_ward_by_dist, self.ward_norm2raw_by_dist) = self._build_context_tries_v2(
            self.address_hierarchy, self.auto_abbr
        )

    def process(self, s: str):
        """
        Parse one free-form address string → {'province','district','ward'}
        Pipeline:
        - normalize + expand abbrev + OCR-fix
        - detect province (trie)
        - detect district theo province (trie)
        - detect ward theo (province,district) (trie)
        - fallback tìm theo substring trong hierarchy nếu thiếu
        - guard bằng anchors "phuong N" để tránh chọn ward sai
        - ưu tiên ward nằm TRƯỚC district trong chuỗi (thứ tự VN)
        """
        texts = self._normalize_pipeline(s)
        exp = texts[1]
        anchors = self._extract_anchors(exp)

        # 1) province
        prov_cands = self._candidates_with_positions(texts, self.trie_prov, self.prov_norm2raw, level="prov")
        province = prov_cands[0][0] if prov_cands else ""
        pkey = self._n_base(province) if province else ""

        # 2) district by province (context)
        district, dist_pos = "", None
        if pkey and pkey in self.trie_dist_by_prov and pkey in self.dist_norm2raw_by_prov:
            dist_cands = self._candidates_with_positions(texts, self.trie_dist_by_prov[pkey],
                                                         self.dist_norm2raw_by_prov[pkey], level="dist")
            if dist_cands:
                # Avoid pick district = province (TP Tuyên Quang vs Tỉnh Tuyên Quang) when label TP does not exist
                best = dist_cands[0]
                for c in dist_cands:
                    if self._n_base(c[0]) != pkey:
                        best = c
                        break
                district, _, st, en = best
                dist_pos = (st, en)

        dkey = self._n_base(district) if district else ""

        # 3) ward theo (province,district) + prefer appear before district
        ward = ""
        if pkey and dkey and (pkey, dkey) in self.trie_ward_by_dist and (pkey, dkey) in self.ward_norm2raw_by_dist:
            ward_cands = self._candidates_with_positions(texts,
                                                         self.trie_ward_by_dist[(pkey, dkey)],
                                                         self.ward_norm2raw_by_dist[(pkey, dkey)],
                                                         level="ward")
            if dist_pos:
                before = [c for c in ward_cands if c[3] < dist_pos[0]]  # c = (raw, score, st, en)
                ward = before[0][0] if before else (ward_cands[0][0] if ward_cands else "")
            elif ward_cands:
                ward = ward_cands[0][0]

            # Anchor-guard:
            wn_set = set(self.ward_norm2raw_by_dist[(pkey, dkey)].values())
            if anchors.get('ward_numbers'):
                nums = [re.sub(r'^0+', '', x) for x in anchors['ward_numbers']]
                if not any(n in wn_set for n in nums):
                    if re.fullmatch(r'\d{1,2}', self._n_base(ward)):
                        ward = ""

        # 4) fallback substring (missing p/d)
        if not (province and district):
            p2, d2, w2 = self._hier_lookup(' '.join(texts))
            if not province: province = p2
            if not district: district = d2
            if not ward: ward = w2

        return {
            "province": province or "",
            "district": district or "",
            "ward":     ward or ""
        }

    # =========================
    # ===== DATA BUILDERS =====
    # =========================
    def _load_list(self, path):
        vals = []
        try:
            with open(path, encoding='utf-8') as f:
                for line in f:
                    t = line.strip()
                    if t: vals.append(t)
        except FileNotFoundError:
            pass
        return vals

    def _build_address_hierarchy(self, path):
        """ Parse dia_chi_loc.txt → { province_raw : { district_raw : [ward_raw...] } } """
        hierarchy = defaultdict(lambda: defaultdict(list))
        try:
            with open(path, encoding='utf-8') as f:
                reader = csv.reader(f)
                for row in reader:
                    if len(row) < 3:
                        continue
                    w, d, p = [x.strip() for x in row[:3]]
                    hierarchy[p][d].append(w)
        except FileNotFoundError:
            pass
        return hierarchy

    def _auto_generate_abbreviations(self, hier):
        """Sinh alias đơn giản theo pattern + city aliases"""
        abbr = {"province": {}, "district": {}, "ward": {}}
        for p, dmap in hier.items():
            abbr["province"][p] = self._make_abbr(p)
            for d, wards in dmap.items():
                abbr["district"][d] = self._make_abbr(d)
                for w in wards:
                    abbr["ward"][w] = self._make_abbr(w)
        # city aliases
        abbr["province"].setdefault("Thành phố Hồ Chí Minh", []).extend(["hcm", "tphcm", "tp hcm"])
        abbr["province"].setdefault("Thành phố Hà Nội", []).extend(["hn", "tp ha noi", "tphn"])
        # dedup
        for lvl in abbr:
            for k, v in abbr[lvl].items():
                abbr[lvl][k] = sorted(set(v))
        return abbr

    def _make_abbr(self, name):
        n = self._n_base(self._strip_level_label(name))
        out = []
        # province-level
        if re.match(r'^thanh pho\b', n):
            core = n.replace('thanh pho ', '')
            out += [f'tp {core}', f'tp{core.replace(" ","")}']
        if re.match(r'^tinh\b', n):
            core = n.replace('tinh ', '')
            out += [f't {core}', f'tinh {core}']
        # district-level
        if re.match(r'^quan\b', n):
            core = n.replace('quan ', '')
            out += [f'q {core}', f'q{core.replace(" ","")}', f'quan {core}']
        if re.match(r'^huyen\b', n):
            core = n.replace('huyen ', '')
            out += [f'h {core}', f'h{core.replace(" ","")}', f'huyen {core}']
        if re.match(r'^thi xa\b', n):
            core = n.replace('thi xa ', '')
            out += [f'tx {core}', f'tx{core.replace(" ","")}']
        if re.match(r'^thi tran\b', n):
            core = n.replace('thi tran ', '')
            out += [f'tt {core}', f'tt{core.replace(" ","")}']
        # ward-level
        if re.match(r'^phuong\b', n):
            core = n.replace('phuong ', '')
            if re.fullmatch(r'\d{1,2}', core):
                out += [f'p {core}', f'p0{core}' if len(core)==1 else f'p{core}', f'phuong {core}']
            else:
                out += [f'p {core}', f'p{core.replace(" ","")}', f'phuong {core}']
        if re.match(r'^xa\b', n):
            core = n.replace('xa ', '')
            out += [f'x {core}', f'x{core.replace(" ","")}']
        return out

    # =========================
    # ===== NORMALIZATION =====
    # =========================
    def _nfd_strip(self, s):
        s = unicodedata.normalize("NFD", s)
        s = ''.join(ch for ch in s if unicodedata.category(ch) != 'Mn')
        s = s.replace('đ','d').replace('Đ','D')
        return unicodedata.normalize("NFC", s)

    def _n_base(self, s):
        s = unicodedata.normalize("NFKC", s)
        s = self._nfd_strip(s).lower()
        s = re.sub(r'[^a-z0-9]+', ' ', s)
        s = re.sub(r'\s+', ' ', s).strip()
        return s

    def _strip_level_label(self, s):
        s0 = s.strip()
        s0 = re.sub(r'^(Tỉnh|Thành\s*phố)\s+', '', s0, flags=re.I)
        s0 = re.sub(r'^(Quận|Huyện|Thị\s*xã|Thành\s*phố)\s+', '', s0, flags=re.I)
        s0 = re.sub(r'^(Phường|Xã|Thị\s*trấn)\s+', '', s0, flags=re.I)
        return s0.strip()

    def _expand_text_abbrev(self, s_norm):
        toks = s_norm.split()
        rep = {'tp':'thanh pho','q':'quan','p':'phuong','tx':'thi xa','tt':'thi tran','h':'huyen','x':'xa'}
        for i, t in enumerate(toks):
            if t in rep: toks[i] = rep[t]
        x = ' '.join(toks)
        # q10 / p05 / h3...
        x = re.sub(r'\b(p|f)\s*0?(\d{1,2})\b', r'phuong \2', x)
        x = re.sub(r'\bq\s*0?(\d{1,2})\b', r'quan \1', x)
        x = re.sub(r'\bh\s*0?(\d{1,2})\b', r'huyen \1', x)
        return x

    # =========================
    # ======== OCR FIX ========
    # =========================
    def _build_ocr_maps(self):
        self.ocr_pairs = [('0','o'),('1','l'),('1','i'),('5','s'),('8','b')]

    def _apply_ocr_lite(self, s_norm):
        x = s_norm
        for a, b in self.ocr_pairs:
            x = re.sub(rf'(?<=\b){a}(?=\w)', b, x)
        return x

    # =========================
    # ========= TRIE ==========
    # =========================
    class _TrieNode:
        """One node of the character trie assembled by `_build_trie`."""
        # ch  : child nodes keyed by the next character
        # end : True when a stored word ends exactly at this node
        __slots__ = ("ch","end")
        def __init__(self):
            self.ch, self.end = {}, False

    def _build_trie(self, words):
        root = self._TrieNode()
        for w in words:
            cur = root
            for c in w:
                cur = cur.ch.setdefault(c, self._TrieNode())
            cur.end = True
        return root

    def _trie_scan_all(self, text, root):
        out, n = [], len(text)
        for i in range(n):
            cur, j, last_end = root, i, -1
            while j < n and text[j] in cur.ch:
                cur = cur.ch[text[j]]
                if cur.end:
                    last_end = j
                j += 1
            if last_end >= i:
                out.append((text[i:last_end+1], i, last_end))
        return out

    # =========================
    # == CONTEXT TRIES BUILD ==
    # =========================
    def _build_context_tries_v2(self, hier, abbr):
        # Province
        prov_items = []
        prov_norm2raw = {}
        for p in hier.keys():
            p_plain = self._strip_level_label(p)
            forms = [ self._n_base(p_plain) ] + [ self._n_base(x) for x in abbr["province"].get(p, []) ]
            for f in forms:
                prov_items.append(f)
                prov_norm2raw[f] = p_plain
        trie_prov = self._build_trie(prov_items)

        # District per province (key by normalized province)
        trie_dist, dist_norm2raw = {}, {}
        for p, dmap in hier.items():
            p_plain = self._strip_level_label(p)
            pkey = self._n_base(p_plain)
            items = []
            for d in dmap.keys():
                d_plain = self._strip_level_label(d)
                d_out = re.sub(r'^0+', '', d_plain) if re.fullmatch(r'\d{1,2}', self._n_base(d_plain)) else d_plain
                forms = [ self._n_base(d_plain) ] + [ self._n_base(x) for x in abbr["district"].get(d, []) ]
                if re.fullmatch(r'\d{1,2}', self._n_base(d_plain)):
                    forms.append(self._n_base(re.sub(r'^0+', '', d_plain)))
                for f in set(forms):
                    items.append(f)
                    dist_norm2raw.setdefault(pkey, {})[f] = d_out
            trie_dist[pkey] = self._build_trie(items) if items else self._build_trie([])

        # Ward per (province,district)
        trie_ward, ward_norm2raw = {}, {}
        for p, dmap in hier.items():
            p_plain = self._strip_level_label(p)
            pkey = self._n_base(p_plain)
            for d, wards in dmap.items():
                d_plain = self._strip_level_label(d)
                dkey = self._n_base(d_plain)
                items = []
                for w in wards:
                    w_plain = self._strip_level_label(w)
                    w_out = re.sub(r'^0+', '', w_plain) if re.fullmatch(r'\d{1,2}', self._n_base(w_plain)) else w_plain
                    forms = [ self._n_base(w_plain) ] + [ self._n_base(x) for x in abbr["ward"].get(w, []) ]
                    if re.fullmatch(r'\d{1,2}', self._n_base(w_plain)):
                        forms.append(self._n_base(re.sub(r'^0+', '', w_plain)))
                    for f in set(forms):
                        items.append(f)
                        ward_norm2raw.setdefault((pkey, dkey), {})[f] = w_out
                trie_ward[(pkey, dkey)] = self._build_trie(items) if items else self._build_trie([])
        return trie_prov, prov_norm2raw, trie_dist, dist_norm2raw, trie_ward, ward_norm2raw

    # =========================
    # ===== CANDIDATES + CTX ==
    # =========================
    def _normalize_pipeline(self, s):
        base = self._n_base(s)
        exp  = self._expand_text_abbrev(base)
        ocr  = self._apply_ocr_lite(exp)
        return [base, exp, ocr]

    def _label_score(self, level, left, right, token_norm):
        score = 0.0
        if level == "ward":
            if re.search(r'\b(p|phuong|x|xa|tt|thi tran)\b\s*$', left): score += 0.25
            if re.search(r'\b(q|quan|h|huyen|tx|thi xa|tp|thanh pho)\b\s*$', left): score -= 0.15
            if re.fullmatch(r'\d{1,2}', token_norm) and not (re.search(r'\b(p|phuong)\b', left) or re.search(r'\b(p|phuong)\b', right)):
                score -= 0.35
        elif level == "dist":
            if re.search(r'\b(q|quan|h|huyen|tx|thi xa|tp|thanh pho)\b\s*$', left): score += 0.22
            if re.search(r'\b(p|phuong|x|xa|tt|thi tran)\b\s*$', left): score -= 0.20
            if re.fullmatch(r'\d{1,2}', token_norm) and not (re.search(r'\b(q|quan|h|huyen)\b', left) or re.search(r'\b(q|quan|h|huyen)\b', right)):
                score -= 0.30
        elif level == "prov":
            if re.search(r'\b(tinh|thanh pho|tp)\b\s*$', left): score += 0.12
        return score

    def _candidates_with_positions(self, texts, trie, norm2raw, level):
        rows = []
        for t in texts:
            L = max(1, len(t))
            for h, st, en in self._trie_scan_all(t, trie):
                base_sc = 0.55 + len(h)/30.0 + 0.05*((en+1)/L)
                left = t[max(0, st-16):st]
                right = t[en+1:min(len(t), en+1+16)]
                sc = base_sc + self._label_score(level, left, right, h)
                if h in norm2raw:
                    rows.append((norm2raw[h], sc, st, en))
        rows.sort(key=lambda x: (-x[1], x[2]))  # score desc, then earlier position
        return rows

    def _extract_anchors(self, text_norm_expanded):
        anchors = {}
        anchors['ward_numbers'] = re.findall(r'\bphuong\s+0?(\d{1,2})\b', text_norm_expanded)
        anchors['district_numbers'] = re.findall(r'\b(quan|huyen)\s+0?(\d{1,2})\b', text_norm_expanded)
        return anchors

    # =========================
    # ====== FALLBACK LOOKUP ==
    # =========================
    def _hier_lookup(self, text):
        txt = self._expand_text_abbrev(self._n_base(text)).replace(" ", "")
        for p, dmap in self.address_hierarchy.items():
            p_plain = self._strip_level_label(p)
            pnorm = self._n_base(p_plain).replace(" ", "")
            if pnorm in txt:
                for d, wards in dmap.items():
                    d_plain = self._strip_level_label(d)
                    dnorm = self._n_base(d_plain).replace(" ", "")
                    if dnorm in txt:
                        for w in wards:
                            w_plain = self._strip_level_label(w)
                            wnorm = self._n_base(w_plain).replace(" ", "")
                            if wnorm in txt:
                                return p_plain, d_plain, w_plain
                        return p_plain, d_plain, ""
                return p_plain, "", ""
        return "", "", ""


In [None]:
# NOTE: DO NOT change this cell
!rm -rf test.json
# this link is public test
!gdown --fuzzy https://drive.google.com/file/d/1PBt3U9I3EH885CDhcXspebyKI5Vw6uLB/view?usp=sharing -O test.json

Downloading...
From: https://drive.google.com/uc?id=1PBt3U9I3EH885CDhcXspebyKI5Vw6uLB
To: /content/test.json
  0% 0.00/79.4k [00:00<?, ?B/s]100% 79.4k/79.4k [00:00<00:00, 34.8MB/s]


In [None]:
# CORRECT TESTS
groups_province = {}
groups_district = {'hòa bình': ['Hoà Bình', 'Hòa Bình'], 'kbang': ['Kbang', 'KBang'], 'quy nhơn': ['Qui Nhơn', 'Quy Nhơn']}
groups_ward = {'ái nghĩa': ['ái Nghĩa', 'Ái Nghĩa'], 'ái quốc': ['ái Quốc', 'Ái Quốc'], 'ái thượng': ['ái Thượng', 'Ái Thượng'], 'ái tử': ['ái Tử', 'Ái Tử'], 'ấm hạ': ['ấm Hạ', 'Ấm Hạ'], 'an ấp': ['An ấp', 'An Ấp'], 'ẳng cang': ['ẳng Cang', 'Ẳng Cang'], 'ẳng nưa': ['ẳng Nưa', 'Ẳng Nưa'], 'ẳng tở': ['ẳng Tở', 'Ẳng Tở'], 'an hòa': ['An Hoà', 'An Hòa'], 'ayun': ['Ayun', 'AYun'], 'bắc ái': ['Bắc ái', 'Bắc Ái'], 'bảo ái': ['Bảo ái', 'Bảo Ái'], 'bình hòa': ['Bình Hoà', 'Bình Hòa'], 'châu ổ': ['Châu ổ', 'Châu Ổ'], 'chư á': ['Chư á', 'Chư Á'], 'chư rcăm': ['Chư Rcăm', 'Chư RCăm'], 'cộng hòa': ['Cộng Hoà', 'Cộng Hòa'], 'cò nòi': ['Cò  Nòi', 'Cò Nòi'], 'đại ân 2': ['Đại Ân  2', 'Đại Ân 2'], 'đak ơ': ['Đak ơ', 'Đak Ơ'], "đạ m'ri": ["Đạ M'ri", "Đạ M'Ri"], 'đông hòa': ['Đông Hoà', 'Đông Hòa'], 'đồng ích': ['Đồng ích', 'Đồng Ích'], 'hải châu i': ['Hải Châu  I', 'Hải Châu I'], 'hải hòa': ['Hải Hoà', 'Hải Hòa'], 'hành tín đông': ['Hành Tín  Đông', 'Hành Tín Đông'], 'hiệp hòa': ['Hiệp Hoà', 'Hiệp Hòa'], 'hòa bắc': ['Hoà Bắc', 'Hòa Bắc'], 'hòa bình': ['Hoà Bình', 'Hòa Bình'], 'hòa châu': ['Hoà Châu', 'Hòa Châu'], 'hòa hải': ['Hoà Hải', 'Hòa Hải'], 'hòa hiệp trung': ['Hoà Hiệp Trung', 'Hòa Hiệp Trung'], 'hòa liên': ['Hoà Liên', 'Hòa Liên'], 'hòa lộc': ['Hoà Lộc', 'Hòa Lộc'], 'hòa lợi': ['Hoà Lợi', 'Hòa Lợi'], 'hòa long': ['Hoà Long', 'Hòa Long'], 'hòa mạc': ['Hoà Mạc', 'Hòa Mạc'], 'hòa minh': ['Hoà Minh', 'Hòa Minh'], 'hòa mỹ': ['Hoà Mỹ', 'Hòa Mỹ'], 'hòa phát': ['Hoà Phát', 'Hòa Phát'], 'hòa phong': ['Hoà Phong', 'Hòa Phong'], 'hòa phú': ['Hoà Phú', 'Hòa Phú'], 'hòa phước': ['Hoà Phước', 'Hòa Phước'], 'hòa sơn': ['Hoà Sơn', 'Hòa Sơn'], 'hòa tân': ['Hoà Tân', 'Hòa Tân'], 'hòa thuận': ['Hoà Thuận', 'Hòa Thuận'], 'hòa tiến': ['Hoà Tiến', 'Hòa Tiến'], 'hòa trạch': ['Hoà Trạch', 'Hòa Trạch'], 'hòa vinh': ['Hoà Vinh', 'Hòa Vinh'], 'hương hòa': ['Hương Hoà', 'Hương Hòa'], 'ích hậu': ['ích Hậu', 'Ích Hậu'], 'ít ong': ['ít Ong', 'Ít Ong'], 'khánh hòa': ['Khánh Hoà', 'Khánh Hòa'], 'krông á': 
['Krông Á', 'KRông á'], 'lộc hòa': ['Lộc Hoà', 'Lộc Hòa'], 'minh hòa': ['Minh Hoà', 'Minh Hòa'], 'mường ải': ['Mường ải', 'Mường Ải'], 'mường ẳng': ['Mường ẳng', 'Mường Ẳng'], 'nậm ét': ['Nậm ét', 'Nậm Ét'], 'nam hòa': ['Nam Hoà', 'Nam Hòa'], 'na ư': ['Na ư', 'Na Ư'], 'ngã sáu': ['Ngã sáu', 'Ngã Sáu'], 'nghi hòa': ['Nghi Hoà', 'Nghi Hòa'], 'nguyễn úy': ['Nguyễn Uý', 'Nguyễn úy', 'Nguyễn Úy'], 'nhân hòa': ['Nhân Hoà', 'Nhân Hòa'], 'nhơn hòa': ['Nhơn Hoà', 'Nhơn Hòa'], 'nhơn nghĩa a': ['Nhơn nghĩa A', 'Nhơn Nghĩa A'], 'phúc ứng': ['Phúc ứng', 'Phúc Ứng'], 'phước hòa': ['Phước Hoà', 'Phước Hòa'], 'sơn hóa': ['Sơn Hoá', 'Sơn Hóa'], 'tạ an khương đông': ['Tạ An Khương  Đông', 'Tạ An Khương Đông'], 'tạ an khương nam': ['Tạ An Khương  Nam', 'Tạ An Khương Nam'], 'tăng hòa': ['Tăng Hoà', 'Tăng Hòa'], 'tân hòa': ['Tân Hoà', 'Tân Hòa'], 'tân hòa thành': ['Tân Hòa  Thành', 'Tân Hòa Thành'], 'tân khánh trung': ['Tân  Khánh Trung', 'Tân Khánh Trung'], 'tân lợi': ['Tân lợi', 'Tân Lợi'], 'thái hòa': ['Thái Hoà', 'Thái Hòa'], 'thiết ống': ['Thiết ống', 'Thiết Ống'], 'thuận hòa': ['Thuận Hoà', 'Thuận Hòa'], 'thượng ấm': ['Thượng ấm', 'Thượng Ấm'], 'thụy hương': ['Thuỵ Hương', 'Thụy Hương'], 'thủy xuân': ['Thuỷ Xuân', 'Thủy Xuân'], 'tịnh ấn đông': ['Tịnh ấn Đông', 'Tịnh Ấn Đông'], 'tịnh ấn tây': ['Tịnh ấn Tây', 'Tịnh Ấn Tây'], 'triệu ái': ['Triệu ái', 'Triệu Ái'], 'triệu ẩu': ['Triệu ẩu', 'Triệu Ẩu'], 'trung hòa': ['Trung Hoà', 'Trung Hòa'], 'trung ý': ['Trung ý', 'Trung Ý'], 'tùng ảnh': ['Tùng ảnh', 'Tùng Ảnh'], 'úc kỳ': ['úc Kỳ', 'Úc Kỳ'], 'ứng hòe': ['ứng Hoè', 'Ứng Hoè'], 'vĩnh hòa': ['Vĩnh Hoà', 'Vĩnh Hòa'], 'vũ hòa': ['Vũ Hoà', 'Vũ Hòa'], 'xuân ái': ['Xuân ái', 'Xuân Ái'], 'xuân áng': ['Xuân áng', 'Xuân Áng'], 'xuân hòa': ['Xuân Hoà', 'Xuân Hòa'], 'xuất hóa': ['Xuất Hoá', 'Xuất Hóa'], 'ỷ la': ['ỷ La', 'Ỷ La']}
# Numbered wards: the plain and zero-padded spellings share one canonical key.
# The keys are ints, so normalize('1', same_ward) and normalize('01', same_ward)
# both return the int 1 rather than a string.
groups_ward.update({1: ['1', '01'], 2: ['2', '02'], 3: ['3', '03'], 4: ['4', '04'], 5: ['5', '05'], 6: ['6', '06'], 7: ['7', '07'], 8: ['8', '08'], 9: ['9', '09']})
def to_same(groups):
    """Invert {canonical: [variant, ...]} into {variant: canonical}."""
    same = {}
    for canonical, variants in groups.items():
        for variant in variants:
            same[variant] = canonical
    return same
# variant -> canonical lookup tables, one per administrative level
same_province, same_district, same_ward = (
    to_same(groups) for groups in (groups_province, groups_district, groups_ward)
)
def normalize(text, same_dict):
    """Return the canonical form of `text` from `same_dict`, or `text` itself when unlisted."""
    if text in same_dict:
        return same_dict[text]
    return text

In [None]:
TEAM_NAME = 'HK251'
EXCEL_FILE = f'{TEAM_NAME}.xlsx'

import json
import time
# Read the test set explicitly as UTF-8: it contains Vietnamese text, and the
# default text encoding of open() depends on the platform locale.
with open('test.json', encoding='utf-8') as f:
    data = json.load(f)

summary_only = True
df = []
solution = Solution()
timer = []
correct = 0
for test_idx, data_point in enumerate(data):
    address = data_point["text"]

    ok = 0
    # Per-test state is reset on every iteration so the failure branch below can
    # never report (or crash on) values left over from a previous test.
    result = None
    elapsed_ns = 0

    # The expected answer is prepared outside the try block: only failures of the
    # solution itself are scored as a failed test.
    answer = data_point["result"]
    answer["province_normalized"] = normalize(answer["province"], same_province)
    answer["district_normalized"] = normalize(answer["district"], same_district)
    answer["ward_normalized"] = normalize(answer["ward"], same_ward)

    try:
        start = time.perf_counter_ns()
        result = solution.process(address)
        finish = time.perf_counter_ns()
        elapsed_ns = finish - start
        timer.append(elapsed_ns)
        result["province_normalized"] = normalize(result["province"], same_province)
        result["district_normalized"] = normalize(result["district"], same_district)
        result["ward_normalized"] = normalize(result["ward"], same_ward)

        province_correct = int(answer["province_normalized"] == result["province_normalized"])
        district_correct = int(answer["district_normalized"] == result["district_normalized"])
        ward_correct = int(answer["ward_normalized"] == result["ward_normalized"])
        ok = province_correct + district_correct + ward_correct

        df.append([
            test_idx,
            address,
            answer["province"],
            result["province"],
            answer["province_normalized"],
            result["province_normalized"],
            province_correct,
            answer["district"],
            result["district"],
            answer["district_normalized"],
            result["district_normalized"],
            district_correct,
            answer["ward"],
            result["ward"],
            answer["ward_normalized"],
            result["ward_normalized"],
            ward_correct,
            ok,
            elapsed_ns / 1_000_000_000,
        ])
    except Exception as e:
        # Show what went wrong; `result` is None when process() itself raised.
        print(f"Test {test_idx} failed with {type(e).__name__}: {e}")
        print(f"{answer = }")
        print(f"{result = }")
        df.append([
            test_idx,
            address,
            answer["province"],
            "EXCEPTION",
            answer["province_normalized"],
            "EXCEPTION",
            0,
            answer["district"],
            "EXCEPTION",
            answer["district_normalized"],
            "EXCEPTION",
            0,
            answer["ward"],
            "EXCEPTION",
            answer["ward_normalized"],
            "EXCEPTION",
            0,
            0,
            0,
        ])
        # any failure counts as zero correct
    correct += ok


    if not summary_only:
        # responsive stuff
        print(f"Test {test_idx:5d}/{len(data):5d}")
        print(f"Correct: {ok}/3")
        # elapsed_ns belongs to this test (0 when process() raised), unlike
        # timer[-1], which could be a previous test's time or not exist at all
        print(f"Time Executed: {elapsed_ns / 1_000_000_000:.4f}")


print("-" * 30)
total = len(data) * 3
# An empty test set scores 0 instead of raising ZeroDivisionError, matching the
# empty-timer guard below.
score_scale_10 = round(correct / total * 10, 2) if total else 0.0
if len(timer) == 0:
    timer = [0]
# timings were collected in nanoseconds; report seconds
max_time_sec = round(max(timer) / 1_000_000_000, 4)
avg_time_sec = round((sum(timer) / len(timer)) / 1_000_000_000, 4)

import pandas as pd

# One-row summary sheet.
df2 = pd.DataFrame(
    [[correct, total, score_scale_10, max_time_sec, avg_time_sec]],
    columns=['correct', 'total', 'score / 10', 'max_time_sec', 'avg_time_sec',],
)

# Column order must match the rows appended in the evaluation loop.
columns = [
    'ID',
    'text',
    'province',
    'province_student',
    'province_normalized',
    'province_student_normalized',
    'province_correct',
    'district',
    'district_student',
    'district_normalized',
    'district_student_normalized',
    'district_correct',
    'ward',
    'ward_student',
    'ward_normalized',
    'ward_student_normalized',
    'ward_correct',
    'total_correct',
    'time_sec',
]

# Passing the columns to the constructor also works when no rows were collected;
# assigning df.columns afterwards raises a length mismatch on an empty frame.
df = pd.DataFrame(df, columns=columns)

print(f'{TEAM_NAME = }')
print(f'{EXCEL_FILE = }')
print(df2)

!pip install xlsxwriter
# The context manager saves and closes the workbook even if a sheet fails to write.
with pd.ExcelWriter(EXCEL_FILE, engine='xlsxwriter') as writer:
    df2.to_excel(writer, index=False, sheet_name='summary')
    df.to_excel(writer, index=False, sheet_name='details')

------------------------------
TEAM_NAME = 'HK251'
EXCEL_FILE = 'HK251.xlsx'
   correct  total  score / 10  max_time_sec  avg_time_sec
0      936   1350        6.93        0.0042        0.0005
