In [22]:
# NOTE: you CAN change this cell
# If you want to use your own database, download it here
# !gdown ...

In [23]:
# NOTE: you CAN change this cell
# Add more to your needs
# you must place ALL pip install here
#!pip install editdistance
!pip install gdown
!pip install pandas



In [24]:
# NOTE: you CAN change this cell
# import your library here
import re
import time
import unicodedata
from collections import defaultdict
from functools import lru_cache

In [25]:
# NOTE: you MUST change this cell
# New methods / functions must be written under class Solution.
debug = False

class Solution:
    def __init__(self):
        # Load Data Set
        self.province_path = 'list_province.txt'
        self.district_path = 'list_district.txt'
        self.ward_path = 'list_ward.txt'
        # Build BK-Tree
        self.province_bktree, self.province_map = self._build_bktree(self._load_data(self.province_path))
        self.district_bktree, self.district_map = self._build_bktree(self._load_data(self.district_path))
        self.ward_bktree, self.ward_map = self._build_bktree(self._load_data(self.ward_path))
        # Build Reference
        self.ref = self._build_reference("reference.txt")
        # Set up Prefix Map
        self.prefix_map = {
            "p": "ward",
            "q": "district",
            "t": "province",
            "x": "ward",
            "h": "district",
            "tt": "ward",
            "tx": "district",
            "tp": "province", # "district"
            "p.": "ward",
            "q.": "district",
            "t.": "province",
            "x.": "ward",
            "h.": "district",
            "tp.": "province", # "district"
            "xa": "ward",
            "xã": "ward",
            "tx.": "district",
            "tt.": "ward",
            "quan": "district",
            "quận": "district",
            "tinh": "province",
            "tỉnh": "province",
            "huyen": "district",
            "huyện": "district",
            "phuong": "ward",
            "phường": "ward",
            "thi xa": "district",
            "thị xã": "district",
            "thi tran": "ward",
            "thị trấn": "ward",
            "thanh pho": "province", # "district"
            "thành phố": "province" # "district"
        }

    def _load_data(self, file_path: str):
        with open(file_path, "r", encoding="utf-8") as f:
            return [line.strip() for line in f if line.strip()]

    def _build_reference(self, file_path: str):
        ref = {}
        with open(file_path, "r", encoding="utf-8") as f:
            for line in f:
                parts = [self._normalize(p) for p in line.split(",")]
                if len(parts) != 3:
                    continue
                ward, district, province = parts
                if province not in ref:
                    ref[province] = {}
                if district not in ref[province]:
                    ref[province][district] = set()
                ref[province][district].add(ward)
        return ref

    def _normalize(self, s: str) -> str:
        s = s.lower()
        #s = unidecode(s)
        s = re.sub(r'[\W_]+', ' ', s)
        return s.strip()

    @staticmethod
    def _levenshtein(a: str, b: str) -> int:
        if a == b:
            return 0
        if len(a) < len(b):
            a, b = b, a
        previous = list(range(len(b) + 1))
        for i, ca in enumerate(a, start=1):
            current = [i]
            for j, cb in enumerate(b, start=1):
                ins = previous[j] + 1
                dele = current[j-1] + 1
                rep = previous[j-1] + (0 if ca == cb else 1)
                current.append(min(ins, dele, rep))
            previous = current
        return previous[-1]

    @staticmethod
    def _lcs(s1, s2):
        N, M = len(s1)+1, len(s2)+1
        vec = [0]*M
        for i in range(N-2, -1, -1):
            tmp = [0]*M
            for j in range(M-2, -1, -1):
                tmp[j] = 1 + vec[j + 1] if s1[i] == s2[j] else max(vec[j], tmp[j + 1])
            vec = tmp
        return vec[0]

    class BKTree:
        """Burkhard-Keller tree for approximate string lookup.

        Each node is a ``(word, {distance: child_node})`` tuple; a child is
        filed under its distance to the parent word.
        """

        def __init__(self, distfn, lcsfn=None):
            """Create an empty tree.

            Args:
                distfn: ``distfn(a, b) -> int`` distance between two words.
                lcsfn: optional ``lcsfn(a, b) -> int`` used to compute the
                    overlap count reported by ``search``. When omitted,
                    ``Solution._lcs`` is used (the previous hard-wired
                    behaviour).
            """
            self.distfn = distfn
            self.lcsfn = lcsfn
            self.tree = None  # (word, {distance: child_node})

        def add(self, word):
            """Insert ``word``; a word already present is ignored."""
            if self.tree is None:
                self.tree = (word, {})
                return
            node = self.tree
            while True:
                w, children = node
                if word == w:
                    # Storing a duplicate would only make search() report the
                    # same word several times.
                    return
                d = self.distfn(word, w)
                if d in children:
                    node = children[d]
                else:
                    children[d] = (word, {})
                    return

        def search(self, word, maxdist):
            """Return every stored word within ``maxdist`` of ``word``.

            Returns:
                A list of ``(stored_word, overlap, similarity)`` tuples where
                ``overlap`` is the LCS length and ``similarity`` is
                ``1 - distance / max(len(word), len(stored_word))``.
            """
            if self.tree is None:
                return []
            lcs = self.lcsfn if self.lcsfn is not None else Solution._lcs
            results = []
            stack = [self.tree]
            while stack:
                w, children = stack.pop()
                d = self.distfn(word, w)
                if d <= maxdist:
                    # Two empty strings would otherwise divide by zero.
                    longest = max(len(word), len(w)) or 1
                    results.append((w, lcs(word, w), 1.0 - d / longest))
                # Only children whose key lies in [d - maxdist, d + maxdist]
                # are visited.
                low = d - maxdist
                high = d + maxdist
                stack.extend(child for k, child in children.items() if low <= k <= high)
            return results

    def _build_bktree(self, names):
        norm_to_orig = {}
        for name in names:
            n = self._normalize(name)
            # if duplicate normalized keys, keep the first (or could store list)
            if n not in norm_to_orig:
                norm_to_orig[n] = name
        bk = self.BKTree(Solution._levenshtein)
        for n in norm_to_orig:
            bk.add(n)
        return bk, norm_to_orig

    def _max_distance(self, L: int) -> int:
        if L <= 2:
            return 0
        if L <= 5:
            return 1
        return max(1, int(L * 0.25))

    def _detect_level(self, substr: str) -> tuple:
        prefix, level, num = None, None, None
        matches = []
        for pfx, lvl in self.prefix_map.items():
            candidate = self._normalize(substr[:len(pfx)])
            num = Solution._lcs(pfx, candidate)
            score = num/max(len(pfx), len(candidate))
            matches.append((pfx, lvl, num, score))
        prefix, level, num, _ = max(matches, key=lambda x: (x[3], x[2]))
        if debug: print(f"prefix matched: {prefix}")
        return prefix, level, num

    def _worker(self, substr: str) -> bool:
        maxdist = self._max_distance(max(1, len(substr.replace(' ', ''))))
        bias = 0.2
        prefix, level, num = self._detect_level(substr)
        match level:
            case "province":
                sub = substr[num:].strip()
                if not sub: return False
                if debug: print(f"sub: {sub}")
                start = time.perf_counter()
                results = self.province_bktree.search(sub, maxdist) if self.province_bktree is not None else []
                end = time.perf_counter()
                if debug: print(f"[{end-start:.4f}s] detected level: {level} - r: {results}")
                if results:
                    norm, num, score = max(results, key=lambda x: (x[2], x[1]))
                    if debug: print(f"best matched - norm: {norm}, num: {num}, score: {score}")
                    if score+bias > self.output["province"]["score"] or (score+bias == self.output["province"]["score"] and num > self.output["province"]["num"]):
                        orig = self.province_map.get(norm, "")
                        self.output["province"] = {'orig':orig,'norm':norm,'num':num,'score':score+bias}
                        if self.output["district"]["orig"] and self.output["district"]["norm"] not in self.ref[self.output["province"]["norm"]]:
                            self.output["district"] = {'orig':'','norm':'','num':0,'score':0}
                        if debug: print(f"output: {self.output}\n")
                        return True
            case "district":
                sub = substr[num:].strip()
                if not sub: return False
                if debug: print(f"sub: {sub}")
                start = time.perf_counter()
                results = self.district_bktree.search(sub, maxdist) if self.district_bktree is not None else []
                end = time.perf_counter()
                if debug: print(f"[{end-start:.4f}s] detected level: {level} - r: {results}")
                if results:
                    for res in sorted(results, key=lambda x: (x[2], x[1]), reverse=True):
                        norm, num, score = res
                        if debug: print(f"best matched - norm: {norm}, num: {num}, score: {score}")
                        if self.output["province"]["orig"] and norm not in self.ref[self.output["province"]["norm"]]: continue
                        if score+bias > self.output["district"]["score"] or (score+bias == self.output["district"]["score"] and num > self.output["district"]["num"]):
                            orig = self.district_map.get(norm, "")
                            self.output["district"] = {'orig':orig,'norm':norm,'num':num,'score':score+bias}
                            if self.output["ward"]["orig"] and all(self.output["ward"]["norm"] not in ws for ws in [province[self.output["district"]["norm"]] for province in self.ref.values() if self.output["district"]["norm"] in province]):
                                self.output["ward"] = {'orig':'','norm':'','num':0,'score':0}
                            if debug: print(f"output: {self.output}\n")
                            return True
            case "ward":
                sub = substr[num:].strip()
                if not sub: return False
                if debug: print(f"sub: {sub}")
                start = time.perf_counter()
                results = self.ward_bktree.search(sub, maxdist) if self.ward_bktree is not None else []
                end = time.perf_counter()
                if debug: print(f"[{end-start:.4f}s] detected level: {level} - r: {results}")
                if results:
                    for res in sorted(results, key=lambda x: (x[2], x[1]), reverse=True):
                        norm, num, score = res
                        if debug: print(f"best matched - norm: {norm}, num: {num}, score: {score}")
                        if self.output["province"]["orig"]:
                            if self.output["district"]["orig"]:
                                if norm not in self.ref[self.output["province"]["norm"]][self.output["district"]["norm"]]:
                                    continue
                            else:
                                if all(norm not in wards for wards in self.ref[self.output["province"]["norm"]].values()):
                                    continue
                        else:
                            if self.output["district"]["orig"]:
                                if all(norm not in province[self.output["district"]["norm"]] for province in self.ref.values() if self.output["district"]["norm"] in province):
                                    continue
                        if score+bias > self.output["ward"]["score"] or (score+bias == self.output["ward"]["score"] and num > self.output["ward"]["num"]):
                            orig = self.ward_map.get(norm, "")
                            self.output["ward"] = {'orig':orig,'norm':norm,'num':num,'score':score+bias}
                            if debug: print(f"output: {self.output}\n")
                            return True
        bias = 0
        # province
        start = time.perf_counter()
        results = self.province_bktree.search(substr, maxdist) if self.province_bktree is not None else []
        end = time.perf_counter()
        if debug: print(f"[{end-start:.4f}s] linear-scanned level: province - r: {results}")
        if results:
            norm, num, score = max(results, key=lambda x: (x[2], x[1]))
            if debug: print(f"best matched - norm: {norm}, num: {num}, score: {score}")
            if score+bias > self.output["province"]["score"] or (score+bias == self.output["province"]["score"] and num > self.output["province"]["num"]):
                orig = self.province_map.get(norm, "")
                self.output["province"] = {'orig':orig,'norm':norm,'num':num,'score':score+bias}
                if self.output["district"]["orig"] and self.output["district"]["norm"] not in self.ref[self.output["province"]["norm"]]:
                    self.output["district"] = {'orig':'','norm':'','num':0,'score':0}
                if debug: print(f"output: {self.output}\n")
                return True
        # district
        start = time.perf_counter()
        results = self.district_bktree.search(substr, maxdist) if self.district_bktree is not None else []
        end = time.perf_counter()
        if debug: print(f"[{end-start:.4f}s] linear-scanned level: district - r: {results}")
        if results:
            for res in sorted(results, key=lambda x: (x[2], x[1]), reverse=True):
                norm, num, score = res
                if debug: print(f"best matched - norm: {norm}, num: {num}, score: {score}")
                if self.output["province"]["orig"] and norm not in self.ref[self.output["province"]["norm"]]: continue
                if score+bias > self.output["district"]["score"] or (score+bias == self.output["district"]["score"] and num > self.output["district"]["num"]):
                    orig = self.district_map.get(norm, "")
                    self.output["district"] = {'orig':orig,'norm':norm,'num':num,'score':score+bias}
                    if self.output["ward"]["orig"] and all(self.output["ward"]["norm"] not in ws for ws in [province[self.output["district"]["norm"]] for province in self.ref.values() if self.output["district"]["norm"] in province]):
                        self.output["ward"] = {'orig':'','norm':'','num':0,'score':0}
                    if debug: print(f"output: {self.output}\n")
                    return True
        # ward
        start = time.perf_counter()
        results = self.ward_bktree.search(substr, maxdist) if self.ward_bktree is not None else []
        end = time.perf_counter()
        if debug: print(f"[{end-start:.4f}s] linear-scanned level: ward - r: {results}")
        if results:
            for res in sorted(results, key=lambda x: (x[2], x[1]), reverse=True):
                norm, num, score = res
                if debug: print(f"best matched - norm: {norm}, num: {num}, score: {score}")
                if self.output["province"]["orig"]:
                    if self.output["district"]["orig"]:
                        if norm not in self.ref[self.output["province"]["norm"]][self.output["district"]["norm"]]:
                            continue
                    else:
                        if all(norm not in wards for wards in self.ref[self.output["province"]["norm"]].values()):
                            continue
                else:
                    if self.output["district"]["orig"]:
                        if all(norm not in province[self.output["district"]["norm"]] for province in self.ref.values() if self.output["district"]["norm"] in province):
                            continue
                if score+bias > self.output["ward"]["score"] or (score+bias == self.output["ward"]["score"] and num > self.output["ward"]["num"]):
                    orig = self.ward_map.get(norm, "")
                    self.output["ward"] = {'orig':orig,'norm':norm,'num':num,'score':score+bias}
                    if debug: print(f"output: {self.output}\n")
                    return True
        return False

    def process(self, input: str) -> dict:
        start = time.perf_counter()
        self.output = {
            "province": {"orig": "", "norm": "", "num": 0, "score": 0},
            "district": {"orig": "", "norm": "", "num": 0, "score": 0},
            "ward": {"orig": "", "norm": "", "num": 0, "score": 0}
        }
        parts = [part for part in reversed(input.split(','))]
        for part in parts:
            norm_text = self._normalize(part)
            if debug: print(f"\nnorm: {norm_text}")
            tokens = norm_text.split()
            max_window = 4
            for i in reversed(range(len(tokens))):  # start from the last token
                for j in range(max(0, i-max_window+1), i+1):  # window backward
                    substr = ' '.join(tokens[j:i+1])
                    if debug: print(f"\ncandidate: ({j}, {i+1}) {substr}")
                    if (self._worker(substr)): break
        end = time.perf_counter()
        if debug: print(f"\nOverall Exec Time: {end-start:.4f}s")
        return {
            "province": self.output['province']['orig'],
            "district": self.output['district']['orig'],
            "ward": self.output['ward']['orig'],
        }

In [26]:
# NOTE: DO NOT change this cell
!rm -rf test.json
# this link is public test
!gdown --fuzzy 'https://drive.google.com/file/d/1PBt3U9I3EH885CDhcXspebyKI5Vw6uLB/view?usp=sharing' -O test.json

Downloading...
From: https://drive.google.com/uc?id=1PBt3U9I3EH885CDhcXspebyKI5Vw6uLB
To: /home/phuochtran/Desktop/address-classification/test.json
100%|███████████████████████████████████████| 79.4k/79.4k [00:00<00:00, 757kB/s]


In [27]:
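# CORRECT TESTS: spelling variants that are scored as the same answer.
# Each dict maps a canonical key to the list of spellings accepted for it.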
groups_province = {}
groups_district = {'hòa bình': ['Hoà Bình', 'Hòa Bình'], 'kbang': ['Kbang', 'KBang'], 'quy nhơn': ['Qui Nhơn', 'Quy Nhơn']}
groups_ward = {'ái nghĩa': ['ái Nghĩa', 'Ái Nghĩa'], 'ái quốc': ['ái Quốc', 'Ái Quốc'], 'ái thượng': ['ái Thượng', 'Ái Thượng'], 'ái tử': ['ái Tử', 'Ái Tử'], 'ấm hạ': ['ấm Hạ', 'Ấm Hạ'], 'an ấp': ['An ấp', 'An Ấp'], 'ẳng cang': ['ẳng Cang', 'Ẳng Cang'], 'ẳng nưa': ['ẳng Nưa', 'Ẳng Nưa'], 'ẳng tở': ['ẳng Tở', 'Ẳng Tở'], 'an hòa': ['An Hoà', 'An Hòa'], 'ayun': ['Ayun', 'AYun'], 'bắc ái': ['Bắc ái', 'Bắc Ái'], 'bảo ái': ['Bảo ái', 'Bảo Ái'], 'bình hòa': ['Bình Hoà', 'Bình Hòa'], 'châu ổ': ['Châu ổ', 'Châu Ổ'], 'chư á': ['Chư á', 'Chư Á'], 'chư rcăm': ['Chư Rcăm', 'Chư RCăm'], 'cộng hòa': ['Cộng Hoà', 'Cộng Hòa'], 'cò nòi': ['Cò  Nòi', 'Cò Nòi'], 'đại ân 2': ['Đại Ân  2', 'Đại Ân 2'], 'đak ơ': ['Đak ơ', 'Đak Ơ'], "đạ m'ri": ["Đạ M'ri", "Đạ M'Ri"], 'đông hòa': ['Đông Hoà', 'Đông Hòa'], 'đồng ích': ['Đồng ích', 'Đồng Ích'], 'hải châu i': ['Hải Châu  I', 'Hải Châu I'], 'hải hòa': ['Hải Hoà', 'Hải Hòa'], 'hành tín đông': ['Hành Tín  Đông', 'Hành Tín Đông'], 'hiệp hòa': ['Hiệp Hoà', 'Hiệp Hòa'], 'hòa bắc': ['Hoà Bắc', 'Hòa Bắc'], 'hòa bình': ['Hoà Bình', 'Hòa Bình'], 'hòa châu': ['Hoà Châu', 'Hòa Châu'], 'hòa hải': ['Hoà Hải', 'Hòa Hải'], 'hòa hiệp trung': ['Hoà Hiệp Trung', 'Hòa Hiệp Trung'], 'hòa liên': ['Hoà Liên', 'Hòa Liên'], 'hòa lộc': ['Hoà Lộc', 'Hòa Lộc'], 'hòa lợi': ['Hoà Lợi', 'Hòa Lợi'], 'hòa long': ['Hoà Long', 'Hòa Long'], 'hòa mạc': ['Hoà Mạc', 'Hòa Mạc'], 'hòa minh': ['Hoà Minh', 'Hòa Minh'], 'hòa mỹ': ['Hoà Mỹ', 'Hòa Mỹ'], 'hòa phát': ['Hoà Phát', 'Hòa Phát'], 'hòa phong': ['Hoà Phong', 'Hòa Phong'], 'hòa phú': ['Hoà Phú', 'Hòa Phú'], 'hòa phước': ['Hoà Phước', 'Hòa Phước'], 'hòa sơn': ['Hoà Sơn', 'Hòa Sơn'], 'hòa tân': ['Hoà Tân', 'Hòa Tân'], 'hòa thuận': ['Hoà Thuận', 'Hòa Thuận'], 'hòa tiến': ['Hoà Tiến', 'Hòa Tiến'], 'hòa trạch': ['Hoà Trạch', 'Hòa Trạch'], 'hòa vinh': ['Hoà Vinh', 'Hòa Vinh'], 'hương hòa': ['Hương Hoà', 'Hương Hòa'], 'ích hậu': ['ích Hậu', 'Ích Hậu'], 'ít ong': ['ít Ong', 'Ít Ong'], 'khánh hòa': ['Khánh Hoà', 'Khánh Hòa'], 'krông á': 
['Krông Á', 'KRông á'], 'lộc hòa': ['Lộc Hoà', 'Lộc Hòa'], 'minh hòa': ['Minh Hoà', 'Minh Hòa'], 'mường ải': ['Mường ải', 'Mường Ải'], 'mường ẳng': ['Mường ẳng', 'Mường Ẳng'], 'nậm ét': ['Nậm ét', 'Nậm Ét'], 'nam hòa': ['Nam Hoà', 'Nam Hòa'], 'na ư': ['Na ư', 'Na Ư'], 'ngã sáu': ['Ngã sáu', 'Ngã Sáu'], 'nghi hòa': ['Nghi Hoà', 'Nghi Hòa'], 'nguyễn úy': ['Nguyễn Uý', 'Nguyễn úy', 'Nguyễn Úy'], 'nhân hòa': ['Nhân Hoà', 'Nhân Hòa'], 'nhơn hòa': ['Nhơn Hoà', 'Nhơn Hòa'], 'nhơn nghĩa a': ['Nhơn nghĩa A', 'Nhơn Nghĩa A'], 'phúc ứng': ['Phúc ứng', 'Phúc Ứng'], 'phước hòa': ['Phước Hoà', 'Phước Hòa'], 'sơn hóa': ['Sơn Hoá', 'Sơn Hóa'], 'tạ an khương đông': ['Tạ An Khương  Đông', 'Tạ An Khương Đông'], 'tạ an khương nam': ['Tạ An Khương  Nam', 'Tạ An Khương Nam'], 'tăng hòa': ['Tăng Hoà', 'Tăng Hòa'], 'tân hòa': ['Tân Hoà', 'Tân Hòa'], 'tân hòa thành': ['Tân Hòa  Thành', 'Tân Hòa Thành'], 'tân khánh trung': ['Tân  Khánh Trung', 'Tân Khánh Trung'], 'tân lợi': ['Tân lợi', 'Tân Lợi'], 'thái hòa': ['Thái Hoà', 'Thái Hòa'], 'thiết ống': ['Thiết ống', 'Thiết Ống'], 'thuận hòa': ['Thuận Hoà', 'Thuận Hòa'], 'thượng ấm': ['Thượng ấm', 'Thượng Ấm'], 'thụy hương': ['Thuỵ Hương', 'Thụy Hương'], 'thủy xuân': ['Thuỷ Xuân', 'Thủy Xuân'], 'tịnh ấn đông': ['Tịnh ấn Đông', 'Tịnh Ấn Đông'], 'tịnh ấn tây': ['Tịnh ấn Tây', 'Tịnh Ấn Tây'], 'triệu ái': ['Triệu ái', 'Triệu Ái'], 'triệu ẩu': ['Triệu ẩu', 'Triệu Ẩu'], 'trung hòa': ['Trung Hoà', 'Trung Hòa'], 'trung ý': ['Trung ý', 'Trung Ý'], 'tùng ảnh': ['Tùng ảnh', 'Tùng Ảnh'], 'úc kỳ': ['úc Kỳ', 'Úc Kỳ'], 'ứng hòe': ['ứng Hoè', 'Ứng Hoè'], 'vĩnh hòa': ['Vĩnh Hoà', 'Vĩnh Hòa'], 'vũ hòa': ['Vũ Hoà', 'Vũ Hòa'], 'xuân ái': ['Xuân ái', 'Xuân Ái'], 'xuân áng': ['Xuân áng', 'Xuân Áng'], 'xuân hòa': ['Xuân Hoà', 'Xuân Hòa'], 'xuất hóa': ['Xuất Hoá', 'Xuất Hóa'], 'ỷ la': ['ỷ La', 'Ỷ La']}
# Single-digit ward numbers: the spellings with and without a leading zero
# share one integer key.
groups_ward.update({1: ['1', '01'], 2: ['2', '02'], 3: ['3', '03'], 4: ['4', '04'], 5: ['5', '05'], 6: ['6', '06'], 7: ['7', '07'], 8: ['8', '08'], 9: ['9', '09']})
def to_same(groups):
    """Invert ``groups`` (key -> list of variants) into a variant -> key dict.

    When a variant is listed under several keys, the last key wins.
    """
    same = {}
    for canonical, variants in groups.items():
        for variant in variants:
            same[variant] = canonical
    return same
# Flat variant -> canonical-key lookup tables, one per administrative level.
same_province = to_same(groups_province)
same_district = to_same(groups_district)
same_ward = to_same(groups_ward)
def normalize(text, same_dict):
    """Return the canonical key stored for ``text`` in ``same_dict``, or ``text`` itself if absent."""
    if text in same_dict:
        return same_dict[text]
    return text

In [28]:
TEAM_NAME = 'HK251'
EXCEL_FILE = f'{TEAM_NAME}.xlsx'

import json
import time
# The test set contains Vietnamese text. Without an explicit encoding, open()
# uses the platform default, which is not UTF-8 everywhere.
with open('test.json', encoding='utf-8') as f:
    data = json.load(f)

summary_only = True   # set to False to print a per-test report
df = []               # one result row per test case (becomes a DataFrame later)
solution = Solution()
timer = []            # per-test execution time in nanoseconds
correct = 0           # number of correctly predicted fields so far
for test_idx, data_point in enumerate(data):
    address = data_point["text"]

    ok = 0
    # Reset per-test state before the try block so the except branch never
    # hits an undefined name or reports values left over from the previous test.
    answer = data_point.get("result", {})
    result = None
    elapsed_sec = 0.0
    try:
        answer["province_normalized"] = normalize(answer["province"], same_province)
        answer["district_normalized"] = normalize(answer["district"], same_district)
        answer["ward_normalized"] = normalize(answer["ward"], same_ward)

        start = time.perf_counter_ns()
        result = solution.process(address)
        finish = time.perf_counter_ns()
        timer.append(finish - start)
        elapsed_sec = timer[-1] / 1_000_000_000
        result["province_normalized"] = normalize(result["province"], same_province)
        result["district_normalized"] = normalize(result["district"], same_district)
        result["ward_normalized"] = normalize(result["ward"], same_ward)

        province_correct = int(answer["province_normalized"] == result["province_normalized"])
        district_correct = int(answer["district_normalized"] == result["district_normalized"])
        ward_correct = int(answer["ward_normalized"] == result["ward_normalized"])
        ok = province_correct + district_correct + ward_correct

        df.append([
            test_idx,
            address,
            answer["province"],
            result["province"],
            answer["province_normalized"],
            result["province_normalized"],
            province_correct,
            answer["district"],
            result["district"],
            answer["district_normalized"],
            result["district_normalized"],
            district_correct,
            answer["ward"],
            result["ward"],
            answer["ward_normalized"],
            result["ward_normalized"],
            ward_correct,
            ok,
            elapsed_sec,
        ])
    except Exception as e:
        # Any failure counts as zero correct fields for this test, but the
        # cause is reported instead of being silently dropped.
        ok = 0
        elapsed_sec = 0.0
        print(f"Test {test_idx} raised {type(e).__name__}: {e}")
        print(f"{answer = }")
        print(f"{result = }")
        df.append([
            test_idx,
            address,
            answer.get("province", ""),
            "EXCEPTION",
            answer.get("province_normalized", ""),
            "EXCEPTION",
            0,
            answer.get("district", ""),
            "EXCEPTION",
            answer.get("district_normalized", ""),
            "EXCEPTION",
            0,
            answer.get("ward", ""),
            "EXCEPTION",
            answer.get("ward_normalized", ""),
            "EXCEPTION",
            0,
            0,
            0,
        ])
    correct += ok


    if not summary_only:
        # responsive stuff
        print(f"Test {test_idx:5d}/{len(data):5d}")
        print(f"Correct: {ok}/3")
        print(f"Time Executed: {elapsed_sec:.4f}")


print(f"-"*30)
total = len(data) * 3
score_scale_10 = round(correct / total * 10, 2)
if len(timer) == 0:
    timer = [0]
max_time_sec = round(max(timer) / 1_000_000_000, 4)
avg_time_sec = round((sum(timer) / len(timer)) / 1_000_000_000, 4)

import pandas as pd

df2 = pd.DataFrame(
    [[correct, total, score_scale_10, max_time_sec, avg_time_sec]],
    columns=['correct', 'total', 'score / 10', 'max_time_sec', 'avg_time_sec',],
)

columns = [
    'ID',
    'text',
    'province',
    'province_student',
    'province_normalized',
    'province_student_normalized',
    'province_correct',
    'district',
    'district_student',
    'district_normalized',
    'district_student_normalized',
    'district_correct',
    'ward',
    'ward_student',
    'ward_normalized',
    'ward_student_normalized',
    'ward_correct',
    'total_correct',
    'time_sec',
]

df = pd.DataFrame(df)
df.columns = columns

print(f'{TEAM_NAME = }')
print(f'{EXCEL_FILE = }')
print(df2)

!pip install xlsxwriter
writer = pd.ExcelWriter(EXCEL_FILE, engine='xlsxwriter')
df2.to_excel(writer, index=False, sheet_name='summary')
df.to_excel(writer, index=False, sheet_name='details')
writer.close()

------------------------------
TEAM_NAME = 'HK251'
EXCEL_FILE = 'HK251.xlsx'
   correct  total  score / 10  max_time_sec  avg_time_sec
0     1106   1350        8.19        0.7363        0.1916
