# n-gram test

In [1]:
import json
import random
import time
import matplotlib.pyplot as plt

## Import administrative region data

In [2]:
# The region files hold Vietnamese names, so decode them explicitly as UTF-8
# rather than relying on the platform's default text encoding.
with open("./Data/region_tree.json", 'r', encoding='utf-8') as file:
    region_tree = json.load(file)

with open("./Data/reverse_index_dist_to_prov.json", 'r', encoding='utf-8') as file:
    rev_index_dist_to_prov = json.load(file)

with open("./Data/reverse_index_ward_to_prov.json", 'r', encoding='utf-8') as file:
    rev_index_ward_to_prov = json.load(file)

with open("./Data/reverse_index_ward_to_dist.json", 'r', encoding='utf-8') as file:
    rev_index_ward_to_dist = json.load(file)

In [3]:
# Flat name lists, one per administrative level, read from the 'name' field
# of every entry in the corresponding loaded mapping.
all_provinces = [prov_info['name'] for prov_info in region_tree.values()]
all_districts = [dist_info['name'] for dist_info in rev_index_dist_to_prov.values()]
all_wards = [ward_info['name'] for ward_info in rev_index_ward_to_prov.values()]

## Build dictionary mapping digram to region name

In [4]:
def digram_from_string(string: str) -> list:
    """Return the list of two-character digrams derived from `string`.

    The input is stripped of surrounding whitespace and lower-cased first.
    For an input of two or more characters the result contains, in order:

    1. the first character followed by the last character,
    2. every pair of adjacent characters, left to right,
    3. a space followed by the first character.

    A one-character input yields a single digram: that character followed
    by a space. An empty or whitespace-only input yields an empty list.
    """
    string = string.strip().lower()
    if not string:
        # Nothing to index; indexing string[0] below would raise IndexError.
        return []
    if len(string) == 1:
        return [string + ' ']

    normal_digram = [left + right for left, right in zip(string, string[1:])]
    head_tail_digram = string[0] + string[-1]
    one_before_first_digram = ' ' + string[0]

    return [head_tail_digram, *normal_digram, one_before_first_digram]

In [5]:
# digram_to_region maps each digram to the sets of province / district / ward
# names that contain it in at least one of their whitespace-separated words.
digram_to_region = {}
digram_count = 0  # total number of digrams generated across all names
dupli_count = 0   # how many of those were already present in the index

# The three levels are indexed in the same way, so one loop handles them all.
for region_level, region_names in (('province', all_provinces),
                                   ('district', all_districts),
                                   ('ward', all_wards)):
    for region_name in region_names:
        for word in region_name.split():
            for dg in digram_from_string(word):
                digram_count += 1
                if dg in digram_to_region:
                    dupli_count += 1
                else:
                    digram_to_region[dg] = {
                        'province': set(),
                        'district': set(),
                        'ward': set()
                    }
                digram_to_region[dg][region_level].add(region_name)

In [26]:
# Index statistics: digrams generated, how many were repeats, and distinct keys.
print(digram_count, dupli_count)
print(len(digram_to_region))

# List the ward names indexed under each of a few hand-picked digrams.
digrams = ['dh', 'di', 'ic', 'ch', ' d']
for sample_digram in digrams:
    print(sample_digram, digram_to_region[sample_digram]['ward'])

75710 74394
1316
dh {'Vàng Danh', 'Tìa Dình', 'Dịch Vọng', 'Danh Thắng', 'Phước Dinh', 'Dịch Vọng Hậu', 'Băng A Drênh', 'Dĩnh Kế', 'Kinh Dinh', 'Hắc Dịch', 'Hạnh Dịch', 'Mai Dịch', 'Tân Dĩnh', 'Dĩnh Trì', 'Ba Dinh', 'Kim Dinh', 'Ea Dăh', 'Nậm Dịch'}
di {'Diễn Kỷ', 'Diên Thọ', 'Kinh Dinh', 'Láng Dài', 'Diễn Xuân', 'Diễn Thành', 'Đầm Dơi', 'Diễn Bích', 'Hoàng Diệu', 'Diên Toàn', 'Di Linh', 'Diễn Tân', 'Diễn Trường', 'A Dơi', 'Di Lăng', 'Phước Dinh', 'Diên Phú', 'Vũ Di', 'Minh Diệu', 'Ia Din', 'Diên Tân', 'Diễn Thái', 'Diễn Phú', 'Diễn Cát', 'Diên Đồng', 'Diễn Hùng', 'Di Trạch', 'Diên Xuân', 'Diên Khánh', 'Cát Dài', 'Diễn Mỹ', 'Diên Lạc', 'Diên Phước', 'Diễn Yên', 'Ba Dinh', 'Diễn Đồng', 'Diễn Ngọc', 'Diên Lâm', 'Diễn Châu', 'Thọ Diên', 'Quỳnh Diễn', 'Phúc Diễn', 'Diễn Nguyên', 'Nghi Diên', 'Diễn Lợi', 'Phú Diên', 'Diễn Lâm', 'Diễn Hoàng', 'Diên Điền', 'Diễn Hạnh', 'Diên Thạnh', 'Diễn Trung', 'Phú Diễn', 'Diễn Quảng', 'Diêu Trì', 'Diên Sơn', 'Trần Quang Diệu', 'Diên Bình', 'Diễn Hoa', 'Di

## Find region with digram

In [21]:
addrs = ['Ea. Knốp', 'AnE Minh', 'Phú Lươnz ']

def find_region_with_digram(addr):
    """Look up candidate region names for `addr` using the digram index.

    `addr` is split on whitespace. For each word, the candidates are the
    union of the names indexed under any of that word's digrams; digrams
    missing from `digram_to_region` contribute nothing. The final candidates
    for each level are the intersection of the per-word candidates, so a
    name must share at least one digram with every word.

    Returns a dict with keys 'province', 'district' and 'ward', each mapped
    to a set of names. An empty or whitespace-only `addr` yields three empty
    sets, so callers can always apply set operations to the result.
    """
    candidates = {'province': None, 'district': None, 'ward': None}

    for word in addr.strip().split():
        possible = {region_level: set() for region_level in candidates}

        for dg in digram_from_string(word):
            indexed = digram_to_region.get(dg)
            if indexed is None:
                continue
            for region_level in possible:
                # `possible` holds fresh sets, so updating them in place
                # never touches the sets stored in the index.
                possible[region_level] |= indexed[region_level]

        for region_level, names in possible.items():
            if candidates[region_level] is None:
                # First word: its candidates seed the running intersection.
                candidates[region_level] = names
            else:
                candidates[region_level] = candidates[region_level] & names

    return {
        region_level: (names if names is not None else set())
        for region_level, names in candidates.items()
    }

## Find region with digram within region constraint 

In [8]:
def get_list_of_region_with_constraints_recursive(result, province: dict, district: dict, ward: dict,
                                                  level: int, stop_level: int = 0) -> None:
    """Fill `result` with the regions allowed by the given constraints.

    Levels are numbered 3 = province, 2 = district, 1 = ward. Starting at
    `level`, the function fills `result[<level name>]` with a dict of region
    info dicts and then recurses one level down, stopping once the level is
    less than or equal to `stop_level` (or outside 1..3).

    `province`, `district` and `ward` are region info dicts or None:

    * a given constraint becomes the only entry at its level, and the
      level directly above is narrowed to the parents that contain it;
    * with no constraint, the level is filled with every child of the
      regions kept at the level above (or with `region_tree` itself for
      provinces).

    Calling with `level` 2 or 1 requires the level above to be present in
    `result` already. Nothing is returned; `result` is updated in place.

    NOTE(review): a ward constraint narrows the districts but not the
    provinces -- confirm whether that is intended.
    """
    if level <= stop_level:
        return
    if not (1 <= level <= 3):
        return

    if level == 3:
        if province is not None:
            result['province'] = {province['name']: province}
        else:
            result['province'] = region_tree

    elif level == 2:
        if district is not None:
            result['district'] = {district['name']: district}
            # Keep only the provinces that actually contain this district.
            result['province'] = {
                prov_name: prov_info
                for prov_name, prov_info in result['province'].items()
                if district in prov_info['districts'].values()
            }
        else:
            # Keyed by (province, district) so equal district names in
            # different provinces do not overwrite one another.
            result['district'] = {
                (prov['name'], dist['name']): dist
                for prov in result['province'].values()
                for dist in prov['districts'].values()
            }

    else:  # level == 1
        if ward is not None:
            result['ward'] = {ward['name']: ward}
            # Keep only the districts that actually contain this ward.
            result['district'] = {
                dist_name: dist_info
                for dist_name, dist_info in result['district'].items()
                if ward in dist_info['wards'].values()
            }
        else:
            result['ward'] = {
                (dist['name'], w['name']): w
                for dist in result['district'].values()
                for w in dist['wards'].values()
            }

    # Pass stop_level along; otherwise it would fall back to its default
    # after the first call and never take effect.
    get_list_of_region_with_constraints_recursive(result, province, district, ward,
                                                  level - 1, stop_level)


def get_list_of_region_with_constraints(province: dict, district: dict, ward: dict, stop_level: int = 0):
    """Return the names of the regions allowed by the given constraints.

    `province`, `district` and `ward` are region info dicts or None (no
    constraint at that level). The result is a dict with keys 'province',
    'district' and 'ward', each mapped to a set of region names.

    `stop_level` skips the lower levels: with 1 the wards are not computed,
    with 2 neither districts nor wards are. Skipped levels map to None.
    """
    result = {
        'province': None,
        'district': None,
        'ward': None
    }
    get_list_of_region_with_constraints_recursive(result, province, district, ward,
                                                  level=3, stop_level=stop_level)

    return {
        region_level: ({info['name'] for info in regions.values()}
                       if regions is not None else None)
        for region_level, regions in result.items()
    }

In [9]:
def find_region_with_digram_with_constraints(string: str, level:int, province: {}, district: {}, ward: {}):
    """Combine the digram lookup for `string` with the region constraints.

    `province`, `district` and `ward` may each be None. `level` selects the
    one level (3 = province, 2 = district, 1 = ward) whose constraint-based
    names are intersected with the digram matches; the other levels are
    returned exactly as the constraints produced them.
    """
    digram_matches = find_region_with_digram(string)
    allowed = get_list_of_region_with_constraints(province, district, ward)

    level_numbers = {'province': 3, 'district': 2, 'ward': 1}
    narrowed = {}
    for region_level, number in level_numbers.items():
        if number == level:
            narrowed[region_level] = digram_matches[region_level] & allowed[region_level]
        else:
            narrowed[region_level] = allowed[region_level]
    return narrowed

In [10]:
def _filter_children_by_name(parents, child_key, allowed_names, label):
    """Collect children of `parents` whose key is in `allowed_names`.

    `parents` maps names to region info dicts; `child_key` names the
    sub-dict of children inside each of them. The result is keyed by child
    name only, so a name found under several parents keeps the last one
    seen and a warning mentioning `label` is printed.
    """
    allowed = set(allowed_names)
    kept = {}
    for parent_info in parents.values():
        for child_name, child_info in parent_info[child_key].items():
            if child_name not in allowed:
                continue
            if child_name in kept:
                print('Warning: {} with same name found twice'.format(label))
            kept[child_name] = child_info
    return kept


def get_region_info_list_from_possibilities_with_constraints(
    possibilities: dict, level: int, province: dict, district: dict, ward: dict):
    """Turn candidate names into region info dicts at the requested level.

    Designed to be used on the result of
    find_region_with_digram_with_constraints.

    Works top-down from provinces (3) through districts (2) to wards (1)
    and stops at `level`. At each step, a given constraint (`province`,
    `district` or `ward`, each a region info dict or None) becomes the only
    entry; otherwise the children of the regions kept one level up are
    filtered down to the names in `possibilities[<level name>]`.

    Returns a dict mapping region name to region info for `level`.
    Prints the names kept at every level that had to be filtered.

    Raises:
        ValueError: if `level` is not 1, 2 or 3.
    """
    if level not in (1, 2, 3):
        raise ValueError(
            'level must be 1 (ward), 2 (district) or 3 (province), got {!r}'.format(level))

    current_level_region_list = {}
    for current_level in range(3, level - 1, -1):
        last_level_region_list = current_level_region_list
        current_level_region_list = {}

        if current_level == 3:
            if province is not None:
                current_level_region_list[province['name']] = province
            else:
                # Keep every possible province whose stored name is itself
                # one of the possibilities.
                wanted = set(possibilities['province'])
                for prov_key in possibilities['province']:
                    prov_info = region_tree[prov_key]
                    if prov_info['name'] in wanted:
                        current_level_region_list[prov_info['name']] = prov_info

                print('filtered prov: ', [r['name'] for r in current_level_region_list.values()])

        elif current_level == 2:
            if district is not None:
                current_level_region_list[district['name']] = district
            else:
                current_level_region_list = _filter_children_by_name(
                    last_level_region_list, 'districts', possibilities['district'], 'district')

                print('filtered dist: ', [r['name'] for r in current_level_region_list.values()])

        else:  # current_level == 1
            if ward is not None:
                current_level_region_list[ward['name']] = ward
            else:
                current_level_region_list = _filter_children_by_name(
                    last_level_region_list, 'wards', possibilities['ward'], 'ward')

                print('filtered ward: ', [r['name'] for r in current_level_region_list.values()])

    return current_level_region_list

## Test run

In [18]:
# Print the digrams for a two-word name and for a single lowercase word.
for sample_text in ('Mỹ Đức', 'dich'):
    print(digram_from_string(sample_text))

['mc', 'mỹ', 'ỹ ', ' đ', 'đứ', 'ức', ' m']
['dh', 'di', 'ic', 'ch', ' d']


In [None]:
# Unconstrained lookup: print the candidate count per level, then the candidates.
result = find_region_with_digram('dich')
for region_level in result:
    print(len(result[region_level]))
print(result)

In [None]:
# Benchmark: 100 unconstrained lookups, each on a randomly picked sample address.
start = time.perf_counter_ns()
for _ in range(100):
    choice = random.randrange(len(addrs))
    picked_addr = addrs[choice]
    result = find_region_with_digram(picked_addr)
runtime = time.perf_counter_ns() - start

# perf_counter_ns counts nanoseconds; report the total in milliseconds.
print(runtime / 1_000_000, 'ms')

In [None]:
# All regions inside one province: print the count per level, then the names.
result = get_list_of_region_with_constraints(province=region_tree['Hà Nội'],
                                             district=None,
                                             ward=None)
for region_level in result:
    print(len(result[region_level]))
print(result)

In [None]:
# Benchmark: 100 ward-level lookups of 'dich' constrained to one province.
start = time.perf_counter_ns()
for _ in range(100):
    find_region_with_digram_with_constraints(string='dich', level=1,
                                             province=region_tree['Hà Nội'],
                                             district=None,
                                             ward=None)
runtime = time.perf_counter_ns() - start

# perf_counter_ns counts nanoseconds; report the total in milliseconds.
print(runtime / 1_000_000, 'ms')

In [20]:
# Ward-level lookup of 'dich' within one province: counts per level, then the names.
result = find_region_with_digram_with_constraints(string='dich', level=1,
                                                  province=region_tree['Hà Nội'],
                                                  district=None,
                                                  ward=None)
for region_level in result:
    print(len(result[region_level]))
print(result)

dich ['dh', 'di', 'ic', 'ch', ' d']
1
30
71
{'province': {'Hà Nội'}, 'district': {'Đống Đa', 'Ứng Hòa', 'Mỹ Đức', 'Thạch Thất', 'Bắc Từ Liêm', 'Thanh Oai', 'Hai Bà Trưng', 'Đan Phượng', 'Thường Tín', 'Long Biên', 'Ba Đình', 'Thanh Xuân', 'Tây Hồ', 'Hoài Đức', 'Chương Mỹ', 'Phú Xuyên', 'Thanh Trì', 'Ba Vì', 'Phúc Thọ', 'Sơn Tây', 'Nam Từ Liêm', 'Đông Anh', 'Hà Đông', 'Quốc Oai', 'Sóc Sơn', 'Hoàng Mai', 'Gia Lâm', 'Hoàn Kiếm', 'Cầu Giấy', 'Mê Linh'}, 'ward': {'Phúc Diễn', 'Dục Tú', 'Bách Khoa', 'Phượng Dực', 'Hòa Chính', 'Bạch Đằng', 'Cầu Dền', 'Châu Sơn', 'Tân Dân', 'Phượng Cách', 'Trạch Mỹ Lộc', 'Chuyên Mỹ', 'Dương Nội', 'Dịch Vọng', 'Dương Xá', 'Duyên Hà', 'Kim Chung', 'Bạch Mai', 'Cầu Diễn', 'Đại Mạch', 'Xuân Dương', 'Châu Can', 'Chi Đông', 'Dương Hà', 'Tích Giang', 'Thạch Bàn', 'Bạch Hạ', 'Di Trạch', 'Hòa Thạch', 'Phan Chu Trinh', 'Dân Hòa', 'Cao Dương', 'Minh Châu', 'Trung Châu', 'Thạch Thán', 'Văn Chương', 'Nguyễn Du', 'Hoàng Diệu', 'Tiên Dương', 'Chu Minh', 'Phương Canh', 'Trúc B

In [29]:
# Benchmark: 100 ward-level lookups of a three-word string constrained to one province.
# `result` keeps the last lookup so it can be inspected afterwards.
start = time.perf_counter_ns()
for _ in range(100):
    result = find_region_with_digram_with_constraints(string='Mỹ Lươ Lập', level=1,
                                                      province=region_tree['Hưng Yên'],
                                                      district=None,
                                                      ward=None)
runtime = time.perf_counter_ns() - start

# perf_counter_ns counts nanoseconds; report the total in milliseconds.
print(runtime / 1_000_000, 'ms')

78.52791 ms


In [30]:
# Resolve the candidate names in `result` to ward info dicts within one province.
get_region_info_list_from_possibilities_with_constraints(possibilities=result, level=1,
                                                         province=region_tree['Hưng Yên'],
                                                         district=None,
                                                         ward=None)

filtered dist:  ['Văn Lâm', 'Phù Cừ', 'Ân Thi', 'Văn Giang', 'Hưng Yên', 'Khoái Châu', 'Tiên Lữ', 'Kim Động', 'Mỹ Hào', 'Yên Mỹ']
filtered ward:  []


{}