In [1]:
import numpy as np
import re
import random
import itertools

from templates import *

In [2]:
# Sample address string used to try out add_noise_to_address in a later cell.
test_adress = "286000, Вінницька, Вінниця, вул.Фрунзе, 47, кв.12"

In [3]:
def add_noise_to_address(address, noise):
    r"""Splice ``noise`` into ``address`` at a randomly chosen position.

    Candidate positions are every character matching the regex ``[^\w]``
    (spaces, commas, dots, ...) plus the end of the string. A chosen
    character is replaced by ``noise``; choosing the end appends ``noise``.

    Args:
        address: Address string to corrupt.
        noise: String to insert.

    Returns:
        The modified address string. The input is always modified, because
        the end-of-string position is always a candidate.
    """
    candidate_positions = [m.start() for m in re.finditer(r"[^\w]", address)]
    # len(address) stands for "append at the end": slicing past the end gives
    # an empty tail, so the single expression below covers both cases.
    candidate_positions.append(len(address))

    position = random.choice(candidate_positions)
    return address[:position] + noise + address[position + 1:]

# Try the helper on the sample address; the position is chosen at random, so the output varies between runs.
add_noise_to_address(test_adress, '111')

'286000, Вінницька, Вінниця, вул.Фрунзе111 47, кв.12'

In [4]:
def get_domains(address):
    """Return the names written between curly braces in ``address``.

    Args:
        address: A template string such as ``"{index}, {city}"``.

    Returns:
        List of the brace contents in order of appearance (braces stripped);
        an empty list when there are none.
    """
    return [found.group(1) for found in re.finditer(r"\{(.*?)\}", address)]

In [5]:
# Each inner list groups interchangeable spellings of one address token.
# Variants preceded by a space and by a comma are listed as separate groups.
replaces = [
    # city
    [" м.", " місто "],
    [",м.", ",місто "],
    # street
    [" вул.", " вулиця ", " в."],
    [",вул.", ",вулиця ", ",в."],
    # building
    [" буд.", " буд ", " будинок "],
    [",буд.", ",буд ", ",будинок "],
    # flat
    [",кв.", ",квартира "],
    [" кв.", " квартира "],
    # district
    ["р-н", "район"],
    # boulevard / lane / avenue / square
    ["бульвар ", "бульв.", "б-р "],
    ["пров.", "пров ", "провулок "],
    ["просп.", "проспект "],
    ["площа ", "пл."],
    # village / settlement
    [" село ", " с.", " селище "],
    [",село ", ",с.", ",селище "],
    # region
    ["обл.", "область "],
    ["обл,", "область,"],
    ["обл ", "область "],
]

In [6]:
def find_replacement_match(address, replacement_set):
    """Return the first entry of ``replacement_set`` that occurs in ``address``.

    Args:
        address: String to search in.
        replacement_set: Iterable of candidate substrings, checked in order.

    Returns:
        The first candidate that is a substring of ``address``, or ``None``
        when no candidate occurs.
    """
    return next(
        (candidate for candidate in replacement_set if candidate in address),
        None,
    )

In [7]:
# Expand the base templates with every spelling variant listed in `replaces`.
# Each round reads from the templates produced by the previous round and
# collects the results in a separate list, so nothing is mutated mid-iteration.
new_templates = list(address_templates)

for replacement_set in replaces:
    expanded = list(new_templates)
    for template in new_templates:
        replacement_token = find_replacement_match(template, replacement_set)
        if replacement_token:
            # One copy of the template per spelling in the group (this includes
            # the spelling already present; that duplicate is dropped below).
            expanded.extend(
                template.replace(replacement_token, domain_token)
                for domain_token in replacement_set
            )
    # dict.fromkeys drops duplicates while keeping first-seen order, so the
    # resulting list is identical on every run; list(set(...)) would order
    # the strings by their per-process randomized hashes.
    new_templates = list(dict.fromkeys(expanded))


In [9]:
# Number of templates after expanding the spelling variants.
len(new_templates)

4668

## Generate True samples

In [10]:
# Number of ordered template pairs, i.e. the size of the full Cartesian product built in the next cell.
len(new_templates) * len(new_templates)

21790224

In [11]:
# Placeholders that may appear in only one template of a pair while the pair
# is still accepted as a "true" (matching) pair.
available_diff = {'index', 'region', 'district'}
addresses_pairs = list(itertools.product(new_templates, new_templates))

# Parse every template once up front instead of running the regex twice for
# each of the len(new_templates) ** 2 pairs.
domains_by_template = {template: set(get_domains(template)) for template in new_templates}

# Keep the pairs whose placeholder sets differ only in `available_diff`.
true_addresses_pairs = [
    (template_1, template_2)
    for template_1, template_2 in addresses_pairs
    if not ((domains_by_template[template_1] ^ domains_by_template[template_2]) - available_diff)
]


In [12]:
# Total number of ordered template pairs.
len(addresses_pairs)

21790224

In [13]:
# Number of template pairs that passed the placeholder-difference filter.
len(true_addresses_pairs)

3897576

In [14]:
# Collect every distinct placeholder name used across the expanded templates.
doms = {
    domain
    for template in new_templates
    for domain in get_domains(template)
}

In [17]:
# Show the collected placeholder names.
doms

{'avenue',
 'city',
 'district',
 'flat_num',
 'house_num',
 'index',
 'lane',
 'region',
 'street',
 'village'}

In [48]:
def _real_template_values():
    """Draw one value per placeholder from the name lists (avenues, cities, ...)."""
    return {
        'avenue': np.random.choice(avenues),
        'city': np.random.choice(cities),
        'region': np.random.choice(regions),
        'district': np.random.choice(districts),
        'village': np.random.choice(villages),
        'street': np.random.choice(streets),
        'lane': np.random.choice(lanes),
        'house_num': str(np.random.randint(1, 120)),
        # Left as an int (unlike house_num); str.format renders it either way.
        'flat_num': np.random.randint(1, 120),
        'index': ''.join(str(np.random.randint(0, 10)) for _ in range(5))
    }


def _random_word():
    """Build a 4-14 character string drawn from `ukrainian_alphabet` plus the space character."""
    length = random.randint(4, 14)
    return ''.join(random.choices(ukrainian_alphabet + ' ', k=length))


def _random_template_values():
    """Build one value per placeholder, using random character strings for all name fields."""
    return {
        'avenue': _random_word(),
        'city': _random_word(),
        'region': _random_word(),
        'district': _random_word(),
        'village': _random_word(),
        'street': _random_word(),
        'lane': _random_word(),
        'house_num': str(np.random.randint(1, 120)),
        'flat_num': np.random.randint(1, 120),
        'index': ''.join(str(np.random.randint(0, 10)) for _ in range(5))
    }


def generate_template_dict(real=True, house_tail_proba=0):
    """Generate the values used to fill an address template.

    Args:
        real: ``'mix'`` picks, independently for each key, either the value
            drawn from the name lists or the random-string value; any other
            truthy value uses the name lists only; a falsy value uses random
            strings only.
        house_tail_proba: Probability of appending a suffix (a letter tail
            such as ``-а``, ``кN`` or ``/N`` with N in 1..7) to ``house_num``.
            Values ``<= 0`` disable the suffix and draw no random number.

    Returns:
        Dict with keys ``avenue``, ``city``, ``region``, ``district``,
        ``village``, ``street``, ``lane``, ``house_num``, ``flat_num`` and
        ``index``.
    """
    if real == 'mix':
        # Both candidates are generated in full (name lists first), then one
        # of the two is chosen per key.
        real_values = _real_template_values()
        random_values = _random_template_values()
        template_dict = {
            key: random.choice([real_values[key], random_values[key]])
            for key in real_values
        }
    elif real:
        template_dict = _real_template_values()
    else:
        template_dict = _random_template_values()

    if house_tail_proba > 0 and np.random.rand() < house_tail_proba:
        house_tails = (
            ['-а', '-б', '-в', '-г']
            + ['к' + str(n) for n in range(1, 8)]
            + ['/' + str(n) for n in range(1, 8)]
        )
        template_dict['house_num'] += random.choice(house_tails)

    return template_dict
        

def generate_address(address_template, template_dict):
    """Fill the ``{name}`` fields of ``address_template`` from ``template_dict``.

    Args:
        address_template: Format string with named placeholders.
        template_dict: Mapping from placeholder name to value; keys the
            template does not use are ignored.

    Returns:
        The formatted address string.
    """
    return address_template.format(**template_dict)
    
    
def add_noise(address, proba):
    """With probability ``proba``, corrupt ``address`` with a random noise snippet.

    A pattern is drawn from ``noises``, its ``{noise}`` field is filled with a
    randomly chosen character, and the result is inserted via
    ``add_noise_to_address``.

    Args:
        address: Address string.
        proba: Probability of applying noise.

    Returns:
        The noisy address, or ``address`` unchanged when no noise is applied.
    """
    if not np.random.rand() < proba:
        return address

    pattern = random.choice(noises)
    # The space is listed twice, which makes it twice as likely as the others.
    filler = random.choice([' ', ' ', ',' ,'а', '-', 'к', '1', '2', '3'])
    return add_noise_to_address(address, pattern.format(noise=filler))

def generate_address_pair(true_addresses_pairs, all_addresses_pairs, label=True, noise_proba=0.05, house_tail_proba=0.02):
    """Generate one labelled pair of address strings.

    For a truthy ``label`` a template pair is drawn from
    ``true_addresses_pairs`` and both templates are filled from the same
    value dict. For a falsy ``label`` the pair is drawn from
    ``all_addresses_pairs`` and each template gets its own independently
    generated value dict.

    Args:
        true_addresses_pairs: Sequence of ``(template_1, template_2)`` pairs
            used for positive samples.
        all_addresses_pairs: Sequence of ``(template_1, template_2)`` pairs
            used for negative samples.
        label: Which kind of pair to generate; returned unchanged.
        noise_proba: Probability passed to ``add_noise`` for each address.
        house_tail_proba: Probability passed to ``generate_template_dict``
            for adding a house-number suffix.

    Returns:
        Tuple ``(address_1, address_2, label)``.
    """
    template_pool = true_addresses_pairs if label else all_addresses_pairs
    address_template_1, address_template_2 = random.choice(template_pool)

    template_dict_1 = generate_template_dict(real='mix', house_tail_proba=house_tail_proba)
    if label:
        # Positive pair: both addresses share the same underlying values.
        template_dict_2 = template_dict_1
    else:
        template_dict_2 = generate_template_dict(real='mix', house_tail_proba=house_tail_proba)

    address_1 = add_noise(generate_address(address_template_1, template_dict_1), noise_proba)
    address_2 = add_noise(generate_address(address_template_2, template_dict_2), noise_proba)

    return address_1, address_2, label
        

In [47]:
# Example of a positive pair (label defaults to True); output is random.
generate_address_pair(true_addresses_pairs, addresses_pairs)

('запорізька область , острог, вулиця ящґвяше, буд  112, квартира 119',
 '79433, запорізька область , острог, пров.ящґвяше, буд  112, кв.119',
 True)

In [21]:
# Example of a negative pair; output is random.
generate_address_pair(true_addresses_pairs, addresses_pairs, label=False)

('26963, вінницька обл., місто коростень, провулок жфютю`о, буд. 18  .  квартира  30',
 '15292, сумська область , місто краматорськ, пров ь ивіичебкґге, буд. 92, кв. 89',
 False)