# Install Dependencies

In [None]:
!pip install numpy==1.21.4
!pip install pandas==1.3.4
!pip install loguru==0.5.3

# Mount, Clone, Extract

In [None]:
from google.colab import drive
drive.mount("/content/drive", force_remount=True)
# replace with your path
%cd "drive/MyDrive/gar_colab"

In [None]:
# Fetch the project repository into the current working directory.
!git clone https://github.com/nurtdinovadf/garbdfias.git

In [None]:
# Enter the cloned repository; later cells use paths relative to it (data/...).
%cd garbdfias

In [None]:
# List the working directory (the next cell expects data.tar.gz to be here).
!ls

In [None]:
# Unpack the archive; later cells read their input from the data/ directory.
!tar -xvf data.tar.gz

In [None]:
# List data/ to check what the extraction produced.
!ls data

# Import

In [None]:
import gc
import glob
import os
import xml.etree.ElementTree as ET

import numpy as np
import pandas as pd
from loguru import logger
from tqdm import tqdm

# Define Helper Functions

In [None]:
def cleanup(x):
    """
    Drop this function's reference to ``x`` and run a garbage collection pass.

    Only the local parameter binding is released here. A variable in the
    calling scope that refers to the same object keeps it alive until the
    caller deletes or rebinds that variable.
    """
    x = None
    gc.collect()


def parse_xml(x):
    """
    Parse GAR XML file into pandas dataframe object.

    The file is read as a stream: the attributes of every direct child of
    the root element become one row, and each child is released as soon as
    it has been read, so the whole XML tree is never held in memory at once.

    Parameters
    ----------
    x : path or file object accepted by ``xml.etree.ElementTree.iterparse``.

    Returns
    -------
    pandas.DataFrame with one row per direct child of the root and one
    column per attribute name (missing attributes become NaN).
    """
    records = []
    depth = 0
    root = None
    for event, elem in ET.iterparse(x, events=('start', 'end')):
        if event == 'start':
            depth += 1
            if depth == 1:
                root = elem
            continue
        # 'end' event: depth 2 means a direct child of the root just closed.
        if depth == 2:
            records.append(dict(elem.attrib))
            # Drop the finished child from the root so memory stays flat.
            root.clear()
        depth -= 1
    return pd.DataFrame.from_dict(records)


def get_adms(df):
    """
    Build the "object -> parents" lookup used when walking address chains.

    Rows are grouped by ``OBJECTID`` and the ``PARENTOBJID`` values of each
    group are collected into a list, giving ``{OBJECTID: [PARENTOBJID, ...]}``.
    """
    parents_by_object = df.groupby('OBJECTID')['PARENTOBJID']
    return parents_by_object.apply(list).to_dict()


def get_adms_rec_rev(chain, rdadm):
    """
    Get administrative address chains recursively.

    Starting from the last id in ``chain``, parents are looked up in
    ``rdadm`` and appended until an id has no entry in ``rdadm``.

    Parameters
    ----------
    chain : list of object ids collected so far (must be non-empty).
    rdadm : dict mapping an object id to the list of its parent ids.

    Returns
    -------
    The completed chain (a flat list) when every step has a single usable
    parent. When an object has several usable parents, a list with one
    result per parent is returned instead.
    """
    objid = chain[-1]
    # NaN is the only value not equal to itself; it can never be followed.
    if objid != objid or objid not in rdadm:
        return chain
    chains = [
        chain + [obj]
        for obj in rdadm[objid]
        # Skip missing parents and parents already visited (cycle guard).
        if obj == obj and obj not in chain
    ]
    if not chains:
        # Every parent was missing or already in the chain: stop here
        # instead of indexing into an empty list.
        return chain
    if len(chains) > 1:
        return [get_adms_rec_rev(ch, rdadm) for ch in chains]
    return get_adms_rec_rev(chains[0], rdadm)


def get_town(x):
    """
    Chain post-cleanup: assign roles to the levels present in a level chain.

    ``x`` maps the level codes '1'..'8' to 1 (present) or 0 (absent).

    Returns a tuple ``(street, town, leftover, muni)``:
      * street   - the highest-numbered present level among 8..1, or None;
      * town     - the first present level in the order 5, 6, 4, 7, 1 that
                   is not the street level, or None;
      * leftover - present levels that are neither street nor town and are
                   not 1, 2 or 3;
      * muni     - present levels that are 2 or 3.
    """
    town_order = ('5', '6', '4', '7', '1')
    # Present levels, highest number first.
    present = [
        str(level) for level in range(8, 0, -1) if x[str(level)] == 1
    ]
    street = present[0] if present else None
    town = next(
        (level for level in town_order if level != street and x[level] == 1),
        None,
    )
    leftover = []
    muni = []
    for level in present:
        if level in ('2', '3'):
            muni.append(level)
        if level != street and level != town and level not in ('1', '2', '3'):
            leftover.append(level)
    return street, town, leftover, muni

# Core

## Define Region

In [None]:
# Region folders are the two-digit directories under data/; the first match
# is the one processed by the rest of the notebook.
# NOTE(review): glob.glob returns matches in arbitrary order, so which region
# comes first may vary between environments — confirm or pick explicitly.
regions = glob.glob('data/[0-9][0-9]')
region = regions[0]
region

## Read Address Object File

In [None]:
# Locate the single address-object file of the region, ignoring the
# *PARAMS* and *DIVISION* files matched by the same glob pattern.
fname = [
    path
    for path in glob.glob(os.path.join(region, 'AS_ADDR_OBJ_*.XML'))
    if not ('PARAMS' in path or 'DIVISION' in path)
]
if len(fname) != 1:
    msg = f'Please check file count for region {region} there are {len(fname)} files'
    logger.error(msg)
    raise Exception(msg)
fname = fname[0]
adobj = parse_xml(fname)
# Keep only the rows flagged both actual and active.
is_current = (adobj['ISACTUAL'] == '1') & (adobj['ISACTIVE'] == '1')
adobj = adobj[is_current]
adobj.head()

In [None]:
# Address-object type dictionary. It is searched directly under data/, not in
# the region folder, so the error message names the pattern that was used.
fname = glob.glob('data/AS_ADDR_OBJ_TYPES_*.XML')
if len(fname) != 1:
    msg = f'Please check file count for data/AS_ADDR_OBJ_TYPES_*.XML there are {len(fname)} files'
    logger.error(msg)
    raise Exception(msg)
fname = fname[0]
adobjt = parse_xml(fname)
adobjt.head()

In [None]:
# Attach the long type name to every address object by (LEVEL, TYPENAME).
# The lookup is de-duplicated first (as the house-type merges do), so
# repeated dictionary rows cannot multiply address rows.
adobj = adobj.merge(
    adobjt[['SHORTNAME', 'DESC', 'LEVEL']].rename(
        columns={
            'SHORTNAME': 'TYPENAME',
            'DESC': 'TYPELONGNAME'
        }
    ).drop_duplicates(),
    on=['LEVEL', 'TYPENAME']
)
# `del` on the notebook variable itself is what releases the table;
# passing it to a function only drops that function's local reference.
del adobjt
gc.collect()
adobj.head()

In [None]:
# Level dictionary. It is searched directly under data/, not in the region
# folder, so the error message names the pattern that was used.
fname = glob.glob('data/AS_OBJECT_LEVELS_*.XML')
if len(fname) != 1:
    msg = f'Please check file count for data/AS_OBJECT_LEVELS_*.XML there are {len(fname)} files'
    logger.error(msg)
    raise Exception(msg)
fname = fname[0]
lev = parse_xml(fname)

# Attach the level name to every address object; the lookup is
# de-duplicated so repeated dictionary rows cannot multiply address rows.
adobj = adobj.merge(
    lev[['NAME', 'LEVEL']].rename(
        columns={
            'NAME': 'LEVELNAME'
        }
    ).drop_duplicates(),
    on='LEVEL'
)
adobj.head()

## Read Houses File

In [None]:
# Locate the single houses file of the region (the *PARAMS* file matches the
# same pattern and is excluded).
fname = glob.glob(os.path.join(region, 'AS_HOUSES_*.XML'))
fname = [x for x in fname if 'PARAMS' not in x]
if len(fname) != 1:
    msg = f'Please check file count for region {region} there are {len(fname)} files'
    logger.error(msg)
    raise Exception(msg)
fname = fname[0]
hous = parse_xml(fname)
# Use HOUSE* names for the additional type/number columns.
hous = hous.rename(
    columns={
        'ADDTYPE1': 'HOUSETYPE1',
        'ADDTYPE2': 'HOUSETYPE2',
        'ADDNUM1': 'HOUSENUM1',
        'ADDNUM2': 'HOUSENUM2'
    }
)
if 'ISACTUAL' in hous.columns:
    hous = hous[(hous['ISACTUAL'] == '1') & (hous['ISACTIVE'] == '1')]
else:
    # Later cells select and filter on ISACTUAL, so when the file has no such
    # attribute the rows kept by the ISACTIVE filter are marked actual.
    hous = hous[(hous['ISACTIVE'] == '1')].assign(ISACTUAL='1')
hous.head()

In [None]:
# House type dictionary. It is searched directly under data/, not in the
# region folder, so the error message names the pattern that was used.
fname = glob.glob('data/AS_HOUSE_TYPES_*.XML')
if len(fname) != 1:
    msg = f'Please check file count for data/AS_HOUSE_TYPES_*.XML there are {len(fname)} files'
    logger.error(msg)
    raise Exception(msg)
fname = fname[0]
houst = parse_xml(fname)
# Rename to the column names used by the merges with the houses table.
houst = houst.rename(
    columns={
        'SHORTNAME': 'TYPENAME',
        'DESC': 'TYPELONGNAME',
        'ID': 'HOUSETYPE'
    }
)
houst.head()

In [None]:
# De-duplicated (type id -> short name, long name) lookup shared by all merges.
house_types = houst[['HOUSETYPE', 'TYPENAME', 'TYPELONGNAME']].drop_duplicates()

# Main house type: inner merge, as before.
hous = hous.merge(house_types, on='HOUSETYPE')

# Additional types 1 and 2. Each one is handled according to the presence of
# ITS OWN type column (previously the HOUSETYPE1 step tested for HOUSETYPE2).
for suffix in ('1', '2'):
    type_col = f'HOUSETYPE{suffix}'
    if type_col in hous.columns:
        hous = hous.merge(
            house_types.rename(columns={'HOUSETYPE': type_col}),
            on=type_col,
            how='left',
            # Left columns keep their names; the lookup's TYPENAME and
            # TYPELONGNAME get the numeric suffix.
            suffixes=(None, suffix)
        )
    else:
        # No such additional type in this file: create empty columns so
        # later cells can select them unconditionally.
        hous[type_col] = np.nan
        hous[f'TYPELONGNAME{suffix}'] = np.nan
        hous[f'HOUSENUM{suffix}'] = np.nan
        hous[f'TYPENAME{suffix}'] = np.nan

# `del` on the notebook variables is what actually releases the tables.
del houst, house_types
gc.collect()
hous.head()

In [None]:
def _extra_house_name(row, suffix):
    """Return '<long type, lower-cased> <number>' for one additional house
    part, or '' when either value is missing."""
    type_name = row[f'TYPELONGNAME{suffix}']
    number = row[f'HOUSENUM{suffix}']
    # NaN never equals itself, so both checks pass only for present values.
    if number == number and type_name == type_name:
        return type_name.lower() + ' ' + number
    return ''


# Houses are given level '10' so they can be stacked with address objects.
hous['LEVEL'] = '10'
hous['LEVELNAME'] = 'Здание/Сооружение'
hous['NAME'] = hous['TYPELONGNAME'].str.lower() + ' ' + hous['HOUSENUM']
for suffix in ('1', '2'):
    hous[f'NAME{suffix}'] = hous[
        [f'TYPELONGNAME{suffix}', f'HOUSENUM{suffix}']
    ].apply(_extra_house_name, axis=1, args=(suffix,))
hous.head()

In [None]:
# Stack address objects and houses into one table; columns that exist on only
# one side are filled with NaN for the other side's rows.
hadobj = pd.concat(
  [
      adobj[[
          'OBJECTID', 'OBJECTGUID', 'NAME', 'TYPENAME', 'LEVEL',
          'ISACTUAL', 'ISACTIVE', 'TYPELONGNAME', 'LEVELNAME'
      ]],
      hous[[
          'OBJECTID', 'OBJECTGUID', 'HOUSENUM', 'HOUSETYPE',
          'TYPENAME', 'TYPELONGNAME', 'HOUSENUM1', 'HOUSETYPE1',
          'TYPENAME1', 'TYPELONGNAME1', 'HOUSENUM2', 'HOUSETYPE2',
          'TYPENAME2', 'TYPELONGNAME2', 'ISACTUAL', 'ISACTIVE',
          'LEVEL', 'NAME', 'NAME1', 'NAME2', 'LEVELNAME'
      ]]
  ],
  sort=True,
  ignore_index=True
)
# `del` on the notebook variables is what releases the source tables;
# passing them to a function only drops that function's local reference.
del adobj, hous
gc.collect()
hadobj.head()

## Read Administrative Relations File

In [None]:
# Locate the single administrative hierarchy file of the region.
fname = glob.glob(os.path.join(region, 'AS_ADM_HIERARCHY_*.XML'))
if len(fname) != 1:
    msg = f'Please check file count for region {region} there are {len(fname)} files'
    logger.error(msg)
    raise Exception(msg)
fname = fname[0]
adm = parse_xml(fname)
# Active parent links joined to the actual and active objects they start from.
adm0 = adm[adm['ISACTIVE'] == '1'][['OBJECTID', 'PARENTOBJID']].merge(
    hadobj[(hadobj['ISACTUAL'] == '1') & (hadobj['ISACTIVE'] == '1')],
    on='OBJECTID'
)
# `del` on the notebook variable is what releases the raw hierarchy table.
del adm
gc.collect()
adm0.head()

## Building Address Chains

In [None]:
# {OBJECTID: [PARENTOBJID, ...]} lookup used to walk the chains.
rdadm = get_adms(adm0)
# `del` on the notebook variable is what releases the merged table.
del adm0
gc.collect()
# Preview the first five entries without materialising every item first.
[item for _, item in zip(range(5), rdadm.items())]

In [None]:
# Per-object attribute lookup: {OBJECTID: {column: value}}.
hadobjd = hadobj.set_index('OBJECTID').to_dict('index')
# One chain per house (LEVEL '10'), following parents upwards.
chains = [
    get_adms_rec_rev([x], rdadm)
    for x in tqdm(hadobj[hadobj['LEVEL'] == '10']['OBJECTID'])
]
# save and clean
hadobj.to_csv(f'{region}_hadobj.csv', index=False)
# `del` on the notebook variable is what releases the table; the dict
# built above keeps everything the following cells need.
del hadobj
gc.collect()
# Preview the first five entries without materialising every item first.
[item for _, item in zip(range(5), hadobjd.items())]

In [None]:
# One row per chain; tuples make the chains hashable for later merges.
dfch = pd.DataFrame()
dfch['chain'] = [tuple(x) for x in chains]
# `del` on the notebook variable is what releases the list of chains.
del chains
gc.collect()
dfch.head()

In [None]:
# For every chain keep (level, object id) pairs of the objects that are known
# in hadobjd, skipping the '0' root marker. Level and id are taken from the
# SAME filtered element, so a skipped element in the middle of a chain cannot
# shift the remaining ids onto the wrong levels.
level_pairs = [
    [
        (hadobjd[obj]['LEVEL'], obj)
        for obj in chain
        if obj != '0' and obj in hadobjd
    ]
    for chain in tqdm(dfch['chain'])
]
dfch['levchain'] = [
    tuple(level for level, _ in pairs) for pairs in level_pairs
]
# {level: object id} per chain; if a level repeats, the later element wins.
dat = [dict(pairs) for pairs in level_pairs]
# One column per level 10..1 holding the object id at that level (or None).
for i in range(10, 0, -1):
    dfch[f'{i}'] = [d.get(f'{i}') for d in dat]
dfch.head()

In [None]:
# Distinct level chains, each written as a '-'-joined string.
chl = list(set('-'.join(levels) for levels in dfch['levchain']))
df = pd.DataFrame()
df['levchain'] = chl
# One 0/1 column per level 10..1: does the chain contain that level?
for i in range(10, 0, -1):
    level = f'{i}'
    dat = [int(level in y.split('-')) for y in chl]
    df[level] = dat
df.head()

In [None]:
# get_town returns (street, town, leftover, muni) for every distinct chain;
# spread the four positions into their own columns.
lst = df.apply(get_town, axis=1)
for position, column in enumerate(['street', 'town', 'leftover', 'muni']):
    df[column] = [roles[position] for roles in lst]
# Back to tuples so the column matches dfch['levchain'] in the next merge.
df['levchain'] = df['levchain'].apply(lambda x: tuple(x.split('-')))
df.head()

In [None]:
# Attach the per-levchain roles (street, town, leftover, muni) to every chain
# and record which region folder the chains came from.
dfch = dfch.merge(df[['levchain', 'street', 'town', 'leftover', 'muni']], on='levchain')
dfch['region'] = region
dfch.head()

## Save

In [None]:
# Persist the chain table; the file name is prefixed with the region path
# (e.g. data/NN_parsed_chains.csv).
dfch.to_csv(f'{region}_parsed_chains.csv', index=False)

# Read Result

In [None]:
# replace with your path
from google.colab import drive
drive.mount("/content/drive", force_remount=True)
%cd "drive/MyDrive/gar_colab/garbdfias"

In [None]:
import glob
import random
import numpy as np
import pandas as pd
from ast import literal_eval
from tqdm import tqdm

In [None]:
# Select the region again; the CSV files read below are named after it.
# NOTE(review): glob.glob returns matches in arbitrary order — confirm this
# picks the same folder that the parsing section processed.
regions = glob.glob('data/[0-9][0-9]')
region = regions[0]
region

In [None]:
# Read every column as text: ids, levels and house numbers are codes, not
# numbers. Letting pandas infer dtypes turns a numeric-looking column with
# gaps into floats (e.g. "12" -> 12.0). Missing cells are still NaN.
hadobj = pd.read_csv(f'{region}_hadobj.csv', dtype=str)
hadobj.head()

In [None]:
# Column dtypes and non-null counts of the reloaded table.
hadobj.info()

In [None]:
# Summary statistics of the reloaded table.
hadobj.describe()

In [None]:
# Convert the listed columns to text so they can be used as dictionary keys
# and concatenated into addresses. `y == y` is False only for NaN, so
# missing values stay NaN instead of becoming the string 'nan'.
for x in [
          'OBJECTID', 'HOUSENUM', 'HOUSENUM1',
          'HOUSENUM2', 'LEVEL', 'NAME',
          'TYPELONGNAME', 'TYPELONGNAME1', 'TYPELONGNAME2',
          'TYPENAME', 'TYPENAME1', 'TYPENAME2']:
    hadobj[x] = hadobj[x].apply(lambda y: str(y) if y == y else np.nan)
# Show one id to check the conversion.
hadobj['OBJECTID'].values[0]

In [None]:
# Reload the chain table written by the Save step (same file name pattern).
dfch = pd.read_csv(f'{region}_parsed_chains.csv')
dfch.head()

In [None]:
def _as_id_text(value):
    """Render one CSV cell as text without a float '.0' tail.

    Missing values are returned as NaN. They are checked BEFORE the
    conversion to text, because converting first turns NaN into the string
    'nan', which then looks like a real value.
    """
    if pd.isna(value):
        return np.nan
    text = str(value).split('.')[0]
    return text if text != '' else np.nan


# Tuples and lists were written to CSV as their text form; parse them back.
for x in ['chain', 'levchain', 'leftover', 'muni']:
    dfch[x] = [literal_eval(y) for y in tqdm(dfch[x])]
# Level columns '1'..'10' and the role columns hold ids / level codes that
# pandas may have read as floats (e.g. 123.0) because of the gaps.
for x in [f'{i}' for i in range(1, 11)] + ['street', 'town', 'region']:
    dfch[x] = dfch[x].apply(_as_id_text)

In [None]:
# Inspect the chain table after the type restoration above.
dfch.head()

In [None]:
# Per-object attribute lookup: {OBJECTID: {column: value}}.
# NOTE(review): to_dict('index') needs unique index values — presumably
# OBJECTID is unique here; confirm.
hadobjd = hadobj.set_index('OBJECTID').to_dict('index')
# Preview the first five entries.
[(k, v) for k, v in hadobjd.items()][:5]

## Plaintext Addresses

In [None]:
# Build one plain-text address per chain, keyed by the first id of the chain.
addresses = {}
for chain in tqdm(dfch.chain):
    parts = addresses.setdefault(chain[0], [])
    for obj_id in chain:
        # Skip the '0' root marker and ids with no attributes in hadobjd
        # (the level chain was built with the same filter).
        if obj_id == '0' or obj_id not in hadobjd:
            continue
        obj = hadobjd[obj_id]
        parts.append(obj['TYPELONGNAME'])
        if obj['LEVEL'] != '10':
            parts.append(obj['NAME'])
            continue
        parts.append(obj['HOUSENUM'])
        # Additional house parts are independent of each other: the second
        # one is kept even when the first is missing, matching how the
        # structured addresses treat them.
        for suffix in ('1', '2'):
            number = obj[f'HOUSENUM{suffix}']
            if number == number:  # False only for NaN
                parts.append(obj[f'TYPELONGNAME{suffix}'])
                parts.append(number)
# Missing (NaN) parts are dropped so that join only ever sees text.
addresses = {
    k: ' '.join(part for part in v if isinstance(part, str))
    for k, v in addresses.items()
}
# random.sample needs a sequence (not a dict view) and at most as many
# items as exist.
sample = random.sample(list(addresses), min(10, len(addresses)))
{s: addresses[s].lower() for s in sample}

## Structured Addresses

In [None]:
def get_struct_addr(x):
    """
    Build the structured address of one chain row.

    Parameters
    ----------
    x : row of the chain table. ``x['town']`` and ``x['street']`` hold the
        level code whose column contains the id of the town / street object;
        ``x['leftover']`` and ``x['muni']`` hold lists of such level codes;
        ``x['10']`` holds the house id. Ids are looked up in the notebook
        level ``hadobjd`` dictionary.

    Returns
    -------
    dict with the keys 'town', 'street', 'house', 'house1', 'house2',
    'leftover' and 'muni'. A part that is missing for this row (no town or
    street level, missing id, id unknown to ``hadobjd``, missing additional
    house number) is returned as an empty string.
    """
    def _is_missing(value):
        # None, NaN (the only value unequal to itself) or the text 'nan'
        # produced when a missing value is converted to a string.
        return value is None or value != value or value == 'nan'

    def _named(level):
        # '<long type name> <name>' of the object stored at `level`, or ''.
        if _is_missing(level):
            return ''
        obj_id = x[level]
        if _is_missing(obj_id) or obj_id not in hadobjd:
            return ''
        obj = hadobjd[obj_id]
        return obj['TYPELONGNAME'] + ' ' + obj['NAME']

    def _house_part(house, suffix):
        # '<long type name> <number>' of an additional house part, or ''.
        number = house[f'HOUSENUM{suffix}']
        if number == number:
            return house[f'TYPELONGNAME{suffix}'] + ' ' + number
        return ''

    house = hadobjd[x['10']]
    return {
        'town': _named(x['town']),
        'street': _named(x['street']),
        'house': house['TYPELONGNAME'] + ' ' + house['HOUSENUM'],
        'house1': _house_part(house, '1'),
        'house2': _house_part(house, '2'),
        'leftover': ' '.join(
            name for name in map(_named, x['leftover']) if name
        ),
        'muni': ' '.join(
            name for name in map(_named, x['muni']) if name
        ),
    }

In [None]:
# One structured-address dict per chain row.
struct_addresses = dfch.apply(get_struct_addr, axis=1)
# Show the entry with index label 0 as an example.
struct_addresses[0]

In [None]:
# One column per structured-address field.
structdf = pd.DataFrame.from_records(struct_addresses)
# Cap the sample size so small regions do not raise.
structdf.sample(min(10, len(structdf)))

In [None]:
# Group identical (town, street, house, house1, house2) combinations and
# collect the remaining columns of each group into lists.
grouped = structdf.groupby(by=['town', 'street', 'house', 'house1', 'house2']).aggregate(lambda x: x.to_list())
grouped.head()

In [None]:
# Address combinations that occur more than once (more than one collected
# 'muni' value); the sample size is capped so fewer than 10 matches do not raise.
multi_muni = grouped[grouped['muni'].apply(lambda x: len(x) > 1)]
multi_muni.sample(min(10, len(multi_muni)))