In [1]:
from collections import Counter
from itertools import chain, filterfalse
import json
import pathlib
import pickle
from typing import Tuple

from geonorm.geonormaliser_utils import decompose
import nltk
import numpy as np
import pandas as pd
import psycopg2
import sqlalchemy as sa
import tqdm

# Show every column when a DataFrame is displayed instead of eliding the middle ones.
pd.options.display.max_columns = None

In [2]:
# Number of rows read per chunk from the source CSV. Kept as an int because it
# is a row count handed to pandas' `chunksize` parameter.
CHUNKSIZE = 1_000

In [3]:
def preprocess_full_name(name: str) -> str:
    """Normalise a full organisation name for the search index.

    The name is upper-cased, tokenised with ``nltk.word_tokenize`` and joined
    back with single spaces after removing every token that is contained in
    the module-level ``stopwords`` collection.

    Missing input (anything ``pd.isna`` reports as missing) is returned as
    ``np.nan``.
    """
    if not pd.isna(name):
        # NOTE(review): tokens are compared after upper-casing, so `stopwords`
        # presumably holds upper-case entries -- confirm against stopwords.json.
        kept_tokens = [
            token
            for token in nltk.word_tokenize(name.upper())
            if token not in stopwords
        ]
        return " ".join(kept_tokens)

    return np.nan


def parse_address(addr: str, natasha: bool = False) -> dict:
    """Split an address string into typed address elements.

    Parameters
    ----------
    addr
        Raw address string. A missing value (anything ``pd.isna`` reports as
        missing) produces a result in which every field is ``np.nan``.
    natasha
        When true, delegate to ``decompose`` (imported from
        ``geonorm.geonormaliser_utils``) instead of the rule-based parser.

    Returns
    -------
    dict
        Always has the keys ``region``, ``region_type``, ``municipality``,
        ``municipality_type``, ``settlement``, ``settlement_type``, ``street``,
        ``street_type``, ``house``, ``location``, ``location_type`` and
        ``not_decompose``. Fields that were not determined stay ``np.nan``.
        The rule-based parser only fills region / municipality / settlement /
        street and their ``*_type`` companions.
    """
    parsed = dict.fromkeys(
        [
            "region", "region_type", "municipality", "municipality_type",
            "settlement", "settlement_type", "street", "street_type",
            "house", "location", "location_type", "not_decompose"
        ],
        np.nan
    )

    if pd.isna(addr):
        return parsed

    if natasha:
        # Copy only the fields this function exposes and keep the NaN
        # placeholder when the decomposer has no value for a field. (Testing
        # the placeholder itself with notna() can never succeed, because every
        # slot starts out as NaN.)
        for key, value in decompose(addr).items():
            if key in parsed and pd.notna(value):
                parsed[key] = value

        return parsed

    # Usually all addr element types are lowercase, but there are some
    # exceptions; lower-case them so the islower() split below treats them as
    # element types rather than element names.
    for search in ["АО", "Аобл", "Респ", "Чувашия"]:
        addr = addr.replace(search, search.lower())

    for part in addr.split(","):
        # split() without arguments trims the part and collapses repeated
        # whitespace, so no empty tokens end up inside the element name.
        tokens = part.split()
        elem = " ".join(token for token in tokens if not token.islower())
        elem_type = " ".join(token for token in tokens if token.islower())

        # abbr_map: lower-cased type name -> (fias_level, full type name).
        # Level 0 stands for "unknown type".
        level, type_full = abbr_map.get(elem_type, (0, ""))
        if level == 0:
            # Retry without dots, e.g. "ул." -> "ул".
            level, type_full = abbr_map.get(elem_type.replace(".", ""), (0, ""))

        # Only the first element found for each slot is kept.
        if level in (1, ) and pd.isna(parsed["region"]):
            parsed["region"] = elem
            parsed["region_type"] = type_full
        elif level in (3, 35, 5) and pd.isna(parsed["municipality"]):
            parsed["municipality"] = elem
            parsed["municipality_type"] = type_full
        elif level in (4, 6) and pd.isna(parsed["settlement"]):
            parsed["settlement"] = elem
            parsed["settlement_type"] = type_full
        elif level in (7, ) and pd.isna(parsed["street"]):
            parsed["street"] = elem
            parsed["street_type"] = type_full

    return parsed


def get_address_els_from_row(row: pd.Series) -> Tuple:
    """Parse ``row["address"]`` and return its main elements as a tuple.

    The tuple is ``(region, municipality, settlement, street)``, taken from
    the dict produced by ``parse_address`` (called with its default settings).
    """
    wanted_fields = ("region", "municipality", "settlement", "street")
    address_parts = parse_address(row["address"])

    return tuple(address_parts[field] for field in wanted_fields)

In [4]:
# Lazy reader over the source dump: yields CHUNKSIZE-row DataFrames with every
# column read as text.
base_df = pd.read_csv(
    "../data/База по всем организациям и ИП.zip",
    sep=";", 
    chunksize=CHUNKSIZE, 
    dtype=str
)

# Tokens that preprocess_full_name drops from organisation names.
# The encoding is stated explicitly so decoding does not depend on the
# platform's locale default.
with open("stopwords.json", encoding="utf-8") as f:
    stopwords = json.load(f)

# Address-element type dictionary; the columns used here are
# fias_level, name and name_full.
abbr = pd.read_csv("abbr.csv")
# Add every full type name as an alias of itself, so both the `name` value and
# the spelled-out `name_full` value resolve in abbr_map.
abbr_full = abbr[["fias_level", "name_full", "name_full"]]
abbr_full.columns = ["fias_level", "name", "name_full"]
abbr = pd.concat((
    abbr,
    abbr_full
))
# lower-cased type name -> (fias_level, lower-cased full type name)
abbr_map = {
    name.lower(): (fias_level, name_full.lower())
    for name, fias_level, name_full in zip(
        abbr["name"], abbr["fias_level"], abbr["name_full"]
    )
}

In [5]:
# Build the search index: normalise every chunk of the source dump, emit one
# row per (organisation, address) pair and append it to `out_file`.
# NOTE(review): absolute, user-specific output path -- consider deriving it
# from a configurable data directory.
out_file = pathlib.Path("/home/pavel/search_index.csv")
failed_chunks = 0

for chunk_no, chunk in enumerate(tqdm.tqdm(base_df)):
    try:
        chunk["name"] = chunk["Наименование полное"].apply(preprocess_full_name)
        chunk["tax_number"] = chunk["ИНН"]
        chunk["individual"] = chunk["ОКОПФ (расшифровка)"] == "Индивидуальный предприниматель"
        # NOTE(review): dates are parsed with pandas' default (month-first)
        # inference -- confirm this matches the format used in the dump.
        chunk["creation_date"] = pd.to_datetime(chunk["Дата создания"], errors="coerce").dt.date
        # The dump is read with dtype=str, so the flag arrives as text and a
        # plain astype(bool) would turn the non-empty string "0" into True.
        # Parse it numerically first; missing or unparseable values count as
        # inactive.
        chunk["active"] = (
            pd.to_numeric(chunk["Компания действующая (1) или нет (0)"], errors="coerce")
            .fillna(0)
            .astype(bool)
        )
        chunk["activity_code"] = chunk["ОКВЭД2"]
        # One output row per address kind (legal / actual) of each organisation.
        chunk = chunk.melt(
            id_vars=["name", "tax_number", "individual", "creation_date", "active", "activity_code"],
            value_vars=["Юр адрес", "Факт адрес"],
            var_name="addr_type",
            value_name="address",
        )

        chunk[["region", "municipality", "settlement", "street"]] = chunk.apply(
            get_address_els_from_row,
            axis=1,
            result_type="expand",
        )

        chunk = chunk.drop(columns=["address", "addr_type"])

        # Append to an existing file; the header is written only when the file
        # is being created.
        chunk.to_csv(out_file, header=not out_file.exists(), index=False, mode="a")
    except Exception as exc:
        # Best effort: a chunk that cannot be processed is skipped, but the
        # failure is reported instead of being silently discarded.
        failed_chunks += 1
        tqdm.tqdm.write(f"chunk {chunk_no} skipped: {exc!r}")

if failed_chunks:
    tqdm.tqdm.write(f"{failed_chunks} chunk(s) were skipped because of errors")


31836it [2:57:57,  2.98it/s]


In [6]:
# Display the `chunk` variable left over from the loop above, i.e. the last
# chunk it handled.
chunk

Unnamed: 0,name,tax_number,individual,creation_date,active,activity_code,region,municipality,settlement,street
0,,772572699880,False,NaT,False,,,,,
1,,774317564166,False,NaT,False,,,,,
2,,774334087354,False,NaT,False,,,,,
3,,745303641007,False,NaT,False,,,,,
4,,710408005071,False,NaT,False,,,,,
...,...,...,...,...,...,...,...,...,...,...
447,,501212908103,False,NaT,False,,,,,
448,,502746759523,False,NaT,False,,,,,
449,,772609259293,False,NaT,False,,,,,
450,,771609793288,False,NaT,False,,,,,


In [8]:
# Load the first 100 000 rows as a sample for the ad-hoc checks below.
# The file is read directly instead of via `base_df.get_chunk`: the loop above
# iterates `base_df` to the end, so on a top-to-bottom run that reader has
# nothing left to return.
chunk = pd.read_csv(
    "../data/База по всем организациям и ИП.zip",
    sep=";",
    dtype=str,
    nrows=100000,
)

In [94]:
# Spot-check the rule-based parser on the "Юр адрес" value of row 502 of the
# sample.
# NOTE(review): the saved output below (debug prints, None placeholders)
# appears to come from an earlier version of parse_address -- the function as
# defined in this notebook prints nothing and uses np.nan. Re-run to refresh.
parse_address(chunk.loc[502, "Юр адрес"])

республика чувашия (0, '')
г. (4, 'город')
ул. (7, 'улица')
д. пом. (0, '')


{'region': None,
 'region_type': None,
 'municipality': None,
 'municipality_type': None,
 'settlement': 'Чебоксары',
 'settlement_type': 'город',
 'street': 'Якимовская',
 'street_type': 'улица',
 'house': None,
 'location': None,
 'location_type': None,
 'not_decompose': None}

In [38]:
# Call `decompose` (the geonorm helper imported at the top) directly on the
# "Юр адрес" value of row 49 of the sample to inspect its raw output.
decompose(chunk.loc[49, "Юр адрес"])

{'region': 'Крым',
 'region_type': 'республика',
 'municipality': 'Красногвардейский',
 'municipality_type': 'район',
 'settlement': 'Октябрьское',
 'settlement_type': 'посёлок',
 'street': 'Цурцумия',
 'street_type': 'улица',
 'house': 'дом 10',
 'location': '',
 'location_type': '',
 'not_decompose': 'м. . Октябрьское    кв. 25'}