In [17]:
import pathlib

import numpy as np
import pandas as pd
import tqdm

In [2]:
# Rows per chunk when streaming the persons file with pd.read_csv.
# Kept as an int: `chunksize` is documented as an integer, and a float literal
# (1e5) only works because pandas coerces integral floats.
CHUNKSIZE = 100_000

In [3]:
def format_tax_number(tax_number: str) -> "str | float":
    """Normalise a tax number to a 10- or 12-character string.

    Values of length 9 or 11 are left-padded with a single "0" (presumably a
    leading zero was dropped upstream); values of length 10 or 12 are returned
    unchanged.

    Args:
        tax_number: Raw tax number, expected to be a string.

    Returns:
        The normalised string, or ``np.nan`` when the input is not a string
        (e.g. a missing value) or its length is outside the 9-12 range.
    """
    # Missing cells arrive as None / float NaN; len() on those would raise.
    if not isinstance(tax_number, str):
        return np.nan

    tn_len = len(tax_number)

    if tn_len < 9 or tn_len > 12:
        return np.nan

    if tn_len in (9, 11):
        # Pad by exactly one character: 9 -> 10, 11 -> 12.
        return tax_number.rjust(tn_len + 1, "0")

    return tax_number

In [20]:
# Load the four matched open-data tables (inventions parts 1 and 2, models,
# designs). dtype=str keeps every column as text.
inv_1, inv_2, mod, des = (
    pd.read_csv(csv_path, dtype=str)
    for csv_path in (
        "../data/opendata/matched/inventions_matched_part_1.csv",
        "../data/opendata/matched/inventions_matched_part_2.csv",
        "../data/opendata/matched/models_matched.csv",
        "../data/opendata/matched/designs_matched.csv",
    )
)

In [22]:
# Collect the distinct, normalised tax numbers of all matched patent holders.
# Clean-up runs AFTER normalisation: format_tax_number can turn a value into NaN
# (unexpected length) and can map two raw spellings ("123456789" and
# "0123456789") onto the same number, so dropping NaN/duplicates beforehand is
# not enough.
tax_numbers = (
    pd.concat((inv_1, inv_2, mod, des))[["person_tax_number"]]
    .dropna()
    .assign(person_tax_number=lambda frame: frame["person_tax_number"].apply(format_tax_number))
    .dropna()
    .drop_duplicates()
)
# Sanity checks: only 10- and 12-character numbers remain, each exactly once.
assert set(tax_numbers["person_tax_number"].str.len().unique()) == {10, 12}
assert tax_numbers["person_tax_number"].is_unique
tax_numbers.shape

(148217, 1)

In [24]:
# Open the semicolon-separated persons archive as a chunked reader (not a
# DataFrame): rows are read lazily, CHUNKSIZE at a time, all columns as text.
# The reader is consumed once; re-run this cell before iterating again.
base_df = pd.read_csv("../data/persons.zip", dtype=str, sep=";", chunksize=CHUNKSIZE)

In [7]:
# Destination CSV for the person rows whose tax number matches a patent holder.
out_file = pathlib.Path("../data/persons_with_patents.csv")

In [8]:
# Stream the persons file chunk by chunk, keep head-office rows whose
# normalised tax number appears in `tax_numbers`, and write them to `out_file`.
#
# The first successful write of this run truncates `out_file` and emits the
# header; later writes append. Deciding this from a flag instead of
# `out_file.exists()` keeps the cell idempotent: a file left over from an
# earlier run is replaced rather than appended to (which duplicated rows).
header_written = False
skipped_chunks = []  # indices of chunks dropped because processing raised

for chunk_no, chunk in enumerate(tqdm.tqdm(base_df)):
    try:
        chunk = chunk.dropna(subset=["Наименование полное", "ИНН"], how="any")
        chunk = chunk.loc[chunk["Головная компания (1) или филиал (0)"] == '1', :].copy()
        chunk["ИНН"] = chunk["ИНН"].apply(format_tax_number)

        # value_counts() ignores NaN, so numbers rejected by format_tax_number
        # do not trip this check; they are removed by the isin filter below.
        lengths = set(chunk["ИНН"].str.len().value_counts().index)
        assert len(lengths - {10, 12}) == 0, \
            f"Unexpected tax number lengths {lengths}"

        chunk = chunk.loc[chunk["ИНН"].isin(tax_numbers["person_tax_number"])]

        chunk.to_csv(
            out_file,
            header=not header_written,
            index=False,
            mode="a" if header_written else "w",
        )
        header_written = True
    except Exception as e:
        # Best effort: a failing chunk is skipped, but say which one so the
        # loss is visible and can be investigated.
        skipped_chunks.append(chunk_no)
        print(f"Chunk {chunk_no} skipped: {e!r}")

if skipped_chunks:
    print(f"{len(skipped_chunks)} chunk(s) skipped: {skipped_chunks}")

319it [11:26,  2.15s/it]


In [18]:
# Re-read the filtered output and report (are all tax numbers distinct?, row count).
result = pd.read_csv(out_file, dtype=str)
row_count = len(result)
(result["ИНН"].nunique() == row_count, row_count)

(True, 148216)

In [15]:
# Keep the first row for each tax number and overwrite the output file with it.
first_per_tax_number = result.loc[~result.duplicated(subset="ИНН")]
first_per_tax_number.to_csv(out_file, index=False)