# Imports

In [1]:
import json
from collections import Counter
from itertools import chain
from pathlib import Path

import numpy as np
import pandas as pd
import regex
from tqdm import tqdm

In [2]:
import time

In [3]:
from src.companies.processor import clean_company_type, normalize_company_name
from src.companies.utils import replace_company_types
from src.nif_validation.validation import (
    get_nif_type,
    validate_nif,
    is_valid_nif,
    # is_valid_cif,
    # is_valid_dni,
    # is_valid_nie,
    get_info_from_cif,
)
from src.utils.utils import fill_to_length, merge_orig_dataframes
from src.utils.utils_parallelization import (
    parallelize_function,
    parallelize_function_with_progress_bar,
)

# Load info

In [4]:
with open(r"C:\Users\josea\Downloads\genCat_Junio_2023.json", "r") as f:
    gencat = pd.json_normalize(json.load(f))

In [5]:
# df = pd.read_csv(r"C:\Users\josea\Downloads\Contractaci__p_blica_a_Catalunya__publicacions_a_la_Plataforma_de_serveis_de_contractaci__p_blica.csv")
df_emp = pd.read_csv(r"C:\Users\josea\Downloads\empresas.csv")

  df_emp = pd.read_csv(r"C:\Users\josea\Downloads\empresas.csv")


In [6]:
# # Load data
# dir_df = Path("C:/Users/josea/Documents/Trabajo/data/metadata/insiders.parquet")
# df_in = pd.read_parquet(dir_df)
# dir_df = Path("C:/Users/josea/Documents/Trabajo/data/metadata/outsiders.parquet")
# df_ou = pd.read_parquet(dir_df)
# dir_df = Path("C:/Users/josea/Documents/Trabajo/data/metadata/minors.parquet")
# df_mi = pd.read_parquet(dir_df)

In [7]:
# df_in.columns = [".".join([el for el in c if el]) for c in df_in.columns]
# df_ou.columns = [".".join([el for el in c if el]) for c in df_ou.columns]
# df_mi.columns = [".".join([el for el in c if el]) for c in df_mi.columns]

In [10]:
# # ERROR A REVISAR
# df_mi.loc[
#     df_mi["id"]
#     .str.lower()
#     .isin(
#         [
#             "https://contrataciondelestado.es/sindicacion/datosabiertosmenores/8622601",
#             "https://contrataciondelestado.es/sindicacion/datosabiertosmenores/8410165",
#         ]
#     ),
#     [
#         "ContractFolderStatus.TenderResult.WinningParty.PartyIdentification.ID",
#         "ContractFolderStatus.TenderResult.WinningParty.PartyName.Name",
#     ],
# ]

# Aux functions
Functions necessary for processing the cells

In [13]:
def nif_from_name(name):
    name_spl = np.array(name.split())
    valid = np.array([bool(validate_nif(s)) for s in name_spl])
    new_name = " ".join(name_spl[~valid])
    new_nif = Counter(name_spl[valid]).most_common()[0][0] if valid.any() else np.nan
    return new_name, new_nif

In [14]:
import contextlib


@contextlib.contextmanager
def log_time(task_name: str):
    """Context manager to log the execution time of a block of code."""
    t0 = time.time()
    yield
    t1 = time.time()
    print(f"{task_name} - {t1-t0}")


def execute_function(func, data, prefer=None, workers=-1, *args, **kwargs):
    """Wrapper function to decide whether to use parallel processing or not."""
    if not prefer:
        return data.apply(func, *args, **kwargs)
    else:
        return parallelize_function(
            func, data, prefer=prefer, workers=workers, *args, **kwargs
        )


def clean_df(df: pd.DataFrame, prefer=None, workers=-1):
    # Remove unwanted whitespace
    with log_time("Removing unwanted whitespace"):
        df = df.applymap(
            lambda x: regex.sub(r"((?<=\w+\W)\s+)|(\s+(?=\W\w+))", "", x)
            if not pd.isna(x)
            else None
        )

    # Validate NIF
    with log_time("Validating NIF"):
        df["ID"] = execute_function(validate_nif, df["ID"], prefer, workers)

    # Clean company type
    with log_time("Cleaning company type"):
        name = [
            regex.sub(i, "", n) if not (pd.isna(n) or pd.isna(i)) else n
            for i, n in df[["ID", "Name"]].values
        ]
        df["Name"] = execute_function(
            clean_company_type, name, prefer, workers, remove_type=False
        )

    # Remove company type
    with log_time("Removing company type"):
        df["Name_proc"] = execute_function(
            clean_company_type, df["Name"], prefer, workers, remove_type=True
        )

    # Normalize company name
    with log_time("Normalizing company name"):
        df["Name_norm"] = execute_function(
            normalize_company_name, df["Name_proc"], prefer, workers
        )

    return df

# Merge data from different sources

In [16]:
# df_companies = merge_orig_dataframes(
#     dir_metadata=Path("C:/Users/josea/Documents/Trabajo/data/metadata/")
# )
# df_companies.to_parquet("companies.parquet")

df_companies = pd.read_parquet("companies.parquet")

# Obtain individual companies

In [85]:
# Use only those where all dimensions match
# (e.g. same number of companies and companies ids)
# and drop NAs
df_companies = df_companies[
    df_companies[["ID", "Name"]]
    .applymap(lambda x: not pd.isna(x[0]))
    .apply(all, axis=1)
]
df_companies = df_companies[
    df_companies.applymap(lambda x: len(x) if x[0] else None).apply(
        lambda x: len(set([el for el in x if not pd.isnull(el)])) == 1,
        axis=1,
    )
]
companies_columns = list(df_companies.columns)
# Get number of companies by tender
df_companies["_len"] = df_companies["ID"].apply(len)

# Fill lists of None to have the same number of elements and explode later
companies = pd.DataFrame(
    df_companies.apply(
        lambda x: [fill_to_length(list(el), x[-1]) for el in x[:-1]], axis=1
    ).tolist(),
    columns=companies_columns,
)

# Split companies in rows
companies = companies.explode(companies_columns)
companies = companies.reset_index(drop=True)
display(companies.head())

Unnamed: 0,SMEAwardedIndicator,ID,CompanyTypeCode,Name,CityName,IdentificationCode,PostalZone,CountrySubentityCode,id_tender
0,,b30437347,,climayor s.l. b30437347,,,,,https://contrataciondelestado.es/sindicacion/d...
1,,b60564309,,"gometrics, s.l.",,,,,https://contrataciondelestado.es/sindicacion/d...
2,,g57694549,,associaciò alcem el c.i.n.e.,,,,,https://contrataciondelestado.es/sindicacion/d...
3,,b73326019,,diseño y decoraciones j. peñalver s.l. b73326019,,,,,https://contrataciondelestado.es/sindicacion/d...
4,,b28954170,,"thermo fisher scientific, s.l.",,,,,https://contrataciondelestado.es/sindicacion/d...


In [15]:
# # FOREIGN IDs
# c_test = companies.drop_duplicates(subset="ID")
# c_test_valid = c_test["ID"].apply(is_valid_nif)
# val = "abcdefghjnpqrsuvw"
# c_test_sub = c_test.loc[~c_test_valid, "ID"]
# c_test_sub[
#     c_test_sub.apply(
#         lambda x: x[0] not in val and x[0].isalpha() and not x[:2] in ["xx",]
#     )
# ]
# display(c_test_sub)

In [86]:
with log_time("Clean df"):
    companies_clean = clean_df(companies, prefer="processes", workers=-1)

Removing unwanted whitespace - 50.80553698539734
Validating NIF - 10.666739225387573
Cleaning company type - 160.34527444839478
Removing company type - 43.476733684539795
Normalizing company name - 9.539687871932983
Clean df - 274.85123658180237


In [16]:
# t0 = time.time()
# companies_clean = clean_df(companies, prefer="processes", workers=-1)
# t1 = time.time()
# print(t1 - t0)

 - 41.184463024139404
 - 10.22936463356018
 - 160.36841225624084
 - 41.64890956878662
 - 7.885540962219238
261.3333966732025


In [87]:
# Aggregate company info in lists
companies_clean["SMEAwardedIndicator"] = companies_clean["SMEAwardedIndicator"].apply(
    lambda x: None if not x else True if x == "true" else False
)
companies_clean = (
    companies_clean
    # companies[["ID", "Name", "Name_proc", "Name_norm"]]
    .groupby(["ID", "Name_norm"])
    .agg(list)
    .reset_index()
)
companies_clean["count"] = companies_clean["Name_proc"].apply(len)
companies_clean = companies_clean.reset_index()

#### Unique names and IDs

In [88]:
# Unique names and IDs
# These companies have always appeared with the same (id-name) association
cols_vals = [
    c for c in companies_clean.columns if c not in ["ID", "Name_norm", "count"]
]
unique_ID = ~companies_clean["ID"].duplicated(keep=False)
unique_NAME = ~companies_clean["Name_norm"].duplicated(keep=False)

# Unique by ID and name
unique = companies_clean[unique_ID & unique_NAME]

# Non unique IDs
non_unique_ids = list(set(companies_clean["index"]) - set(unique["index"]))
non_unique = companies_clean[companies_clean["index"].isin(non_unique_ids)]

unique["index"] = unique["index"].apply(lambda x: [x])
non_unique["index"] = non_unique["index"].apply(lambda x: [x])
print(unique.shape, non_unique.shape)

(205967, 13) (172180, 13)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique["index"] = unique["index"].apply(lambda x: [x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_unique["index"] = non_unique["index"].apply(lambda x: [x])


#### Repeated IDs and Names

In [89]:
# Choose definitive values
def suggest_value(elements):
    """
    Select elements based on appearance.
    If same number of appearances, choose the longest.
    If shorter elements are not included in the 'main' one, return all.
    """
    cnt = Counter(elements)
    cnt.pop(None, None)
    cnt = cnt.most_common()
    if cnt:
        max_cnt = cnt[0][1]
        els = sorted([k for k, v in cnt if v == max_cnt], key=lambda x: (-len(x), x))
        # return els[0]
        base = els.pop(0)
        return [base]
        # if all(
        #     [all(t in base for t in regex.sub(r"\W", " ", el).split()) for el in els]
        # ):
        #     return [base]
        # return [base] + els
    else:
        # return None
        return [None]


# Repeated IDs
def unify_repeated_col(df: pd.DataFrame, rep_col: str, un_col: str):
    """
    Takes a dataframe with duplicated values in one column that should be unique (e.g. repeated IDs)
    and another column that should also be unique given the previous one (e.g. title)
    and unifies it so that it chooses the best option.

    Parameters
    ----------
    df: pd.DataFrame
    rep_col: str
        Name of column with repeated values that will be unified
    un_col: str
        Name of column with non unique values
    """
    # Non-unique columns
    cols_vals = [c for c in df.columns if c not in [rep_col, "count", "index"]]
    repeated_rows = df[rep_col].duplicated(keep=False)
    repeated = df[repeated_rows]

    # Count times the values appear
    repeated.loc[repeated.index, [un_col]] = (
        repeated.loc[repeated.index, un_col].apply(lambda x: [x])
        * repeated.loc[repeated.index, "count"]
    )
    # Group by repeated
    repeated = repeated.reset_index()
    repeated = repeated.groupby(rep_col).agg(
        {
            # "index": list,
            "index": sum,
            **{c: lambda x: list(chain.from_iterable(x)) for c in cols_vals},
            "count": sum,
        }
    )
    # Get the most common values for each column
    repeated.loc[repeated.index, un_col] = (
        repeated.loc[repeated.index, un_col].apply(suggest_value).values
    )
    repeated = repeated.reset_index()

    # Concatenate unique
    use_index = repeated.loc[repeated[un_col].apply(len) == 1, un_col].index
    repeated.loc[use_index, un_col] = repeated.loc[use_index, un_col].apply(
        lambda x: x[0]
    )
    unified = repeated.loc[use_index]

    return unified

In [90]:
# Obtain unique ID-name
unified_ID = unify_repeated_col(non_unique, "ID", "Name_norm")
# Update non_unique
non_unique_ids = list(
    set(chain.from_iterable(non_unique["index"]))
    - set(chain.from_iterable(unified_ID["index"]))
)
# non_unique = companies_clean.loc[non_unique_ids]
non_unique = companies_clean[companies_clean["index"].isin(non_unique_ids)]
non_unique["index"] = non_unique["index"].apply(lambda x: [x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_unique["index"] = non_unique["index"].apply(lambda x: [x])


In [91]:
# Obtain unique name-ID
unified_NAME = unify_repeated_col(non_unique, "Name_norm", "ID")
# Update non_unique
non_unique_ids = list(
    set(chain.from_iterable(non_unique["index"]))
    - set(chain.from_iterable(unified_NAME["index"]))
)
# non_unique = companies_clean.loc[non_unique_ids]
non_unique = companies_clean[companies_clean["index"].isin(non_unique_ids)]
non_unique["index"] = non_unique["index"].apply(lambda x: [x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_unique["index"] = non_unique["index"].apply(lambda x: [x])


#### Merge Companies info

In [92]:
# Global
# Merge unique+unifiedID+unifiedName+nonUnique
merged_global = pd.concat([unique, unified_ID, unified_NAME, non_unique])
cols_vals = [
    c
    for c in merged_global.columns
    if c not in ["ID", "Name_norm", "count", "index", "id_tender"]
]
merged_global = merged_global.groupby(["ID", "Name_norm"]).agg(
    {
        # "index": lambda x: list(chain.from_iterable(x)),
        "index": sum,
        "id_tender": sum,
        **{c: lambda x: list(chain.from_iterable(x)) for c in cols_vals},
        "count": sum,
    }
)
merged_global = merged_global.reset_index()
print(len(merged_global))
display(merged_global.head())

268641


Unnamed: 0,ID,Name_norm,index,id_tender,SMEAwardedIndicator,CompanyTypeCode,Name,CityName,IdentificationCode,PostalZone,CountrySubentityCode,Name_proc,count
0,00021492x,carmenbalgueriasjimenez,[0],[https://contrataciondelestado.es/sindicacion/...,"[None, None]","[None, None]","[carmen balguerias jiménez, carmen balguerias ...","[None, None]","[None, None]","[None, None]","[None, None]","[carmen balguerias jiménez, carmen balguerias ...",2
1,00035211k,palomasainzdelamazadelaserna,[1],[https://contrataciondelestado.es/sindicacion/...,[None],[None],[paloma sáinz de la maza de la serna],[None],[None],[None],[None],[paloma sáinz de la maza de la serna],1
2,00067665e,albertodelgadocebrian,[2],[https://contrataciondelestado.es/sindicacion/...,[None],[None],[alberto delgado cebrián],[None],[None],[None],[None],[alberto delgado cebrián],1
3,00072839k,fernandezabadvicente,[3],[https://contrataciondelestado.es/sindicacion/...,"[None, None]","[None, None]","[fernandez abad vicente, fernandez abad,vicente]","[None, None]","[None, None]","[None, None]","[None, None]","[fernandez abad vicente, fernandez abad,vicente]",2
4,00076938a,luisteranlopez,[4],[https://contrataciondelestado.es/sindicacion/...,"[None, None, None, None, None, None, None, Non...","[None, None, None, None, None, None, None, Non...","[luis terán lópez, luis terán lópez, luis terá...","[None, None, None, None, None, None, None, Non...","[None, None, None, None, None, None, None, Non...","[None, None, None, None, None, None, None, Non...","[None, None, None, None, None, None, None, Non...","[luis terán lópez, luis terán lópez, luis terá...",29


#### Unify found names

In [93]:
# Get all names found in the tenders
merged_global["UsedNames"] = (merged_global["Name"] + merged_global["Name_proc"]).apply(
    lambda x: sorted(list(set(x)))
)

#### Propose a final name

In [94]:
# Initial computations
data = merged_global["Name_proc"]
local_frequencies = data.apply(Counter)
proposed_names = local_frequencies.apply(lambda x: max(x, key=x.get))
# First proposal
merged_global["Name_proposed"] = proposed_names

# Total names
global_freq_dict = data.explode().value_counts().to_dict()
names_list = list(global_freq_dict.keys())

In [95]:
# test_ids = (
#     ["b83230870", "43715062c", "a28062339", "b28062339"]
#     + ["a62581798", "a62581905", "a63736078", "a63907273", "a64645120", "a82037300"]
#     + []
# )
# display(
#     merged_global.loc[
#         merged_global["ID"].str.lower().isin(test_ids),
#         ["ID", "Name", "Name_norm", "Name_proc", "Name_proposed"],
#     ]
# )

In [96]:
def adjust_names_for_batch_v2(names_batch, local_frequencies, proposed_names):
    """
    Adjusts and determines the most probable names for a given batch of names.

    For each name in the batch, the function checks if the name is associated
    with multiple IDs. If so, it retains the name only for the ID where the name
    has the highest frequency (local priority). For the other IDs, it assigns
    the next best name based on local frequency.

    Parameters:
    -----------
    names_batch: List[str]
        A batch of names to be processed.
    local_frequencies: pd.Series
        Series containing the frequency count of each name for each ID.
    proposed_names: pd.Series
        Series mapping each ID to its currently assigned name.

    Returns:
    --------
    modified_real_names: Dict[int, str]:
        A dictionary with IDs as keys and their adjusted real names as values.
    """
    modified_real_names = {}
    for name in names_batch:
        ids_with_name = proposed_names[proposed_names == name].index.tolist()
        if len(ids_with_name) > 1:
            ids_with_name.sort(key=lambda idx: -local_frequencies.loc[idx][name])
            for idx in ids_with_name[1:]:
                del local_frequencies.loc[idx][name]
                if local_frequencies.loc[idx]:
                    modified_real_names[idx] = max(
                        local_frequencies.loc[idx], key=local_frequencies.loc[idx].get
                    )

    return modified_real_names


new_proposed_name = parallelize_function_with_progress_bar(
    func=adjust_names_for_batch_v2,
    data=names_list,
    batch_size=1000,
    desc="progress_p2",
    workers=-1,
    prefer="processes",
    output="series",
    local_frequencies=local_frequencies,
    proposed_names=proposed_names,
)


# Update the proposed names with the new ones
for item in new_proposed_name:
    merged_global.loc[item.keys(), "Name_proposed"] = list(item.values())

progress_p2:   0%|          | 0/424 [00:00<?, ?it/s]

progress_p2: 100%|██████████| 424/424 [11:41<00:00,  1.66s/it]


In [98]:
# print(len([k for el in new_proposed_name for k in list(el.keys())]))
# print(Counter([k for el in new_proposed_name for k in list(el.keys())]).most_common())
# display(
#     merged_global.loc[
#         merged_global["ID"].str.lower().isin(test_ids),
#         ["ID", "Name", "Name_norm", "Name_proc", "Name_proposed"],
#     ]
# )

In [129]:
# merged_global.to_parquet("merged_global.parquet")
merged_global = pd.read_parquet("merged_global.parquet")

#### Check if company is SME

In [67]:
def isPYME(SMEIndicators):
    # Evaluate if is SME based on the SMEAwardedIndicator appearances
    # If True and False are present, return None
    # TODO: make a better decision
    sme_counts = Counter(SMEIndicators)
    if True in sme_counts and False in sme_counts:
        return None
    return sme_counts.most_common(1)[0][0]


merged_global["isPYME"] = merged_global["SMEAwardedIndicator"].apply(isPYME)

#### Check CityName and PostalZone

In [128]:
def get_city_name(CityName):
    # Evaluate the city name based on the CityName appearances
    # Get most common excluding None
    # TODO: make a better decision
    city_names = Counter(CityName)
    if None in city_names.keys():
        city_names.pop(None)
    if not len(city_names) == 1:
        return None
    return city_names.most_common(1)[0][0]


def get_postal_zone(PostalZone):
    # Evaluate the postal zone based on the PostalZone appearances
    # Get most common excluding None
    # TODO: make a better decision
    postal_zones = Counter(PostalZone)
    if None in postal_zones.keys():
        postal_zones.pop(None)
    if not len(postal_zones) == 1:
        return None
    return postal_zones.most_common(1)[0][0].split(".")[0]


merged_global["City"] = merged_global["CityName"].apply(get_city_name)
merged_global["PostalCode"] = merged_global["PostalZone"].apply(get_postal_zone)

### Add info

In [69]:
# Add information based on NIF
merged_global["NIF_type"] = merged_global["ID"].apply(get_nif_type)
merged_global["prov"], merged_global["comp_type"], merged_global["comp_desc"] = list(
    zip(*merged_global["ID"].apply(get_info_from_cif))
)
merged_global["comp_type"] = merged_global["comp_type"].apply(
    lambda x: x.split(",")[0] if not pd.isna(x) else None
)

### Find UTEs

In [173]:
# # Find UTEs based on name
# ute_n = provisional_company_info["UsedNames"].apply(
#     lambda x: bool(regex.search(r"\bu(\.)?t(\.)?e(\.)?\b", " ".join(x)))
# )
# # Find UTEs based on ID
# ute_i = provisional_company_info["NIF"].apply(lambda x: x.startswith("u"))

# # provisional_company_info[ute_i | ute_n][["NIF", "NameProposed", "UsedNames"]]
# # sum(ute_n), sum(ute_i), sum(ute_n & ute_i), sum(ute_n & ute_i)/min(sum(ute_n), sum(ute_i))

# utes = provisional_company_info[ute_i | ute_n]

### Save data

In [139]:
# provisional_company_info = merged_global.rename(
#     columns={
#         "ID": "NIF",
#         "id_tender": "TenderAppearance",
#         "prov": "Province",
#         "NIF_type": "NIFtype",
#         "comp_type": "CompanyType",
#         "comp_desc": "CompanyDescription",
#         "Name_proposed": "NameProposed",
#     }
# )[
#     [
#         "NIF",
#         "NameProposed",
#         "UsedNames",
#         "Province",
#         "City",
#         "NIFtype",
#         "CompanyType",
#         "CompanyDescription",
#         "isPYME",
#         "TenderAppearance",
#     ]
# ]

In [None]:
# provisional_company_info.to_parquet("provisional_company_info.parquet")
# utes.to_parquet("utes.parquet")

provisional_company_info = pd.read_parquet("provisional_company_info.parquet")
utes = pd.read_parquet("utes.parquet")

In [150]:
provisional_company_info

Unnamed: 0,NIF,NameProposed,UsedNames,Province,City,NIFtype,CompanyType,CompanyDescription,isPYME,TenderAppearance
0,00021492x,carmen balguerias jiménez,[carmen balguerias jiménez],,,DNI,,,,[https://contrataciondelestado.es/sindicacion/...
1,00035211k,paloma sáinz de la maza de la serna,[paloma sáinz de la maza de la serna],,,DNI,,,,[https://contrataciondelestado.es/sindicacion/...
2,00067665e,alberto delgado cebrián,[alberto delgado cebrián],,,DNI,,,,[https://contrataciondelestado.es/sindicacion/...
3,00072839k,fernandez abad vicente,"[fernandez abad vicente, fernandez abad,vicente]",,,DNI,,,,[https://contrataciondelestado.es/sindicacion/...
4,00076938a,luis terán lópez,[luis terán lópez],,,DNI,,,,[https://contrataciondelestado.es/sindicacion/...
...,...,...,...,...,...,...,...,...,...,...
268636,y9709754l,martín antoine marie payen,[martín antoine marie payen],,,NIE,,,True,[https://contrataciondelestado.es/sindicacion/...
268637,y9754280v,donatella magelli,[donatella magelli],,la oliva,NIE,,,False,[https://contrataciondelestado.es/sindicacion/...
268638,z0013339g,messe berlin gmbh,[messe berlin gmbh],,,NIE,,,True,[https://contrataciondelestado.es/sindicacion/...
268639,z0059955e,stichting vulture conservation foundation,[stichting vulture conservation foundation],,,NIE,,,,[https://contrataciondelestado.es/sindicacion/...


## Empresas Zaragoza

In [40]:
# df = pd.read_csv(r"C:\Users\josea\Downloads\empresas.csv", sep=";", header=0, nrows=64, index_col=False)
# df = pd.read_excel(r"C:\Users\josea\Downloads\empresas.xlsx")

with open(r"C:\Users\josea\Downloads\empresas_zgz.csv", "r", encoding="utf-8") as f:
    emp = [
        [el.replace('"', "").strip() for el in l.lower().strip().split(";", 4)]
        for l in f.readlines()
        if len(l) > 2
    ]
cols = emp[0]
data = emp[1:]
emp_zgz = pd.DataFrame(data=data, columns=cols)
emp_zgz = emp_zgz.applymap(lambda x: x if x else None)
emp_zgz = emp_zgz.dropna(how="all").drop_duplicates().reset_index(drop=True)
emp_zgz["empresa"] = emp_zgz["empresa"].apply(clean_company_type)
emp_zgz["empresa_proc"] = emp_zgz["empresa"].apply(clean_company_type, remove_type=True)
emp_zgz["nif"] = emp_zgz["nif"].apply(lambda x: regex.sub(r"\W", "", x) if x else None)
emp_zgz["nif_type"] = emp_zgz["nif"].apply(lambda x: get_nif_type(x) if x else None)

In [41]:
print(len(emp_zgz))
display(emp_zgz.head())

889


Unnamed: 0,nombre,expediente,empresa,nif,texto,empresa_proc,nif_type
0,alquiler de un equipo de iluminación para esce...,cul09-19,fuse records - fluge zaragoza s.l.,b99053506,https://contrataciondelestado.es/wps/poc?uri=d...,fuse records - fluge zaragoza,CIF
1,contrato mixto de servicios y obras menores de...,eco-2019311,u.t.e. fcc medioambiente s.a. - fcc aqualia s.a.,,https://contrataciondelestado.es/wps/poc?uri=d...,fcc medioambiente fcc aqualia,
2,derribo de la estructura actual de polipasto y...,eco2019335-19,u.t.e. casale ipocsa,,https://contrataciondelestado.es/wps/poc?uri=d...,casale ipocsa,
3,derribo de la estructura actual de polipasto y...,eco2019335-19,depuracion de aguas de mediterraneo s.l.,b96456553,https://contrataciondelestado.es/wps/poc?uri=d...,depuracion de aguas de mediterraneo,CIF
4,obras de acondicionamiento de la planta 3ª de ...,viv01-19,u.t.e. paspa romero polo meg,,https://contrataciondelestado.es/wps/poc?uri=d...,paspa romero polo meg,


In [42]:
# Empresas con nombre repetido y cuyo nif es None en alguno de los casos
valid_emp = emp_zgz[["nif", "empresa", "nombre"]].reset_index()
display(
    valid_emp[
        valid_emp["empresa"].duplicated(keep=False)
        & valid_emp["empresa"].isin(
            valid_emp.loc[valid_emp["nif"].isna(), "empresa"].values
        )
    ].sort_values(by="empresa")
)
# (No hay ningún nif para rellenar)

Unnamed: 0,index,nif,empresa,nombre
410,410,,artal vehículos zaragoza s.l.,"suministro de dos vehículos tipo pick-up, dobl..."
547,547,,artal vehículos zaragoza s.l.,suministro de dos (2) vehículos híbridos no en...
341,341,,comsa service facility management s.a.,servicio de mantenimiento y conducción de inst...
727,727,,comsa service facility management s.a.,servicios de conducción y mantenimiento de las...
305,305,,dnota medioambiente s.l.,"suministro, instalación y mantenimiento durant..."
500,500,,dnota medioambiente s.l.,"suministro, instalación y mantenimiento en el ..."
389,389,,dolores trading s.l. la buganvilla,servicio de catering para los plenos del ayunt...
629,629,,dolores trading s.l. la buganvilla,servicio de catering para los plenos del ayunt...
433,433,,tecnica y proyectos s.a. typsa,asistencia técnica de apoyo en materia de reda...
451,451,,tecnica y proyectos s.a. typsa,redacción del proyecto de urbanización de la p...


In [46]:
# valid_emp = emp_zgz.groupby("nif").agg(set).reset_index()
valid_emp = emp_zgz[["nif", "empresa"]].reset_index()
print(len(valid_emp))
display(valid_emp.head())

889


Unnamed: 0,index,nif,empresa
0,0,b99053506,fuse records - fluge zaragoza s.l.
1,1,,u.t.e. fcc medioambiente s.a. - fcc aqualia s.a.
2,2,,u.t.e. casale ipocsa
3,3,b96456553,depuracion de aguas de mediterraneo s.l.
4,4,,u.t.e. paspa romero polo meg


## Compare companies from Tenders and Zaragoza

In [47]:
# final_merged[["ID", "Name", "count"]]

In [48]:
# Create dataframe with counter for appearances
ids = [Counter({k: v}) for k, v in final_merged[["ID", "count"]].values]
names = [Counter({k: v}) for k, v in final_merged[["Name", "count"]].values]

### Same nif

In [49]:
common_nif = set(final_merged["ID"]) & set(valid_emp["nif"])
# Companies
common_comp_nif = pd.DataFrame(data={"ID": final_merged["ID"].values, "Name": names})
common_comp_nif = common_comp_nif[common_comp_nif["ID"].isin(common_nif)]
# Zaragoza
common_emp_nif = (
    valid_emp[valid_emp["nif"].isin(common_nif)]
    .groupby("nif")
    .agg({"index": list, "empresa": Counter})
    .reset_index()
)
# Merge
c_nif = pd.merge(
    common_comp_nif,
    common_emp_nif,
    left_on="ID",
    right_on="nif",
).reset_index(drop=True)

# Select final values
final_name = []
for vals in zip(c_nif["Name"].values, c_nif["empresa"].values):
    v = Counter()
    [v.update(el) for el in vals]
    final_name.append(v)
c_nif["final_name"] = final_name
c_nif["final_name"] = c_nif["final_name"].apply(lambda x: suggest_value(x)[0])
c_nif["final_id"] = c_nif["ID"]

print(len(c_nif))
display(c_nif.head())

# Update used and remaining
idx = set(chain.from_iterable(c_nif["index"]))
rest_valid_emp = valid_emp[~valid_emp["index"].isin(idx)]

469


Unnamed: 0,ID,Name,nif,index,empresa,final_name,final_id
0,07046603r,{'alba mª ortega altamirano': 1},07046603r,"[590, 604]",{'alba m.ª ortega altamirano': 2},alba m.ª ortega altamirano,07046603r
1,07823314w,{'jesus gutierrez sierra': 86},07823314w,[489],{'jesús gutiérrez sierra': 1},jesus gutierrez sierra,07823314w
2,17144195h,{'jaime macipe gayarre': 7},17144195h,[616],{'jaime macipe gayarre': 1},jaime macipe gayarre,17144195h
3,17215345y,{'pascual pardos sierra': 1},17215345y,[324],{'pascual pardos sierra': 1},pascual pardos sierra,17215345y
4,17747571n,{'dª maría nila atienza fanlo': 1},17747571n,"[390, 630]",{'mª nila atienza fanlo': 2},mª nila atienza fanlo,17747571n


In [50]:
valid_emp[valid_emp["nif"].isin(common_nif)]

Unnamed: 0,index,nif,empresa
0,0,b99053506,fuse records - fluge zaragoza s.l.
3,3,b96456553,depuracion de aguas de mediterraneo s.l.
5,5,b99413874,inardec construccion creativa s.l.
6,6,a50070663,construcciones rubio morte s.a.
7,7,a80241789,serveo servicios s.a.u. antigua ferrovial serv...
...,...,...,...
884,884,w0049001a,arag s.e.e. s.e.e.
885,885,a08055196,das defensa del automovilista y de siniestros ...
886,886,b61790663,rotagrama lithoformas s.l.
887,887,a20524310,impresiones transkrit s.a.


### Same name

In [51]:
common_name = set(final_merged["Name"]) & set(rest_valid_emp["empresa"])
# Companies
common_comp_name = pd.DataFrame(data={"ID": ids, "Name": final_merged["Name"].values})
common_comp_name = common_comp_name[common_comp_name["Name"].isin(common_name)]
# Zaragoza
common_emp_name = (
    rest_valid_emp[rest_valid_emp["empresa"].isin(common_name)]
    .groupby("empresa")
    .agg({"index": list, "nif": Counter})
    .reset_index()
)
# Merge
c_name = pd.merge(
    common_comp_name,
    common_emp_name,
    left_on="Name",
    right_on="empresa",
).reset_index(drop=True)

# Select final values
final_id = []
for vals in zip(c_name["ID"].values, c_name["nif"].values):
    v = Counter()
    [v.update(el) for el in vals]
    final_id.append(v)
c_name["final_id"] = final_id
c_name["final_id"] = c_name["final_id"].apply(lambda x: suggest_value(x)[0])
c_name["final_name"] = c_name["empresa"]

print(len(c_name))
display(c_name.head())


# Update used and remaining
idx.update(set(chain.from_iterable(c_name["index"])))
rest_valid_emp = valid_emp[~valid_emp["index"].isin(idx)]

23


Unnamed: 0,ID,Name,empresa,index,nif,final_id,final_name
0,{'05005241g': 2},wilmer ossa buitrago,wilmer ossa buitrago,"[586, 593, 608, 710]",{None: 4},05005241g,wilmer ossa buitrago
1,{'07992498k': 2},juan gabriel rodriguez holgado,juan gabriel rodriguez holgado,[33],{'7992498k': 1},07992498k,juan gabriel rodriguez holgado
2,{'40972429e': 58},juan sirera pascual,juan sirera pascual,[396],{None: 1},40972429e,juan sirera pascual
3,{'a48228399': 39},carsa,carsa,[516],{None: 1},a48228399,carsa
4,{'a50066190': 26},durban maquinaria para la construccion s.a.,durban maquinaria para la construccion s.a.,[411],{None: 1},a50066190,durban maquinaria para la construccion s.a.


In [52]:
matches = pd.concat([c_nif, c_name])
matches

Unnamed: 0,ID,Name,nif,index,empresa,final_name,final_id
0,07046603r,{'alba mª ortega altamirano': 1},07046603r,"[590, 604]",{'alba m.ª ortega altamirano': 2},alba m.ª ortega altamirano,07046603r
1,07823314w,{'jesus gutierrez sierra': 86},07823314w,[489],{'jesús gutiérrez sierra': 1},jesus gutierrez sierra,07823314w
2,17144195h,{'jaime macipe gayarre': 7},17144195h,[616],{'jaime macipe gayarre': 1},jaime macipe gayarre,17144195h
3,17215345y,{'pascual pardos sierra': 1},17215345y,[324],{'pascual pardos sierra': 1},pascual pardos sierra,17215345y
4,17747571n,{'dª maría nila atienza fanlo': 1},17747571n,"[390, 630]",{'mª nila atienza fanlo': 2},mª nila atienza fanlo,17747571n
...,...,...,...,...,...,...,...
18,{'b98134752': 1},beberapid c.b.,{'e98134752': 1},[44],beberapid c.b.,beberapid c.b.,b98134752
19,{'w0173298a': 23},fundación delegación fundación finnova,{None: 1},[514],fundación delegación fundación finnova,fundación delegación fundación finnova,w0173298a
20,{'b86907128': 1442},agilent technologies spain s.l.,{'b82381682': 2},"[25, 780]",agilent technologies spain s.l.,agilent technologies spain s.l.,b86907128
21,{'a08432338': 105},mantenimiento y montajes industriales s.a.,{None: 1},[722],mantenimiento y montajes industriales s.a.,mantenimiento y montajes industriales s.a.,a08432338


In [53]:
final_merged[final_merged["ID"] == "b50931302"]

Unnamed: 0,ID,Name_norm,index,id_tender,CompanyTypeCode,Name,CityName,IdentificationCode,PostalZone,CountrySubentityCode,Name_proc,count,NIF_type,prov,comp_type,comp_desc


In [54]:
validate_nif("05412400v")

'05412400v'

In [56]:
# (
#     valid_emp.loc[
#         valid_emp["empresa"].duplicated(keep=False)
#         & valid_emp["empresa"].isin(
#             valid_emp.loc[valid_emp["nif"].isna(), "empresa"].values
#         ),
#         ["nif", "empresa", "nombre"],
#     ].sort_values(by="empresa")
# ).to_csv("empresas_zgz_no_nif.csv")

In [59]:
emp_zgz_wrong = valid_emp.loc[
    ~valid_emp["index"].isin(idx), ["nif", "empresa"]
].dropna()

In [74]:
validate_nif("17410114f", correct=True, verbose=True)

Error: wrong first digit.
Error: wrong control digit.
Error: wrong first digit.


In [76]:
emp_zgz_wrong[
    ~emp_zgz_wrong["nif"].apply(validate_nif, correct=True).apply(bool)
].to_csv("empresas_zgz_mal_nif.csv", index=False)

In [None]:
len(idx), len(valid_emp), len(idx) / len(valid_emp)

(782, 889, 0.8796400449943758)

In [None]:
# Common name, different nif
c_name_diff_nif_comp_ids = list(
    set(common_comp_name.index) - set(common_comp_nif.index)
)
c_name_diff_nif_emp_ids = list(set(common_emp_name.index) - set(common_emp_nif.index))

In [None]:
# emp_zgz.loc[c_name_diff_nif_emp_ids]

In [None]:
final_merged.loc[c_name_diff_nif_comp_ids].head()

Unnamed: 0,ID,Name_norm,index,CompanyTypeCode,Name,CityName,IdentificationCode,PostalZone,CountrySubentityCode,Name_proc,count
241254,b96911037,bluedec,"[345549, 345550, 345551, 345552, 345553, 345554]",,bluedec s.l.,,es,,es523,bluedec,49
218790,b84044205,imagicvision,[313116],,imagic vision s.l.,madrid,es,28033.0,es43,imagic vision,10
184139,b50941855,agenciaaragonesadenoticias,[260591],,agencia aragonesa de noticias s.l.,,,,,agencia aragonesa de noticias,10
243852,b98134752,beberapid,[349790],,beberapid c.b.,,,,,beberapid,1
11181,07992498k,juangabrielrodriguezholgado,"[14027, 14028]",,juan gabriel rodriguez holgado,,,,,juan gabriel rodriguez holgado,2


In [None]:
# c_concat = pd.concat([c_nif, c_name])
# c_concat[~c_concat.duplicated(keep=False)]

In [280]:
set(emp_zgz["nif"]) - set(final_merged["ID"])

{'17410114f',
 '17732626v',
 '17763291t',
 '18426455m',
 '514863471',
 '5412400v',
 '7992498k',
 'a27178789b50113562',
 'a50004431',
 'a50016666',
 'a50032002',
 'a50169333',
 'a81196743',
 'a96141361',
 'b01644889',
 'b06851471',
 'b50349323',
 'b50709641',
 'b50755149',
 'b50931302',
 'b54941855',
 'b673750006',
 'b79309902',
 'b82381682',
 'b840444205',
 'b84049238',
 'b85508059',
 'b9691037',
 'b99399438',
 'b99412629',
 'b99532574',
 'bb09671256',
 'e98134752',
 'g50493097',
 'n0171609a',
 'q99118598',
 'siret56211521200077',
 'sl',
 'u02778884',
 'u99489411'}

In [282]:
# common_nif = set(df["nif"]) - set(final_merged["ID"])
# common_comp = final_merged[final_merged["ID"].isin(common_nif)][
#     ["ID", "Name"]
# ].drop_duplicates()
# common_emp = df[df["nif"].isin(common_nif)][["nif", "empresa"]].drop_duplicates()

In [None]:
# dup_emp = df["empresa"].duplicated(keep=False)
# dup_nif = df["nif"].duplicated(keep=False)