# Imports

In [1]:
import json
from collections import Counter
from itertools import chain
from pathlib import Path

import numpy as np
import pandas as pd
import regex
from tqdm import tqdm

In [2]:
import time

In [3]:
from src.companies.processor import clean_company_type, normalize_company_name
from src.companies.utils import replace_company_types
from src.nif_validation.validation import (
    get_nif_type,
    validate_nif,
    is_valid_nif,
    # is_valid_cif,
    # is_valid_dni,
    # is_valid_nie,
    get_info_from_cif,
)
from src.utils.utils import fill_to_length, merge_orig_dataframes
from src.utils.utils_parallelization import (
    parallelize_function,
    parallelize_function_with_progress_bar,
)

# Load info

In [4]:
# Load the GenCat JSON export and flatten its nested records into a DataFrame.
# JSON is UTF-8 by specification, so the encoding is stated explicitly instead
# of relying on the platform default (cp1252 on Windows), which would garble
# accented characters.
# NOTE(review): absolute, machine-specific path — consider a configurable data dir.
with open(
    r"C:\Users\josea\Downloads\genCat_Junio_2023.json", "r", encoding="utf-8"
) as f:
    gencat = pd.json_normalize(json.load(f))

In [5]:
# df = pd.read_csv(r"C:\Users\josea\Downloads\Contractaci__p_blica_a_Catalunya__publicacions_a_la_Plataforma_de_serveis_de_contractaci__p_blica.csv")
# Load the companies CSV with pandas' default parsing options.
# NOTE(review): absolute, machine-specific path — the notebook only runs on the
# author's machine as written.
df_emp = pd.read_csv(r"C:\Users\josea\Downloads\empresas.csv")

  df_emp = pd.read_csv(r"C:\Users\josea\Downloads\empresas.csv")


In [6]:
# # Load data
# dir_df = Path("C:/Users/josea/Documents/Trabajo/data/metadata/insiders.parquet")
# df_in = pd.read_parquet(dir_df)
# dir_df = Path("C:/Users/josea/Documents/Trabajo/data/metadata/outsiders.parquet")
# df_ou = pd.read_parquet(dir_df)
# dir_df = Path("C:/Users/josea/Documents/Trabajo/data/metadata/minors.parquet")
# df_mi = pd.read_parquet(dir_df)

In [7]:
# df_in.columns = [".".join([el for el in c if el]) for c in df_in.columns]
# df_ou.columns = [".".join([el for el in c if el]) for c in df_ou.columns]
# df_mi.columns = [".".join([el for el in c if el]) for c in df_mi.columns]

In [8]:
# # ERROR A REVISAR
# df_mi.loc[
#     df_mi["id"]
#     .str.lower()
#     .isin(
#         [
#             "https://contrataciondelestado.es/sindicacion/datosabiertosmenores/8622601",
#             "https://contrataciondelestado.es/sindicacion/datosabiertosmenores/8410165",
#         ]
#     ),
#     [
#         "ContractFolderStatus.TenderResult.WinningParty.PartyIdentification.ID",
#         "ContractFolderStatus.TenderResult.WinningParty.PartyName.Name",
#     ],
# ]

# Aux functions
Helper functions used by the processing cells below.

In [9]:
def nif_from_name(name):
    """Split a company name into its non-NIF words and the NIF embedded in it.

    Every whitespace-separated token of ``name`` is passed to ``validate_nif``;
    tokens for which it returns a truthy value are treated as NIFs and removed
    from the name.

    Parameters
    ----------
    name: str
        Raw company name, possibly containing one or more NIF tokens.

    Returns
    -------
    tuple
        ``(new_name, new_nif)`` where ``new_name`` is the name without the NIF
        tokens (joined with single spaces) and ``new_nif`` is the most frequent
        NIF token (first seen wins on ties), or ``np.nan`` when none is found.
    """
    tokens = name.split()
    is_nif = [bool(validate_nif(token)) for token in tokens]

    # Plain lists are used instead of numpy masks: an empty token list would
    # produce a float64 array, and ``~`` on it raises TypeError.
    new_name = " ".join(token for token, flag in zip(tokens, is_nif) if not flag)
    nif_tokens = [token for token, flag in zip(tokens, is_nif) if flag]
    new_nif = Counter(nif_tokens).most_common(1)[0][0] if nif_tokens else np.nan
    return new_name, new_nif

In [10]:
import contextlib


@contextlib.contextmanager
def log_time(task_name: str):
    """Context manager that prints how long the wrapped block took.

    The line ``"<task_name> - <seconds>"`` is printed when the block exits,
    including when it exits because of an exception (the exception still
    propagates to the caller).

    Parameters
    ----------
    task_name: str
        Label printed in front of the elapsed time.
    """
    # perf_counter is monotonic, so the measurement cannot go negative or jump
    # if the system clock is adjusted while the block runs.
    t0 = time.perf_counter()
    try:
        yield
    finally:
        print(f"{task_name} - {time.perf_counter() - t0}")


def execute_function(func, data, prefer=None, workers=-1, *args, **kwargs):
    """Apply ``func`` to every element of ``data``, serially or in parallel.

    Parameters
    ----------
    func: callable
        Function applied to each element.
    data: pd.Series or list-like
        Elements to process. In the serial path anything that is not already a
        pandas Series/DataFrame is wrapped in a ``pd.Series`` first.
    prefer: str, optional
        When falsy the work is done serially with ``.apply``; otherwise it is
        forwarded, together with ``workers``, to ``parallelize_function``.
    workers: int
        Forwarded to ``parallelize_function`` (unused in the serial path).
    *args, **kwargs
        Extra positional and keyword arguments passed on to ``func``.

    Returns
    -------
    The result of ``.apply`` in the serial path, or whatever
    ``parallelize_function`` returns in the parallel path.
    """
    if prefer:
        return parallelize_function(
            func, data, *args, prefer=prefer, workers=workers, **kwargs
        )

    # Plain lists/arrays have no ``.apply``; wrap them so both call styles work.
    if not isinstance(data, (pd.Series, pd.DataFrame)):
        data = pd.Series(data)
    # Extra positionals must go through ``args=``: passed positionally they
    # would bind to pandas' own parameters instead of reaching ``func``.
    return data.apply(func, args=args, **kwargs)


def clean_df(df: pd.DataFrame, prefer=None, workers=-1):
    """Clean the ``ID`` and ``Name`` columns of a companies dataframe.

    Steps (each one timed with ``log_time``):

    1. Remove whitespace adjacent to punctuation in every string cell.
    2. Replace ``ID`` with the output of ``validate_nif``.
    3. Remove the validated ID from ``Name`` and run ``clean_company_type``
       with ``remove_type=False``.
    4. ``Name_proc``: ``clean_company_type`` with ``remove_type=True``.
    5. ``Name_norm``: ``normalize_company_name`` applied to ``Name_proc``.

    Parameters
    ----------
    df: pd.DataFrame
        Must contain the columns ``"ID"`` and ``"Name"``.
    prefer, workers
        Forwarded to ``execute_function`` (serial when ``prefer`` is falsy).

    Returns
    -------
    pd.DataFrame
        A new dataframe; the input frame is not modified.
    """

    def _strip_inner_whitespace(value):
        # Only strings are rewritten; scalar missing values become None and any
        # other object (numbers, lists, ...) is passed through untouched
        # instead of crashing inside regex.sub / pd.isna.
        if isinstance(value, str):
            return regex.sub(r"((?<=\w+\W)\s+)|(\s+(?=\W\w+))", "", value)
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return None
        return value

    # Remove unwanted whitespace
    with log_time("Removing unwanted whitespace"):
        # DataFrame.map replaces the deprecated applymap in pandas >= 2.1;
        # checking the class (not the instance) avoids picking up a column
        # that happens to be called "map".
        elementwise = df.map if hasattr(pd.DataFrame, "map") else df.applymap
        df = elementwise(_strip_inner_whitespace)

    # Validate NIF
    with log_time("Validating NIF"):
        df["ID"] = execute_function(validate_nif, df["ID"], prefer, workers)

    # Clean company type
    with log_time("Cleaning company type"):
        # Remove the ID from the name as literal text (not as a regex pattern),
        # and keep the frame's index so the assignment below aligns row by row.
        names = pd.Series(
            [
                n.replace(i, "") if isinstance(n, str) and isinstance(i, str) else n
                for i, n in df[["ID", "Name"]].values
            ],
            index=df.index,
            dtype=object,
        )
        df["Name"] = execute_function(
            clean_company_type, names, prefer, workers, remove_type=False
        )

    # Remove company type
    with log_time("Removing company type"):
        df["Name_proc"] = execute_function(
            clean_company_type, df["Name"], prefer, workers, remove_type=True
        )

    # Normalize company name
    with log_time("Normalizing company name"):
        df["Name_norm"] = execute_function(
            normalize_company_name, df["Name_proc"], prefer, workers
        )

    return df

# Merge data from different sources

In [11]:
# df_companies = merge_orig_dataframes(
#     dir_metadata=Path("C:/Users/josea/Documents/Trabajo/data/metadata/")
# )
# df_companies.to_parquet("data/companies.parquet")

# Load the companies frame from its parquet cache; the commented-out lines
# above show how that cache file was produced.
df_companies = pd.read_parquet("data/companies.parquet")

# Obtain individual companies

In [12]:
# # Use only those where all dimensions match
# # (e.g. same number of companies and companies ids)
# # and drop NAs
# df_companies = df_companies[
#     df_companies[["ID", "Name"]]
#     .applymap(lambda x: not pd.isna(x[0]))
#     .apply(all, axis=1)
# ]
# df_companies = df_companies[
#     df_companies.applymap(lambda x: len(x) if x[0] else None).apply(
#         lambda x: len(set([el for el in x if not pd.isnull(el)])) == 1,
#         axis=1,
#     )
# ]
# companies_columns = list(df_companies.columns)
# # Get number of companies by tender
# df_companies["_len"] = df_companies["ID"].apply(len)

# # Fill lists of None to have the same number of elements and explode later
# companies = pd.DataFrame(
#     df_companies.apply(
#         lambda x: [fill_to_length(list(el), x[-1]) for el in x[:-1]], axis=1
#     ).tolist(),
#     columns=companies_columns,
# )

# # Split companies in rows
# companies = companies.explode(companies_columns)
# companies = companies.reset_index(drop=True)
# display(companies.head())

In [13]:
# # FOREIGN IDs
# c_test = companies.drop_duplicates(subset="ID")
# c_test_valid = c_test["ID"].apply(is_valid_nif)
# val = "abcdefghjnpqrsuvw"
# c_test_sub = c_test.loc[~c_test_valid, "ID"]
# c_test_sub[
#     c_test_sub.apply(
#         lambda x: x[0] not in val and x[0].isalpha() and not x[:2] in ["xx",]
#     )
# ]
# display(c_test_sub)

In [14]:
# with log_time("Clean df"):
#     companies_clean = clean_df(companies, prefer="processes", workers=-1)

In [15]:
# # Aggregate company info in lists
# companies_clean["SMEAwardedIndicator"] = companies_clean["SMEAwardedIndicator"].apply(
#     lambda x: None if not x else True if x == "true" else False
# )
# companies_clean = (
#     companies_clean
#     # companies[["ID", "Name", "Name_proc", "Name_norm"]]
#     .groupby(["ID", "Name_norm"])
#     .agg(list)
#     .reset_index()
# )
# companies_clean["count"] = companies_clean["Name_proc"].apply(len)
# companies_clean = companies_clean.reset_index()

#### Unique names and IDs

In [16]:
# # Unique names and IDs
# # These companies have always appeared with the same (id-name) association
# cols_vals = [
#     c for c in companies_clean.columns if c not in ["ID", "Name_norm", "count"]
# ]
# unique_ID = ~companies_clean["ID"].duplicated(keep=False)
# unique_NAME = ~companies_clean["Name_norm"].duplicated(keep=False)

# # Unique by ID and name
# unique = companies_clean[unique_ID & unique_NAME]

# # Non unique IDs
# non_unique_ids = list(set(companies_clean["index"]) - set(unique["index"]))
# non_unique = companies_clean[companies_clean["index"].isin(non_unique_ids)]

# unique["index"] = unique["index"].apply(lambda x: [x])
# non_unique["index"] = non_unique["index"].apply(lambda x: [x])
# print(unique.shape, non_unique.shape)

#### Repeated IDs and Names

In [17]:
# Choose definitive values
def suggest_value(elements):
    """Pick the most representative value among ``elements``.

    Values are ranked by number of appearances; missing values (``None`` and
    float NaN) are ignored. Among the values tied for the highest count the
    longest one wins, and remaining ties are broken alphabetically.

    Parameters
    ----------
    elements: iterable or mapping
        Either an iterable of values or a mapping ``value -> count`` (e.g. a
        ``Counter``), in which case the given counts are used directly.

    Returns
    -------
    list
        A one-element list with the chosen value, or ``[None]`` when there is
        no non-missing value to choose from.
    """
    counts = Counter(elements)
    # Missing values carry no information and NaN would break ``len`` below.
    missing = [
        key
        for key in counts
        if key is None or (isinstance(key, float) and key != key)
    ]
    for key in missing:
        del counts[key]

    if not counts:
        return [None]

    top_count = max(counts.values())
    candidates = [key for key, count in counts.items() if count == top_count]
    return [min(candidates, key=lambda x: (-len(x), x))]


# Repeated IDs
def unify_repeated_col(df: pd.DataFrame, rep_col: str, un_col: str):
    """
    Takes a dataframe with duplicated values in one column that should be unique (e.g. repeated IDs)
    and another column that should also be unique given the previous one (e.g. title)
    and unifies it so that it chooses the best option.

    Only the rows whose ``rep_col`` value appears more than once are processed;
    they are grouped by ``rep_col`` and ``suggest_value`` picks the value kept
    in ``un_col``.

    Parameters
    ----------
    df: pd.DataFrame
        Must also contain the columns ``"count"`` and ``"index"``.
        NOTE(review): every column other than ``rep_col``, ``un_col`` and
        ``"count"`` is flattened with ``chain.from_iterable`` or summed, so it
        presumably has to hold lists — confirm against the caller.
    rep_col: str
        Name of column with repeated values that will be unified
    un_col: str
        Name of column with non unique values

    Returns
    -------
    pd.DataFrame
        One row per repeated ``rep_col`` value, with ``un_col`` holding the
        suggested value, ``"index"`` and ``"count"`` summed over the group and
        the remaining columns flattened into a single list per group.
    """
    # Non-unique columns
    cols_vals = [c for c in df.columns if c not in [rep_col, "count", "index"]]
    repeated_rows = df[rep_col].duplicated(keep=False)
    # Explicit copy: the frame is written to below, and assigning into a
    # filtered slice of ``df`` triggers SettingWithCopyWarning.
    repeated = df[repeated_rows].copy()

    # Count times the values appear
    # (each value becomes a list repeated ``count`` times, so that the
    # flattened group list reflects how often every value was seen)
    repeated.loc[repeated.index, [un_col]] = (
        repeated.loc[repeated.index, un_col].apply(lambda x: [x])
        * repeated.loc[repeated.index, "count"]
    )
    # Group by repeated
    repeated = repeated.reset_index()
    repeated = repeated.groupby(rep_col).agg(
        {
            # "index": list,
            "index": sum,
            **{c: lambda x: list(chain.from_iterable(x)) for c in cols_vals},
            "count": sum,
        }
    )
    # Get the most common values for each column
    repeated.loc[repeated.index, un_col] = (
        repeated.loc[repeated.index, un_col].apply(suggest_value).values
    )
    repeated = repeated.reset_index()

    # Concatenate unique
    # (keep the rows with exactly one suggestion and unwrap it from its list)
    use_index = repeated.loc[repeated[un_col].apply(len) == 1, un_col].index
    repeated.loc[use_index, un_col] = repeated.loc[use_index, un_col].apply(
        lambda x: x[0]
    )
    unified = repeated.loc[use_index]

    return unified

In [18]:
# # Obtain unique ID-name
# unified_ID = unify_repeated_col(non_unique, "ID", "Name_norm")
# # Update non_unique
# non_unique_ids = list(
#     set(chain.from_iterable(non_unique["index"]))
#     - set(chain.from_iterable(unified_ID["index"]))
# )
# # non_unique = companies_clean.loc[non_unique_ids]
# non_unique = companies_clean[companies_clean["index"].isin(non_unique_ids)]
# non_unique["index"] = non_unique["index"].apply(lambda x: [x])

In [19]:
# # Obtain unique name-ID
# unified_NAME = unify_repeated_col(non_unique, "Name_norm", "ID")
# # Update non_unique
# non_unique_ids = list(
#     set(chain.from_iterable(non_unique["index"]))
#     - set(chain.from_iterable(unified_NAME["index"]))
# )
# # non_unique = companies_clean.loc[non_unique_ids]
# non_unique = companies_clean[companies_clean["index"].isin(non_unique_ids)]
# non_unique["index"] = non_unique["index"].apply(lambda x: [x])

#### Merge Companies info

In [20]:
# # Global
# # Merge unique+unifiedID+unifiedName+nonUnique
# merged_global = pd.concat([unique, unified_ID, unified_NAME, non_unique])
# cols_vals = [
#     c
#     for c in merged_global.columns
#     if c not in ["ID", "Name_norm", "count", "index", "id_tender"]
# ]
# merged_global = merged_global.groupby(["ID", "Name_norm"]).agg(
#     {
#         # "index": lambda x: list(chain.from_iterable(x)),
#         "index": sum,
#         "id_tender": sum,
#         **{c: lambda x: list(chain.from_iterable(x)) for c in cols_vals},
#         "count": sum,
#     }
# )
# merged_global = merged_global.reset_index()
# print(len(merged_global))
# display(merged_global.head())

#### Unify found names

In [21]:
# # Get all names found in the tenders
# merged_global["UsedNames"] = (merged_global["Name"] + merged_global["Name_proc"]).apply(
#     lambda x: sorted(list(set(x)))
# )

#### Propose a final name

In [22]:
# # Initial computations
# data = merged_global["Name_proc"]
# local_frequencies = data.apply(Counter)
# proposed_names = local_frequencies.apply(lambda x: max(x, key=x.get))
# # First proposal
# merged_global["Name_proposed"] = proposed_names

# # Total names
# global_freq_dict = data.explode().value_counts().to_dict()
# names_list = list(global_freq_dict.keys())

In [23]:
# def adjust_names_for_batch_v2(names_batch, local_frequencies, proposed_names):
#     """
#     Adjusts and determines the most probable names for a given batch of names.

#     For each name in the batch, the function checks if the name is associated
#     with multiple IDs. If so, it retains the name only for the ID where the name
#     has the highest frequency (local priority). For the other IDs, it assigns
#     the next best name based on local frequency.

#     Parameters:
#     -----------
#     names_batch: List[str]
#         A batch of names to be processed.
#     local_frequencies: pd.Series
#         Series containing the frequency count of each name for each ID.
#     proposed_names: pd.Series
#         Series mapping each ID to its currently assigned name.

#     Returns:
#     --------
#     modified_real_names: Dict[int, str]:
#         A dictionary with IDs as keys and their adjusted real names as values.
#     """
#     modified_real_names = {}
#     for name in names_batch:
#         ids_with_name = proposed_names[proposed_names == name].index.tolist()
#         if len(ids_with_name) > 1:
#             ids_with_name.sort(key=lambda idx: -local_frequencies.loc[idx][name])
#             for idx in ids_with_name[1:]:
#                 del local_frequencies.loc[idx][name]
#                 if local_frequencies.loc[idx]:
#                     modified_real_names[idx] = max(
#                         local_frequencies.loc[idx], key=local_frequencies.loc[idx].get
#                     )

#     return modified_real_names


# new_proposed_name = parallelize_function_with_progress_bar(
#     func=adjust_names_for_batch_v2,
#     data=names_list,
#     batch_size=1000,
#     desc="progress_p2",
#     workers=-1,
#     prefer="processes",
#     output="series",
#     local_frequencies=local_frequencies,
#     proposed_names=proposed_names,
# )


# # Update the proposed names with the new ones
# for item in new_proposed_name:
#     merged_global.loc[item.keys(), "Name_proposed"] = list(item.values())

In [24]:
# # merged_global.to_parquet("data/merged_global.parquet")
# merged_global = pd.read_parquet("data/merged_global.parquet")

#### Check if company is SME

In [25]:
# def isPYME(SMEIndicators):
#     # Evaluate if is SME based on the SMEAwardedIndicator appearances
#     # If True and False are present, return None
#     # TODO: make a better decision
#     sme_counts = Counter(SMEIndicators)
#     if True in sme_counts and False in sme_counts:
#         return None
#     return sme_counts.most_common(1)[0][0]


# merged_global["isPYME"] = merged_global["SMEAwardedIndicator"].apply(isPYME)

#### Check CityName and PostalZone

In [26]:
# def get_city_name(CityName):
#     # Evaluate the city name based on the CityName appearances
#     # Get most common excluding None
#     # TODO: make a better decision
#     city_names = Counter(CityName)
#     if None in city_names.keys():
#         city_names.pop(None)
#     if not len(city_names) == 1:
#         return None
#     return city_names.most_common(1)[0][0]


# def get_postal_zone(PostalZone):
#     # Evaluate the postal zone based on the PostalZone appearances
#     # Get most common excluding None
#     # TODO: make a better decision
#     postal_zones = Counter(PostalZone)
#     if None in postal_zones.keys():
#         postal_zones.pop(None)
#     if not len(postal_zones) == 1:
#         return None
#     return postal_zones.most_common(1)[0][0].split(".")[0]


# merged_global["City"] = merged_global["CityName"].apply(get_city_name)
# merged_global["PostalCode"] = merged_global["PostalZone"].apply(get_postal_zone)

### Add info

In [27]:
# # Add information based on NIF
# merged_global["NIF_type"] = merged_global["ID"].apply(get_nif_type)
# merged_global["prov"], merged_global["comp_type"], merged_global["comp_desc"] = list(
#     zip(*merged_global["ID"].apply(get_info_from_cif))
# )
# merged_global["comp_type"] = merged_global["comp_type"].apply(
#     lambda x: x.split(",")[0] if not pd.isna(x) else None
# )

### Find UTEs

In [28]:
# # Find UTEs based on name
# ute_n = provisional_company_info["UsedNames"].apply(
#     lambda x: bool(regex.search(r"\bu(\.)?t(\.)?e(\.)?\b", " ".join(x)))
# )
# # Find UTEs based on ID
# ute_i = provisional_company_info["NIF"].apply(lambda x: x.startswith("u"))

# # provisional_company_info[ute_i | ute_n][["NIF", "NameProposed", "UsedNames"]]
# # sum(ute_n), sum(ute_i), sum(ute_n & ute_i), sum(ute_n & ute_i)/min(sum(ute_n), sum(ute_i))

# utes = provisional_company_info[ute_i | ute_n]

### Save data

In [29]:
# provisional_company_info = merged_global.rename(
#     columns={
#         "ID": "NIF",
#         "id_tender": "TenderAppearance",
#         "prov": "Province",
#         "NIF_type": "NIFtype",
#         "comp_type": "CompanyType",
#         "comp_desc": "CompanyDescription",
#         "Name_proposed": "NameProposed",
#     }
# )[
#     [
#         "NIF",
#         "NameProposed",
#         "UsedNames",
#         "Province",
#         "City",
#         "NIFtype",
#         "CompanyType",
#         "CompanyDescription",
#         "isPYME",
#         "TenderAppearance",
#     ]
# ]

In [30]:
# provisional_company_info.to_parquet("data/provisional_company_info.parquet")
# utes.to_parquet("data/utes.parquet")

# Load both frames from their parquet caches; the commented-out lines above
# show how those files were written.
provisional_company_info = pd.read_parquet("data/provisional_company_info.parquet")
utes = pd.read_parquet("data/utes.parquet")

In [31]:
provisional_company_info

Unnamed: 0,NIF,NameProposed,UsedNames,Province,City,NIFtype,CompanyType,CompanyDescription,isPYME,TenderAppearance
0,00021492x,carmen balguerias jiménez,[carmen balguerias jiménez],,,DNI,,,,[https://contrataciondelestado.es/sindicacion/...
1,00035211k,paloma sáinz de la maza de la serna,[paloma sáinz de la maza de la serna],,,DNI,,,,[https://contrataciondelestado.es/sindicacion/...
2,00067665e,alberto delgado cebrián,[alberto delgado cebrián],,,DNI,,,,[https://contrataciondelestado.es/sindicacion/...
3,00072839k,fernandez abad vicente,"[fernandez abad vicente, fernandez abad,vicente]",,,DNI,,,,[https://contrataciondelestado.es/sindicacion/...
4,00076938a,luis terán lópez,[luis terán lópez],,,DNI,,,,[https://contrataciondelestado.es/sindicacion/...
...,...,...,...,...,...,...,...,...,...,...
268636,y9709754l,martín antoine marie payen,[martín antoine marie payen],,,NIE,,,True,[https://contrataciondelestado.es/sindicacion/...
268637,y9754280v,donatella magelli,[donatella magelli],,la oliva,NIE,,,False,[https://contrataciondelestado.es/sindicacion/...
268638,z0013339g,messe berlin gmbh,[messe berlin gmbh],,,NIE,,,True,[https://contrataciondelestado.es/sindicacion/...
268639,z0059955e,stichting vulture conservation foundation,[stichting vulture conservation foundation],,,NIE,,,,[https://contrataciondelestado.es/sindicacion/...


## Empresas Zaragoza

In [32]:
# Load the Zaragoza companies and bring them to this notebook's own format
emp = pd.read_excel("data/licitador_09_23.xls")
emp_zgz = emp[["ID_EMPRESA", "NIF", "NOMBRE", "UTE"]].copy()
emp_zgz["empresa"] = emp_zgz["NOMBRE"].apply(clean_company_type)
emp_zgz["empresa_proc"] = emp_zgz["empresa"].apply(clean_company_type, remove_type=True)
# Lower-case the NIF and drop every non-word character; missing NIFs become None
emp_zgz["nif"] = emp_zgz["NIF"].apply(
    lambda x: regex.sub(r"\W", "", x.lower()) if not pd.isna(x) else None
)
# Validity and type are only computed for rows that actually have a NIF
emp_zgz["valid_nif"] = emp_zgz["nif"].apply(lambda x: is_valid_nif(x) if x else None)
emp_zgz["nif_type"] = emp_zgz["nif"].apply(lambda x: get_nif_type(x) if x else None)

# Stats: number of missing values per column (vectorised equivalent of applying
# pd.isna row by row)
print("Elementos inválidos en los datos:")
emp_zgz.isna().sum()

Elementos inválidos en los datos:


ID_EMPRESA        0
NIF             237
NOMBRE            0
UTE             351
empresa           0
empresa_proc      0
nif             237
valid_nif       237
nif_type        306
dtype: int64

### Empresas repetidas

In [33]:
total_empresas = len(emp_zgz)


def _duplicated_rows(frame, key):
    """Return the rows whose non-null ``key`` value occurs more than once, sorted by ``key``."""
    is_repeated = frame[key].duplicated(keep=False)
    has_value = frame[key].notna()
    selected = frame.loc[is_repeated & has_value, ["ID_EMPRESA", "nif", "empresa_proc"]]
    return selected.sort_values(by=key)


# Duplicated NIFs
emp_zgz_dup_nif = _duplicated_rows(emp_zgz, "nif")
print(f"Hay {len(emp_zgz_dup_nif)} NIFs duplicados:")
display(emp_zgz_dup_nif)
print(
    f"Los NIFs duplicados representan el {(len(emp_zgz_dup_nif) / total_empresas) * 100:.2f}% del total."
)

print()

# Duplicated (processed) names
emp_zgz_dup_name = _duplicated_rows(emp_zgz, "empresa_proc")
print(f"Hay {len(emp_zgz_dup_name)} nombres duplicados:")
display(emp_zgz_dup_name)
print(
    f"Los nombres duplicados representan el {(len(emp_zgz_dup_name) / total_empresas) * 100:.2f}% del total."
)

Hay 6 NIFs duplicados:


Unnamed: 0,ID_EMPRESA,nif,empresa_proc
1065,6597,b50640937,dc - veinticinco
1066,324,b50640937,pavimentos dc 25 impermeabilizaciones
1860,1298,b99061558,jose - manuel - del - pozo - casi
1861,5058,b99061558,audiovisuales el pozo
2292,2856,g50130590,unión de consumidores de aragón
2293,6529,g50130590,union - consumidores - de - aragon


Los NIFs duplicados representan el 0.19% del total.

Hay 10 nombres duplicados:


Unnamed: 0,ID_EMPRESA,nif,empresa_proc
2232,548,f22302400,atelier de ideas
2262,6469,f99444325,atelier de ideas
587,2858,b07673460,gesto
1930,6382,b99189722,gesto
998,1432,b50493097,ht publicidad grupo tafalla
2315,601,g50493097,ht publicidad grupo tafalla
1931,2624,b99193245,magen arquitectos
2968,8430,,magen arquitectos
2560,235,x5005241g,wilmer ossa buitrago
2883,710,73286854e,wilmer ossa buitrago


Los nombres duplicados representan el 0.32% del total.


### Incorrect data

In [34]:
# Companies with an incorrect NIF: the NIF is present (truthy), it was not
# flagged as valid, and no NIF type was assigned to it.
emp_zgz_wrong_nif = emp_zgz.loc[
    (emp_zgz["nif"].apply(bool))
    & (~emp_zgz["valid_nif"].apply(bool))
    & (emp_zgz["nif_type"].isna())
].copy()
print(f"Hay {len(emp_zgz_wrong_nif)} con un NIF no válido.")
# Correction attempt: re-run the validation with correct=True and keep the rows
# for which it produced a non-null proposal.
emp_zgz_wrong_nif["nif_propuesto"] = emp_zgz_wrong_nif["nif"].apply(
    validate_nif,
    correct=True,
)
emp_zgz_corrected_nif = emp_zgz_wrong_nif.loc[
    ~emp_zgz_wrong_nif["nif_propuesto"].isna(),
    ["ID_EMPRESA", "empresa_proc", "nif", "nif_propuesto"],
]
print(f"Se han podido corregir los siguientes {len(emp_zgz_corrected_nif)}:")
display(emp_zgz_corrected_nif)

Hay 69 con un NIF no válido.
Se han podido corregir los siguientes 2:


Unnamed: 0,ID_EMPRESA,empresa_proc,nif,nif_propuesto
2841,238,esther dominguez alejandre,5412400v,05412400v
2901,406,juan gabriel rodriguez holgado,7992498k,07992498k


### Search companies in own data

In [189]:
# Subset of our own company table, with the NIF column renamed to match emp_zgz
own_info = provisional_company_info[
    ["NIF", "UsedNames", "NameProposed", "Province", "City", "isPYME"]
].rename(columns={"NIF": "nif"})
# Make sure every UsedNames cell is a plain Python list so cells can be
# concatenated with ``+`` / ``sum`` in the cells below.
own_info["UsedNames"] = own_info["UsedNames"].apply(list)
# Names under which each Zaragoza company may appear: cleaned name and cleaned
# name without company type. ``.iloc`` is used because integer keys on a
# string-labelled row are deprecated as positional access.
emp_zgz["UsedNames"] = emp_zgz[["empresa", "empresa_proc"]].apply(
    lambda row: [row.iloc[0], row.iloc[1]], axis=1
)

In [194]:
# Search by NIF: inner-join the Zaragoza companies with our own table on "nif".
# reset_index() exposes own_info's row label as an "index" column so the
# matched rows can be traced back afterwards.
nifs_usados = own_info.reset_index()
nif_zgz = emp_zgz[["ID_EMPRESA", "nif", "UTE", "empresa", "UsedNames"]]
found_by_nif = pd.merge(
    nif_zgz, nifs_usados, left_on="nif", right_on="nif", how="inner"
)
# Both UsedNames columns hold lists, so the row-wise sum concatenates them
found_by_nif["UsedNames"] = found_by_nif[["UsedNames_x", "UsedNames_y"]].sum(axis=1)
# One row per Zaragoza company; every cell ends up as a list of distinct values
found_by_nif = (
    found_by_nif.groupby("ID_EMPRESA")
    .agg(
        {
            "nif": set,
            "index": set,
            "UsedNames": sum,
            "NameProposed": set,
            "Province": set,
            "City": set,
            "isPYME": set,
            "UTE": set,
        }
    )
    .applymap(lambda x: list(set(x)))
    .reset_index()
)
found_by_nif

Unnamed: 0,ID_EMPRESA,nif,index,UsedNames,NameProposed,Province,City,isPYME,UTE
0,2,[b82387770],[202035],"[everis spain montero aramburu, ntt data spain...",[everis spain],[Madrid],[None],[None],[N]
1,3,[b50835669],[171771],"[everis aragón, everis aragon, everis solucion...",[everis aragón],[Zaragoza],[zaragoza],[None],[N]
2,10,[u99468712],[264738],"[zitycard hiberus u.t.e., zeumat zitycard hibe...",[zitycard hiberus],[Zaragoza],[None],[None],[S]
3,17,[u99484065],[264742],"[pavijus beton catalan, beton catalan pavijus,...",[beton catalan pavijus],[Zaragoza],[None],[None],[S]
4,20,[a50316595],[104462],"[construcciones mariano lopez navarro s.a.u., ...",[construcciones mariano lópez navarro],[Zaragoza],[zaragoza],[None],[N]
...,...,...,...,...,...,...,...,...,...
2104,12798,[b25838566],[142269],"[events91 josep pera, events91 josep pera s.l....",[events91 josep pera],[Lérida],[None],[None],[nan]
2105,12858,[a44004059],[103020],"[autocares teruel zaragoza, autobuses teruel z...",[autobuses teruel zaragoza],[Teruel],[None],[None],[nan]
2106,12878,[a50066190],[104341],"[durban maquinaria para la construcción s.a., ...",[durban maquinaria para la construccion],[Zaragoza],[None],[None],[N]
2107,12898,[b99442485],[233109],[colaboración y asesoramiento en hostelería sa...,[colaboración y asesoramiento en hostelería sa...,[Zaragoza],[None],[True],[N]


In [206]:
# Search by name: explode UsedNames on both sides so there is one row per
# (company, name) pair, then inner-join on the name itself.
nombres_usados = own_info.explode(column="UsedNames").reset_index()
nombres_zgz = emp_zgz[["ID_EMPRESA", "nif", "UTE", "empresa", "UsedNames"]].explode(
    column="UsedNames"
)
found_by_name = pd.merge(
    nombres_zgz, nombres_usados, left_on="UsedNames", right_on="UsedNames", how="inner"
)
# Collect the distinct truthy NIFs from both sides of the join (None is dropped)
found_by_name["nif"] = found_by_name[["nif_x", "nif_y"]].apply(
    lambda x: list(set([el for el in x if el])), axis=1
)
# One row per Zaragoza company; every cell ends up as a list of distinct values
found_by_name = (
    found_by_name.groupby("ID_EMPRESA")
    .agg(
        {
            "nif": sum,
            "index": set,
            "UsedNames": set,
            "NameProposed": set,
            "Province": set,
            "City": set,
            "isPYME": set,
            "UTE": set,
        }
    )
    .applymap(lambda x: list(set(x)))
    .reset_index()
)
found_by_name

Unnamed: 0,ID_EMPRESA,nif,index,UsedNames,NameProposed,Province,City,isPYME,UTE
0,3,[b50835669],[171771],"[everis aragón, everis aragón s.l.u.]",[everis aragón],[Zaragoza],[zaragoza],[None],[N]
1,24,[a28171288],[99865],"[tecnica y proyectos typsa, tecnica y proyecto...",[técnica y proyectos],[Madrid],[None],[None],[N]
2,27,[b22330039],[138632],"[zeumat zitycard s.l., zeumat zitycard]",[zeumat zitycard],[Huesca],[None],[None],[N]
3,28,"[b99045379, b99344319]","[232100, 232791]","[hiberus tecnologías de la información s.l., h...","[hiberus sistemas informaticos, hiberus tecnol...",[Zaragoza],"[None, zaragoza]",[None],[N]
4,29,[a78137957],[105976],[consultrans],[consultrans],[Madrid],[madrid],[None],[N]
...,...,...,...,...,...,...,...,...,...
1565,12719,[b64020431],[182359],"[teatrerya textiles escenograficos s.l., teatr...",[teatrerya textiles escenográficos],[Barcelona],[None],[None],[nan]
1566,12778,[a28406775],[100131],"[unipublic, unipublic s.a.u.]",[unipublic],[Madrid],[None],[None],[nan]
1567,12798,[b25838566],[142269],"[events91 josep pera, events91 josep pera s.l.u.]",[events91 josep pera],[Lérida],[None],[None],[nan]
1568,12858,[a44004059],[103020],"[autobuses teruel zaragoza s.a., autobuses ter...",[autobuses teruel zaragoza],[Teruel],[None],[None],[nan]


In [207]:
# Join the two result frames: stack them, then merge the rows that share an
# ID_EMPRESA. Every cell is a list, so ``sum`` concatenates the lists of each
# group; duplicates are removed afterwards and empty results become None.
found = (
    pd.concat([found_by_nif, found_by_name])
    .groupby("ID_EMPRESA")
    .agg(sum)
    .applymap(lambda x: list(set(x)) if x else None)
    .reset_index()
)

In [208]:
found

Unnamed: 0,ID_EMPRESA,nif,index,UsedNames,NameProposed,Province,City,isPYME,UTE
0,2,[b82387770],[202035],"[everis spain montero aramburu, ntt data spain...",[everis spain],[Madrid],[None],[None],[N]
1,3,[b50835669],[171771],"[everis aragón, everis aragon, everis solucion...",[everis aragón],[Zaragoza],[zaragoza],[None],[N]
2,10,[u99468712],[264738],"[zitycard hiberus u.t.e., zeumat zitycard hibe...",[zitycard hiberus],[Zaragoza],[None],[None],[S]
3,17,[u99484065],[264742],"[pavijus beton catalan, beton catalan pavijus,...",[beton catalan pavijus],[Zaragoza],[None],[None],[S]
4,20,[a50316595],[104462],"[construcciones mariano lopez navarro s.a.u., ...",[construcciones mariano lópez navarro],[Zaragoza],[zaragoza],[None],[N]
...,...,...,...,...,...,...,...,...,...
2196,12798,[b25838566],[142269],"[events91 josep pera, events91 josep pera s.l....",[events91 josep pera],[Lérida],[None],[None],[nan]
2197,12858,[a44004059],[103020],"[autocares teruel zaragoza, autobuses teruel z...",[autobuses teruel zaragoza],[Teruel],[None],[None],[nan]
2198,12878,[a50066190],[104341],"[durban maquinaria para la construcción s.a., ...",[durban maquinaria para la construccion],[Zaragoza],[None],[None],[N]
2199,12898,[b99442485],[233109],[colaboración y asesoramiento en hostelería sa...,[colaboración y asesoramiento en hostelería sa...,[Zaragoza],[None],[True],[N]


In [209]:
# # Rellenar NIF:
# empty_nif = emp_zgz[
#     (emp_zgz["ID_EMPRESA"].isin(found["ID_EMPRESA"])) & (emp_zgz["nif"].isna())
# ]
# pd.merge(
#     empty_nif[["ID_EMPRESA", "empresa", "empresa_proc"]],
#     found,
#     left_on="ID_EMPRESA",
#     right_on="ID_EMPRESA",
#     how="inner",
# )

In [None]:
# df = pd.read_csv(r"C:\Users\josea\Downloads\empresas.csv", sep=";", header=0, nrows=64, index_col=False)
# df = pd.read_excel(r"C:\Users\josea\Downloads\empresas.xlsx")

# Parse the semicolon-separated file by hand: lower-case every line, split it
# into at most five fields and strip quotes and surrounding blanks from each.
with open(r"C:\Users\josea\Downloads\empresas_zgz.csv", "r", encoding="utf-8") as f:
    emp = []
    for raw_line in f:
        # Skip (nearly) empty lines
        if len(raw_line) <= 2:
            continue
        fields = raw_line.lower().strip().split(";", 4)
        emp.append([field.replace('"', "").strip() for field in fields])

# First parsed line is the header, the rest are the records
cols, data = emp[0], emp[1:]
emp_zgz = (
    pd.DataFrame(data=data, columns=cols)
    .applymap(lambda cell: cell if cell else None)  # empty strings -> None
    .dropna(how="all")
    .drop_duplicates()
    .reset_index(drop=True)
)

# Derived columns: cleaned name, name without company type, normalised NIF and its type
emp_zgz["empresa"] = emp_zgz["empresa"].apply(clean_company_type)
emp_zgz["empresa_proc"] = emp_zgz["empresa"].apply(clean_company_type, remove_type=True)
emp_zgz["nif"] = emp_zgz["nif"].apply(
    lambda value: regex.sub(r"\W", "", value) if value else None
)
emp_zgz["nif_type"] = emp_zgz["nif"].apply(
    lambda value: get_nif_type(value) if value else None
)

In [None]:
print(len(emp_zgz))
display(emp_zgz.head())

In [None]:
# Companies whose name is repeated and whose NIF is missing in at least one of
# the repeated rows. reset_index() keeps the original row label in an "index"
# column.
valid_emp = emp_zgz[["nif", "empresa", "nombre"]].reset_index()
display(
    valid_emp[
        valid_emp["empresa"].duplicated(keep=False)
        & valid_emp["empresa"].isin(
            valid_emp.loc[valid_emp["nif"].isna(), "empresa"].values
        )
    ].sort_values(by="empresa")
)
# (There is no NIF that can be filled in)

In [None]:
# valid_emp = emp_zgz.groupby("nif").agg(set).reset_index()
# Working copy used by the matching cells below; reset_index() stores the
# emp_zgz row label in an "index" column that is later used to track matches.
valid_emp = emp_zgz[["nif", "empresa"]].reset_index()
print(len(valid_emp))
display(valid_emp.head())

## Compare companies from Tenders and Zaragoza

In [None]:
# final_merged[["ID", "Name", "count"]]

In [None]:
# Build one single-entry Counter per row, mapping the ID (resp. the name) to
# the row's "count", so that appearances can be merged with other Counters.
# NOTE(review): `final_merged` is not assigned in any cell visible in this
# notebook — this cell fails on a fresh kernel unless it is defined elsewhere.
ids = [Counter({k: v}) for k, v in final_merged[["ID", "count"]].values]
names = [Counter({k: v}) for k, v in final_merged[["Name", "count"]].values]

### Same nif

In [None]:
# NIFs that appear both in the tender data and in the Zaragoza registry
common_nif = set(final_merged["ID"]).intersection(set(valid_emp["nif"]))

# Tender side: ID plus its name counter, restricted to the shared NIFs
common_comp_nif = pd.DataFrame(
    data={"ID": final_merged["ID"].values, "Name": names}
).loc[lambda frame: frame["ID"].isin(common_nif)]

# Zaragoza side: one row per NIF with the matched row labels and a name counter
common_emp_nif = (
    valid_emp.loc[valid_emp["nif"].isin(common_nif)]
    .groupby("nif")
    .agg({"index": list, "empresa": Counter})
    .reset_index()
)

# Join both sides on the NIF
c_nif = common_comp_nif.merge(
    common_emp_nif,
    left_on="ID",
    right_on="nif",
).reset_index(drop=True)

# Final values: add up the name counters of both sides and let suggest_value
# choose the name to keep
final_name = []
for tender_names, zgz_names in zip(c_nif["Name"].values, c_nif["empresa"].values):
    merged_counts = Counter()
    merged_counts.update(tender_names)
    merged_counts.update(zgz_names)
    final_name.append(merged_counts)
c_nif["final_name"] = [suggest_value(counts)[0] for counts in final_name]
c_nif["final_id"] = c_nif["ID"]

print(len(c_nif))
display(c_nif.head())

# Track the Zaragoza rows already matched and the ones still pending
idx = {row_label for row_labels in c_nif["index"] for row_label in row_labels}
rest_valid_emp = valid_emp.loc[~valid_emp["index"].isin(idx)]

In [None]:
valid_emp[valid_emp["nif"].isin(common_nif)]

### Same name

In [None]:
# Names that appear both in the tender data and in the still-unmatched
# Zaragoza rows
common_name = set(final_merged["Name"]).intersection(set(rest_valid_emp["empresa"]))

# Tender side: ID counter plus name, restricted to the shared names
common_comp_name = pd.DataFrame(
    data={"ID": ids, "Name": final_merged["Name"].values}
).loc[lambda frame: frame["Name"].isin(common_name)]

# Zaragoza side: one row per name with the matched row labels and a NIF counter
common_emp_name = (
    rest_valid_emp.loc[rest_valid_emp["empresa"].isin(common_name)]
    .groupby("empresa")
    .agg({"index": list, "nif": Counter})
    .reset_index()
)

# Join both sides on the name
c_name = common_comp_name.merge(
    common_emp_name,
    left_on="Name",
    right_on="empresa",
).reset_index(drop=True)

# Final values: add up the ID counters of both sides and let suggest_value
# choose the ID to keep
final_id = []
for tender_ids, zgz_nifs in zip(c_name["ID"].values, c_name["nif"].values):
    merged_counts = Counter()
    merged_counts.update(tender_ids)
    merged_counts.update(zgz_nifs)
    final_id.append(merged_counts)
c_name["final_id"] = [suggest_value(counts)[0] for counts in final_id]
c_name["final_name"] = c_name["empresa"]

print(len(c_name))
display(c_name.head())


# Track the Zaragoza rows already matched and the ones still pending
idx.update(chain.from_iterable(c_name["index"]))
rest_valid_emp = valid_emp.loc[~valid_emp["index"].isin(idx)]

In [None]:
# All matches found so far: by NIF first, then by name. The row index is not
# reset, so labels from both frames may repeat.
matches = pd.concat([c_nif, c_name])
matches

In [None]:
final_merged[final_merged["ID"] == "b50931302"]

In [None]:
validate_nif("05412400v")

In [None]:
# (
#     valid_emp.loc[
#         valid_emp["empresa"].duplicated(keep=False)
#         & valid_emp["empresa"].isin(
#             valid_emp.loc[valid_emp["nif"].isna(), "empresa"].values
#         ),
#         ["nif", "empresa", "nombre"],
#     ].sort_values(by="empresa")
# ).to_csv("empresas_zgz_no_nif.csv")

In [None]:
# Zaragoza rows that were matched neither by NIF nor by name, keeping only
# those that have both a NIF and a company name.
emp_zgz_wrong = valid_emp.loc[
    ~valid_emp["index"].isin(idx), ["nif", "empresa"]
].dropna()

In [None]:
validate_nif("17410114f", correct=True, verbose=True)

In [None]:
# Export the unmatched rows for which validate_nif(..., correct=True) returns a
# falsy result, i.e. no valid or corrected NIF could be produced.
emp_zgz_wrong[
    ~emp_zgz_wrong["nif"].apply(validate_nif, correct=True).apply(bool)
].to_csv("empresas_zgz_mal_nif.csv", index=False)

In [None]:
len(idx), len(valid_emp), len(idx) / len(valid_emp)

In [None]:
# Common name, different nif: row labels present in the name-based selection
# but not in the NIF-based one.
c_name_diff_nif_comp_ids = list(
    set(common_comp_name.index) - set(common_comp_nif.index)
)
# NOTE(review): common_emp_name and common_emp_nif were both built with
# groupby(...).reset_index(), so their indexes are positional (0..n-1) and do
# not identify the same rows across frames — confirm this difference is meaningful.
c_name_diff_nif_emp_ids = list(set(common_emp_name.index) - set(common_emp_nif.index))

In [None]:
# emp_zgz.loc[c_name_diff_nif_emp_ids]

In [None]:
final_merged.loc[c_name_diff_nif_comp_ids].head()

In [None]:
# c_concat = pd.concat([c_nif, c_name])
# c_concat[~c_concat.duplicated(keep=False)]

In [None]:
set(emp_zgz["nif"]) - set(final_merged["ID"])

In [None]:
# common_nif = set(df["nif"]) - set(final_merged["ID"])
# common_comp = final_merged[final_merged["ID"].isin(common_nif)][
#     ["ID", "Name"]
# ].drop_duplicates()
# common_emp = df[df["nif"].isin(common_nif)][["nif", "empresa"]].drop_duplicates()

In [None]:
# dup_emp = df["empresa"].duplicated(keep=False)
# dup_nif = df["nif"].duplicated(keep=False)