# Imports

In [None]:
import json
from collections import Counter
from itertools import chain
from pathlib import Path

import numpy as np
import pandas as pd
import regex
from tqdm import tqdm

In [None]:
import time

In [None]:
from src.companies.processor import clean_company_type, normalize_company_name
from src.companies.utils import replace_company_types
from src.nif_validation.validation import (
    get_nif_type,
    validate_nif,
    is_valid_nif,
    # is_valid_cif,
    # is_valid_dni,
    # is_valid_nie,
    get_info_from_cif,
)
from src.utils.utils import fill_to_length, merge_orig_dataframes
from src.utils.utils_parallelization import (
    parallelize_function,
    parallelize_function_with_progress_bar,
)

# Load info

In [None]:
# Load the GenCat JSON export and flatten its nested records into columns.
# The encoding is given explicitly: JSON is UTF-8 by specification, while the
# default used by open() depends on the platform locale.
with open(
    r"C:\Users\josea\Downloads\genCat_Junio_2023.json", "r", encoding="utf-8"
) as f:
    gencat = pd.json_normalize(json.load(f))

In [None]:
# df = pd.read_csv(r"C:\Users\josea\Downloads\Contractaci__p_blica_a_Catalunya__publicacions_a_la_Plataforma_de_serveis_de_contractaci__p_blica.csv")
# Load the companies CSV from the local Downloads folder.
# NOTE(review): df_emp is not referenced again in the visible part of the
# notebook — confirm it is still needed.
df_emp = pd.read_csv(r"C:\Users\josea\Downloads\empresas.csv")

In [None]:
# # Load data
# dir_df = Path("C:/Users/josea/Documents/Trabajo/data/metadata/insiders.parquet")
# df_in = pd.read_parquet(dir_df)
# dir_df = Path("C:/Users/josea/Documents/Trabajo/data/metadata/outsiders.parquet")
# df_ou = pd.read_parquet(dir_df)
# dir_df = Path("C:/Users/josea/Documents/Trabajo/data/metadata/minors.parquet")
# df_mi = pd.read_parquet(dir_df)

In [None]:
# df_in.columns = [".".join([el for el in c if el]) for c in df_in.columns]
# df_ou.columns = [".".join([el for el in c if el]) for c in df_ou.columns]
# df_mi.columns = [".".join([el for el in c if el]) for c in df_mi.columns]

In [None]:
# # ERROR A REVISAR
# df_mi.loc[
#     df_mi["id"]
#     .str.lower()
#     .isin(
#         [
#             "https://contrataciondelestado.es/sindicacion/datosabiertosmenores/8622601",
#             "https://contrataciondelestado.es/sindicacion/datosabiertosmenores/8410165",
#         ]
#     ),
#     [
#         "ContractFolderStatus.TenderResult.WinningParty.PartyIdentification.ID",
#         "ContractFolderStatus.TenderResult.WinningParty.PartyName.Name",
#     ],
# ]

# Aux functions
Functions necessary for processing the cells

In [None]:
def nif_from_name(name):
    """Split a free-text name into its non-NIF words and an embedded NIF.

    Every whitespace-separated token of ``name`` is passed to ``validate_nif``;
    tokens for which it returns a truthy value are treated as NIFs and removed
    from the name.

    Parameters
    ----------
    name: str
        Raw name, possibly containing one or more NIF tokens.

    Returns
    -------
    tuple
        ``(new_name, new_nif)``: the remaining tokens joined by single spaces,
        and the most frequent NIF token as written in ``name`` (``np.nan``
        when no token validates, including for an empty or blank name).
    """
    tokens = name.split()
    # Plain lists instead of NumPy masks: with no tokens the original built an
    # empty float array and `~` on it raised TypeError.
    is_nif = [bool(validate_nif(token)) for token in tokens]
    new_name = " ".join(token for token, hit in zip(tokens, is_nif) if not hit)
    nif_tokens = [token for token, hit in zip(tokens, is_nif) if hit]
    new_nif = Counter(nif_tokens).most_common()[0][0] if nif_tokens else np.nan
    return new_name, new_nif

In [None]:
import contextlib


@contextlib.contextmanager
def log_time(task_name: str):
    """Context manager that prints how long the wrapped block took.

    On exit it prints ``"<task_name> - <seconds>"``. The line is printed even
    when the block raises; the exception then propagates unchanged.
    """
    # perf_counter is monotonic, so the difference is not affected by
    # system clock adjustments made while the block runs.
    t0 = time.perf_counter()
    try:
        yield
    finally:
        t1 = time.perf_counter()
        print(f"{task_name} - {t1-t0}")


def execute_function(func, data, prefer=None, workers=-1, *args, **kwargs):
    """Apply ``func`` to every element of ``data``, serially or in parallel.

    Parameters
    ----------
    func: callable
        Function applied to each element.
    data: pd.Series or iterable
        Elements to process. In serial mode an object without an ``apply``
        method (e.g. a plain list) is wrapped in a ``pd.Series`` first.
    prefer: str, optional
        When falsy, pandas ``.apply`` runs in the current process. Otherwise
        the value is forwarded to ``parallelize_function``.
    workers: int
        Forwarded to ``parallelize_function``; unused in serial mode.
    *args, **kwargs
        Forwarded to ``.apply`` or ``parallelize_function``.

    Returns
    -------
    Whatever ``.apply`` or ``parallelize_function`` returns.
    """
    if prefer:
        return parallelize_function(
            func, data, prefer=prefer, workers=workers, *args, **kwargs
        )
    # Lists and other plain iterables have no .apply; wrap them so the serial
    # path accepts the same inputs as the parallel one.
    if not hasattr(data, "apply"):
        data = pd.Series(data)
    return data.apply(func, *args, **kwargs)


def clean_df(df: pd.DataFrame, prefer=None, workers=-1):
    """Clean the ``ID`` and ``Name`` columns of a companies table.

    Steps, each timed with ``log_time``:

    1. In every cell, drop whitespace that directly follows
       ``word + non-word char`` or directly precedes ``non-word char + word``;
       null cells become ``None``.
    2. Replace ``ID`` with the result of ``validate_nif``.
    3. Remove the validated ID from ``Name`` and run ``clean_company_type``
       with ``remove_type=False``.
    4. ``Name_proc``: ``clean_company_type`` with ``remove_type=True``.
    5. ``Name_norm``: ``normalize_company_name`` applied to ``Name_proc``.

    Parameters
    ----------
    df: pd.DataFrame
        Must contain ``ID`` and ``Name`` columns. Step 1 is applied to every
        cell, so non-null cells are expected to be strings.
    prefer: str, optional
        Forwarded to ``execute_function``; falsy means serial execution.
    workers: int
        Forwarded to ``execute_function``.

    Returns
    -------
    pd.DataFrame
        A new frame (the input is not modified) with ``ID`` and ``Name``
        cleaned and ``Name_proc`` / ``Name_norm`` added.
    """
    # Remove unwanted whitespace
    with log_time("Removing unwanted whitespace"):
        df = df.applymap(
            lambda x: regex.sub(r"((?<=\w+\W)\s+)|(\s+(?=\W\w+))", "", x)
            if not pd.isna(x)
            else None
        )

    # Validate NIF
    with log_time("Validating NIF"):
        df["ID"] = execute_function(validate_nif, df["ID"], prefer, workers)

    # Clean company type
    with log_time("Cleaning company type"):
        # Strip the ID out of the name as a literal substring; the original
        # passed it to regex.sub as an unescaped pattern.
        # A Series aligned with df (not a bare list) is required because the
        # serial path of execute_function calls `.apply` on its input.
        name = pd.Series(
            [
                n.replace(i, "") if not (pd.isna(n) or pd.isna(i)) else n
                for i, n in df[["ID", "Name"]].values
            ],
            index=df.index,
            dtype=object,
        )
        df["Name"] = execute_function(
            clean_company_type, name, prefer, workers, remove_type=False
        )

    # Remove company type
    with log_time("Removing company type"):
        df["Name_proc"] = execute_function(
            clean_company_type, df["Name"], prefer, workers, remove_type=True
        )

    # Normalize company name
    with log_time("Normalizing company name"):
        df["Name_norm"] = execute_function(
            normalize_company_name, df["Name_proc"], prefer, workers
        )

    return df

# Merge data from different sources

In [None]:
# df_companies = merge_orig_dataframes(
#     dir_metadata=Path("C:/Users/josea/Documents/Trabajo/data/metadata/")
# )
# df_companies.to_parquet("data/companies.parquet")

# Load the cached companies table. The commented-out lines above show how the
# cache was built with merge_orig_dataframes and written to this same path.
df_companies = pd.read_parquet("data/companies.parquet")

# Obtain individual companies

In [None]:
# # Use only those where all dimensions match
# # (e.g. same number of companies and companies ids)
# # and drop NAs
# df_companies = df_companies[
#     df_companies[["ID", "Name"]]
#     .applymap(lambda x: not pd.isna(x[0]))
#     .apply(all, axis=1)
# ]
# df_companies = df_companies[
#     df_companies.applymap(lambda x: len(x) if x[0] else None).apply(
#         lambda x: len(set([el for el in x if not pd.isnull(el)])) == 1,
#         axis=1,
#     )
# ]
# companies_columns = list(df_companies.columns)
# # Get number of companies by tender
# df_companies["_len"] = df_companies["ID"].apply(len)

# # Fill lists of None to have the same number of elements and explode later
# companies = pd.DataFrame(
#     df_companies.apply(
#         lambda x: [fill_to_length(list(el), x[-1]) for el in x[:-1]], axis=1
#     ).tolist(),
#     columns=companies_columns,
# )

# # Split companies in rows
# companies = companies.explode(companies_columns)
# companies = companies.reset_index(drop=True)
# display(companies.head())

### Foreign/European IDs

In [None]:
# companies_foreign = companies[
#     companies["ID"].apply(lambda x: x[0].isalpha() and not is_valid_nif(x))
# ]
# companies_foreign["ID"].apply(lambda x: x[:2]).value_counts()

In [None]:
# companies_foreign_valid = companies_foreign.loc[
#     companies_foreign["ID"].apply(lambda x: validate_nif(x[2:])).dropna().index
# ]
# companies_foreign_valid

### Clean companies info

In [None]:
# with log_time("Clean df"):
#     companies_clean = clean_df(companies, prefer="processes", workers=-1)

In [None]:
# # Aggregate company info in lists
# companies_clean["SMEAwardedIndicator"] = companies_clean["SMEAwardedIndicator"].apply(
#     lambda x: None if not x else True if x == "true" else False
# )
# companies_clean = (
#     companies_clean
#     # companies[["ID", "Name", "Name_proc", "Name_norm"]]
#     .groupby(["ID", "Name_norm"])
#     .agg(list)
#     .reset_index()
# )
# companies_clean["count"] = companies_clean["Name_proc"].apply(len)
# companies_clean = companies_clean.reset_index()

#### Unique names and IDs

In [None]:
# # Unique names and IDs
# # These companies have always appeared with the same (id-name) association
# cols_vals = [
#     c for c in companies_clean.columns if c not in ["ID", "Name_norm", "count"]
# ]
# unique_ID = ~companies_clean["ID"].duplicated(keep=False)
# unique_NAME = ~companies_clean["Name_norm"].duplicated(keep=False)

# # Unique by ID and name
# unique = companies_clean[unique_ID & unique_NAME]

# # Non unique IDs
# non_unique_ids = list(set(companies_clean["index"]) - set(unique["index"]))
# non_unique = companies_clean[companies_clean["index"].isin(non_unique_ids)]

# unique["index"] = unique["index"].apply(lambda x: [x])
# non_unique["index"] = non_unique["index"].apply(lambda x: [x])
# print(unique.shape, non_unique.shape)

#### Repeated IDs and Names

In [None]:
# Choose definitive values
def suggest_value(elements):
    """
    Pick the preferred value among ``elements``.

    ``None`` entries are ignored. The most frequent value wins; ties are
    broken by taking the longest value and then the alphabetically first one.
    The result is always a one-element list: ``[value]``, or ``[None]`` when
    there is nothing to choose from.
    """
    # NOTE: an earlier variant also returned the tied runners-up whose words
    # were not contained in the winner; that behaviour is disabled.
    counts = Counter(el for el in elements if el is not None)
    if not counts:
        return [None]

    top_count = max(counts.values())
    candidates = [value for value, seen in counts.items() if seen == top_count]
    return [min(candidates, key=lambda value: (-len(value), value))]


# Repeated IDs
def unify_repeated_col(df: pd.DataFrame, rep_col: str, un_col: str) -> pd.DataFrame:
    """
    Takes a dataframe with duplicated values in one column that should be unique (e.g. repeated IDs)
    and another column that should also be unique given the previous one (e.g. title)
    and unifies it so that it chooses the best option.

    Only rows whose ``rep_col`` value appears more than once are processed.
    They are collapsed to one row per ``rep_col`` value, and ``un_col`` is
    replaced by the value picked by ``suggest_value``.

    Parameters
    ----------
    df: pd.DataFrame
        Must contain ``rep_col``, ``un_col``, ``"count"`` and ``"index"``
        columns. Every other column is flattened with ``chain.from_iterable``,
        so its cells must be iterables.
    rep_col: str
        Name of column with repeated values that will be unified
    un_col: str
        Name of column with non unique values

    Returns
    -------
    pd.DataFrame
        One row per repeated ``rep_col`` value for which ``suggest_value``
        returned exactly one candidate, with ``un_col`` unwrapped to that
        candidate.
    """
    # Non-unique columns
    cols_vals = [c for c in df.columns if c not in [rep_col, "count", "index"]]
    # keep=False flags every occurrence of a repeated value, not only the later ones
    repeated_rows = df[rep_col].duplicated(keep=False)
    # NOTE(review): `repeated` is a selection of `df` that is assigned to below;
    # pandas may emit SettingWithCopyWarning here — confirm whether a .copy() is wanted.
    repeated = df[repeated_rows]

    # Count times the values appear
    # Each value is wrapped in a one-element list and multiplied by the row's
    # "count", so it appears once per original occurrence after flattening.
    repeated.loc[repeated.index, [un_col]] = (
        repeated.loc[repeated.index, un_col].apply(lambda x: [x])
        * repeated.loc[repeated.index, "count"]
    )
    # Group by repeated
    repeated = repeated.reset_index()
    repeated = repeated.groupby(rep_col).agg(
        {
            # "index": list,
            # presumably "index" cells are lists, so summing concatenates them — verify against caller
            "index": sum,
            # Flatten the per-row iterables of every other column into one list per group
            **{c: lambda x: list(chain.from_iterable(x)) for c in cols_vals},
            "count": sum,
        }
    )
    # Get the most common values for each column
    # suggest_value returns a list of candidates for each group
    repeated.loc[repeated.index, un_col] = (
        repeated.loc[repeated.index, un_col].apply(suggest_value).values
    )
    repeated = repeated.reset_index()

    # Concatenate unique
    # Keep only the groups with exactly one candidate and unwrap it from its list
    use_index = repeated.loc[repeated[un_col].apply(len) == 1, un_col].index
    repeated.loc[use_index, un_col] = repeated.loc[use_index, un_col].apply(
        lambda x: x[0]
    )
    unified = repeated.loc[use_index]

    return unified

In [None]:
# # Obtain unique ID-name
# unified_ID = unify_repeated_col(non_unique, "ID", "Name_norm")
# # Update non_unique
# non_unique_ids = list(
#     set(chain.from_iterable(non_unique["index"]))
#     - set(chain.from_iterable(unified_ID["index"]))
# )
# # non_unique = companies_clean.loc[non_unique_ids]
# non_unique = companies_clean[companies_clean["index"].isin(non_unique_ids)]
# non_unique["index"] = non_unique["index"].apply(lambda x: [x])

In [None]:
# # Obtain unique name-ID
# unified_NAME = unify_repeated_col(non_unique, "Name_norm", "ID")
# # Update non_unique
# non_unique_ids = list(
#     set(chain.from_iterable(non_unique["index"]))
#     - set(chain.from_iterable(unified_NAME["index"]))
# )
# # non_unique = companies_clean.loc[non_unique_ids]
# non_unique = companies_clean[companies_clean["index"].isin(non_unique_ids)]
# non_unique["index"] = non_unique["index"].apply(lambda x: [x])

#### Merge Companies info

In [None]:
# # Global
# # Merge unique+unifiedID+unifiedName+nonUnique
# merged_global = pd.concat([unique, unified_ID, unified_NAME, non_unique])
# cols_vals = [
#     c
#     for c in merged_global.columns
#     if c not in ["ID", "Name_norm", "count", "index", "id_tender"]
# ]
# merged_global = merged_global.groupby(["ID", "Name_norm"]).agg(
#     {
#         # "index": lambda x: list(chain.from_iterable(x)),
#         "index": sum,
#         "id_tender": sum,
#         **{c: lambda x: list(chain.from_iterable(x)) for c in cols_vals},
#         "count": sum,
#     }
# )
# merged_global = merged_global.reset_index()
# print(len(merged_global))
# display(merged_global.head())

#### Unify found names

In [None]:
# # Get all names found in the tenders
# merged_global["UsedNames"] = (merged_global["Name"] + merged_global["Name_proc"]).apply(
#     lambda x: sorted(list(set(x)))
# )

#### Propose a final name

In [None]:
# # Initial computations
# data = merged_global["Name_proc"]
# local_frequencies = data.apply(Counter)
# proposed_names = local_frequencies.apply(lambda x: max(x, key=x.get))
# # First proposal
# merged_global["Name_proposed"] = proposed_names

# # Total names
# global_freq_dict = data.explode().value_counts().to_dict()
# names_list = list(global_freq_dict.keys())

In [None]:
# def adjust_names_for_batch_v2(names_batch, local_frequencies, proposed_names):
#     """
#     Adjusts and determines the most probable names for a given batch of names.

#     For each name in the batch, the function checks if the name is associated
#     with multiple IDs. If so, it retains the name only for the ID where the name
#     has the highest frequency (local priority). For the other IDs, it assigns
#     the next best name based on local frequency.

#     Parameters:
#     -----------
#     names_batch: List[str]
#         A batch of names to be processed.
#     local_frequencies: pd.Series
#         Series containing the frequency count of each name for each ID.
#     proposed_names: pd.Series
#         Series mapping each ID to its currently assigned name.

#     Returns:
#     --------
#     modified_real_names: Dict[int, str]:
#         A dictionary with IDs as keys and their adjusted real names as values.
#     """
#     modified_real_names = {}
#     for name in names_batch:
#         ids_with_name = proposed_names[proposed_names == name].index.tolist()
#         if len(ids_with_name) > 1:
#             ids_with_name.sort(key=lambda idx: -local_frequencies.loc[idx][name])
#             for idx in ids_with_name[1:]:
#                 del local_frequencies.loc[idx][name]
#                 if local_frequencies.loc[idx]:
#                     modified_real_names[idx] = max(
#                         local_frequencies.loc[idx], key=local_frequencies.loc[idx].get
#                     )

#     return modified_real_names


# new_proposed_name = parallelize_function_with_progress_bar(
#     func=adjust_names_for_batch_v2,
#     data=names_list,
#     batch_size=1000,
#     desc="progress_p2",
#     workers=-1,
#     prefer="processes",
#     output="series",
#     local_frequencies=local_frequencies,
#     proposed_names=proposed_names,
# )


# # Update the proposed names with the new ones
# for item in new_proposed_name:
#     merged_global.loc[item.keys(), "Name_proposed"] = list(item.values())

In [None]:
# # merged_global.to_parquet("data/merged_global.parquet")
# merged_global = pd.read_parquet("data/merged_global.parquet")

#### Check if company is SME

In [None]:
# def isPYME(SMEIndicators):
#     # Evaluate if is SME based on the SMEAwardedIndicator appearances
#     # If True and False are present, return None
#     # TODO: make a better decision
#     sme_counts = Counter(SMEIndicators)
#     if True in sme_counts and False in sme_counts:
#         return None
#     return sme_counts.most_common(1)[0][0]


# merged_global["isPYME"] = merged_global["SMEAwardedIndicator"].apply(isPYME)

#### Check CityName and PostalZone

In [None]:
# def get_city_name(CityName):
#     # Evaluate the city name based on the CityName appearances
#     # Get most common excluding None
#     # TODO: make a better decision
#     city_names = Counter(CityName)
#     if None in city_names.keys():
#         city_names.pop(None)
#     if not len(city_names) == 1:
#         return None
#     return city_names.most_common(1)[0][0]


# def get_postal_zone(PostalZone):
#     # Evaluate the postal zone based on the PostalZone appearances
#     # Get most common excluding None
#     # TODO: make a better decision
#     postal_zones = Counter(PostalZone)
#     if None in postal_zones.keys():
#         postal_zones.pop(None)
#     if not len(postal_zones) == 1:
#         return None
#     return postal_zones.most_common(1)[0][0].split(".")[0]


# merged_global["City"] = merged_global["CityName"].apply(get_city_name)
# merged_global["PostalCode"] = merged_global["PostalZone"].apply(get_postal_zone)

### Add info

In [None]:
# # Add information based on NIF
# merged_global["NIF_type"] = merged_global["ID"].apply(get_nif_type)
# merged_global["prov"], merged_global["comp_type"], merged_global["comp_desc"] = list(
#     zip(*merged_global["ID"].apply(get_info_from_cif))
# )
# merged_global["comp_type"] = merged_global["comp_type"].apply(
#     lambda x: x.split(",")[0] if not pd.isna(x) else None
# )

### Find UTEs

In [None]:
# # Find UTEs based on name
# ute_n = provisional_company_info["UsedNames"].apply(
#     lambda x: bool(regex.search(r"\bu(\.)?t(\.)?e(\.)?\b", " ".join(x)))
# )
# # Find UTEs based on ID
# ute_i = provisional_company_info["NIF"].apply(lambda x: x.startswith("u"))

# # provisional_company_info[ute_i | ute_n][["NIF", "NameProposed", "UsedNames"]]
# # sum(ute_n), sum(ute_i), sum(ute_n & ute_i), sum(ute_n & ute_i)/min(sum(ute_n), sum(ute_i))

# utes = provisional_company_info[ute_i | ute_n]

### Save data

In [None]:
# provisional_company_info = merged_global.rename(
#     columns={
#         "ID": "NIF",
#         "id_tender": "TenderAppearance",
#         "prov": "Province",
#         "NIF_type": "NIFtype",
#         "comp_type": "CompanyType",
#         "comp_desc": "CompanyDescription",
#         "Name_proposed": "NameProposed",
#     }
# )[
#     [
#         "NIF",
#         "NameProposed",
#         "UsedNames",
#         "Province",
#         "City",
#         "NIFtype",
#         "CompanyType",
#         "CompanyDescription",
#         "isPYME",
#         "TenderAppearance",
#     ]
# ]

In [None]:
# provisional_company_info.to_parquet("data/provisional_company_info.parquet")
# utes.to_parquet("data/utes.parquet")

# Load the cached results; the commented-out lines above are the writes that
# produced these two files.
provisional_company_info = pd.read_parquet("data/provisional_company_info.parquet")
utes = pd.read_parquet("data/utes.parquet")

In [None]:
# Notebook display of the loaded company table.
provisional_company_info

## Empresas Zaragoza

In [None]:
# Load the Zaragoza companies and bring them to our own format
emp = pd.read_excel("data/licitador_09_23.xls")
emp_zgz = emp[["ID_EMPRESA", "NIF", "NOMBRE", "UTE"]].copy()
# Clean the names (keeping and removing the company type)
emp_zgz["empresa"] = emp_zgz["NOMBRE"].apply(clean_company_type)
emp_zgz["empresa_proc"] = emp_zgz["empresa"].apply(clean_company_type, remove_type=True)
# Column with the distinct names that have been used to identify the company.
# The two columns are paired directly: the previous row-wise `x[0]` / `x[1]`
# was positional access on a label-indexed Series, deprecated since pandas 2.1.
emp_zgz["UsedNames"] = pd.Series(
    [
        list({name, name_proc})
        for name, name_proc in zip(emp_zgz["empresa"], emp_zgz["empresa_proc"])
    ],
    index=emp_zgz.index,
    dtype=object,
)
# Normalised NIF: lower-cased with every non-word character removed
emp_zgz["nif"] = emp_zgz["NIF"].apply(
    lambda x: regex.sub(r"\W", "", x.lower()) if not pd.isna(x) else None
)
emp_zgz["valid_nif"] = emp_zgz["nif"].apply(lambda x: is_valid_nif(x) if x else None)
emp_zgz["nif_type"] = emp_zgz["nif"].apply(lambda x: get_nif_type(x) if x else None)

# Convert UTE to bool ("S" -> True, "N" -> False, anything else -> None)
emp_zgz["UTE"] = emp_zgz["UTE"].apply(
    lambda x: True if x == "S" else False if x == "N" else None
)

# Stats: number of missing values per column
print("Elementos inválidos en los datos:")
emp_zgz.isna().sum()

### Empresas repetidas

In [None]:
total_empresas = len(emp_zgz)
report_columns = ["ID_EMPRESA", "nif", "empresa_proc"]

# Rows whose non-null NIF appears more than once
nif_is_repeated = emp_zgz["nif"].notna() & emp_zgz["nif"].duplicated(keep=False)
emp_zgz_dup_nif = emp_zgz.loc[nif_is_repeated, report_columns].sort_values(by="nif")
print(f"Hay {len(emp_zgz_dup_nif)} NIFs duplicados:")
display(emp_zgz_dup_nif)
print(
    f"Los NIFs duplicados representan el {(len(emp_zgz_dup_nif) / total_empresas) * 100:.2f}% del total."
)

print()

# Rows whose non-null processed name appears more than once
name_is_repeated = emp_zgz["empresa_proc"].notna() & emp_zgz[
    "empresa_proc"
].duplicated(keep=False)
emp_zgz_dup_name = emp_zgz.loc[name_is_repeated, report_columns].sort_values(
    by="empresa_proc"
)
print(f"Hay {len(emp_zgz_dup_name)} nombres duplicados:")
display(emp_zgz_dup_name)
print(
    f"Los nombres duplicados representan el {(len(emp_zgz_dup_name) / total_empresas) * 100:.2f}% del total."
)

### Search companies in own data

In [None]:
# Subset of our own company table used for matching, with the NIF column
# renamed to "nif" and every UsedNames cell converted to a plain list.
own_info = (
    provisional_company_info[["NIF", "UsedNames", "NameProposed", "isPYME"]]
    .rename(columns={"NIF": "nif"})
    .assign(UsedNames=lambda frame: frame["UsedNames"].apply(list))
)

In [None]:
# Search by NIF
# reset_index() exposes own_info's row label as an "index" column so it can be
# carried through the merge and aggregated below.
nifs_usados = own_info.reset_index()
nif_zgz = emp_zgz[["ID_EMPRESA", "nif", "UTE", "empresa", "UsedNames"]]
# Inner join: keep only the Zaragoza companies whose nif also exists in own_info
found_by_nif = pd.merge(
    nif_zgz, nifs_usados, left_on="nif", right_on="nif", how="inner"
)
# Both sides carry a UsedNames list (suffixed _x / _y by the merge); the
# row-wise sum concatenates the two lists.
found_by_nif["UsedNames"] = found_by_nif[["UsedNames_x", "UsedNames_y"]].sum(axis=1)
# One row per Zaragoza company; every aggregated cell ends up as a list of
# distinct values.
found_by_nif = (
    found_by_nif.groupby("ID_EMPRESA")
    .agg(
        {
            "nif": set,
            "index": set,
            "UsedNames": sum,
            "NameProposed": set,
            "isPYME": set,
            "UTE": set,
        }
    )
    .applymap(lambda x: list(set(x)))
    .reset_index()
)
found_by_nif

In [None]:
# found_by_name.loc[found_by_name["UsedNames"].apply(lambda x: "reby rides" in x)]

In [None]:
# Search by name, excluding the companies already found by NIF
# explode() yields one row per used name on both sides so the names can be
# joined one-to-one.
nombres_usados = own_info.explode(column="UsedNames").reset_index()
nombres_zgz = emp_zgz[~emp_zgz["ID_EMPRESA"].isin(found_by_nif["ID_EMPRESA"])]
nombres_zgz = nombres_zgz[["ID_EMPRESA", "nif", "UTE", "empresa", "UsedNames"]].explode(
    column="UsedNames"
)
found_by_name = pd.merge(
    nombres_zgz, nombres_usados, left_on="UsedNames", right_on="UsedNames", how="inner"
)
# Run both candidate NIFs (Zaragoza side _x, own data _y) through validate_nif
# and keep the distinct truthy results as a list.
found_by_name["nif"] = (
    found_by_name[["nif_x", "nif_y"]]
    .applymap(validate_nif)
    .apply(lambda x: list(set([el for el in x if el])), axis=1)
)
# One row per Zaragoza company; "nif" cells are lists, so sum concatenates
# them. Every aggregated cell ends up as a list of distinct values.
found_by_name = (
    found_by_name.groupby("ID_EMPRESA")
    .agg(
        {
            "nif": sum,
            "index": set,
            "UsedNames": set,
            "NameProposed": set,
            "isPYME": set,
            "UTE": set,
        }
    )
    .applymap(lambda x: list(set(x)))
    .reset_index()
)
found_by_name

In [None]:
# Combine both result sets (found by NIF and found by name)
# Cells are lists, so agg(sum) concatenates them per company; afterwards each
# cell is deduplicated, and an empty cell becomes None.
found = (
    pd.concat([found_by_nif, found_by_name])
    .groupby("ID_EMPRESA")
    .agg(sum)
    .applymap(lambda x: list(set(x)) if x else None)
    .reset_index()
)
# Zaragoza companies that were matched by neither NIF nor name
not_found = emp_zgz[~emp_zgz["ID_EMPRESA"].isin(found["ID_EMPRESA"])]
# Stats
print(f"Se han encontrado {len(found)} empresas de Zaragoza en las empresas previas")
print("Elementos inválidos en los datos:")
print(found.apply(pd.isna, axis=1).sum().to_dict())
print(f"Hay {len(not_found)} empresas que no se han encontrado")
# found[["nif", "index", "UsedNames", "NameProposed", "isPYME", "UTE"]].applymap(
#     lambda x: x[0]
# )

## Análisis empresas encontradas y no encontradas

### Incorrect data

In [None]:
# IDs of the Zaragoza companies that come without a NIF
nif_is_null = emp_zgz["nif"].isna()
null_id_emp_ids = emp_zgz.loc[nif_is_null, "ID_EMPRESA"].values

# How many of them were matched / left unmatched
print(f"Null ids: {len(null_id_emp_ids)}")
print(f"Null ids found: {found['ID_EMPRESA'].isin(null_id_emp_ids).sum()}")
print(f"Null ids not found: {not_found['ID_EMPRESA'].isin(null_id_emp_ids).sum()}")

In [None]:
print(f"De las empresas que no se han encontrado ({len(not_found)})")
# Split the unmatched companies by the truthiness of their NIF validity flag
has_valid_nif = not_found["valid_nif"].apply(bool)
not_found_valid = not_found[has_valid_nif]
not_found_invalid = not_found[~has_valid_nif].copy()
print(
    f"Hay {len(not_found_valid)} que tienen un NIF válido y {len(not_found_invalid)} que no"
)

# Attempt to correct the NIF (rows with a null NIF are left without a proposal)
non_null_nifs = not_found_invalid["nif"].dropna()
not_found_invalid["proposed_nif"] = non_null_nifs.apply(validate_nif, correct=True)
print(f"De los no encontrados, que tienen NIF no nulo, se han podido corregir:")
not_found_invalid.dropna(subset="proposed_nif")

In [None]:
# Columns that should hold a single value per company
unique_columns = [c for c in found.columns if c not in ["ID_EMPRESA", "UsedNames"]]
# Drop null items from every list cell. When `found` is built, empty cells are
# stored as None; iterating those raised TypeError, so they are mapped to an
# empty list instead (the following cells already treat empty lists as "no value").
found[unique_columns] = found[unique_columns].applymap(
    lambda x: [] if x is None else [el for el in x if not pd.isna(el)]
)

In [None]:
# Matched companies: count the candidates in each cell (an empty cell counts
# as one) and flag the rows where every column has exactly one.
candidates_per_cell = found[unique_columns].applymap(lambda x: len(x) if x else 1)
unique = candidates_per_cell.apply(lambda row: (row == 1).all(), axis=1)

# Valid rows (every value is unique): unwrap each one-element list
valid_unique = found[unique].copy()
valid_unique[unique_columns] = valid_unique[unique_columns].applymap(
    lambda x: x[0] if x else None
)

# Invalid rows (some value that should be unique is not); cells stay as lists
invalid_unique = found[~unique].copy()
# invalid_unique[unique_columns] = invalid_unique[unique_columns].applymap(lambda x: x if x else None)

In [None]:
# # TODO: Para Zaragoza, podemos sugerir un NIF en función de si la provincia es Zaragoza:
# invalid_unique["nif"].apply(
#     lambda x: [get_info_from_cif(el)[0] == "Zaragoza" for el in x]
# )

#### UTEs

In [None]:
# Names of the Zaragoza companies flagged as UTE. The explicit `== True` is
# needed because the column also holds None, which cannot be used as a mask.
emp_zgz.loc[emp_zgz["UTE"] == True, "empresa"].values

In [None]:
# UTE flag distribution among the consistent matches (one scalar per row)
print(valid_unique["UTE"].value_counts().to_dict())
# In invalid_unique the cells are still lists, which are unhashable, so
# value_counts() on the raw column raises TypeError. Explode to one flag per
# row before counting.
print(invalid_unique["UTE"].explode().value_counts())

In [None]:
# Notebook display of the UTE column of the matched companies.
found["UTE"]

In [None]:
# Consistent matches that are flagged as UTE.
valid_unique[valid_unique["UTE"] == True]

In [None]:
# Companies not found in our own data that do have a valid NIF
missing_in_own_data = ~emp_zgz["ID_EMPRESA"].isin(found["ID_EMPRESA"])
nif_is_valid = emp_zgz["valid_nif"].apply(bool)
not_found = emp_zgz.loc[missing_in_own_data & nif_is_valid]

In [None]:
# # Rellenar NIF:
# empty_nif = emp_zgz[
#     (emp_zgz["ID_EMPRESA"].isin(found["ID_EMPRESA"])) & (emp_zgz["nif"].isna())
# ]
# pd.merge(
#     empty_nif[["ID_EMPRESA", "empresa", "empresa_proc"]],
#     found,
#     left_on="ID_EMPRESA",
#     right_on="ID_EMPRESA",
#     how="inner",
# )

## OLD

In [None]:
# # df = pd.read_csv(r"C:\Users\josea\Downloads\empresas.csv", sep=";", header=0, nrows=64, index_col=False)
# # df = pd.read_excel(r"C:\Users\josea\Downloads\empresas.xlsx")

# with open(r"C:\Users\josea\Downloads\empresas_zgz.csv", "r", encoding="utf-8") as f:
#     emp = [
#         [el.replace('"', "").strip() for el in l.lower().strip().split(";", 4)]
#         for l in f.readlines()
#         if len(l) > 2
#     ]
# cols = emp[0]
# data = emp[1:]
# emp_zgz = pd.DataFrame(data=data, columns=cols)
# emp_zgz = emp_zgz.applymap(lambda x: x if x else None)
# emp_zgz = emp_zgz.dropna(how="all").drop_duplicates().reset_index(drop=True)
# emp_zgz["empresa"] = emp_zgz["empresa"].apply(clean_company_type)
# emp_zgz["empresa_proc"] = emp_zgz["empresa"].apply(clean_company_type, remove_type=True)
# emp_zgz["nif"] = emp_zgz["nif"].apply(lambda x: regex.sub(r"\W", "", x) if x else None)
# emp_zgz["nif_type"] = emp_zgz["nif"].apply(lambda x: get_nif_type(x) if x else None)