# Imports

In [1]:
import json
from collections import Counter
from itertools import chain
from pathlib import Path

import numpy as np
import pandas as pd
import regex

# from unidecode import unidecode

In [2]:
import time

In [3]:
from src.companies.processor import clean_company_type, normalize_company_name
from src.companies.utils import replace_company_types
from src.nif_validation.validation import (
    get_nif_type,
    validate_nif,
    is_valid_nif,
    is_valid_cif,
    is_valid_dni,
    is_valid_nie,
    get_info_from_cif,
)
from src.utils.utils import fill_na, fill_to_length

# Load info

In [4]:
with open(r"C:\Users\josea\Downloads\genCat_Junio_2023.json", "r") as f:
    gencat = pd.json_normalize(json.load(f))

In [5]:
# df = pd.read_csv(r"C:\Users\josea\Downloads\Contractaci__p_blica_a_Catalunya__publicacions_a_la_Plataforma_de_serveis_de_contractaci__p_blica.csv")
df_emp = pd.read_csv(r"C:\Users\josea\Downloads\empresas.csv")

  df_emp = pd.read_csv(r"C:\Users\josea\Downloads\empresas.csv")


In [6]:
# Load data
dir_df = Path("C:/Users/josea/Documents/Trabajo/data/metadata/insiders.parquet")
df_in = pd.read_parquet(dir_df)
dir_df = Path("C:/Users/josea/Documents/Trabajo/data/metadata/outsiders.parquet")
df_ou = pd.read_parquet(dir_df)
dir_df = Path("C:/Users/josea/Documents/Trabajo/data/metadata/minors.parquet")
df_mi = pd.read_parquet(dir_df)

In [7]:
df_in.columns = [".".join([el for el in c if el]) for c in df_in.columns]
df_ou.columns = [".".join([el for el in c if el]) for c in df_ou.columns]
df_mi.columns = [".".join([el for el in c if el]) for c in df_mi.columns]

In [131]:
# # ERROR A REVISAR
# df_mi.loc[
#     df_mi["id"]
#     .str.lower()
#     .isin(
#         [
#             "https://contrataciondelestado.es/sindicacion/datosabiertosmenores/8622601",
#             "https://contrataciondelestado.es/sindicacion/datosabiertosmenores/8410165",
#         ]
#     ),
#     [
#         "ContractFolderStatus.TenderResult.WinningParty.PartyIdentification.ID",
#         "ContractFolderStatus.TenderResult.WinningParty.PartyName.Name",
#     ],
# ]

# Aux functions
Functions necessary for processing the cells

In [8]:
from joblib import Parallel, delayed
from typing import Dict, List, Union


def parallelize_function(
    func,
    data: Union[pd.Series, List],
    workers=-1,
    prefer="processes",
    output: str = "series",
    *args,
    **kwargs
):
    results = Parallel(
        n_jobs=workers,
        prefer=prefer,
        verbose=0,
        # backend="threading", # loky, multiprocessing, threading
        # return_as="generator", # list
        # timeout=None,
        # pre_dispatch="2 * n_jobs",
        batch_size="auto",
        # max_nbytes="1M",
        # mmap_mode="r",
        # require=None,
    )(delayed(func)(x, *args, **kwargs) for x in data)
    if output == "series" and isinstance(data, pd.Series):
        return pd.Series(results, index=data.index)
    return list(results)

In [9]:
def evaluate_cell(cell):
    if not isinstance(cell, (list, np.ndarray)):
        cell = [cell]
        # print("1", cell)
    if pd.isnull(cell[0]):
        return [None]
    elif isinstance(cell[0], str) and cell[0].startswith("[") and cell[0].endswith("]"):
        # print("2")
        return eval(cell[0])
    else:
        # print("3")
        return cell


def clean_df(df: pd.DataFrame, prefer=None, workers=-1):
    t0 = time.time()
    df = df.applymap(
        lambda x: regex.sub(r"((?<=\w+\W)\s+)|(\s+(?=\W\w+))", "", x)
        if not pd.isna(x)
        else None
    )
    t1 = time.time()
    print(f" - {t1-t0}")
    if not prefer:
        # Validate NIF
        t0 = time.time()
        df["ID"] = df["ID"].apply(validate_nif)
        t1 = time.time()
        print(f" - {t1-t0}")

        # Clean company type
        t0 = time.time()
        name = []
        for i, n in df[["ID", "Name"]].values:
            if pd.isna(n) or pd.isna(i):
                name.append(n)
            else:
                name.append(regex.sub(i, "", n) if not pd.isna(n) else None)
        df["Name"] = name
        df["Name"] = df["Name"].apply(clean_company_type, remove_type=False)
        t1 = time.time()
        print(f" - {t1-t0}")

        # Remove company type
        t0 = time.time()
        df["Name_proc"] = df["Name"].apply(clean_company_type, remove_type=True)
        t1 = time.time()
        print(f" - {t1-t0}")

        # Normalize company name
        t0 = time.time()
        df["Name_norm"] = df["Name_proc"].apply(normalize_company_name)
        t1 = time.time()
        print(f" - {t1-t0}")

    else:
        # Validate NIF
        t0 = time.time()
        df["ID"] = parallelize_function(
            validate_nif,
            df["ID"],
            prefer=prefer,
            workers=workers,
        )
        t1 = time.time()
        print(f" - {t1-t0}")

        # Clean company type
        t0 = time.time()
        name = []
        for i, n in df[["ID", "Name"]].values:
            if pd.isna(n) or pd.isna(i):
                name.append(n)
            else:
                name.append(regex.sub(i, "", n) if not pd.isna(n) else None)
        df["Name"] = name
        df["Name"] = parallelize_function(
            clean_company_type,
            df["Name"],
            prefer=prefer,
            workers=workers,
            remove_type=False,
        )
        t1 = time.time()
        print(f" - {t1-t0}")

        # Remove company type
        t0 = time.time()
        df["Name_proc"] = parallelize_function(
            clean_company_type,
            df["Name"],
            prefer=prefer,
            workers=workers,
            remove_type=True,
        )
        t1 = time.time()
        print(f" - {t1-t0}")

        # Normalize company name
        t0 = time.time()
        df["Name_norm"] = parallelize_function(
            normalize_company_name,
            df["Name_proc"],
            prefer=prefer,
            workers=workers,
        )
        t1 = time.time()
        print(f" - {t1-t0}")

    return df


def nif_from_name(name):
    name_spl = np.array(name.split())
    valid = np.array([bool(validate_nif(s)) for s in name_spl])
    new_name = " ".join(name_spl[~valid])
    new_nif = Counter(name_spl[valid]).most_common()[0][0] if valid.any() else np.nan
    return new_name, new_nif

# Merge data from different sources

In [12]:
# def merge_data(
#     dir_metadata: Path,
#     merge_dfs=["minors", "insiders", "outsiders"],
# ):
#     """
#     Merge original data parquet files into single dataframe
#     """
#     dfs = [pd.read_parquet(dir_metadata.joinpath(f"{d}.parquet")) for d in merge_dfs]

#     # Unify texts from all sources
#     dfs_companies = []
#     for df in dfs:
#         # Reset index and rename to common identifier
#         index_names = df.index.names
#         orig_cols = df.columns
#         df.reset_index(inplace=True)
#         df["identifier"] = df[index_names].astype(str).agg("/".join, axis=1)
#         # df.drop(index_names, inplace=True, axis=1)
#         df.set_index("identifier", inplace=True)
#         df = df[orig_cols]

#         # Select company columns from winning parties and rename them
#         join_str = lambda x: ".".join([el for el in x if el])
#         joint_cnames = {join_str(c): c for c in df.columns}
#         reverse_joint_cnames = {v: k for k, v in joint_cnames.items()}
#         comp_cols = sorted(
#             [v for k, v in joint_cnames.items() if "WinningParty" in k or k == "id"]
#         )
#         print(comp_cols)

#         df_companies = df.loc[:, comp_cols]
#         use_cols = [reverse_joint_cnames[c].split(".")[-1] for c in comp_cols]
#         print(use_cols)
#         df_companies.columns = use_cols

#         dfs_companies.append(df_companies.rename(columns={"id": "id_tender"}))
#         print()
#     df_companies = pd.concat(dfs_companies)

#     # Normalize info (lists of strings)
#     df_companies = df_companies.applymap(fill_na, fill=[None])
#     for c in df_companies.columns:
#         df_companies[c] = (
#             df_companies[c]
#             .apply(evaluate_cell)
#             .apply(
#                 lambda x: [None] if not x[0] else [str(el).strip().lower() for el in x]
#             )
#         )
#     return df_companies


# df_companies = merge_data(
#     dir_metadata=Path("C:/Users/josea/Documents/Trabajo/data/metadata/")
# )
# df_companies.to_parquet("companies.parquet")

[('ContractFolderStatus', 'TenderResult', 'WinningParty', 'PartyIdentification', 'ID', '', '', '', '', '', '', ''), ('ContractFolderStatus', 'TenderResult', 'WinningParty', 'PartyLegalEntity', 'CompanyTypeCode', '', '', '', '', '', '', ''), ('ContractFolderStatus', 'TenderResult', 'WinningParty', 'PartyName', 'Name', '', '', '', '', '', '', ''), ('ContractFolderStatus', 'TenderResult', 'WinningParty', 'PhysicalLocation', 'Address', 'CityName', '', '', '', '', '', ''), ('ContractFolderStatus', 'TenderResult', 'WinningParty', 'PhysicalLocation', 'Address', 'Country', 'IdentificationCode', '', '', '', '', ''), ('ContractFolderStatus', 'TenderResult', 'WinningParty', 'PhysicalLocation', 'Address', 'PostalZone', '', '', '', '', '', ''), ('ContractFolderStatus', 'TenderResult', 'WinningParty', 'PhysicalLocation', 'CountrySubentityCode', '', '', '', '', '', '', ''), ('id', '', '', '', '', '', '', '', '', '', '', '')]
['ID', 'CompanyTypeCode', 'Name', 'CityName', 'IdentificationCode', 'PostalZ

In [16]:
df_companies = pd.read_parquet("companies.parquet")

# Obtain individual companies

In [23]:
# # Use only those where all dimensions match
# # (e.g. same number of companies and companies ids)
# # and drop NAs
# df_companies = df_companies[
#     df_companies[["ID", "Name"]]
#     .applymap(lambda x: not pd.isna(x[0]))
#     .apply(all, axis=1)
# ]
# df_companies = df_companies[
#     df_companies.applymap(lambda x: len(x) if x[0] else None).apply(
#         lambda x: len(set([el for el in x if not pd.isnull(el)])) == 1,
#         axis=1,
#     )
# ]
# companies_columns = list(df_companies.columns)
# # Get number of companies by tender
# df_companies["_len"] = df_companies["ID"].apply(len)

# # Fill lists of None to have the same number of elements and explode later
# companies = pd.DataFrame(
#     df_companies.apply(
#         lambda x: [fill_to_length(list(el), x[-1]) for el in x[:-1]], axis=1
#     ).tolist(),
#     columns=companies_columns,
# )

# # Split companies in rows
# companies = companies.explode(companies_columns)
# companies = companies.reset_index(drop=True)
# display(companies.head())

Unnamed: 0,ID,CompanyTypeCode,Name,CityName,IdentificationCode,PostalZone,CountrySubentityCode,id_tender
0,b30437347,,climayor s.l. b30437347,,,,,https://contrataciondelestado.es/sindicacion/d...
1,b60564309,,"gometrics, s.l.",,,,,https://contrataciondelestado.es/sindicacion/d...
2,g57694549,,associaciò alcem el c.i.n.e.,,,,,https://contrataciondelestado.es/sindicacion/d...
3,b73326019,,diseño y decoraciones j. peñalver s.l. b73326019,,,,,https://contrataciondelestado.es/sindicacion/d...
4,b28954170,,"thermo fisher scientific, s.l.",,,,,https://contrataciondelestado.es/sindicacion/d...


In [35]:
# # WRONG IDs
# c_test = companies.drop_duplicates(subset="ID")
# c_test_valid = c_test["ID"].apply(is_valid_nif)
# val = "abcdefghjnpqrsuvw"
# c_test_sub = c_test.loc[~c_test_valid, "ID"]
# c_test_sub[
#     c_test_sub.apply(
#         lambda x: x[0] not in val and x[0].isalpha() and not x[:2] in ["xx", "ie", "it"]
#     )
# ]

In [24]:
# # all_ids = companies["ID"].apply(validate_nif)
# all_ids = parallelize_function(
#     validate_nif,
#     companies["ID"],
#     prefer="processes",
# )
# display(all_ids.apply(bool).value_counts())
# display(companies.loc[all_ids.apply(bool)].head())

True     2119795
False      55151
Name: count, dtype: int64

Unnamed: 0,ID,CompanyTypeCode,Name,CityName,IdentificationCode,PostalZone,CountrySubentityCode,id_tender
0,b30437347,,climayor s.l. b30437347,,,,,https://contrataciondelestado.es/sindicacion/d...
1,b60564309,,"gometrics, s.l.",,,,,https://contrataciondelestado.es/sindicacion/d...
2,g57694549,,associaciò alcem el c.i.n.e.,,,,,https://contrataciondelestado.es/sindicacion/d...
3,b73326019,,diseño y decoraciones j. peñalver s.l. b73326019,,,,,https://contrataciondelestado.es/sindicacion/d...
4,b28954170,,"thermo fisher scientific, s.l.",,,,,https://contrataciondelestado.es/sindicacion/d...


In [25]:
# t0 = time.time()
# companies_clean = clean_df(companies, prefer="processes", workers=-1)
# t1 = time.time()
# print(t1 - t0)

 - 47.49102807044983
 - 12.631223440170288
 - 191.50651478767395
 - 62.5355441570282
 - 7.556449890136719
321.7387843132019


In [26]:
# # Aggregate company info in lists
# companies_clean = (
#     companies_clean
#     # companies[["ID", "Name", "Name_proc", "Name_norm"]]
#     .groupby(["ID", "Name_norm"])
#     .agg(list)
#     .reset_index()
# )
# companies_clean["count"] = companies_clean["Name_proc"].apply(len)
# companies_clean = companies_clean.reset_index()

#### Unique names and IDs

In [135]:
# # Unique names and IDs
# # These companies have always appeared with the same (id-name) association
# cols_vals = [
#     c for c in companies_clean.columns if c not in ["ID", "Name_norm", "count"]
# ]
# unique_ID = ~companies_clean["ID"].duplicated(keep=False)
# unique_NAME = ~companies_clean["Name_norm"].duplicated(keep=False)

# # Unique by ID and name
# unique = companies_clean[unique_ID & unique_NAME]

# # Non unique IDs
# non_unique_ids = list(set(companies_clean["index"]) - set(unique["index"]))
# non_unique = companies_clean[companies_clean["index"].isin(non_unique_ids)]

# unique["index"] = unique["index"].apply(lambda x: [x])
# non_unique["index"] = non_unique["index"].apply(lambda x: [x])
# print(unique.shape, non_unique.shape)

#### Repeated IDs and Names

In [28]:
# Choose definitive values
def suggest_value(elements):
    """
    Select elements based on appearance.
    If same number of appearances, choose the longest.
    If shorter elements are not included in the 'main' one, return all.
    """
    cnt = Counter(elements)
    cnt.pop(None, None)
    cnt = cnt.most_common()
    if cnt:
        max_cnt = cnt[0][1]
        els = sorted([k for k, v in cnt if v == max_cnt], key=lambda x: (-len(x), x))
        # return els[0]
        base = els.pop(0)
        return [base]
        # if all(
        #     [all(t in base for t in regex.sub(r"\W", " ", el).split()) for el in els]
        # ):
        #     return [base]
        # return [base] + els
    else:
        # return None
        return [None]


# Repeated IDs
def unify_repeated_col(df: pd.DataFrame, rep_col: str, un_col: str):
    """
    Takes a dataframe with duplicated values in one column that should be unique (e.g. repeated IDs)
    and another column that should also be unique given the previous one (e.g. title)
    and unifies it so that it chooses the best option.

    Parameters
    ----------
    df: pd.DataFrame
    rep_col: str
        Name of column with repeated values that will be unified
    un_col: str
        Name of column with non unique values
    """
    # Non-unique columns
    cols_vals = [c for c in df.columns if c not in [rep_col, "count", "index"]]
    repeated_rows = df[rep_col].duplicated(keep=False)
    repeated = df[repeated_rows]

    # Count times the values appear
    repeated.loc[repeated.index, [un_col]] = (
        repeated.loc[repeated.index, un_col].apply(lambda x: [x])
        * repeated.loc[repeated.index, "count"]
    )
    # Group by repeated
    repeated = repeated.reset_index()
    repeated = repeated.groupby(rep_col).agg(
        {
            # "index": list,
            "index": sum,
            **{c: lambda x: list(chain.from_iterable(x)) for c in cols_vals},
            "count": sum,
        }
    )
    # Get the most common values for each column
    repeated.loc[repeated.index, un_col] = (
        repeated.loc[repeated.index, un_col].apply(suggest_value).values
    )
    repeated = repeated.reset_index()

    # Concatenate unique
    use_index = repeated.loc[repeated[un_col].apply(len) == 1, un_col].index
    repeated.loc[use_index, un_col] = repeated.loc[use_index, un_col].apply(
        lambda x: x[0]
    )
    unified = repeated.loc[use_index]

    return unified

In [138]:
# # Obtain unique ID-name
# unified_ID = unify_repeated_col(non_unique, "ID", "Name_norm")
# # Update non_unique
# non_unique_ids = list(
#     set(chain.from_iterable(non_unique["index"]))
#     - set(chain.from_iterable(unified_ID["index"]))
# )
# # non_unique = companies_clean.loc[non_unique_ids]
# non_unique = companies_clean[companies_clean["index"].isin(non_unique_ids)]
# non_unique["index"] = non_unique["index"].apply(lambda x: [x])

In [139]:
# # Obtain unique name-ID
# unified_NAME = unify_repeated_col(non_unique, "Name_norm", "ID")
# # Update non_unique
# non_unique_ids = list(
#     set(chain.from_iterable(non_unique["index"]))
#     - set(chain.from_iterable(unified_NAME["index"]))
# )
# # non_unique = companies_clean.loc[non_unique_ids]
# non_unique = companies_clean[companies_clean["index"].isin(non_unique_ids)]
# non_unique["index"] = non_unique["index"].apply(lambda x: [x])

#### Companies info

In [31]:
# display(unique.head())
# display(unified_ID.head())
# display(unified_NAME.head())
# display(non_unique.head())

In [32]:
# len(
#     list(chain.from_iterable(unique["index"].values))
#     + list(chain.from_iterable(unified_ID["index"].values))
#     + list(chain.from_iterable(unified_NAME["index"].values))
#     + list(chain.from_iterable(non_unique["index"].values))
# )

In [140]:
# # Global
# # Merge unique+unifiedID+unifiedName+nonUnique
# merged_global = pd.concat([unique, unified_ID, unified_NAME, non_unique])
# cols_vals = [
#     c
#     for c in merged_global.columns
#     if c not in ["ID", "Name_norm", "count", "index", "id_tender"]
# ]
# merged_global = merged_global.groupby(["ID", "Name_norm"]).agg(
#     {
#         # "index": lambda x: list(chain.from_iterable(x)),
#         "index": sum,
#         "id_tender": sum,
#         **{c: lambda x: list(chain.from_iterable(x)) for c in cols_vals},
#         "count": sum,
#     }
# )
# merged_global = merged_global.reset_index()

In [143]:
# merged_global[merged_global["Name_norm"] == "mediamarkt"]

In [144]:
# Unify all branches into one??
unified_ID2 = unify_repeated_col(merged_global, "ID", "Name_norm")
merged_global2 = merged_global[~merged_global["ID"].isin(unified_ID2["ID"])]
# non_unique = companies.loc[non_unique_ids]

unified_NAME2 = unify_repeated_col(merged_global2, "Name_norm", "ID")
merged_global2 = merged_global2[
    ~merged_global2["Name_norm"].isin(unified_NAME2["Name_norm"])
]
# # Update non_unique
# non_unique_ids = list(set(non_unique.index) - set(chain.from_iterable(unified_NAME["index"])))
# non_unique = companies.loc[non_unique_ids]

final_merged = pd.concat(
    [
        merged_global2,
        unified_ID2,
        unified_NAME2,
    ]
).reset_index(drop=True)

final_merged.loc[final_merged.index, cols_vals] = (
    final_merged.loc[final_merged.index, cols_vals]
    .applymap(suggest_value)
    .applymap(lambda x: x[0])
)
print(len(final_merged))
display(final_merged.head())

# Save final_merged
# final_merged.to_parquet("final_merged_companies.parquet")

267243


Unnamed: 0,ID,Name_norm,index,id_tender,CompanyTypeCode,Name,CityName,IdentificationCode,PostalZone,CountrySubentityCode,Name_proc,count
0,00021492x,carmenbalgueriasjimenez,[0],[https://contrataciondelestado.es/sindicacion/...,,carmen balguerias jiménez,,,,,carmen balguerias jiménez,2
1,00035211k,palomasainzdelamazadelaserna,[1],[https://contrataciondelestado.es/sindicacion/...,,paloma sáinz de la maza de la serna,,,,,paloma sáinz de la maza de la serna,1
2,00067665e,albertodelgadocebrian,[2],[https://contrataciondelestado.es/sindicacion/...,,alberto delgado cebrián,,,,,alberto delgado cebrián,1
3,00072839k,fernandezabadvicente,[3],[https://contrataciondelestado.es/sindicacion/...,,fernandez abad vicente,,,,,fernandez abad vicente,2
4,00076938a,luisteranlopez,[4],[https://contrataciondelestado.es/sindicacion/...,,luis terán lópez,,,,,luis terán lópez,29


In [149]:
final_merged.to_parquet("companies_info.parquet")

In [145]:
final_merged[["id_tender", "ID", "Name"]].values

array([[list(['https://contrataciondelestado.es/sindicacion/datosabiertosmenores/5695519', 'https://contrataciondelestado.es/sindicacion/datosabiertosmenores/6113113']),
        '00021492x', 'carmen balguerias jiménez'],
       [list(['https://contrataciondelestado.es/sindicacion/datosabiertosmenores/3204269']),
        '00035211k', 'paloma sáinz de la maza de la serna'],
       [list(['https://contrataciondelestado.es/sindicacion/datosabiertosmenores/6899971']),
        '00067665e', 'alberto delgado cebrián'],
       ...,
       [list(['https://contrataciondelestado.es/sindicacion/datosabiertosmenores/6938330', 'https://contrataciondelestado.es/sindicacion/datosabiertosmenores/6228983', 'https://contrataciondelestado.es/sindicacion/datosabiertosmenores/3071577', 'https://contrataciondelestado.es/sindicacion/datosabiertosmenores/6262317', 'https://contrataciondelestado.es/sindicacion/datosabiertosmenores/6262309', 'https://contrataciondelestado.es/sindicacion/datosabiertosmenores/62698

In [147]:
# final_merged[final_merged["Name_norm"] == "mediamarkt"]

In [96]:
# Save final_merged
final_merged = pd.read_parquet("final_merged_companies.parquet")
print(len(final_merged))
display(final_merged.head())

288058


Unnamed: 0,ID,Name_norm,index,CompanyTypeCode,Name,CityName,IdentificationCode,PostalZone,CountrySubentityCode,Name_proc,count
0,00021492x,carmenbalgueriasjimenez,[0],,carmen balguerias jiménez,,,,,carmen balguerias jiménez,2
1,00024509z,ondoangiroatxagorritxu,[1],,u.t.e. ondoan giroa txagorritxu,,,,,ondoan giroa txagorritxu,1
2,00035211k,palomasainzdelamazadelaserna,[2],,paloma sáinz de la maza de la serna,,,,,paloma sáinz de la maza de la serna,1
3,00067665e,albertodelgadocebrian,[3],,alberto delgado cebrián,,,,,alberto delgado cebrián,1
4,00072839k,vicentefernandezabad,"[4, 5]",,vicente fernandez abad,,,,,vicente fernandez abad,5


In [50]:
# Add information
final_merged["NIF_type"] = final_merged["ID"].apply(get_nif_type)
final_merged["prov"], final_merged["comp_type"], final_merged["comp_desc"] = list(
    zip(*final_merged["ID"].apply(get_info_from_cif))
)
final_merged["comp_type"] = final_merged["comp_type"].apply(
    lambda x: x.split(",")[0] if not pd.isna(x) else None
)

In [59]:
final_merged.head()

Unnamed: 0,ID,Name_norm,index,CompanyTypeCode,Name,CityName,IdentificationCode,PostalZone,CountrySubentityCode,Name_proc,count,NIF_type,prov,comp_type,comp_desc
0,00021492x,carmenbalgueriasjimenez,[0],,carmen balguerias jiménez,,,,,carmen balguerias jiménez,2,DNI,,,
1,00024509z,ondoangiroatxagorritxu,[1],,u.t.e. ondoan giroa txagorritxu,,,,,ondoan giroa txagorritxu,1,DNI,,,
2,00035211k,palomasainzdelamazadelaserna,[2],,paloma sáinz de la maza de la serna,,,,,paloma sáinz de la maza de la serna,1,DNI,,,
3,00067665e,albertodelgadocebrian,[3],,alberto delgado cebrián,,,,,alberto delgado cebrián,1,DNI,,,
4,00072839k,vicentefernandezabad,"[4, 5]",,vicente fernandez abad,,,,,vicente fernandez abad,5,DNI,,,


In [42]:
# Find UTEs based on name
ute_n = (
    final_merged[["Name", "Name_proc"]]
    .apply(lambda x: regex.search(r"u.t.e.", " ".join(x)), axis=1)
    .apply(bool)
)
# Find UTEs based on ID
ute_i = final_merged["ID"].apply(lambda x: x.startswith("u"))

final_merged[ute_i | ute_n]["Name"].values
# sum(ute_n), sum(ute_i), sum(ute_n & ute_i), sum(ute_n & ute_i)/min(sum(ute_n), sum(ute_i))

array(['u.t.e. ondoan giroa txagorritxu',
       'u.t.e. ferran navazo herrero f.aranaz oliver forteza ferre',
       'u.t.e. getinsa+verdasco arquitectos', ...,
       'u.t.e. terminal granollers',
       'u.t.e. terra ingenieros s.l. oca construcciones y proyectos s.a.',
       'u.t.e. transportes blindados s.a. - visabren servicios generales s.l.'],
      dtype=object)

In [43]:
final_merged[ute_i | ute_n]

Unnamed: 0,ID,Name_norm,index,CompanyTypeCode,Name,CityName,IdentificationCode,PostalZone,CountrySubentityCode,Name_proc,count
1,00024509z,ondoangiroatxagorritxu,[1],,u.t.e. ondoan giroa txagorritxu,,,,,ondoan giroa txagorritxu,1
160,00400871g,ferrannavazoherrerofaranazoliverfortezaferre,"[193, 194]",,u.t.e. ferran navazo herrero f.aranaz oliver f...,,es,,,ferran navazo herrero f.aranaz oliver forteza ...,2
220,00416934j,getinsaverdascoarquitectos,[264],,u.t.e. getinsa+verdasco arquitectos,,es,,,getinsa+verdasco arquitectos,1
1417,02528996p,anaisabelazaustrecriado,[1732],,ana isabel azaustre criado,,,,,ana isabel azaustre criado,4
1608,02633000y,joaquinpararedajeronimomartin,"[1961, 1962, 1963]",,joaquin parareda jeronimo martin u.t.e.,,,,,joaquin parareda jeronimo martin,3
...,...,...,...,...,...,...,...,...,...,...,...
287938,u01888080,telefonicadeespanaytelefonicamovilesespana,"[401049, 401585, 407065, 407230, 407232, 40742...",,u.t.e. telefónica de españa s.a.u. y telefónic...,,,,,telefónica de españa y telefónica móviles españa,10
287939,u87302147,telefonicaespanatelefonicamovilesespana,"[406739, 406740, 406741, 406742, 402293, 40597...",,u.t.e. telefonica españa s.a.u. - telefonica m...,,,,,telefonica españa telefonica moviles españa,14
287946,u88418785,terminalgranollers,"[406652, 407708, 407709]",,u.t.e. terminal granollers,,,,,terminal granollers,5
287948,u02841708,terraingenierosocaconstruccionesyproyectos,"[401376, 402061, 402399, 404225, 404226]",,u.t.e. terra ingenieros s.l. oca construccione...,,es,,,terra ingenieros - oca construcciones y proyectos,5


In [44]:
# final_merged[ute_i].head()

## Empresas Zaragoza

In [61]:
# df = pd.read_csv(r"C:\Users\josea\Downloads\empresas.csv", sep=";", header=0, nrows=64, index_col=False)
# df = pd.read_excel(r"C:\Users\josea\Downloads\empresas.xlsx")

with open(r"C:\Users\josea\Downloads\empresas_zgz.csv", "r", encoding="utf-8") as f:
    emp = [
        [el.replace('"', "").strip() for el in l.lower().strip().split(";", 4)]
        for l in f.readlines()
        if len(l) > 2
    ]
cols = emp[0]
data = emp[1:]
emp_zgz = pd.DataFrame(data=data, columns=cols)
emp_zgz = emp_zgz.applymap(lambda x: x if x else None)
emp_zgz = emp_zgz.dropna(how="all").drop_duplicates().reset_index(drop=True)
emp_zgz["empresa"] = emp_zgz["empresa"].apply(clean_company_type)
emp_zgz["empresa_proc"] = emp_zgz["empresa"].apply(clean_company_type, remove_type=True)
emp_zgz["nif"] = emp_zgz["nif"].apply(lambda x: regex.sub(r"\W", "", x) if x else None)
emp_zgz["nif_type"] = emp_zgz["nif"].apply(lambda x: get_nif_type(x) if x else None)

In [62]:
print(len(emp_zgz))
display(emp_zgz.head())

889


Unnamed: 0,nombre,expediente,empresa,nif,texto,empresa_proc,nif_type
0,alquiler de un equipo de iluminación para esce...,cul09-19,fuse records - fluge zaragoza s.l.,b99053506,https://contrataciondelestado.es/wps/poc?uri=d...,fuse records - fluge zaragoza,CIF
1,contrato mixto de servicios y obras menores de...,eco-2019311,u.t.e. fcc medioambiente s.a. - fcc aqualia s.a.,,https://contrataciondelestado.es/wps/poc?uri=d...,fcc medioambiente fcc aqualia,
2,derribo de la estructura actual de polipasto y...,eco2019335-19,u.t.e. casale ipocsa,,https://contrataciondelestado.es/wps/poc?uri=d...,casale ipocsa,
3,derribo de la estructura actual de polipasto y...,eco2019335-19,depuracion de aguas de mediterraneo s.l.,b96456553,https://contrataciondelestado.es/wps/poc?uri=d...,depuracion de aguas de mediterraneo,CIF
4,obras de acondicionamiento de la planta 3ª de ...,viv01-19,u.t.e. paspa romero polo meg,,https://contrataciondelestado.es/wps/poc?uri=d...,paspa romero polo meg,


In [387]:
# valid_emp = emp_zgz.groupby("nif").agg(set).reset_index()
valid_emp = emp_zgz[["nif", "empresa"]].reset_index()
print(len(valid_emp))
display(valid_emp.head())

889


Unnamed: 0,index,nif,empresa
0,0,b99053506,fuse records - fluge zaragoza s.l.
1,1,,u.t.e. fcc medioambiente s.a. - fcc aqualia s.a.
2,2,,u.t.e. casale ipocsa
3,3,b96456553,depuracion de aguas de mediterraneo s.l.
4,4,,u.t.e. paspa romero polo meg


In [57]:
# # Empresas con nombre repetido y cuyo nif es None en alguno de los casos
# valid_emp = emp_zgz[["nif", "empresa", "nombre"]]
# valid_emp[
#     valid_emp["empresa"].duplicated(keep=False)
#     & valid_emp["empresa"].isin(
#         valid_emp.loc[valid_emp["nif"].isna(), "empresa"].values
#     )
# ].sort_values(by="empresa")
# # (No hay ningún nif para rellenar)

In [108]:
# valid_emp[valid_emp["nombre"].apply(len) > 1]["nombre"]

## Compare companies from Tenders and Zaragoza

In [386]:
# final_merged[["ID", "Name", "count"]]

In [191]:
# Create dataframe with counter for appearances
ids = [
    Counter({k: v})
    for k, v in list(zip(final_merged["ID"].values, final_merged["count"].values))
]
names = [
    Counter({k: v})
    for k, v in list(zip(final_merged["Name"].values, final_merged["count"].values))
]

### Same nif

In [367]:
common_nif = set(final_merged["ID"]) & set(valid_emp["nif"])
# common_comp_nif = final_merged[final_merged["ID"].isin(common_nif)][
#     ["ID", "Name", "count"]
# ].drop_duplicates()
# Companies
common_comp_nif = pd.DataFrame(data={"ID": final_merged["ID"].values, "Name": names})
common_comp_nif = common_comp_nif[common_comp_nif["ID"].isin(common_nif)]
# Zaragoza
# common_emp_nif = valid_emp[valid_emp["nif"].isin(common_nif)][["nif", "empresa"]]
common_emp_nif = (
    valid_emp[valid_emp["nif"].isin(common_nif)]
    .groupby("nif")
    .agg({"index": list, "empresa": Counter})
    .reset_index()
)
# Merge
c_nif = pd.merge(
    common_comp_nif,
    common_emp_nif,
    left_on="ID",
    right_on="nif",
).reset_index(drop=True)

# Select final values
final_name = []
for vals in zip(c_nif["Name"].values, c_nif["empresa"].values):
    v = Counter()
    [v.update(el) for el in vals]
    final_name.append(v)
c_nif["final_name"] = final_name
c_nif["final_name"] = c_nif["final_name"].apply(lambda x: suggest_value(x)[0])
c_nif["final_id"] = c_nif["ID"]

print(len(c_nif))
display(c_nif.head())

# Update used and remaining
idx = set(chain.from_iterable(c_nif["index"]))
rest_valid_emp = valid_emp[~valid_emp["index"].isin(idx)]

473


Unnamed: 0,ID,Name,nif,index,empresa,final_name,final_id
0,07046603r,{'alba mª ortega altamirano': 1},07046603r,"[590, 604]",{'alba m.ª ortega altamirano': 2},alba m.ª ortega altamirano,07046603r
1,07823314w,{'jesus gutierrez sierra': 225},07823314w,[489],{'jesús gutiérrez sierra': 1},jesus gutierrez sierra,07823314w
2,17144195h,{'jaime macipe gayarre': 8},17144195h,[616],{'jaime macipe gayarre': 1},jaime macipe gayarre,17144195h
3,17215345y,{'pascual pardos sierra': 1},17215345y,[324],{'pascual pardos sierra': 1},pascual pardos sierra,17215345y
4,17747571n,{'dª maría nila atienza fanlo': 1},17747571n,"[390, 630]",{'mª nila atienza fanlo': 2},mª nila atienza fanlo,17747571n


### Same name

In [370]:
common_name = set(final_merged["Name"]) & set(rest_valid_emp["empresa"])
# Companies
# common_comp_name = final_merged[final_merged["Name"].isin(common_name)][
#     ["ID", "Name", "count"]
# ].drop_duplicates()
common_comp_name = pd.DataFrame(data={"ID": ids, "Name": final_merged["Name"].values})
common_comp_name = common_comp_name[common_comp_name["Name"].isin(common_name)]
# Zaragoza
# common_emp_name = rest_valid_emp[rest_valid_emp["empresa"].isin(common_name)].drop_duplicates()
common_emp_name = (
    rest_valid_emp[rest_valid_emp["empresa"].isin(common_name)]
    .groupby("empresa")
    .agg({"index": list, "nif": Counter})
    .reset_index()
)
# Merge
c_name = pd.merge(
    common_comp_name,
    common_emp_name,
    left_on="Name",
    right_on="empresa",
).reset_index(drop=True)

# Select final values
final_id = []
for vals in zip(c_name["ID"].values, c_name["nif"].values):
    v = Counter()
    [v.update(el) for el in vals]
    final_id.append(v)
c_name["final_id"] = final_id
c_name["final_id"] = c_name["final_id"].apply(lambda x: suggest_value(x)[0])
c_name["final_name"] = c_name["empresa"]

print(len(c_name))
display(c_name.head())


# Update used and remaining
idx.update(set(chain.from_iterable(c_name["index"])))
rest_valid_emp = valid_emp[~valid_emp["index"].isin(idx)]

22


Unnamed: 0,ID,Name,empresa,index,nif,final_id,final_name
0,{'05005241g': 2},wilmer ossa buitrago,wilmer ossa buitrago,"[586, 593, 608, 710]",{None: 4},05005241g,wilmer ossa buitrago
1,{'07992498k': 2},juan gabriel rodriguez holgado,juan gabriel rodriguez holgado,[33],{'7992498k': 1},07992498k,juan gabriel rodriguez holgado
2,{'40972429e': 63},juan sirera pascual,juan sirera pascual,[396],{None: 1},40972429e,juan sirera pascual
3,{'a50066190': 30},durban maquinaria para la construccion s.a.,durban maquinaria para la construccion s.a.,[411],{None: 1},a50066190,durban maquinaria para la construccion s.a.
4,{'b09671256': 1},ansamater s.l.,ansamater s.l.,[609],{'bb09671256': 1},bb09671256,ansamater s.l.


In [373]:
matches = pd.concat([c_nif, c_name])
matches

Unnamed: 0,ID,Name,nif,index,empresa,final_name,final_id
0,07046603r,{'alba mª ortega altamirano': 1},07046603r,"[590, 604]",{'alba m.ª ortega altamirano': 2},alba m.ª ortega altamirano,07046603r
1,07823314w,{'jesus gutierrez sierra': 225},07823314w,[489],{'jesús gutiérrez sierra': 1},jesus gutierrez sierra,07823314w
2,17144195h,{'jaime macipe gayarre': 8},17144195h,[616],{'jaime macipe gayarre': 1},jaime macipe gayarre,17144195h
3,17215345y,{'pascual pardos sierra': 1},17215345y,[324],{'pascual pardos sierra': 1},pascual pardos sierra,17215345y
4,17747571n,{'dª maría nila atienza fanlo': 1},17747571n,"[390, 630]",{'mª nila atienza fanlo': 2},mª nila atienza fanlo,17747571n
...,...,...,...,...,...,...,...
17,{'b98134752': 1},beberapid c.b.,{'e98134752': 1},[44],beberapid c.b.,beberapid c.b.,b98134752
18,{'w0173298a': 23},fundación delegación fundación finnova,{None: 1},[514],fundación delegación fundación finnova,fundación delegación fundación finnova,w0173298a
19,{'b86907128': 1773},agilent technologies spain s.l.,{'b82381682': 2},"[25, 780]",agilent technologies spain s.l.,agilent technologies spain s.l.,b86907128
20,{'a08432338': 113},mantenimiento y montajes industriales s.a.,{None: 1},[722],mantenimiento y montajes industriales s.a.,mantenimiento y montajes industriales s.a.,a08432338


In [382]:
final_merged[final_merged["ID"] == "b50931302"]

Unnamed: 0,ID,Name_norm,index,CompanyTypeCode,Name,CityName,IdentificationCode,PostalZone,CountrySubentityCode,Name_proc,count


In [378]:
validate_nif("b50931302")

'b50931302'

In [377]:
valid_emp[~valid_emp["index"].isin(idx)]

Unnamed: 0,index,nif,empresa
1,1,,u.t.e. fcc medioambiente s.a. - fcc aqualia s.a.
2,2,,u.t.e. casale ipocsa
4,4,,u.t.e. paspa romero polo meg
35,35,,conther hosteleria s.l.u.
36,36,5412400v,esther dominguez alejandre
...,...,...,...
811,811,,u.t.e. industria de servicios ideser - obras y...
820,820,b50931302,logistica social s.l.
838,838,a50004431,talleres mercier s.a.
846,846,,salzillo servicios integrales s.l.u. imesapi s.a.


In [365]:
len(idx), len(valid_emp), len(idx) / len(valid_emp)

(788, 889, 0.8863892013498312)

In [82]:
# Common name, different nif
c_name_diff_nif_comp_ids = list(
    set(common_comp_name.index) - set(common_comp_nif.index)
)
c_name_diff_nif_emp_ids = list(set(common_emp_name.index) - set(common_emp_nif.index))

In [84]:
# emp_zgz.loc[c_name_diff_nif_emp_ids]

In [279]:
final_merged.loc[c_name_diff_nif_comp_ids].head()

Unnamed: 0,ID,Name_norm,index,CompanyTypeCode,Name,CityName,IdentificationCode,PostalZone,CountrySubentityCode,Name_proc,count
241254,b96911037,bluedec,"[345549, 345550, 345551, 345552, 345553, 345554]",,bluedec s.l.,,es,,es523,bluedec,49
218790,b84044205,imagicvision,[313116],,imagic vision s.l.,madrid,es,28033.0,es43,imagic vision,10
184139,b50941855,agenciaaragonesadenoticias,[260591],,agencia aragonesa de noticias s.l.,,,,,agencia aragonesa de noticias,10
243852,b98134752,beberapid,[349790],,beberapid c.b.,,,,,beberapid,1
11181,07992498k,juangabrielrodriguezholgado,"[14027, 14028]",,juan gabriel rodriguez holgado,,,,,juan gabriel rodriguez holgado,2


In [None]:
# c_concat = pd.concat([c_nif, c_name])
# c_concat[~c_concat.duplicated(keep=False)]

In [280]:
set(emp_zgz["nif"]) - set(final_merged["ID"])

{'17410114f',
 '17732626v',
 '17763291t',
 '18426455m',
 '514863471',
 '5412400v',
 '7992498k',
 'a27178789b50113562',
 'a50004431',
 'a50016666',
 'a50032002',
 'a50169333',
 'a81196743',
 'a96141361',
 'b01644889',
 'b06851471',
 'b50349323',
 'b50709641',
 'b50755149',
 'b50931302',
 'b54941855',
 'b673750006',
 'b79309902',
 'b82381682',
 'b840444205',
 'b84049238',
 'b85508059',
 'b9691037',
 'b99399438',
 'b99412629',
 'b99532574',
 'bb09671256',
 'e98134752',
 'g50493097',
 'n0171609a',
 'q99118598',
 'siret56211521200077',
 'sl',
 'u02778884',
 'u99489411'}

In [282]:
# common_nif = set(df["nif"]) - set(final_merged["ID"])
# common_comp = final_merged[final_merged["ID"].isin(common_nif)][
#     ["ID", "Name"]
# ].drop_duplicates()
# common_emp = df[df["nif"].isin(common_nif)][["nif", "empresa"]].drop_duplicates()

In [None]:
# dup_emp = df["empresa"].duplicated(keep=False)
# dup_nif = df["nif"].duplicated(keep=False)