# Imports

In [1]:
import json
from collections import Counter
from itertools import chain
from pathlib import Path

import numpy as np
import pandas as pd
import regex

# from unidecode import unidecode

In [2]:
from src.companies.processor import clean_company_type, normalize_company_name
from src.companies.utils import replace_company_types
from src.nif_validation.validation import (
    get_nif_type,
    validate_nif,
    is_valid_nif,
    is_valid_cif,
    is_valid_dni,
    is_valid_nie,
)
from src.utils.utils import fill_na, fill_to_length

# Load info

In [3]:
with open(r"C:\Users\josea\Downloads\genCat_Junio_2023.json", "r") as f:
    gencat = pd.json_normalize(json.load(f))

In [4]:
# df = pd.read_csv(r"C:\Users\josea\Downloads\Contractaci__p_blica_a_Catalunya__publicacions_a_la_Plataforma_de_serveis_de_contractaci__p_blica.csv")
df_emp = pd.read_csv(r"C:\Users\josea\Downloads\empresas.csv")

  df_emp = pd.read_csv(r"C:\Users\josea\Downloads\empresas.csv")


In [5]:
# Load data
dir_df = Path("C:/Users/josea/Documents/Trabajo/data/metadata/insiders.parquet")
df_in = pd.read_parquet(dir_df)
dir_df = Path("C:/Users/josea/Documents/Trabajo/data/metadata/outsiders.parquet")
df_ou = pd.read_parquet(dir_df)
dir_df = Path("C:/Users/josea/Documents/Trabajo/data/metadata/minors.parquet")
df_mi = pd.read_parquet(dir_df)

In [6]:
df_in.columns = [".".join([el for el in c if el]) for c in df_in.columns]
df_ou.columns = [".".join([el for el in c if el]) for c in df_ou.columns]
df_mi.columns = [".".join([el for el in c if el]) for c in df_mi.columns]

# Aux functions
Functions necessary for processing the cells

In [7]:
from joblib import Parallel, delayed
from typing import Dict, List, Union


def parallelize_function(
    func, data: Union[pd.Series, List], workers=-1, prefer="threads", *args, **kwargs
):
    results = Parallel(
        n_jobs=workers,
        prefer=prefer,
        verbose=0,
        # backend="threading", # loky, multiprocessing, threading
        # return_as="generator", # list
        # timeout=None,
        # pre_dispatch="2 * n_jobs",
        batch_size="auto",
        # max_nbytes="1M",
        # mmap_mode="r",
        # require=None,
    )(delayed(func)(x, *args, **kwargs) for x in data)
    if isinstance(data, pd.Series):
        return pd.Series(results, index=data.index)
    return list(results)

In [8]:
def evaluate_cell(cell):
    if not isinstance(cell, (list, np.ndarray)):
        cell = [cell]
        # print("1", cell)
    if pd.isnull(cell[0]):
        return [None]
    elif isinstance(cell[0], str) and cell[0].startswith("[") and cell[0].endswith("]"):
        # print("2")
        return eval(cell[0])
    else:
        # print("3")
        return cell


def clean_df(df: pd.DataFrame):
    df = df.applymap(
        lambda x: regex.sub(r"((?<=\w+\W)\s+)|(\s+(?=\W\w+))", "", x)
        if not pd.isna(x)
        else None
    )
    # Validate NIF
    df["ID"] = df["ID"].apply(validate_nif)
    # Clean company type
    name = []
    for i, n in df[["ID", "Name"]].values:
        if pd.isna(n) or pd.isna(i):
            name.append(n)
        else:
            name.append(regex.sub(i, "", n) if not pd.isna(n) else None)
    df["Name"] = name
    df["Name"] = df["Name"].apply(clean_company_type, remove_type=False)
    # Remove company type
    df["Name_proc"] = df["Name"].apply(clean_company_type, remove_type=True)
    # Normalize company name
    df["Name_norm"] = df["Name_proc"].apply(normalize_company_name)

    # # Validate NIF
    # df["ID"] = parallelize_function(validate_nif, df["ID"])
    # # Clean company type
    # name = []
    # for i, n in df[["ID", "Name"]].values:
    #     if pd.isna(n) or pd.isna(i):
    #         name.append(n)
    #     else:
    #         name.append(regex.sub(i, "", n) if not pd.isna(n) else None)
    # df["Name"] = name
    # df["Name"] = parallelize_function(clean_company_type, df["Name"], remove_type=False)
    # # Remove company type
    # df["Name_proc"] = parallelize_function(clean_company_type, df["Name"], remove_type=True)
    # # Normalize company name
    # df["Name_norm"] = parallelize_function(normalize_company_name, df["Name_proc"])

    return df


def nif_from_name(name):
    name_spl = np.array(name.split())
    valid = np.array([bool(validate_nif(s)) for s in name_spl])
    new_name = " ".join(name_spl[~valid])
    new_nif = Counter(name_spl[valid]).most_common()[0][0] if valid.any() else np.nan
    return new_name, new_nif

In [9]:
# def clean_company_type(company_name: str, remove_type: bool = False):
#     """
#     Replace the company type if present in a text in any given format
#     (e.g.: "s.l.", "sl", "s. l.") into a standard form ("s.l.")
#     or remove it if `remove_type`=`True`.
#     """
#     if not company_name:
#         return None
#     company_name = replace_company_types(company_name, remove_type=remove_type)
#     company_name = regex.sub(r"[\s]+", " ", company_name)
#     company_name = company_name.strip("-, ")
#     return company_name

In [10]:
# pd.Series(["sd s.l.", None, "asd"]).apply(clean_company_type, remove_type=True)

# Merge data from different sources

In [11]:
# def merge_data(
#     dir_metadata: Path,
#     merge_dfs=["minors", "insiders", "outsiders"],
# ):
#     """
#     Merge original data parquet files into single dataframe
#     """
#     dfs = [pd.read_parquet(dir_metadata.joinpath(f"{d}.parquet")) for d in merge_dfs]

#     # Unify texts from all sources
#     dfs_companies = []
#     for df in dfs:
#         # Reset index and rename to common identifier
#         index_names = df.index.names
#         orig_cols = df.columns
#         df.reset_index(inplace=True)
#         df["identifier"] = df[index_names].astype(str).agg("/".join, axis=1)
#         # df.drop(index_names, inplace=True, axis=1)
#         df.set_index("identifier", inplace=True)
#         df = df[orig_cols]

#         # Select company columns from winning parties and rename them
#         join_str = lambda x: ".".join([el for el in x if el])
#         joint_cnames = {join_str(c): c for c in df.columns}
#         reverse_joint_cnames = {v: k for k, v in joint_cnames.items()}
#         comp_cols = sorted([v for k, v in joint_cnames.items() if "WinningParty" in k])
#         print(comp_cols)

#         df_companies = df.loc[:, comp_cols]
#         use_cols = [reverse_joint_cnames[c].split(".")[-1] for c in comp_cols]
#         print(use_cols)
#         df_companies.columns = use_cols

#         dfs_companies.append(df_companies)
#         print()
#     df_companies = pd.concat(dfs_companies)

#     # Normalize info (lists of strings)
#     df_companies = df_companies.applymap(fill_na, fill=[None])
#     for c in df_companies.columns:
#         df_companies[c] = (
#             df_companies[c]
#             .apply(evaluate_cell)
#             .apply(
#                 lambda x: [None] if not x[0] else [str(el).strip().lower() for el in x]
#             )
#         )
#     return df_companies

# df_companies = merge_data(
#     dir_metadata=Path("C:/Users/josea/Documents/Trabajo/data/metadata/")
# )
# df_companies.to_parquet("companies.parquet")

In [12]:
df_companies = pd.read_parquet("companies.parquet")

# Obtain individual companies

In [13]:
# Use only those where all dimensions match
# (e.g. same number of companies and companies ids)
# and drop NAs
df_companies = df_companies[
    df_companies[["ID", "Name"]]
    .applymap(lambda x: not pd.isna(x[0]))
    .apply(all, axis=1)
]
df_companies = df_companies[
    df_companies.applymap(lambda x: len(x) if x[0] else None).apply(
        lambda x: len(set([el for el in x if not pd.isnull(el)])) == 1,
        axis=1,
    )
]
companies_columns = list(df_companies.columns)
# Get number of companies by tender
df_companies["_len"] = df_companies["ID"].apply(len)

# Fill lists of None to have the same number of elements and explode later
companies = pd.DataFrame(
    df_companies.apply(
        lambda x: [fill_to_length(list(el), x[-1]) for el in x[:-1]], axis=1
    ).tolist(),
    columns=companies_columns,
)

# Split companies in rows
companies = companies.explode(companies_columns)
companies = companies.reset_index(drop=True)

In [14]:
companies.head()

Unnamed: 0,ID,CompanyTypeCode,Name,CityName,IdentificationCode,PostalZone,CountrySubentityCode
0,b30437347,,climayor s.l. b30437347,,,,
1,b60564309,,"gometrics, s.l.",,,,
2,g57694549,,associaciò alcem el c.i.n.e.,,,,
3,b73326019,,diseño y decoraciones j. peñalver s.l. b73326019,,,,
4,b28954170,,"thermo fisher scientific, s.l.",,,,


In [15]:
# import pandas as pd
# dig_control_num = "JABCDEFGHI".lower()
# secuenciaLetrasNIF = "TRWAGMYFPDXBNJZSQVHLCKE".lower()
# identificadoresDNI = "KLM".lower()
# identificadoresNIE = "XYZ".lower()

# # Load entity identifiers
# id_entidades = pd.read_csv("src/nif_validation/data/identificador_entidades.csv", sep=";")
# let2tipo = dict(id_entidades[["letra", "desc"]].values)

# # Differentiate NIF type
# def differentiate_nif(nif: str) -> str:
#     nif = nif.strip().lower()
#     if nif[0].isnumeric() or nif[0] in identificadoresDNI:
#         return "DNI"
#     elif nif[0] in identificadoresNIE:
#         return "NIE"
#     elif nif[0] in let2tipo:
#         return "CIF"
#     else:
#         return "UNKNOWN"


# # Correct DNI or NIE errors
# def correct_dni(nif: str) -> str:
#     if len(nif) == 8:
#         if nif[0] in identificadoresDNI or nif[0] in identificadoresNIE:
#             digits = nif[1:]
#         else:
#             digits = nif
#         if not digits.isalpha():
#             return nif
#         id_letra = int(digits) % 23
#         control = secuenciaLetrasNIF[id_letra]
#         return nif + control
#     return nif

# def correct_nie(nif: str) -> str:
#     return correct_dni(nif)

# def get_dni_control_digit(digits):
#     id_letra = int(digits) % 23
#     control = secuenciaLetrasNIF[id_letra]
#     return control

# def get_nie_control_digit(digits):
#     id_letra = int(digits) % 23
#     control = secuenciaLetrasNIF[id_letra]
#     return control

# def get_cif_control_digit(letter, digits):
#     # Compute the control digit
#     sum_A = sum(int(d) for i, d in enumerate(digits) if i % 2)
#     sum_B = sum(
#         sum(int(x) for x in str(int(d) * 2)) for i, d in enumerate(digits) if not i % 2
#     )
#     sum_C = sum_A + sum_B
#     digit_E = sum_C % 10
#     D = 10 - digit_E if digit_E != 0 else 0
#     # Check the control digit
#     if letter in ["p", "q", "r", "s", "w"] or digits[:2] == "00":
#         # Control should be a letter
#         control = dig_control_num[D]
#     elif letter in ["a", "b", "e", "h"]:
#         # Control should be a number
#         control = str(D)
#     else:
#         # letter and number are valid
#         control = str(D)
#         control = dig_control_num[D]

# # Correct CIF errors (basic implementation)
# def correct_cif(cif: str) -> str:
#     if len(cif) == 8:
#         letter, digits, control = cif[0], cif[1:8], cif[8]
#         # Check the letter
#         if not letter in let2tipo:
#             if verbose:
#                 print("Error: invalid letter.")
#             return False
#         # Check the province
#         if digits[:2] not in dig2prov:
#             if verbose:
#                 print("Error: invalid province code.")
#             return False
#         return cif + control
#     return cif


# # Correct NIF errors
# def correct_nif_errors(nif: str) -> str:
#     nif_type = differentiate_nif(nif)
#     if nif_type in ["DNI", "NIE"]:
#         return correct_dni_or_nie(nif)
#     elif nif_type == "CIF":
#         return correct_cif(nif)
#     else:
#         return nif

# examples = {
#     "74102334Z": "dni",
#     "32916774W": "dni",
#     "32887108Y": "dni",
#     "42291031B": "dni",
#     "42291031B": "dni",
#     "X2196856B": "nie",
#     "Y1430513L": "nie",
#     "Z4046636K": "nie",
#     "X7527331Y": "nie",
#     "B78337896": "cif",
#     "Q3847694A": "cif",
#     "U59328039": "cif",
#     "U24649782": "cif",
#     "B28118271": "cif",
#     "G74931494": "cif",
#     "H05065545": "cif",
# }
# for e, t in examples.items():
#     print(e, t, differentiate_nif(e))
# invalid = companies.loc[all_ids.isna(), "ID"]
# display(invalid.head())
# corrected = invalid.apply(correct_nif)
# display(corrected.head())

In [16]:
all_ids = companies["ID"].apply(validate_nif)
display(all_ids.apply(bool).value_counts())
display(companies.loc[all_ids.apply(bool)].head())

ID
True     2519165
False      65216
Name: count, dtype: int64

Unnamed: 0,ID,CompanyTypeCode,Name,CityName,IdentificationCode,PostalZone,CountrySubentityCode
0,b30437347,,climayor s.l. b30437347,,,,
1,b60564309,,"gometrics, s.l.",,,,
2,g57694549,,associaciò alcem el c.i.n.e.,,,,
3,b73326019,,diseño y decoraciones j. peñalver s.l. b73326019,,,,
4,b28954170,,"thermo fisher scientific, s.l.",,,,


In [17]:
companies_clean = companies.copy(deep=True)
companies_clean = companies_clean.applymap(
    lambda x: regex.sub(r"((?<=\w+\W)\s+)|(\s+(?=\W\w+))", "", x)
    if not pd.isna(x)
    else None
)

In [18]:
# Validate NIF
companies_clean["ID"] = companies_clean["ID"].apply(validate_nif)

In [19]:
# Clean company type
name = []
for i, n in companies_clean[["ID", "Name"]].values:
    if pd.isna(n) or pd.isna(i):
        name.append(n)
    else:
        name.append(regex.sub(i, "", n) if not pd.isna(n) else None)
companies_clean["Name"] = name
companies_clean["Name"] = companies_clean["Name"].apply(clean_company_type, remove_type=False)

In [20]:
# Remove company type
companies_clean["Name_proc"] = companies_clean["Name"].apply(clean_company_type, remove_type=True)

In [25]:
# Normalize company name
companies_clean["Name_norm"] = companies_clean["Name_proc"].apply(normalize_company_name)

In [26]:
# companies_clean = clean_df(companies)
# Aggregate company info in lists
companies_clean = (
    companies_clean
    # companies[["ID", "Name", "Name_proc", "Name_norm"]]
    .groupby(["ID", "Name_norm"])
    .agg(list)
    .reset_index()
)
companies_clean["count"] = companies_clean["Name_proc"].apply(len)
companies_clean = companies_clean.reset_index()

#### Unique names and IDs

In [None]:
# # companies_clean["Name_proc"] = companies_clean["Name_proc"].apply(Counter)
# companies_clean["Name_proc"] = companies_clean["Name"]

In [None]:
# companies_clean[companies_clean["count"]>1].values

In [None]:
# companies_clean.head()

In [209]:
# Unique names and IDs
# These companies have always appeared with the same (id-name) association
cols_vals = [
    c for c in companies_clean.columns if c not in ["ID", "Name_norm", "count"]
]
unique_ID = ~companies_clean["ID"].duplicated(keep=False)
unique_NAME = ~companies_clean["Name_norm"].duplicated(keep=False)

# Unique by ID and name
unique = companies_clean[unique_ID & unique_NAME]

# Non unique IDs
non_unique_ids = list(set(companies_clean["index"]) - set(unique["index"]))
non_unique = companies_clean[companies_clean["index"].isin(non_unique_ids)]

unique["index"] = unique["index"].apply(lambda x: [x])
non_unique["index"] = non_unique["index"].apply(lambda x: [x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique["index"] = unique["index"].apply(lambda x: [x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_unique["index"] = non_unique["index"].apply(lambda x: [x])


In [211]:
unique.shape, non_unique.shape

((220884, 11), (192931, 11))

#### Repeated IDs and Names

In [212]:
# Choose definitive values
def suggest_value(elements):
    """
    Select elements based on appearance.
    If same number of appearances, choose the longest.
    If shorter elements are not included in the 'main' one, return all.
    """
    cnt = Counter(elements)
    cnt.pop(None, None)
    cnt = cnt.most_common()
    if cnt:
        max_cnt = cnt[0][1]
        els = sorted([k for k, v in cnt if v == max_cnt], key=lambda x: (-len(x), x))
        # return els[0]
        base = els.pop(0)
        return [base]
        # if all(
        #     [all(t in base for t in regex.sub(r"\W", " ", el).split()) for el in els]
        # ):
        #     return [base]
        # return [base] + els
    else:
        # return None
        return [None]


# Repeated IDs
def unify_repeated_col(df: pd.DataFrame, rep_col: str, un_col: str):
    """
    Takes a dataframe with duplicated values in one column that should be unique (e.g. repeated IDs)
    and another column that should also be unique given the previous one (e.g. title)
    and unifies it so that it chooses the best option.

    Parameters
    ----------
    df: pd.DataFrame
    rep_col: str
        Name of column with repeated values that will be unified
    un_col: str
        Name of column with non unique values
    """
    # Non-unique columns
    cols_vals = [c for c in df.columns if c not in [rep_col, "count", "index"]]
    repeated_rows = df[rep_col].duplicated(keep=False)
    repeated = df[repeated_rows]

    # Count times the values appear
    repeated.loc[repeated.index, [un_col]] = (
        repeated.loc[repeated.index, un_col].apply(lambda x: [x])
        * repeated.loc[repeated.index, "count"]
    )
    # Group by repeated
    repeated = repeated.reset_index()
    repeated = repeated.groupby(rep_col).agg(
        {
            # "index": list,
            "index": sum,
            **{c: lambda x: list(chain.from_iterable(x)) for c in cols_vals},
            "count": sum,
        }
    )
    # Get the most common values for each column
    repeated.loc[repeated.index, un_col] = (
        repeated.loc[repeated.index, un_col].apply(suggest_value).values
    )
    repeated = repeated.reset_index()

    # Concatenate unique
    use_index = repeated.loc[repeated[un_col].apply(len) == 1, un_col].index
    repeated.loc[use_index, un_col] = repeated.loc[use_index, un_col].apply(
        lambda x: x[0]
    )
    unified = repeated.loc[use_index]

    return unified

In [213]:
# sub = r"((?<=\w+\W)\s+)|(\s+(?=\W\w+))"
# print(regex.sub(sub, "", "amber-   louse   asdf sadf    s   kay"))
# print(regex.sub(sub, "", "apllication  &   licence"))

In [214]:
# Obtain unique ID-name
unified_ID = unify_repeated_col(non_unique, "ID", "Name_norm")
# Update non_unique
non_unique_ids = list(
    set(chain.from_iterable(non_unique["index"]))
    - set(chain.from_iterable(unified_ID["index"]))
)
# non_unique = companies_clean.loc[non_unique_ids]
non_unique = companies_clean[companies_clean["index"].isin(non_unique_ids)]
non_unique["index"] = non_unique["index"].apply(lambda x: [x])

In [219]:
# Obtain unique name-ID
unified_NAME = unify_repeated_col(non_unique, "Name_norm", "ID")
# Update non_unique
non_unique_ids = list(
    set(chain.from_iterable(non_unique["index"])) - set(chain.from_iterable(unified_NAME["index"]))
)
# non_unique = companies_clean.loc[non_unique_ids]
non_unique = companies_clean[companies_clean["index"].isin(non_unique_ids)]
non_unique["index"] = non_unique["index"].apply(lambda x: [x])

In [221]:
# Get the rest
# non_unique = non_unique.reset_index()
# non_unique["index"] = non_unique["index"].apply(lambda x: [x])

#### Companies info

In [223]:
# display(unique.head())
# display(unified_ID.head())
# display(unified_NAME.head())
# display(non_unique.head())

In [241]:
# len(
#     list(chain.from_iterable(unique["index"].values))
#     + list(chain.from_iterable(unified_ID["index"].values))
#     + list(chain.from_iterable(unified_NAME["index"].values))
#     + list(chain.from_iterable(non_unique["index"].values))
# )

In [251]:
# Global
# Merge unique+unifiedID+unifiedName+nonUnique
merged_global = pd.concat([unique, unified_ID, unified_NAME, non_unique])
cols_vals = [c for c in merged_global.columns if c not in ["ID", "Name_norm", "count", "index"]]
merged_global = merged_global.groupby(["ID", "Name_norm"]).agg(
    {
        # "index": lambda x: list(chain.from_iterable(x)),
        "index": sum,
        **{c: lambda x: list(chain.from_iterable(x)) for c in cols_vals},
        "count": sum,
    }
)
merged_global = merged_global.reset_index()

In [258]:
# merged_global[merged_global["Name_norm"]=="mediamarkt"]

In [252]:
# Unify all branches into one??
unified_ID2 = unify_repeated_col(merged_global, "ID", "Name_norm")
merged_global2 = merged_global[~merged_global["ID"].isin(unified_ID2["ID"])]
# non_unique = companies.loc[non_unique_ids]

unified_NAME2 = unify_repeated_col(merged_global2, "Name_norm", "ID")
merged_global2 = merged_global2[
    ~merged_global2["Name_norm"].isin(unified_NAME2["Name_norm"])
]
# # Update non_unique
# non_unique_ids = list(set(non_unique.index) - set(chain.from_iterable(unified_NAME["index"])))
# non_unique = companies.loc[non_unique_ids]

final_merged = pd.concat(
    [
        merged_global2,
        unified_ID2,
        unified_NAME2,
    ]
).reset_index(drop=True)

final_merged.loc[final_merged.index, cols_vals] = (
    final_merged.loc[final_merged.index, cols_vals]
    .applymap(suggest_value)
    .applymap(lambda x: x[0])
)

In [259]:
# final_merged[final_merged["Name_norm"]=="mediamarkt"]

In [266]:
# companies.loc[companies["ID"] == "d28337145"]

In [267]:
# companies_clean.loc[[356776, 356777, 356778, 356779, 356780, 356781]]

In [268]:
# final_merged.loc[final_merged["ID"] == "d28337145"]

In [269]:
# # Find UTEs based on name
# ute_n = (
#     final_merged[["Name", "Name_proc"]]
#     .apply(lambda x: regex.search(r"u.t.e.", " ".join(x)), axis=1)
#     .apply(bool)
# )
# # Find UTEs based on ID
# ute_i = final_merged["ID"].apply(lambda x: x.startswith("u"))

# # final_merged[ute_i | ute_n]["Name"].values
# # sum(ute_n), sum(ute_i), sum(ute_n & ute_i), sum(ute_n & ute_i)/min(sum(ute_n), sum(ute_i))

In [271]:
# final_merged[ute_i].head()

## Empresas Zaragoza

In [272]:
# df = pd.read_csv(r"C:\Users\josea\Downloads\empresas.csv", sep=";", header=0, nrows=64, index_col=False)
# df = pd.read_excel(r"C:\Users\josea\Downloads\empresas.xlsx")

with open(r"C:\Users\josea\Downloads\empresas_zgz.csv", "r", encoding="utf-8") as f:
    emp = [
        [el.replace('"', "").strip() for el in l.lower().strip().split(";", 4)]
        for l in f.readlines()
        if len(l) > 2
    ]
cols = emp[0]
data = emp[1:]
emp_zgz = pd.DataFrame(data=data, columns=cols)
emp_zgz = emp_zgz.applymap(lambda x: x if x else None)
emp_zgz = emp_zgz.dropna().drop_duplicates().reset_index(drop=True)
emp_zgz["empresa"] = emp_zgz["empresa"].apply(replace_company_types)
emp_zgz["nif"] = emp_zgz["nif"].apply(lambda x: regex.sub(r"\W", "", x))
emp_zgz["nif_type"] = emp_zgz["nif"].apply(get_nif_type)

In [273]:
emp_zgz.head()

Unnamed: 0,nombre,expediente,empresa,nif,texto,nif_type
0,alquiler de un equipo de iluminación para esce...,cul09-19,fuse records-fluge zaragoza s.l.,b99053506,https://contrataciondelestado.es/wps/poc?uri=d...,CIF
1,derribo de la estructura actual de polipasto y...,eco2019335-19,depuracion de aguas de mediterraneo s.l.,b96456553,https://contrataciondelestado.es/wps/poc?uri=d...,CIF
2,obras de acondicionamiento de la planta 3ª de ...,viv01-19,inardec construccion creativa s.l.,b99413874,https://contrataciondelestado.es/wps/poc?uri=d...,CIF
3,obras de acondicionamiento de la planta 3ª de ...,viv01-19,construcciones rubio morte s.a.,a50070663,https://contrataciondelestado.es/wps/poc?uri=d...,CIF
4,servicio para la realización del programa depo...,dep01-2019,serveo servicios s.a.u. antigua (ferrovial ser...,a80241789,https://contrataciondelestado.es/wps/poc?uri=d...,CIF


## Compare companies from Tenders and Zaragoza

In [274]:
# Same nif
common_nif = set(final_merged["ID"]) & set(emp_zgz["nif"])
common_comp_nif = final_merged[final_merged["ID"].isin(common_nif)][
    ["ID", "Name", "count"]
].drop_duplicates()
common_emp_nif = emp_zgz[emp_zgz["nif"].isin(common_nif)][
    ["nif", "empresa"]
].drop_duplicates()
c_nif = pd.merge(
    common_comp_nif, common_emp_nif, left_on="ID", right_on="nif"
).reset_index(drop=True)
c_nif.head()

Unnamed: 0,ID,Name,count,nif,empresa
0,07046603r,alba mª ortega altamirano,1,07046603r,alba m.ª ortega altamirano
1,07823314w,jesus gutierrez sierra,225,07823314w,jesús gutiérrez sierra
2,17144195h,jaime macipe gayarre,8,17144195h,jaime macipe gayarre
3,17215345y,pascual pardos sierra,1,17215345y,pascual pardos sierra
4,17747571n,dª maría nila atienza fanlo,1,17747571n,mª nila atienza fanlo


In [275]:
# Same name
common_name = set(final_merged["Name"]) & set(emp_zgz["empresa"])
common_comp_name = final_merged[final_merged["Name"].isin(common_name)][
    ["ID", "Name", "count"]
].drop_duplicates()
common_emp_name = emp_zgz[emp_zgz["empresa"].isin(common_name)][
    ["nif", "empresa"]
].drop_duplicates()
c_name = pd.merge(
    common_comp_name, common_emp_name, left_on="Name", right_on="empresa"
).reset_index(drop=True)
c_name.head()

Unnamed: 0,ID,Name,count,nif,empresa
0,07992498k,juan gabriel rodriguez holgado,2,7992498k,juan gabriel rodriguez holgado
1,17144195h,jaime macipe gayarre,8,17144195h,jaime macipe gayarre
2,17215345y,pascual pardos sierra,1,17215345y,pascual pardos sierra
3,25138444l,susana del río sanz,1,25138444l,susana del río sanz
4,25445647b,leonardo oro vargas,2,25445647b,leonardo oro vargas


In [276]:
common_comp_name

Unnamed: 0,ID,Name,count
11181,07992498k,juan gabriel rodriguez holgado,2
22379,17144195h,jaime macipe gayarre,8
22525,17215345y,pascual pardos sierra,1
34881,25138444l,susana del río sanz,1
35641,25445647b,leonardo oro vargas,2
...,...,...,...
287829,a79252219,securitas seguridad españa s.a.,878
287853,a25027145,servicios microinformatica s.a.,1610
287916,a08602815,talher s.a.,243
287996,a81356313,unitronics comunicaciones s.a.,162


In [277]:
# Common name, different nif
c_name_diff_nif_comp_ids = list(
    set(common_comp_name.index) - set(common_comp_nif.index)
)
c_name_diff_nif_emp_ids = list(set(common_emp_name.index) - set(common_emp_nif.index))

In [278]:
emp_zgz.loc[c_name_diff_nif_emp_ids].head()

Unnamed: 0,nombre,expediente,empresa,nif,texto,nif_type
450,suministro e instalación de cuatro pantallas d...,0021483-21,imagic vision s.l.,b840444205,https://contrataciondelestado.es/wps/poc?uri=d...,
387,servicio de mantenimiento del diseño gráfico d...,0020126-21,ht publicidad grupo tafalla s.l.,g50493097,https://contrataciondelestado.es/wps/poc?uri=d...,CIF
39,explotación del servicio de bares y hostelería...,dep08-19,beberapid c.b.,e98134752,https://contrataciondelestado.es/wps/poc?uri=d...,CIF
552,contratación de los servicios de bar-cafetería...,0076573-21,ansamater s.l.,bb09671256,https://contrataciondelestado.es/wps/poc?uri=d...,
16,"suministro de reactivos, material fungible, me...",0529565-19,sanilabo s.l.,a96141361,https://contrataciondelestado.es/wps/poc?uri=d...,CIF


In [279]:
final_merged.loc[c_name_diff_nif_comp_ids].head()

Unnamed: 0,ID,Name_norm,index,CompanyTypeCode,Name,CityName,IdentificationCode,PostalZone,CountrySubentityCode,Name_proc,count
241254,b96911037,bluedec,"[345549, 345550, 345551, 345552, 345553, 345554]",,bluedec s.l.,,es,,es523,bluedec,49
218790,b84044205,imagicvision,[313116],,imagic vision s.l.,madrid,es,28033.0,es43,imagic vision,10
184139,b50941855,agenciaaragonesadenoticias,[260591],,agencia aragonesa de noticias s.l.,,,,,agencia aragonesa de noticias,10
243852,b98134752,beberapid,[349790],,beberapid c.b.,,,,,beberapid,1
11181,07992498k,juangabrielrodriguezholgado,"[14027, 14028]",,juan gabriel rodriguez holgado,,,,,juan gabriel rodriguez holgado,2


In [None]:
# c_concat = pd.concat([c_nif, c_name])
# c_concat[~c_concat.duplicated(keep=False)]

In [280]:
set(emp_zgz["nif"]) - set(final_merged["ID"])

{'17410114f',
 '17732626v',
 '17763291t',
 '18426455m',
 '514863471',
 '5412400v',
 '7992498k',
 'a27178789b50113562',
 'a50004431',
 'a50016666',
 'a50032002',
 'a50169333',
 'a81196743',
 'a96141361',
 'b01644889',
 'b06851471',
 'b50349323',
 'b50709641',
 'b50755149',
 'b50931302',
 'b54941855',
 'b673750006',
 'b79309902',
 'b82381682',
 'b840444205',
 'b84049238',
 'b85508059',
 'b9691037',
 'b99399438',
 'b99412629',
 'b99532574',
 'bb09671256',
 'e98134752',
 'g50493097',
 'n0171609a',
 'q99118598',
 'siret56211521200077',
 'sl',
 'u02778884',
 'u99489411'}

In [282]:
# common_nif = set(df["nif"]) - set(final_merged["ID"])
# common_comp = final_merged[final_merged["ID"].isin(common_nif)][
#     ["ID", "Name"]
# ].drop_duplicates()
# common_emp = df[df["nif"].isin(common_nif)][["nif", "empresa"]].drop_duplicates()

In [None]:
# dup_emp = df["empresa"].duplicated(keep=False)
# dup_nif = df["nif"].duplicated(keep=False)