# Imports

In [1]:
import json
import time
from collections import Counter
from itertools import chain
from pathlib import Path

import numpy as np
import pandas as pd
import re
import regex

from src.companies.processor import clean_company_type, normalize_company_name
from src.nif_validation.validation import (
    get_info_from_cif,
    get_nif_type,
    is_valid_nif,
    validate_nif,
)
from src.utils.utils import fill_to_length, merge_orig_dataframes
from src.utils.utils_parallelization import parallelize_function

  from .autonotebook import tqdm as notebook_tqdm


# Aux functions
Functions necessary for processing the cells

In [2]:
def nif_from_name(name):
    """Searches whether the NIF is included in the name and separates it"""
    name_spl = np.array(name.split())
    valid = np.array([bool(validate_nif(s)) for s in name_spl])
    new_name = " ".join(name_spl[~valid])
    new_nif = Counter(name_spl[valid]).most_common()[0][0] if valid.any() else np.nan
    return new_name, new_nif

In [3]:
import contextlib

@contextlib.contextmanager
def log_time(task_name: str):
    """Context manager to log the execution time of a block of code."""
    t0 = time.time()
    yield
    t1 = time.time()
    print(f"{task_name} - {t1-t0}")

    
def execute_function(func, data, prefer=None, workers=-1, *args, **kwargs):
    """Wrapper function to decide whether to use parallel processing or not."""
    if not prefer:
        return data.apply(func, *args, **kwargs)
    else:
        return parallelize_function(
            func,
            data,
            workers=workers,
            prefer=prefer,
            show_progress=True,
            leave=True,
            position=0,
            *args,
            **kwargs,
        )

    
def clean_df(df: pd.DataFrame, prefer=None, workers=-1):
    # Remove unwanted whitespace
    with log_time("Removing unwanted whitespace"):
        df = df.applymap(
            lambda x: regex.sub(r"((?<=\w+\W)\s+)|(\s+(?=\W\w+))", "", x)
            if not pd.isna(x)
            else None
        )

    # Validate NIF
    with log_time("Validating NIF"):
        df["ID"] = execute_function(validate_nif, df["ID"], prefer, workers)

    # Clean company type
    with log_time("Cleaning company type"):
        name = [
            regex.sub(i, "", n) if not (pd.isna(n) or pd.isna(i)) else n
            for i, n in df[["ID", "Name"]].values
        ]
        df["Name"] = execute_function(
            clean_company_type, name, prefer, workers, remove_type=False
        )

    # Remove company type
    with log_time("Removing company type"):
        df["Name_proc"] = execute_function(
            clean_company_type, df["Name"], prefer, workers, remove_type=True
        )

    # Normalize company name
    with log_time("Normalizing company name"):
        df["Name_norm"] = execute_function(
            normalize_company_name, df["Name_proc"], prefer, workers
        )

    return df

# Merge data from different sources

In [12]:
!ls

LICENSE				     data
OUTSIDERS_VERIFICAR_FORMATO.csv      detectar_idioma.ipynb
PLACE_datos_GENCAT.csv		     df_gencat_id.csv
PLACE_datos_GENCAT_27marzo.csv	     df_place_matched.csv
PLACE_datos_GENCAT_LAST_VERSION.csv  match_companies.ipynb
README.en.md			     match_tender_new_version.ipynb
README.md			     src
TABLA_COMPANIES.csv		     ute_resolver
Untitled.ipynb			     venv
construir_tabla_companies.py


In [6]:
df_out = pd.read_parquet("/export/usuarios_ml4ds/cggamella/NP-Company-Process/data/DESCARGAS_ENTREGABLES/outsiders.parquet")
#df_in =  pd.read_parquet("/export/usuarios_ml4ds/cggamella/NP-Company-Process/data/DESCARGAS_ENTREGABLES/insiders.parquet")
#df_min =  pd.read_parquet("/export/usuarios_ml4ds/cggamella/NP-Company-Process/data/DESCARGAS_ENTREGABLES/minors.parquet")

In [7]:
def unify_colname(col):
    return ".".join([el for el in col if el])

In [8]:
# Aplica la función a cada columna en el DataFrame MultiIndex
df_out.columns = [unify_colname(col) for col in df_out.columns]
#df_in.columns = [unify_colname(col) for col in df_in.columns]
#df_min.columns = [unify_colname(col) for col in df_min.columns]

In [23]:
df_out

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id,link,summary,title,ContractFolderStatus.ContractFolderID,ContractFolderStatus.LocatedContractingParty.Party.PartyName.Name,ContractFolderStatus.ProcurementProject.Name,ContractFolderStatus.ProcurementProject.TypeCode,ContractFolderStatus.ProcurementProject.BudgetAmount.EstimatedOverallContractAmount,ContractFolderStatus.ProcurementProject.BudgetAmount.TaxExclusiveAmount,...,ContractFolderStatus.ValidNoticeInfo.AdditionalPublicationStatus.AdditionalPublicationRequest.SendTime,ContractFolderStatus.UUID,ContractFolderStatus.TenderResult.Contract.ID,ContractFolderStatus.TenderResult.Contract.IssueDate,ContractFolderStatus.ProcurementProjectLot.ID,ContractFolderStatus.ProcurementProjectLot.IDschemeName,ContractFolderStatus.ProcurementProjectLot.ProcurementProject.Name,updated,ContractFolderStatus.ContractFolderStatusCode,deleted_on
zip,file name,entry,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
PlataformasAgregadasSinMenores_2018.zip,PlataformasAgregadasSinMenores_20180217_180137_1.atom,453,https://contrataciondelestado.es/sindicacion/P...,https://perfildecontratante.sede.diputaciondev...,"Expediente: 1284/17, Entidad: Diputación Provi...",Refuerzo de Firme en la VP 3001 Renedo de Esgu...,1284/17,Diputación Provincial de Valladolid,Refuerzo de Firme en la VP 3001 Renedo de Esgu...,3.0,89917.95,89917.95,...,[nan],,[nan],[nan],[nan],[nan],[nan],[2018-01-02 08:01:52.024000+00:00],[RES],NaT
PlataformasAgregadasSinMenores_2018.zip,PlataformasAgregadasSinMenores_20180217_180137_1.atom,452,https://contrataciondelestado.es/sindicacion/P...,https://perfildecontratante.sede.diputaciondev...,"Expediente: 1282/17, Entidad: Diputación Provi...",Refuerzo de Firme en la VP 6603 Mota del Marqu...,1282/17,Diputación Provincial de Valladolid,Refuerzo de Firme en la VP 6603 Mota del Marqu...,3.0,175708.46,175708.46,...,[nan],,[nan],[nan],[nan],[nan],[nan],[2018-01-02 08:02:24.833000+00:00],[RES],NaT
PlataformasAgregadasSinMenores_2018.zip,PlataformasAgregadasSinMenores_20180217_180137_1.atom,451,https://contrataciondelestado.es/sindicacion/P...,https://perfildecontratante.sede.diputaciondev...,"Expediente: 1281/17, Entidad: Diputación Provi...",Refuerzo de firme en la VP 4013 Melgar de Arri...,1281/17,Diputación Provincial de Valladolid,Refuerzo de firme en la VP 4013 Melgar de Arri...,3.0,229259.52,229259.52,...,[nan],,[nan],[nan],[nan],[nan],[nan],[2018-01-02 08:02:51.744000+00:00],[RES],NaT
PlataformasAgregadasSinMenores_2018.zip,PlataformasAgregadasSinMenores_20180217_180137_1.atom,448,https://contrataciondelestado.es/sindicacion/P...,https://apps.euskadi.eus/w32-1084/es/contenido...,Id licitación: 2017017; Órgano de Contratació...,Desarrollo del programa de intervención socioe...,2017017,Alcalde del Ayuntamiento de Eibar,Desarrollo del programa de intervención socioe...,2.0,704145.00,361100.00,...,[nan],,[nan],[nan],[nan],[nan],[nan],[2018-01-02 09:25:52.396000+00:00],[RES],NaT
PlataformasAgregadasSinMenores_2018.zip,PlataformasAgregadasSinMenores_20180217_180137_1.atom,447,https://contrataciondelestado.es/sindicacion/P...,https://apps.euskadi.eus/w32-1084/es/contenido...,Id licitación: B2017002; Órgano de Contrataci...,STAND DE EUSKADI EN FITUR Y SUS POSIBLES ADAP...,B2017002,Dirección general de BASQUETOUR,"Diseño, construcción en régimen de alquiler, t...",2.0,1150000.00,175000.00,...,[nan],,[nan],[nan],[nan],[nan],[nan],[2018-01-02 09:25:52.501000+00:00],[RES],NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PlataformasAgregadasSinMenores_202401.zip,PlataformasAgregadasSinMenores.atom,4,https://contrataciondelestado.es/sindicacion/P...,https://www.juntadeandalucia.es/haciendayadmin...,Id licitación: CONTR 2023 0000922771; Órgano d...,concesion de servicios de peluquerías en 10 cp...,CONTR 2023 0000922771,"Delegación Territorial de Inclusión Social, Ju...",concesion de servicios de peluquerías en 10 cp...,1.0,1583600.00,0.00,...,[nan],,[nan],[nan],"[[1, 10, 2, 3, 4, 5, 6, 7, 8, 9]]","[['ID_LOTE', 'ID_LOTE', 'ID_LOTE', 'ID_LOTE', ...","[['lote 1 peluquería cpa buenos aires', 'lote ...",[2024-02-01 00:10:52.032000+00:00],[EV],NaT
PlataformasAgregadasSinMenores_202401.zip,PlataformasAgregadasSinMenores.atom,3,https://contrataciondelestado.es/sindicacion/P...,https://www.juntadeandalucia.es/haciendayadmin...,Id licitación: CR050-23-087B; Órgano de contra...,Servicios y suministros para la seguridad inte...,CR050-23-087B,"Verificaciones Industriales de Andalucía, S.A....",Servicios y suministros para la seguridad inte...,1.0,59737.50,59737.50,...,[nan],,[nan],[nan],[nan],[nan],[nan],"[2024-01-17 12:57:01.984000+00:00, 2024-02-01 ...","[PUB, EV]",NaT
PlataformasAgregadasSinMenores_202401.zip,PlataformasAgregadasSinMenores.atom,2,https://contrataciondelestado.es/sindicacion/P...,https://www.juntadeandalucia.es/haciendayadmin...,Id licitación: CONTR 2023 0001215494; Órgano d...,Contrato de concesión de servicios para la ges...,CONTR 2023 0001215494,Agencia de Innovación y Desarrollo de Andalucí...,Contrato de concesión de servicios para la ges...,1.0,5612618.18,0.00,...,[nan],,[nan],[nan],[nan],[nan],[nan],"[2024-01-04 11:12:05.923000+00:00, 2024-01-29 ...","[PUB, PUB, EV]",NaT
PlataformasAgregadasSinMenores_202401.zip,PlataformasAgregadasSinMenores.atom,1,https://contrataciondelestado.es/sindicacion/P...,https://www.juntadeandalucia.es/haciendayadmin...,Id licitación: CONTR 2023 0001095699; Órgano d...,"Investigación, validación, verificación y gest...",CONTR 2023 0001095699,Instituto Andaluz del Patrimonio Histórico,"Investigación, validación, verificación y gest...",1.0,34510.22,34510.22,...,[nan],,[nan],[nan],[nan],[nan],[nan],"[2024-01-17 13:12:59.213000+00:00, 2024-02-01 ...","[PUB, EV]",NaT


In [9]:
df_companies = merge_orig_dataframes(dir_metadata=Path("/export/usuarios_ml4ds/cggamella/NP-Company-Process/data/DESCARGAS_ENTREGABLES/"))
#df_companies.to_parquet("data/companies.parquet")

#df_companies = pd.read_parquet("data/companies.parquet")

In [10]:
df_companies

Unnamed: 0_level_0,SMEAwardedIndicator,ID,IDschemeName,CompanyTypeCode,Name,CityName,IdentificationCode,PostalZone,CountrySubentityCode,id_tender
identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
contratosMenoresPerfilesContratantes_2018.zip/contratosMenoresPerfilesContratantes_20190225_140722_12.atom/499,[None],[b30437347],[nif],[None],[climayor s.l. b30437347],[None],[None],[None],[None],[https://contrataciondelestado.es/sindicacion/...
contratosMenoresPerfilesContratantes_2018.zip/contratosMenoresPerfilesContratantes_20190225_140722_12.atom/498,[None],[b60564309],[nif],[None],"[gometrics, s.l.]",[None],[None],[None],[None],[https://contrataciondelestado.es/sindicacion/...
contratosMenoresPerfilesContratantes_2018.zip/contratosMenoresPerfilesContratantes_20190225_140722_12.atom/497,[None],[g57694549],[nif],[None],[associaciò alcem el c.i.n.e.],[None],[None],[None],[None],[https://contrataciondelestado.es/sindicacion/...
contratosMenoresPerfilesContratantes_2018.zip/contratosMenoresPerfilesContratantes_20190225_140722_12.atom/496,[None],[None],[None],[None],[None],[None],[None],[None],[None],[https://contrataciondelestado.es/sindicacion/...
contratosMenoresPerfilesContratantes_2018.zip/contratosMenoresPerfilesContratantes_20190225_140722_12.atom/495,[None],[b73326019],[nif],[None],[diseño y decoraciones j. peñalver s.l. b73326...,[None],[None],[None],[None],[https://contrataciondelestado.es/sindicacion/...
...,...,...,...,...,...,...,...,...,...,...
PlataformasAgregadasSinMenores_202401.zip/PlataformasAgregadasSinMenores.atom/4,[None],[None],[None],[None],[None],[None],[None],[None],[None],[https://contrataciondelestado.es/sindicacion/...
PlataformasAgregadasSinMenores_202401.zip/PlataformasAgregadasSinMenores.atom/3,[None],[None],[None],[None],[None],[None],[None],[None],[None],[https://contrataciondelestado.es/sindicacion/...
PlataformasAgregadasSinMenores_202401.zip/PlataformasAgregadasSinMenores.atom/2,[None],[None],[None],[None],[None],[None],[None],[None],[None],[https://contrataciondelestado.es/sindicacion/...
PlataformasAgregadasSinMenores_202401.zip/PlataformasAgregadasSinMenores.atom/1,[None],[None],[None],[None],[None],[None],[None],[None],[None],[https://contrataciondelestado.es/sindicacion/...


In [7]:
#df_companies.to_csv("TABLA_COMPANIES.csv", index = False)

# Obtain individual companies

In [11]:
# Use only those where all dimensions match
# (e.g. same number of companies and companies ids) and drop NAs
df_companies = df_companies[
    df_companies[["ID", "Name"]]
    .applymap(lambda x: not pd.isna(x[0]))
    .apply(all, axis=1)
]
df_companies = df_companies[
    df_companies.applymap(lambda x: len(x) if x[0] else None).apply(
        lambda x: len(set([el for el in x if not pd.isnull(el)])) == 1,
        axis=1,
    )
]
companies_columns = list(df_companies.columns)
# Get number of companies by tender
df_companies["_len"] = df_companies["ID"].apply(len)

# Fill lists of None to have the same number of elements and explode later
companies = pd.DataFrame(
    df_companies.apply(
        lambda x: [fill_to_length(list(el), x[-1]) for el in x[:-1]], axis=1
    ).tolist(),
    columns=companies_columns,
)

# Split companies in rows
companies = companies.explode(companies_columns)
companies = companies.reset_index(drop=True)
display(companies.head())

Unnamed: 0,SMEAwardedIndicator,ID,IDschemeName,CompanyTypeCode,Name,CityName,IdentificationCode,PostalZone,CountrySubentityCode,id_tender
0,,b30437347,nif,,climayor s.l. b30437347,,,,,https://contrataciondelestado.es/sindicacion/d...
1,,b60564309,nif,,"gometrics, s.l.",,,,,https://contrataciondelestado.es/sindicacion/d...
2,,g57694549,nif,,associaciò alcem el c.i.n.e.,,,,,https://contrataciondelestado.es/sindicacion/d...
3,,b73326019,nif,,diseño y decoraciones j. peñalver s.l. b73326019,,,,,https://contrataciondelestado.es/sindicacion/d...
4,,b28954170,nif,,"thermo fisher scientific, s.l.",,,,,https://contrataciondelestado.es/sindicacion/d...


### Foreign/European IDs

In [12]:
identificadores_pais = pd.read_csv("/export/usuarios_ml4ds/lbartolome/NextProcurement/NP-Company-Process/src/nif_validation/data/identificador_paises.csv")

In [13]:
companies_foreign = companies[
     companies["ID"].apply(
         lambda x: x[:2] in identificadores_pais["identificador"].values
     )
 ]
companies_foreign["ID"].apply(lambda x: x[:2]).value_counts()

ID
de    6126
fr    2938
nl    2359
es    1279
pt    1140
be    1126
ie    1036
it     886
se     324
at     304
lu     256
cz     220
dk     195
pl     147
fi      94
lt      72
el      57
bg      52
lv      48
hu      43
hr      41
si      38
cy      30
ro      24
ee      23
sk      22
mt      16
xi       2
Name: count, dtype: int64

In [14]:
# European companies that are from Spain
companies_foreign_es = companies_foreign[
     companies_foreign["ID"].apply(lambda x: x[:2] == "es")
]
companies_foreign_es[
     companies_foreign_es["ID"].apply(lambda x: validate_nif(x[2:])).isna()
]

Unnamed: 0,SMEAwardedIndicator,ID,IDschemeName,CompanyTypeCode,Name,CityName,IdentificationCode,PostalZone,CountrySubentityCode,id_tender
5709,,esa4113203,otros,,"ayesa advanced technologies, s.a.",,,,,https://contrataciondelestado.es/sindicacion/d...
7343,,es000a46041711,otros,,becsa,,,,,https://contrataciondelestado.es/sindicacion/d...
9009,,es000a46041711,otros,,becsa,,,,,https://contrataciondelestado.es/sindicacion/d...
9062,,es000a46076873,otros,,durantia,,,,,https://contrataciondelestado.es/sindicacion/d...
9068,,es000a16134629,otros,,"alvaro villescusa, s.a.",,,,,https://contrataciondelestado.es/sindicacion/d...
...,...,...,...,...,...,...,...,...,...,...
2504449,true,esb84050140,otros,,"abaco asesores periciales, s.l.p.",,es,,,https://contrataciondelestado.es/sindicacion/l...
2546991,false,es-006232196-r y es-002237807-e,otros,,d. félix iniesta moreno-manzanaro y dª adoraci...,,es,,,https://contrataciondelestado.es/sindicacion/l...
2576174,true,es00002169646x,otros,,vila tejero maría dolores,,es,,,https://contrataciondelestado.es/sindicacion/l...
2649193,,esb6614930,otros,,"cardiosos global protection, s.l",,,,,https://contrataciondelestado.es/sindicacion/P...


In [15]:
#Companies with foreign ID that have a valid NIF in Spain
companies_foreign_valid = companies_foreign.loc[
     companies_foreign["ID"].apply(lambda x: validate_nif(x[2:])).dropna().index
]
companies_foreign_valid

Unnamed: 0,SMEAwardedIndicator,ID,IDschemeName,CompanyTypeCode,Name,CityName,IdentificationCode,PostalZone,CountrySubentityCode,id_tender
21295,,esn0071290a,otros,,microsoft,,,,,https://contrataciondelestado.es/sindicacion/d...
23146,,esq2820002j,otros,,ciemat,,,,,https://contrataciondelestado.es/sindicacion/d...
31817,,esb06195044,otros,,borrego cintas e hijos. s.l.,,,,,https://contrataciondelestado.es/sindicacion/d...
35890,,esa28315539,otros,,"bruker española, sa",,,,,https://contrataciondelestado.es/sindicacion/d...
52084,,esa08431090,otros,,"gas natural servicios sdg, s.a.",,,,,https://contrataciondelestado.es/sindicacion/d...
...,...,...,...,...,...,...,...,...,...,...
2770135,,esw0393044c,otros,,"hitachi zosen inova, ag",,,,,https://contrataciondelestado.es/sindicacion/P...
2803722,,esa58112590,otros,,"salter sport, s.a.",,,,,https://contrataciondelestado.es/sindicacion/P...
2821839,,esb58521147,otros,,leica microsistemas s.l.u.,,,,,https://contrataciondelestado.es/sindicacion/P...
2822304,,esb65947814,otros,,sequentia biotech,,,,,https://contrataciondelestado.es/sindicacion/P...


### Clean companies info

In [16]:
with log_time("Clean df"):
    companies_clean = clean_df(companies, prefer="processes", workers=-1)

Removing unwanted whitespace - 80.44970393180847


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2833999/2833999 [00:36<00:00, 77756.50it/s]


Validating NIF - 36.891074895858765


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2833999/2833999 [00:45<00:00, 62175.18it/s]


Cleaning company type - 234.02746510505676


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2833999/2833999 [01:27<00:00, 32478.25it/s]


Removing company type - 87.85957860946655


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2833999/2833999 [00:34<00:00, 81522.41it/s]


Normalizing company name - 35.170982360839844
Clean df - 474.67882084846497


In [17]:
# Aggregate company info in lists
companies_clean["SMEAwardedIndicator"] = companies_clean["SMEAwardedIndicator"].apply(
    lambda x: None if not x else True if x == "true" else False
)
companies_clean = (
    companies_clean
    # companies[["ID", "Name", "Name_proc", "Name_norm"]]
    .groupby(["ID", "Name_norm"])
    .agg(list)
    .reset_index()
)
companies_clean["count"] = companies_clean["Name_proc"].apply(len)
companies_clean = companies_clean.reset_index()

In [18]:
companies_clean

Unnamed: 0,index,ID,Name_norm,SMEAwardedIndicator,IDschemeName,CompanyTypeCode,Name,CityName,IdentificationCode,PostalZone,CountrySubentityCode,id_tender,Name_proc,count
0,0,00000969a,joanobradorspuigdellivol,[None],[otros],[None],[joan obradors puigdellivol],[None],[None],[None],[None],[https://contrataciondelestado.es/sindicacion/...,[joan obradors puigdellivol],1
1,1,00001014w,elcorteingles,[False],[nif],[None],[el corte ingles s.a.],[None],[None],[None],[None],[https://contrataciondelestado.es/sindicacion/...,[el corte ingles],1
2,2,00021492x,carmenbalgueriasjimenez,"[None, None]","[nif, nif]","[None, None]","[carmen balguerias jiménez, carmen balguerias ...","[None, None]","[None, None]","[None, None]","[None, None]",[https://contrataciondelestado.es/sindicacion/...,"[carmen balguerias jiménez, carmen balguerias ...",2
3,3,00035211k,palomasainzdelamazadelaserna,[None],[nif],[None],[paloma sáinz de la maza de la serna],[None],[None],[None],[None],[https://contrataciondelestado.es/sindicacion/...,[paloma sáinz de la maza de la serna],1
4,4,00067665e,albertodelgadocebrian,[None],[nif],[None],[alberto delgado cebrián],[None],[None],[None],[None],[https://contrataciondelestado.es/sindicacion/...,[alberto delgado cebrián],1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
450314,450314,z0226636e,joseeduardocastrorojas,[False],[otros],[None],[jose eduardo castro rojas],[None],[es],[None],[None],[https://contrataciondelestado.es/sindicacion/...,[jose eduardo castro rojas],1
450315,450315,z0329585t,claudiadelpilargarciaguzman,[True],[nif],[None],[claudia del pilar garcia guzman],[san clemente],[es],[16600.0],[es423],[https://contrataciondelestado.es/sindicacion/...,[claudia del pilar garcia guzman],1
450316,450316,z0351144p,cristhelsteffyurbina,[True],[nif],[None],[cristhel steffy urbina],[higuera de vargas],[es],[6132.0],[es43],[https://contrataciondelestado.es/sindicacion/...,[cristhel steffy urbina],1
450317,450317,z0425299b,gabrielgardiol,[True],[otros],[None],[gabriel gardiol],[None],[es],[None],[None],[https://contrataciondelestado.es/sindicacion/...,[gabriel gardiol],1


#### Unique names and IDs

In [19]:
# Unique names and IDs
# These companies have always appeared with the same (id-name) association
cols_vals = [
    c for c in companies_clean.columns if c not in ["ID", "Name_norm", "count"]
]
unique_ID = ~companies_clean["ID"].duplicated(keep=False)
unique_NAME = ~companies_clean["Name_norm"].duplicated(keep=False)

# Unique by ID and name
unique = companies_clean[unique_ID & unique_NAME].copy()

# Non unique IDs
non_unique_ids = list(set(companies_clean["index"]) - set(unique["index"]))
non_unique = companies_clean[companies_clean["index"].isin(non_unique_ids)].copy()

unique["index"] = unique["index"].apply(lambda x: [x])
non_unique["index"] = non_unique["index"].apply(lambda x: [x])
print(unique.shape, non_unique.shape)

(238482, 14) (211837, 14)


#### Repeated IDs and Names

In [20]:
# Choose definitive values
def suggest_value(elements):
    """
    Select elements based on appearance.
    If same number of appearances, choose the longest.
    If shorter elements are not included in the 'main' one, return all.
    """
    cnt = Counter(elements)
    cnt.pop(None, None)
    cnt = cnt.most_common()
    if cnt:
        max_cnt = cnt[0][1]
        els = sorted([k for k, v in cnt if v == max_cnt], key=lambda x: (-len(x), x))
        # return els[0]
        base = els.pop(0)
        return [base]
        # if all(
        #     [all(t in base for t in regex.sub(r"\W", " ", el).split()) for el in els]
        # ):
        #     return [base]
        # return [base] + els
    else:
        # return None
        return [None]


# Repeated IDs
def unify_repeated_col(df: pd.DataFrame, rep_col: str, un_col: str):
    """
    Takes a dataframe with duplicated values in one column that should be unique (e.g. repeated IDs)
    and another column that should also be unique given the previous one (e.g. title)
    and unifies it so that it chooses the best option.

    Parameters
    ----------
    df: pd.DataFrame
    rep_col: str
        Name of column with repeated values that will be unified
    un_col: str
        Name of column with non unique values
    """
    # Non-unique columns
    cols_vals = [c for c in df.columns if c not in [rep_col, "count", "index"]]
    repeated_rows = df[rep_col].duplicated(keep=False)
    repeated = df[repeated_rows]

    # Count times the values appear
    repeated.loc[repeated.index, [un_col]] = (
        repeated.loc[repeated.index, un_col].apply(lambda x: [x])
        * repeated.loc[repeated.index, "count"]
    )
    # Group by repeated
    repeated = repeated.reset_index()
    repeated = repeated.groupby(rep_col).agg(
        {
            # "index": list,
            "index": sum,
            **{c: lambda x: list(chain.from_iterable(x)) for c in cols_vals},
            "count": sum,
        }
    )
    # Get the most common values for each column
    repeated.loc[repeated.index, un_col] = (
        repeated.loc[repeated.index, un_col].apply(suggest_value).values
    )
    repeated = repeated.reset_index()

    # Concatenate unique
    use_index = repeated.loc[repeated[un_col].apply(len) == 1, un_col].index
    repeated.loc[use_index, un_col] = repeated.loc[use_index, un_col].apply(
        lambda x: x[0]
    )
    unified = repeated.loc[use_index]

    return unified

In [21]:
# Obtain unique ID-name
unified_ID = unify_repeated_col(non_unique, "ID", "Name_norm")
# Update non_unique
non_unique_ids = list(
    set(chain.from_iterable(non_unique["index"]))
    - set(chain.from_iterable(unified_ID["index"]))
)
# non_unique = companies_clean.loc[non_unique_ids]
non_unique = companies_clean[companies_clean["index"].isin(non_unique_ids)]
non_unique["index"] = non_unique["index"].apply(lambda x: [x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_unique["index"] = non_unique["index"].apply(lambda x: [x])


In [22]:
# Obtain unique name-ID
unified_NAME = unify_repeated_col(non_unique, "Name_norm", "ID")
# Update non_unique
non_unique_ids = list(
    set(chain.from_iterable(non_unique["index"]))
    - set(chain.from_iterable(unified_NAME["index"]))
)
# non_unique = companies_clean.loc[non_unique_ids]
non_unique = companies_clean[companies_clean["index"].isin(non_unique_ids)]
non_unique["index"] = non_unique["index"].apply(lambda x: [x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_unique["index"] = non_unique["index"].apply(lambda x: [x])


#### Merge Companies info

In [23]:
# Global
# Merge unique+unifiedID+unifiedName+nonUnique
merged_global = pd.concat([unique, unified_ID, unified_NAME, non_unique])
cols_vals = [
    c
    for c in merged_global.columns
    if c not in ["ID", "Name_norm", "count", "index", "id_tender"]
]
merged_global = merged_global.groupby(["ID", "Name_norm"]).agg(
    {
        # "index": lambda x: list(chain.from_iterable(x)),
        "index": sum,
        "id_tender": sum,
        **{c: lambda x: list(chain.from_iterable(x)) for c in cols_vals},
        "count": sum,
    }
)
merged_global = merged_global.reset_index()
print(len(merged_global))
display(merged_global.head())

314762


Unnamed: 0,ID,Name_norm,index,id_tender,SMEAwardedIndicator,IDschemeName,CompanyTypeCode,Name,CityName,IdentificationCode,PostalZone,CountrySubentityCode,Name_proc,count
0,00000969a,joanobradorspuigdellivol,"[0, 140637]",[https://contrataciondelestado.es/sindicacion/...,"[None, None]","[otros, nif]","[None, None]","[joan obradors puigdellivol, joan obradors pui...","[None, None]","[None, None]","[None, None]","[None, None]","[joan obradors puigdellivol, joan obradors pui...",2
1,00001014w,elcorteingles,[1],[https://contrataciondelestado.es/sindicacion/...,[False],[nif],[None],[el corte ingles s.a.],[None],[None],[None],[None],[el corte ingles],1
2,00021492x,carmenbalgueriasjimenez,[2],[https://contrataciondelestado.es/sindicacion/...,"[None, None]","[nif, nif]","[None, None]","[carmen balguerias jiménez, carmen balguerias ...","[None, None]","[None, None]","[None, None]","[None, None]","[carmen balguerias jiménez, carmen balguerias ...",2
3,00035211k,palomasainzdelamazadelaserna,[3],[https://contrataciondelestado.es/sindicacion/...,[None],[nif],[None],[paloma sáinz de la maza de la serna],[None],[None],[None],[None],[paloma sáinz de la maza de la serna],1
4,00067665e,albertodelgadocebrian,[4],[https://contrataciondelestado.es/sindicacion/...,[None],[nif],[None],[alberto delgado cebrián],[None],[None],[None],[None],[alberto delgado cebrián],1


#### Unify found names

In [24]:
# Get all names found in the tenders
merged_global["UsedNames"] = (merged_global["Name"] + merged_global["Name_proc"]).apply(
    lambda x: sorted(list(set(x)))
)

#### Propose a final name

In [25]:
# Initial computations
data = merged_global["Name_proc"]
# local_frequencies = data.apply(lambda x: dict(Counter(x)))
local_frequencies = data.apply(lambda x: {k: v / len(x) for k, v in Counter(x).items()})
global_frequencies = data.explode().value_counts().to_dict()
global_frequencies = pd.Series(global_frequencies)
merged_global["Name_proposed"] = local_frequencies.apply(
    lambda x: sorted(x.items(), key=lambda el: el[1], reverse=True)[0][0]
)

In [None]:
# # Calculate weighted scores and suggest a name for each ID
# suggested_names = {}
# for id_, weights in local_frequencies.items():
#     max_score = float("-inf")
#     suggested_name = None

#     # Escale the frequencies in current item
#     _this_global_frequencies = {name: global_frequencies[name] for name in weights}
#     escale_global_frequencies = {
#         name: _this_global_frequencies[name] / sum(_this_global_frequencies.values())
#         for name in _this_global_frequencies
#     }
#     for name, local_weight in weights.items():
#         weighted_score = (local_weight) / (escale_global_frequencies[name])
#         if weighted_score > max_score:
#             max_score = weighted_score
#             suggested_name = name
#     suggested_names[id_] = suggested_name

# merged_global["Name_proposed2"] = suggested_names.values()

In [None]:
# merged_global.loc[
#     merged_global["Name_proposed2"] != merged_global["Name_proposed"],
#     ["Name_proposed2", "Name_proposed"],
# ].head()

#### Check if company is SME

In [45]:
def isPYME(SMEIndicators):
    # Evaluate if is SME based on the SMEAwardedIndicator appearances
    # TODO: make a better decision
    # Reemplaza todos los valores None por False
    SMEIndicators = [False if x is None else x for x in SMEIndicators]
    # Maneja el caso de una lista vacía
    if not SMEIndicators:  
        return False  
    
    sme_counts = Counter(SMEIndicators)
    if True in sme_counts and False in sme_counts:
        return False  # Retorna False cuando ambos, True y False, están presentes
    
    # Asegura que sme_counts no esté vacío y retorna el valor más común
    if sme_counts:
        return sme_counts.most_common(1)[0][0]
    return False  # Retorna False si sme_counts está vacío después de reemplazar None por False

merged_global["isPYME"] = merged_global["SMEAwardedIndicator"].apply(isPYME)

In [46]:
merged_global["isPYME"]

0         False
1         False
2         False
3         False
4         False
          ...  
314757    False
314758     True
314759     True
314760     True
314761    False
Name: isPYME, Length: 314762, dtype: bool

#### Check CityName and PostalZone

In [47]:
def get_city_name(CityName):
    # Evaluate the city name based on the CityName appearances
    # Get most common excluding None
    # TODO: make a better decision
    city_names = Counter(CityName)
    if None in city_names.keys():
        city_names.pop(None)
    if not len(city_names) == 1:
        return None
    return city_names.most_common(1)[0][0]

def get_postal_zone(PostalZone):
    # Evaluate the postal zone based on the PostalZone appearances
    # Get most common excluding None
    # TODO: make a better decision
    postal_zones = Counter(PostalZone)
    if None in postal_zones.keys():
        postal_zones.pop(None)
    if not len(postal_zones) == 1:
        return None
    return postal_zones.most_common(1)[0][0].split(".")[0]


merged_global["City"] = merged_global["CityName"].apply(get_city_name)
merged_global["PostalCode"] = merged_global["PostalZone"].apply(get_postal_zone)

In [48]:
merged_global

Unnamed: 0,ID,Name_norm,index,id_tender,SMEAwardedIndicator,IDschemeName,CompanyTypeCode,Name,CityName,IdentificationCode,...,UsedNames,Name_proposed,isPYME,City,PostalCode,NIF_type,prov,comp_type,comp_desc,FullName
0,00000969a,joanobradorspuigdellivol,"[0, 140637]",[https://contrataciondelestado.es/sindicacion/...,"[None, None]","[otros, nif]","[None, None]","[joan obradors puigdellivol, joan obradors pui...","[None, None]","[None, None]",...,"[joan obradors puigdellivol, joan obradors pui...",joan obradors puigdellivol,False,,,DNI,,,,joan obradors puigdellivol
1,00001014w,elcorteingles,[1],[https://contrataciondelestado.es/sindicacion/...,[False],[nif],[None],[el corte ingles s.a.],[None],[None],...,"[el corte ingles, el corte ingles s.a.]",el corte ingles,False,,,DNI,,,,el corte ingles s.a.
2,00021492x,carmenbalgueriasjimenez,[2],[https://contrataciondelestado.es/sindicacion/...,"[None, None]","[nif, nif]","[None, None]","[carmen balguerias jiménez, carmen balguerias ...","[None, None]","[None, None]",...,[carmen balguerias jiménez],carmen balguerias jiménez,False,,,DNI,,,,carmen balguerias jiménez
3,00035211k,palomasainzdelamazadelaserna,[3],[https://contrataciondelestado.es/sindicacion/...,[None],[nif],[None],[paloma sáinz de la maza de la serna],[None],[None],...,[paloma sáinz de la maza de la serna],paloma sáinz de la maza de la serna,False,,,DNI,,,,paloma sáinz de la maza de la serna
4,00067665e,albertodelgadocebrian,[4],[https://contrataciondelestado.es/sindicacion/...,[None],[nif],[None],[alberto delgado cebrián],[None],[None],...,[alberto delgado cebrián],alberto delgado cebrián,False,,,DNI,,,,alberto delgado cebrián
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
314757,z0226636e,joseeduardocastrorojas,[450314],[https://contrataciondelestado.es/sindicacion/...,[False],[otros],[None],[jose eduardo castro rojas],[None],[es],...,[jose eduardo castro rojas],jose eduardo castro rojas,False,,,NIE,,,,jose eduardo castro rojas
314758,z0329585t,claudiadelpilargarciaguzman,[450315],[https://contrataciondelestado.es/sindicacion/...,[True],[nif],[None],[claudia del pilar garcia guzman],[san clemente],[es],...,[claudia del pilar garcia guzman],claudia del pilar garcia guzman,True,san clemente,16600,NIE,,,,claudia del pilar garcia guzman
314759,z0351144p,cristhelsteffyurbina,[450316],[https://contrataciondelestado.es/sindicacion/...,[True],[nif],[None],[cristhel steffy urbina],[higuera de vargas],[es],...,[cristhel steffy urbina],cristhel steffy urbina,True,higuera de vargas,6132,NIE,,,,cristhel steffy urbina
314760,z0425299b,gabrielgardiol,[450317],[https://contrataciondelestado.es/sindicacion/...,[True],[otros],[None],[gabriel gardiol],[None],[es],...,[gabriel gardiol],gabriel gardiol,True,,,NIE,,,,gabriel gardiol


### Add info

In [49]:
# Add information based on NIF
merged_global["NIF_type"] = merged_global["ID"].apply(get_nif_type)
merged_global["prov"], merged_global["comp_type"], merged_global["comp_desc"] = list(
    zip(*merged_global["ID"].apply(get_info_from_cif))
)
merged_global["comp_type"] = merged_global["comp_type"].apply(
    lambda x: x.split(",")[0] if not pd.isna(x) else None
)

### Find UTEs

In [50]:
values = merged_global["ID"].str.startswith("u")

# Filtrar y seleccionar los valores de la columna 'ID' que cumplen con la condición
ids_que_cumplen = merged_global.loc[values, "ID"]

# Filtrar y seleccionar las filas completas donde los valores de la columna 'ID' comienzan con "u"
filas_que_cumplen = merged_global.loc[values]

# Imprimir las filas completas que cumplen con la condición
print(filas_que_cumplen)

               ID                                          Name_norm  \
303814  u01523190                                   alegriasarasola2   
303815  u01548726                indenortpvproviseribericaieslizardi   
303816  u01561570                                 tranviauniversidad   
303817  u01563154                                    ceipkarmengoama   
303818  u01563196                                     indralkskz2018   
...           ...                                                ...   
309941  u99563744  indutecinstalacionesampersistemasmarcoinfrayme...   
309942  u99563751                    insaemainsaagromercadolosalamos   
309943  u99564239                         fccmedioambientefccaqualia   
309944  u99568222                   transfersmvsantiagoanguloaltemir   
309945  u99569147      insaeinfraestructurasmarcoinfraestructurasyma   

           index                                          id_tender  \
303814  [436282]  [https://contrataciondelestado.es/sindicacion/

In [51]:
merged_global['FullName'] = merged_global['UsedNames'].apply(lambda x: max(x, key=len))

In [31]:
merged_global

Unnamed: 0,ID,Name_norm,index,id_tender,SMEAwardedIndicator,IDschemeName,CompanyTypeCode,Name,CityName,IdentificationCode,...,UsedNames,Name_proposed,isPYME,City,PostalCode,NIF_type,prov,comp_type,comp_desc,FullName
0,00000969a,joanobradorspuigdellivol,"[0, 140637]",[https://contrataciondelestado.es/sindicacion/...,"[None, None]","[otros, nif]","[None, None]","[joan obradors puigdellivol, joan obradors pui...","[None, None]","[None, None]",...,"[joan obradors puigdellivol, joan obradors pui...",joan obradors puigdellivol,,,,DNI,,,,joan obradors puigdellivol
1,00001014w,elcorteingles,[1],[https://contrataciondelestado.es/sindicacion/...,[False],[nif],[None],[el corte ingles s.a.],[None],[None],...,"[el corte ingles, el corte ingles s.a.]",el corte ingles,False,,,DNI,,,,el corte ingles s.a.
2,00021492x,carmenbalgueriasjimenez,[2],[https://contrataciondelestado.es/sindicacion/...,"[None, None]","[nif, nif]","[None, None]","[carmen balguerias jiménez, carmen balguerias ...","[None, None]","[None, None]",...,[carmen balguerias jiménez],carmen balguerias jiménez,,,,DNI,,,,carmen balguerias jiménez
3,00035211k,palomasainzdelamazadelaserna,[3],[https://contrataciondelestado.es/sindicacion/...,[None],[nif],[None],[paloma sáinz de la maza de la serna],[None],[None],...,[paloma sáinz de la maza de la serna],paloma sáinz de la maza de la serna,,,,DNI,,,,paloma sáinz de la maza de la serna
4,00067665e,albertodelgadocebrian,[4],[https://contrataciondelestado.es/sindicacion/...,[None],[nif],[None],[alberto delgado cebrián],[None],[None],...,[alberto delgado cebrián],alberto delgado cebrián,,,,DNI,,,,alberto delgado cebrián
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
314757,z0226636e,joseeduardocastrorojas,[450314],[https://contrataciondelestado.es/sindicacion/...,[False],[otros],[None],[jose eduardo castro rojas],[None],[es],...,[jose eduardo castro rojas],jose eduardo castro rojas,False,,,NIE,,,,jose eduardo castro rojas
314758,z0329585t,claudiadelpilargarciaguzman,[450315],[https://contrataciondelestado.es/sindicacion/...,[True],[nif],[None],[claudia del pilar garcia guzman],[san clemente],[es],...,[claudia del pilar garcia guzman],claudia del pilar garcia guzman,True,san clemente,16600,NIE,,,,claudia del pilar garcia guzman
314759,z0351144p,cristhelsteffyurbina,[450316],[https://contrataciondelestado.es/sindicacion/...,[True],[nif],[None],[cristhel steffy urbina],[higuera de vargas],[es],...,[cristhel steffy urbina],cristhel steffy urbina,True,higuera de vargas,6132,NIE,,,,cristhel steffy urbina
314760,z0425299b,gabrielgardiol,[450317],[https://contrataciondelestado.es/sindicacion/...,[True],[otros],[None],[gabriel gardiol],[None],[es],...,[gabriel gardiol],gabriel gardiol,True,,,NIE,,,,gabriel gardiol


In [52]:
# Ampliar la expresión regular precompilada para capturar "UTE" con variaciones, "union temporal empresas", y sus siglas
pattern = re.compile(r"\b(u(\.)?t(\.)?e|union temporal empresas|uniones temporales de empresas)\b", re.IGNORECASE)

# Búsqueda de UTEs basada en nombres en la columna 'UsedNames'
ute_n = merged_global["UsedNames"].apply(lambda x: bool(pattern.search(" ".join([word.lower() for word in x]))))

# Búsqueda de UTEs basada en ID
ute_i = merged_global["ID"].str.startswith("u")

# Aplicar filtros basados en las columnas 'comp_type' y 'comp_desc' usando la expresión regular
ute_c_type = merged_global["comp_type"].apply(lambda x: bool(pattern.search(x.lower()) if pd.notnull(x) else False))
ute_c_desc = merged_global["comp_desc"].apply(lambda x: bool(pattern.search(x.lower()) if pd.notnull(x) else False))

# Combinar todos los filtros para encontrar UTEs basados en nombres, ID, comp_type, y comp_desc
utes_combined = merged_global[ute_n | ute_i | ute_c_type | ute_c_desc]

# Eliminar duplicados basándose en la columna 'ID', manteniendo la primera aparición
utes = utes_combined.drop_duplicates(subset="ID")

In [53]:
utes

Unnamed: 0,ID,Name_norm,index,id_tender,SMEAwardedIndicator,IDschemeName,CompanyTypeCode,Name,CityName,IdentificationCode,...,UsedNames,Name_proposed,isPYME,City,PostalCode,NIF_type,prov,comp_type,comp_desc,FullName
170,00400871g,ferrannavazoherrerofaranazoliverfortezaferre,"[213, 214]",[https://contrataciondelestado.es/sindicacion/...,"[True, None]","[nif, nif]","[None, None]","[u.t.e. ferranherrero f.aranaz, u.t.e. ferran ...","[None, None]","[es, None]",...,[ferran navazo herrero f.aranaz oliver forteza...,ferranherrero f.aranaz,False,,,DNI,,,,u.t.e. ferran navazo herrero f.aranaz oliver f...
241,00416934j,getinsaverdascoarquitectos,[292],[https://contrataciondelestado.es/sindicacion/...,[False],[nif],[None],[u.t.e. getinsa+verdasco arquitectos],[madrid],[es],...,"[getinsa+verdasco arquitectos, u.t.e. getinsa+...",getinsa+verdasco arquitectos,False,madrid,28045,DNI,,,,u.t.e. getinsa+verdasco arquitectos
3010,03429732h,munozmarineroluisjoaquin,"[3679, 3680, 3681]",[https://contrataciondelestado.es/sindicacion/...,"[None, None, None, None, None, None, True, Non...","[nif, nif, nif, nif, nif, nif, nif, nif, nif, ...","[None, None, None, None, None, None, None, Non...","[u.t.e. cuéllar, luis joaquin muñoz marinero, ...","[None, None, None, None, cuellar, None, None, ...","[None, None, None, None, es, None, es, es, Non...",...,"[cuéllar, luis joaquin muñoz marinero, muñoz m...","muñoz marinero,luis joaquin",False,,44022,DNI,,,,luis joaquin muñoz marinero
4923,04200682p,saramorenosoriayunomasunologicomcley1882numero1,[5948],[https://contrataciondelestado.es/sindicacion/...,[None],[nif],[None],[sara moreno soria y uno mas uno logico mc u.t...,[None],[None],...,[sara moreno soria y uno mas uno logico mc ley...,sara moreno soria y uno mas uno logico mc ley ...,False,,,DNI,,,,sara moreno soria y uno mas uno logico mc u.t....
10078,07016969z,fernandograndetureganoconstruccionessevillanevado,"[12493, 12494]",[https://contrataciondelestado.es/sindicacion/...,"[None, True]","[nif, nif]","[None, None]","[arqueologia y gestion del patrimonio ancora, ...","[None, None]","[None, es]",...,"[arqueologia y gestion del patrimonio ancora, ...",arqueologia y gestion del patrimonio ancora,False,,,DNI,,,,u.t.e. fernando grande turégano - construccion...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
310401,v67273987,girocopisistemesdorganitzacio,[444314],[https://contrataciondelestado.es/sindicacion/...,[None],[nif],[None],[girocopi s.l. sistemes d organització s.a u.t...,[None],[None],...,[girocopi s.l. sistemes d organització s.a u.t...,girocopi sistemes d organització,False,,,CIF,Gerona,s.a.t.,Sociedad Agraria de Transformación,girocopi s.l. sistemes d organització s.a u.t.e.
310502,v87879375,aurenauditoresspred2redconsultores,[444470],[https://contrataciondelestado.es/sindicacion/...,[None],[ute],[None],[u.t.e. auren auditores sp s.l.p. - red2red co...,[None],[None],...,"[auren auditores sp red2red consultores, u.t.e...",auren auditores sp red2red consultores,False,,,CIF,Madrid,s.a.t.,Sociedad Agraria de Transformación,u.t.e. auren auditores sp s.l.p. - red2red con...
310520,v88299789,ceviamepcycasasdelaalcarrialey181982de26demayo,[444491],[https://contrataciondelestado.es/sindicacion/...,[None],[nif],[None],[ceviam epc s.l. y casas de la alcarria s.l u....,[None],[None],...,[ceviam epc s.l. y casas de la alcarria s.l u....,"ceviam epc y casas de la alcarria ,ley 18 1982...",False,,,CIF,Madrid,s.a.t.,Sociedad Agraria de Transformación,ceviam epc s.l. y casas de la alcarria s.l u.t...
310541,v90441049,caem,[444518],[https://contrataciondelestado.es/sindicacion/...,[None],[nif],[None],[u.t.e. caem],[None],[None],...,"[caem, u.t.e. caem]",caem,False,,,CIF,Sevilla,s.a.t.,Sociedad Agraria de Transformación,u.t.e. caem


### Save data

In [54]:
provisional_utes_info = utes.rename(
    columns={
        "ID": "NIF",
        "id_tender": "id_tender",
        "Name_proposed": "Name2",
        "prov": "Province",
        "NIF_type": "NIFtype",
        "comp_type": "CompanyType",
        "comp_desc": "CompanyDescription",
        "isPYME": "isPYME",
    }
)[
    [
        "NIF",
        "FullName",
        "Name2",
        "Province",
        "NIFtype",
        "CompanyType",
        "CompanyDescription",
        "id_tender",
        "isPYME",
    ]
]

In [55]:
provisional_utes_info

Unnamed: 0,NIF,FullName,Name2,Province,NIFtype,CompanyType,CompanyDescription,id_tender,isPYME
170,00400871g,u.t.e. ferran navazo herrero f.aranaz oliver f...,ferranherrero f.aranaz,,DNI,,,[https://contrataciondelestado.es/sindicacion/...,False
241,00416934j,u.t.e. getinsa+verdasco arquitectos,getinsa+verdasco arquitectos,,DNI,,,[https://contrataciondelestado.es/sindicacion/...,False
3010,03429732h,luis joaquin muñoz marinero,"muñoz marinero,luis joaquin",,DNI,,,[https://contrataciondelestado.es/sindicacion/...,False
4923,04200682p,sara moreno soria y uno mas uno logico mc u.t....,sara moreno soria y uno mas uno logico mc ley ...,,DNI,,,[https://contrataciondelestado.es/sindicacion/...,False
10078,07016969z,u.t.e. fernando grande turégano - construccion...,arqueologia y gestion del patrimonio ancora,,DNI,,,[https://contrataciondelestado.es/sindicacion/...,False
...,...,...,...,...,...,...,...,...,...
310401,v67273987,girocopi s.l. sistemes d organització s.a u.t.e.,girocopi sistemes d organització,Gerona,CIF,s.a.t.,Sociedad Agraria de Transformación,[https://contrataciondelestado.es/sindicacion/...,False
310502,v87879375,u.t.e. auren auditores sp s.l.p. - red2red con...,auren auditores sp red2red consultores,Madrid,CIF,s.a.t.,Sociedad Agraria de Transformación,[https://contrataciondelestado.es/sindicacion/...,False
310520,v88299789,ceviam epc s.l. y casas de la alcarria s.l u.t...,"ceviam epc y casas de la alcarria ,ley 18 1982...",Madrid,CIF,s.a.t.,Sociedad Agraria de Transformación,[https://contrataciondelestado.es/sindicacion/...,False
310541,v90441049,u.t.e. caem,caem,Sevilla,CIF,s.a.t.,Sociedad Agraria de Transformación,[https://contrataciondelestado.es/sindicacion/...,False


In [56]:
# Ahora, renombrar la columna 'Name2' a 'Name'
provisional_utes_info = provisional_utes_info.rename(
    columns={
        "Name2": "Name"  
    }
)

In [57]:
provisional_utes_info

Unnamed: 0,NIF,FullName,Name,Province,NIFtype,CompanyType,CompanyDescription,id_tender,isPYME
170,00400871g,u.t.e. ferran navazo herrero f.aranaz oliver f...,ferranherrero f.aranaz,,DNI,,,[https://contrataciondelestado.es/sindicacion/...,False
241,00416934j,u.t.e. getinsa+verdasco arquitectos,getinsa+verdasco arquitectos,,DNI,,,[https://contrataciondelestado.es/sindicacion/...,False
3010,03429732h,luis joaquin muñoz marinero,"muñoz marinero,luis joaquin",,DNI,,,[https://contrataciondelestado.es/sindicacion/...,False
4923,04200682p,sara moreno soria y uno mas uno logico mc u.t....,sara moreno soria y uno mas uno logico mc ley ...,,DNI,,,[https://contrataciondelestado.es/sindicacion/...,False
10078,07016969z,u.t.e. fernando grande turégano - construccion...,arqueologia y gestion del patrimonio ancora,,DNI,,,[https://contrataciondelestado.es/sindicacion/...,False
...,...,...,...,...,...,...,...,...,...
310401,v67273987,girocopi s.l. sistemes d organització s.a u.t.e.,girocopi sistemes d organització,Gerona,CIF,s.a.t.,Sociedad Agraria de Transformación,[https://contrataciondelestado.es/sindicacion/...,False
310502,v87879375,u.t.e. auren auditores sp s.l.p. - red2red con...,auren auditores sp red2red consultores,Madrid,CIF,s.a.t.,Sociedad Agraria de Transformación,[https://contrataciondelestado.es/sindicacion/...,False
310520,v88299789,ceviam epc s.l. y casas de la alcarria s.l u.t...,"ceviam epc y casas de la alcarria ,ley 18 1982...",Madrid,CIF,s.a.t.,Sociedad Agraria de Transformación,[https://contrataciondelestado.es/sindicacion/...,False
310541,v90441049,u.t.e. caem,caem,Sevilla,CIF,s.a.t.,Sociedad Agraria de Transformación,[https://contrataciondelestado.es/sindicacion/...,False


In [58]:
provisional_company_info = merged_global.rename(
    columns={
        "ID": "NIF",
        "id_tender": "id_tender",
        "Name_proposed": "Name1",
        "prov": "Province",
        "NIF_type": "NIFtype",
        "comp_type": "CompanyType",
        "comp_desc": "CompanyDescription",
        "isPYME": "isPYME",
    }
)[
    [
        "NIF",
        "FullName",
        "Name1",
        "Province",
        "NIFtype",
        "CompanyType",
        "CompanyDescription",
        "id_tender",
        "isPYME",
    ]
]

In [59]:
provisional_company_info.id_tender[6]

['https://contrataciondelestado.es/sindicacion/datosAbiertosMenores/6088504',
 'https://contrataciondelestado.es/sindicacion/PlataformasAgregadasSinMenores/2082755',
 'https://contrataciondelestado.es/sindicacion/PlataformasAgregadasSinMenores/2190186',
 'https://contrataciondelestado.es/sindicacion/PlataformasAgregadasSinMenores/2383963',
 'https://contrataciondelestado.es/sindicacion/PlataformasAgregadasSinMenores/2498465',
 'https://contrataciondelestado.es/sindicacion/PlataformasAgregadasSinMenores/2603759',
 'https://contrataciondelestado.es/sindicacion/PlataformasAgregadasSinMenores/2675109',
 'https://contrataciondelestado.es/sindicacion/PlataformasAgregadasSinMenores/2679987',
 'https://contrataciondelestado.es/sindicacion/PlataformasAgregadasSinMenores/2710201',
 'https://contrataciondelestado.es/sindicacion/PlataformasAgregadasSinMenores/2771137',
 'https://contrataciondelestado.es/sindicacion/PlataformasAgregadasSinMenores/2771136',
 'https://contrataciondelestado.es/sindica

In [60]:
provisional_company_info.iloc[6]

NIF                                                           00076938a
FullName                                               luis terán lópez
Name1                                                  luis terán lópez
Province                                                           None
NIFtype                                                             DNI
CompanyType                                                        None
CompanyDescription                                                 None
id_tender             [https://contrataciondelestado.es/sindicacion/...
isPYME                                                            False
Name: 6, dtype: object

In [61]:
# Ahora, renombrar la columna 'Name1' a 'Name'
provisional_company_info = provisional_company_info.rename(
    columns={
        "Name1": "Name"  # Renombrando Name1 a Name
    }
)

In [62]:
provisional_company_info.iloc[1]

NIF                                                           00001014w
FullName                                           el corte ingles s.a.
Name                                                    el corte ingles
Province                                                           None
NIFtype                                                             DNI
CompanyType                                                        None
CompanyDescription                                                 None
id_tender             [https://contrataciondelestado.es/sindicacion/...
isPYME                                                            False
Name: 1, dtype: object

In [63]:
nif_es_unico = provisional_company_info['NIF'].is_unique
if nif_es_unico:
    print("Todos los valores en la columna 'NIF' son únicos.")
else:
    print("Existen valores duplicados en la columna 'NIF'.")


Todos los valores en la columna 'NIF' son únicos.


In [None]:
provisional_company_info.FullName.str.contains("xylem water solutions españa s.l.u.").notna()

In [None]:
valores_no_nulos = provisional_company_info['CompanyType'].notna()

# Filtrar el DataFrame para obtener solo las filas con valores no nulos en la columna CompanyType
datos_no_nulos = provisional_company_info[valores_no_nulos]

# Convertir los valores no nulos en una lista si es necesario
valores_no_nulos_lista = datos_no_nulos['CompanyType'].tolist()

# Imprimir los valores no nulos
print(set(valores_no_nulos_lista))

In [None]:
valores_que_comienzan_con_xylem = provisional_company_info['Name'].str.startswith("xylem")

# Filtra el DataFrame para mostrar solo los valores que no son False
valores_verdaderos = provisional_company_info[valores_que_comienzan_con_xylem]

# Imprime los valores que cumplen con la condición
print(valores_verdaderos.Name.iloc[0])

In [None]:
# provisional_company_info.to_parquet("data/provisional_company_info.parquet")
provisional_utes_info.to_parquet("data/utes_nuevo.parquet")

# provisional_company_info = pd.read_parquet("data/provisional_company_info.parquet")
# utes = pd.read_parquet("data/utes.parquet")

In [None]:
df_antiguo = pd.read_parquet("/export/usuarios_ml4ds/lbartolome/NextProcurement/NP-Company-Process/data/company_info.parquet")

In [None]:
df_antiguo

In [None]:
provisional_company_info.id_tender.iloc[0]

In [None]:
# Hacemos un merge entre los dos dataframes especificando un outer join y el indicador
merged_df = pd.merge(provisional_company_info, df_antiguo, on='NIF', how='outer', indicator=True)

# Filtro para obtener solo las filas que están en 'df_nuevo_company' pero no en 'df_antiguo_company'
filas_unicas_en_nuevo = merged_df[merged_df['_merge'] == 'left_only']
filas_unicas_en_nuevo = filas_unicas_en_nuevo.drop(columns=['_merge'])

print(filas_unicas_en_nuevo)

In [None]:
data2 = df_antiguo['Name'].str.contains("zesauro traducciones")
provisional_company_info[data]

In [None]:
# Filtra directamente las filas donde 'CompanyDescription' comienza con "Sociedades de"
valores_que_comienzan_con_xylem = df_antiguo['CompanyDescription'].notna()

# Imprime los valores que cumplen con la condición
print(set(valores_que_comienzan_con_xylem))

In [None]:
df_antiguo

## Empresas Zaragoza

In [None]:
# Cargar las empresas de Zaragoza y darle el formato propio
emp = pd.read_excel("data/licitador_09_23.xls")
emp_zgz = emp[["ID_EMPRESA", "NIF", "NOMBRE", "UTE"]].copy()
# Limpieza de nombres
emp_zgz["empresa"] = emp_zgz["NOMBRE"].apply(clean_company_type)
emp_zgz["empresa_proc"] = emp_zgz["empresa"].apply(clean_company_type, remove_type=True)
# Creamos una columna con los nombres que se han usado para identificar la empresa
emp_zgz["UsedNames"] = emp_zgz[["empresa", "empresa_proc"]].apply(
    lambda x: list(set([x[0], x[1]])), axis=1
)
emp_zgz["nif"] = emp_zgz["NIF"].apply(
    lambda x: regex.sub(r"\W", "", x.lower()) if not pd.isna(x) else None
)
emp_zgz["valid_nif"] = emp_zgz["nif"].apply(lambda x: is_valid_nif(x) if x else None)
emp_zgz["nif_type"] = emp_zgz["nif"].apply(lambda x: get_nif_type(x) if x else None)

# Transformar UTE a bool
emp_zgz["UTE"] = emp_zgz["UTE"].apply(
    lambda x: True if x == "S" else False if x == "N" else None
)

# Stats
print("Elementos inválidos en los datos:")
emp_zgz.apply(pd.isna, axis=1).sum()

### Empresas repetidas

In [None]:
total_empresas = len(emp_zgz)

# Identificar NIF repetidos
emp_zgz_dup_nif = emp_zgz.loc[
    emp_zgz["nif"].duplicated(keep=False) & emp_zgz["nif"].notna(),
    ["ID_EMPRESA", "nif", "empresa_proc"],
].sort_values(by="nif")
print(f"Hay {len(emp_zgz_dup_nif)} NIFs duplicados:")
display(emp_zgz_dup_nif)
print(
    f"Los NIFs duplicados representan el {(len(emp_zgz_dup_nif) / total_empresas) * 100:.2f}% del total."
)

print()

# Identificar nombres repetidos
emp_zgz_dup_name = emp_zgz.loc[
    emp_zgz["empresa_proc"].duplicated(keep=False) & emp_zgz["empresa_proc"].notna(),
    ["ID_EMPRESA", "nif", "empresa_proc"],
].sort_values(by="empresa_proc")
print(f"Hay {len(emp_zgz_dup_name)} nombres duplicados:")
display(emp_zgz_dup_name)
print(
    f"Los nombres duplicados representan el {(len(emp_zgz_dup_name) / total_empresas) * 100:.2f}% del total."
)

### Search companies in own data

In [None]:
own_info = provisional_company_info[
    ["NIF", "UsedNames", "NameProposed", "isPYME"]
].rename(columns={"NIF": "nif"})
own_info["UsedNames"] = own_info["UsedNames"].apply(list)

In [None]:
# Buscar por NIF
nifs_usados = own_info.reset_index()
nif_zgz = emp_zgz[["ID_EMPRESA", "nif", "UTE", "empresa", "UsedNames"]]
found_by_nif = pd.merge(
    nif_zgz, nifs_usados, left_on="nif", right_on="nif", how="inner"
)
found_by_nif["UsedNames"] = found_by_nif[["UsedNames_x", "UsedNames_y"]].sum(axis=1)
found_by_nif = (
    found_by_nif.groupby("ID_EMPRESA")
    .agg(
        {
            "nif": set,
            "index": set,
            "UsedNames": sum,
            "NameProposed": set,
            "isPYME": set,
            "UTE": set,
        }
    )
    .applymap(lambda x: list(set(x)))
    .reset_index()
)
found_by_nif

In [None]:
# found_by_name.loc[found_by_name["UsedNames"].apply(lambda x: "reby rides" in x)]

In [None]:
# Buscar por nombre excluyendo las que hemos encontrado por NIF
nombres_usados = own_info.explode(column="UsedNames").reset_index()
nombres_zgz = emp_zgz[~emp_zgz["ID_EMPRESA"].isin(found_by_nif["ID_EMPRESA"])]
nombres_zgz = nombres_zgz[["ID_EMPRESA", "nif", "UTE", "empresa", "UsedNames"]].explode(
    column="UsedNames"
)
found_by_name = pd.merge(
    nombres_zgz, nombres_usados, left_on="UsedNames", right_on="UsedNames", how="inner"
)
found_by_name["nif"] = (
    found_by_name[["nif_x", "nif_y"]]
    .applymap(validate_nif)
    .apply(lambda x: list(set([el for el in x if el])), axis=1)
)
found_by_name = (
    found_by_name.groupby("ID_EMPRESA")
    .agg(
        {
            "nif": sum,
            "index": set,
            "UsedNames": set,
            "NameProposed": set,
            "isPYME": set,
            "UTE": set,
        }
    )
    .applymap(lambda x: list(set(x)))
    .reset_index()
)
found_by_name

In [None]:
# Unimos los dos dataframes
found = (
    pd.concat([found_by_nif, found_by_name])
    .groupby("ID_EMPRESA")
    .agg(sum)
    .applymap(lambda x: list(set(x)) if x else None)
    .reset_index()
)
not_found = emp_zgz[~emp_zgz["ID_EMPRESA"].isin(found["ID_EMPRESA"])]
# Stats
print(f"Se han encontrado {len(found)} empresas de Zaragoza en las empresas previas")
print("Elementos inválidos en los datos:")
print(found.apply(pd.isna, axis=1).sum().to_dict())
print(f"Hay {len(not_found)} empresas que no se han encontrado")
# found[["nif", "index", "UsedNames", "NameProposed", "isPYME", "UTE"]].applymap(
#     lambda x: x[0]
# )

## Análisis empresas encontradas y no encontradas

### Incorrect data

In [None]:
null_id_emp_ids = emp_zgz[emp_zgz["nif"].isna()]["ID_EMPRESA"].values

print(f"Null ids: {len(null_id_emp_ids)}")
print(f"Null ids found: {found['ID_EMPRESA'].isin(null_id_emp_ids).sum()}")
print(f"Null ids not found: {not_found['ID_EMPRESA'].isin(null_id_emp_ids).sum()}")

In [None]:
print(f"De las empresas que no se han encontrado ({len(not_found)})")
not_found_valid = not_found[not_found["valid_nif"].apply(bool)]
not_found_invalid = not_found[~not_found["valid_nif"].apply(bool)].copy()
print(
    f"Hay {len(not_found_valid)} que tienen un NIF válido y {len(not_found_invalid)} que no"
)

# Intento de corrección del nif
not_found_invalid["proposed_nif"] = (
    not_found_invalid["nif"].dropna().apply(validate_nif, correct=True)
)
print(f"De los no encontrados, que tienen NIF no nulo, se han podido corregir:")
not_found_invalid.dropna(subset="proposed_nif")

In [None]:
# Columnas que deberían tener valores únicos
unique_columns = [c for c in found.columns if c not in ["ID_EMPRESA", "UsedNames"]]
found[unique_columns] = found[unique_columns].applymap(
    lambda x: [el for el in x if not pd.isna(el)]
)

In [None]:
# Encontradas
unique = (
    found[unique_columns]
    .applymap(lambda x: len(x) if x else 1)
    .apply(lambda x: all(x == 1), axis=1)
)
# Válidas (todo son valores únicos)
valid_unique = found[unique].copy()
valid_unique[unique_columns] = valid_unique[unique_columns].applymap(
    lambda x: x[0] if x else None
)

# Inválidas (algún valor que debería ser único no lo es)
invalid_unique = found[~unique].copy()
# invalid_unique[unique_columns] = invalid_unique[unique_columns].applymap(lambda x: x if x else None)

In [None]:
# # TODO: Para Zaragoza, podemos sugerir un NIF en función de si la provincia es Zaragoza:
# invalid_unique["nif"].apply(
#     lambda x: [get_info_from_cif(el)[0] == "Zaragoza" for el in x]
# )

#### UTEs

In [None]:
emp_zgz.loc[emp_zgz["UTE"] == True, "empresa"].values

In [None]:
print(valid_unique["UTE"].value_counts().to_dict())
print(invalid_unique["UTE"].value_counts())

In [None]:
found["UTE"]

In [None]:
valid_unique[valid_unique["UTE"] == True]

In [None]:
# Empresas no encontradas en datos propios, con NIF válido
not_found = emp_zgz.loc[
    ~emp_zgz["ID_EMPRESA"].isin(found["ID_EMPRESA"]) & emp_zgz["valid_nif"].apply(bool)
]

In [None]:
# # Rellenar NIF:
# empty_nif = emp_zgz[
#     (emp_zgz["ID_EMPRESA"].isin(found["ID_EMPRESA"])) & (emp_zgz["nif"].isna())
# ]
# pd.merge(
#     empty_nif[["ID_EMPRESA", "empresa", "empresa_proc"]],
#     found,
#     left_on="ID_EMPRESA",
#     right_on="ID_EMPRESA",
#     how="inner",
# )

## OLD

In [None]:
# # df = pd.read_csv("data/empresas.csv", sep=";", header=0, nrows=64, index_col=False)
# # df = pd.read_excel("data/empresas.xlsx")

# with open("data/empresas_zgz.csv", "r", encoding="utf-8") as f:
#     emp = [
#         [el.replace('"', "").strip() for el in l.lower().strip().split(";", 4)]
#         for l in f.readlines()
#         if len(l) > 2
#     ]
# cols = emp[0]
# data = emp[1:]
# emp_zgz = pd.DataFrame(data=data, columns=cols)
# emp_zgz = emp_zgz.applymap(lambda x: x if x else None)
# emp_zgz = emp_zgz.dropna(how="all").drop_duplicates().reset_index(drop=True)
# emp_zgz["empresa"] = emp_zgz["empresa"].apply(clean_company_type)
# emp_zgz["empresa_proc"] = emp_zgz["empresa"].apply(clean_company_type, remove_type=True)
# emp_zgz["nif"] = emp_zgz["nif"].apply(lambda x: regex.sub(r"\W", "", x) if x else None)
# emp_zgz["nif_type"] = emp_zgz["nif"].apply(lambda x: get_nif_type(x) if x else None)