In [390]:
import pathlib
import pandas as pd
import re
from collections import Counter
from fuzzywuzzy import fuzz

## Read parquets with data

In [391]:
# Load the company table and the UTE ("Uniones Temporales de Empresas") table.
df_company, df_utes = (
    pd.read_parquet(pathlib.Path(parquet_file))
    for parquet_file in ("../data/company_info.parquet", "../data/utes.parquet")
)

In [465]:
# Preview the UTE table loaded above.
df_utes

Unnamed: 0,NIF,FullName,Name,Province,NIFtype,CompanyType,CompanyDescription,id_tender
20246,16628333t,u.t.e.dym areas caninas logroño,u.t.e.dym areas caninas logroño,,DNI,,,[https://contrataciondelestado.es/sindicacion/...
30010,23210609k,u.t.e.obras y pavimentos ruiz s.l. pavimentos ...,u.t.e.obras y pavimentos ruiz pavimentos asfál...,,DNI,,,[https://contrataciondelestado.es/sindicacion/...
30947,24235582e,u.t.e.andrés mata caro arturo abril sánchez,u.t.e.andrés mata caro arturo abril sánchez,,DNI,,,[https://contrataciondelestado.es/sindicacion/...
256864,u01523190,u.t.e. alegria - sarasola 2,alegria - sarasola 2,Álava,CIF,u.t.e.,Uniones Temporales de Empresas,[https://contrataciondelestado.es/sindicacion/...
256865,u01548726,u.t.e. indenort pv s.l. - proviser iberica s.l...,indenort pv proviser iberica ies lizardi,Álava,CIF,u.t.e.,Uniones Temporales de Empresas,[https://contrataciondelestado.es/sindicacion/...
...,...,...,...,...,...,...,...,...
267137,u01888080,u.t.e. telefónica de españa s.a.u. y telefónic...,telefónica de españa y telefónica móviles españa,Álava,CIF,u.t.e.,Uniones Temporales de Empresas,[https://contrataciondelestado.es/sindicacion/...
267138,u87302147,u.t.e. telefonica españa s.a.u. - telefonica m...,telefonica españa telefonica moviles españa,Madrid,CIF,u.t.e.,Uniones Temporales de Empresas,[https://contrataciondelestado.es/sindicacion/...
267144,u88418785,u.t.e. terminal granollers,terminal granollers,Madrid,CIF,u.t.e.,Uniones Temporales de Empresas,[https://contrataciondelestado.es/sindicacion/...
267146,u02841708,u.t.e. terra ingenieros s.l. oca construccione...,terra ingenieros - oca construcciones y proyectos,Albacete,CIF,u.t.e.,Uniones Temporales de Empresas,[https://contrataciondelestado.es/sindicacion/...


In [56]:
# Create a new dataframe with all the companies whose FullName is not in the utes.
# .copy() makes it an independent frame: later cells add the "utes" and
# "utes_length" columns to it, and doing that on a plain boolean-indexed slice of
# df_company raises pandas' SettingWithCopyWarning.
df_not_in_utes = df_company[~df_company['FullName'].isin(df_utes['FullName'])].copy()
df_not_in_utes

Unnamed: 0,NIF,FullName,Name,Province,NIFtype,CompanyType,CompanyDescription,id_tender
0,00021492x,carmen balguerias jiménez,carmen balguerias jiménez,,DNI,,,[https://contrataciondelestado.es/sindicacion/...
1,00035211k,paloma sáinz de la maza de la serna,paloma sáinz de la maza de la serna,,DNI,,,[https://contrataciondelestado.es/sindicacion/...
2,00067665e,alberto delgado cebrián,alberto delgado cebrián,,DNI,,,[https://contrataciondelestado.es/sindicacion/...
3,00072839k,fernandez abad vicente,fernandez abad vicente,,DNI,,,[https://contrataciondelestado.es/sindicacion/...
4,00076938a,luis terán lópez,luis terán lópez,,DNI,,,[https://contrataciondelestado.es/sindicacion/...
...,...,...,...,...,...,...,...,...
267238,b28672764,xylem water solutions españa s.l.u.,xylem water solutions españa,Madrid,CIF,s.l.,Sociedades de responsabilidad limitada,[https://contrataciondelestado.es/sindicacion/...
267239,b99289795,zaragoza y eventos s.l.,zaragoza y eventos,Zaragoza,CIF,s.l.,Sociedades de responsabilidad limitada,[https://contrataciondelestado.es/sindicacion/...
267240,a28011153,zardoya otis s.a.,zardoya otis,Madrid,CIF,s.a.,Sociedades anónimas,[https://contrataciondelestado.es/sindicacion/...
267241,b45273406,zenith toledo,zenith toledo,Toledo,CIF,s.l.,Sociedades de responsabilidad limitada,[https://contrataciondelestado.es/sindicacion/...


In [580]:
# Spot check: list the companies whose FullName contains "reczyclia".
df_company[df_company['FullName'].str.contains("reczyclia")]

Unnamed: 0,NIF,FullName,Name,Province,NIFtype,CompanyType,CompanyDescription,id_tender


## Get utes names split

### Auxiliary functions

In [10]:
def remove_substring(string, substring):
    """
    Replace every literal occurrence of a substring with a single space

    Parameters
    ----------
    string : str
        String to be processed
    substring : str
        Literal text (not a regular expression) to take out of string

    Returns
    -------
    result : str
        Copy of string where each occurrence of substring is now " "
    """
    return string.replace(substring, " ")


def eliminate_patterns(text):
    """
    Strip boilerplate fragments that show up in many ute names
    (e.g., "ley" + month, "expte" + number, "u.t.e." prefix/suffix, etc.)

    Parameters
    ----------
    text : str
        String to be processed

    Returns
    -------
    result : str
        Input with the regex fragments deleted, the literal fragments replaced
        by a space, and leading/trailing whitespace removed
    """

    # Regular expressions (case-insensitive) deleted from the input string
    boilerplate_patterns = (
        r"ley.*?(enero|febrero|marzo|abril|mayo|junio|julio|agosto|septiembre|octubre|noviembre|diciembre)",
        r"revista.*?\d+",
        r"expte.*?\d+",
        r"ley.*\d+",
        r"contr.*\d+",
    )
    boilerplate_re = re.compile("|".join(boilerplate_patterns), flags=re.IGNORECASE)
    result = boilerplate_re.sub("", text)

    # Literal fragments, replaced one after another in this order
    literal_fragments = ("u.t.e.", "u.t.e", "union temporal de empresas",
                         "compromiso de ", "abreviadamente", "compromiso")
    for fragment in literal_fragments:
        result = remove_substring(result, fragment)

    return result.strip()


def remove_first_substring(text, substring):
    """Delete the first occurrence of a substring, leaving later ones untouched.

    Parameters
    ----------
    text : str
        Text to be processed
    substring : str
        Literal text whose first occurrence is deleted

    Returns
    -------
    result : str
        text without the first occurrence of substring; text itself when
        substring does not occur
    """

    return text.replace(substring, "", 1)


def extract_difference(str1, str2):
    """Return the words of str1 that do not appear in str2.

    Parameters
    ----------
    str1 : str
        String whose words are kept
    str2 : str
        String whose words are excluded

    Returns
    -------
    result : str
        Distinct whitespace-separated words of str1 that are not words of
        str2, joined by single spaces in order of first appearance in str1
    """

    excluded_words = set(str2.split())
    # dict.fromkeys de-duplicates while preserving first-appearance order
    kept_words = [word for word in dict.fromkeys(str1.split())
                  if word not in excluded_words]
    return ' '.join(kept_words)


def get_splits_additional_rules(ute_rem, split_rules):
    """Splits a given string (ute_rem) based on set of split_rules and returns a list of split names.

    Parameters
    ----------
    ute_rem : str
        The input string to be split using additional split rules.
    split_rules : list
        A list of split rules to be applied on the input string.

    Returns
    -------
    split_names : list
        A list of strings resulting from splitting the input string based on the given rules.
    """

    # Filter out only the relevant split rules that exist in ute_rem
    split_rules = [rule for rule in split_rules if rule in ute_rem]

    if len(split_rules) == 1:
        # If only one rule is found, split ute_rem using that rule and strip any leading/trailing whitespaces
        split_names = [name.strip() for name in ute_rem.split(split_rules[0])]
    else:
        # If multiple split rules are found, sort them based on their first occurrence in the ute_rem
        split_rules.sort(key=lambda rule: ute_rem.index(
            rule) if rule in ute_rem else len(ute_rem))

        # Create a list of split names by applying each split rule on the remaining ute_rem and stripping leading whitespaces
        split_names = []
        for i, split_rule in enumerate(split_rules):
            found = ute_rem.split(split_rule)[0].strip()
            split_names.append(found)
            ute_rem = remove_substring(ute_rem, found)
            if i == len(split_rules)-1:
                split_names.append(ute_rem)

    return split_names


def _split_after_rules(text, rules):
    """Cut text right after each occurrence of a company-type suffix in rules; the trailing remainder is the last item."""
    distinct_rules = sorted(set(rules), key=len, reverse=True)
    if not distinct_rules:
        return [text]

    suffix_re = re.compile("|".join(re.escape(rule) for rule in distinct_rules))
    pieces = []
    start = 0
    for match in suffix_re.finditer(text):
        pieces.append(f"{text[start:match.start()].strip()} {match.group()}")
        start = match.end()
    pieces.append(text[start:])
    return pieces


def _clean_split_name(name):
    """Strip separator characters and dangling standalone "y" conjunctions from both ends of name."""
    cleaned, previous = name, None
    while cleaned != previous:
        previous = cleaned
        cleaned = cleaned.strip("-_ ,+")
        # Only a free-standing "y" is removed; str.strip("...y") would also eat
        # the first/last letter of real names such as "sony"
        cleaned = re.sub(r"^y(?=[-_ ,+\s]|$)|(?<=[-_ ,+.\s])y$", "", cleaned)
    return cleaned


def split_names(row):
    """Split the full name of a ute into the names of the companies that form it.

    Relies on the notebook-level frames df_company (its CompanyType values are
    used as company-type suffixes) and df_not_in_utes (its FullName values are
    used as a dictionary of known company names).

    Parameters
    ----------
    row : str
        Full name of the ute

    Returns
    -------
    split_names : list
        Names found in the ute, cleaned of separators. Pieces of one character
        or less, and pieces that are empty after cleaning, are discarded.
    """

    # Remove fixed patterns found in several utes (e.g., "ley" + month, "u.t.e." prefix/suffix, etc.)
    ute_rem = eliminate_patterns(row)

    # Look for utes in the form "letter - number word" or "word letter - number". If so, we keep it as it is
    if re.search(r"([a-zA-Z]) - (\d+) (\S+)", ute_rem) or re.search(r"(\b\w+\b) ([a-zA-Z]) - (\d+)", ute_rem):
        return [ute_rem.strip()]

    # Possible split rules
    look_first_split_rules = \
        [
            "s.l.p.", "s.l.p", "s.l.l.", "s.a.u.", "s.l.u.",
            "s.l.u", "s.l.u,", "slu", "s.l", "c.o.o.p.", "s.a",
            "sl.", "sccl", "s.coop.pequeña"
        ]
    company_types = [el for el in df_company.CompanyType.unique().tolist()
                     if isinstance(el, str) and el != 'u.t.e.']
    additional_split_rules = ["-", "_", ",", "+"]
    conjunction_re = re.compile(r'\s+y\s+')

    # Sort according to size and escape regex metacharacters in split_rules
    # (a space in a rule matches any whitespace character)
    split_rules = sorted(company_types + look_first_split_rules, key=len, reverse=True)
    escaped_split_rules = [r"\s".join(re.escape(part) for part in rule.split(" "))
                           for rule in split_rules]

    # Find rules: longer rules are counted and removed first so that shorter
    # rules contained in them are not counted again
    found_rules = []
    aux = ute_rem
    for rule, escaped_rule in zip(split_rules, escaped_split_rules):
        for _ in re.findall(escaped_rule, aux):
            found_rules.append(rule)
            aux = remove_first_substring(aux, rule)

    # Check if there are additional rules
    has_additional_rule = any(rule in ute_rem for rule in additional_split_rules)

    # If there are found rules according to ute type
    if found_rules:

        # Remove additional_split_rules followed by an ute split_rule
        for element in split_rules:
            for subelement in additional_split_rules:
                ute_rem = ute_rem.replace(f'{subelement}{element}', f' {element}')

        # Check if there are still additional rules
        has_additional_rule = any(rule in ute_rem for rule in additional_split_rules)

        # If there is only one rule and it is at the end of the string
        if len(found_rules) == 1 and ute_rem.endswith(found_rules[0]):
            if has_additional_rule:
                # If there are additional rules, we split the string according to them
                names = get_splits_additional_rules(ute_rem, additional_split_rules)
            elif conjunction_re.search(ute_rem):
                # Split on the word "y" (surrounded by whitespace), never on the letter
                names = [piece.strip() for piece in conjunction_re.split(ute_rem)]
            else:
                names = [ute_rem.strip()]

        elif has_additional_rule:
            # Split on the separators first, then cut each piece after its suffixes
            names = []
            for piece in get_splits_additional_rules(ute_rem, additional_split_rules):
                if any(rule in piece for rule in found_rules):
                    names.extend(_split_after_rules(piece, found_rules))
                else:
                    names.append(piece)

        else:
            names = _split_after_rules(ute_rem, found_rules)

    elif has_additional_rule:
        names = get_splits_additional_rules(
            ute_rem, additional_split_rules + [" y "])

    elif conjunction_re.search(ute_rem):
        names = [piece.strip() for piece in conjunction_re.split(ute_rem)]

    else:
        # No separator at all: look for known company names among all the
        # contiguous word sequences of the ute
        tokens = ute_rem.split()
        substrings = [" ".join(tokens[i:j])
                      for i in range(len(tokens))
                      for j in range(i + 1, len(tokens) + 1)]

        # One vectorised lookup instead of rebuilding the name list per candidate
        full_names = df_not_in_utes.FullName
        known_names = set(full_names[full_names.isin(substrings)])
        names = [el for el in substrings if el in known_names]

        if len(names) == 1:
            names.append(extract_difference(ute_rem, names[0]))
        elif not names:
            names = [ute_rem.strip()]

    # Cleaning errors after splitting
    cleaned_names = []
    for name in names:
        if len(name.strip()) <= 1:
            continue
        cleaned = _clean_split_name(name)
        if cleaned:
            cleaned_names.append(cleaned)

    return cleaned_names

In [None]:
# Split every ute name into member names and record how many were found.
df_utes["split_names"] = df_utes["FullName"].map(split_names)
df_utes["utes_length"] = df_utes["split_names"].map(len)

In [42]:
# Show, untruncated, the utes for which no name could be extracted.
with pd.option_context('display.max_rows', None,
                       'display.max_columns', None,
                       'display.precision', 3,
                       'display.max_colwidth', None):
    display(df_utes.loc[df_utes["utes_length"] < 1, ["FullName", "split_names"]])

Unnamed: 0,FullName,split_names
260127,u.t.e. ley 18 82 numero 1,[]


## Get companies' utes from splits

In [None]:
def get_company_utes_from_splits(row, df_utes, threshold=95, scorer=None):
    """Find the utes that have a split name similar to a company's full name.

    Parameters
    ----------
    row : pandas.Series
        Company row; only its FullName attribute is read
    df_utes : pandas.DataFrame
        Utes with a FullName column and a split_names column holding, for each
        ute, the list of names extracted from it
    threshold : int, optional
        A split name matches when its score is strictly greater than this value
    scorer : callable, optional
        Function (company_name, split_name) -> score. Defaults to fuzz.ratio

    Returns
    -------
    utes : list
        FullName of every matching ute, in df_utes order. A ute is listed once
        even when several of its split names match.
    """
    if scorer is None:
        scorer = fuzz.ratio

    company_name = row.FullName
    return [
        ute_name
        for ute_name, members in zip(df_utes.FullName.values, df_utes.split_names.values)
        if any(scorer(company_name, member) > threshold for member in members)
    ]

In [None]:
# For every company, collect the utes whose split names match it, then keep the hits.
df_not_in_utes["utes"] = df_not_in_utes.apply(
    get_company_utes_from_splits, axis=1, df_utes=df_utes)
df_not_in_utes["utes_length"] = df_not_in_utes["utes"].map(len)
df_not_in_utes.loc[df_not_in_utes["utes_length"] > 0]

## Get companies' utes from raw

In [4]:
def encontrar_substring_similar(main_string, substring, threshold=95):
    """Tell whether main_string contains a whole-word chunk similar to substring.

    Every chunk of main_string with the same length as substring is considered.
    A chunk counts only if it occurs in main_string delimited by word
    boundaries and its fuzz.ratio against substring is at least threshold.

    Parameters
    ----------
    main_string : str
        Text to search in
    substring : str
        Text to look for
    threshold : int, optional
        Minimum fuzz.ratio score for a chunk to be accepted

    Returns
    -------
    found : bool
        True as soon as one chunk qualifies, False otherwise
    """
    window = len(substring)
    last_start = len(main_string) - window

    start = 0
    while start <= last_start:
        candidate = main_string[start:start + window]
        start += 1

        is_whole_word = re.search(r'\b' + re.escape(candidate) + r'\b', main_string) is not None
        if is_whole_word and fuzz.ratio(candidate, substring) >= threshold:
            return True

    return False

def get_company_utes(row, df_utes):
    """Find the utes whose full name approximately contains a company's name.

    Parameters
    ----------
    row : pandas.Series
        Company row; only row['Name'] is read
    df_utes : pandas.DataFrame
        Utes with a FullName column

    Returns
    -------
    utes : list
        FullName of every ute for which encontrar_substring_similar reports a
        match with the company name, in df_utes order
    """
    company_name = row['Name']
    return [
        full_name
        for full_name in df_utes['FullName']
        if encontrar_substring_similar(full_name, company_name)
    ]

In [None]:
# For every company, collect the utes whose raw full name contains it, then keep the hits.
df_not_in_utes["utes"] = df_not_in_utes.apply(
    get_company_utes, axis=1, df_utes=df_utes)
df_not_in_utes["utes_length"] = df_not_in_utes["utes"].map(len)
df_not_in_utes.loc[df_not_in_utes["utes_length"] > 0]

In [None]:
# Persist the enriched company table.
# NOTE(review): the inputs are read from "../data/" while this writes under
# "data/" relative to the working directory — confirm the intended location.
df_not_in_utes.to_parquet("data/df_not_in_utes_enriched.parquet")

### Sample

In [6]:
# Run the raw-name matching on a 0.5% sample (drawn with replacement, fixed seed).
df_aux = df_not_in_utes.sample(frac=0.005, replace=True, random_state=1)
df_aux["utes"] = df_aux.apply(get_company_utes, axis=1, df_utes=df_utes)
df_aux["utes_length"] = df_aux["utes"].map(len)

# Companies matched to more than one ute, shown untruncated
with pd.option_context('display.max_rows', None,
                       'display.max_columns', None,
                       'display.precision', 3,
                       'display.max_colwidth', None):
    display(df_aux.loc[df_aux["utes_length"] > 1, ["FullName", "utes"]])

# Fraction of sampled companies without any ute
print(int((df_aux["utes_length"] < 1).sum()) / len(df_aux))

Unnamed: 0,FullName,utes
132920,woover trends s.l.u.,"[u.t.e. vodafone españa s.a.u. everis spain s.l.u. woover trends s.l.u., u.t.e. deloitte consulting s.l.u. woover trends s.l.u.]"
145030,rotorsun s.l.,"[orthem servicios y actuaciones ambientales s.a.u. y rotorsun s.l., rotorsun s.l.y orthem s.a.u.ute, u.t.e. rotorsun,s.l.y ortem s.a.u.]"
178526,efs mantenimiento y servicios tecnicos s.l.,"[u.t.e. efs mantenimiento y servicios tecnicos s.l.u. - adelte transporte y servicios efs s.l., u.t.e. efs mantenimiento y servicios tecnicos s.l.u. eurofred s.a., u.t.e. efs mantenimiento y servicios técnicos s.l.u u.t.e. ley 18 82 26 - 05 u.t.e. efs adelte cond.mad]"
157991,"ingeniería,obras y tecnología europea s.l.","[u.t.e. ingenieria,obras y tecnologia europea s.l. jarypark s.l., u.t.e. ecocivil electromur g.e.s.l.e ingeniería,obras y tecnología europea s.l. abreviadamente u.t.e. vaguada .]"


0.9900763358778626


## Tests

In [51]:
# Same check on a second 0.5% sample (seed 2), using get_company_utes2.
# The former first line filtered df_company by 'orega s.l' but was overwritten
# immediately by the sample below, so it has been dropped.
# NOTE(review): get_company_utes2 is not defined in the visible part of this
# notebook — confirm it exists before running this cell on a fresh kernel.
df_aux = df_not_in_utes.sample(frac=0.005, replace=True, random_state=2)
df_aux["utes"] = df_aux.apply(lambda row: get_company_utes2(row, df_utes), axis=1)
df_aux['utes_length'] = df_aux.utes.apply(len)
with pd.option_context('display.max_rows', None,
                       'display.max_columns', None,
                       'display.precision', 3,
                       'display.max_colwidth', None):
    # Companies matched to more than one ute
    display(df_aux[df_aux.utes_length > 1][["FullName", "utes"]])
# Number of sampled companies without any ute
print(len(df_aux[df_aux.utes_length < 1]))

Unnamed: 0,FullName,utes
97014,construcciones sevilla nevado s.a.,"[u.t.e. construcciones sevilla nevado s.a. sebastián sevilla nevado s.l.u. josé carmona e hijos s.l., construcciones sevilla nevado s.a. cimasa empresa de construccion e ingenieria s.l. u.t.e]"
99542,indra sistemas s.a.,"[indra sistemas s.a.e. instalaciones, u.t.e. indra sistemas s.a.y alisea esco s.a u.t.e. jaen, kapsch trafficom transportation s.a. - indra sistgemas s.a. worldline iberia s.a u.t.e., u.t.e. telefonica de españa s.a.u. indra sistemas s.a., indra sistemas s.a. connectis consulting services,s.a.u.ute, u.t.e. indra sistemas s.a. - sistemas y montajes industriales s.a., u.t.e. indra sistemas s.a.y sotel it solutions s.l., indra sistemas s.a. aerum aviation group,s.l.ute, u.t.e. indra sistemas s.a.integracion tecnologica empresarial u.t.e. ley 18 1982 26 mayo 201853a1, u.t.e. alfatec sistemas s.l. - indra sistemas s.a.]"
106776,tecnologia de firmes s.a.,"[u.t.e. tecnologia de firmes s.a. - guerola transer s.l.u., u.t.e. tecnologia de firmes s.a. - asfaltecno obras y servicios s.a., u.t.e. tecnologia de firmes s.a. - construcciones y obras llorente s.a., tecnologia de firmes s.a. y eiffage infraestructuras s.a.u u.t.e., u.t.e. tecnologia de firmes s.a. - global de electricidad e instalaciones s.l. - artectum s.l., u.t.e. tecnologia de firmes s.a. y constructora consvial s.l u.t.e. ampliacion sala aeropuerto v, u.t.e. guerola transer s.l.u. - tecnologia de firmes s.a., u.t.e. tecnologia de firmes s.a. - gerola transfer s.l.u., u.t.e. tecnologias de firme s.a. arcadi ingenieria e instalaciones innovadoras s.l., u.t.e. tecnologa de firmes s.a. montajes electricos antonio godoy s.l., u.t.e. tecnologia de firmes s.a. eiffage infraestructuras s.a.u., asfaltos y const elsan s.a. tecnologias de firmes s.a.]"


1300


In [125]:
# Look up two specific companies by exact FullName.
df_aux = df_company[df_company['FullName'].isin(
    ['civit s.l.', 'construcciones sevilla nevado s.a.'])]
df_aux

Unnamed: 0,NIF,FullName,Name,Province,NIFtype,CompanyType,CompanyDescription,id_tender
97014,a10036424,construcciones sevilla nevado s.a.,construcciones sevilla nevado,Cáceres,CIF,s.a.,Sociedades anónimas,[https://contrataciondelestado.es/sindicacion/...
118563,b08452799,civit s.l.,civit,Barcelona,CIF,s.l.,Sociedades de responsabilidad limitada,[https://contrataciondelestado.es/sindicacion/...


In [None]:
# Re-run the raw-name matching on the seed-1 sample and list every sampled company.
df_aux = df_not_in_utes.sample(frac=0.005, replace=True, random_state=1)
df_aux["utes"] = df_aux.apply(get_company_utes, axis=1, df_utes=df_utes)
df_aux["utes_length"] = df_aux["utes"].map(len)
with pd.option_context('display.max_rows', None,
                       'display.max_columns', None,
                       'display.precision', 3,
                       'display.max_colwidth', None):
    display(df_aux.loc[:, ["FullName", "utes"]])

# Number of sampled companies without any ute
print(int((df_aux["utes_length"] < 1).sum()))

In [None]:
# Sampled companies matched to more than one ute, shown untruncated.
with pd.option_context('display.max_rows', None,
                       'display.max_columns', None,
                       'display.precision', 3,
                       'display.max_colwidth', None):
    display(df_aux.loc[df_aux["utes_length"] > 1, ["FullName", "utes"]])

In [None]:
# found before 1583
# Full run of get_company_utes2 over all companies; overwrites the "utes" and
# "utes_length" columns written by the earlier enrichment cells.
# NOTE(review): get_company_utes2 is not defined in the visible part of this
# notebook — confirm it exists before running on a fresh kernel.
df_not_in_utes["utes"] = df_not_in_utes.apply(lambda row: get_company_utes2(row, df_utes), axis=1)
df_not_in_utes['utes_length'] = df_not_in_utes.utes.apply(len)
df_not_in_utes[(df_not_in_utes.utes_length>0)]

In [None]:
# Same get_company_utes2 run, but on a copy so df_not_in_utes is left untouched.
# NOTE(review): get_company_utes2 is not defined in the visible part of this
# notebook — confirm it exists before running on a fresh kernel.
df_aux2 = df_not_in_utes.copy()
df_aux2["utes"] = df_aux2.apply(lambda row: get_company_utes2(row, df_utes), axis=1)
df_aux2['utes_length'] = df_aux2.utes.apply(len)
df_aux2[(df_aux2.utes_length>0)]

In [None]:
# Persist the enriched company table (same target file as the earlier save cell).
# NOTE(review): the inputs are read from "../data/" while this writes under
# "data/" relative to the working directory — confirm the intended location.
df_not_in_utes.to_parquet("data/df_not_in_utes_enriched.parquet")

## Check coverage

In [46]:
# Load a previously saved enrichment and keep the companies with at least one ute.
# NOTE(review): the visible save cells write "df_not_in_utes_enriched.parquet",
# not "df_not_in_utes_enriched1.parquet" — confirm where this file comes from.
df_not_in_utes_enriched1 = pd.read_parquet(pathlib.Path("data/df_not_in_utes_enriched1.parquet"))
filtered = df_not_in_utes_enriched1[(df_not_in_utes_enriched1.utes_length>0)]
filtered

Unnamed: 0,NIF,FullName,Name,Province,NIFtype,CompanyType,CompanyDescription,id_tender,utes,utes_length
943,01939568r,alejandro,alejandro,,DNI,,,[https://contrataciondelestado.es/sindicacion/...,[xoaquín monteagudo romero alejandro martín ló...,2
1229,02497147z,alicia torres gonzález,alicia torres gonzález,,DNI,,,[https://contrataciondelestado.es/sindicacion/...,[u.t.e. alicia torres gonzález y artec 4 s.l.p.],1
8765,07270702b,manuel jesus piriz gil,manuel jesus piriz gil,,DNI,,,[https://contrataciondelestado.es/sindicacion/...,[manuel jesus piriz gil - maria navarro cifuen...,1
12498,09331668q,joaquín,joaquín,,DNI,,,[https://contrataciondelestado.es/sindicacion/...,[josep maria mezquida casases joaquín solé mir...,1
13477,10017354a,eloy santin castañeiras,eloy santin castañeiras,,DNI,,,[https://contrataciondelestado.es/sindicacion/...,[alberto jose garcía martínez y eloy santín ca...,1
...,...,...,...,...,...,...,...,...,...,...
267192,a79524054,urbaser s.a.,urbaser,Madrid,CIF,s.a.,Sociedades anónimas,[https://contrataciondelestado.es/sindicacion/...,[u.t.e. biziss movilidad urbana sostenible s.l...,13
267215,a23434970,vialterra infraestructuras s.a.,vialterra infraestructuras,Jaén,CIF,s.a.,Sociedades anónimas,[https://contrataciondelestado.es/sindicacion/...,[u.t.e. vialterra infraestructuras s.a. kerkro...,4
267224,a80907397,vodafone españa s.a.u.,vodafone españa,Madrid,CIF,s.a.,Sociedades anónimas,[https://contrataciondelestado.es/sindicacion/...,[u.t.e. vodafone españa s.a.u. everis spain s....,6
267225,a62186556,vodafone ono s.a.u.,vodafone ono,Barcelona,CIF,s.a.,Sociedades anónimas,[https://contrataciondelestado.es/sindicacion/...,"[u.t.e. vodafone españa s.a.u. vodafone ono,s....",1


In [52]:
# Coverage: distinct ute names found across all companies, as a percentage of
# the number of rows in df_utes.
utes_found = [ute for matched_utes in filtered["utes"].tolist() for ute in matched_utes]
utes_matching = (len(set(utes_found)) / len(df_utes)) * 100
print(utes_matching)

38.70905956713273


In [460]:
# Load the results stored in utes_spark.parquet (reusing the previous variable
# names) and keep the companies with at least one ute.
df_not_in_utes_enriched1 = pd.read_parquet(pathlib.Path("../data/utes_spark.parquet"))
df_not_in_utes_enriched1["utes_length"] = df_not_in_utes_enriched1["utes"].map(len)
filtered = df_not_in_utes_enriched1.loc[df_not_in_utes_enriched1["utes_length"] > 0]
filtered

Unnamed: 0,NIF,FullName,Name,Province,NIFtype,CompanyType,CompanyDescription,id_tender,utes,utes_length
637,01178902z,rosa granados,rosa granados,,DNI,,,[https://contrataciondelestado.es/sindicacion/...,[gesnaer consulting s.l.n.e. rosa granados u.t...,1
943,01939568r,alejandro,alejandro,,DNI,,,[https://contrataciondelestado.es/sindicacion/...,"[u.t.e. antonio lópez sánchez,alejandro martín...",5
1213,02459022t,andres perea ortega,andres perea ortega,,DNI,,,[https://contrataciondelestado.es/sindicacion/...,[euroestudios s.l. andres perea ortega u.t.e.],1
1229,02497147z,alicia torres gonzález,alicia torres gonzález,,DNI,,,[https://contrataciondelestado.es/sindicacion/...,[u.t.e. alicia torres gonzález y artec 4 s.l.p.],1
8493,07016148k,construcciones perez,construcciones perez,,DNI,,,[https://contrataciondelestado.es/sindicacion/...,[u.t.e. construcciones perez jimenez s.l. alba...,2
...,...,...,...,...,...,...,...,...,...,...
261969,a79524054,urbaser s.a.,urbaser,Madrid,CIF,s.a.,Sociedades anónimas,[https://contrataciondelestado.es/sindicacion/...,[u.t.e. ambitec servicios ambientales s.a.u. y...,18
261992,a23434970,vialterra infraestructuras s.a.,vialterra infraestructuras,Jaén,CIF,s.a.,Sociedades anónimas,[https://contrataciondelestado.es/sindicacion/...,[u.t.e. vialterra infraestructuras s.a. desarr...,4
262001,a80907397,vodafone españa s.a.u.,vodafone españa,Madrid,CIF,s.a.,Sociedades anónimas,[https://contrataciondelestado.es/sindicacion/...,"[u.t.e. vodafone españa s.a.u. vodafone ono,s....",8
262002,a62186556,vodafone ono s.a.u.,vodafone ono,Barcelona,CIF,s.a.,Sociedades anónimas,[https://contrataciondelestado.es/sindicacion/...,"[u.t.e. vodafone españa s.a.u. vodafone ono,s....",2


In [463]:
# Spot check: matched companies whose FullName contains "novadays".
filtered[filtered['FullName'].str.contains("novadays")]

Unnamed: 0,NIF,FullName,Name,Province,NIFtype,CompanyType,CompanyDescription,id_tender,utes,utes_length


In [61]:
# Coverage: distinct ute names found across all companies, as a percentage of
# the number of rows in df_utes.
utes_found = [ute for matched_utes in filtered["utes"].tolist() for ute in matched_utes]
utes_matching = (len(set(utes_found)) / len(df_utes)) * 100
print(utes_matching)

56.61750622486114


In [None]:
# Show the first 30 matched companies with no row/column/width truncation.
with pd.option_context('display.max_rows', None,
                       'display.max_columns', None,
                       'display.precision', 3,
                      'display.max_colwidth',None
                       ):
    display(filtered.head(30))

## Check UTEs sent

In [583]:
# Load the example spreadsheet used for the checks in this section.
df_checks = pd.read_excel("../data/ejemplos_enrique.xlsx")
df_checks

  warn(msg)


Unnamed: 0.1,Unnamed: 0,Nº EXPEDIENTE,TÍTULO DEL CONTRATO,EMPRESAS PARTICIPANTES RAZÓN SOCIAL,NIF,EMPRESAS QUE LICITAN EN UTE,NIF COMPONENTES UTE,PYME,ENLACE AL DOCUMENTO ACTA APERTURA SOBRE A,ACTA SOBRE A IDENTIFICA UTE,TIPO DE ENTIDAD,TIPO DE SOCIEDAD,EMPRESA ADJUDICATARIA,ENLACE AL DOCUMENTO ANUNCIO FORMALIZACIÓN
0,1.0,300/2021/00680 LOTE 1,PRESTACIÓN DE SERVICIOS PARA LA PUESTA EN MARC...,GLOBAL INCUBFATOR SL,B85432037,,,SI,https://contrataciondelestado.es/wps/wcm/conne...,,Empresa,SL,NO,DOC_FORM2023-018297.pdf (contrataciondelestado...
1,,,,"UTE TYPSA, ESTADÍSTICA Y SERVICIOS SLU Y MAGIC...",TEMP 09454,TÉCNICA Y PROYECTOS SA (TYPSA),A28171288,NO,,"SÍ (Pág 3 ""(…) UTE TYPSA ESTADÍSTICA Y SERVICI...",Empresa,SLU,NO,
2,,,,,,MAGIC FENNEC SL,B99550667,SI,,,Empresa,SL,,
3,,,,"UTE VODAFONE ESPAÑA, SAU",U72479678,VODAFONE ESPAÑA SAU,A80907397,NO,,"SÍ (Pág 3 ""(…) UTE GAMERA, VODAFONE OWW GAMERS...",Empresa,SAU,SI,
4,,,,,,GAMERA NEST SL,B2882956K,SI,,,Empresa,SL,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,,,,"SERLINGO SOCIAL, S.L.U.",B86868593,,,SÍ,,,Empresa,Centro Especial de Empleo,NO,
99,,,,"TREBOL INTEGRACION SOCIAL, S.L.",B82349457,,,SÍ,,,Empresa,Centro Especial de Empleo,NO,
100,,,,,,,,,,,,,,
101,18.0,300/2021/00569,“DISEÑO Y PUESTA EN MARCHA DE UN\nLABORATORIO ...,FUNDACION JUAN XXIII-RONCALLI PARA LA DISCAPAC...,G78280880,,,NO,https://contrataciondelestado.es/wps/wcm/conne...,,Fundación,,SÍ,https://contrataciondelestado.es/wps/wcm/conne...


In [584]:
# Keep only the participant-name and member-company columns, under short names.
df_checks = (
    df_checks
    .loc[:, ["EMPRESAS PARTICIPANTES RAZÓN SOCIAL", "EMPRESAS QUE LICITAN EN UTE"]]
    .rename(columns={
        "EMPRESAS PARTICIPANTES RAZÓN SOCIAL": "utes",
        "EMPRESAS QUE LICITAN EN UTE": "company",
    })
)
df_checks

Unnamed: 0,utes,company
0,GLOBAL INCUBFATOR SL,
1,"UTE TYPSA, ESTADÍSTICA Y SERVICIOS SLU Y MAGIC...",TÉCNICA Y PROYECTOS SA (TYPSA)
2,,MAGIC FENNEC SL
3,"UTE VODAFONE ESPAÑA, SAU",VODAFONE ESPAÑA SAU
4,,GAMERA NEST SL
...,...,...
98,"SERLINGO SOCIAL, S.L.U.",
99,"TREBOL INTEGRACION SOCIAL, S.L.",
100,,
101,FUNDACION JUAN XXIII-RONCALLI PARA LA DISCAPAC...,


In [585]:
# Number of entries currently held in utes_found.
len(utes_found)

16

In [588]:
# Get the name of the utes that went to NaN after conversion from Excel: a row
# without a ute name takes the name of the closest previous row that has one.
# ffill replaces the former iterrows() loop, whose writes to the yielded row are
# not guaranteed to reach the DataFrame and whose iloc[index-1] lookup wrapped
# around to the last row when the first row was empty.
# We do not consider ute/company pairs with a NaN value in company (dropna).
df_checks = df_checks.assign(utes=df_checks["utes"].ffill()).dropna()

# Save utes in a list for processing with our methods
utes_found = df_checks.utes.unique().tolist()

# Create a new dataframe where for each company, we have a list with the utes in which it has participated
new_df_checks = df_checks.groupby('company')['utes'].apply(list).reset_index()
new_df_checks.columns = ['company', 'true_utes']

# Extract list of companies
companies_list = new_df_checks.company.values.tolist()

new_df_checks

Unnamed: 0,company,true_utes
0,ADD4U SOLUCIONES PARA EL DESARROLLO S.L.,[UTE HOLISTIC - LIGHTHOUSE - ADD4U]
1,ADD4U SOLUCIONES PARA GESTION Y DESARROLLO SL.,[UTE ADD4U - SIFDI]
2,ALTRAPO LAB S COOP MAD,[UTE PIC FUENCARRAL]
3,ARACAS DE MANTENIMIENTO INTEGRAL S.A.,[UTE ARACAS DE MANTENIMIENTO INTEGRAL S.A - AR...
4,"ARACAS MANTENIMIENTO Y SERVICIO, S.L.",[UTE ARACAS DE MANTENIMIENTO INTEGRAL S.A - AR...
5,BEL CONSULTORES S.L.,[UTE LICITACIONES BEL CONSULTORES S.L. - FUNDA...
6,"CONSTRUCCIONES FRANCISCO CARRASCO NOVILLO, S.A",[UTE CONSTRUCCIONES FRANCISCO CARRASCO NOVILLO...
7,CROWE ACCELERA MANAGEMENT,[UTE UNIVERSIDAD REY JUAN CARLOS - CROWE ACEL...
8,FERNÁNDEZ MOLINA OBRAS Y SERVICIOS S.A.,[UTE CONSTRUCCIONES FRANCISCO CARRASCO NOVILLO...
9,FUNDACION DELEGACIÓN FUNDACIÓN FINNOVA,[UTE FUNDACION DELEGACIÓN FUNDACIÓN FINNOVA - ...


In [612]:
# Company-type suffixes that must never count as a distinctive word of a name.
# Non-string entries (the missing value for companies without a CompanyType) are
# filtered out explicitly. The former positional [1:] slice presumably targeted
# that missing value and only worked while it was the first unique value.
# NOTE(review): confirm that [1:] was not meant to drop a real company type.
SEPS_UTES = (
    [company_type for company_type in df_company.CompanyType.unique().tolist()
     if isinstance(company_type, str)]
    + ["s.l.p.", "s.l.p", "s.l.l.", "s.a.u.", "s.l.u.", "s.l.u", "s.l.u,", "slu", "s.l", "c.o.o.p.", "s.a", "sl.", "sccl", "s.coop.pequeña"]
    + ["sl", "slu", "s."]
)

# Generic words and separator characters that are ignored as well
OTHERS = ["UTE", "ute", "u.t.e.", "servicio", "servicios", "obras", "fundación", "información", "técnica", "proyectos", "y"] + ["-", "_", ",", "+"]
SEPS = SEPS_UTES + OTHERS

In [613]:
def encontrar_substring_similar(main_string, substring, threshold=90):
    """Tell whether main_string matches substring, fuzzily or through a shared word.

    Two checks are made in order:

    1. Every chunk of main_string as long as substring is compared with
       fuzz.ratio; a score of at least threshold is a match.
    2. If main_string contains punctuation or whitespace, it is lower-cased and
       cut into tokens. A token longer than three characters that is not in
       SEPS and equals one of the lower-cased words of substring (with commas
       and parentheses stripped) is a match.

    Parameters
    ----------
    main_string : str
        Text to search in
    substring : str
        Text to look for
    threshold : int, optional
        Minimum fuzz.ratio score accepted by the first check

    Returns
    -------
    found : bool
        True if either check succeeds, False otherwise
    """
    window = len(substring)

    # Check 1: sliding window (the range is empty when substring is longer than main_string)
    for start in range(len(main_string) - window + 1):
        if fuzz.ratio(main_string[start:start + window], substring) >= threshold:
            return True

    # Check 2: shared distinctive word, only for strings with punctuation or whitespace
    if re.search(r'([.,!?;:\s])', main_string) is None:
        return False

    tokens = [piece.strip()
              for piece in re.split(r'([.,!?;:\s-])', main_string.lower())
              if piece.strip()]  # drop empty strings

    target_words = None
    for token in tokens:
        token = token.strip(',').strip()
        if token in SEPS or len(token) <= 3:
            continue
        if target_words is None:
            # Built on first use only, so it is skipped when no token qualifies
            target_words = [word.strip(',').strip().strip(")").strip("(")
                            for word in substring.lower().split()]
        if token in target_words:
            return True

    return False

In [614]:
# Evaluate every (company, ute) pair of the check set with the matching function.
salida = [
    {
        'ute': ute,
        'company_name': company_name,
        'ratio': encontrar_substring_similar(ute, company_name),
    }
    for company_name in companies_list
    for ute in utes_found
]

# For each company (in companies_list order), the utes it was matched to
all_utes_per_company = [
    [dato['ute'] for dato in salida
     if dato['company_name'] == company_name and dato['ratio']]
    for company_name in companies_list
]

In [615]:
# Attach the utes found by the method next to the true ones and count the
# companies that got at least one match.
new_df_checks["utes_method2"] = all_utes_per_company
new_df_checks["len"] = new_df_checks["utes_method2"].map(len)
print(int((new_df_checks["len"] > 0).sum()))
with pd.option_context('display.max_rows', None,
                       'display.max_columns', None,
                       'display.precision', 3,
                       'display.max_colwidth', None):
    display(new_df_checks)

29


Unnamed: 0,company,true_utes,utes_method2,len
0,ADD4U SOLUCIONES PARA EL DESARROLLO S.L.,[UTE HOLISTIC - LIGHTHOUSE - ADD4U],"[UTE ADD4U - SIFDI, UTE HOLISTIC - LIGHTHOUSE - ADD4U]",2
1,ADD4U SOLUCIONES PARA GESTION Y DESARROLLO SL.,[UTE ADD4U - SIFDI],"[UTE ADD4U - SIFDI, UTE HOLISTIC - LIGHTHOUSE - ADD4U]",2
2,ALTRAPO LAB S COOP MAD,[UTE PIC FUENCARRAL],[],0
3,ARACAS DE MANTENIMIENTO INTEGRAL S.A.,[UTE ARACAS DE MANTENIMIENTO INTEGRAL S.A - ARACAS MANTENIMIENTO Y SERVICIO S.L.],[UTE ARACAS DE MANTENIMIENTO INTEGRAL S.A - ARACAS MANTENIMIENTO Y SERVICIO S.L.],1
4,"ARACAS MANTENIMIENTO Y SERVICIO, S.L.",[UTE ARACAS DE MANTENIMIENTO INTEGRAL S.A - ARACAS MANTENIMIENTO Y SERVICIO S.L.],[UTE ARACAS DE MANTENIMIENTO INTEGRAL S.A - ARACAS MANTENIMIENTO Y SERVICIO S.L.],1
5,BEL CONSULTORES S.L.,[UTE LICITACIONES BEL CONSULTORES S.L. - FUNDACIÓN CIDEAL - SECTOR 3 INFORMACIÓN Y SERVICIO - GESOR],[UTE LICITACIONES BEL CONSULTORES S.L. - FUNDACIÓN CIDEAL - SECTOR 3 INFORMACIÓN Y SERVICIO - GESOR],1
6,"CONSTRUCCIONES FRANCISCO CARRASCO NOVILLO, S.A","[UTE CONSTRUCCIONES FRANCISCO CARRASCO NOVILLO, S.A. - OBRAS Y SERVICIOS F. MOLINA]","[UTE CONSTRUCCIONES FRANCISCO CARRASCO NOVILLO, S.A. - OBRAS Y SERVICIOS F. MOLINA]",1
7,CROWE ACCELERA MANAGEMENT,[UTE UNIVERSIDAD REY JUAN CARLOS - CROWE ACELERA MANAGEMENT],[UTE UNIVERSIDAD REY JUAN CARLOS - CROWE ACELERA MANAGEMENT],1
8,FERNÁNDEZ MOLINA OBRAS Y SERVICIOS S.A.,"[UTE CONSTRUCCIONES FRANCISCO CARRASCO NOVILLO, S.A. - OBRAS Y SERVICIOS F. MOLINA]","[UTE CONSTRUCCIONES FRANCISCO CARRASCO NOVILLO, S.A. - OBRAS Y SERVICIOS F. MOLINA]",1
9,FUNDACION DELEGACIÓN FUNDACIÓN FINNOVA,[UTE FUNDACION DELEGACIÓN FUNDACIÓN FINNOVA - RECZYCLIA],"[UTE FUNDACIÓN DELEGACIÓN FUNDACIÓN FINNOVA - RECZYCLIA, UTE FUNDACION DELEGACIÓN FUNDACIÓN FINNOVA - RECZYCLIA]",2
