In [1]:
import pathlib
import pandas as pd
import numpy as np
import re
import time
import fasttext
fasttext.FastText.eprint = lambda x: None
from huggingface_hub import hf_hub_download
from tqdm import tqdm

In [19]:
def get_lang(
    df: pd.DataFrame,
    col_calculate_on: str
) -> str:
    """Detects the language of a text column in a DataFrame using langdetect.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame containing the text column
    col_calculate_on : str
        Column name to calculate language on

    Returns
    -------
    df : pd.DataFrame
        DataFrame with the language column added to it as "lang"
    """

    def det(x: str) -> str:
        """
        Detects the language of a given text

        Parameters
        ----------
        x : str
            Text whose language is to be detected

        Returns
        -------
        lang : str
            Language of the text
        """

        try:
            lang = model.predict(x)[0][0].replace('__label__','')
        except Exception as e:
            print(e)
            lang = 'Other'
        return lang

    print(f"-- Detecting language...")
    start_time = time.time()

    df['lang'] = df[col_calculate_on].apply(det)

    print(f'-- -- Language detect finished in {(time.time() - start_time)}')

    return df

def normalize_string(s):
    # Convert to lower case
    #s = s.lower()
    # Remove punctuation and special characters
    #s = re.sub(r'[^\w\s]', '', s)
    # Remove extra spaces
    s = re.sub(r'\s+', ' ', s).strip()
    return s

In [3]:
path_texts = pathlib.Path("/export/data_ml4ds/NextProcurement/PLACE/BSC_procesados")
paths = []
paths = [path for path in path_texts.iterdir()]
print(*paths)

/export/data_ml4ds/NextProcurement/PLACE/BSC_procesados/procurements_file_15_containing_252064_docs_mark_fixed.parq /export/data_ml4ds/NextProcurement/PLACE/BSC_procesados/procurements_file_9_containing_250416_docs_mark_fixed.parq /export/data_ml4ds/NextProcurement/PLACE/BSC_procesados/procurements_file_6_containing_251105_docs_mark_fixed.parq /export/data_ml4ds/NextProcurement/PLACE/BSC_procesados/procurements_file_10_containing_251208_docs_mark_fixed.parq /export/data_ml4ds/NextProcurement/PLACE/BSC_procesados/procurements_file_8_containing_251687_docs_mark_fixed.parq /export/data_ml4ds/NextProcurement/PLACE/BSC_procesados/procurements_file_1_containing_251067_docs_mark_fixed.parq /export/data_ml4ds/NextProcurement/PLACE/BSC_procesados/procurements_file_11_containing_252661_docs_mark_fixed.parq /export/data_ml4ds/NextProcurement/PLACE/BSC_procesados/procurements_file_16_containing_92602_docs_mark_fixed.parq /export/data_ml4ds/NextProcurement/PLACE/BSC_procesados/procurements_file_3_c

In [4]:
all_dfs = []
for path_ in tqdm(paths):
    df = pd.read_parquet(path_)
    all_dfs.append(df)

100%|██████████| 17/17 [19:43<00:00, 69.63s/it]


In [5]:
combined_df = pd.concat(all_dfs)

In [6]:
model_path = hf_hub_download(repo_id="facebook/fasttext-language-identification", filename="model.bin")
model = fasttext.load_model(model_path) 

In [7]:
combined_df["clean_extracted"] = combined_df["extracted"].apply(normalize_string)
combined_df["clean_extracted"] = combined_df["clean_extracted"].apply(lambda x: x[:1000])# keep first 1000 tokens to fasten language detection
combined_df = get_lang(combined_df, col_calculate_on = "clean_extracted")
combined_df

-- Detecting language...
-- -- Language detect finished in 194.43360948562622


Unnamed: 0,procurement_id,doc_name,extracted,extracted_tags,texto_heading,embeddings_heading,clean_extracted,lang
2,ntp00114200,ntp00114200_Pliego_Prescripciones_tecnicas_URI,g PLIEGO DE PRESCRI PCI ONES TÉCNI CAS\n S\n \...,"[document, section, heading, section, heading,...",[g PLIEGO DE PRESCRI PCI ONES TÉCNI CAS\n S\n ...,"[[-0.1855995, 0.10908591, 0.1287027, -0.085297...",g pliego de prescri pci ones técni cas s contr...,spa_Latn
10,ntp00729859,ntp00729859_Pliego_Prescripciones_tecnicas_URI,PLIEGO DE PRESCRIPCIONES TÉCNICAS QUE HA DE RE...,"[document, section, heading, section, heading,...",[PLIEGO DE PRESCRIPCIONES TÉCNICAS QUE HA DE R...,"[[-0.06488852, 0.03319587, 0.05720649, -0.2051...",pliego de prescripciones técnicas que ha de re...,spa_Latn
23,ntp01335443,ntp01335443_Pliego_Prescripciones_tecnicas_URI,PLIEGO DE PRESCRIPCIONES TÉCNICAS PARA LA CONT...,"[document, section, heading, body, p, p, p, p,...",[PLIEGO DE PRESCRIPCIONES TÉCNICAS PARA LA CON...,"[[0.11980498, -0.0047292123, 0.07927151, -0.05...",pliego de prescripciones técnicas para la cont...,spa_Latn
35,ntp00008052,ntp00008052_Pliego_Prescripciones_tecnicas_URI,íHJJ Agencia de I'Habitatge\n FUU de Catalunya...,"[document, section, heading, section, heading,...",[íHJJ Agencia de I'Habitatge\n FUU de Cataluny...,"[[0.050423447, 0.10861048, 0.12632288, 0.06832...",íhjj agencia de ihabitatge fuu de catalunya pl...,cat_Latn
40,ntp00196761,ntp00196761_Pliego_Prescripciones_tecnicas_URI,Ref: 50/088288.9/21\nAgencia de Vivienda Socia...,"[document, section, body, p, section, heading,...",[Agencia de Vivienda Social\n CONSEJERÍA DE VI...,"[[0.2535557, 0.06023139, 0.13293944, 0.0011294...",ref 50088288921 agencia de vivienda social con...,spa_Latn
...,...,...,...,...,...,...,...,...
250028,ntp00582515,ntp00582515_Pliego_Prescripciones_tecnicas_URI,sura\nPLIEGO DE PRESCRIPCIONES TÉCNICAS PARA L...,"[document, section, body, p, section, heading,...",[PLIEGO DE PRESCRIPCIONES TÉCNICAS PARA LA CON...,"[[-0.101150624, 0.29571363, -0.01729959, 0.204...",sura pliego de prescripciones técnicas para la...,ast_Latn
250038,ntp01351598,ntp01351598_Pliego_Prescripciones_tecnicas_URI,2020/02293/01 14/07/2020\n \n ...\nCOLEGIO DE ...,"[document, section, body, p, section, heading,...","[COLEGIO DE INGENIEROS DE CAMINOS,\n CANALES Y...","[[-0.041178588, -0.10482038, 0.20265633, 0.086...",20200229301 14072020 colegio de ingenieros de ...,spa_Latn
250067,ntp00566851,ntp00566851_Pliego_Prescripciones_tecnicas_URI,ANEX\nPliego de prescripciones técnicas que ha...,"[document, section, heading, body, p, p, p, p,...","[ANEX, Características técnicas de la aplicaci...","[[-0.02871781, 0.109616816, -0.03430078, 0.037...",anex pliego de prescripciones técnicas que ha ...,spa_Latn
250078,ntp00882149,ntp00882149_Pliego_Prescripciones_tecnicas_URI,a\n A\n S\nUNIVERSITAT\n POLITÉCNICA\n DE VALE...,"[document, section, body, p, section, heading,...",[UNIVERSITAT\n POLITÉCNICA\n DE VALENCIA\n \n ...,"[[0.04755034, -0.071341686, 0.07659534, 0.1800...",a a s universitat politécnica de valencia cont...,spa_Latn


In [8]:
combined_df.lang.unique()

array(['spa_Latn', 'cat_Latn', 'glg_Latn', 'som_Latn', 'por_Latn',
       'eus_Latn', 'oci_Latn', 'lmo_Latn', 'eng_Latn', 'wol_Latn',
       'ast_Latn', 'est_Latn', 'vec_Latn', 'bug_Latn', 'fra_Latn',
       'gaz_Latn', 'gom_Deva', 'slk_Latn', 'xho_Latn', 'ita_Latn',
       'epo_Latn', 'lim_Latn', 'hun_Latn', 'kor_Hang', 'ron_Latn',
       'ltz_Latn', 'fuv_Latn', 'vie_Latn', 'lus_Latn', 'yue_Hant',
       'lvs_Latn', 'nld_Latn', 'ces_Latn', 'nav_Latn', 'fin_Latn',
       'bos_Latn', 'gla_Latn', 'lit_Latn', 'quy_Latn', 'mos_Latn',
       'hat_Latn', 'yor_Latn', 'nso_Latn', 'cym_Latn', 'lij_Latn',
       'lug_Latn', 'aka_Latn', 'nno_Latn', 'scn_Latn', 'kab_Latn',
       'pol_Latn', 'ind_Latn', 'afr_Latn'], dtype=object)

In [9]:
combined_df.columns

Index(['procurement_id', 'doc_name', 'extracted', 'extracted_tags',
       'texto_heading', 'embeddings_heading', 'clean_extracted', 'lang'],
      dtype='object')

In [10]:
df_cat = combined_df[combined_df.lang=="cat_Latn"].sample(n=75, random_state=1234_75)

In [23]:
df_cat["text"] = df_cat["extracted"].apply(normalize_string)
df_cat["objetivo"] = [""]*len(df_cat)
df_cat.to_parquet("cat.parquet")
df_cat[["procurement_id", "doc_name", "text", "objetivo"]].to_excel("tarea_gencat.xlsx")
df_cat[["procurement_id", "doc_name", "text", "objetivo"]]

Unnamed: 0,procurement_id,doc_name,text,objetivo
80133,ntp01208301,ntp01208301_Pliego_Prescripciones_tecnicas_URI,Ajuntament de Manacor Exp. 524/2023 PLEC DE PR...,
151660,ntp00409894,ntp00409894_Pliego_Prescripciones_tecnicas_URI,Àrea de Serveis Generals Departament de Compra...,
92115,ntp00245710,ntp00245710_Pliego_Prescripciones_tecnicas_URI,I_ < > O C a < Exp. 905349/21 PLEC DE CONDICIO...,
65757,ntp00344652,ntp00344652_Pliego_Prescripciones_tecnicas_URI,Àrea de Presidència Subdirecció d'Imatge Corpo...,
120006,ntp00304919,ntp00304919_Pliego_Prescripciones_tecnicas_URI,Generalitat de Catalunya Departament de la Vic...,
...,...,...,...,...
242357,ntp01339344,ntp01339344_Pliego_Prescripciones_tecnicas_URI,Ajuntament É} de Palma PLEC DE PRESCRIPCIONS T...,
156018,ntp00298986,ntp00298986_Pliego_Prescripciones_tecnicas_URI,Servei Catala de la Salut SCS-2022-389 Submini...,
212516,ntp00008945,ntp00008945_Pliego_Prescripciones_tecnicas_URI,E PLC_ppt_legionel-losi_20180306 AJUNTAMENT DE...,
157670,ntp00389749,ntp00389749_Pliego_Prescripciones_tecnicas_URI,PRESCRIPCIONS TECNIQUES QUE REGIRAN A LA CONTR...,


In [12]:
df_spa = combined_df[combined_df.lang=="spa_Latn"].sample(n=150, random_state=1234_75)

In [26]:
df_spa["text"] = df_spa["extracted"].apply(normalize_string)
df_spa["objetivo"] = [""]*len(df_spa)
df_spa.to_parquet("spa.parquet")
df_spa[["procurement_id", "doc_name", "text", "objetivo"]].to_excel("tarea_zaragoza_madrid.xlsx")
df_spa[["procurement_id", "doc_name", "text", "objetivo"]]

Unnamed: 0,procurement_id,doc_name,text,objetivo
128403,ntp00617381,ntp00617381_Pliego_Prescripciones_tecnicas_URI,PLIEGO DE PRESCRIPCIONES TÉCNICAS PARTICULARES...,
152306,ntp00552454,ntp00552454_Pliego_Prescripciones_tecnicas_URI,de ERO EN — DE ARAGON E :' = EE Instiuto Arago...,
214074,ntp01175355,ntp01175355_Pliego_Prescripciones_tecnicas_URI,1ena Aeropuerto de Tenerife Norte- Ciudad de L...,
156951,ntp00348490,ntp00348490_Pliego_Prescripciones_tecnicas_URI,SEGURTASUN SAILA - DEPARTAMENTO DE SEGURIDAD A...,
108923,ntp00573106,ntp00573106_Pliego_Prescripciones_tecnicas_URI,PLIEGO DE PRESCRIPCIONES TÉCNICAS PARA LA CONT...,
...,...,...,...,...
112135,ntp00506461,ntp00506461_Pliego_Prescripciones_tecnicas_URI,AYUNTAMIENTO DE __ Unión Europea Fondo Europeo...,
77689,ntp11215851,ntp11215851_Pliego_Prescripciones_tecnicas_URI,s PROVINCIA EXCMO CONCELLO = % DE DE E LUGO PA...,
243516,ntp10327031,ntp10327031_Pliego_Prescripciones_tecnicas_URI,"$ Te MINISTERIO - Á |""€5J g Y SEGURIDAD SOCIAL...",
55806,ntp00438518,ntp00438518_Pliego_Prescripciones_tecnicas_URI,O P EXPEDIENTE N* INV 2018/04 . Universidad de...,


In [27]:
df_spa.iloc[0:75][["procurement_id", "doc_name", "text", "objetivo"]].to_excel("tarea_zaragoza.xlsx")
df_spa.iloc[75:][["procurement_id", "doc_name", "text", "objetivo"]].to_excel("tarea_madrid.xlsx")