In [119]:
import logging
import pathlib
import sys
from typing import List
import random
import os
import time
from subprocess import check_output
import scipy.sparse as sparse

import numpy as np
import pandas as pd
from sklearn.preprocessing import normalize

In [120]:
mallet_path = pathlib.Path("/export/usuarios_ml4ds/lbartolome/NextProcurement/NP-Search-Tool/sample_data/models/mallet-2.0.8/bin/mallet")


def _run_mallet(args, failure_msg):
    """Run the Mallet binary with ``args``; on failure print the error and re-raise."""
    cmd = [mallet_path.as_posix(), *(str(arg) for arg in args)]
    print(f"-- Running command {' '.join(cmd)}")
    try:
        # Argument list without a shell: paths containing spaces or shell
        # metacharacters reach Mallet unchanged.
        check_output(args=cmd)
    except Exception as e:
        print(e)
        print(failure_msg)
        # Every later step consumes this step's output file, so continuing
        # would either crash further down or silently read a stale file.
        raise


def infer_thetas(path_model, num_topics, docs, num_iterations=1000,
                 doc_topic_thr=0.0, sparsity_thr=3e-3):
    """Infer document-topic proportions for ``docs`` with a trained Mallet model.

    The function (1) writes ``docs`` to ``infer_data/corpus.txt``, (2) rebuilds
    the import pipe from ``train_data/corpus.mallet``, (3) imports the new
    corpus through that pipe, (4) runs ``mallet infer-topics`` with
    ``model_data/inferencer.mallet`` and (5) loads the resulting doc-topics
    file, zeroes small values, L1-normalises each row and stores the matrix as
    a sparse CSR file in ``infer_data/thetas.npz``.

    Parameters
    ----------
    path_model : pathlib.Path
        Model folder containing ``train_data`` and ``model_data``.
    num_topics : int
        Number of topic columns to read from the doc-topics file.
    docs : iterable of (id, text) pairs
        Documents to infer; each is written as ``"<id> 0 <text>"``.
    num_iterations : int, optional
        Value passed to ``--num-iterations`` (default 1000).
    doc_topic_thr : float, optional
        Value passed to ``--doc-topics-threshold`` (default 0.0).
    sparsity_thr : float, optional
        Proportions below this value are set to 0 before normalisation
        (default 3e-3).

    Returns
    -------
    tuple
        Shape of the saved thetas matrix.

    Raises
    ------
    FileNotFoundError
        If ``train_data/corpus.mallet`` does not exist.
    ValueError
        If ``docs`` is empty.
    Exception
        Whatever the failed Mallet invocation raised (re-raised after logging).
    """
    path_model = pathlib.Path(path_model)
    train_dir = path_model / "train_data"
    infer_dir = path_model / "infer_data"

    # Validate before touching the disk: without the training corpus the
    # import pipe cannot be rebuilt and every later step would fail.
    path_corpus = train_dir / "corpus.mallet"
    if not path_corpus.is_file():
        print(f"-- Pipe extraction: Could not locate corpus file")
        raise FileNotFoundError(f"Mallet training corpus not found: {path_corpus}")

    infer_dir.mkdir(parents=True, exist_ok=True)

    holdout_corpus = infer_dir / "corpus.txt"
    n_docs = 0
    with holdout_corpus.open("w", encoding="utf8") as fout:
        for i, t in docs:
            fout.write(f"{i} 0 {t}\n")
            n_docs += 1
    if n_docs == 0:
        raise ValueError("No documents were provided for inference.")
    print(f"-- -- Mallet corpus.txt for inference created.")

    # Get inferencer
    inferencer = path_model / "model_data" / "inferencer.mallet"

    # Files to be generated through Mallet
    corpus_mallet_inf = infer_dir / "corpus_inf.mallet"
    doc_topics_file = infer_dir / "doc-topics-inf.txt"

    # Create auxiliary file with only the first line of the original corpus file
    path_txt = train_dir / "corpus.txt"
    with path_txt.open('r', encoding='utf8') as f:
        first_line = f.readline()
    path_aux = train_dir / "corpus_aux.txt"
    with path_aux.open('w', encoding='utf8') as fout:
        # readline() keeps the line terminator; normalise to exactly one so
        # the auxiliary corpus does not end with an empty line.
        fout.write(first_line.rstrip('\r\n') + '\n')

    # We perform the import with the only goal to keep a small file containing the pipe
    print(f"-- Extracting pipeline")
    path_pipe = train_dir / "import.pipe"
    _run_mallet(
        ['import-file', '--use-pipe-from', path_corpus,
         '--input', path_aux, '--output', path_pipe],
        '-- Failed to extract pipeline. Revise command')

    # Import data to mallet
    print('-- Inference: Mallet Data Import')
    _run_mallet(
        ['import-file', '--use-pipe-from', path_pipe,
         '--input', holdout_corpus, '--output', corpus_mallet_inf],
        '-- Mallet failed to import data. Revise command')

    # Get topic proportions
    print('-- Inference: Inferring Topic Proportions')
    _run_mallet(
        ['infer-topics', '--inferencer', inferencer,
         '--input', corpus_mallet_inf, '--output-doc-topics', doc_topics_file,
         '--doc-topics-threshold', doc_topic_thr,
         '--num-iterations', num_iterations],
        '-- Mallet inference failed. Revise command')

    # The first two columns are skipped (presumably document index and name
    # -- confirm against the Mallet output); the next num_topics are read.
    cols = list(range(2, num_topics + 2))
    # ndmin=2 keeps a single-document result as a (1, num_topics) matrix;
    # without it loadtxt returns a 1-D array that normalize() rejects.
    thetas32 = np.loadtxt(doc_topics_file, delimiter='\t', dtype=np.float32,
                          usecols=cols, ndmin=2)
    thetas32[thetas32 < sparsity_thr] = 0
    thetas32 = normalize(thetas32, axis=1, norm='l1')
    thetas32 = sparse.csr_matrix(thetas32, copy=True)

    path_save = infer_dir / "thetas.npz"
    sparse.save_npz(path_save, thetas32)

    return thetas32.shape
    

In [121]:
# ------------------------------------------------------------------
# Input locations
# ------------------------------------------------------------------
# Folder holding the processed parquet files.
path_parquets = pathlib.Path(
    "/export/usuarios_ml4ds/cggamella/NP-Search-Tool/sample_data/all_processed"
)
# Parquet that is later merged on 'id_tm' to obtain the 'origen' column.
path_place_without_lote = path_parquets.joinpath(
    "minors_insiders_outsiders_origen_sin_lot_info.parquet"
)
# Parquet with the processed tenders (read into `processed`).
path_place_esp = path_parquets.joinpath("df_esp_langid.parquet")
# Directory scanned for *.txt stopword lists.
path_manual_stops = "/export/usuarios_ml4ds/lbartolome/NextProcurement/NP-Search-Tool/sample_data/stopwords_sin_duplicados"
# Text file with one 'term:replacement' equivalence per line.
path_eq = "/export/usuarios_ml4ds/lbartolome/NextProcurement/NP-Search-Tool/sample_data/eq.txt"

In [None]:
# ------------------------------------------------------------------
# Load the processed tenders and the per-tender source metadata
# ------------------------------------------------------------------
print(f"-- -- Reading data from {path_place_esp} and {path_place_without_lote}")
processed = pd.read_parquet(path_place_esp)
# Remember the original columns so the merge result can be trimmed back to them.
cols = processed.columns.tolist()
print(f"-- -- Data read from {path_place_esp}: {len(processed)} rows.")
print(f"-- -- Columns: {cols}")
place_without_lote = pd.read_parquet(path_place_without_lote)
print(f"-- -- Data read from {path_place_without_lote}: {len(place_without_lote)} rows.")

# ------------------------------------------------------------------
# Attach the source of each tender (minors, insiders, outsiders)
# ------------------------------------------------------------------
# merge() discards the left index, so it is carried through as the
# 'identifier' column and restored as the index afterwards.
processed = (
    processed
    .assign(identifier=processed.index)
    .merge(place_without_lote, how='left', on='id_tm')
    .set_index('identifier')
    .loc[:, cols + ["origen"]]  # keep only the original columns plus 'origen'
)
print(f"-- -- Data merged: {len(processed)} rows.")

-- -- Reading data from /export/usuarios_ml4ds/cggamella/NP-Search-Tool/sample_data/all_processed/df_esp_langid.parquet and /export/usuarios_ml4ds/cggamella/NP-Search-Tool/sample_data/all_processed/minors_insiders_outsiders_origen_sin_lot_info.parquet


In [None]:
#####################
# Filter stops /eqs #
#####################
# Filter stops
stopwords = set()
# Names of the stopword files that were read
archivos_procesados = []
# Read every .txt file in the stopword directory (sorted so the order of
# archivos_procesados is deterministic across runs)
for archivo in sorted(os.listdir(path_manual_stops)):
    if archivo.endswith('.txt'):
        ruta_completa = os.path.join(path_manual_stops, archivo)
        with open(ruta_completa, 'r', encoding='utf-8') as f:
            # Tokens produced by str.split() never carry whitespace, so an
            # entry with surrounding spaces could never match; strip each
            # entry and drop blank lines so they do not inflate the count.
            stopwords.update(
                palabra
                for palabra in (linea.strip() for linea in f.read().splitlines())
                if palabra
            )
        # Record the processed file
        archivos_procesados.append(archivo)
print(f"-- -- There are {len(stopwords)} stopwords")


def eliminar_stopwords(fila):
    """Return ``fila`` with every whitespace-separated token found in the
    global ``stopwords`` set removed; remaining tokens are joined by one space."""
    return ' '.join(palabra for palabra in fila.split() if palabra not in stopwords)


start = time.time()
processed['lemmas'] = processed['lemmas'].apply(eliminar_stopwords)
print(f"-- -- Stops filtered in {time.time() - start}")

# Filter eqs: build the {term: replacement} dictionary from the equivalences file
start = time.time()
pares_diccionario = {}
compiled_regexes = {}
# Explicit UTF-8, matching how the stopword files are opened: the file holds
# non-ASCII terms, which the platform default encoding may decode incorrectly.
with open(path_eq, 'r', encoding='utf-8') as archivo:
    for linea in archivo:
        linea = linea.strip()
        palabras = linea.split(':')
        if len(palabras) < 2:
            # No ':' separator on this line (this includes blank lines)
            print(f"Línea omitida o incompleta: '{linea}'")
            continue
        # Only the first two ':'-separated fields are used
        pares_diccionario[palabras[0]] = palabras[1]
# Re-create the dictionary with its keys in sorted order
pares_diccionario = \
    dict(sorted(pares_diccionario.items(), key=lambda x: x[0]))
print(f"-- -- There are {len(pares_diccionario)} equivalences")
print("-- -- Eq dict constructed in :", time.time() - start)

def replace_keywords(lst, keyword_dict):
    """Return the tokens of ``lst`` joined by single spaces, with every token
    that is a key of ``keyword_dict`` swapped for its mapped value."""
    sustituidas = []
    for token in lst:
        # Tokens without an entry are kept unchanged.
        sustituidas.append(keyword_dict.get(token, token))
    return " ".join(sustituidas)

start = time.time()
# Tokenise into a scratch column, substitute equivalences token by token,
# then discard the scratch column again.
processed["lemmas_split"] = processed['lemmas'].map(lambda texto: texto.split())
processed['lemmas'] = processed['lemmas_split'].map(
    lambda tokens: replace_keywords(tokens, pares_diccionario))
processed = processed.drop('lemmas_split', axis=1)
print("-- -- Eq substituted in:", time.time() - start)

In [None]:
# Spot-check one equivalence. .get() shows None when the key is absent
# instead of raising KeyError, which would abort a Restart & Run All.
pares_diccionario.get("nanociència_i")

In [None]:
# ------------------------------------------------------------------
# Lemma-count thresholds and per-document lemma count
# ------------------------------------------------------------------
# The following cells select documents with
#   min_lemmas <= len <= min_lemmas_tm   and   len > min_lemmas_tm.
min_lemmas = 1
min_lemmas_tm = 2
# Number of whitespace-separated lemmas per document. No rows are dropped here.
processed['len'] = processed['lemmas'].map(lambda texto: len(texto.split()))

In [None]:
# NOTE(review): `all` shadows the Python builtin of the same name; the name is
# kept because the following cells reference it.
all = processed.copy()
# One subset per value of the 'origen' column.
minors = all.loc[all["origen"] == "minors"]
outsiders = all.loc[all["origen"] == "outsiders"]
insiders = all.loc[all["origen"] == "insiders"]

In [None]:
# Tenders that were not included in the modeling process: those whose lemma
# count lies in [min_lemmas, min_lemmas_tm] (both bounds inclusive).
all_not_tm = all.loc[all['len'].between(min_lemmas, min_lemmas_tm)]
all_not_tm

In [None]:
# Complement of the short-document selection: more than min_lemmas_tm lemmas.
all.loc[all['len'].gt(min_lemmas_tm)]

In [None]:
# Minors with a lemma count in [min_lemmas, min_lemmas_tm] (inclusive).
minors_not_tm = minors.loc[minors['len'].between(min_lemmas, min_lemmas_tm)]
minors_not_tm

In [None]:
# Minors with more than min_lemmas_tm lemmas.
minors.loc[minors['len'].gt(min_lemmas_tm)]

In [None]:
# Outsiders with a lemma count in [min_lemmas, min_lemmas_tm] (inclusive).
outsiders_not_tm = outsiders.loc[outsiders['len'].between(min_lemmas, min_lemmas_tm)]
outsiders_not_tm

In [None]:
# Outsiders with more than min_lemmas_tm lemmas.
outsiders.loc[outsiders['len'].gt(min_lemmas_tm)]

In [None]:
# Insiders with a lemma count in [min_lemmas, min_lemmas_tm] (inclusive).
insiders_not_tm = insiders.loc[insiders['len'].between(min_lemmas, min_lemmas_tm)]
insiders_not_tm

In [None]:
# Insiders with more than min_lemmas_tm lemmas.
insiders.loc[insiders['len'].gt(min_lemmas_tm)]

## ALL

In [None]:
# Inference inputs for the model trained on all tenders.
num_topics = 55
path_model = pathlib.Path(
    "/export/usuarios_ml4ds/lbartolome/NextProcurement/NP-Search-Tool/sample_data/models/Mallet/es_Mallet_all_55_topics_FINAL"
)
# Array of (id_tm, lemmas) rows for the short documents selected earlier.
docs = all_not_tm[["id_tm", "lemmas"]].to_numpy()

In [None]:
# Load the document-topic matrix stored with the model and show its shape.
# A sparse matrix exposes .shape directly; converting it to a dense array
# first only costs memory and yields the same tuple.
thetas = sparse.load_npz(path_model / "model_data/TMmodel" / "thetas.npz")
thetas.shape

In [None]:
# Run Mallet inference for the documents configured above; the cell output is
# the shape of the inferred thetas matrix.
infer_thetas(path_model=path_model, num_topics=num_topics, docs=docs)

## OUTSIDERS

In [None]:
# Inference inputs for the model trained on outsiders.
num_topics = 30
path_model = pathlib.Path(
    "/export/usuarios_ml4ds/lbartolome/NextProcurement/NP-Search-Tool/sample_data/models/Mallet/es_Mallet_outsiders_30_topics_FINAL"
)
# Array of (id_tm, lemmas) rows for the short outsider documents.
docs = outsiders_not_tm[["id_tm", "lemmas"]].to_numpy()

In [None]:
# Run Mallet inference for the outsider documents configured above; the cell
# output is the shape of the inferred thetas matrix.
infer_thetas(path_model=path_model, num_topics=num_topics, docs=docs)

## INSIDERS

In [None]:
# Inference inputs for the model trained on insiders.
num_topics = 12
path_model = pathlib.Path(
    "/export/usuarios_ml4ds/lbartolome/NextProcurement/NP-Search-Tool/sample_data/models/Mallet/es_Mallet_insiders_12_topics_FINAL"
)
# Array of (id_tm, lemmas) rows for the short insider documents.
docs = insiders_not_tm[["id_tm", "lemmas"]].to_numpy()

In [None]:
# Run Mallet inference for the insider documents configured above; the cell
# output is the shape of the inferred thetas matrix.
infer_thetas(path_model=path_model, num_topics=num_topics, docs=docs)

## MINORS

In [None]:
# Inference inputs for the model trained on minors.
num_topics = 40
path_model = pathlib.Path(
    "/export/usuarios_ml4ds/lbartolome/NextProcurement/NP-Search-Tool/sample_data/models/Mallet/es_Mallet_minors_40_topics_FINAL"
)
# Array of (id_tm, lemmas) rows for the short minor documents.
docs = minors_not_tm[["id_tm", "lemmas"]].to_numpy()

In [None]:
# Run Mallet inference for the minor documents configured above; the cell
# output is the shape of the inferred thetas matrix.
infer_thetas(path_model=path_model, num_topics=num_topics, docs=docs)