In [81]:
import pandas as pd
import pathlib
import numpy as np
import scipy.sparse as sparse
import re
import time

In [94]:
# ------------------------------------------------------------------
# Input locations: every parquet file lives under the same directory
# ------------------------------------------------------------------
path_parquets = pathlib.Path("/export/usuarios_ml4ds/cggamella/NP-Search-Tool/sample_data/all_processed")
path_place_without_lote = path_parquets.joinpath("minors_insiders_outsiders_origen_sin_lot_info.parquet")
path_place_esp = path_parquets.joinpath("df_esp_langid.parquet")
path_place_CPV = path_parquets.joinpath("completo_CPV.parquet")

# ------------------------------------------------------------------
# Load the three frames; each one keeps a copy of its index as a
# regular column so the identifier is not lost in later merges
# ------------------------------------------------------------------
print(f"-- -- Reading data from {path_place_esp} and {path_place_without_lote}")

processed = pd.read_parquet(path_place_esp)
# Column names are captured before 'identifier' is added, so the listing
# printed below shows only the columns stored in the parquet file.
cols = list(processed.columns)
print(f"-- -- Data read from {path_place_esp}: {len(processed)} rows.")
processed = processed.assign(identifier=processed.index)
print(f"-- -- Columns: {cols}")

place_without_lote = pd.read_parquet(path_place_without_lote)
place_without_lote = place_without_lote.assign(doc_id=place_without_lote.index)
print(f"-- -- Data read from {path_place_without_lote}: {len(place_without_lote)} rows.")

place_cpv = pd.read_parquet(path_place_CPV)
place_cpv = place_cpv.assign(doc_id=place_cpv.index)
print(f"-- -- Data read from {path_place_CPV}: {len(place_cpv)} rows.")

-- -- Reading data from /export/usuarios_ml4ds/cggamella/NP-Search-Tool/sample_data/all_processed/df_esp_langid.parquet and /export/usuarios_ml4ds/cggamella/NP-Search-Tool/sample_data/all_processed/minors_insiders_outsiders_origen_sin_lot_info.parquet
-- -- Data read from /export/usuarios_ml4ds/cggamella/NP-Search-Tool/sample_data/all_processed/df_esp_langid.parquet: 2618584 rows.
-- -- Columns: ['id_tm', 'raw_text', 'lemmas', 'lang']
-- -- Data read from /export/usuarios_ml4ds/cggamella/NP-Search-Tool/sample_data/all_processed/minors_insiders_outsiders_origen_sin_lot_info.parquet: 3110261 rows.
-- -- Data read from /export/usuarios_ml4ds/cggamella/NP-Search-Tool/sample_data/all_processed/completo_CPV.parquet: 1570211 rows.


In [118]:
def _read_corpus_file(corpus_file):
    r"""Split every line of a training corpus file into a document id and its tokens.

    Each non-blank line must look like ``<id><sep><text>``, where ``<sep>`` is
    ``"\t0\t"`` or ``" 0 "``. The tab form is tried first because ``" 0 "`` can
    also occur inside the text of a tab-delimited line. The line is cut at the
    FIRST occurrence of the separator, and the same separator is used for the
    id and for the text.

    Returns:
        (ids, corpus): ``ids`` is a list of id strings and ``corpus`` a list of
        token lists, both in file order. Blank lines are skipped.

    Raises:
        ValueError: if a non-blank line contains neither separator.
    """
    ids, corpus = [], []
    with corpus_file.open("r", encoding="utf-8") as fin:
        for line_no, line in enumerate(fin, start=1):
            if not line.strip():
                continue
            for sep in ("\t0\t", " 0 "):
                doc_id, found, text = line.partition(sep)
                if found:
                    break
            else:
                raise ValueError(
                    f"Line {line_no} of {corpus_file} does not contain a "
                    "'<id> 0 <text>' separator"
                )
            ids.append(doc_id)
            corpus.append(text.split())
    return ids, corpus


def _score_documents(corpus, vocab_w2id, betas, n_docs):
    """Return the (n_docs, n_topics) matrix of summed topic-word weights.

    Entry ``[d, t]`` is the sum of ``betas[t, w]`` over every token ``w`` of
    document ``d`` that is present in the vocabulary (repeated tokens count
    once per occurrence; out-of-vocabulary tokens are ignored).
    """
    scores = np.zeros((n_docs, len(betas)))
    for doc_idx in range(n_docs):
        # The word ids are resolved once per document and reused for all topics.
        wd_ids = [vocab_w2id[word] for word in corpus[doc_idx] if word in vocab_w2id]
        if wd_ids:
            scores[doc_idx] = betas[:, wd_ids].sum(axis=1)
    return scores


def get_df_eval(path_model, df_raw_corpus, path_save, merge_on, n_top_docs=3):
    """Build the per-topic evaluation report of a trained topic model.

    For every topic the report lists its label, size, number of active
    documents, descriptive words and the ``n_top_docs`` documents with the
    highest "approach 3" score (sum of the topic's word weights over the
    document's tokens). The report is written with ``DataFrame.to_excel``.

    Args:
        path_model: ``pathlib.Path`` of the model directory. It must contain
            ``model_data/TMmodel`` (``thetas_orig.npz``, ``betas.npy``,
            ``tpc_descriptions.txt``, ``tpc_labels.txt``, ``alphas.npy``,
            ``ndocs_active.npy``, ``vocab.txt``) and ``train_data/corpus.txt``.
        df_raw_corpus: DataFrame holding ``merge_on``, ``title`` and ``summary``.
        path_save: destination passed to ``DataFrame.to_excel``.
        merge_on: key shared by the corpus ids and ``df_raw_corpus``. When it is
            ``"id_tm"`` the corpus ids are converted to ``int`` before merging.
        n_top_docs: number of "most significant document" columns (default 3).

    Returns:
        The report DataFrame (the same one that is written to ``path_save``).

    Raises:
        ValueError: if the corpus file has fewer documents than ``thetas`` has
            rows, or a corpus line cannot be parsed.

    Notes:
        Documents are tracked by their position in the corpus file, so rows
        that have no match in ``df_raw_corpus`` (or several matches) cannot
        shift the document/score correspondence. Unmatched documents are
        excluded from the ranking; for duplicated keys the first match is kept.
    """
    tm_dir = path_model / "model_data/TMmodel"

    # Only the number of documents is needed from thetas, so the sparse
    # matrix is never converted to a dense array.
    n_docs = sparse.load_npz(tm_dir / "thetas_orig.npz").shape[0]

    # Topic-word weights, one row per topic
    betas = np.load(tm_dir / "betas.npy")

    # Topic descriptions and labels, one per line
    with (tm_dir / "tpc_descriptions.txt").open('r', encoding='utf8') as fin:
        topics_keys = [el.strip() for el in fin]
    with (tm_dir / "tpc_labels.txt").open('r', encoding='utf8') as fin:
        topics_labels = [el.strip() for el in fin]

    # Topic sizes (as percentages) and number of active documents
    alphas = np.round(np.load(tm_dir / "alphas.npy") * 100, 2)
    ndocs_active = np.load(tm_dir / "ndocs_active.npy")

    # Training documents, in the same order as the rows of thetas
    ids, corpus = _read_corpus_file(path_model / 'train_data/corpus.txt')
    if len(corpus) < n_docs:
        raise ValueError(
            f"corpus.txt has {len(corpus)} documents but thetas has {n_docs} rows"
        )
    if merge_on == "id_tm":
        ids = [int(id_) for id_ in ids]

    # Attach title/summary while remembering each document's corpus position.
    matched = (
        pd.DataFrame({merge_on: ids, "_corpus_pos": np.arange(len(ids))})
        .merge(df_raw_corpus, how="inner", on=merge_on)[["_corpus_pos", "title", "summary"]]
        .drop_duplicates(subset="_corpus_pos")
        .set_index("_corpus_pos")
    )
    doc_text = (
        matched["summary"].fillna("").astype(str)
        + " "
        + matched["title"].fillna("").astype(str)
    )
    # Positions of the documents that can be shown in the report
    candidates = matched.index.to_numpy()
    candidates = candidates[candidates < n_docs]
    if len(candidates) < n_docs:
        print(f"-- -- {n_docs - len(candidates)} documents have no match on '{merge_on}' and are not ranked")

    # Vocabulary: word -> row position in vocab.txt
    vocab_w2id = {}
    with path_model.joinpath('model_data/TMmodel/vocab.txt').open('r', encoding='utf8') as fin:
        for i, line in enumerate(fin):
            vocab_w2id[line.strip()] = i

    print("Calculating approach 3...")
    start = time.time()
    S3 = _score_documents(corpus, vocab_w2id, betas, n_docs)
    print(f"S3 shape: {S3.shape}")
    print(f"Time elapsed: {time.time() - start}")

    # Text of the best-scoring documents of every topic, highest score first.
    # Topics with fewer than n_top_docs candidates are padded with "".
    top_docs_per_topic_text = []
    for topic_scores in S3.T:
        order = np.argsort(topic_scores[candidates])[::-1][:n_top_docs]
        texts = doc_text.loc[candidates[order]].tolist()
        texts += [""] * (n_top_docs - len(texts))
        top_docs_per_topic_text.append(texts)

    report = {
        "ID del tópico": range(len(topics_keys)),
        "Etiqueta del tópico": topics_labels,
        "Tamaño del tópico (%)": alphas,
        "Nº documentos activos": ndocs_active,
        "Palabras más representativas": topics_keys,
    }
    for rank in range(n_top_docs):
        report[f"Documento más significativo {rank + 1}"] = [
            texts[rank] for texts in top_docs_per_topic_text
        ]

    df = pd.DataFrame(report)
    df.to_excel(path_save)
    return df

In [84]:
# ------------------------------------------------------------------
# Directories of the trained models to evaluate
# ------------------------------------------------------------------

# CPV models
cpv_models = list(map(pathlib.Path, [
    "/export/usuarios_ml4ds/cggamella/NP-Search-Tool/sample_data/models/Mallet/es_Mallet_df_merged_14_topics_45_ENTREGABLE",
    "/export/usuarios_ml4ds/cggamella/NP-Search-Tool/sample_data/models/Mallet/es_Mallet_df_merged_25_topics_45_ENTREGABLE",
    "/export/usuarios_ml4ds/cggamella/NP-Search-Tool/sample_data/models/Mallet/es_Mallet_df_merged_5_topics_79_ENTREGABLE",
    "/export/usuarios_ml4ds/cggamella/NP-Search-Tool/sample_data/models/Mallet/es_Mallet_df_merged_15_topics_79_ENTREGABLE",
]))

# Remaining models (all / outsiders / insiders / minors)
other_models = list(map(pathlib.Path, [
    "/export/usuarios_ml4ds/lbartolome/NextProcurement/NP-Search-Tool/sample_data/models/Mallet/es_Mallet_all_55_topics_FINAL",
    "/export/usuarios_ml4ds/lbartolome/NextProcurement/NP-Search-Tool/sample_data/models/Mallet/es_Mallet_outsiders_30_topics_FINAL",
    "/export/usuarios_ml4ds/lbartolome/NextProcurement/NP-Search-Tool/sample_data/models/Mallet/es_Mallet_insiders_12_topics_FINAL",
    "/export/usuarios_ml4ds/lbartolome/NextProcurement/NP-Search-Tool/sample_data/models/Mallet/es_Mallet_minors_40_topics_FINAL",
]))

In [116]:
# One report per CPV model: documents are matched to the CPV frame through
# "doc_id" and the spreadsheet is named after the model directory.
for model in cpv_models:
    merge_on = "doc_id"
    path_save = model.name + ".xlsx"
    df_raw_corpus = place_cpv

    print(f"-- -- Processing {model.as_posix()}")
    get_df_eval(model, df_raw_corpus, path_save, merge_on)

-- -- Processing /export/usuarios_ml4ds/cggamella/NP-Search-Tool/sample_data/models/Mallet/es_Mallet_df_merged_14_topics_45_ENTREGABLE
Calculating approach 3...
S3 shape: (34257, 14)
Time elapsed: 2.3139612674713135
-- -- Processing /export/usuarios_ml4ds/cggamella/NP-Search-Tool/sample_data/models/Mallet/es_Mallet_df_merged_25_topics_45_ENTREGABLE
Calculating approach 3...
S3 shape: (34257, 25)
Time elapsed: 4.09037709236145
-- -- Processing /export/usuarios_ml4ds/cggamella/NP-Search-Tool/sample_data/models/Mallet/es_Mallet_df_merged_5_topics_79_ENTREGABLE
Calculating approach 3...
S3 shape: (39579, 5)
Time elapsed: 0.9824609756469727
-- -- Processing /export/usuarios_ml4ds/cggamella/NP-Search-Tool/sample_data/models/Mallet/es_Mallet_df_merged_15_topics_79_ENTREGABLE
Calculating approach 3...
S3 shape: (39579, 15)
Time elapsed: 2.851377248764038


In [None]:
# One report per remaining model: documents are matched to the frame without
# lot information through "id_tm" and the spreadsheet is named after the
# model directory.
for model in other_models:
    merge_on = "id_tm"
    path_save = model.name + ".xlsx"
    df_raw_corpus = place_without_lote

    print(f"-- -- Processing {model.as_posix()}")
    get_df_eval(model, df_raw_corpus, path_save, merge_on)

-- -- Processing /export/usuarios_ml4ds/lbartolome/NextProcurement/NP-Search-Tool/sample_data/models/Mallet/es_Mallet_all_55_topics_FINAL
Calculating approach 3...
