# PR Processing Workflow

Dec. 16, 2023: Refactoring the code of `read_files.ipynb`, to define the pipeline. The code below covers 4 steps (plus one extra step, not used in the final analysis) and assumes that the excel files containing the PR text have already been produced. To extract the text from the html and pdf files, see `read_files.ipynb`. We thus assume that the folder `data` contains one file for each company, with name `{company}.xlsx`, with a column `text` containing the processed text of each PR.

In [94]:
import os
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import torch
import nltk
from nltk.tokenize import sent_tokenize
import pickle
from tqdm.notebook import tqdm
from tabulate import tabulate

from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import MaximalMarginalRelevance

from sklearn.metrics.pairwise import cosine_similarity

%load_ext lab_black
%load_ext jupyterlab_notify

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black
The jupyterlab_notify extension is already loaded. To reload it, use:
  %reload_ext jupyterlab_notify


In [95]:
companies = [
    "Acerinox",
    "ACS",
    "Bancosantander",
    "Bankinter",
    "BBVA",
    "Caixa",
    "Colonial",
    "Enagas",
    "Endesa",
    "Ferrovial",
    "Grifols",
    "IAG",
    "Iberdrola",
    "Inditex",
    "Acciona",
    "Arcelormittal",
    "Bancosabadell",
    "Cellnex",
    "Fluidra",
    "Indra",
    "Logista",
    "Melia",
    "Merlin",
    "Naturgy",
    "Red",
    "Repsol",
    "Rovi",
    "Sacyr",
    "Solaria",
    "Telefonica",
]
print(f"Analyzing {len(companies)} companies.")

Analyzing 30 companies.


## Code and Auxiliary Functions

In [102]:
def get_sentences(docs_list, stopwords):
    sentences = []
    for doc in docs_list:
        aux = sent_tokenize(doc)
        for s in aux:
            # print("s = ", s)
            tokens = nltk.tokenize.wordpunct_tokenize(s)
            s = " ".join([w for w in tokens if w.isalnum() and w not in stopwords])
            sentences.append(s)
            # print(sentences)
            # input("aka")
    return sentences


def get_embeddings(docs):
    sentence_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
    embeddings = sentence_model.encode(docs, show_progress_bar=True)
    return embeddings


def get_topics(language, sentences, embeddings, nr_topics=10):

    vectorizer_model = CountVectorizer(
        stop_words="english"
    )  # should be ignored in spanish (stop_words='spanish' does not exist)
    ctfidf_model = ClassTfidfTransformer(
        reduce_frequent_words=True, bm25_weighting=True
    )
    # representation_model = KeyBERTInspired()
    representation_model = MaximalMarginalRelevance(diversity=0.3)
    # topic_model = BERTopic(embedding_model= 'paraphrase-multilingual-MiniLM-L12-v2', representation_model=representation_model,
    #                       vectorizer_model=vectorizer_model, ctfidf_model=ctfidf_model, nr_topics=nr_topics, reduce_frequent_words=True, verbose=True)
    topic_model = BERTopic(
        embedding_model="paraphrase-multilingual-MiniLM-L12-v2",
        representation_model=representation_model,
        vectorizer_model=vectorizer_model,
        ctfidf_model=ctfidf_model,
        nr_topics=nr_topics,
        verbose=True,
    )

    # topic_model = BERTopic(embedding_model= 'LaBSE', representation_model=representation_model, vectorizer_model=vectorizer_model, ctfidf_model=ctfidf_model, nr_topics=nr_topics)

    topics, probs = topic_model.fit_transform(sentences, embeddings)

    return topic_model, topics, probs


def topics_modeling(company):

    for label, language, bert_language in zip(labels, languages, bert_languages):
        print(f"\t - Language : {language}")
        filename = os.path.join(
            "data", "sentences", ("sentences_" + company + "_" + label + ".pkl")
        )
        sentences = pickle.load(open(filename, "rb"))
        print("[S1.] \t Sentences Imported.")

        filename = os.path.join("data", "embeddings", (company + "_" + label + ".pkl"))
        embs = pickle.load(open(filename, "rb"))
        print(
            f"[S2.] \t Embeddings Imported. Starting topic modeling using transformer 'paraphrase-multilingual-MiniLM-L12-v2'..."
        )
        model, topic, prob = get_topics(
            language=language, sentences=sentences, embeddings=embs
        )
        print("[S2.] \t Topics Modeling done.")
        print(model.get_topic_info())
        filename = os.path.join("models", (company + "_" + label + ".bert"))
        model.save(filename)
        print(f"[S3.] \t Model {filename} saved to disk.")
        print("\n")


def sentence_tokenize(company, df):

    print("\n[S1.] Tonekize sentences")
    for label, language, bert_language in zip(labels, languages, bert_languages):
        print(f"\t - Language : {language}")
        doc_list = df[df.language == label].text.values
        stopwords = set(nltk.corpus.stopwords.words(language))
        sentences = get_sentences(doc_list, stopwords)
        filename = os.path.join(
            "data", "sentences", ("sentences_" + company + "_" + label + ".pkl")
        )
        pickle.dump(sentences, open(filename, "wb"))
        print(
            f"\t[S1-{label}.] \t Sentences tonenized saved to disk. Total nr. sentences = {len(sentences)}."
        )
    print("[S1.] Done with tonekize sentences")


def embeddings_creation(company):
    print("\n[S2.] Embeddings creation")
    for label, language, bert_language in zip(labels, languages, bert_languages):
        filename = os.path.join(
            "data", "sentences", ("sentences_" + company + "_" + label + ".pkl")
        )
        sentences = pickle.load(open(filename, "rb"))
        print(f"[S2-{label}.] \t Starting with embeddings creation.")
        embs = get_embeddings(sentences)
        filename = os.path.join("data", "embeddings", (company + "_" + label + ".pkl"))
        print(f"[S2-{label}.] \t Embeddings file {filename} saved to disk.")
        pickle.dump(embs, open(filename, "wb"))
    print("[S2.] Done with embeddings creation")


def pr_2_pr(companies, transformer_type="laBSE", en_to_es=True, save_file=False):
    # sentence transformer to find 1:1 match
    # use spanish as corpus and english for queries

    print(
        f"\t [NOTE] Similarity scores computed using '{transformer_type}' transformer"
    )
    model = SentenceTransformer(transformer_type)

    similarity_scores = []
    nr_ens = []
    nr_ess = []

    print(f"** Sentence Transformers (match) for Company '{company}' **")
    name = company + ".xlsx"
    df = pd.read_excel(os.path.join("data", name))
    df = df[~df.text.isna()]
    nr_en = df[df.language == "en"].shape[0]
    nr_es = df[df.language == "es"].shape[0]
    en_list = df[df.language == "en"].text.to_list()
    en_files = df[df.language == "en"].filename.to_list()
    es_list = df[df.language == "es"].text.to_list()
    es_files = df[df.language == "es"].filename.to_list()
    # stopwords = set(nltk.corpus.stopwords.words("spanish"))

    if en_to_es:
        corpus_embedding = model.encode(es_list, convert_to_tensor=True)
        top_k = min(1, len(es_list))
        corpus_list = es_list
        corpus_files_list = es_files
        query_list = en_list
        query_files_list = en_files
        name = "comparison_" + company + "_en2es.xlsx"
    else:
        corpus_embedding = model.encode(en_list, convert_to_tensor=True)
        top_k = min(1, len(en_list))
        corpus_list = en_list
        corpus_files_list = en_files
        query_list = es_list
        query_files_list = es_files
        name = "comparison_" + company + "_es2en.xlsx"

    best_match = []
    best_score = []
    best_name = []
    for query in tqdm(query_list):
        query_embedding = model.encode(query, convert_to_tensor=True)

        cos_scores = util.cos_sim(query_embedding, corpus_embedding)[0]
        top_results = torch.topk(cos_scores, k=top_k)
        # print("Query:", query)
        # print("---------------------------")
        for score, idx in zip(top_results[0], top_results[1]):
            # print(f'[{idx:4d}]\t{round(score.item(), 3)} | {corpus_list[idx]}')
            best_match.append(idx)
            best_score.append(score.item())
            best_name.append(corpus_files_list[idx])
    if en_to_es:
        print(
            f"{company:15s}\t similarity score from 'en' to 'es' = {np.mean(best_score):.3f}."
        )
    else:
        print(
            f"{company:15s}\t similarity score from 'es' to 'en' = {np.mean(best_score):.3f}."
        )

    similarity_scores.append(np.mean(best_score))
    nr_ens.append(nr_en)
    nr_ess.append(nr_es)
    if save_file:
        df_comparison = (
            pd.DataFrame(
                {
                    "similarity": best_score,
                    "query": query_list,
                    "match": [corpus_list[i] for i in best_match],
                    "idx_match": [int(i) for i in best_match],
                    "query_filename": query_files_list,
                    "match_filename": best_name,
                }
            )
            .reset_index()
            .rename(columns={"index": "idx_query"})
        )
        filename = os.path.join("results/best_matching/", name)
        df_comparison.to_excel(filename, index=False)
        print("Comparison file saved to disk : '{}'.".format(filename))
    return similarity_scores, nr_ens, nr_ess


def get_similarity_scores(company, transformer_type, with_printing=True):
    """Get pr_2_pr similarity score, using a query and finding the best match.

    NOTE: For Rovi, Santander, and Solaria, manually change header of
    column B in the data/company.xlslx excel file (to "filename")"""

    from_en_to_es = [True, False]
    suffixes = ["en2es", "es_to_en"]

    for en_to_es, suffix in zip(from_en_to_es, suffixes):
        similarity_scores, nr_ens, nr_ess = pr_2_pr(
            company, transformer_type, en_to_es=en_to_es, save_file=True
        )

        df_sim = pd.DataFrame(
            {
                "company": company,
                "similarity": similarity_scores,
                "nr_en": nr_ens,
                "nr_es": nr_ess,
            }
        )
        if with_printing:
            print(tabulate(df_sim, headers="keys", tablefmt="fancy_grid"))


def get_summary_similarity(companies):
    nr_ess = np.empty(len(companies))
    nr_ens = np.empty(len(companies))
    sim_en2es = np.empty(len(companies))
    sim_es2en = np.empty(len(companies))

    for i, company in enumerate(tqdm(companies)):
        df_en_es = pd.read_excel(
            f"results/best_matching/comparison_{company}_en2es.xlsx"
        )
        df_es_en = pd.read_excel(
            f"results/best_matching/comparison_{company}_es2en.xlsx"
        )
        nr_ess[i] = df_es_en.shape[0]
        nr_ens[i] = df_en_es.shape[0]
        sim_en2es[i] = df_en_es.similarity.mean()
        sim_es2en[i] = df_es_en.similarity.mean()

    df_temp = pd.DataFrame(
        {
            "company": companies,
            "nr_en": nr_ens,
            "nr_es": nr_ess,
            "similarity_en_to_es": sim_en2es,
            "similarity_es_to_en": sim_es2en,
        }
    ).sort_values(by="company")
    filename = "results/avg_similarity_company_both.xlsx"
    df_temp.to_excel(filename, index=False)
    print(f"Saving summary table to disk file: '{filename}'.")
    print(tabulate(df_temp, headers="keys", tablefmt="fancy_grid"))


def match_topics(company):
    """Try to find the best match between topics of the same company in English and Spanish."""

    for company in companies:
        model_en = BERTopic.load(f"models/{company}_en.bert")
        model_es = BERTopic.load(f"models/{company}_es.bert")
        M = cosine_similarity(
            model_en.topic_embeddings_[1:], model_es.topic_embeddings_[1:]
        )
        idx = np.argmax(M, axis=1).tolist()  # best match for each english topic
        dfE = pd.DataFrame(model_en.get_topic_info()[1:]).rename(
            columns={"Name": "Name EN"}
        )
        dfS = (
            pd.DataFrame(model_es.get_topic_info().loc[[i + 1 for i in idx]])
            .reset_index()
            .rename(columns={"Name": "Best Match ES"})
        )
        dfMatch = (
            pd.concat(
                [dfE.reset_index(), dfS.reset_index(), pd.DataFrame(M.max(axis=1))],
                axis=1,
            )
            .rename(columns={0: "similarity"})
            .drop(columns={"index", "level_0"})
        )
        print(
            f"[{company:>20s}]\tTotal similarity score = {dfMatch.similarity.mean():.3f}"
        )
        dfMatch.to_excel(f"results/topics/topics_match_{company}.xlsx", index=False)
        print(f"Saving file 'results/topics/topics_match_{company}.xlsx' to disk.")

        cols = ["topic_" + str(i) for i in range(dfMatch.shape[0])]
        df_en = pd.DataFrame(model_en.get_topics()).iloc[:, 1:]
        df_es = pd.DataFrame(model_es.get_topics())[idx]
        df_en.columns = cols
        df_es.columns = cols
        pd.concat([df_en, df_es], axis=0).reset_index().to_excel(
            f"results/topics/topics_details_{company}.xlsx", index=False
        )
        print(f"Saving file 'results/topics/topics_details_{company}.xlsx' to disk.")


def topics_on_low_similarity_pr(company, with_printing=True):
    """Extract topics only using PR with low similarity.

    A PR has a low similarity if the similarity score is in the botton 20th percentile.
    """

    suffixes = ["en2es", "es2en"]
    print(f"** Extracting topics on low similarity PR for Company '{company}' **")

    for label, language, bert_language, suffix in zip(
        labels, languages, bert_languages, suffixes
    ):
        print(f"\t - Language : {language}")
        df = pd.read_excel(f"results/best_matching/comparison_{company}_{suffix}.xlsx")
        doc_list = df[df.similarity < np.percentile(df.similarity, q=20)][
            "query"
        ].values
        print(f"\tExtracting {len(doc_list)} PR with low similarity...")

        stopwords = set(nltk.corpus.stopwords.words(language))
        print("[S1-s.] \t Tonekize sentences")
        sentences = get_sentences(doc_list, stopwords)
        filename = os.path.join(
            "data", "sentences", ("sentences_lowsim_" + company + "_" + label + ".pkl")
        )
        pickle.dump(sentences, open(filename, "wb"))
        print(
            "[S1-e.] \t Sentences tonenized saved to disk. Total nr. sentences = {}.".format(
                len(sentences)
            )
        )

        print("[S2-s.] \t Starting with embeddings creation.")
        embs = get_embeddings(sentences)
        filename = os.path.join(
            "data", "embeddings", (company + "_lowsim_" + label + ".pkl")
        )
        print(f"[S2-e.] \t Embeddings file {filename} saved to disk.")
        pickle.dump(embs, open(filename, "wb"))

        embs = pickle.load(open(filename, "rb"))
        print(
            f"[S2.] \t Embeddings imported from {filename}. Starting topic modeling..."
        )
        model, topic, prob = get_topics(
            language=language, sentences=sentences, embeddings=embs
        )
        print("[S2.] \t Topics Modeled")
        print(model.get_topic_info())
        df_topic = pd.DataFrame(model.get_topics()).iloc[:, 1:]
        namefile = f"results/topics/topics_details_lowsim_{company}_{label}.xlsx"
        df_topic.to_excel(namefile, index=False)
        print(f"Topics details for company {company} saved to disk. File '{namefile}'")
        if with_printing:
            print(df_topic)

## Main

1. Sentence tokenize: Save sentences in file `data/sentences/company_lang_.pkl`
2. Embeddings creation: Read the file created in step 1, and create embeddings. Saved in file `data/embeddings/company_lang.pkl`
3. Get similarity score for `en2es` and `es2en`. This produces two files, `comparison_company_lang2lang.xlsx`, where `lang2lang`indicates the language of the query (first language) and the best match (second language). **NOTE**: We can choose between two transformers; however, for this task, the `LaBSE` transfomer seems to be the best.
4. Get topics using low similarity PR. For each language, we identify the PR with a similarity score in the bottom 20th percentile, in each language. For this subset of PR, we extract the most representative topics, up to a maximum of 10 topics. **NOTE**: For this task, we use the model `paraphrase-multilingual-MiniLM-L12-v2`, since it seems to provide better performance for the topic modeling task.
5. Extra Tasks:
>- Topic modeling for each company. We store the top 10 topics in an excel file.
>- Topics match for each company. We try to match each english topic with the best match (closest cosine similarity score) among the spanish topics.
Note that these extra tasks do not seem to be too informative. 

In [99]:
%%time
#  companies = ["Endesa"]
labels = ["en", "es"]
languages = ["english", "spanish"]
bert_languages = ["english", "spanish"]

topics_analysis = False  # does not seem to be too informative

for count, company in enumerate(companies):
    print(f"[{count:2.0f}/{len(companies)}]\t** Sentences and Embeddings Creation for Company '{company}' **")
    name = company + ".xlsx"
    filename = os.path.join("data", name)
    df = pd.read_excel(filename)
    df = df[~df.text.isna()]

    sentence_tokenize(company, df)
    embeddings_creation(company)
    
    #  NOTE: For Rovi, Santander, and Solaria, see comment above
    get_similarity_scores(
      company, transformer_type="laBSE"
    )  # models = "laBSE", "paraphrase-multilingual-MiniLM-L12-v2"

    topics_on_low_similarity_pr(company, with_printing=False)

    if topics_analysis:
        topics_modeling(company)
        match_topics(company)
        
print_summary_similarity(companies_all)

[ 0/30]	** Sentences and Embeddings Creation for Company 'Acerinox' **
** Extracting topics on low similarity PR for Company 'Acerinox' **
	 - Language : english
	Extracting 46 PR with low similarity...
[S1-s.] 	 Tonekize sentences
[S1-e.] 	 Sentences tonenized saved to disk. Total nr. sentences = 495.
[S2-s.] 	 Starting with embeddings creation.


Batches:   0%|          | 0/16 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/Acerinox_lowsim_en.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 15:14:58,146 - BERTopic - Reduced dimensionality
2023-12-16 15:14:58,153 - BERTopic - Clustered reduced embeddings
2023-12-16 15:15:00,439 - BERTopic - Reduced number of topics from 10 to 10


[S2.] 	 Topics Modeled
   Topic  Count                                         Name
0     -1     24         -1_amounted_payments_million_service
1      0     19          0_dividend_payment_distributed_euro
2      1     15          1_sustainable_banks_financing_green
3      2     14                2_metals_vdm_competition_2020
4      3     40               3_ebitda_million_quarter_taxes
5      4     20           4_donations_gibraltar_aid_pandemic
6      5     72  5_director_chairman_directors_reappointment
7      6    239             6_steel_market_production_safety
8      7     32         7_improvement_savings_costs_recovery
9      8     20         8_digital_360_lighting_excellence360
Topics details for company Acerinox saved to disk. File 'results/topics/topics_details_lowsim_Acerinox_en.xlsx'
	 - Language : spanish
	Extracting 46 PR with low similarity...
[S1-s.] 	 Tonekize sentences
[S1-e.] 	 Sentences tonenized saved to disk. Total nr. sentences = 509.
[S2-s.] 	 Starting with embed

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/Acerinox_lowsim_es.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 15:15:07,489 - BERTopic - Reduced dimensionality
2023-12-16 15:15:07,498 - BERTopic - Clustered reduced embeddings
2023-12-16 15:15:14,662 - BERTopic - Reduced number of topics from 20 to 10


[S2.] 	 Topics Modeled
   Topic  Count                                            Name
0     -1     60            -1_ventas_factoría_reducir_emisiones
1      0    113                0_inoxidable_mercado_acero_euros
2      1     81    1_sostenible_sostenibilidad_gestión_gobierno
3      2     49          2_harvard_business_consejera_ingeniero
4      3     46      3_presidente_igualdad_diversidad_propondrá
5      4     45      4_gibraltar_emisiones_pandemia_neutralidad
6      5     44  5_excellence_digital_transformación_aprovechar
7      6     34         6_accionistas_general_dividendo_capital
8      7     26     7_estratégicas_mejorarán_velázquez_delegado
9      8     11         8_impuestos_921_minoritarios_resultados
Topics details for company Acerinox saved to disk. File 'results/topics/topics_details_lowsim_Acerinox_es.xlsx'
[ 1/30]	** Sentences and Embeddings Creation for Company 'ACS' **
** Extracting topics on low similarity PR for Company 'ACS' **
	 - Language : english
	Extracti

Batches:   0%|          | 0/62 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/ACS_lowsim_en.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 15:15:33,022 - BERTopic - Reduced dimensionality
2023-12-16 15:15:33,055 - BERTopic - Clustered reduced embeddings
2023-12-16 15:15:40,087 - BERTopic - Reduced number of topics from 47 to 10


[S2.] 	 Topics Modeled
   Topic  Count                                            Name
0     -1    319                   -1_profit_capital_cash_shares
1      0    689                  0_profit_ebitda_euro_accounted
2      1    469         1_cleaning_board_shareholders_corporate
3      2    188         2_states_highway_australia_construction
4      3     90                      3_debt_bank_evolution_year
5      4     70               4_plant_capacity_transmission_epc
6      5     61  5_pacific_diversification_geographical_africa1
7      6     29               6_stake_shares_dividend_dividends
8      7     24                           7_oficina_consultas__
9      8     22                  8_spanish_rebound_growth_boost
Topics details for company ACS saved to disk. File 'results/topics/topics_details_lowsim_ACS_en.xlsx'
	 - Language : spanish
	Extracting 65 PR with low similarity...
[S1-s.] 	 Tonekize sentences
[S1-e.] 	 Sentences tonenized saved to disk. Total nr. sentences = 1886.
[S2-s

Batches:   0%|          | 0/59 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/ACS_lowsim_es.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 15:15:58,450 - BERTopic - Reduced dimensionality
2023-12-16 15:15:58,478 - BERTopic - Clustered reduced embeddings
2023-12-16 15:16:05,724 - BERTopic - Reduced number of topics from 46 to 10


[S2.] 	 Topics Modeled
   Topic  Count                                               Name
0     -1    412      -1_estados_financiación_capital_participación
1      0    696  0_accionistas_sociedad_información_administración
2      1    247               1_total_internacionales_meses_ventas
3      2    149          2_australia_autopista_proyecto_ampliación
4      3    126                   3_américa_limpieza_europa_áfrica
5      4    102          4_energía_transmisión_renovable_eléctrica
6      5     76        5_deuda_endeudamiento_financiera_dividendos
7      6     35                6_canadá_montreal_vancouver_ontario
8      7     30           7_corporación_magnitudes_beneficio_ebit1
9      8     13              8_dólar_depreciación_australiano_peso
Topics details for company ACS saved to disk. File 'results/topics/topics_details_lowsim_ACS_es.xlsx'
[ 2/30]	** Sentences and Embeddings Creation for Company 'Bancosantander' **
** Extracting topics on low similarity PR for Company 'Bancos

Batches:   0%|          | 0/89 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/Bancosantander_lowsim_en.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 15:16:27,598 - BERTopic - Reduced dimensionality
2023-12-16 15:16:27,646 - BERTopic - Clustered reduced embeddings
2023-12-16 15:16:36,871 - BERTopic - Reduced number of topics from 65 to 10


[S2.] 	 Topics Modeled
   Topic  Count                                            Name
0     -1   1018           -1_financial_support_digital_services
1      0   1248              0_million_universities_polska_2020
2      1    158  1_disabilities_students_accessibility_language
3      2    100               2_ibiznes24_mail_channels_prasowe
4      3     99               3_carbon_emissions_renewable_wind
5      4     77                      4_tamaño_jpeg_mb_packaging
6      5     63          5_deforestation_amazon_brazil_ranchers
7      6     36                  6_café_mural_espresso_brooklyn
8      7     33     7_coronavirus_vaccination_vaccines_outbreak
9      8     14           8_tablet_identity_biometric_signature
Topics details for company Bancosantander saved to disk. File 'results/topics/topics_details_lowsim_Bancosantander_en.xlsx'
	 - Language : spanish
	Extracting 221 PR with low similarity...
[S1-s.] 	 Tonekize sentences
[S1-e.] 	 Sentences tonenized saved to disk. Total nr. 

Batches:   0%|          | 0/124 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/Bancosantander_lowsim_es.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 15:17:09,220 - BERTopic - Reduced dimensionality
2023-12-16 15:17:09,295 - BERTopic - Clustered reduced embeddings
2023-12-16 15:17:22,314 - BERTopic - Reduced number of topics from 95 to 10


[S2.] 	 Topics Modeled
   Topic  Count                                             Name
0     -1   1218          -1_empresas_proyectos_tamaño_sostenible
1      0   2265                 0_banco_clientes_madrid_empresas
2      1    147             1_violencia_mujeres_integra_víctimas
3      2     89  2_coronavirus_pandemia_diagnóstico_enfermedades
4      3     58            3_hidrógeno_carbono_emisiones_dióxido
5      4     57                4_hipoteca_tae_hipotecas_préstamo
6      5     54         5_ciberseguridad_cyber_seguridad_podcast
7      6     22                      6_chile_2016_polonia_brasil
8      7     16           7_exterior_consolidar_exportar_exporta
9      8     16          8_oficinas_sector_reconversión_plantean
Topics details for company Bancosantander saved to disk. File 'results/topics/topics_details_lowsim_Bancosantander_es.xlsx'
[ 3/30]	** Sentences and Embeddings Creation for Company 'Bankinter' **
** Extracting topics on low similarity PR for Company 'Bankinter' 

Batches:   0%|          | 0/72 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/Bankinter_lowsim_en.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 15:17:39,227 - BERTopic - Reduced dimensionality
2023-12-16 15:17:39,266 - BERTopic - Clustered reduced embeddings
2023-12-16 15:17:46,726 - BERTopic - Reduced number of topics from 50 to 10


[S2.] 	 Topics Modeled
   Topic  Count                                               Name
0     -1    804              -1_account_insurance_investment_euros
1      0   1025                      0_euros_ratio_billion_banking
2      1    137           1_energy_buildings_efficiency_renovation
3      2    116  2_sustainability_environmental_carbon_sustainable
4      3     55  3_accessibility_disabilities_students_voluntee...
5      4     47                4_fixed_variable_mortgage_mortgages
6      5     39                5_shock_coronavirus_measures_crisis
7      6     33             6_ecosystem_observatory_silicon_valley
8      7     30              7_digital_app_digitalisation_browsing
9      8     10                         8_apple_iphone_ipad_sensor
Topics details for company Bankinter saved to disk. File 'results/topics/topics_details_lowsim_Bankinter_en.xlsx'
	 - Language : spanish
	Extracting 123 PR with low similarity...
[S1-s.] 	 Tonekize sentences
[S1-e.] 	 Sentences tonenized sa

Batches:   0%|          | 0/67 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/Bankinter_lowsim_es.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 15:18:03,717 - BERTopic - Reduced dimensionality
2023-12-16 15:18:03,753 - BERTopic - Clustered reduced embeddings
2023-12-16 15:18:10,908 - BERTopic - Reduced number of topics from 45 to 10


[S2.] 	 Topics Modeled
   Topic  Count                                              Name
0     -1    765                    -1_euros_millones_banco_fondos
1      0    523                0_millones_crecimiento_euros_ratio
2      1    358             1_hipoteca_préstamos_crédito_tarjetas
3      2    166     2_sostenibilidad_energética_ambiental_carbono
4      3    152            3_liberty_vehículos_españoles_portugal
5      4     79          4_digitalización_digitales_móvil_digital
6      5     59  5_discapacidad_fundación_accesibilidad_formación
7      6     16             6_hoteles_hoteleros_bursátil_hotelero
8      7     13                         7_apple_iphone_ipad_watch
9      8     13         8_coronavirus_batería_familiares_epidemia
Topics details for company Bankinter saved to disk. File 'results/topics/topics_details_lowsim_Bankinter_es.xlsx'
[ 4/30]	** Sentences and Embeddings Creation for Company 'BBVA' **
** Extracting topics on low similarity PR for Company 'BBVA' **
	 - La

Batches:   0%|          | 0/120 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/BBVA_lowsim_en.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 15:18:45,459 - BERTopic - Reduced dimensionality
2023-12-16 15:18:45,521 - BERTopic - Clustered reduced embeddings
2023-12-16 15:18:58,618 - BERTopic - Reduced number of topics from 98 to 10


[S2.] 	 Topics Modeled
   Topic  Count                                      Name
0     -1    735                 -1_loans_sector_23_income
1      0   2569          0_income_financial_million_euros
2      1    205  1_sustainable_sustainability_carbon_2025
3      2    119       2_currencies_bitcoin_crypto_virtual
4      3     85          3_argentina_brazil_colombia_peru
5      4     42       4_unicorns_unicorn_bubble_companies
6      5     19  5_connected_internet_automation_remotely
7      6     14                 6_mexico_hiking_fed_hikes
8      7     14            7_drones_drone_aircraft_parrot
9      8     11        8_blood_tests_theranos_reliability
Topics details for company BBVA saved to disk. File 'results/topics/topics_details_lowsim_BBVA_en.xlsx'
	 - Language : spanish
	Extracting 105 PR with low similarity...
[S1-s.] 	 Tonekize sentences
[S1-e.] 	 Sentences tonenized saved to disk. Total nr. sentences = 10436.
[S2-s.] 	 Starting with embeddings creation.


Batches:   0%|          | 0/327 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/BBVA_lowsim_es.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 15:20:01,573 - BERTopic - Reduced dimensionality
2023-12-16 15:20:01,763 - BERTopic - Clustered reduced embeddings
2023-12-16 15:20:39,199 - BERTopic - Reduced number of topics from 200 to 10


[S2.] 	 Topics Modeled
   Topic  Count                                          Name
0     -1   3341                 -1_méxico_research_2020_datos
1      0   5912                             0_81_76_70_méxico
2      1    755        1_vivienda_hipotecario_ahorro_bancario
3      2    134             2_internet_usuarios_móvil_móviles
4      3    105     3_edificio_edificios_plaza_comunicaciones
5      4     52    4_manufacturas_metálicos_plástico_textiles
6      5     51   5_innovación_emprendedor_patentes_invención
7      6     41           6_virus_contagio_pandemia_contagios
8      7     29  7_repatriados_migrantes_mexicans_repatriated
9      8     16                                      8_int___
Topics details for company BBVA saved to disk. File 'results/topics/topics_details_lowsim_BBVA_es.xlsx'
[ 5/30]	** Sentences and Embeddings Creation for Company 'Caixa' **
** Extracting topics on low similarity PR for Company 'Caixa' **
	 - Language : english
	Extracting 129 PR with low similar

Batches:   0%|          | 0/660 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/Caixa_lowsim_en.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 15:22:20,952 - BERTopic - Reduced dimensionality


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

2023-12-16 15:22:22,424 - BERTopic - Clustered reduced embeddings
2023-12-16 15:23:57,550 - BERTopic - Reduced number of topics from 857 to 10


[S2.] 	 Topics Modeled
   Topic  Count                                Name
0     -1   1495     -1_bank_capital_climate_banking
1      0  19313  0_bank_banking_customers_financial
2      1    144   1_cardiovascular_cancer_york_jobs
3      2     43      2_elderly_aging_ageing_dignity
4      3     33    3_moratorium_apply_mortgage_loan
5      4     26   4_turkish_ukrainian_turkey_charge
6      5     23    5_30am_30pm_timetables_schedules
7      6     17               6_open_branches_bank_
8      7     12                         7_driven___
9      8     11                      8_coupon_set__
Topics details for company Caixa saved to disk. File 'results/topics/topics_details_lowsim_Caixa_en.xlsx'
	 - Language : spanish
	Extracting 587 PR with low similarity...
[S1-s.] 	 Tonekize sentences
[S1-e.] 	 Sentences tonenized saved to disk. Total nr. sentences = 80133.
[S2-s.] 	 Starting with embeddings creation.


Batches:   0%|          | 0/2505 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/Caixa_lowsim_es.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 15:38:12,432 - BERTopic - Reduced dimensionality


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

2023-12-16 15:38:16,492 - BERTopic - Clustered reduced embeddings
2023-12-16 15:45:41,563 - BERTopic - Reduced number of topics from 2828 to 10


[S2.] 	 Topics Modeled
   Topic  Count                                          Name
0     -1   3877         -1_director_caixabank_sector_clientes
1      0  75805          0_caixabank_clientes_empresas_sector
2      1    220  1_biotech_suanfarma_biotecnología_organismos
3      2     60           2_patriotismo_situada_celebró_santa
4      3     51     3_filantrópicos_euromoney_revista_impacto
5      4     49                                      4_30h___
6      5     19                  5_rumbao_pérez_300_empleados
7      6     18                6_partner_socio_fútbol_oficial
8      7     18                  7_base_principal_banco_mayor
9      8     16      8_escuelas_educación_cataluña_financiera
Topics details for company Caixa saved to disk. File 'results/topics/topics_details_lowsim_Caixa_es.xlsx'
[ 6/30]	** Sentences and Embeddings Creation for Company 'Colonial' **
** Extracting topics on low similarity PR for Company 'Colonial' **
	 - Language : english
	Extracting 7 PR with low s

Batches:   0%|          | 0/9 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/Colonial_lowsim_en.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 15:45:47,987 - BERTopic - Reduced dimensionality
2023-12-16 15:45:47,993 - BERTopic - Clustered reduced embeddings
2023-12-16 15:45:48,625 - BERTopic - Reduced number of topics from 3 to 3


[S2.] 	 Topics Modeled
   Topic  Count                              Name
0      0     19        0_40732015_26104_246976_v4
1      1     44  1_deed_covenant_issuer_agreement
2      2    220  2_portfolio_assets_madrid_market
Topics details for company Colonial saved to disk. File 'results/topics/topics_details_lowsim_Colonial_en.xlsx'
	 - Language : spanish
	Extracting 7 PR with low similarity...
[S1-s.] 	 Tonekize sentences
[S1-e.] 	 Sentences tonenized saved to disk. Total nr. sentences = 264.
[S2-s.] 	 Starting with embeddings creation.


Batches:   0%|          | 0/9 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/Colonial_lowsim_es.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 15:45:54,344 - BERTopic - Reduced dimensionality
2023-12-16 15:45:54,349 - BERTopic - Clustered reduced embeddings
2023-12-16 15:45:55,595 - BERTopic - Reduced number of topics from 8 to 8


[S2.] 	 Topics Modeled
   Topic  Count                                     Name
0     -1    115              -1_clientes_calidad_www_cbd
1      0     12  0_carbono_emisiones_sustainability_2050
2      1     49         1_barcelona_parís_madrid_mercado
3      2     21    2_iniciativas_ayudas_social_servicios
4      3     16        3_bonos_verdes_green_financiación
5      4     19  4_sfl_accionistas_participación_capital
6      5     17           5_trimestre_2020_periodo_hotel
7      6     15                6_precios_erv_20_alquiler
Topics details for company Colonial saved to disk. File 'results/topics/topics_details_lowsim_Colonial_es.xlsx'
[ 7/30]	** Sentences and Embeddings Creation for Company 'Enagas' **
** Extracting topics on low similarity PR for Company 'Enagas' **
	 - Language : english
	Extracting 71 PR with low similarity...
[S1-s.] 	 Tonekize sentences
[S1-e.] 	 Sentences tonenized saved to disk. Total nr. sentences = 1909.
[S2-s.] 	 Starting with embeddings creation.


Batches:   0%|          | 0/60 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/Enagas_lowsim_en.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 15:46:11,846 - BERTopic - Reduced dimensionality
2023-12-16 15:46:11,875 - BERTopic - Clustered reduced embeddings
2023-12-16 15:46:19,740 - BERTopic - Reduced number of topics from 44 to 10


[S2.] 	 Topics Modeled
   Topic  Count                                               Name
0     -1    568  -1_transition_transmission_network_infrastructure
1      0    839                 0_hydrogen_energy_renewable_supply
2      1    149                      1_demand_gwh_natural_february
3      2    130                         2_euros_dividend_debt_line
4      3     90          3_directors_director_executive_innovation
5      4     53        4_pipelines_pipeline_regasification_germany
6      5     26                 5_reforestation_co₂_hectares_trees
7      6     24           6_biogas_biomethane_wastewater_renewable
8      7     17          7_photovoltaic_solar_solaria_construction
9      8     13       8_acciona_regenerative_economy_institutional
Topics details for company Enagas saved to disk. File 'results/topics/topics_details_lowsim_Enagas_en.xlsx'
	 - Language : spanish
	Extracting 78 PR with low similarity...
[S1-s.] 	 Tonekize sentences
[S1-e.] 	 Sentences tonenized saved to 

Batches:   0%|          | 0/51 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/Enagas_lowsim_es.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 15:46:33,988 - BERTopic - Reduced dimensionality
2023-12-16 15:46:34,013 - BERTopic - Clustered reduced embeddings
2023-12-16 15:46:34,986 - BERTopic - Reduced number of topics from 6 to 6


[S2.] 	 Topics Modeled
   Topic  Count                                           Name
0     -1     20          -1_electrónico_correo_bolsa_plinovodi
1      0   1521            0_gas_hidrógeno_millones_energética
2      1     21           1_institucionales_inversores_tel_985
3      2     10      2_com_información_fundacionrepsol_atrevia
4      3     26                3_tel_electrónico_madrid_dircom
5      4     28  4_relaciones_institucionales_tel_comunicación
Topics details for company Enagas saved to disk. File 'results/topics/topics_details_lowsim_Enagas_es.xlsx'
[ 8/30]	** Sentences and Embeddings Creation for Company 'Endesa' **
** Extracting topics on low similarity PR for Company 'Endesa' **
	 - Language : english
	Extracting 30 PR with low similarity...
[S1-s.] 	 Tonekize sentences
[S1-e.] 	 Sentences tonenized saved to disk. Total nr. sentences = 805.
[S2-s.] 	 Starting with embeddings creation.


Batches:   0%|          | 0/26 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/Endesa_lowsim_en.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 15:46:43,886 - BERTopic - Reduced dimensionality
2023-12-16 15:46:43,900 - BERTopic - Clustered reduced embeddings
2023-12-16 15:46:47,964 - BERTopic - Reduced number of topics from 21 to 10


[S2.] 	 Topics Modeled
   Topic  Count                                          Name
0     -1    113             -1_circular_prices_energy_voltage
1      0    228  0_enel_electrification_electricity_renewable
2      1    160      1_employees_workforce_marijuana_training
3      2    100                 2_music_madrid_piano_concerts
4      3     61    3_species_birds_electrocution_conservation
5      4     57                  4_cathedral_2021_santa_mayor
6      5     25           5_hotels_charging_mobility_chargers
7      6     23                     6_wind_laser_beam_horizon
8      7     19               7_eruption_lava_island_volcanic
9      8     19        8_train_hydroelectric_transport_diesel
Topics details for company Endesa saved to disk. File 'results/topics/topics_details_lowsim_Endesa_en.xlsx'
	 - Language : spanish
	Extracting 29 PR with low similarity...
[S1-s.] 	 Tonekize sentences
[S1-e.] 	 Sentences tonenized saved to disk. Total nr. sentences = 676.
[S2-s.] 	 Starting wit

Batches:   0%|          | 0/22 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/Endesa_lowsim_es.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 15:46:55,555 - BERTopic - Reduced dimensionality
2023-12-16 15:46:55,568 - BERTopic - Clustered reduced embeddings
2023-12-16 15:46:59,469 - BERTopic - Reduced number of topics from 21 to 10


[S2.] 	 Topics Modeled
   Topic  Count                                        Name
0     -1    115    -1_director_sostenible_financiera_futuro
1      0    262  0_baloncesto_públicas_distribución_negocio
2      1     79           1_senior_alumnos_profesionales_50
3      2     74          2_energética_ahorro_emisiones_2040
4      3     48        3_río_colisión_4metering_dispositivo
5      4     31          4_hoteles_hotelera_hotelero_hotels
6      5     21             5_movilidad_pagar_juicepass_app
7      6     19    6_microalgas_producción_biotech_biorizon
8      7     17  7_biodiversidad_conservación_fauna_natural
9      8     10     8_premios_premio_elegidos_participantes
Topics details for company Endesa saved to disk. File 'results/topics/topics_details_lowsim_Endesa_es.xlsx'
[ 9/30]	** Sentences and Embeddings Creation for Company 'Ferrovial' **
** Extracting topics on low similarity PR for Company 'Ferrovial' **
	 - Language : english
	Extracting 73 PR with low similarity...
[S1-

Batches:   0%|          | 0/53 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/Ferrovial_lowsim_en.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 15:47:12,376 - BERTopic - Reduced dimensionality
2023-12-16 15:47:12,402 - BERTopic - Clustered reduced embeddings
2023-12-16 15:47:18,257 - BERTopic - Reduced number of topics from 34 to 10


[S2.] 	 Topics Modeled
   Topic  Count                                           Name
0     -1    498              -1_amey_contract_consortium_texas
1      0    479               0_backlog_revenues_euro_amounted
2      1    256                  1_traffic_roads_lanes_highway
3      2    102         2_airport_passengers_heathrow_airports
4      3    101                3_waste_services_water_cleaning
5      4    100                4_debt_liquidity_cash_financial
6      5     54  5_volunteers_commitment_sustainable_community
7      6     50                       6_24_2024_scheduled_plan
8      7     18       7_balconies_hope_photographed_reflection
9      8     11  8_hyperlooptt_hyperloop_leader_transportation
Topics details for company Ferrovial saved to disk. File 'results/topics/topics_details_lowsim_Ferrovial_en.xlsx'
	 - Language : spanish
	Extracting 73 PR with low similarity...
[S1-s.] 	 Tonekize sentences
[S1-e.] 	 Sentences tonenized saved to disk. Total nr. sentences = 1672.
[S2-

Batches:   0%|          | 0/53 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/Ferrovial_lowsim_es.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 15:47:32,281 - BERTopic - Reduced dimensionality
2023-12-16 15:47:32,306 - BERTopic - Clustered reduced embeddings
2023-12-16 15:47:39,639 - BERTopic - Reduced number of topics from 45 to 10


[S2.] 	 Topics Modeled
   Topic  Count                                            Name
0     -1    602         -1_autopista_dólares_contratos_división
1      0    527                   0_euros_407_servicios_negocio
2      1    196         1_autopistas_tráfico_tráficos_movilidad
3      2    151       2_pasajeros_aeropuertos_aeropuerto_denver
4      3     55          3_dividendo_derechos_bonos_accionistas
5      4     51          4_sociales_pandemia_empleados_usuarios
6      5     38     5_sheffield_carreteras_ministerio_británico
7      6     24           6_agua_saneamiento_áfrica_comunidades
8      7     14  7_hyperloop_hyperlooptt_tecnología_experiencia
9      8     14                 8_chile_estación_minero_chilena
Topics details for company Ferrovial saved to disk. File 'results/topics/topics_details_lowsim_Ferrovial_es.xlsx'
[10/30]	** Sentences and Embeddings Creation for Company 'Grifols' **
** Extracting topics on low similarity PR for Company 'Grifols' **
	 - Language : english

Batches:   0%|          | 0/95 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/Grifols_lowsim_en.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 15:48:04,246 - BERTopic - Reduced dimensionality
2023-12-16 15:48:04,300 - BERTopic - Clustered reduced embeddings
2023-12-16 15:48:17,924 - BERTopic - Reduced number of topics from 90 to 10


[S2.] 	 Topics Modeled
   Topic  Count                                       Name
0     -1    701           -1_debt_patients_plasma_increase
1      0   1396      0_spanish_million_version_discrepancy
2      1    295        1_bioscience_plasma_revenues_growth
3      2    229      2_shareholders_directors_board_shares
4      3    165         3_blood_procleix_transfusion_virus
5      4     71       4_proteins_antitrypsin_alpha_protein
6      5     68      5_centers_exceeded_donation_employees
7      6     60     6_alzheimer_polio_survivors_prevalence
8      7     29           7_china_renewal_albumin_licenses
9      8     23  8_liver_cirrhosis_hemodialysis_transplant
Topics details for company Grifols saved to disk. File 'results/topics/topics_details_lowsim_Grifols_en.xlsx'
	 - Language : spanish
	Extracting 49 PR with low similarity...
[S1-s.] 	 Tonekize sentences
[S1-e.] 	 Sentences tonenized saved to disk. Total nr. sentences = 2759.
[S2-s.] 	 Starting with embeddings creation.


Batches:   0%|          | 0/87 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/Grifols_lowsim_es.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 15:48:42,365 - BERTopic - Reduced dimensionality
2023-12-16 15:48:42,409 - BERTopic - Clustered reduced embeddings
2023-12-16 15:48:55,741 - BERTopic - Reduced number of topics from 77 to 10


[S2.] 	 Topics Modeled
   Topic  Count                                               Name
0     -1    837           -1_millones_ventas_trimestre_crecimiento
1      0   1072              0_mercado_inversiones_ingresos_futuro
2      1    285    1_supplies_farmacéutico_medicamentos_bioscience
3      2    204                 2_plasma_deuda_financiación_costes
4      3    193  3_inmunoglobulina_transfusional_transfusión_sa...
5      4     51          4_alzheimer_alzhéimer_replacement_albumin
6      5     46                         5_virus_detección_zika_fda
7      6     32        6_duomocomunicacion_comunicación_tel_prensa
8      7     20                   7_europa_europeos_alemania_unión
9      8     19             8_latinoamérica_chile_argentina_méxico
Topics details for company Grifols saved to disk. File 'results/topics/topics_details_lowsim_Grifols_es.xlsx'
[11/30]	** Sentences and Embeddings Creation for Company 'IAG' **
** Extracting topics on low similarity PR for Company 'IAG' **
	

Batches:   0%|          | 0/44 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/IAG_lowsim_en.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 15:49:09,217 - BERTopic - Reduced dimensionality
2023-12-16 15:49:09,239 - BERTopic - Clustered reduced embeddings
2023-12-16 15:49:14,738 - BERTopic - Reduced number of topics from 36 to 10


[S2.] 	 Topics Modeled
   Topic  Count                                       Name
0     -1    408              -1_km_revenue_cargo_passenger
1      0    378          0_statements_company_events_plans
2      1    157           1_cent_traffic_capacity_measured
3      2    135              2_reuters_thomson_offer_bonds
4      3     79  3_oneworld_airlines_colombia_destinations
5      4     61             4_billion_liquidity_cash_loans
6      5     57              5_dublin_ireland_aer_heathrow
7      6     44        6_travel_caa_expansion_restrictions
8      7     42          7_airways_flights_aircraft_pilots
9      8     32         8_millions_km_america_reclassified
Topics details for company IAG saved to disk. File 'results/topics/topics_details_lowsim_IAG_en.xlsx'
	 - Language : spanish
	Extracting 47 PR with low similarity...
[S1-s.] 	 Tonekize sentences
[S1-e.] 	 Sentences tonenized saved to disk. Total nr. sentences = 1339.
[S2-s.] 	 Starting with embeddings creation.


Batches:   0%|          | 0/42 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/IAG_lowsim_es.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 15:49:28,339 - BERTopic - Reduced dimensionality
2023-12-16 15:49:28,360 - BERTopic - Clustered reduced embeddings
2023-12-16 15:49:34,095 - BERTopic - Reduced number of topics from 38 to 10


[S2.] 	 Topics Modeled
   Topic  Count                                         Name
0     -1    350                   -1_bonos_euros_costes_2020
1      0    418                           0_lse_madrid_13_10
2      1    154          1_incrementó_demanda_tráfico_vuelos
3      2    125      2_gastos_combustible_ingresos_impuestos
4      3     94             3_crédito_liquidez_dólares_deuda
5      4     80             4_airways_heathrow_airbus_vuelos
6      5     51     5_aumentaron_constantes_ingresos_aumento
7      6     40  6_latinoamérica_norteamérica_áfrica_oriente
8      7     14                7_pensiones_airways_naps_plan
9      8     13                  8_dism_increme_capacidad_la
Topics details for company IAG saved to disk. File 'results/topics/topics_details_lowsim_IAG_es.xlsx'
[12/30]	** Sentences and Embeddings Creation for Company 'Iberdrola' **
** Extracting topics on low similarity PR for Company 'Iberdrola' **
	 - Language : english
	Extracting 190 PR with low similarity..

Batches:   0%|          | 0/129 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/Iberdrola_lowsim_en.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 15:50:06,261 - BERTopic - Reduced dimensionality
2023-12-16 15:50:06,339 - BERTopic - Clustered reduced embeddings
2023-12-16 15:50:31,853 - BERTopic - Reduced number of topics from 100 to 10


[S2.] 	 Topics Modeled
   Topic  Count                                      Name
0     -1    894        -1_energy_renewable_company_photos
1      0   2333            0_billion_climate_energy_group
2      1    491              1_wind_offshore_baltic_eagle
3      2    151     2_mobility_electric_charging_stations
4      3     76            3_credit_financing_green_bonds
5      4     45        4_birds_species_migratory_habitats
6      5     42     5_museum_exhibition_artistic_castilla
7      6     41  6_volunteers_volunteer_food_volunteering
8      7     25       7_women_federations_karateka_medals
9      8     17     8_waste_recycling_processes_materials
Topics details for company Iberdrola saved to disk. File 'results/topics/topics_details_lowsim_Iberdrola_en.xlsx'
	 - Language : spanish
	Extracting 181 PR with low similarity...
[S1-s.] 	 Tonekize sentences
[S1-e.] 	 Sentences tonenized saved to disk. Total nr. sentences = 3676.
[S2-s.] 	 Starting with embeddings creation.


Batches:   0%|          | 0/115 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/Iberdrola_lowsim_es.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 15:51:06,413 - BERTopic - Reduced dimensionality
2023-12-16 15:51:06,480 - BERTopic - Clustered reduced embeddings
2023-12-16 15:51:17,613 - BERTopic - Reduced number of topics from 78 to 10


[S2.] 	 Topics Modeled
   Topic  Count                                               Name
0     -1   1285             -1_proyectos_marino_desarrollo_eólicos
1      0   1909       0_millones_información_renovables_energética
2      1    146       1_dividendo_financiación_crédito_accionistas
3      2     81                2_innovation_innovación_smart_grids
4      3     53    3_biodiversidad_migración_conservación_birdlife
5      4     46  4_voluntariado_voluntarios_alimentación_solida...
6      5     45               5_mujeres_diversidad_igualdad_gender
7      6     42  6_movilidad_vehículos_transporte_interoperabil...
8      7     40                     7_hoteles_fibra_óptica_resorts
9      8     29           8_contrataciones_trabajadores_500_empleo
Topics details for company Iberdrola saved to disk. File 'results/topics/topics_details_lowsim_Iberdrola_es.xlsx'
[13/30]	** Sentences and Embeddings Creation for Company 'Inditex' **
** Extracting topics on low similarity PR for Company 'I

Batches:   0%|          | 0/23 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/Inditex_lowsim_en.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 15:51:26,480 - BERTopic - Reduced dimensionality
2023-12-16 15:51:26,493 - BERTopic - Clustered reduced embeddings
2023-12-16 15:51:30,685 - BERTopic - Reduced number of topics from 21 to 10


[S2.] 	 Topics Modeled
   Topic  Count                                               Name
0     -1    145                    -1_stores_brand_collection_pull
1      0    139   0_sustainability_cotton_sustainable_stradivarius
2      1    121                      1_growth_billion_sales_profit
3      2    109                        2_stores_openings_bear_jobs
4      3     60      3_disabilities_programmes_refugees_disability
5      4     47              4_industriall_union_workers_agreement
6      5     31                                5_reserved_rights__
7      6     20  6_reporttalentlife_approachleadershiphistoryet...
8      7     18                     7_rfid_radio_frequency_fishing
9      8     15                      8_barcelona_calle_store_spain
Topics details for company Inditex saved to disk. File 'results/topics/topics_details_lowsim_Inditex_en.xlsx'
	 - Language : spanish
	Extracting 31 PR with low similarity...
[S1-s.] 	 Tonekize sentences
[S1-e.] 	 Sentences tonenized saved t

Batches:   0%|          | 0/21 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/Inditex_lowsim_es.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 15:51:39,311 - BERTopic - Reduced dimensionality
2023-12-16 15:51:39,321 - BERTopic - Clustered reduced embeddings
2023-12-16 15:51:39,697 - BERTopic - Reduced number of topics from 3 to 3


[S2.] 	 Topics Modeled
   Topic  Count                                 Name
0      0     32          0_rights_reserved_ver_anexo
1      1     19  1_2007_finlandia_bulgaria_eslovenia
2      2    605     2_inditex_tiendas_euros_millones
Topics details for company Inditex saved to disk. File 'results/topics/topics_details_lowsim_Inditex_es.xlsx'
[14/30]	** Sentences and Embeddings Creation for Company 'Acciona' **
** Extracting topics on low similarity PR for Company 'Acciona' **
	 - Language : english
	Extracting 178 PR with low similarity...
[S1-s.] 	 Tonekize sentences
[S1-e.] 	 Sentences tonenized saved to disk. Total nr. sentences = 2929.
[S2-s.] 	 Starting with embeddings creation.


Batches:   0%|          | 0/92 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/Acciona_lowsim_en.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 15:52:05,831 - BERTopic - Reduced dimensionality
2023-12-16 15:52:05,878 - BERTopic - Clustered reduced embeddings
2023-12-16 15:52:16,014 - BERTopic - Reduced number of topics from 58 to 10


[S2.] 	 Topics Modeled
   Topic  Count                                    Name
0     -1    920       -1_energy_renewable_million_chile
1      0    990    0_ebitda_securities_revenues_climate
2      1    508    1_desalination_farm_solar_wastewater
3      2    212             2_rail_metro_railway_tunnel
4      3    127  3_exhibition_painting_qiang_volunteers
5      4     54        4_concrete_3d_materials_printing
6      5     50            5_species_vulture_kite_birds
7      6     31       6_scooters_scooter_mobility_users
8      7     22       7_battery_storage_batteries_texas
9      8     15                8_salmon_fish_sea_ladder
Topics details for company Acciona saved to disk. File 'results/topics/topics_details_lowsim_Acciona_en.xlsx'
	 - Language : spanish
	Extracting 215 PR with low similarity...
[S1-s.] 	 Tonekize sentences
[S1-e.] 	 Sentences tonenized saved to disk. Total nr. sentences = 2661.
[S2-s.] 	 Starting with embeddings creation.


Batches:   0%|          | 0/84 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/Acciona_lowsim_es.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 15:52:40,054 - BERTopic - Reduced dimensionality
2023-12-16 15:52:40,096 - BERTopic - Clustered reduced embeddings
2023-12-16 15:52:48,580 - BERTopic - Reduced number of topics from 52 to 10


[S2.] 	 Topics Modeled
   Topic  Count                                        Name
0     -1    972  -1_madrid_construcción_proyecto_desarrollo
1      0   1012             0_millones_test_australia_metro
2      1    174           1_puente_túneles_túnel_kilómetros
3      2    166  2_emisiones_hidrógeno_renovable_renovables
4      3    152                3_agua_aguas_población_water
5      4    117         4_mujeres_violencia_igualdad_cáncer
6      5     24                     5_2019_2021_verano_2025
7      6     17      6_aeropuertos_aeropuerto_chile_airport
8      7     14             7_fibra_fibras_acero_materiales
9      8     13   8_ibex_interacciones_seguidores_instagram
Topics details for company Acciona saved to disk. File 'results/topics/topics_details_lowsim_Acciona_es.xlsx'
[15/30]	** Sentences and Embeddings Creation for Company 'Arcelormittal' **
** Extracting topics on low similarity PR for Company 'Arcelormittal' **
	 - Language : english
	Extracting 90 PR with low similari

Batches:   0%|          | 0/60 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/Arcelormittal_lowsim_en.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 15:53:05,409 - BERTopic - Reduced dimensionality
2023-12-16 15:53:05,437 - BERTopic - Clustered reduced embeddings
2023-12-16 15:53:11,499 - BERTopic - Reduced number of topics from 38 to 10


[S2.] 	 Topics Modeled
   Topic  Count                                            Name
0     -1    313                -1_earnings_europe_sales_results
1      0   1114             0_exchanges_million_iron_luxembourg
2      1    183              1_energy_renewable_electric_reduce
3      2     91                2_cleaner_stronger_steels_bumper
4      3     39           3_shares_shareholders_rights_warrants
5      4     36               4_dividend_schedule_euros_payment
6      5     34              5_strength_usibor_automotive_rails
7      6     34           6_executive_leadership_senior_elected
8      7     28  7_environmental_plant_commissioners_protection
9      8     18     8_statements_expectations_projected_predict
Topics details for company Arcelormittal saved to disk. File 'results/topics/topics_details_lowsim_Arcelormittal_en.xlsx'
	 - Language : spanish
	Extracting 39 PR with low similarity...
[S1-s.] 	 Tonekize sentences
[S1-e.] 	 Sentences tonenized saved to disk. Total nr. sen

Batches:   0%|          | 0/21 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/Arcelormittal_lowsim_es.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 15:53:19,852 - BERTopic - Reduced dimensionality
2023-12-16 15:53:19,862 - BERTopic - Clustered reduced embeddings
2023-12-16 15:53:23,002 - BERTopic - Reduced number of topics from 12 to 10


[S2.] 	 Topics Modeled
   Topic  Count                                               Name
0     -1     36          -1_información_investors_accidentes_tubos
1      0    228                   0_millones_carbono_emisiones_co2
2      1    149                         1_3d_diseño_printing_steel
3      2     74              2_hidrógeno_hydrogen_energy_renovable
4      3     40                   3_2021_resultados_trimestre_2022
5      4     40                 4_inglés_discrepancia_texto_prensa
6      5     38  5_facebooktwitterlinkedinemailprint_enlace_sos...
7      6     25          6_corporate_reglamento_luxemburgo_mercado
8      7     21                           7_deck_panel_magnelis_fm
9      8     16                      8_capital_accionista_venta_34
Topics details for company Arcelormittal saved to disk. File 'results/topics/topics_details_lowsim_Arcelormittal_es.xlsx'
[16/30]	** Sentences and Embeddings Creation for Company 'Bancosabadell' **
** Extracting topics on low similarity PR 

Batches:   0%|          | 0/42 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/Bancosabadell_lowsim_en.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 15:53:36,315 - BERTopic - Reduced dimensionality
2023-12-16 15:53:36,336 - BERTopic - Clustered reduced embeddings
2023-12-16 15:53:41,204 - BERTopic - Reduced number of topics from 29 to 10


[S2.] 	 Topics Modeled
   Topic  Count                                            Name
0     -1    427                -1_market_insurance_funds_growth
1      0    475                    0_income_million_euros_loans
2      1    126              1_bstartup_investment_venture_plan
3      2    100                    2_ratio_coverage_bps_capital
4      3     62  3_sustainability_sustainable_climate_framework
5      4     52                  4_cancer_biomedical_award_jury
6      5     40             5_mobile_app_mastercard_contactless
7      6     30             6_digital_awareness_sports_channels
8      7     15                   7_housing_social_families_250
9      8     11                     8_rating_bbb_outlook_credit
Topics details for company Bancosabadell saved to disk. File 'results/topics/topics_details_lowsim_Bancosabadell_en.xlsx'
	 - Language : spanish
	Extracting 154 PR with low similarity...
[S1-s.] 	 Tonekize sentences
[S1-e.] 	 Sentences tonenized saved to disk. Total nr. se

Batches:   0%|          | 0/55 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/Bancosabadell_lowsim_es.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 15:53:56,148 - BERTopic - Reduced dimensionality
2023-12-16 15:53:56,176 - BERTopic - Clustered reduced embeddings
2023-12-16 15:54:01,886 - BERTopic - Reduced number of topics from 34 to 10


[S2.] 	 Topics Modeled
   Topic  Count                                              Name
0     -1    648                 -1_banco_iniciativa_barcelona_000
1      0    691                    0_banco_mujeres_jazz_barcelona
2      1     95          1_millennials_crédito_jubilación_ahorros
3      2     80              2_renting_vehículos_movilidad_viajes
4      3     78             3_bstartup_startups_inversión_venture
5      4     65            4_marruecos_española_empresas_exportar
6      5     23      5_alimentos_sostenibles_proteínas_ecológicos
7      6     21          6_maternidad_reproducción_embarazos_bebé
8      7     17               7_fármacos_cerebro_gate2brain_brain
9      8     12  8_solares_solarprofit_fotovoltaica_fotovoltaicas
Topics details for company Bancosabadell saved to disk. File 'results/topics/topics_details_lowsim_Bancosabadell_es.xlsx'
[17/30]	** Sentences and Embeddings Creation for Company 'Cellnex' **
** Extracting topics on low similarity PR for Company 'Cell

Batches:   0%|          | 0/41 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/Cellnex_lowsim_en.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 15:54:14,626 - BERTopic - Reduced dimensionality
2023-12-16 15:54:14,648 - BERTopic - Clustered reduced embeddings
2023-12-16 15:54:20,235 - BERTopic - Reduced number of topics from 38 to 10


[S2.] 	 Topics Modeled
   Topic  Count                                               Name
0     -1    304            -1_network_5g_infrastructure_foundation
1      0    439        0_sustainability_growth_carbon_shareholders
2      1    183                 1_switzerland_italy_eurostoxx_ibex
3      2    145          2_broadcasting_networks_services_security
4      3     68                    3_debt_bonds_maturity_liquidity
5      4     62               4_parking_sensors_smart_applications
6      5     45         5_brighton_connectivity_passengers_railway
7      6     26  6_mentors_entrepreneurs_entrepreneurship_found...
8      7     19                          7_5g_fibre_antennas_arena
9      8     14                8_hotel_visitors_guests_hospitality
Topics details for company Cellnex saved to disk. File 'results/topics/topics_details_lowsim_Cellnex_en.xlsx'
	 - Language : spanish
	Extracting 48 PR with low similarity...
[S1-s.] 	 Tonekize sentences
[S1-e.] 	 Sentences tonenized saved t

Batches:   0%|          | 0/41 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/Cellnex_lowsim_es.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 15:54:33,020 - BERTopic - Reduced dimensionality
2023-12-16 15:54:33,039 - BERTopic - Clustered reduced embeddings
2023-12-16 15:54:39,004 - BERTopic - Reduced number of topics from 39 to 10


[S2.] 	 Topics Modeled
   Topic  Count                                               Name
0     -1    337    -1_telecomunicaciones_operadores_euros_compañía
1      0    439           0_sostenibilidad_euros_disclosure_carbon
2      1    262  1_servicios_infraestructuras_redes_telecomunic...
3      2     73                    2_suiza_italia_portugal_austria
4      3     43                3_fibra_óptica_cellnextelecom_https
5      4     43                     4_bonos_deuda_coste_conversión
6      5     35                5_eurostoxx_ibex35_mercado_euronext
7      6     27             6_acciones_capital_ampliación_derechos
8      7     11                          7_5g_piloto_drones_mobile
9      8     11     8_diversidad_inclusión_discriminación_igualdad
Topics details for company Cellnex saved to disk. File 'results/topics/topics_details_lowsim_Cellnex_es.xlsx'
[18/30]	** Sentences and Embeddings Creation for Company 'Fluidra' **
** Extracting topics on low similarity PR for Company 'Fluid

Batches:   0%|          | 0/15 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/Fluidra_lowsim_en.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 15:54:45,572 - BERTopic - Reduced dimensionality
2023-12-16 15:54:45,579 - BERTopic - Clustered reduced embeddings
2023-12-16 15:54:51,119 - BERTopic - Reduced number of topics from 12 to 10


[S2.] 	 Topics Modeled
   Topic  Count                                          Name
0     -1    159             -1_zodiac_cash_companies_dividend
1      0     49                    0_euros_ebitda_profit_2020
2      1     44             1_equipment_wellness_outlook_2022
3      2     42  2_social_rating_environmental_sustainability
4      3     42         3_europe_growth_geographical_northern
5      4     37                4_debt_ratio_residential_units
6      5     29       5_planes_executive_chairman_appointment
7      6     28                6_water_swim_aquatics_sandwell
8      7     23            7_portfolio_customers_needs_brands
9      8     15       8_smith_acquisition_canby_headquartered
Topics details for company Fluidra saved to disk. File 'results/topics/topics_details_lowsim_Fluidra_en.xlsx'
	 - Language : spanish
	Extracting 32 PR with low similarity...
[S1-s.] 	 Tonekize sentences
[S1-e.] 	 Sentences tonenized saved to disk. Total nr. sentences = 473.
[S2-s.] 	 Starting w

Batches:   0%|          | 0/15 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/Fluidra_lowsim_es.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 15:54:57,466 - BERTopic - Reduced dimensionality
2023-12-16 15:54:57,474 - BERTopic - Clustered reduced embeddings
2023-12-16 15:55:03,362 - BERTopic - Reduced number of topics from 14 to 10


[S2.] 	 Topics Modeled
   Topic  Count                                               Name
0     -1    139                         -1_euros_dividendo_pago_50
1      0    102              0_europa_crecimiento_trimestre_cambio
2      1     63                   1_piscinas_wellness_piscina_swim
3      2     39  2_necesidades_fabtronics_innovación_adquisiciones
4      3     34         3_millones_gastos_australianos_recurrentes
5      4     28      4_española_aplicaciones_sostenible_montenegro
6      5     21    5_ejecutivo_presidente_integración_rentabilidad
7      6     19  6_medioambientales_sostenibilidad_sociales_cir...
8      7     15                        7_smith_1932_canby_hamilton
9      8     13                   8_deuda_préstamo_ratio_reducción
Topics details for company Fluidra saved to disk. File 'results/topics/topics_details_lowsim_Fluidra_es.xlsx'
[19/30]	** Sentences and Embeddings Creation for Company 'Indra' **
** Extracting topics on low similarity PR for Company 'Indra' 

Batches:   0%|          | 0/138 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/Indra_lowsim_en.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 15:55:33,490 - BERTopic - Reduced dimensionality
2023-12-16 15:55:33,569 - BERTopic - Clustered reduced embeddings
2023-12-16 15:56:00,493 - BERTopic - Reduced number of topics from 103 to 10


[S2.] 	 Topics Modeled
   Topic  Count                                         Name
0     -1   1334       -1_systems_management_traffic_european
1      0   2437    0_business_countries_download_proprietary
2      1    377           1_aircraft_helicopter_drone_drones
3      2     81     2_africa_china_modernization_electricity
4      3     57   3_patient_consultations_diagnosis_clinical
5      4     36  4_biometric_identity_passports_verification
6      5     27        5_election_polling_elections_stations
7      6     16            6_televisión_tv_hbbtv_audiovisual
8      7     13           7_robotic_retail_automation_robots
9      8     12            8_rated_uruguay_argentina_average
Topics details for company Indra saved to disk. File 'results/topics/topics_details_lowsim_Indra_en.xlsx'
	 - Language : spanish
	Extracting 314 PR with low similarity...
[S1-s.] 	 Tonekize sentences
[S1-e.] 	 Sentences tonenized saved to disk. Total nr. sentences = 8799.
[S2-s.] 	 Starting with embedding

Batches:   0%|          | 0/275 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/Indra_lowsim_es.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 15:56:52,918 - BERTopic - Reduced dimensionality
2023-12-16 15:56:53,104 - BERTopic - Clustered reduced embeddings
2023-12-16 15:57:25,863 - BERTopic - Reduced number of topics from 156 to 10


[S2.] 	 Topics Modeled
   Topic  Count                                         Name
0     -1   2525            -1_gestión_empresas_medios_sector
1      0   5584            0_digital_indra_líder_consultoría
2      1    252           1_aéreo_aeronave_radar_aeropuertos
3      2    160   2_ciberseguridad_ataques_amenazas_phishing
4      3    115   3_chatbots_robotización_artificial_chatbot
5      4     69      4_elecciones_electoral_electorales_voto
6      5     33                 5_cloud_nube_computing_nubes
7      6     26         6_móvil_smartphone_móviles_bluetooth
8      7     18  7_justicia_judiciales_expedientes_nicaragua
9      8     17        8_bahréin_bahrein_delegación_visitado
Topics details for company Indra saved to disk. File 'results/topics/topics_details_lowsim_Indra_es.xlsx'
[20/30]	** Sentences and Embeddings Creation for Company 'Logista' **
** Extracting topics on low similarity PR for Company 'Logista' **
	 - Language : english
	Extracting 50 PR with low similarity...

Batches:   0%|          | 0/22 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/Logista_lowsim_en.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 15:57:33,892 - BERTopic - Reduced dimensionality
2023-12-16 15:57:33,903 - BERTopic - Clustered reduced embeddings
2023-12-16 15:57:34,462 - BERTopic - Reduced number of topics from 4 to 4


[S2.] 	 Topics Modeled
   Topic  Count                                           Name
0      0     51                   0_media_print_press_download
1      1     24           1_andorra_fleet_franchises_platforms
2      2     36     2_services_retailers_proximity_distributor
3      3    587  3_nacex_distribution_companies_pharmaceutical
Topics details for company Logista saved to disk. File 'results/topics/topics_details_lowsim_Logista_en.xlsx'
	 - Language : spanish
	Extracting 50 PR with low similarity...
[S1-s.] 	 Tonekize sentences
[S1-e.] 	 Sentences tonenized saved to disk. Total nr. sentences = 702.
[S2-s.] 	 Starting with embeddings creation.


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/Logista_lowsim_es.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 15:57:42,694 - BERTopic - Reduced dimensionality
2023-12-16 15:57:42,707 - BERTopic - Clustered reduced embeddings
2023-12-16 15:57:46,575 - BERTopic - Reduced number of topics from 21 to 10


[S2.] 	 Topics Modeled
   Topic  Count                                               Name
0     -1    160             -1_enfermedad_dent_grupo_investigación
1      0    166              0_euros_servicios_beneficio_dividendo
2      1     93       1_campaña_enfermedades_mensajería_solidarios
3      2     71       2_temperatura_pharma_farmacéutico_hospitales
4      3     68  3_climático_compañías_sostenibilidad_medioambi...
5      4     49                4_imprimir_contacto_media_descargar
6      5     37            5_andorra_flota_plataformas_franquicias
7      6     30         6_equipo_empleados_eficiente_profesionales
8      7     17                7_europa_distribuidor_líder_europeo
9      8     11       8_seguridad_legislación_certificación_recoge
Topics details for company Logista saved to disk. File 'results/topics/topics_details_lowsim_Logista_es.xlsx'
[21/30]	** Sentences and Embeddings Creation for Company 'Melia' **
** Extracting topics on low similarity PR for Company 'Melia' 

Batches:   0%|          | 0/21 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/Melia_lowsim_en.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 15:57:55,102 - BERTopic - Reduced dimensionality
2023-12-16 15:57:55,114 - BERTopic - Clustered reduced embeddings
2023-12-16 15:57:55,348 - BERTopic - Reduced number of topics from 2 to 2


[S2.] 	 Topics Modeled
   Topic  Count                             Name
0      0     29  0_shares_download_retains_parts
1      1    617      1_hotels_hotel_guests_ibiza
Topics details for company Melia saved to disk. File 'results/topics/topics_details_lowsim_Melia_en.xlsx'
	 - Language : spanish
	Extracting 99 PR with low similarity...
[S1-s.] 	 Tonekize sentences
[S1-e.] 	 Sentences tonenized saved to disk. Total nr. sentences = 1151.
[S2-s.] 	 Starting with embeddings creation.


Batches:   0%|          | 0/36 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/Melia_lowsim_es.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 15:58:09,102 - BERTopic - Reduced dimensionality
2023-12-16 15:58:09,124 - BERTopic - Clustered reduced embeddings
2023-12-16 15:58:09,606 - BERTopic - Reduced number of topics from 4 to 4


[S2.] 	 Topics Modeled
   Topic  Count                                  Name
0      0     82    0_descargar_imagen_gmfenix_fpdeseo
1      1     22         1_cubano_tribunal_ley_juzgado
2      2     29   2_golf_circuito_ballesteros_amateur
3      3   1018  3_hotels_hotel_international_hoteles
Topics details for company Melia saved to disk. File 'results/topics/topics_details_lowsim_Melia_es.xlsx'
[22/30]	** Sentences and Embeddings Creation for Company 'Merlin' **
** Extracting topics on low similarity PR for Company 'Merlin' **
	 - Language : english
	Extracting 16 PR with low similarity...
[S1-s.] 	 Tonekize sentences
[S1-e.] 	 Sentences tonenized saved to disk. Total nr. sentences = 393.
[S2-s.] 	 Starting with embeddings creation.


Batches:   0%|          | 0/13 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/Merlin_lowsim_en.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 15:58:16,196 - BERTopic - Reduced dimensionality
2023-12-16 15:58:16,203 - BERTopic - Clustered reduced embeddings
2023-12-16 15:58:18,051 - BERTopic - Reduced number of topics from 8 to 8


[S2.] 	 Topics Modeled
   Topic  Count                                       Name
0     -1     36     -1_region_property_management_portugal
1      0     16  0_tinkle_www_information_merlinproperties
2      1     20                   1_gpr_index_core_invests
3      2     15              2_energy_data_carbon_charging
4      3    175          3_million_properties_bonds_merlin
5      4     82                4_rent_tenants_rents_growth
6      5     12          5_warehouse_logistics_spain_lease
7      6     37   6_plan_refurbishment_madrid_construction
Topics details for company Merlin saved to disk. File 'results/topics/topics_details_lowsim_Merlin_en.xlsx'
	 - Language : spanish
	Extracting 23 PR with low similarity...
[S1-s.] 	 Tonekize sentences
[S1-e.] 	 Sentences tonenized saved to disk. Total nr. sentences = 345.
[S2-s.] 	 Starting with embeddings creation.


Batches:   0%|          | 0/11 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/Merlin_lowsim_es.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 15:58:23,946 - BERTopic - Reduced dimensionality
2023-12-16 15:58:23,952 - BERTopic - Clustered reduced embeddings
2023-12-16 15:58:24,564 - BERTopic - Reduced number of topics from 2 to 2


[S2.] 	 Topics Modeled
   Topic  Count                                             Name
0      0     48  0_modificación_garantías_generalidades_licencia
1      1    297                    1_madrid_informe_breeam_datos
Topics details for company Merlin saved to disk. File 'results/topics/topics_details_lowsim_Merlin_es.xlsx'
[23/30]	** Sentences and Embeddings Creation for Company 'Naturgy' **
** Extracting topics on low similarity PR for Company 'Naturgy' **
	 - Language : english
	Extracting 5 PR with low similarity...
[S1-s.] 	 Tonekize sentences
[S1-e.] 	 Sentences tonenized saved to disk. Total nr. sentences = 112.
[S2-s.] 	 Starting with embeddings creation.


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/Naturgy_lowsim_en.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 15:58:29,512 - BERTopic - Reduced dimensionality
2023-12-16 15:58:29,516 - BERTopic - Clustered reduced embeddings
2023-12-16 15:58:30,658 - BERTopic - Reduced number of topics from 4 to 4


[S2.] 	 Topics Modeled
   Topic  Count                                              Name
0     -1      4               -1_talks_burning_better_inventories
1      0     18  0_plant_construction_multinational_hydroelectric
2      1     50               1_pollution_government_spain_health
3      2     40                     2_hydrogen_fuel_aeh2_stations
Topics details for company Naturgy saved to disk. File 'results/topics/topics_details_lowsim_Naturgy_en.xlsx'
	 - Language : spanish
	Extracting 108 PR with low similarity...
[S1-s.] 	 Tonekize sentences
[S1-e.] 	 Sentences tonenized saved to disk. Total nr. sentences = 2184.
[S2-s.] 	 Starting with embeddings creation.


Batches:   0%|          | 0/69 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/Naturgy_lowsim_es.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 15:58:51,007 - BERTopic - Reduced dimensionality
2023-12-16 15:58:51,047 - BERTopic - Clustered reduced embeddings
2023-12-16 15:58:58,562 - BERTopic - Reduced number of topics from 51 to 10


[S2.] 	 Topics Modeled
   Topic  Count                                            Name
0     -1    744                      -1_prensa_gas_general_2017
1      0    846  0_fundación_sostenibilidad_ambiente_energética
2      1    197                      1_natural_gas_grupo_españa
3      2    125   2_desmantelamiento_central_térmica_demolición
4      3     91           3_cliente_google_cloud_digitalización
5      4     66          4_bunkering_combustible_buques_puertos
6      5     36           5_lago_biodiversidad_minera_hectáreas
7      6     34                           6_2022_2021_2014_2016
8      7     31                 7_emisiones_co2_carbono_motores
9      8     14                8_trabajadores_horas_trabajo_290
Topics details for company Naturgy saved to disk. File 'results/topics/topics_details_lowsim_Naturgy_es.xlsx'
[24/30]	** Sentences and Embeddings Creation for Company 'Red' **
** Extracting topics on low similarity PR for Company 'Red' **
	 - Language : english
	Extracting

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/Red_lowsim_en.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 15:59:03,258 - BERTopic - Reduced dimensionality
2023-12-16 15:59:03,262 - BERTopic - Clustered reduced embeddings
2023-12-16 15:59:03,747 - BERTopic - Reduced number of topics from 2 to 2


[S2.] 	 Topics Modeled
   Topic  Count                                      Name
0      0     17            0_generation_energy_solar_wind
1      1     77  1_eléctrica_substation_ibiza_electricity
Topics details for company Red saved to disk. File 'results/topics/topics_details_lowsim_Red_en.xlsx'
	 - Language : spanish
	Extracting 53 PR with low similarity...
[S1-s.] 	 Tonekize sentences
[S1-e.] 	 Sentences tonenized saved to disk. Total nr. sentences = 1039.
[S2-s.] 	 Starting with embeddings creation.


Batches:   0%|          | 0/33 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/Red_lowsim_es.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 15:59:13,666 - BERTopic - Reduced dimensionality
2023-12-16 15:59:13,682 - BERTopic - Clustered reduced embeddings
2023-12-16 15:59:18,224 - BERTopic - Reduced number of topics from 23 to 10


[S2.] 	 Topics Modeled
   Topic  Count                                               Name
0     -1    359        -1_compañía_correos_sostenibilidad_millones
1      0    173                    0_rural_mujeres_igualdad_social
2      1    161           1_sistema_energética_eléctrico_eléctrica
3      2     83                    2_euros_dividendo_standard_poor
4      3     78    3_especies_biodiversidad_biodibal_invertebrados
5      4     57  4_propuestas_emprendedores_startups_emprendedoras
6      5     43                 5_fibra_óptica_hyperloop_movilidad
7      6     38                6_5g_inspección_drones_conectividad
8      7     26              7_suficientes_primavera_sacarte_space
9      8     21             8_león_castilla_guadalajara_valenciana
Topics details for company Red saved to disk. File 'results/topics/topics_details_lowsim_Red_es.xlsx'
[25/30]	** Sentences and Embeddings Creation for Company 'Repsol' **
** Extracting topics on low similarity PR for Company 'Repsol' **
	 -

Batches:   0%|          | 0/29 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/Repsol_lowsim_en.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 15:59:27,867 - BERTopic - Reduced dimensionality
2023-12-16 15:59:27,882 - BERTopic - Clustered reduced embeddings
2023-12-16 15:59:31,731 - BERTopic - Reduced number of topics from 19 to 10


[S2.] 	 Topics Modeled
   Topic  Count                                         Name
0     -1    278              -1_company_renewable_2025_stake
1      0    141              0_hydrogen_renewable_jobs_solar
2      1    108                       1_oil_prices_euros_gas
3      2    104         2_billion_shareholders_dividend_debt
4      3    102              3_2050_zero_emissions_strategic
5      4     95  4_green_reforestation_government_foundation
6      5     35   5_financing_bonds_sustainability_framework
7      6     28             6_biofuels_waste_synthetic_fuels
8      7     17               7_wind_delta_turbines_floating
9      8     14                  8_2026_2025_schedule_market
Topics details for company Repsol saved to disk. File 'results/topics/topics_details_lowsim_Repsol_en.xlsx'
	 - Language : spanish
	Extracting 82 PR with low similarity...
[S1-s.] 	 Tonekize sentences
[S1-e.] 	 Sentences tonenized saved to disk. Total nr. sentences = 638.
[S2-s.] 	 Starting with embedding

Batches:   0%|          | 0/20 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/Repsol_lowsim_es.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 15:59:39,869 - BERTopic - Reduced dimensionality
2023-12-16 15:59:39,880 - BERTopic - Clustered reduced embeddings
2023-12-16 15:59:40,539 - BERTopic - Reduced number of topics from 5 to 5


[S2.] 	 Topics Modeled
   Topic  Count                                           Name
0      0     16     0_plataforma_aerogenerador_flotante_parque
1      1     11          1_windplus_renováveis_principle_power
2      2    576      2_energética_compañía_proyecto_estaciones
3      3     22  3_competición_motogp_motociclismo_combustible
4      4     13    4_marinas_diversificación_tecnológico_áreas
Topics details for company Repsol saved to disk. File 'results/topics/topics_details_lowsim_Repsol_es.xlsx'
[26/30]	** Sentences and Embeddings Creation for Company 'Rovi' **
** Extracting topics on low similarity PR for Company 'Rovi' **
	 - Language : english
	Extracting 41 PR with low similarity...
[S1-s.] 	 Tonekize sentences
[S1-e.] 	 Sentences tonenized saved to disk. Total nr. sentences = 3822.
[S2-s.] 	 Starting with embeddings creation.


Batches:   0%|          | 0/120 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/Rovi_lowsim_en.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 16:00:18,594 - BERTopic - Reduced dimensionality
2023-12-16 16:00:18,661 - BERTopic - Clustered reduced embeddings
2023-12-16 16:00:38,209 - BERTopic - Reduced number of topics from 135 to 10


[S2.] 	 Topics Modeled
   Topic  Count                                             Name
0     -1    618                  -1_financial_report_assets_risk
1      0   2512                     0_margin_q1_profit_financial
2      1    396            1_ism_thymanax_schizophrenia_launched
3      2     72                 2_heart_chronic_corlentor_angina
4      3     72           3_countries_launch_registration_russia
5      4     50            4_heparin_heparins_lmwh_anticoagulant
6      5     46        5_deceleration_market_selective_inhibitor
7      6     19      6_imaging_tomography_resonance_computerized
8      7     19  7_hypercholesterolemia_statin_licenses_diabetic
9      8     18                  8_cell_lung_cancer_chemotherapy
Topics details for company Rovi saved to disk. File 'results/topics/topics_details_lowsim_Rovi_en.xlsx'
	 - Language : spanish
	Extracting 42 PR with low similarity...
[S1-s.] 	 Tonekize sentences
[S1-e.] 	 Sentences tonenized saved to disk. Total nr. sentences 

Batches:   0%|          | 0/111 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/Rovi_lowsim_es.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 16:01:12,426 - BERTopic - Reduced dimensionality
2023-12-16 16:01:12,483 - BERTopic - Clustered reduced embeddings
2023-12-16 16:01:30,074 - BERTopic - Reduced number of topics from 131 to 10


[S2.] 	 Topics Modeled
   Topic  Count                                               Name
0     -1    498               -1_mercado_hbpm_enoxaparina_millones
1      0   2486              0_ingresos_financieros_9m_previsiones
2      1    289  1_esquizofrenia_prescripción_tratamiento_corle...
3      2     95  2_consejeros_secretario_nombramientos_administ...
4      3     50                          3_países_brasil_q3_bosnia
5      4     46                            4_nivel_independencia__
6      5     36                    5_cox_exxiv_selectivo_inhibidor
7      6     21         6_auditoría_auditores_experto_alternativas
8      7     15                7_vacuna_campaña_vacunas_vacunación
9      8     15            8_breezhaler_pulmonar_novartis_hirobriz
Topics details for company Rovi saved to disk. File 'results/topics/topics_details_lowsim_Rovi_es.xlsx'
[27/30]	** Sentences and Embeddings Creation for Company 'Sacyr' **
** Extracting topics on low similarity PR for Company 'Sacyr' **
	 -

Batches:   0%|          | 0/25 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/Sacyr_lowsim_en.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 16:01:38,562 - BERTopic - Reduced dimensionality
2023-12-16 16:01:38,575 - BERTopic - Clustered reduced embeddings
2023-12-16 16:01:42,513 - BERTopic - Reduced number of topics from 20 to 10


[S2.] 	 Topics Modeled
   Topic  Count                                             Name
0     -1    254              -1_beds_hospital_projects_proposals
1      0    130  0_sustainability_innovation_committee_corporate
2      1    110                    1_debt_million_shares_backlog
3      2     83                2_climate_emissions_carbon_energy
4      3     60                     3_highway_route_lane_traffic
5      4     39       4_infrastructures_industrial_divisions_epc
6      5     36                5_chile_colombia_financing_mexico
7      6     32          6_water_membranes_filtration_underwater
8      7     29                  7_waste_cleaning_urban_circular
9      8     14                    8_vessels_locks_canal_transit
Topics details for company Sacyr saved to disk. File 'results/topics/topics_details_lowsim_Sacyr_en.xlsx'
	 - Language : spanish
	Extracting 49 PR with low similarity...
[S1-s.] 	 Tonekize sentences
[S1-e.] 	 Sentences tonenized saved to disk. Total nr. sentence

Batches:   0%|          | 0/25 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/Sacyr_lowsim_es.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 16:01:51,131 - BERTopic - Reduced dimensionality
2023-12-16 16:01:51,144 - BERTopic - Clustered reduced embeddings
2023-12-16 16:01:55,229 - BERTopic - Reduced number of topics from 22 to 10


[S2.] 	 Topics Modeled
   Topic  Count                                              Name
0     -1    211          -1_servicios_gestión_membranas_proyectos
1      0    265   0_sostenibilidad_negocio_innovación_corporativo
2      1     69                 1_carbono_emisiones_co2_climático
3      2     64                 2_autopista_ruta_kilómetros_rutas
4      3     59       3_chile_colombia_latinoamérica_financiación
5      4     41             4_agua_residuos_circular_conservación
6      5     34                  5_deuda_fondos_principles_cierre
7      6     14               6_buques_canal_barcos_embarcaciones
8      7     14  7_seguridad_confidencialidad_riesgos_continuidad
9      8     14                  8_m2_hospitales_hospitalaria_629
Topics details for company Sacyr saved to disk. File 'results/topics/topics_details_lowsim_Sacyr_es.xlsx'
[28/30]	** Sentences and Embeddings Creation for Company 'Solaria' **
** Extracting topics on low similarity PR for Company 'Solaria' **
	 - Lang

Batches:   0%|          | 0/11 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/Solaria_lowsim_en.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 16:02:01,765 - BERTopic - Reduced dimensionality
2023-12-16 16:02:01,771 - BERTopic - Clustered reduced embeddings
2023-12-16 16:02:02,482 - BERTopic - Reduced number of topics from 3 to 3


[S2.] 	 Topics Modeled
   Topic  Count                                    Name
0      0     33  0_enrique_solaria_relation_commitments
1      1     38               1_euros_3q23_invests_save
2      2    255    2_company_photovoltaic_solaria_solar
Topics details for company Solaria saved to disk. File 'results/topics/topics_details_lowsim_Solaria_en.xlsx'
	 - Language : spanish
	Extracting 35 PR with low similarity...
[S1-s.] 	 Tonekize sentences
[S1-e.] 	 Sentences tonenized saved to disk. Total nr. sentences = 270.
[S2-s.] 	 Starting with embeddings creation.


Batches:   0%|          | 0/9 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/Solaria_lowsim_es.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 16:02:08,350 - BERTopic - Reduced dimensionality
2023-12-16 16:02:08,355 - BERTopic - Clustered reduced embeddings
2023-12-16 16:02:08,811 - BERTopic - Reduced number of topics from 2 to 2


[S2.] 	 Topics Modeled
   Topic  Count                               Name
0      0     47       0_2023_euros_solaria_octubre
1      1    223  1_solaria_energía_solar_proyectos
Topics details for company Solaria saved to disk. File 'results/topics/topics_details_lowsim_Solaria_es.xlsx'
[29/30]	** Sentences and Embeddings Creation for Company 'Telefonica' **
** Extracting topics on low similarity PR for Company 'Telefonica' **
	 - Language : english
	Extracting 154 PR with low similarity...
[S1-s.] 	 Tonekize sentences
[S1-e.] 	 Sentences tonenized saved to disk. Total nr. sentences = 4084.
[S2-s.] 	 Starting with embeddings creation.


Batches:   0%|          | 0/128 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/Telefonica_lowsim_en.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 16:02:43,980 - BERTopic - Reduced dimensionality
2023-12-16 16:02:44,058 - BERTopic - Clustered reduced embeddings
2023-12-16 16:02:57,491 - BERTopic - Reduced number of topics from 90 to 10


[S2.] 	 Topics Modeled
   Topic  Count                                               Name
0     -1   1402                  -1_services_data_companies_mobile
1      0   2173                         0_5g_mobile_network_growth
2      1    208            1_security_cybersecurity_threat_threats
3      2    137               2_energy_emissions_climate_renewable
4      3     49             3_open_entrepreneurship_wayra_startups
5      4     47                        4_tour_driver_wearer_riders
6      5     23                       5_chile_mexico_colombia_peru
7      6     18               6_airport_airports_accessibility_app
8      7     15                  7_water_watering_processes_canals
9      8     12  8_healthcare_telehealth_disabilities_nutrition...
Topics details for company Telefonica saved to disk. File 'results/topics/topics_details_lowsim_Telefonica_en.xlsx'
	 - Language : spanish
	Extracting 368 PR with low similarity...
[S1-s.] 	 Tonekize sentences
[S1-e.] 	 Sentences tonenized 

Batches:   0%|          | 0/241 [00:00<?, ?it/s]

[S2-e.] 	 Embeddings file data/embeddings/Telefonica_lowsim_es.pkl saved to disk.
[S2.] 	 Embeddings imported from {filename}. Starting topic modeling...


2023-12-16 16:03:46,094 - BERTopic - Reduced dimensionality
2023-12-16 16:03:46,247 - BERTopic - Clustered reduced embeddings
2023-12-16 16:04:23,002 - BERTopic - Reduced number of topics from 144 to 10


[S2.] 	 Topics Modeled
   Topic  Count                                              Name
0     -1   2929           -1_telefónica_madrid_digital_tecnología
1      0   4459            0_telefónica_museo_pdfdescargar_madrid
2      1    102    1_paciente_quirúrgico_temperatura_respiradores
3      2     79  2_blockchain_privacidad_ciberseguridad_antivirus
4      3     52               3_perú_colombia_chile_latinoamérica
5      4     24            4_snowboard_esquí_esquiadores_synergic
6      5     24      5_oftalmología_oftalmológico_corneal_ceguera
7      6     18                 6_ciclismo_ciclistas_mujeres_bike
8      7     14       7_desktop_citrix_virtualización_escritorios
9      8     10       8_emociones_emotions_emoción_esperanzadores
Topics details for company Telefonica saved to disk. File 'results/topics/topics_details_lowsim_Telefonica_es.xlsx'
CPU times: user 2h 45min 59s, sys: 38min 40s, total: 3h 24min 39s
Wall time: 49min 31s
