In [1]:
import dask.dataframe as dd
import numpy as np
import pandas as pd
from pathlib import Path
import tabulate
import tomotopy as tp
from tqdm.notebook import trange
from bertopic import BERTopic
from bertopic.representation import MaximalMarginalRelevance
from bertopic.vectorizers import ClassTfidfTransformer
from gensim.corpora import Dictionary
from gensim.models import LdaMulticore
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
import json

KeyboardInterrupt: 

### Load data

In [2]:
dir_data = Path("../../data")
dir_metadata = dir_data.joinpath("metadata")
dir_partition = dir_metadata.joinpath("df_partition_pd.parquet")

In [3]:
# ddf = dd.read_parquet("data/minors_text_filtered.parquet")
# # ddf = ddf.sample(frac=0.01)
# ddf = ddf.repartition(100)
# # ddf["lang"] = ddf["text"].apply(lang_filter.identify_language, meta=(None, "object"))

# preprocessed_texts = pd.read_parquet("data/preprocessed_texts.parquet")
# preprocessed_texts.head()

In [4]:
# # Load data
# preprocessed_texts = pd.read_parquet("data/preprocessed_texts.parquet")
# full_texts = ddf.loc[ddf.index.isin(preprocessed_texts.index)]["text"].compute()
# data = pd.merge(
#     full_texts,
#     preprocessed_texts,
#     how="inner",
#     left_index=True,
#     right_index=True,
#     suffixes=["_full", "_preprocessed"],
# )


In [5]:
# Load Stopwords
stop_words = {}
with dir_data.joinpath("stopwords/administración.txt").open("r", encoding="utf-8") as f:
    stop_words = {*stop_words, *set([w.strip() for w in f.readlines()])}
with dir_data.joinpath("stopwords/municipios.txt").open("r", encoding="utf-8") as f:
    stop_words = {*stop_words, *set([w.strip() for w in f.readlines()])}
with dir_data.joinpath("stopwords/common_stopwords.txt").open(
    "r", encoding="utf-8"
) as f:
    stop_words = {*stop_words, *set([w.strip() for w in f.readlines()])}
stop_words = list(
    set(
        list(stop_words)
        + [w.lower() for w in stop_words]
        + [w.upper() for w in stop_words]
        + [" ".join([el.capitalize() for el in w.split()]) for w in stop_words]
    )
)
stop_words = sorted(stop_words, key=len, reverse=True)[::-1]

# Load Vocabulary
with dir_data.joinpath("RAE/vocabulary.json").open("r", encoding="utf8") as f:
    vocabulary = json.load(f)

In [12]:
df_partition = pd.read_parquet(dir_partition).dropna()[:50_000]

# Topic Models

In [13]:
num_topics = 50

# Word patterns
min_len = 2
# word_pattern = (
#     r"(?:[a-zA-Z\u00C0-\u024F]{1,})"
#     r"(?:(?:[\-\/\|\\]?)(?:(?:[a-zA-Z\u00C0-\u024F]+)|(?:[\d]{1,3})))"
# )
# word_pattern = (
    # f"(?<![a-zA-Z\u00C0-\u024F\d\-\\\/|])"
    # f"[a-zA-Z\u00C0-\u024F]"
    # f"(?:[a-zA-Z\u00C0-\u024F]|(?!\d{{4}})[\d]|[-\\\/|](?![-\\\/|])){{{min_len - 1},}}"
    # f"(?<![-\\\/|])[a-zA-Z\u00C0-\u024F\d]?"
    # f"(?![a-zA-Z\u00C0-\u024F\d])"
# )
word_pattern = (
    f"(?<![a-zA-Z\u00C0-\u024F\d\-])"
    f"[a-zA-Z\u00C0-\u024F]"
    f"(?:[a-zA-Z\u00C0-\u024F]|(?!\d{{4}})[\d]|[-](?![-])){{{min_len - 1},}}"
    f"(?<![-])[a-zA-Z\u00C0-\u024F\d]?"
    f"(?![a-zA-Z\u00C0-\u024F\d])"
)


# Step 1 - Extract embeddings
sentence_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
# import spacy
# sentence_model = spacy.load(
#     name="es_dep_news_trf",
#     exclude=["tagger", "parser", "ner", "attribute_ruler"],
# )

# Step 2 - Reduce dimensionality
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric="cosine")

# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(
    min_cluster_size=500, min_samples=2, metric="euclidean", prediction_data=True
)

# Step 4 - Tokenize topics
# stop_words = list({*stopwords.words("spanish"), *STOP_WORDS})
vectorizer_model = CountVectorizer(
    token_pattern=word_pattern,
    stop_words=stop_words,
    ngram_range=(1, 2),
    # vocabulary=vocabulary,
    max_df=0.8,
    min_df=0.01,
)

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# Step 6 - (Optional) Fine-tune topic representations with
# a `bertopic.representation` model
representation_model = MaximalMarginalRelevance(diversity=0.3, top_n_words=20)


## NMF

In [14]:
docs = [" ".join(d) for d in df_partition["preprocessed_text"]]


In [16]:
# Create document-term matrix
dtm = vectorizer_model.fit_transform(docs)

# Topic modeling with NMF
nmf_model = NMF(n_components=num_topics, random_state=42, max_iter=1000)
nmf_model.fit(dtm)

In [None]:
# Get top words for each topic
id2word = {v: k for k, v in vectorizer_model.vocabulary_.items()}
top_words_nmf = []
for topic_idx, topic in enumerate(nmf_model.components_):
    top_words_nmf.append([id2word[i] for i in np.argsort(topic)[:-21:-1]])

In [None]:
def topn_topics(nt:int=4):
    np.argsort(nmf_model.transform(dtm[:5]), axis=1)[:, ::-1][:, :nt]

In [21]:
def topic_words(top_words, nw:int=10):
    data = [t[:10] for i, t in enumerate(top_words)]
    columns=[f"Word {n}" for n in range(10)]
    df = pd.DataFrame(data=data, columns=columns)
    return df

In [22]:
topic_words(top_words_nmf, 10)

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9
0,universidad,rectorado,rectorado universidad,universidad murciar,laboratorio,educación,biblioteca,base,estudio,car
1,material,diverso,laboratorio,vario,biblioteca,casa,infantil,fremap,residencia,aéreo
2,seguridad,seguridad social,mutua,prestación,infraestructura,asistencia,aéreo,diverso,insular tenerife,murciar
3,deporte,cultura deporte,educación cultura,universidad,fiesta,administrativo,diverso,instituto,sistema,educación
4,insular,tenerife,insular tenerife,agricultura,casa,defensa,seguridad,hacienda,administrativo,diverso
5,reparación,aire,vario,infraestructura,residencia,casa,diverso,san,car,murciar
6,instituto,tecnológico,laboratorio,estudio,infraestructura,deporte,técnica,vario,car,aire
7,aeropuerto,aena,aena aeropuerto,tenerife,vario,san,seguridad,aire,comunicación,car
8,mantenimiento,aire,car,sistema,comunicación,diverso,infraestructura,san,municipio,casa
9,salud,seguridad,tenerife,vario,san,estudio,deporte,pública,social,diverso


In [None]:
# Get top words for each topic
id2word = {v: k for k, v in vectorizer_model.vocabulary_.items()}
top_words_nmf = []
for topic_idx, topic in enumerate(nmf_model.components_):
    top_words_nmf.append([id2word[i] for i in np.argsort(topic)[:-21:-1]])




# Get top words for each topic
top_words_lda = []
for i, topic in lda_model.show_topics(num_topics=num_topics, formatted=False):
    topic_words = [word[0] for word in topic]
    top_words_lda.append(topic_words)


In [None]:
nt = 4
nw = 10
print(f"Top {nt} topics:")
print(
    tabulate.tabulate(
        np.argsort(nmf_model.transform(dtm[:5]), axis=1)[:, ::-1][:, :nt],
        headers=[f"Topic {i}" for i in range(nt)],
        tablefmt="grid",
    )
)


Top 4 topics:
+-----------+-----------+-----------+-----------+
|   Topic 0 |   Topic 1 |   Topic 2 |   Topic 3 |
|        39 |         5 |        49 |        12 |
+-----------+-----------+-----------+-----------+
|         7 |         5 |        26 |        33 |
+-----------+-----------+-----------+-----------+
|        10 |        15 |        40 |        43 |
+-----------+-----------+-----------+-----------+
|         3 |         9 |        36 |        45 |
+-----------+-----------+-----------+-----------+
|        49 |        12 |        22 |        21 |
+-----------+-----------+-----------+-----------+


In [None]:
print(
    tabulate.tabulate(
        [[i] + t[:10] for i, t in enumerate(top_words_nmf)],
        headers=["id"] + [f"Word {n}" for n in range(10)],
        tablefmt="heavy_grid",
    )
)


┏━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓
┃   id ┃ Word 0        ┃ Word 1            ┃ Word 2                ┃ Word 3          ┃ Word 4            ┃ Word 5                ┃ Word 6            ┃ Word 7            ┃ Word 8            ┃ Word 9            ┃
┣━━━━━━╋━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━━━┫
┃    0 ┃ universidad   ┃ rectorado         ┃ rectorado universidad ┃ educación       ┃ universidad jaume ┃ asistencia            ┃ base              ┃ comunicación      ┃ material          ┃ protección        ┃
┣━━━━━━╋━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━━━╋━━━━━━

## GENSIM

In [None]:
docs = data["text_preprocessed"].tolist()


In [None]:
# Topic modeling with LDA
dictionary = Dictionary(docs)
# dictionary.filter_extremes(no_below=10)
# dictionary.filter_n_most_frequent(30)
bow_corpus = [dictionary.doc2bow(doc) for doc in docs]


In [None]:
lda_model = LdaMulticore(
    bow_corpus,
    num_topics=num_topics,
    id2word=dictionary,
    passes=5,
    chunksize=10000,
    iterations=400,
)


In [None]:
# Get top words for each topic
top_words_lda = []
for i, topic in lda_model.show_topics(num_topics=num_topics, formatted=False):
    topic_words = [word[0] for word in topic]
    top_words_lda.append(topic_words)
print(
    tabulate.tabulate(
        [[i] + t[:10] for i, t in enumerate(top_words_lda)],
        headers=["id"] + [f"Word {n}" for n in range(10)],
        tablefmt="heavy_grid",
    )
)


┏━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃   id ┃ Word 0           ┃ Word 1             ┃ Word 2         ┃ Word 3          ┃ Word 4         ┃ Word 5                   ┃ Word 6          ┃ Word 7        ┃ Word 8          ┃ Word 9        ┃
┣━━━━━━╋━━━━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━┫
┃    0 ┃ superior         ┃ investigaciones    ┃ científicas    ┃ medioambiental  ┃ investigación  ┃ tecnológico              ┃ energético      ┃ ciemat        ┃ puerta          ┃ reparación    ┃
┣━━━━━━╋━━━━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━┫
┃    1 ┃ material   

## Tomotopy

In [None]:
docs = data["text_preprocessed"].tolist()


#### LDA

In [None]:
tp_lda_model = tp.LDAModel(min_cf=0, min_df=10, rm_top=20, k=num_topics, seed=42)
for d in docs:
    tp_lda_model.add_doc(d)

t = trange(0, 400, 10, desc="", leave=True)
for i in t:
    t.set_description(f"Iteration:{i}\tLL:{tp_lda_model.ll_per_word:.3f}")
    t.refresh()
    tp_lda_model.train(10)


  0%|          | 0/40 [00:00<?, ?it/s]

  tp_lda_model.train(10)


In [None]:
# Get top words for each topic
top_words_lda_tp = []
for k in range(tp_lda_model.k):
    topic_words = [word[0] for word in tp_lda_model.get_topic_words(k, top_n=10)]
    top_words_lda_tp.append(topic_words)
print(
    tabulate.tabulate(
        [[i] + t[:10] for i, t in enumerate(top_words_lda_tp)],
        headers=["id"] + [f"Word {n}" for n in range(10)],
        tablefmt="heavy_grid",
    )
)


┏━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃   id ┃ Word 0           ┃ Word 1             ┃ Word 2                   ┃ Word 3        ┃ Word 4         ┃ Word 5            ┃ Word 6          ┃ Word 7       ┃ Word 8           ┃ Word 9        ┃
┣━━━━━━╋━━━━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━┫
┃    0 ┃ jaume            ┃ biomédico          ┃ ciber                    ┃ laboratorio   ┃ fungible       ┃ ug000             ┃ reactivo        ┃ mat          ┃ ug7              ┃ kit           ┃
┣━━━━━━╋━━━━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━┫
┃    1 ┃ insula

#### CT

In [None]:
tp_ct_model = tp.CTModel(min_cf=0, min_df=10, rm_top=30, k=num_topics, seed=42)
for d in docs:
    tp_ct_model.add_doc(d)

t = trange(0, 400, 10, desc="", leave=True)
for i in t:
    t.set_description(f"Iteration:{i}\tLL:{tp_ct_model.ll_per_word:.3f}")
    t.refresh()
    tp_ct_model.train(10)


  0%|          | 0/40 [00:00<?, ?it/s]

In [None]:
# Get top words for each topic
top_words_ct_tp = []
for k in range(tp_ct_model.k):
    topic_words = [word[0] for word in tp_ct_model.get_topic_words(k, top_n=10)]
    top_words_ct_tp.append(topic_words)
print(
    tabulate.tabulate(
        [[i] + t[:10] for i, t in enumerate(top_words_ct_tp)],
        headers=["id"] + [f"Word {n}" for n in range(10)],
        tablefmt="heavy_grid",
    )
)


┏━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃   id ┃ Word 0         ┃ Word 1         ┃ Word 2        ┃ Word 3          ┃ Word 4       ┃ Word 5         ┃ Word 6     ┃ Word 7     ┃ Word 8     ┃ Word 9        ┃
┣━━━━━━╋━━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━╋━━━━━━━━━━━━╋━━━━━━━━━━━━╋━━━━━━━━━━━━╋━━━━━━━━━━━━━━━┫
┃    0 ┃ educación      ┃ culturo        ┃ consejería    ┃ transporte      ┃ deporte      ┃ campo          ┃ consejeer  ┃ escolar    ┃ deportes   ┃ región        ┃
┣━━━━━━╋━━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━╋━━━━━━━━━━━━╋━━━━━━━━━━━━╋━━━━━━━━━━━━╋━━━━━━━━━━━━━━━┫
┃    1 ┃ curso          ┃ infantil       ┃ espectáculo   ┃ formación       ┃ teatro       ┃ programa       ┃ municipio  ┃ reparación ┃ consejería ┃ centro        ┃
┣━━━━━━╋━━━━━━━━

## BERTopic

In [None]:
use_data = data["text_full"][:500_000]
bertopic_model = BERTopic(
    language="multilingual",
    nr_topics=num_topics,
    low_memory=False,
    calculate_probabilities=False,
    seed_topic_list=None,
    embedding_model=sentence_model,  # Step 1 - Extract embeddings
    umap_model=umap_model,  # Step 2 - Reduce dimensionality
    hdbscan_model=hdbscan_model,  # Step 3 - Cluster reduced embeddings
    vectorizer_model=vectorizer_model,  # Step 4 - Tokenize topics
    ctfidf_model=ctfidf_model,  # Step 5 - Extract topic words
    representation_model=representation_model,  # Step 6 - (Optional) Fine-tune topic represenations
    verbose=True,
)


In [None]:
topics, probs = bertopic_model.fit_transform(use_data)


Batches:   0%|          | 0/15625 [00:00<?, ?it/s]

2023-03-30 11:27:26,904 - BERTopic - Transformed documents to Embeddings
2023-03-30 11:37:43,366 - BERTopic - Reduced dimensionality
2023-03-30 11:38:29,032 - BERTopic - Clustered reduced embeddings
2023-03-30 11:40:10,006 - BERTopic - Reduced number of topics from 209 to 50


In [None]:
bertopic_model.save("models/bertopic_model")


In [None]:
bertopic_model = BERTopic.load("models/bertopic_model")


In [None]:
bertopic_model.get_topics()


{-1: [('aeropuerto', 0.12710819153918174),
  ('aena', 0.12669635987763458),
  ('aena aeropuerto', 0.12288426600375438),
  ('publicidad', 0.11996696028909778),
  ('difusión', 0.10687919421375465),
  ('educación cultura', 0.10565045156523871),
  ('comercial', 0.0998837179847755),
  ('publicidad comunicación', 0.09914333860729842),
  ('cultura deportes', 0.0972133696546199),
  ('publicidad comercial', 0.09619281005636629)],
 0: [('ortopedia', 0.1655586672474643),
  ('ingeniería sistemas', 0.162884628841904),
  ('sistemas defensa', 0.1627859858172902),
  ('consejero ingeniería', 0.16278438669317655),
  ('ingeniería', 0.16116430678018176),
  ('material ortopedia', 0.16045648375282234),
  ('mutua seguridad', 0.1575592490691442),
  ('suministros material', 0.15403880482553145),
  ('profesionales seguridad', 0.15367079589866275),
  ('accidentes trabajo', 0.15118346012003822)],
 1: [('teatro', 0.2788177599748193),
  ('fiestas actividades', 0.26850522967427254),
  ('actividades recreativas', 0.2

In [None]:
# bertopic_model.calculate_probabilities = True
# pred_topic, pred_prob = bertopic_model.transform(documents=orig_texts[:10])


In [None]:
similar_topics, similarity = bertopic_model.find_topics("scn", top_n=5)
[bertopic_model.get_topic(t) for t in similar_topics], similarity


([[('astrofísica', 0.5409658003559884),
   ('instituto astrofísica', 0.5406689805991687),
   ('cm-c', 0.48899563769791615),
   ('cm-b', 0.4701535959460636),
   ('social cm-b', 0.4557982784066389),
   ('cm-or', 0.42003521641274),
   ('cm-e', 0.39674730916499285),
   ('cg-19', 0.3952966307416673),
   ('cg-19 instituto', 0.3952966307416673),
   ('cg-20 instituto', 0.3949502242666907)],
  [('ordenadores', 0.5587684910022904),
   ('ssd', 0.5061175502446136),
   ('i5', 0.4801596527999253),
   ('ordenador', 0.47616242870715747),
   ('ordenadores portátiles', 0.42157361911115293),
   ('intel', 0.41693711258006866),
   ('hp', 0.41671236849289706),
   ('gb', 0.41455788317602366),
   ('portátiles', 0.4131145616366825),
   ('monitor', 0.4034467124492865)],
  [('ortopedia', 0.1655586672474643),
   ('ingeniería sistemas', 0.162884628841904),
   ('sistemas defensa', 0.1627859858172902),
   ('consejero ingeniería', 0.16278438669317655),
   ('ingeniería', 0.16116430678018176),
   ('material ortopedia',

In [None]:
bertopic_model.get_representative_docs(similar_topics[0])


['Alquiler fotocopiadora . Id licitación: 306/2017; Órgano de Contratación: Dirección Gerente del Centro de Recuperación de Personas con Discapacidad Física (IMSERSO) Salamanca; Importe: 3100.36 EUR; Estado: RES',
 'Mantenimiento ascensores 2019. Id licitación: 19/00031; Órgano de Contratación: Dirección Gerente del Centro de Recuperación de Personas con Discapacidad Física (IMSERSO) Albacete; Importe: 8471.04 EUR; Estado: RES',
 'suministro de agua 2019. Id licitación: 2018/87M; Órgano de Contratación: Dirección Gerente del Centro de Recuperación de Personas con Discapacidad Física (IMSERSO) Cádiz; Importe: 16033 EUR; Estado: RES']

In [None]:
new_topics = bertopic_model.reduce_outliers(
    use_data, topics, probabilities=probs, strategy="embeddings", threshold=0.0
)
bertopic_model.update_topics(
    docs=use_data,
    topics=new_topics,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
)


In [None]:
bertopic_model.get_topics()


{0: [('ingeniería sistemas', 0.16397945884993326),
  ('consejero ingeniería', 0.16388888898211107),
  ('sistemas defensa', 0.16388850542286132),
  ('ingeniería', 0.16360906671592698),
  ('ortopedia', 0.1630579999705474),
  ('material ortopedia', 0.1580074198883862),
  ('suministros material', 0.15393742588895334),
  ('profesionales seguridad', 0.152817850767815),
  ('accidentes trabajo', 0.1504725350040825),
  ('ibermutuamur', 0.14372571444615648)],
 1: [('teatro', 0.29354623164247207),
  ('artes escénicas', 0.2604224146036061),
  ('escénicas música', 0.25166404975977635),
  ('fiestas actividades', 0.2500503934827548),
  ('actividades recreativas', 0.24996110147924883),
  ('recreativas', 0.24901355300108105),
  ('instituto artes', 0.24385760349972516),
  ('arts', 0.23993433806712158),
  ('carnaval', 0.23690015716224522),
  ('comercio artesanía', 0.2338453543066376)],
 2: [('atención primaria', 0.35902944637774625),
  ('clínico', 0.3541075890571825),
  ('hospital clínico', 0.34818403951

In [None]:
bertopic_model.visualize_documents(
    use_data,
    sample=0.01,
    hide_document_hover=False,
    hide_annotations=True,
)


In [None]:
bertopic_model.visualize_topics()
