### Instalar librerías

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
!pip install transformers

In [None]:
!python -m nltk.downloader stopwords

In [None]:
!python -m spacy download es_core_news_sm

In [None]:
!pip install bertopic

In [None]:
!pip install bertopic[visualization]

In [None]:
!pip install umap-learn

In [None]:
!pip install datamapplot==0.3.0

In [None]:
!pip install -U sentence-transformers

In [None]:
!pip freeze | grep pandas

geopandas==1.0.1
pandas==2.2.2
pandas-datareader==0.10.0
pandas-gbq==0.28.0
pandas-stubs==2.2.2.240909
sklearn-pandas==2.2.0


In [None]:
!pip freeze | grep numpy

numpy==2.0.2


In [None]:
!pip freeze | grep bertopic

bertopic==0.17.0


In [None]:
!pip freeze | grep datamapplot

datamapplot==0.3.0


In [None]:
!pip freeze | grep umap-learn

umap-learn==0.5.7


In [None]:
!pip freeze | grep sentence-transformers

sentence-transformers==4.0.2


In [None]:
!pip freeze | grep cuml

cuml-cu12==25.2.1
libcuml-cu12==25.2.1


In [None]:
!pip freeze | grep transformers

sentence-transformers==4.0.2
transformers==4.50.3


# Topic Modeling Comments

### **Librerías**

In [None]:
import pandas as pd
import numpy as np
import random
import torch
import os

from sentence_transformers import SentenceTransformer
from bertopic.representation import KeyBERTInspired
from bertopic.representation import PartOfSpeech
from bertopic.representation import MaximalMarginalRelevance
from bertopic import BERTopic

from transformers import pipeline

from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

from cuml.cluster import HDBSCAN
from cuml.manifold import UMAP

from umap import UMAP

In [None]:
# Seed every random number generator imported by this notebook with the same
# value: Python's `random`, NumPy's global generator, and PyTorch (default
# generator and the CUDA generator).
for seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed,
):
    seed_fn(42)

In [None]:
# Folder (under the Colab Google Drive mount point) where this notebook
# writes its output files.
DIRECTORY = "/content/drive/MyDrive/ExperimentosNews"

# Prefix used when naming the files saved for this run.
MODEL_NAME = "model"

# Input CSV file names. FILENAME_COMMENTS holds the `texto_lemma` column that
# is modelled below; FILENAME_OTHERS is presumably consumed by a later part of
# the notebook -- TODO confirm.
FILENAME_COMMENTS, FILENAME_OTHERS = (
    "comments_topic.csv",
    "comments_timestamps.csv",
)

### **Leer dataset**

In [None]:
# Load only the lemmatised comment text.
#
# The column is free text, so pandas' default type/NA inference must be
# switched off:
#   * keep_default_na=False -- otherwise empty cells and comments that consist
#     solely of tokens such as "NA", "null" or "nan" are read as float NaN,
#     which breaks the list-of-strings contract expected downstream.
#   * dtype=str -- otherwise purely numeric comments may be parsed as numbers.
# Neither option drops rows, so row positions still line up with any other
# file indexed the same way.
#
# NOTE(review): the path is relative to the current working directory, not
# DIRECTORY -- confirm that is where the CSV lives.
df = pd.read_csv(
    FILENAME_COMMENTS,
    usecols=['texto_lemma'],
    dtype={'texto_lemma': str},
    keep_default_na=False,
)

In [None]:
# Plain Python list with one entry per row of the `texto_lemma` column.
text = df['texto_lemma'].tolist()

In [None]:
# `docs` is the name the modelling cells use for the document list. It is the
# same list object as `text` (no copy is made).
docs = text

### **Modelado de Tópicos**

In [None]:
## ** REPRESENTATION **

# Topic keywords are refined by two representation steps, listed in order:
#   1. KeyBERTInspired -- chosen to reduce stop words among the keywords,
#      which often improves the topic representation.
#   2. MaximalMarginalRelevance(diversity=0.5) -- chosen to cut redundancy
#      and make the keyword set more diverse.
keybert_step = KeyBERTInspired()
mmr_step = MaximalMarginalRelevance(diversity=0.5)
representation1 = [keybert_step, mmr_step]

# This list is what gets passed to BERTopic as `representation_model`, so
# both steps run as part of a single `fit`.
representation_model = representation1

In [None]:
## ** Improving Default Representation **

# Spanish stop-word list from NLTK's `stopwords` corpus.
# NOTE(review): that list presumably contains accented forms; the topic
# keywords printed further down ('politico', 'corrupcion') suggest the text is
# unaccented, in which case those entries would never match -- confirm.
stop_words = stopwords.words('spanish')

# Bag-of-words vectorizer handed to BERTopic: unigrams and bigrams, with the
# stop words above removed and a minimum document frequency of 15.
vectorizer_model = CountVectorizer(
    ngram_range=(1, 2),
    min_df=15,
    stop_words=stop_words,
)

In [None]:
# Dimensionality-reduction and clustering models handed to BERTopic.
#
# NOTE(review): the import cell binds `UMAP` twice (`cuml.manifold` first,
# then `umap`). The later import wins, so this is the CPU umap-learn class;
# only HDBSCAN comes from cuML (GPU).
#
# random_state is pinned to the same value the notebook seeds its other RNGs
# with. Without an explicit random_state UMAP is not repeatable between runs,
# so the resulting topics would change from run to run. This may make the
# reduction slower.
umap_model = UMAP(
    n_components=5,
    n_neighbors=50,
    random_state=42,
    verbose=True,
)
hdbscan_model = HDBSCAN(
    min_cluster_size=150,
    min_samples=25,
    gen_min_span_tree=True,
    prediction_data=False,
    verbose=True,
)

In [None]:
# Sentence-embedding model used to pre-compute the document embeddings.
# It is loaded on the CUDA device.
sentence_model = SentenceTransformer(
    'paraphrase-multilingual-MiniLM-L12-v2',
    device='cuda',
)

In [None]:
# Encode every document once up front (with a progress bar); the result is
# passed to BERTopic's `fit` below.
embeddings = sentence_model.encode(
    docs,
    show_progress_bar=True,
)

In [None]:
# filename = os.path.join(DIRECTORY, f'{MODEL_NAME}_embeddings.npy')
# with open(filename, 'wb') as f:
#   np.save(f, embeddings)

In [None]:
# Value passed as BERTopic's `nr_topics`; also referenced when naming saved
# model files.
NR_TOPICS = 'auto'

# NOTE(review): per the BERTopic documentation, `language` appears to be
# ignored when `embedding_model` is supplied, and `min_topic_size` when
# `hdbscan_model` is supplied. If so, the effective minimum cluster size is
# the HDBSCAN `min_cluster_size`, not 300 -- confirm.
topic_model = BERTopic(
    # Pre-built components from the cells above.
    embedding_model=sentence_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,
    # Topic-level settings.
    language='spanish',
    nr_topics=NR_TOPICS,
    top_n_words=10,
    min_topic_size=300,
    # low_memory=True,
    verbose=True,
)

In [None]:
# Fit the topic model on the documents, passing the embeddings computed
# earlier.
topic_model.fit(docs, embeddings=embeddings)

In [None]:
# Distinct topic ids assigned to the documents by the fitted model.
# NOTE(review): if the model assigns an outlier id (-1), it is counted here
# too -- confirm whether "Total Temas" should include it.
totalObtenidos = pd.Series(topic_model.topics_).unique()
total_temas = len(totalObtenidos)
print(f'Total Temas: {total_temas}')

In [None]:
# ### guardar modelo sin embeddings
# filename = os.path.join(DIRECTORY, f'{MODEL_NAME}_SIN_n-{NR_TOPICS}')
# topic_model.save(filename, save_embedding_model=False)

In [None]:
# Display BERTopic's topic-frequency table for the fitted model.
topic_model.get_topic_freq()

In [None]:
# Export BERTopic's topic-info table as a CSV file inside DIRECTORY; the file
# name is prefixed with MODEL_NAME.
topic_info_path = os.path.join(DIRECTORY, f'{MODEL_NAME}_topic_info.csv')
topic_info = topic_model.get_topic_info()
topic_info.to_csv(topic_info_path)

In [None]:
# Display the (keyword, score) pairs that represent topic 0.
topic_model.get_topic(0)

[('politico', np.float32(0.5517403)),
 ('presidenta', np.float32(0.50257134)),
 ('presidente', np.float32(0.4906162)),
 ('poder judicial', np.float32(0.4690089)),
 ('gobierno', np.float32(0.44749737)),
 ('corrupcion', np.float32(0.41573116)),
 ('reforma', np.float32(0.41070223)),
 ('voto', np.float32(0.39959288)),
 ('votar', np.float32(0.39321268)),
 ('corrupto', np.float32(0.39124447))]

In [None]:
# Render BERTopic's bar-chart visualisation, limited to 40 topics
# (top_n_topics=40).
topic_model.visualize_barchart(top_n_topics=40)