## 1. Cargar tuits

In [1]:
import numpy as np
import pandas as pd
import random

In [2]:
%ls data/

covid_twitter_sample_2020-04-16.csv                  tweets_batch_001.tsv
covid_twitter_sample_2020-04-16.npy                  tweets_batch_002.tsv
covid_twitter_sample_topic_model.csv                 tweets_batch_003.tsv
covid_twitter_sample_topic_model_20p.csv             tweets_batch_004.tsv
covid_twitter_sample_topic_model_embeddings.npy      tweets_batch_005.tsv
covid_twitter_sample_topic_model_embeddings_100.npy  tweets_batch_006.tsv
covid_twitter_sample_topic_model_embeddings_20.npy   tweets_batch_007.tsv


In [3]:
df = pd.read_csv("data/covid_twitter_sample_2020-04-16.csv")

In [4]:
df.columns

Index(['tweet_id', 'user_id', 'created_at', 'full_text', 'created_date'], dtype='object')

In [5]:
df.shape

(971679, 5)

In [6]:
df_full = df

In [7]:
df_full.shape

(971679, 5)

In [8]:
import numpy as np

embeddings = np.load(f"data/covid_twitter_sample_2020-04-16.npy")

In [9]:
embeddings.shape

(971679, 384)

In [10]:
docs = df.full_text.dropna().values.tolist()

In [11]:
len(docs)

971679

In [12]:
docs[:2]

['Visitamos los hospitales de la provincia de Chimborazo, la emergencia por #Covid19 está controlada. Seguimos las directrices del presidente @Lenin Moreno de recorrer el territorio e inspeccionar las casas de salud para conocer sus realidades. \n#JuntosEcuador\n#QuédateEnCasa https://t.co/7fnk38WWiI',
 'Iker Jiménez (@navedelmisterio): "Estoy tratando el coronavirus como tratamos el Ébola, la avispa asiática, Chernóbil, Atapuerca... quien no conoce nuestra trayectoria nunca lo va a entender" https://t.co/KSh6JvbikZ https://t.co/lsrt53Y0Kf']

In [13]:
docs[0]

'Visitamos los hospitales de la provincia de Chimborazo, la emergencia por #Covid19 está controlada. Seguimos las directrices del presidente @Lenin Moreno de recorrer el territorio e inspeccionar las casas de salud para conocer sus realidades. \n#JuntosEcuador\n#QuédateEnCasa https://t.co/7fnk38WWiI'

3. Topic modeling

## Hashtags populares

In [14]:
import re

import unicodedata

def remove_accents(input_str):
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    only_ascii = nfkd_form.encode('ASCII', 'ignore')
    return str(only_ascii)

def extract_hashtags(s):
    return [
        remove_accents(ht.lower())
        for ht in re.findall(r"#(\w+)", s)
    ]

In [15]:
df.head()

Unnamed: 0,tweet_id,user_id,created_at,full_text,created_date
0,1250575030632710146,365860744,2020-04-16 00:01:36,Visitamos los hospitales de la provincia de Ch...,2020-04-16
1,1250575030980878336,2996409095,2020-04-16 00:01:36,"Iker Jiménez (@navedelmisterio): ""Estoy tratan...",2020-04-16
2,1250575031010238473,1324644966,2020-04-16 00:01:36,"La canciller de Alemania, Angela Merkel, desta...",2020-04-16
3,1250575031106600960,3895256173,2020-04-16 00:01:36,Una enfermera embarazada falleció tras contrae...,2020-04-16
4,1250575031085776896,1215630912211255296,2020-04-16 00:01:36,Casos de Covid-19 en Colombia superaron los 3....,2020-04-16


In [16]:
df["full_text"] = df.full_text.astype(str)

In [17]:
df["hashtags"] = df.full_text.apply(extract_hashtags)

In [18]:
df.head()

Unnamed: 0,tweet_id,user_id,created_at,full_text,created_date,hashtags
0,1250575030632710146,365860744,2020-04-16 00:01:36,Visitamos los hospitales de la provincia de Ch...,2020-04-16,"[b'covid19', b'juntosecuador', b'quedateencasa']"
1,1250575030980878336,2996409095,2020-04-16 00:01:36,"Iker Jiménez (@navedelmisterio): ""Estoy tratan...",2020-04-16,[]
2,1250575031010238473,1324644966,2020-04-16 00:01:36,"La canciller de Alemania, Angela Merkel, desta...",2020-04-16,[]
3,1250575031106600960,3895256173,2020-04-16 00:01:36,Una enfermera embarazada falleció tras contrae...,2020-04-16,[]
4,1250575031085776896,1215630912211255296,2020-04-16 00:01:36,Casos de Covid-19 en Colombia superaron los 3....,2020-04-16,[]


In [19]:
from collections import Counter

In [23]:
ht_counts = Counter()

In [24]:
for hts in df.hashtags.values:
    ht_counts.update(hts)

In [25]:
ht_counts.most_common(100)

[("b'covid19'", 316094),
 ("b'quedateencasa'", 47449),
 ("b'coronavirus'", 39412),
 ("b'covid'", 17113),
 ("b'cuba'", 15138),
 ("b'17anosbarrioadentro'", 12648),
 ("b'covid_19'", 11555),
 ("b'venezuelabellaenrevolucion'", 10256),
 ("b'16abr'", 7613),
 ("b'cubasalvavidas'", 7358),
 ("b'venezuela'", 6373),
 ("b'estevirusloparamosunidos'", 6072),
 ("b'15abr'", 5993),
 ("b'aecuadorlosacamostodos'", 5989),
 ("b'covid19mx'", 5708),
 ("b'yomequedoencasa'", 5193),
 ("b'eeuu'", 5093),
 ("b'fase3'", 5028),
 ("b'cuarentena'", 4907),
 ("b'cubaporlasalud'", 4403),
 ("b'emergenciasanitaria'", 4162),
 ("b'cuidarteescuidarnos'", 4137),
 ("b'pandemia'", 4092),
 ("b'mexico'", 3882),
 ("b'sinalimentos'", 3776),
 ("b'ultimahora'", 3708),
 ("b'barrioadentro17aniversario'", 3572),
 ("b'nomasley100genocida'", 3441),
 ("b'bloqueonosolidaridadsi'", 3381),
 ("b'urgente'", 3229),
 ("b'puebla'", 2867),
 ("b'sanadistancia'", 2777),
 ("b'envivo'", 2744),
 ("b'nicaragua'", 2738),
 ("b'conhambreymalgobierno'", 2722),

In [26]:
len(df)

971679

# BERTopic

In [27]:
from sentence_transformers import SentenceTransformer

In [28]:
embedding_model = "paraphrase-multilingual-MiniLM-L12-v2"

In [29]:
# Spanish multilingual model (lightweight)
model = SentenceTransformer(embedding_model)

In [30]:
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False, 'architecture': 'BertModel'})
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [31]:
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired

# Create your representation model
representation_model = KeyBERTInspired()

In [57]:
sample_rate = 1.0

In [58]:
sample_inds = np.random.choice(len(docs), int(sample_rate * len(docs)), replace=False)

In [59]:
sample_inds.dtype

dtype('int64')

In [60]:
type(docs)

list

In [61]:
docs_sample = [docs[i] for i in sample_inds]

In [62]:
len(docs_sample)

971679

In [63]:
embeddings_sample = embeddings[sample_inds]

In [64]:
embeddings_sample.shape

(971679, 384)

In [65]:
import spacy
import string

# Load the Spanish library from SpaCy
nlp = spacy.load("es_core_news_sm")

# Create list of punctuation marks
punctuations = string.punctuation

# Create list of stopwords from spaCy
es_stopwords = spacy.lang.es.stop_words.STOP_WORDS

es_stopwords = list(es_stopwords) + ["https", "http", "com", "covid", "covid19", "19", "co", "coronavirus", "rt"]

In [66]:
# --- 4. Vectorizer for topic words (Spanish stopwords + unigrams/bigrams)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text

vectorizer_model = CountVectorizer(
    stop_words=es_stopwords,
    ngram_range=(1, 2),        # include bigrams to catch phrases like "cambio climático"
    min_df=5                   # ignore rare words
)

In [67]:
# Create and fit the BERTopic model using the precomputed embeddings
topic_model = BERTopic(
    language="spanish",
    vectorizer_model=vectorizer_model,
    verbose=True
)

In [None]:
# topics, _ = topic_model.fit_transform(docs, embeddings)
topics, _ = topic_model.fit_transform(docs_sample, embeddings_sample)

2025-11-08 22:28:29,119 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


In [55]:
# View topic info
topic_info = topic_model.get_topic_info()

In [56]:
# Method 1 - safetensors
embedding_model = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
topic_model.save("covid_twitter_topic_model_2020-04-16-100p", serialization="safetensors", save_ctfidf=True, save_embedding_model=embedding_model)