1. Cargar tuits

In [1]:
import pandas as pd

In [2]:
chunk_size = 10000  # Adjust the chunk size as needed
all_chunks = []

# TODO: use all data
# for i in range(1,8):
for i in range(1,2):
    chunks = pd.read_csv(f'data/tweets_batch_00{i}.tsv', sep='\t',
                         usecols=["tweet_id", "user_id", "full_text"],
                         engine='python',
                         on_bad_lines='skip',
                         chunksize=chunk_size)
    all_chunks += chunks

df = pd.concat(all_chunks, ignore_index=True)

for f in ["user_id"]:
    df[f] = df[f].astype("Int64")

In [3]:
df

Unnamed: 0,tweet_id,user_id,full_text
0,1249125074495512576,415914708,Panorama en Quintana Roo \n\nHasta las 18 hora...
1,1249125074541649920,825227495242862592,@CamiFerrce @Pau_lin_a Prefiero el COVID 19-20...
2,1249125075179016192,1047598652510937088,App de COVID-19 pudo ser un mero sitio web inf...
3,1249125075195793408,210950346,Las pruebas r√°pidas de #Covid19 podr√≠an ayudar...
4,1249125075166662658,888404662038261760,#QueNoSeTePase M√©dicos del IMSS de Villa Alta ...
...,...,...,...
1000010,1249500587110137863,3007013667,Ayer fue mi primer guardia en un √°rea COVID en...
1000011,1249500587173072904,144316886,"Encontr√© esto en facebook:\n""El Covid no hizo ..."
1000012,1249500587688779776,796085582791208960,üìåEste es el resumen del reporte del #Covid19 d...
1000013,1249500587856666625,1245021641572302848,Una brigada de 11 colaboradores del Contingent...


In [4]:
len(df)

1000015

In [5]:
# TODO: use all data
docs = df
docs = docs.sample(100000)
docs = docs.full_text.dropna().values.tolist()

In [6]:
len(docs)

99999

In [7]:
docs[:2]

['#QuedateEnCasa ...... Crece n√∫meros de casos de Covid-19 a 83 en Sonora..... v√≠a https://t.co/rE8Op3C2de..... https://t.co/Jd6VZRWpPK https://t.co/Z6kPyx0aOk',
 'OJO! El gobierno sigue a tope con su censura en redes. Desde el @PSOE est√°n utilizando herramientas de bloqueo masivo para bloquear a todo el que siga a determinadas cuentas. Indiscriminadamente. Probad a ver si os ha bloqueado @susanadiaz. #COVID„Éº19 #COVID19 #StopBulos']

In [8]:
from copy import deepcopy
raw_docs = deepcopy(docs)

### 2. Preprocesar
- Tokenize.
- Lemmatize the tokens.
- Compute bigrams.
- Compute a bag-of-words representation of the data.

In [9]:
import string
import re
import spacy

# Load the Spanish library from SpaCy
nlp = spacy.load("es_core_news_sm")

# Create list of punctuation marks
punctuations = string.punctuation

# Create list of stopwords from spaCy
stopwords = spacy.lang.es.stop_words.STOP_WORDS

# Remove URLs
def remove_urls(text):
    text = re.sub(r"\S*https?:\S*", "", text, flags=re.MULTILINE)
    return text

# Creat tokenizer function
def spacy_tokenizer(sentence):
    # Create token object from spacy
    tokens = nlp(sentence)

    # Lemmatize each token and convert each token into lowercase
    # tokens = [word.lemma_.lower().strip() if word.lemma_ != "PROPN" else word.lower_ for word in tokens]
    tokens = [word.lemma_.lower().strip() if word.lemma_ != "PROPN" else word.lower_ for word in tokens]
    
    # Remove stopwords
    tokens = [word for word in tokens if word not in stopwords and word not in punctuations]
    # tokens = [word for word in tokens if word not in punctuations]
    
    # Remove links
    tokens = [remove_urls(word) for word in tokens]
    
    # return preprocessed list of tokens
    return tokens

In [10]:
docs[0]

'#QuedateEnCasa ...... Crece n√∫meros de casos de Covid-19 a 83 en Sonora..... v√≠a https://t.co/rE8Op3C2de..... https://t.co/Jd6VZRWpPK https://t.co/Z6kPyx0aOk'

In [11]:
spacy_tokenizer(docs[0])

['quedateencasa',
 '......',
 'crecer',
 'n√∫mero',
 'caso',
 'covid-19',
 '83',
 'sonora',
 '.....',
 'v√≠a',
 '',
 '.....',
 '',
 '']

In [12]:
# !pip install nltk

#### - Tokenize + 

In [13]:
for idx in range(len(docs)):
    docs[idx] = spacy_tokenizer(docs[idx])  # Split into words.

# Remove numbers, but not words that contain numbers.
docs = [[token for token in doc if not token.isnumeric()] for doc in docs]

# Remove words that are only one character.
docs = [[token for token in doc if len(token) > 1] for doc in docs]

In [14]:
docs[0][:10]

['quedateencasa',
 '......',
 'crecer',
 'n√∫mero',
 'caso',
 'covid-19',
 'sonora',
 '.....',
 'v√≠a',
 '.....']

In [15]:
# Compute bigrams.
from gensim.models import Phrases

# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(docs, min_count=20)
for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            docs[idx].append(token)

In [16]:
# Remove rare and common tokens.
from gensim.corpora import Dictionary

# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)

# Filter out words that occur less than 20 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_below=20, no_above=0.3)

In [17]:
# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]

3. Topic modeling

In [18]:
# Train LDA model.
from gensim.models import LdaMulticore
# from gensim.models import LdaModel

# Set training parameters.
num_topics = 50
chunksize = 2000
passes = 30
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make an index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaMulticore(
    workers=10,
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='symmetric',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)

In [19]:
top_topics = model.top_topics(corpus)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

from pprint import pprint
pprint(top_topics)


Average topic coherence: -5.7220.
[([(0.034963492, 'casa'),
   (0.025004478, 'grupo'),
   (0.02422518, 'rt'),
   (0.021017445, 'sacar'),
   (0.020824106, 'punto'),
   (0.019859623, 'vulnerable'),
   (0.019831812, 'poblaci√≥n'),
   (0.018808568, 'adulto'),
   (0.018414315, 'totalmente'),
   (0.018267266, 'gratuito'),
   (0.017949205, 'medicamento'),
   (0.017644811, 'etc.'),
   (0.017385453, 'domicilio'),
   (0.017073747, 'necesitar'),
   (0.016358647, 'poblaci√≥n_vulnerable'),
   (0.016284868, 'procedimiento'),
   (0.015861573, 'retiro'),
   (0.015279346, 'curaci√≥n'),
   (0.015120392, 'casa_adulto'),
   (0.015067533, 'casa_grupo')],
  -0.7238036311085941),
 ([(0.09190091, 'casa'),
   (0.05034206, 'salir'),
   (0.045854587, 'video'),
   (0.037234038, 'ayer'),
   (0.036641717, 'hilo'),
   (0.036027893, 'covid'),
   (0.03514157, 'mercado'),
   (0.034514286, 'amigo'),
   (0.03295672, 'sentir'),
   (0.03181278, 'hospital'),
   (0.031152317, 'abrir'),
   (0.030585594, 'paciente'),
   (0.030

4. Display topics

-
- top-10 tweets per topic

In [27]:
from random import sample

docinds_sample = sample(range(len(docs)), 1000)

In [28]:
corpus_sample = [corpus[i] for i in docinds_sample]

In [29]:
doc_topic_scores = list(model[corpus_sample])

In [40]:
from collections import defaultdict
import pandas as pd

topic_counts = defaultdict(int)
docinds_by_topic = defaultdict(list)
for i, scores in enumerate(doc_topic_scores):
    maxs = 0
    top_topic = None
    for j, score in scores:
        if score > maxs:
            maxs = score
            top_topic = j
    docinds_by_topic[top_topic].append(i)
    topic_counts[top_topic] += 1

topic_counts = pd.DataFrame(list(topic_counts.items()), columns=["topic", "n_docs"])
topic_counts["fraction"] = topic_counts.n_docs / topic_counts.n_docs.sum()
topic_counts.sort_values("fraction", ascending=False, inplace=True)
topic_counts

Unnamed: 0,topic,n_docs,fraction
9,7,55,0.055
1,39,49,0.049
4,6,44,0.044
13,30,33,0.033
0,26,32,0.032
20,20,27,0.027
34,14,27,0.027
23,34,27,0.027
12,25,25,0.025
14,43,24,0.024


In [41]:
MAX_DISPLAY_TWEETS_PER_TOPIC = 10
N_DISPLAY_TOPICS = 10

for t in topic_counts.topic[:N_DISPLAY_TOPICS]:
    docinds = docinds_by_topic[t]
    print(f"====== Topic #{t:02d} ====== ")
    if len(docinds) > MAX_DISPLAY_TWEETS_PER_TOPIC:
        docinds = sample(docinds, 10)
    for i in docinds:
        print(raw_docs[i] + '\n------\n')

Los militares estamos conscientes que el COVID-19 no tiene fronteras. Ante la pandemia, todos debemos estar del lado de la SOLIDARIDAD, sin odios ni ego√≠smos, dejando de lado las diferencias pol√≠ticas y poni√©ndonos a orden del bien com√∫n y del bienestar de nuestros pueblos. https://t.co/s9cgcQmJuw
------

Si las percoladoras cuestan 4 mil y dispensadores de agua 5 mil....Ya me imagino el precio de camillas, respiradores... mascarillas.
POR FAVOR SEAN HONESTO UNA VEZ EN LA VIDA.
------

La pandemia "no es una guerra" sino un "test de humanidad", afirma el presidente alem√°n.
https://t.co/ch31OGMCD8 https://t.co/re9DrY0TMz
------

Un tratamiento israel√≠ para el covid-19 tiene una tasa de supervivencia del 100%, seg√∫n datos preliminares

Israeli COVID-19 treatment shows 100% survival rate - preliminary data https://t.co/heGvlPL7xd
------

Prevengamos el COVID-19
¬øDe d√≥nde sacas tu energ√≠a?
Tu trabajo
Querer regresar a la oficina
El respaldo de tus jefes
¬°Qued√©monos en casa con 

- LDAviz?

In [144]:
1+1

2

## Hashtags populares

In [79]:
import re

import unicodedata

def remove_accents(input_str):
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    only_ascii = nfkd_form.encode('ASCII', 'ignore')
    return str(only_ascii)

def extract_hashtags(s):
    return [
        remove_accents(ht.lower())
        for ht in re.findall(r"#(\w+)", s)
    ]

In [80]:
df.head()

Unnamed: 0,tweet_id,user_id,full_text,hashtags
0,1249125074495512576,415914708,Panorama en Quintana Roo \n\nHasta las 18 hora...,[b'quedateencasa']
1,1249125074541649920,825227495242862592,@CamiFerrce @Pau_lin_a Prefiero el COVID 19-20...,[]
2,1249125075179016192,1047598652510937088,App de COVID-19 pudo ser un mero sitio web inf...,[]
3,1249125075195793408,210950346,Las pruebas r√°pidas de #Covid19 podr√≠an ayudar...,"[b'covid19', b'florida']"
4,1249125075166662658,888404662038261760,#QueNoSeTePase M√©dicos del IMSS de Villa Alta ...,"[b'quenosetepase', b'covid19']"


In [81]:
df["full_text"] = df.full_text.astype(str)

In [82]:
df["hashtags"] = df.full_text.apply(extract_hashtags)

In [83]:
df.head()

Unnamed: 0,tweet_id,user_id,full_text,hashtags
0,1249125074495512576,415914708,Panorama en Quintana Roo \n\nHasta las 18 hora...,[b'quedateencasa']
1,1249125074541649920,825227495242862592,@CamiFerrce @Pau_lin_a Prefiero el COVID 19-20...,[]
2,1249125075179016192,1047598652510937088,App de COVID-19 pudo ser un mero sitio web inf...,[]
3,1249125075195793408,210950346,Las pruebas r√°pidas de #Covid19 podr√≠an ayudar...,"[b'covid19', b'florida']"
4,1249125075166662658,888404662038261760,#QueNoSeTePase M√©dicos del IMSS de Villa Alta ...,"[b'quenosetepase', b'covid19']"


In [84]:
from collections import Counter

In [85]:
ht_counts = Counter()

In [86]:
for hts in df.hashtags.values:
    ht_counts.update(hts)

In [87]:
ht_counts.most_common(100)

[("b'covid19'", 283704),
 ("b'quedateencasa'", 50764),
 ("b'coronavirus'", 33950),
 ("b'laprevencioneslaclave'", 22872),
 ("b'covid'", 16287),
 ("b'cuba'", 13049),
 ("b'12abr'", 11367),
 ("b'hermandadenvanguardia'", 10954),
 ("b'11abril'", 9439),
 ("b'11abr'", 9191),
 ("b'covid_19'", 8323),
 ("b'estevirusloparamosunidos'", 6604),
 ("b'venezuela'", 6307),
 ("b'cubasalvavidas'", 5753),
 ("b'eeuu'", 5262),
 ("b'covid__19'", 5237),
 ("b'yomequedoencasa'", 5218),
 ("b'urgente'", 5105),
 ("b'ultimahora'", 4803),
 ("b'sanadistancia'", 4391),
 ("b'endsanctionssavelives'", 4223),
 ("b'abrildeunioncivicomilitar'", 4195),
 ("b'reporte'", 3969),
 ("b'cuarentena'", 3748),
 ("b'cuarentenaextendida'", 3530),
 ("b'covid19mx'", 3461),
 ("b'mexico'", 3455),
 ("b'felizdomingo'", 3028),
 ("b'envideo'", 2757),
 ("b'atencion'", 2711),
 ("b'salvavidas'", 2584),
 ("b'peruestaennuestrasmanos'", 2488),
 ("b'italia'", 2379),
 ("b'cubaporlasalud'", 2314),
 ("b'modoactivo'", 2301),
 ("b'falso'", 2213),
 ("b'doming

In [88]:
len(df)

1000015

In [77]:
dfc = df[df.hashtags.apply(lambda hts: b"cuarentena" in hts)]

In [89]:
for t in dfc.sample(100).full_text.values:
    print(t)
    print("--------------------")

Aqu√≠ est√° el video GRATIS por los RTs pasados üòà 

888 RTs y les dejo otro regalo üéÅ #M√©xico #Reforma #CDMX #COVID„Éº19 #cuarentena #Bonni3GG https://t.co/SPB5EqVlwd
--------------------
üá®üá∫üóûNoticias de #Cuba este #11DeAbril en @HrRebelde 
*Llega a hospitales cubanos donativo de China
*Entran en vigor medidas del transporte en Cuba
*Reparto Armando Mestre de Matanzas en #cuarentena restrictiva
#COVID„Éº19 #CubaPorLaSalud #QuedateEnLaCasa 
https://t.co/HYXAs1OigF
--------------------
#Temuco N√∫mero de pruebas q confirman #coronavirus #covid19 ha bajado por falta de personal en laboratorio q las realice. Parte del personal contagiado y otr@s en #cuarentena @InterferenciaCL
https://t.co/cvuSHYIIlE
--------------------
100% de acuerdo, alguien m√°s quiere apoyar???

Qu√© opinas @lopezobrador_ ???

#COVID19 
#cuarentena https://t.co/vtEQqNHW0X
--------------------
Yo no s√© si sab√≠an pero en M√©xico hay un pueblo ENTERO que lleva dos noches sin dormir porque hay un hombre l