1. Cargar tuits

In [42]:
import pandas as pd
import random

In [43]:
sample_rate = 0.01
chunk_size = 10000  # Adjust the chunk size as needed
all_chunks = []

for i in range(1,8):
    chunks = pd.read_csv(f'data/tweets_batch_00{i}.tsv', sep='\t',
                         usecols=["tweet_id", "user_id", "full_text"],
                         engine='python',
                         on_bad_lines='skip',
                         skiprows=lambda i: i>0 and random.random() > sample_rate,
                         chunksize=chunk_size)
    all_chunks += chunks

df = pd.concat(all_chunks, ignore_index=True)

for f in ["user_id"]:
    df[f] = df[f].astype("Int64")

In [44]:
df

Unnamed: 0,tweet_id,user_id,full_text
0,1249125075820969985,894490872665997312,🇪🇨 #Ecuador: 315 fallecidos y 7.257 casos conf...
1,1249125076055851008,121281445,Esta gente debe irse presa.
2,1249125089720905728,19553171,Este junior está usando su organización y a lo...
3,1249125101334863877,1280879857,Italia registra 19 mil 468 muertos con COVID-1...
4,1249125138349600769,2345159331,Hablemos del coronavirus y una teoría que está...
...,...,...,...
61880,1251661614367064065,1182863228830994432,"Tanto, EEUU Suecia y el Reino Unido intentaron..."
61881,1251661642452041730,3244892497,@arigameplays Mejor que se apuren a encontrar ...
61882,1251661715126829063,57103343,Súper útil: compilación de artículos sobre COV...
61883,1251661723330805763,865721213213184000,@cejaspobladas Mi pronóstico:\n\n-27 de abril ...


In [45]:
len(df)

61885

In [46]:
docs = df
docs = docs.full_text.dropna().values.tolist()

In [47]:
len(docs)

61880

In [48]:
docs[:2]

['🇪🇨 #Ecuador: 315 fallecidos y 7.257 casos confirmados de #COVID19 \n\n‣ https://t.co/RzkxKjpAjT https://t.co/6GltPyF1PI',
 'Esta gente debe irse presa.']

In [49]:
from copy import deepcopy
raw_docs = deepcopy(docs)

### 2. Preprocesar
- Tokenize.
- Lemmatize the tokens.
- Compute bigrams.
- Compute a bag-of-words representation of the data.

In [50]:
import string
import re
import spacy

# Load the Spanish library from SpaCy
nlp = spacy.load("es_core_news_sm")

# Create list of punctuation marks
punctuations = string.punctuation

# Create list of stopwords from spaCy
stopwords = spacy.lang.es.stop_words.STOP_WORDS

# Remove URLs
def remove_urls(text):
    text = re.sub(r"\S*https?:\S*", "", text, flags=re.MULTILINE)
    return text

# Creat tokenizer function
def spacy_tokenizer(sentence):
    # Create token object from spacy
    tokens = nlp(sentence)

    # Lemmatize each token and convert each token into lowercase
    # tokens = [word.lemma_.lower().strip() if word.lemma_ != "PROPN" else word.lower_ for word in tokens]
    tokens = [word.lemma_.lower().strip() if word.lemma_ != "PROPN" else word.lower_ for word in tokens]
    
    # Remove stopwords
    tokens = [word for word in tokens if word not in stopwords and word not in punctuations]
    # tokens = [word for word in tokens if word not in punctuations]
    
    # Remove links
    tokens = [remove_urls(word) for word in tokens]
    
    # return preprocessed list of tokens
    return tokens

In [51]:
docs[0]

'🇪🇨 #Ecuador: 315 fallecidos y 7.257 casos confirmados de #COVID19 \n\n‣ https://t.co/RzkxKjpAjT https://t.co/6GltPyF1PI'

In [52]:
spacy_tokenizer(docs[0])

['🇪',
 '🇨',
 'ecuador',
 '315',
 'fallecido',
 '7257',
 'caso',
 'confirmado',
 'covid19',
 '‣',
 '',
 '']

In [53]:
# !pip install nltk

#### - Tokenize + 

In [54]:
for idx in range(len(docs)):
    docs[idx] = spacy_tokenizer(docs[idx])  # Split into words.

# Remove numbers, but not words that contain numbers.
docs = [[token for token in doc if not token.isnumeric()] for doc in docs]

# Remove words that are only one character.
docs = [[token for token in doc if len(token) > 1] for doc in docs]

In [55]:
docs[0][:10]

['ecuador', 'fallecido', 'caso', 'confirmado', 'covid19']

In [56]:
# Compute bigrams.
from gensim.models import Phrases

# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(docs, min_count=20)
for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            docs[idx].append(token)

In [57]:
# Remove rare and common tokens.
from gensim.corpora import Dictionary

# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)

# Filter out words that occur less than 20 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_below=20, no_above=0.3)

In [58]:
# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]

3. Topic modeling

In [59]:
# Train LDA model.
from gensim.models import LdaMulticore
# from gensim.models import LdaModel

# Set training parameters.
num_topics = 50
chunksize = 2000
passes = 30
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make an index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaMulticore(
    workers=10,
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='symmetric',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)

In [60]:
top_topics = model.top_topics(corpus)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

from pprint import pprint
pprint(top_topics)


Average topic coherence: -7.3871.
[([(0.09318729, 'casa'),
   (0.07091166, 'video'),
   (0.041115846, 'hilo'),
   (0.04107943, 'amigo'),
   (0.04015587, 'sentir'),
   (0.039861906, 'guardia'),
   (0.039860316, 'ayer'),
   (0.038519923, 'covid'),
   (0.038326878, 'salir'),
   (0.035957232, 'abrir'),
   (0.035745617, 'área'),
   (0.033631843, 'mercado'),
   (0.029995844, 'hospital'),
   (0.025823422, 'paciente'),
   (0.025583485, 'mar'),
   (0.025108228, 'grabar'),
   (0.02509673, 'vacación'),
   (0.023342311, 'área_covid'),
   (0.021792278, 'grabar_video'),
   (0.021461518, 'hospital_sentir')],
  -0.9978730265825675),
 ([(0.05242013, 'méxico'),
   (0.04646802, 'sanitario'),
   (0.04605992, '@hlgatell'),
   (0.045788735, '@ssalud_mx'),
   (0.045606174, 'emergencia'),
   (0.037365295, 'salud'),
   (0.033198807, 'sector'),
   (0.027056752, 'experto'),
   (0.025455585, 'tema'),
   (0.021072997, 'indicación'),
   (0.020261858, 'subsecretario'),
   (0.018723195, 'sector_salud'),
   (0.0177602

4. Display topics

-
- top-10 tweets per topic

In [61]:
from random import sample

docinds_sample = sample(range(len(docs)), 1000)

In [62]:
corpus_sample = [corpus[i] for i in docinds_sample]

In [63]:
doc_topic_scores = list(model[corpus_sample])

In [64]:
from collections import defaultdict
import pandas as pd

topic_counts = defaultdict(int)
docinds_by_topic = defaultdict(list)
for i, scores in enumerate(doc_topic_scores):
    maxs = 0
    top_topic = None
    for j, score in scores:
        if score > maxs:
            maxs = score
            top_topic = j
    docinds_by_topic[top_topic].append(i)
    topic_counts[top_topic] += 1

topic_counts = pd.DataFrame(list(topic_counts.items()), columns=["topic", "n_docs"])
topic_counts["fraction"] = topic_counts.n_docs / topic_counts.n_docs.sum()
topic_counts.sort_values("fraction", ascending=False, inplace=True)
topic_counts

Unnamed: 0,topic,n_docs,fraction
4,41,50,0.05
7,46,38,0.038
8,24,38,0.038
18,9,37,0.037
17,36,36,0.036
31,44,30,0.03
38,29,30,0.03
3,13,30,0.03
30,26,28,0.028
10,23,28,0.028


In [65]:
MAX_DISPLAY_TWEETS_PER_TOPIC = 10
N_DISPLAY_TOPICS = 10

for t in topic_counts.topic[:N_DISPLAY_TOPICS]:
    docinds = docinds_by_topic[t]
    print(f"====== Topic #{t:02d} ====== ")
    if len(docinds) > MAX_DISPLAY_TWEETS_PER_TOPIC:
        docinds = sample(docinds, 10)
    for i in docinds:
        print(raw_docs[i] + '\n------\n')

Hablemos del coronavirus y una teoría que está tomando fuerza con los datos recolectados de los pacientes de Nueva York y que le daría un vuelco total a la investigación y al tratamiento en pacientes hospitalizados con Covid19.

Primero arranco diciendo que estamos mal enfocados
------

Panorama en México 11 de abril 2020: 4,219 casos confirmados, 9,983 casos sospechosos, 21,277 casos negativos y 273 defunciones. Se ha estudiado a 35,479 personas. De casos confirmados, 2,845 (67%) han sido leves y 1,374 (33%) han requerido hospitalización #COVID19. https://t.co/qIzhEwM4f7
------

@EFranzani Ahora cualquiera se agrupa para pegarle a carabineros,,Por ahora SP buen control y gestión ante la pandemia del covid 19...pero aún sigue siendo un fiasco en el control de orden público.
------

Hemos hecho todas las advertencias y propuestas viables al Gob.Nacional. En adelante, es responsabilidad del gobierno de @IvanDuque lo que pase con nosotros. Estamos desprotegidos para enfrentar Covid19.Ya l

- LDAviz?

In [66]:
1+1

2

## Hashtags populares

In [67]:
import re

import unicodedata

def remove_accents(input_str):
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    only_ascii = nfkd_form.encode('ASCII', 'ignore')
    return str(only_ascii)

def extract_hashtags(s):
    return [
        remove_accents(ht.lower())
        for ht in re.findall(r"#(\w+)", s)
    ]

In [68]:
df.head()

Unnamed: 0,tweet_id,user_id,full_text
0,1249125075820969985,894490872665997312,🇪🇨 #Ecuador: 315 fallecidos y 7.257 casos conf...
1,1249125076055851008,121281445,Esta gente debe irse presa.
2,1249125089720905728,19553171,Este junior está usando su organización y a lo...
3,1249125101334863877,1280879857,Italia registra 19 mil 468 muertos con COVID-1...
4,1249125138349600769,2345159331,Hablemos del coronavirus y una teoría que está...


In [69]:
df["full_text"] = df.full_text.astype(str)

In [70]:
df["hashtags"] = df.full_text.apply(extract_hashtags)

In [71]:
df.head()

Unnamed: 0,tweet_id,user_id,full_text,hashtags
0,1249125075820969985,894490872665997312,🇪🇨 #Ecuador: 315 fallecidos y 7.257 casos conf...,"[b'ecuador', b'covid19']"
1,1249125076055851008,121281445,Esta gente debe irse presa.,[]
2,1249125089720905728,19553171,Este junior está usando su organización y a lo...,[]
3,1249125101334863877,1280879857,Italia registra 19 mil 468 muertos con COVID-1...,[]
4,1249125138349600769,2345159331,Hablemos del coronavirus y una teoría que está...,[]


In [72]:
from collections import Counter

In [73]:
ht_counts = Counter()

In [74]:
for hts in df.hashtags.values:
    ht_counts.update(hts)

In [75]:
ht_counts.most_common(100)

[("b'covid19'", 18810),
 ("b'quedateencasa'", 2974),
 ("b'coronavirus'", 2265),
 ("b'covid'", 1063),
 ("b'cuba'", 913),
 ("b'covid_19'", 593),
 ("b'atencionmedicadecalidad'", 447),
 ("b'cubasalvavidas'", 431),
 ("b'venezuela'", 398),
 ("b'estevirusloparamosunidos'", 333),
 ("b'eeuu'", 329),
 ("b'laprevencioneslaclave'", 320),
 ("b'yoapoyolacuarentena'", 302),
 ("b'yomequedoencasa'", 283),
 ("b'covid19mx'", 263),
 ("b'ultimahora'", 251),
 ("b'venezuelabellaenrevolucion'", 251),
 ("b'cuarentena'", 240),
 ("b'17abr'", 236),
 ("b'mexico'", 214),
 ("b'concienciaycompromiso'", 213),
 ("b'17anosbarrioadentro'", 209),
 ("b'abrildeunioncivicomilitar'", 204),
 ("b'reporte'", 190),
 ("b'cubaporlasalud'", 188),
 ("b'urgente'", 186),
 ("b'sanadistancia'", 186),
 ("b'ahora'", 183),
 ("b'covid19chile'", 179),
 ("b'pandemia'", 178),
 ("b'abrildevictoriapopular'", 174),
 ("b'15abr'", 172),
 ("b'envivo'", 170),
 ("b'china'", 159),
 ("b'salud'", 156),
 ("b'18abr'", 156),
 ("b'envideo'", 154),
 ("b'covid2

In [76]:
len(df)

61885

In [77]:
dfc = df[df.hashtags.apply(lambda hts: b"cuarentena" in hts)]

In [78]:
for t in dfc.sample(100).full_text.values:
    print(t)
    print("--------------------")

ValueError: a must be greater than 0 unless no samples are taken