In [70]:
from gensim.models.ldamulticore import LdaMulticore
import multiprocessing as mp
import pandas as pd
from nltk.corpus import stopwords
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from gensim.corpora import Dictionary

In [71]:
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("words")
nltk.download("wordnet")
nltk.download("omw-1.4")

[nltk_data] Downloading package punkt to /home/david/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/david/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to /home/david/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to /home/david/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/david/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [72]:
stop_words_nltk_en = set(stopwords.words("english"))
stop_words_nltk_es = set(stopwords.words("spanish"))

In [73]:
tk = TweetTokenizer()
porter = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()

In [74]:
def textprep(line):
    tokens = tk.tokenize(str(line))
    tokens = [w.lower() for w in tokens if len(w) > 1]
    tokens = [w for w in tokens if w.isalpha()]
    tokens = [w for w in tokens if w not in stop_words_nltk_en]
    tokens = [w for w in tokens if w not in stop_words_nltk_es]
    tokens = [wordnet_lemmatizer.lemmatize(w) for w in tokens]
    tokens = [porter.stem(w) for w in tokens]
    return tokens

In [75]:
path_in = "twitterClimateData.csv"
df = pd.read_csv(path_in, delimiter=";")
df = df[["text", "search_hashtags"]]

In [76]:
topics = df["search_hashtags"].unique()
k = len(topics)
print("Topics", topics)
print(f"Number of topics: {k}")

Topics ['#climatestrike' '#climatechange' '#greennewdeal' '#climatecrisis'
 '#climateaction' '#fridaysforfuture' '#environment' '#globalwarming'
 '#actonclimate' '#sustainability' '#savetheplanet' '#bushfires']
Number of topics: 12


We identify common words in the topics of the tweets, so for that reason we reduce `k` to 11


In [77]:
k -= 1

In [78]:
df["tokens_text"] = df.apply(lambda row: textprep(row["text"]), axis=1)

In [79]:
dictionary = Dictionary(df.tokens_text)
corpus = [dictionary.doc2bow(doc) for doc in df.tokens_text]

In [80]:
print(corpus)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 4)], [(11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1)], [(3, 1), (10, 1), (12, 1), (17, 2), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 2)], [(10, 1), (12, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 2)], [(46, 1), (47, 1), (48, 1), (49, 1), (50, 1)], [(12, 1), (39, 1), (51, 1), (52, 1), (53, 1)], [], [], [(2, 1), (10, 1), (35, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1)], [(37, 1), (63, 1), (64, 1), (65, 1), (66, 1), (67, 1), (68, 1)], [(30, 1), (39, 1), (69, 1), (70, 1), (71, 1), (72, 1)], [(36, 1), (73, 1), (74, 2), (75, 1), (76, 1), (77, 1), (78, 1), (79, 1), (80, 1), (81, 1), (82, 1), (83, 1), (84, 1), (85, 1), (86, 1), (87, 1), (88, 1), (89, 1), (90, 1), (91, 1), (92, 1), (93, 1), (94

In [81]:
pool = mp.Pool(mp.cpu_count())
doc_term_matrix = pool.map(
    dictionary.doc2bow, [sentence for sentence in df.tokens_text]
)
pool.close()

In [82]:
lda_model = LdaMulticore(
    doc_term_matrix, num_topics=k, id2word=dictionary, passes=10, workers=10
)

In [83]:
def assign_topic(lda_model, dictionary, doc):
    vector = lda_model[dictionary.doc2bow(doc)] 
    vector = sorted(vector, key=lambda item: -item[1])
    return vector

In [84]:
df["topics"] = df.apply(lambda row: assign_topic(lda_model, dictionary, row["tokens_text"]), axis=1)
df.head()

Unnamed: 0,text,search_hashtags,tokens_text,topics
0,"2020 is the year we #votethemout, the year we ...",#climatestrike,"[year, year, heart, year, without, liveabl, fu...","[(4, 0.38177186), (9, 0.33451667), (1, 0.23521..."
1,Winter has not stopped this group of dedicated...,#climatestrike,"[winter, stop, group, dedic, climat, activist,...","[(6, 0.45944995), (1, 0.30267453), (8, 0.15705..."
2,WEEK 55 of #ClimateStrike at the @UN. Next wee...,#climatestrike,"[week, next, week, head, year, strike, time, s...","[(1, 0.68943936), (10, 0.2790867)]"
3,"A year of resistance, as youth protests shaped...",#climatestrike,"[year, resist, youth, protest, shape, climat, ...","[(5, 0.92422587)]"
4,HAPPY HOLIDAYS #greta #gretathunberg #climate...,#climatestrike,"[happi, holiday, energi, hous, team]","[(9, 0.3803963), (6, 0.2813667), (2, 0.2170014..."


In [85]:
# Mostrar los términos y sus pesos de un documento
print(list(lda_model[doc_term_matrix[0]]))

# Mostrar los términos más relevantes de los tópicos más relevantes tópico y sus pesos
print(lda_model.print_topics(num_topics=10, num_words=3))

[(1, 0.36525288), (10, 0.58018637)]
[(6, '0.018*"new" + 0.012*"today" + 0.011*"work"'), (8, '0.017*"park" + 0.016*"climat" + 0.013*"look"'), (10, '0.019*"peopl" + 0.014*"chang" + 0.012*"need"'), (1, '0.026*"climat" + 0.018*"today" + 0.010*"nation"'), (3, '0.014*"climat" + 0.012*"chang" + 0.010*"water"'), (4, '0.015*"day" + 0.013*"save" + 0.012*"earth"'), (2, '0.018*"energi" + 0.012*"fuel" + 0.012*"support"'), (5, '0.038*"climat" + 0.022*"chang" + 0.019*"action"'), (9, '0.011*"make" + 0.011*"need" + 0.010*"err"'), (7, '0.013*"et" + 0.013*"pour" + 0.010*"climat"')]


In [86]:
lda_topic_assignment = [max(p,key=lambda item: item[1]) for p in lda_model[corpus]]

In [87]:
lda_topic_assignment

[(4, 0.36958998),
 (6, 0.45939738),
 (1, 0.6894213),
 (5, 0.9242255),
 (9, 0.3804981),
 (10, 0.84846735),
 (0, 0.09090909),
 (0, 0.09090909),
 (1, 0.4730237),
 (10, 0.8862627),
 (5, 0.8701176),
 (5, 0.61776793),
 (0, 0.09090909),
 (10, 0.65713036),
 (10, 0.38224316),
 (5, 0.96361995),
 (6, 0.8989666),
 (1, 0.6436962),
 (5, 0.7589672),
 (8, 0.7095408),
 (5, 0.49191204),
 (9, 0.91734326),
 (1, 0.8715407),
 (0, 0.09090909),
 (0, 0.09090909),
 (9, 0.62718165),
 (9, 0.55446965),
 (4, 0.7726716),
 (5, 0.9350372),
 (2, 0.5490827),
 (4, 0.7001839),
 (5, 0.88055325),
 (5, 0.8863509),
 (1, 0.48850787),
 (1, 0.64790875),
 (1, 0.88634086),
 (5, 0.8863286),
 (0, 0.09090909),
 (0, 0.6960727),
 (4, 0.6660696),
 (5, 0.95214677),
 (5, 0.9090644),
 (2, 0.92420924),
 (10, 0.8989719),
 (2, 0.8700524),
 (5, 0.7671223),
 (4, 0.81816816),
 (8, 0.9173363),
 (2, 0.9545367),
 (0, 0.7504536),
 (1, 0.44514117),
 (0, 0.7178571),
 (5, 0.60098445),
 (1, 0.5306827),
 (10, 0.47979525),
 (0, 0.94316465),
 (1, 0.4949846

In [88]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, doc_term_matrix, dictionary, sort_topics = False)
vis