In [None]:
# importing relevant packages

import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt

import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize, wordpunct_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords


import spacy
import en_core_web_sm
nlp = en_core_web_sm.load()

from sklearn.feature_extraction.text import CountVectorizer

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from sklearn.feature_extraction.text import CountVectorizer
## sentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
## LDA
from gensim import corpora
import gensim

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

#import pyLDAvis.gensim
import pickle 
import pyLDAvis

import matplotlib.colors as mcolors

from collections import Counter




In [None]:
# Load the per-year sentiment CSVs and the natural-disaster records.
df_2018, df_2020, df_2022 = map(
    pd.read_csv,
    (
        '../output/2018_sentiment.csv',
        '../output/2020_sentiment.csv',
        '../output/2022_sentiment.csv',
    ),
)

nat_disas = pd.read_csv("../data/disasters.csv")


In [None]:
# The first four characters of start_date are used as the year label.
nat_disas["year"] = nat_disas["start_date"].str.slice(stop=4)

# Disaster records for the two years being compared, and how many there are.
nat_disas_2018 = nat_disas.loc[nat_disas["year"].eq("2018")]
nat_disas_2020 = nat_disas.loc[nat_disas["year"].eq("2020")]

num_nat_disas_2018 = len(nat_disas_2018)
num_nat_disas_2020 = len(nat_disas_2020)

# Mean of the compound sentiment column for the same two years.
avg_compound_2018 = df_2018["compound_scores"].mean()
avg_compound_2020 = df_2020["compound_scores"].mean()


# Two separate bar charts comparing 2018 with 2020:
#   1. number of natural disasters, 2. mean compound sentiment score.
# Each chart is drawn on its own explicit Axes so nothing depends on pyplot's
# implicit "current figure" state.
year_labels = ["2018", "2020"]
bar_chart_specs = (
    (
        [num_nat_disas_2018, num_nat_disas_2020],
        "Number of Natural Disasters",
        "Number of Natural Disasters Globally by Year",
    ),
    (
        [avg_compound_2018, avg_compound_2020],
        "Average Compound Sentiment Score",
        "Average Sentiment Score by Year",
    ),
)

for bar_heights, y_label, chart_title in bar_chart_specs:
    fig, ax = plt.subplots()
    ax.bar(year_labels, bar_heights, color="teal")
    ax.set_ylabel(y_label)
    ax.set_title(chart_title)
    plt.show()



In [None]:
# Bare last expression: the notebook renders the 2018 table as this cell's output.
df_2018

In [None]:
# English Snowball stemmer (note: despite the variable name, this is not the Porter stemmer class).
porter = SnowballStemmer(language="english")

# Extra words removed from the token stream by processing() before stemming;
# the same list is also handed to WordCloud in the plotting cells.
custom_stopwords = [
    "climate", "change", "warm",
    "https", "global", "will",
    "this", "that", "what",
    "cause", "from", "like",
    "have", "they", "climatechange",
    "with", "about", "more",
    "replied", "tweet", "reply",
]
## text processing function
def processing(text_ex):
    """Clean one document and return it as a one-element list of stemmed text.

    Steps: lowercase, tokenize with ``word_tokenize``, drop tokens listed in
    ``custom_stopwords``, keep only purely alphabetic tokens of at least four
    characters, stem what is left with ``porter`` and join with single spaces.

    Parameters
    ----------
    text_ex : str
        Raw document text. Any non-string value (for example the float NaN
        that pandas uses for an empty CSV cell, or ``None``) is treated as an
        empty document instead of raising ``AttributeError`` on ``.lower()``.

    Returns
    -------
    list of str
        A list holding exactly one string: the space-joined stemmed tokens
        (``[""]`` when nothing survives the filters or the input is missing).
    """
    if not isinstance(text_ex, str):
        # Missing text: keep the row, but as an empty document.
        return [""]

    tokens = word_tokenize(text_ex.lower())  # lowercase
    # Stopwords are matched against the raw (unstemmed) token.
    stemmed_tokens = [
        porter.stem(token)
        for token in tokens
        if token not in custom_stopwords
        and token.isalpha()
        and len(token) >= 4
    ]

    return [" ".join(stemmed_tokens)]

# Add a "processed_text" column (output of processing() per row) to each yearly table.
for yearly_df in (df_2018, df_2022, df_2020):
    yearly_df["processed_text"] = list(map(processing, yearly_df["Embedded_text"]))


In [None]:
### FOR 2018
# Every processed_text entry is a one-element list holding a space-joined
# string; split that string back into a token list for gensim.
tokenized_docs = [processed[0].split() for processed in df_2018["processed_text"]]

# Token/id mapping and the bag-of-words corpus built from it.
dictionary = corpora.Dictionary(tokenized_docs)
corpus = list(map(dictionary.doc2bow, tokenized_docs))

# Four-topic LDA with a fixed random_state; `lda` and `lda_model` are two
# names for the same fitted model.
lda_model = gensim.models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=4, random_state=122)
lda = lda_model

lda


In [None]:

# Word clouds of the highest-weighted keywords of each topic in `lda_model`
# (the 2018 fit from the preceding cell when the notebook runs top to bottom).
cols = list(mcolors.TABLEAU_COLORS.values())  # more colors: 'mcolors.XKCD_COLORS'

# formatted=False yields (topic_id, [(word, weight), ...]) pairs.
topics = lda_model.show_topics(formatted=False)

fig, axes = plt.subplots(2, 2, figsize=(10, 10), sharex=True, sharey=True)

# Hide the frame/ticks on every panel up front so a panel without a topic stays blank.
for ax in axes.flat:
    ax.axis('off')

# zip() stops at the shorter sequence, so fewer than four topics cannot raise IndexError.
for ax, (topic_id, word_weights) in zip(axes.flat, topics):
    # No stopwords/colormap arguments: frequencies are supplied directly and a
    # color_func is given, so WordCloud would not use either of them.
    cloud = WordCloud(background_color='white',
                      width=2500,
                      height=1800,
                      max_words=10,
                      # The default argument binds this topic's colour now rather than
                      # reading a global loop variable when the cloud is drawn.
                      color_func=lambda *args, _color=cols[topic_id % len(cols)], **kwargs: _color,
                      prefer_horizontal=1.0)
    cloud.generate_from_frequencies(dict(word_weights), max_font_size=300)
    ax.imshow(cloud)
    ax.set_title('Topic ' + str(topic_id), fontdict=dict(size=16))

fig.suptitle('Top Keywords per LDA Topic (2018)', fontsize=18)
# Reserve a strip at the top so the figure title does not overlap the panel titles.
fig.tight_layout(rect=(0, 0, 1, 0.95))
plt.show() ## 2018 PLOT!!

In [None]:
### FOR 2020
# Every processed_text entry is a one-element list holding a space-joined
# string; split that string back into a token list for gensim.
tokenized_docs = [processed[0].split() for processed in df_2020["processed_text"]]

# Token/id mapping and the bag-of-words corpus built from it.
dictionary = corpora.Dictionary(tokenized_docs)
corpus = list(map(dictionary.doc2bow, tokenized_docs))

# Four-topic LDA with a fixed random_state; `lda` and `lda_model` are two
# names for the same fitted model.
lda_model = gensim.models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=4, random_state=122)
lda = lda_model

lda


In [None]:
# Word clouds of the highest-weighted keywords of each topic in `lda_model`
# (the 2020 fit from the preceding cell when the notebook runs top to bottom).
cols = list(mcolors.TABLEAU_COLORS.values())  # more colors: 'mcolors.XKCD_COLORS'

# formatted=False yields (topic_id, [(word, weight), ...]) pairs.
topics = lda_model.show_topics(formatted=False)

fig, axes = plt.subplots(2, 2, figsize=(10, 10), sharex=True, sharey=True)

# Hide the frame/ticks on every panel up front so a panel without a topic stays blank.
for ax in axes.flat:
    ax.axis('off')

# zip() stops at the shorter sequence, so fewer than four topics cannot raise IndexError.
for ax, (topic_id, word_weights) in zip(axes.flat, topics):
    # No stopwords/colormap arguments: frequencies are supplied directly and a
    # color_func is given, so WordCloud would not use either of them.
    cloud = WordCloud(background_color='white',
                      width=2500,
                      height=1800,
                      max_words=10,
                      # The default argument binds this topic's colour now rather than
                      # reading a global loop variable when the cloud is drawn.
                      color_func=lambda *args, _color=cols[topic_id % len(cols)], **kwargs: _color,
                      prefer_horizontal=1.0)
    cloud.generate_from_frequencies(dict(word_weights), max_font_size=300)
    ax.imshow(cloud)
    ax.set_title('Topic ' + str(topic_id), fontdict=dict(size=16))

fig.suptitle('Top Keywords per LDA Topic (2020)', fontsize=18)
# Reserve a strip at the top so the figure title does not overlap the panel titles.
fig.tight_layout(rect=(0, 0, 1, 0.95))
plt.show() ## 2020 PLOT!!

In [None]:
### FOR 2022
# Every processed_text entry is a one-element list holding a space-joined
# string; split that string back into a token list for gensim.
tokenized_docs = [processed[0].split() for processed in df_2022["processed_text"]]

# Token/id mapping and the bag-of-words corpus built from it.
dictionary = corpora.Dictionary(tokenized_docs)
corpus = list(map(dictionary.doc2bow, tokenized_docs))

# Four-topic LDA with a fixed random_state; `lda` and `lda_model` are two
# names for the same fitted model.
lda_model = gensim.models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=4, random_state=122)
lda = lda_model

lda


In [None]:
# Word clouds of the highest-weighted keywords of each topic in `lda_model`
# (the 2022 fit from the preceding cell when the notebook runs top to bottom).
cols = list(mcolors.TABLEAU_COLORS.values())  # more colors: 'mcolors.XKCD_COLORS'

# formatted=False yields (topic_id, [(word, weight), ...]) pairs.
topics = lda_model.show_topics(formatted=False)

fig, axes = plt.subplots(2, 2, figsize=(10, 10), sharex=True, sharey=True)

# Hide the frame/ticks on every panel up front so a panel without a topic stays blank.
for ax in axes.flat:
    ax.axis('off')

# zip() stops at the shorter sequence, so fewer than four topics cannot raise IndexError.
for ax, (topic_id, word_weights) in zip(axes.flat, topics):
    # No stopwords/colormap arguments: frequencies are supplied directly and a
    # color_func is given, so WordCloud would not use either of them.
    cloud = WordCloud(background_color='white',
                      width=2500,
                      height=1800,
                      max_words=10,
                      # The default argument binds this topic's colour now rather than
                      # reading a global loop variable when the cloud is drawn.
                      color_func=lambda *args, _color=cols[topic_id % len(cols)], **kwargs: _color,
                      prefer_horizontal=1.0)
    cloud.generate_from_frequencies(dict(word_weights), max_font_size=300)
    ax.imshow(cloud)
    ax.set_title('Topic ' + str(topic_id), fontdict=dict(size=16))

fig.suptitle('Top Keywords per LDA Topic (2022)', fontsize=18)
# Reserve a strip at the top so the figure title does not overlap the panel titles.
fig.tight_layout(rect=(0, 0, 1, 0.95))
plt.show() ## 2022 PLOT!!

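Citations:
1. The per-topic word-cloud plotting approach is adapted from the Machine Learning Plus tutorial section "Word Clouds of Top N Keywords in Each Topic": https://www.machinelearningplus.com/nlp/topic-modeling-visualization-how-to-present-results-lda-models/#9.-Word-Clouds-of-Top-N-Keywords-in-Each-Topic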
