In [1]:
from gensim.models.ldamulticore import LdaMulticore
import multiprocessing as mp
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.model_selection import train_test_split
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score, classification_report
from gensim.corpora import Dictionary
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
from sklearn.ensemble import RandomForestClassifier
import re

In [2]:
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("words")
nltk.download("wordnet")
nltk.download("omw-1.4")

[nltk_data] Downloading package punkt to /home/david/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/david/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to /home/david/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to /home/david/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/david/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
stop_words_nltk_en = set(stopwords.words("english"))
stop_words_nltk_es = set(stopwords.words("spanish"))

In [4]:
tk = TweetTokenizer()
porter = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()

In [5]:
def textprep(line):
    tokens = tk.tokenize(str(line))
    tokens = [w.lower() for w in tokens if len(w) > 1]
    tokens = [w for w in tokens if w.isalpha()]
    tokens = [w for w in tokens if w not in stop_words_nltk_en]
    tokens = [w for w in tokens if w not in stop_words_nltk_es]
    tokens = [wordnet_lemmatizer.lemmatize(w) for w in tokens]
    tokens = [porter.stem(w) for w in tokens]
    return tokens

In [6]:
path_in = "twitterClimateData.csv"
df = pd.read_csv(path_in, delimiter=";")
df = df[["text", "search_hashtags"]]

In [7]:
topics = df["search_hashtags"].unique()
k = len(topics)
print("Topics", topics)
print(f"Number of topics: {k}")

Topics ['#climatestrike' '#climatechange' '#greennewdeal' '#climatecrisis'
 '#climateaction' '#fridaysforfuture' '#environment' '#globalwarming'
 '#actonclimate' '#sustainability' '#savetheplanet' '#bushfires']
Number of topics: 12


We identify common words in the topics of the tweets, so for that reason we reduce `k` to 11


In [8]:
k -= 1

In [9]:
df["tokens_text"] = df.apply(lambda row: textprep(row["text"]), axis=1)

In [10]:
dictionary = Dictionary(df.tokens_text)
corpus = [dictionary.doc2bow(doc) for doc in df.tokens_text]

In [11]:
pool = mp.Pool(mp.cpu_count())
doc_term_matrix = pool.map(
    dictionary.doc2bow, [sentence for sentence in df.tokens_text]
)
pool.close()

In [12]:
lda_model = LdaMulticore(
    doc_term_matrix, num_topics=k, id2word=dictionary, passes=10, workers=10
)

In [13]:
def assign_topic(lda_model, dictionary, doc):
    vector = lda_model[dictionary.doc2bow(doc)]
    vector = sorted(vector, key=lambda item: -item[1])
    return vector

In [14]:
df["topics_vectors"] = df.apply(
    lambda row: assign_topic(lda_model, dictionary, row["tokens_text"]), axis=1
)
df.head()

Unnamed: 0,text,search_hashtags,tokens_text,topics_vectors
0,"2020 is the year we #votethemout, the year we ...",#climatestrike,"[year, year, heart, year, without, liveabl, fu...","[(3, 0.7157052), (7, 0.22973566)]"
1,Winter has not stopped this group of dedicated...,#climatestrike,"[winter, stop, group, dedic, climat, activist,...","[(8, 0.39006627), (5, 0.31269786), (9, 0.21640..."
2,WEEK 55 of #ClimateStrike at the @UN. Next wee...,#climatestrike,"[week, next, week, head, year, strike, time, s...","[(3, 0.59585154), (7, 0.28008366), (9, 0.09608..."
3,"A year of resistance, as youth protests shaped...",#climatestrike,"[year, resist, youth, protest, shape, climat, ...","[(4, 0.9242297)]"
4,HAPPY HOLIDAYS #greta #gretathunberg #climate...,#climatestrike,"[happi, holiday, energi, hous, team]","[(6, 0.84846497), (2, 0.015157661), (1, 0.0151..."


In [15]:
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, doc_term_matrix, dictionary, sort_topics = False)
vis

In [16]:
topics = lda_model.print_topics(num_topics=11, num_words=2)
topics_dictionary = {}
for topic in topics:
    topics_dictionary[topic[0]] = topic[1]
print(topics_dictionary)

{0: '0.016*"pour" + 0.015*"et"', 1: '0.009*"use" + 0.009*"issu"', 2: '0.015*"new" + 0.012*"nation"', 3: '0.023*"climat" + 0.020*"peopl"', 4: '0.023*"climat" + 0.019*"chang"', 5: '0.033*"climat" + 0.015*"chang"', 6: '0.016*"capitol" + 0.014*"u"', 7: '0.023*"climat" + 0.014*"chang"', 8: '0.037*"citi" + 0.023*"new"', 9: '0.015*"chang" + 0.015*"need"', 10: '0.013*"fuel" + 0.012*"err"'}


In [17]:
def vector_to_topic(vector):
    topic_key = vector[0][0]
    input_string = topics_dictionary[topic_key]
    words = re.findall(r'"([^"]*)"', input_string)
    return " ".join(words)

In [18]:
df["topics"] = df.apply(lambda row: vector_to_topic(row["topics_vectors"]), axis=1)

The topics gotten from LDA are:

In [19]:
unique_topics_lda = df["topics"].unique()
print(unique_topics_lda)

['climat peopl' 'citi new' 'climat chang' 'capitol u' 'chang need'
 'pour et' 'fuel err' 'new nation' 'use issu']


Now we are going to train the model with the new topics gotten from the LDA

In [20]:
df = df.sample(10000)
count_vec = CountVectorizer()
bow = count_vec.fit_transform(df["text"])
bow = np.array(bow.todense())
X = bow
y = df.topics
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)

# Multinomial Naive Bayes
nb_clf = MultinomialNB().fit(X_train, y_train)
y_pred_nb = nb_clf.predict(X_test)

# Logistic Regression
logreg = LogisticRegression().fit(X_train, y_train)
y_pred_logreg = logreg.predict(X_test)

# Decision Trees
tree_clf = DecisionTreeClassifier(max_depth = 3).fit(X_train, y_train)
y_pred_tree = tree_clf.predict(X_test)

# Random Forests
rf_clf = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=0).fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [21]:
print("Metrics for Naive Bayes Clasifier")
print(f"Accuracy: {accuracy_score(y_test, y_pred_nb)}")
print(f"F1 score: {f1_score(y_test, y_pred_nb, average='macro')}")
print(classification_report(y_test, y_pred_nb))

Metrics for Naive Bayes Clasifier
Accuracy: 0.413
F1 score: 0.3319585156975294
              precision    recall  f1-score   support

   capitol u       0.77      0.09      0.16       195
  chang need       0.49      0.39      0.43       353
    citi new       0.63      0.09      0.16       182
climat chang       0.32      0.88      0.47       727
climat peopl       0.60      0.26      0.36       348
    fuel err       0.59      0.09      0.15       215
  new nation       0.58      0.33      0.42       300
     pour et       0.67      0.39      0.50       399
    use issu       0.62      0.23      0.34       281

    accuracy                           0.41      3000
   macro avg       0.59      0.30      0.33      3000
weighted avg       0.54      0.41      0.38      3000



In [22]:
print("Metrics for Logistic Regression")
print(f"Accuracy: {accuracy_score(y_test, y_pred_logreg)}")
print(f"F1 score: {f1_score(y_test, y_pred_logreg, average='macro')}")
print(classification_report(y_test, y_pred_logreg))

Metrics for Logistic Regression
Accuracy: 0.577
F1 score: 0.5551437095478718
              precision    recall  f1-score   support

   capitol u       0.71      0.43      0.54       195
  chang need       0.58      0.56      0.57       353
    citi new       0.68      0.43      0.53       182
climat chang       0.56      0.66      0.60       727
climat peopl       0.58      0.56      0.57       348
    fuel err       0.58      0.40      0.48       215
  new nation       0.56      0.52      0.54       300
     pour et       0.57      0.81      0.67       399
    use issu       0.54      0.47      0.50       281

    accuracy                           0.58      3000
   macro avg       0.60      0.54      0.56      3000
weighted avg       0.58      0.58      0.57      3000



In [23]:
print("Metrics for Decision Trees")
print(f"Accuracy: {accuracy_score(y_test, y_pred_tree)}")
print(f"F1 score: {f1_score(y_test, y_pred_tree, average='macro')}")
print(classification_report(y_test, y_pred_tree))

Metrics for Decision Trees
Accuracy: 0.2896666666666667


F1 score: 0.12106183160452913
              precision    recall  f1-score   support

   capitol u       0.00      0.00      0.00       195
  chang need       1.00      0.02      0.04       353
    citi new       0.00      0.00      0.00       182
climat chang       0.27      0.68      0.38       727
climat peopl       0.00      0.00      0.00       348
    fuel err       0.00      0.00      0.00       215
  new nation       0.30      0.16      0.21       300
     pour et       0.32      0.81      0.46       399
    use issu       0.00      0.00      0.00       281

    accuracy                           0.29      3000
   macro avg       0.21      0.18      0.12      3000
weighted avg       0.26      0.29      0.18      3000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [24]:
print("Metrics for Random Forests")
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf)}")
print(f"F1 score: {f1_score(y_test, y_pred_rf, average='macro')}")
print(classification_report(y_test, y_pred_rf))

Metrics for Random Forests
Accuracy: 0.24333333333333335


F1 score: 0.045040639979479616
              precision    recall  f1-score   support

   capitol u       0.00      0.00      0.00       195
  chang need       0.00      0.00      0.00       353
    citi new       0.00      0.00      0.00       182
climat chang       0.24      1.00      0.39       727
climat peopl       0.00      0.00      0.00       348
    fuel err       0.00      0.00      0.00       215
  new nation       0.00      0.00      0.00       300
     pour et       1.00      0.01      0.01       399
    use issu       0.00      0.00      0.00       281

    accuracy                           0.24      3000
   macro avg       0.14      0.11      0.05      3000
weighted avg       0.19      0.24      0.10      3000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
