In [1]:
import os 
os.chdir("C:\\Users\\Pieter-Jan\\Documents\\Work\\Candriam\\nlp\\ESG\\topic_sentiment_analysis")

In [2]:
# dataset
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction import text

# data manipulations
import random
import pandas as pd
import numpy as np
import re
from scipy import sparse

# CMT
from contextualized_topic_models.models.ctm import CTM
from contextualized_topic_models.utils.data_preparation import bert_embeddings_from_file, TextHandler
from contextualized_topic_models.datasets.dataset import CTMDataset
from contextualized_topic_models.evaluation.measures import CoherenceNPMI, InvertedRBO, CoherenceWordEmbeddings
from contextualized_topic_models.utils.preprocessing import SimplePreprocessing

# lDA
from gensim.corpora.dictionary import Dictionary
from gensim.test.utils import common_texts
from gensim.models import LdaMulticore 
import pyLDAvis.gensim
from gensim import corpora, matutils, models, similarities
import pyLDAvis

# BerTopic
from bertopic import BERTopic

# other
from tqdm import tqdm

# own modules
import modules.preprocessing as preprocess

In [3]:
%reload_ext autoreload
%autoreload 2

# Goal

We are going to compare the performance of three **unsupervised** models for topic modelling on ESG documents.

1. Contextualized Topic Modelling (CTM): https://github.com/MilaNLProc/contextualized-topic-models
2. Latent Dirichlet Allocation (LDA): https://radimrehurek.com/gensim/models/ldamodel.html
3. BERTopic: https://github.com/MaartenGr/BERTopic

# Evaluation measures 

1. **Normalized Point-wise Mutual Information (NPMI) (Lau et al.,
2014)**

It measures how much the top-10 words of a topic are related to each other, considering the empirical frequency of the words computed on the
original corpus. τ is a symbolic metric and relies on co-occurrence.

2. **External Word Embeddings Topic Coherence**

As Ding et al. (2018) pointed out, though, topic
coherence computed on the same data is inherently
limited. Coherence computed on an external corpus, on the other hand, correlates much more to
human judgment, but it may be expensive to estimate. Thus, our second metric is an external
word embeddings topic coherence metric, which we compute by adopting a strategy similar to that
described in Ding et al. (2018). First, we compute
the average pairwise cosine similarity of the word
embeddings of the top-10 words in a topic using (Mikolov et al., 2013) embeddings. Then, we
compute the overall average of those values for all
the topics (α).

3. **rank-
biased overlap (RBO) (Webber et al., 2010)**

To evaluate how diverse the topics
generated by a single model are, we use the rank-
biased overlap (RBO) (Webber et al., 2010). RBO
compares two topics of the same model. The key
qualities of this measure are twofold: it allows
disjointedness between the lists of topics (i.e., two
topics can have different words in them) and it is
weighted on the ranking (i.e., two lists that share
some of the same words, albeit at different rankings,
are penalized less than two lists that share the same
words at the highest ranks). We deﬁne ρ as the rank-
biased overlap diversity, that we interpret as the
reciprocal of the standard RBO. ρ is 0 for identical
topics and 1 for completely different topics. Both
metrics are computed on the top-k ranked lists.
Following the state-of-the-art, we consider k = 10.

# Functions

In [50]:

def load_preprocess(data, dir_file, stop_words, nlp, update=False):
    """
    reads in processed data if exists
    """

    # read file if it exits
    if os.path.isfile(dir_file) and not update:
        df = pd.read_csv(dir_file, sep='\t', header=None)
        data["paragraph_cleaned"] = df.values

    else:
        # progress bar
        tqdm.pandas()
        # perfrom some cleaning (stopwords, lemmatize)
        data["paragraph_cleaned"] = data['paragraph_raw'].progress_apply(preprocess.lemmatize, nlp=nlp, stop_words=stop_words, method=1)
        # save raw and cleaned text
        data["paragraph_raw"].to_csv('output/newsgroup_raw.txt', sep='\t', index=False, header=False)
        data["paragraph_cleaned"].to_csv(dir_file, sep='\t', index=False, header=False)
    
    return data

In [5]:
# global variable used throughout te notebooextract_text_from_pdf_url
UPDATE = True

# Load data and cleaning

In [6]:
nr_subsamples = 10000
dataset = fetch_20newsgroups(
    shuffle=True,
    random_state=32,
    remove=('headers', 'footers', 'qutes')
    )["data"]

In [7]:
# Remove Emails
data = [re.sub(r"\S*@\S*\s?", '', sent) for sent in dataset]

# remove special characters
data = [re.sub(r'[^A-Za-z]+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]

In [8]:
data_df = pd.DataFrame(data, columns=['paragraph_raw'])
data_df.shape

(11314, 1)

In [9]:
# our list contains all english stop words 
stop_words = text.ENGLISH_STOP_WORDS

In [11]:
# load spacy model to lematize text
nlp = preprocess.load_spacy_model()

✔ Download and installation successful
You can now load the model via spacy.load('en_core_web_sm')


In [12]:
data_df = load_preprocess(data=data_df, dir_file='output/newsgroup_cleaned.txt', stop_words=stop_words, nlp=nlp, update=UPDATE)

100%|██████████| 11314/11314 [08:52<00:00, 21.26it/s]


In [13]:
print(data_df.shape)
data_df.head()

(11314, 2)


Unnamed: 0,paragraph_raw,paragraph_cleaned
0,The real question here in my opinion is what M...,real question opinion motorola processor run c...
1,Please could someone in the US give me the cur...,current street price following relevant taxis ...
2,Can somebody please help me with information a...,somebody help information american magnetics c...
3,In article Pat Myrto writes I am sick dismayed...,article myrto write sick dismay discouraged as...
4,From article by John R Daker Cup holders drivi...,article john daker holder drive importantant u...


## 1. Contextualized Topic Modelling

## 1.1 Prepare data for model

In [14]:
# use cleaned embeddings (lemmatized + stopwords removed)
# handler = TextHandler('output/newsgroup_cleaned.txt')
# handler.prepare()
# handler.bow

In [15]:
with open('output/newsgroup_cleaned.txt',"r") as fr:
    text_cleaned = [doc.split() for doc in fr.read().splitlines()] # load text for NPMI

dictionary = Dictionary(text_cleaned)

'''
Remove very rare and very common words:

- words appearing less than 15 times
- words appearing in more than 10% of all documents
'''

dictionary.filter_extremes(no_below=15, no_above=0.10, keep_n= 100000)
print(len(dictionary.token2id))

2020-11-17 17:19:59.128 INFO    gensim.corpora.dictionary: adding document #0 to Dictionary(0 unique tokens: [])
2020-11-17 17:20:01.224 INFO    gensim.corpora.dictionary: adding document #10000 to Dictionary(51580 unique tokens: ['benchmark', 'comparable', 'conversation', 'datum', 'david']...)
2020-11-17 17:20:01.478 INFO    gensim.corpora.dictionary: built Dictionary(54503 unique tokens: ['benchmark', 'comparable', 'conversation', 'datum', 'david']...) from 11314 documents (total 1129305 corpus positions)
2020-11-17 17:20:01.588 INFO    gensim.corpora.dictionary: discarding 48459 tokens: [('know', 3635), ('question', 1486), ('refund', 13), ('cannon', 13), ('carson', 13), ('charger', 14), ('frode', 2), ('help', 1315), ('like', 3392), ('magnetics', 1)]...
2020-11-17 17:20:01.590 INFO    gensim.corpora.dictionary: keeping 6044 tokens which were in no less than 15 and no more than 1131 (=10.0%) documents
2020-11-17 17:20:01.633 INFO    gensim.corpora.dictionary: resulting dictionary: Dic

In [16]:
bow_corpus = [dictionary.doc2bow(doc) for doc in text_cleaned]

In [17]:
tf_array = matutils.corpus2dense(bow_corpus, num_terms=len(dictionary.token2id)).T
tf_array.shape

(11314, 6044)

In [None]:
# convert to sparse matrix
tf_array_sparse = sparse.csr_matrix(tf_array)
tf_array_sparse

In [18]:
# create or load bert embeddings (either use raw text or clean text)
# we can expirment with both
embeddings_bert = preprocess.load_bert_embeddings(
        text_dir="output/newsgroup_raw.txt", 
        model="distiluse-base-multilingual-cased",
        dir_embeddings="output/newsgroup_bert_Embeddings_raw.npy",
        update=UPDATE
        )

2020-11-17 17:20:05.625 INFO    root: Load pretrained SentenceTransformer: distiluse-base-multilingual-cased
2020-11-17 17:20:05.628 INFO    root: Did not find folder distiluse-base-multilingual-cased. Assume to download model from server.
2020-11-17 17:20:05.669 INFO    root: Load SentenceTransformer from folder: C:\Users\Pieter-Jan/.cache\torch\sentence_transformers\sbert.net_models_distiluse-base-multilingual-cased
2020-11-17 17:20:10.014 INFO    root: Use pytorch device: cpu
Batches: 100%|██████████| 57/57 [29:55<00:00, 31.49s/it]


In [19]:
embeddings_bert.shape

(11314, 512)

In [21]:
# ivert dictionary
inv_token2id = {v: k for k, v in dictionary.token2id.items()}
# create dataset
training_dataset = CTMDataset(tf_array_sparse, embeddings_bert, inv_token2id)

## 1.2 Train model

In [22]:
random.seed(69)
ctm = CTM(
    input_size=len(dictionary.token2id), 
    bert_input_size=512, 
    n_components=20, 
    inference_type="combined", 
    num_epochs=10,
    reduce_on_plateau=True
    )
ctm.fit(training_dataset) # run model

Settings: 
                   N Components: 20
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.95
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: True
                   Save Dir: None
Epoch: [1/10]	Samples: [11314/113140]	Train Loss: 644.7722137327095	Time: 0:00:38.026307
Epoch: [2/10]	Samples: [22628/113140]	Train Loss: 617.8858217694891	Time: 0:00:39.461052
Epoch: [3/10]	Samples: [33942/113140]	Train Loss: 607.5998524366935	Time: 0:00:38.177199
Epoch: [4/10]	Samples: [45256/113140]	Train Loss: 601.2149657987338	Time: 0:00:39.296127
Epoch: [5/10]	Samples: [56570/113140]	Train Loss: 596.8946186002961	Time: 0:00:40.052483
Epoch: [6/10]	Samples: [67884/113140]	Train Loss: 594.3946885150588	Time: 0:0

## 1.3 Evaluate topics

In [23]:
cmt_topics_l = ctm.get_topic_lists(10)
cmt_topics_d = {}
for i in range(len(cmt_topics_l)):
    cmt_topics_d[i] = cmt_topics_l[i]

In [24]:
pd.DataFrame.from_dict(cmt_topics_d).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,christian,life,religion,bible,claim,jesus,truth,science,true,church
1,government,state,turkish,armenian,genocide,homicide,ottoman,force,turkey,firearm
2,file,program,line,include,number,output,window,base,follow,entry
3,space,chip,shuttle,datum,computer,software,orbit,nasa,mail,disk
4,rapidly,satisfy,divide,unique,commonly,capable,transport,secrecy,virtually,initial
5,game,team,player,play,hockey,season,score,league,second,goal
6,bike,car,ride,wheel,engine,mile,road,rear,honda,dealer
7,motherboard,speaker,appreciate,monitor,pin,audio,gateway,brand,video,board
8,censorship,internally,electronically,sheep,specialized,invalidate,accessible,subsequently,dramatic,satisfy
9,satisfy,lawful,swiss,participant,academy,rapidly,accessible,sysadmin,strictly,closely


### 1.3.1 Normalized Point-wise Mutual Information

In [25]:
npmi = CoherenceNPMI(texts=text_cleaned, topics=ctm.get_topic_lists(10))
npmi.score()

tual)
2020-11-17 17:57:00.388 INFO    gensim.topic_coherence.text_analysis: 26 batches submitted to accumulate stats from 1664 documents (165845 virtual)
2020-11-17 17:57:00.475 INFO    gensim.topic_coherence.text_analysis: 27 batches submitted to accumulate stats from 1728 documents (176791 virtual)
2020-11-17 17:57:00.495 INFO    gensim.topic_coherence.text_analysis: 28 batches submitted to accumulate stats from 1792 documents (182474 virtual)
2020-11-17 17:57:00.544 INFO    gensim.topic_coherence.text_analysis: 29 batches submitted to accumulate stats from 1856 documents (192673 virtual)
2020-11-17 17:57:00.615 INFO    gensim.topic_coherence.text_analysis: 30 batches submitted to accumulate stats from 1920 documents (197668 virtual)
2020-11-17 17:57:00.659 INFO    gensim.topic_coherence.text_analysis: 31 batches submitted to accumulate stats from 1984 documents (205573 virtual)
2020-11-17 17:57:00.721 INFO    gensim.topic_coherence.text_analysis: 32 batches submitted to accumulate s

-0.07338773817323598

### 1.3.2 External Word Embeddings Topic Coherence

In [26]:
CoherenceWordEmbeddings(ctm.get_topic_lists(10)).score()

2020-11-17 17:57:09.736 INFO    gensim.models.utils_any2vec: loading projection weights from C:\Users\Pieter-Jan/gensim-data\word2vec-google-news-300\word2vec-google-news-300.gz
2020-11-17 17:58:24.303 INFO    gensim.models.utils_any2vec: loaded (3000000, 300) matrix from C:\Users\Pieter-Jan/gensim-data\word2vec-google-news-300\word2vec-google-news-300.gz


0.15084322

### 1.3.3 Rank-Biased Overlap 

In [27]:
InvertedRBO(ctm.get_topic_lists(10)).score()

0.9859468748523684

## 2. Latent Dirichlet Allocation

## 2.1 Train model

In [28]:
num_topics = 20
lda_model =  LdaMulticore(
    corpus=bow_corpus, 
    num_topics = num_topics, 
    id2word = dictionary,                                    
    passes = 10,
    workers = 2
    )

at" + 0.009*"line" + 0.009*"output" + 0.009*"send" + 0.008*"information" + 0.008*"available"
2020-11-17 17:59:46.400 INFO    gensim.models.ldamodel: topic #2 (0.050): 0.012*"science" + 0.012*"exist" + 0.009*"evidence" + 0.008*"book" + 0.007*"theory" + 0.007*"atheist" + 0.007*"atheism" + 0.007*"existence" + 0.006*"study" + 0.006*"universe"
2020-11-17 17:59:46.402 INFO    gensim.models.ldamodel: topic #14 (0.050): 0.015*"president" + 0.010*"water" + 0.007*"job" + 0.005*"homosexual" + 0.005*"talk" + 0.005*"number" + 0.004*"package" + 0.004*"consider" + 0.004*"group" + 0.004*"state"
2020-11-17 17:59:46.404 INFO    gensim.models.ldamodel: topic #9 (0.050): 0.007*"weapon" + 0.006*"gun" + 0.006*"car" + 0.006*"drive" + 0.005*"engine" + 0.005*"firearm" + 0.005*"cost" + 0.004*"high" + 0.004*"light" + 0.004*"control"
2020-11-17 17:59:46.407 INFO    gensim.models.ldamodel: topic #7 (0.050): 0.014*"printer" + 0.014*"book" + 0.011*"instruction" + 0.009*"power" + 0.009*"price" + 0.008*"battery" + 0.0

## 2.2 Evaluate topics

In [29]:
lda_topics_d = {}
lda_topics_l = []
for i in range(num_topics):
    t = [w[0] for w in lda_model.show_topic(i)[0:10]]
    lda_topics_d[i+1] = t
    lda_topics_l.append(t)

In [30]:
# show topics
# evert row is a topic
pd.DataFrame.from_dict(lda_topics_d).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
1,space,launch,nasa,satellite,orbit,program,center,earth,mission,health
2,church,ripem,catholic,pope,authority,code,group,patent,message,public
3,exist,science,evidence,atheist,theory,atheism,argument,example,reason,book
4,moral,objective,morality,animal,book,natural,human,claim,keith,speed
5,government,public,state,clinton,secret,police,american,attack,number,court
6,jesus,christian,bible,life,christ,word,faith,love,death,world
7,window,display,application,widget,server,motif,program,include,code,version
8,book,printer,power,battery,instruction,price,circuit,input,design,print
9,armenian,turkish,armenians,child,kill,government,turkey,woman,armenia,greek
10,weapon,car,drive,gun,firearm,engine,high,rate,speed,auto


### 2.2.1 Normalized Point-wise Mutual Information

In [31]:
npmi = CoherenceNPMI(texts=text_cleaned, topics=lda_topics_l)
npmi.score()

18:00:31.208 INFO    gensim.topic_coherence.text_analysis: 34 batches submitted to accumulate stats from 2176 documents (213827 virtual)
2020-11-17 18:00:31.229 INFO    gensim.topic_coherence.text_analysis: 35 batches submitted to accumulate stats from 2240 documents (220191 virtual)
2020-11-17 18:00:31.282 INFO    gensim.topic_coherence.text_analysis: 36 batches submitted to accumulate stats from 2304 documents (227051 virtual)
2020-11-17 18:00:31.407 INFO    gensim.topic_coherence.text_analysis: 37 batches submitted to accumulate stats from 2368 documents (231646 virtual)
2020-11-17 18:00:31.509 INFO    gensim.topic_coherence.text_analysis: 38 batches submitted to accumulate stats from 2432 documents (236039 virtual)
2020-11-17 18:00:31.557 INFO    gensim.topic_coherence.text_analysis: 39 batches submitted to accumulate stats from 2496 documents (241718 virtual)
2020-11-17 18:00:31.570 INFO    gensim.topic_coherence.text_analysis: 40 batches submitted to accumulate stats from 2560 do

0.09898559684649942

### 2.2.2 External Word Embeddings Topic Coherence

In [32]:
CoherenceWordEmbeddings(lda_topics_l).score()

2020-11-17 18:00:41.526 INFO    gensim.models.utils_any2vec: loading projection weights from C:\Users\Pieter-Jan/gensim-data\word2vec-google-news-300\word2vec-google-news-300.gz
2020-11-17 18:01:55.661 INFO    gensim.models.utils_any2vec: loaded (3000000, 300) matrix from C:\Users\Pieter-Jan/gensim-data\word2vec-google-news-300\word2vec-google-news-300.gz


0.1608593

### 2.2.3 Rank-Biased Overlap 

In [33]:
InvertedRBO(lda_topics_l).score()

0.991251863478609

In [34]:
# Visualize the topics
# pyLDAvis.enable_notebook()
# LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
# LDAvis_prepared

## 3. Bert Topic

## 3.1 Train model

In [35]:
random.seed(69)

with open('output/newsgroup_cleaned.txt',"r") as fr:
    docs = [doc for doc in fr.read().splitlines()] 

model = BERTopic(verbose=True)
topics = model.fit_transform(docs, embeddings_bert)

2020-11-17 18:02:35,173 - BERTopic - Reduced dimensionality with UMAP
2020-11-17 18:02:35.173 INFO    BERTopic: Reduced dimensionality with UMAP
2020-11-17 18:02:38,756 - BERTopic - Clustered UMAP embeddings with HDBSCAN
2020-11-17 18:02:38.756 INFO    BERTopic: Clustered UMAP embeddings with HDBSCAN


## 3.2 Evaluate topics

In [36]:
topcis_b = model.get_topics()

In [37]:
# extract words for each topic
topics_k = {}
for k,v in topcis_b.items():
    t_words = []
    for w in v:
        t_words.append(w[0])
    # append the first 10 words
    topics_k[k] = t_words[0:10]

In [38]:
topics_bert = list(topics_k.values())
pd.DataFrame.from_dict(topics_k).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
-1,president,stephanopoulos,say,believe,people,job,go,health,state,think
0,team,game,play,player,season,hockey,score,league,year,baseball
1,jewish,lowenstein,koufax,stankowitz,sandy,hank,berg,lame,baseball,rack
2,test,inguiry,woof,subscribe,hello,critus,hdfd,thud,bilinsky,melittin
3,bike,ride,car,motorcycle,engine,road,brake,tire,helmet,drive
4,nrhj,gizw,wwiz,bxom,bhjn,maxbyte,tbxn,nriz,byte,pnei
5,window,file,program,space,drive,card,use,image,software,thank
6,drug,cocaine,alcohol,marijuana,legalization,heroin,cigarette,kid,legalize,legal
7,food,patient,disease,doctor,medical,treatment,health,pain,cause,diet
8,armenian,turkish,armenians,turkey,armenia,turk,greek,genocide,azerbaijan,say


### 3.2.1 Normalized Point-wise Mutual Information

In [39]:
npmi = CoherenceNPMI(texts=text_cleaned, topics=topics_bert)
npmi.score()

17 18:02:53.759 INFO    gensim.topic_coherence.text_analysis: 32 batches submitted to accumulate stats from 2048 documents (205250 virtual)
2020-11-17 18:02:53.810 INFO    gensim.topic_coherence.text_analysis: 33 batches submitted to accumulate stats from 2112 documents (209999 virtual)
2020-11-17 18:02:53.921 INFO    gensim.topic_coherence.text_analysis: 34 batches submitted to accumulate stats from 2176 documents (214986 virtual)
2020-11-17 18:02:53.931 INFO    gensim.topic_coherence.text_analysis: 35 batches submitted to accumulate stats from 2240 documents (224481 virtual)
2020-11-17 18:02:53.949 INFO    gensim.topic_coherence.text_analysis: 36 batches submitted to accumulate stats from 2304 documents (228234 virtual)
2020-11-17 18:02:54.077 INFO    gensim.topic_coherence.text_analysis: 37 batches submitted to accumulate stats from 2368 documents (232614 virtual)
2020-11-17 18:02:54.130 INFO    gensim.topic_coherence.text_analysis: 38 batches submitted to accumulate stats from 2432

0.1365956953251772

### 3.2.2 External Word Embeddings Topic Coherence

In [48]:
CoherenceWordEmbeddings(topics_bert).score()

2020-11-17 18:19:56.796 INFO    gensim.models.utils_any2vec: loading projection weights from C:\Users\Pieter-Jan/gensim-data\word2vec-google-news-300\word2vec-google-news-300.gz
2020-11-17 18:20:44.772 INFO    gensim.models.utils_any2vec: loaded (3000000, 300) matrix from C:\Users\Pieter-Jan/gensim-data\word2vec-google-news-300\word2vec-google-news-300.gz


nan

### 3.2.3 Rank-Biased Overlap 

In [41]:
InvertedRBO(topics_bert).score()

0.9969902925952691