In [1]:
import os
directory = "C:\\Users\\Pieter-Jan\\Documents\\Work\\Candriam\\nlp\\ESG\\topic_sentiment_analysis"
os.chdir(directory)

In [2]:
# data manipulations
import random
import pandas as pd
import numpy as np
import re
from scipy import sparse

# CMT
from contextualized_topic_models.models.ctm import CTM
from contextualized_topic_models.utils.data_preparation import bert_embeddings_from_file, TextHandler
from contextualized_topic_models.datasets.dataset import CTMDataset
from contextualized_topic_models.evaluation.measures import CoherenceNPMI, InvertedRBO, CoherenceWordEmbeddings
from contextualized_topic_models.utils.preprocessing import SimplePreprocessing

# lDA
from gensim.corpora.dictionary import Dictionary
from gensim.test.utils import common_texts
from gensim.models import LdaMulticore 
import pyLDAvis.gensim
from gensim import corpora, matutils, models, similarities
import pyLDAvis

# BerTopic
from bertopic import BERTopic

# other
from tqdm import tqdm
from os import listdir
from os.path import isfile, join
import PyPDF2
from pdfminer.high_level import extract_text
from sklearn.feature_extraction import text

# own modules
import modules.preprocessing as preprocess

# Goal

We are going to compare the performance of three **unsupervised** models for topic modelling on 50 corperate governance documents.

1. Contextualized Topic Modelling (CTM): https://github.com/MilaNLProc/contextualized-topic-models
2. Latent Dirichlet Allocation (LDA): https://radimrehurek.com/gensim/models/ldamodel.html
3. BERTopic: https://github.com/MaartenGr/BERTopic

# Evaluation measures 

1. **Normalized Point-wise Mutual Information (NPMI) (Lau et al.,
2014)**

It measures how much the top-10 words of a topic are related to each other, considering the empirical frequency of the words computed on the
original corpus. τ is a symbolic metric and relies on co-occurrence.

2. **External Word Embeddings Topic Coherence**

As Ding et al. (2018) pointed out, though, topic
coherence computed on the same data is inherently
limited. Coherence computed on an external corpus, on the other hand, correlates much more to
human judgment, but it may be expensive to estimate. Thus, our second metric is an external
word embeddings topic coherence metric, which we compute by adopting a strategy similar to that
described in Ding et al. (2018). First, we compute
the average pairwise cosine similarity of the word
embeddings of the top-10 words in a topic using (Mikolov et al., 2013) embeddings. Then, we
compute the overall average of those values for all
the topics (α).

3. **rank-
biased overlap (RBO) (Webber et al., 2010)**

To evaluate how diverse the topics
generated by a single model are, we use the rank-
biased overlap (RBO) (Webber et al., 2010). RBO
compares two topics of the same model. The key
qualities of this measure are twofold: it allows
disjointedness between the lists of topics (i.e., two
topics can have different words in them) and it is
weighted on the ranking (i.e., two lists that share
some of the same words, albeit at different rankings,
are penalized less than two lists that share the same
words at the highest ranks). We deﬁne ρ as the rank-
biased overlap diversity, that we interpret as the
reciprocal of the standard RBO. ρ is 0 for identical
topics and 1 for completely different topics. Both
metrics are computed on the top-k ranked lists.
Following the state-of-the-art, we consider k = 10.

## Read in reports and perform some text processing

# Evaluation measures 

1. **Normalized Point-wise Mutual Information (NPMI) (Lau et al.,
2014)**

It measures how much the top-10 words of a topic are related to each other, considering the empirical frequency of the words computed on the
original corpus. τ is a symbolic metric and relies on co-occurrence.

2. **External Word Embeddings Topic Coherence**

As Ding et al. (2018) pointed out, though, topic
coherence computed on the same data is inherently
limited. Coherence computed on an external corpus, on the other hand, correlates much more to
human judgment, but it may be expensive to estimate. Thus, our second metric is an external
word embeddings topic coherence metric, which we compute by adopting a strategy similar to that
described in Ding et al. (2018). First, we compute
the average pairwise cosine similarity of the word
embeddings of the top-10 words in a topic using (Mikolov et al., 2013) embeddings. Then, we
compute the overall average of those values for all
the topics (α).

3. **rank-
biased overlap (RBO) (Webber et al., 2010)**

To evaluate how diverse the topics
generated by a single model are, we use the rank-
biased overlap (RBO) (Webber et al., 2010). RBO
compares two topics of the same model. The key
qualities of this measure are twofold: it allows
disjointedness between the lists of topics (i.e., two
topics can have different words in them) and it is
weighted on the ranking (i.e., two lists that share
some of the same words, albeit at different rankings,
are penalized less than two lists that share the same
words at the highest ranks). We deﬁne ρ as the rank-
biased overlap diversity, that we interpret as the
reciprocal of the standard RBO. ρ is 0 for identical
topics and 1 for completely different topics. Both
metrics are computed on the top-k ranked lists.
Following the state-of-the-art, we consider k = 10.

In [3]:
UPDATE = False
DIR_REPORTS = '"C:/Users/Pieter-Jan/Documents/Work/Candriam/nlp/ESG/reports/Corporate_Governance_Report" global variable used throughout te notebook to update preprocessing steps
UPDATE = True
DIR_REPORTS = "C:/Users/Pieter-Jan/Documents/Work/Candriam/nlp/ESG/reports/Corporate_Governance_Report"

  and should_run_async(code)


SyntaxError: EOL while scanning string literal (<ipython-input-3-010f4a6c95ad>, line 2)

## Download reports and perform some text processing

In [5]:
df = pd.read_excel("data//Indusonlyfiles = [f for f in os.listdir(DIR_REPORTS) if os.path.isfile(os.path.join(DIR_REPORTS, f))]pdf = d.DataFrame()onlyfiles, columns=["filename"]

bjRef:47>}, 'Font': {'F0': <PDFObjRef:48>, 'F1': <PDFObjRef:58>, 'F2': <PDFObjRef:53>, 'F3': <PDFObjRef:63>}}, 'MediaBox': [0, 0, 611, 791], 'Contents': <PDFObjRef:19>, 'Parent': <PDFObjRef:84>}
2020-11-17 23:42:23.478 INFO    pdfminer.pdfinterp: Processing page: <PDFPage: Resources={'ProcSets': [/'PDF', /'Text', /'ImageB', /'ImageC', /'ImageI'], 'ExtGState': {'G0': <PDFObjRef:47>}, 'Font': {'F0': <PDFObjRef:48>, 'F1': <PDFObjRef:58>, 'F2': <PDFObjRef:53>, 'F3': <PDFObjRef:63>}}, MediaBox=[0, 0, 611, 791]>
2020-11-17 23:42:23.481 INFO    pdfminer.pdfinterp: render_contents: resources={'ProcSets': [/'PDF', /'Text', /'ImageB', /'ImageC', /'ImageI'], 'ExtGState': {'G0': <PDFObjRef:47>}, 'Font': {'F0': <PDFObjRef:48>, 'F1': <PDFObjRef:58>, 'F2': <PDFObjRef:53>, 'F3': <PDFObjRef:63>}}, streams=[<PDFStream(19): raw=1337, {'Filter': /'FlateDecode', 'Length': 1336}>], ctm=(1, 0, 0, 1, 0, 0)
2020-11-17 23:42:23.540 INFO    pdfminer.pdfpage: Page: {'Type': /'Page', 'Resources': {'ProcSets': [/'P

In [None]:
df_processed = preprocess.load_processed_text(
    df,
    dir_read_pdf=DIR_REPORTS, 
    columns_to_keep = ["filename"],
    file_processed_text="output/Cgovernance_processed_pdfMiner.txt",
    n_min_word_paragraph=50, 
    n_max_word_paragraph=125,  
    update=True,
    method_extract_content = "pdfMiner"
    )

In [7]:
# save only the paragraph to a text file
df_processed["paragraph"].to_csv('output/Cgovernance_processed_raw_pdfMiner.txt', sep='\t', index=False, header=False)

2020-11-17 23:42:26.465 INFO    numexpr.utils: NumExpr defaulting to 4 threads.


In [8]:
df_processed.shape

(1520, 2)

In [9]:
# our list contains all english stop words 
stop_words = text.ENGLISH_STOP_WORDS

In [10]:
# load spacy model to lematize text
nlp = preprocess.load_spacy_model()

✔ Download and installation successful
You can now load the model via spacy.load('en_core_web_sm')


## 1. Contextualized Topic Modelling

## 1.1 Prepare data for model

In [11]:
df_processed = preprocess.load_lemmatize(
    data=df_processed, 
    dir_file='output/Cgovernance_processed_cleaned_pdfMiner.txt', 
    stop_words=stop_words, 
    nlp=nlp, 
    method=1, 
    update=True
)

100%|██████████| 1520/1520 [00:47<00:00, 31.76it/s]


In [12]:
print(df_processed.shape)
df_processed.head()

(1520, 3)


Unnamed: 0,filename,paragraph,paragraph_cleaned
0,20171024_HBI_Corporate_Governance_Report_SD000...,HANESBRANDS INC CORPORATE GOVERNANCE GUIDELINE...,hanesbrands corporate governance guideline dir...
1,20171024_HBI_Corporate_Governance_Report_SD000...,candidate has served in policy making roles in...,candidate serve policy making role business go...
2,20171024_HBI_Corporate_Governance_Report_SD000...,Board shall approve the nomination or the elec...,board shall approve nomination election direct...
3,20171024_HBI_Corporate_Governance_Report_SD000...,virtue of any applicable law or authority The ...,virtue applicable authority board shall elect ...
4,20171024_HBI_Corporate_Governance_Report_SD000...,knowledge of Hanesbrands business In addition ...,knowledge hanesbrand business addition board r...


In [13]:
with open('output/Cgovernance_processed_cleaned_pdfMiner.txt',"r") as fr:
    text_cleaned = [doc.split() for doc in fr.read().splitlines()] # load text for NPMI

dictionary = Dictionary(text_cleaned)

'''
Remove very rare and very common words:

- words appearing less than 5 times
- words appearing in more than 15% of all documents
'''

dictionary.filter_extremes(no_below=5, no_above=0.15, keep_n= 100000)
print(len(dictionary.token2id))

2020-11-17 23:43:23.009 INFO    gensim.corpora.dictionary: adding document #0 to Dictionary(0 unique tokens: [])
2020-11-17 23:43:23.240 INFO    gensim.corpora.dictionary: built Dictionary(2853 unique tokens: ['activity', 'applicable', 'arena', 'assessment', 'background']...) from 1520 documents (total 94899 corpus positions)
2020-11-17 23:43:23.246 INFO    gensim.corpora.dictionary: discarding 1639 tokens: [('applicable', 255), ('arena', 2), ('board', 1360), ('business', 378), ('committee', 1043), ('company', 807), ('consider', 299), ('corporate', 550), ('director', 1304), ('distinguish', 1)]...
2020-11-17 23:43:23.247 INFO    gensim.corpora.dictionary: keeping 1214 tokens which were in no less than 5 and no more than 228 (=15.0%) documents
2020-11-17 23:43:23.255 INFO    gensim.corpora.dictionary: resulting dictionary: Dictionary(1214 unique tokens: ['activity', 'assessment', 'background', 'candidate', 'competence']...)
1214


In [14]:
bow_corpus = [dictionary.doc2bow(doc) for doc in text_cleaned]

In [15]:
tf_array = matutils.corpus2dense(bow_corpus, num_terms=len(dictionary.token2id)).T
tf_array.shape

(1520, 1214)

In [16]:
# convert to sparse matrix
tf_array_sparse = sparse.csr_matrix(tf_array)
tf_array_sparse

<1520x1214 sparse matrix of type '<class 'numpy.float32'>'
	with 41522 stored elements in Compressed Sparse Row format>

In [17]:
# create or load bert embeddings (either use raw text or clean text)
# we can expirment with both
embeddings_bert = preprocess.load_bert_embeddings(
        text_dir="output/Cgovernance_processed_raw_pdfMiner.txt", 
        model="distiluse-base-multilingual-cased",
        dir_embeddings="output/Cgovernance_bertEmbeddings_pdfMiner.npy",
        update=True
)

2020-11-17 23:43:23.634 INFO    root: Load pretrained SentenceTransformer: distiluse-base-multilingual-cased
2020-11-17 23:43:23.636 INFO    root: Did not find folder distiluse-base-multilingual-cased. Assume to download model from server.
2020-11-17 23:43:23.641 INFO    root: Load SentenceTransformer from folder: C:\Users\Pieter-Jan/.cache\torch\sentence_transformers\sbert.net_models_distiluse-base-multilingual-cased
2020-11-17 23:43:27.138 INFO    root: Use pytorch device: cpu
Batches: 100%|██████████| 8/8 [04:42<00:00, 35.32s/it]


## 1.2 Train model

In [18]:
embeddings_bert.shape

(1520, 512)

## 1.3 Evaluate topics

In [19]:
# ivert dictionary
inv_token2id = {v: k for k, v in dictionary.token2id.items()}
# create dataset
training_dataset = CTMDataset(tf_array_sparse, embeddings_bert, inv_token2id)

In [20]:
random.seed(69)
ctm = CTM(
    input_size=len(dictionary.token2id), 
    bert_input_size=512, 
    n_components=8, 
    inference_type="combined",
    num_epochs=20,
    reduce_on_plateau=True
    )
ctm.fit(training_dataset) # run model

Settings: 
                   N Components: 8
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.875
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: True
                   Save Dir: None
Epoch: [1/20]	Samples: [1520/30400]	Train Loss: 261.2963443153783	Time: 0:00:08.091877
Epoch: [2/20]	Samples: [3040/30400]	Train Loss: 255.32405941611842	Time: 0:00:07.598107
Epoch: [3/20]	Samples: [4560/30400]	Train Loss: 251.37415771484376	Time: 0:00:10.079666
Epoch: [4/20]	Samples: [6080/30400]	Train Loss: 247.277976588199	Time: 0:00:09.409812
Epoch: [5/20]	Samples: [7600/30400]	Train Loss: 244.75961014597038	Time: 0:00:08.388969
Epoch: [6/20]	Samples: [9120/30400]	Train Loss: 242.36711168791118	Time: 0:00:07.0360

### 1.3.1 Normalized Point-wise Mutual Information

In [21]:
cmt_topics_l = ctm.get_topic_lists(10)
cmt_topics_d = {}
for i in range(len(cmt_topics_l)):
    cmt_topics_d[i] = cmt_topics_l[i]

### 1.3.2 External Word Embeddings Topic Coherence

In [22]:
pd.DataFrame.from_dict(cmt_topics_d).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,resignation,vote,election,nominee,tender,accept,offer,certification,majority,promptly
1,annually,periodically,charter,guideline,education,program,orientation,base,continue,duty
2,experience,skill,criterion,background,nominee,retirement,limit,candidate,diversity,commit
3,advisor,party,code,general,matter,access,legal,confidential,communication,ethic
4,stock,requirement,standard,york,cash,ownership,exchange,market,list,common
5,oversee,process,risk,report,function,evaluation,financial,evaluate,approve,strategy
6,agenda,item,advance,distribute,session,attend,schedule,material,discussion,attendance
7,family,consolidate,revenue,gross,immediate,payment,relationship,fiscal,organization,partner


### 1.3.3 Rank-Biased Overlap 

In [23]:
npmi = CoherenceNPMI(texts=text_cleaned, topics=ctm.get_topic_lists(10))
npmi.score()

2020-11-17 23:50:51.297 INFO    gensim.corpora.dictionary: adding document #0 to Dictionary(0 unique tokens: [])
2020-11-17 23:50:51.569 INFO    gensim.corpora.dictionary: built Dictionary(2853 unique tokens: ['activity', 'applicable', 'arena', 'assessment', 'background']...) from 1520 documents (total 94899 corpus positions)
2020-11-17 23:50:51.577 INFO    gensim.topic_coherence.probability_estimation: using ParallelWordOccurrenceAccumulator(processes=3, batch_size=64) to estimate probabilities from sliding windows
2020-11-17 23:50:57.712 INFO    gensim.topic_coherence.text_analysis: 1 batches submitted to accumulate stats from 64 documents (3501 virtual)
2020-11-17 23:50:57.719 INFO    gensim.topic_coherence.text_analysis: 2 batches submitted to accumulate stats from 128 documents (7082 virtual)
2020-11-17 23:50:57.725 INFO    gensim.topic_coherence.text_analysis: 3 batches submitted to accumulate stats from 192 documents (10724 virtual)
2020-11-17 23:50:57.732 INFO    gensim.topic_c

0.10366851974616874

## 2. Latent Dirichlet Allocation

## 2.1 Train model

In [24]:
CoherenceWordEmbeddings(ctm.get_topic_lists(10)).score()

2020-11-17 23:50:59.360 INFO    gensim.models.utils_any2vec: loading projection weights from C:\Users\Pieter-Jan/gensim-data\word2vec-google-news-300\word2vec-google-news-300.gz
2020-11-17 23:52:21.873 INFO    gensim.models.utils_any2vec: loaded (3000000, 300) matrix from C:\Users\Pieter-Jan/gensim-data\word2vec-google-news-300\word2vec-google-news-300.gz


0.14070016

## 2.2 Evaluate topics

In [25]:
InvertedRBO(ctm.get_topic_lists(10)).score()

0.9971407413058674

In [26]:
num_topics = 8
lda_model =  LdaMulticore(
    corpus=bow_corpus, 
    num_topics = num_topics, 
    id2word = dictionary,                                    
    passes = 10,
    workers = 2
    )

2020-11-17 23:52:23.214 INFO    gensim.models.ldamodel: using symmetric alpha at 0.125
2020-11-17 23:52:23.217 INFO    gensim.models.ldamodel: using symmetric eta at 0.125
2020-11-17 23:52:23.219 INFO    gensim.models.ldamodel: using serial LDA version on this node
2020-11-17 23:52:23.225 INFO    gensim.models.ldamulticore: running online LDA training, 8 topics, 10 passes over the supplied corpus of 1520 documents, updating every 4000 documents, evaluating every ~1520 documents, iterating 50x with a convergence threshold of 0.001000
2020-11-17 23:52:23.232 INFO    gensim.models.ldamulticore: training LDA model using 2 processes
2020-11-17 23:52:23.335 INFO    gensim.models.ldamulticore: PROGRESS: pass 0, dispatched chunk #0 = documents up to #1520/1520, outstanding queue size 1
2020-11-17 23:52:27.354 INFO    gensim.models.ldamodel: topic #5 (0.125): 0.023*"evaluation" + 0.015*"agenda" + 0.013*"session" + 0.012*"succession" + 0.009*"stock" + 0.008*"audit" + 0.008*"report" + 0.007*"plan

### 2.2.1 Normalized Point-wise Mutual Information

In [27]:
lda_topics_d = {}
lda_topics_l = []
for i in range(num_topics):
    t = [w[0] for w in lda_model.show_topic(i)[0:10]]
    lda_topics_d[i+1] = t
    lda_topics_l.append(t)

### 2.2.2 External Word Embeddings Topic Coherence

In [28]:
# show topics
# evert row is a topic
pd.DataFrame.from_dict(lda_topics_d).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
1,corporation,transaction,information,person,party,discover,guideline,related,organization,confidential
2,advisor,plan,shareholder,financial,family,receive,immediate,fee,legal,deem
3,stock,guideline,ownership,share,fifth,requirement,conduct,value,require,code
4,candidate,change,experience,recommend,public,membership,nominee,election,criterion,nomination
5,resignation,election,vote,independence,majority,relationship,receive,family,exchange,accept
6,evaluation,succession,plan,development,session,senior,charter,annually,report,planning
7,program,orientation,education,continue,financial,advisor,accounting,outside,limit,operation
8,agenda,material,advance,schedule,item,communication,stockholder,session,expect,information


### 2.2.3 Rank-Biased Overlap 

In [29]:
npmi = CoherenceNPMI(texts=text_cleaned, topics=lda_topics_l)
npmi.score()

2020-11-17 23:52:58.061 INFO    gensim.corpora.dictionary: adding document #0 to Dictionary(0 unique tokens: [])
2020-11-17 23:52:58.284 INFO    gensim.corpora.dictionary: built Dictionary(2853 unique tokens: ['activity', 'applicable', 'arena', 'assessment', 'background']...) from 1520 documents (total 94899 corpus positions)
2020-11-17 23:52:58.288 INFO    gensim.topic_coherence.probability_estimation: using ParallelWordOccurrenceAccumulator(processes=3, batch_size=64) to estimate probabilities from sliding windows
2020-11-17 23:53:03.185 INFO    gensim.topic_coherence.text_analysis: 1 batches submitted to accumulate stats from 64 documents (3501 virtual)
2020-11-17 23:53:03.188 INFO    gensim.topic_coherence.text_analysis: 2 batches submitted to accumulate stats from 128 documents (7082 virtual)
2020-11-17 23:53:03.194 INFO    gensim.topic_coherence.text_analysis: 3 batches submitted to accumulate stats from 192 documents (10724 virtual)
2020-11-17 23:53:03.199 INFO    gensim.topic_c

0.022353241509189863

## 3. Bert Topic

## 3.1 Train model

In [30]:
CoherenceWordEmbeddings(lda_topics_l).score()

2020-11-17 23:53:04.645 INFO    gensim.models.utils_any2vec: loading projection weights from C:\Users\Pieter-Jan/gensim-data\word2vec-google-news-300\word2vec-google-news-300.gz
2020-11-17 23:54:26.382 INFO    gensim.models.utils_any2vec: loaded (3000000, 300) matrix from C:\Users\Pieter-Jan/gensim-data\word2vec-google-news-300\word2vec-google-news-300.gz


0.13298091

## 3.2 Evaluate topics

In [31]:
InvertedRBO(lda_topics_l).score()

0.9795098052311224

In [32]:
random.seed(10)
with open('output/Cgovernance_processed_cleaned_pdfMiner.txt',"r") as fr:
    docs = [doc for doc in fr.read().splitlines()] 

model = BERTopic(verbose=True)
topics = model.fit_transform(docs, embeddings_bert)

2020-11-17 23:54:49,596 - BERTopic - Reduced dimensionality with UMAP
2020-11-17 23:54:49.596 INFO    BERTopic: Reduced dimensionality with UMAP
2020-11-17 23:54:49,878 - BERTopic - Clustered UMAP embeddings with HDBSCAN
2020-11-17 23:54:49.878 INFO    BERTopic: Clustered UMAP embeddings with HDBSCAN


In [33]:
topcis_b = model.get_topics()

### 3.2.1 Normalized Point-wise Mutual Information

In [34]:
# extract words for each topic
topics_k = {}
for k,v in topcis_b.items():
    t_words = []
    for w in v:
        t_words.append(w[0])
    # append the first 10 words
    topics_k[k] = t_words[0:10]

### 3.2.2 External Word Embeddings Topic Coherence

In [35]:
topics_bert = list(topics_k.values())
pd.DataFrame.from_dict(topics_k).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
-1,guideline,code,conflict,conduct,program,evaluation,business,policy,education,advisor
0,stock,share,ownership,common,unit,value,retainer,equal,salary,own
1,relationship,standard,independence,determination,york,exchange,material,list,determine,affirmatively
2,family,immediate,year,payment,current,fiscal,organization,partner,gross,charitable
3,experience,candidate,skill,diversity,background,nominee,criterion,diverse,ability,integrity
4,public,serve,profit,service,audit,notify,invitation,accept,limit,ability
5,vote,resignation,election,majority,accept,tender,cast,offer,decision,reject
6,change,retirement,limit,term,resignation,offer,resign,retire,accept,occupation
7,communication,access,information,confidential,contact,advisor,secretary,outside,senior,request
8,agenda,lead,session,chairman,item,schedule,material,advance,preside,chair


### 3.2.3 Rank-Biased Overlap 

In [36]:
npmi = CoherenceNPMI(texts=text_cleaned, topics=topics_bert)
npmi.score()

2020-11-17 23:54:50.443 INFO    gensim.corpora.dictionary: adding document #0 to Dictionary(0 unique tokens: [])
2020-11-17 23:54:50.671 INFO    gensim.corpora.dictionary: built Dictionary(2853 unique tokens: ['activity', 'applicable', 'arena', 'assessment', 'background']...) from 1520 documents (total 94899 corpus positions)
2020-11-17 23:54:50.679 INFO    gensim.topic_coherence.probability_estimation: using ParallelWordOccurrenceAccumulator(processes=3, batch_size=64) to estimate probabilities from sliding windows
2020-11-17 23:54:56.497 INFO    gensim.topic_coherence.text_analysis: 1 batches submitted to accumulate stats from 64 documents (3501 virtual)
2020-11-17 23:54:56.501 INFO    gensim.topic_coherence.text_analysis: 2 batches submitted to accumulate stats from 128 documents (7082 virtual)
2020-11-17 23:54:56.511 INFO    gensim.topic_coherence.text_analysis: 3 batches submitted to accumulate stats from 192 documents (10724 virtual)
2020-11-17 23:54:56.521 INFO    gensim.topic_c

0.15455125119020208

In [37]:
CoherenceWordEmbeddings(topics_bert).score()

2020-11-17 23:54:58.887 INFO    gensim.models.utils_any2vec: loading projection weights from C:\Users\Pieter-Jan/gensim-data\word2vec-google-news-300\word2vec-google-news-300.gz
2020-11-17 23:56:21.814 INFO    gensim.models.utils_any2vec: loaded (3000000, 300) matrix from C:\Users\Pieter-Jan/gensim-data\word2vec-google-news-300\word2vec-google-news-300.gz


0.1545211

In [38]:
InvertedRBO(topics_bert).score()

0.992035617275102