# Topic modelling process with example

### 1. Topic modelling:
    1. Topic modelling structure
    2. Get value from you model

In [2]:
#Imports
import pandas as pd
import numpy as np
import json, string, pprint, spacy, nltk, math, gensim
import gensim.corpora as corpora
import nl_core_news_lg 
from gensim.utils import simple_preprocess
from bertopic import BERTopic
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer, util
from bertopic.backend import languages 
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
from nltk.tokenize import sent_tokenize
from typing import List
from hdbscan import HDBSCAN
from umap import UMAP
import plotly.io as pio
pio.renderers.default='iframe'

### 1.1. Topic modelling structure

In [3]:
# import previously processed data
paragraphsDF = pd.read_csv('actors_organizations_processed_data.csv')
paragraphsDF

Unnamed: 0,title,paragraph_num,paragraph_text,organizations_in_paragraph,actors_in_paragraph,date_published,organizations_list,one_or_more_organizations,numb_unique_organizations,actors_list,one_or_more_actors,numb_unique_actors,both_org_act
0,Contaminatie in het vlees van ‘grote grazers’ ...,1,>,[],[],2022-12-14 00:00:00,[''],,1,[''],,1,False
1,Contaminatie in het vlees van ‘grote grazers’ ...,2,Retouradres Postbus,[],[],2022-12-14 00:00:00,[''],,1,[''],,1,False
2,Contaminatie in het vlees van ‘grote grazers’ ...,3,20350 2500 EJ Den Haag,[],[],2022-12-14 00:00:00,[''],,1,[''],,1,False
3,Contaminatie in het vlees van ‘grote grazers’ ...,4,De Voorzitter van de Tweede Kamer der Staten-G...,[],[],2022-12-14 00:00:00,[''],,1,[''],,1,False
4,Contaminatie in het vlees van ‘grote grazers’ ...,5,Postbus,[],[],2022-12-14 00:00:00,[''],,1,[''],,1,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
223587,Verslag Landbouw- en Visserijraad 17 januari 2022,43,1 Kamerstuk,[],[],2022-01-26 00:00:00,[''],,1,[''],,1,False
223588,Verslag Landbouw- en Visserijraad 17 januari 2022,44,"21 501-32, nr.",[],[],2022-01-26 00:00:00,[''],,1,[''],,1,False
223589,Verslag Landbouw- en Visserijraad 17 januari 2022,45,1354.,[],[],2022-01-26 00:00:00,[''],,1,[''],,1,False
223590,Verslag Landbouw- en Visserijraad 17 januari 2022,46,", was de Nederlandse inzet voor zeebaars om ee...",[],[],2022-01-26 00:00:00,[''],,1,[''],,1,False


Text preprocessing
The preprocessing pipeline is mentioned below.
###### Tokenisation
First basic tokenization is implemented, to split the text into 
tokens as is recommended by Kannan et al. (2014). For this process I used genism’s 
simple_preprocess, which will convert the text into lowercase & tokens and remove punctuation. 

In [4]:
# Tokenization using gensim
def sent_to_words(sentences, deacc=True): # deacc=True removes punctuations
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence)))  
        
# Convert the data to a list
data = paragraphsDF["paragraph_text"].values.tolist()
data_words = list(sent_to_words(data))

###### Stop word removal 
Secondly stop words will be removed from the data as well as a list of punctuation characters from 
the string.punctiation string, which is a pre-initialized string used as a string constant. These are 
removed because they have little relevance for understanding the content of a text (Kannan et al., 
2014).

In [5]:
# string.punctuation refers to a list of punctuations
stop_words = stopwords.words('dutch') + list(string.punctuation) #occur in many documents and have no link to a distinct industry.
additional_stop_words = ['geer','minister', 'postbus', 'retouradres','kamer', 'antwoord', 'www', 'brief', 'voorzitter', 'generaal', 'rijksoverheid', 'voorzitter', 'kamervrag', 'kamervraag','voorzitter', 'generaal', 'annoteren', 'agenda', 'kamerstuk', 'beantwoording', 'stichting','lid', 'partij', 'fractie', 'waarom', 'https', 'brief', 'verslag', 'motie', 'agendapunt', 'indiener', 'Tweede','tweede', 'Kenmerk', 'kenmerk', 'voortgang','kamerstukk', 'website', 'org', 'kamerbrief', 'idem', 'bijlage', 'wet', 'artikel', 'vergaderjaar', 'overheid', 'vraag', 'bericht', 'rapport', 'aanhangsel','staan', 'beleidsreactie','inhoudsopgave','lid', 'jaar', 'commissie', 'reactie', 'reactie', 'mededeling', 'http','zien', 'Elzijn', 'isie', 'ieren', 'pa', 'ibidem','programma','algemeen', 'pagina', 'context', 'circulair', 'voorbeeld', 'bijlaag', 'hoofdstuk', 'zien', 'leeswijzer', 'algemeen', 'blad', 'vooronderzoek', 'revisie', 'zone', 'legenda', 'lineair', 'stof', 'kolomn', 'tabel', 'zone', 'voorstellen', 'heer', 'dank', 'mevrouw', 'wel', 'tijd', 'meneer','adema', 'zaak', 'besluit','commisiedebat','datum', 'onderzoek', 'pagina','geer','minister','vraag','heer','kabinet','agenta','gemeente','gaan','kamer','wel','www', 'aanwezig', 'bijvoorbeeld', 'beide', 'dergelijke', 'dezelfde', 'elke', 'enkele', 'eveneens', 'gaande', 'gaandeweg', 'gehele', 'gehouden', 'genoeg', 'geweest', 'groter', 'hebben', 'heel', 'hetzelfde', 'hetzij', 'huidige', 'hunne', 'immers', 'inmiddels', 'intussen', 'juist', 'kleine', 'komt', 'korte', 'laatst', 'laten', 'lijken', 'maken', 'meeste', 'meestal', 'mede', 'middel', 'misschien', 'namelijk', 'nemen', 'net', 'nieuwe', 'niemand', 'niets', 'nodig', 'nogal', 'normaal', 'nu', 'o.a.', 'ofwel', 'omtrent', 'ondanks', 'onder andere', 'ongeveer', 'ons', 'onzes', 'onzeker', 'overal', 'precies', 'redelijk', 'sinds', 'slechts', 'sommige', 'steeds', 'terwijl', 'toch', 'totaal', 'uiteraard', 'vaak', 'vanaf', 'verschillende', 'vervolgens', 'volledig', 'volgens', 'vroeg', 'vroeger', 'waaronder', 'waarvan', 'wat betreft', 'weer', 'weinig', 'weliswaar', 'waarom', 'wanneer', 'zoals', 'zoveel', 'zulke', 'biodiversiteit', 'natuur', 'ecologie', 'soort', 'soorten', 'plant', 'planten', 'dier', 'dieren', 'bos', 'bosgebied', 'natuurgebied', 'bescherming', 'milieu', 'vervuiling', 'klimaatverandering', 'duurzaamheid', 'ecosysteem', 'biologisch', 'natuurlijk', 'gezondheid', 'beschermen', 'behoud', 'natuurbeheer', 'landschap', 'landschapsbeheer', 'fauna', 'flora', 'wetlands', 'bodem', 'water', 'lucht', 'biodiversiteitsverdrag', 'conventiebiologie', 'duurzaamheidsdoelen', 'habitats', 'inheems', 'soortenrijkdom', 'natuurlijke hulpbronnen', 'sustainable', 'sustainability']
stop_words_final = stop_words + additional_stop_words

In [6]:
# Removing the stopwords from the data
def rem_stopwords (text):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words_final] for doc in text]

# remove stop words
data_words_nostops = rem_stopwords(data_words)

###### Lemmatization
Lastly lemmatization has been performed, since its superior to stemming (Khyani et al., 2021), 
which is a text normalization technique that will switch any word to its lemma. For this process I 
used to open-source software library called spaCy, but NLTK could also have been used. The spaCy 
pre-trained model called en_core_web_md, can be thought of as some kind of pipeline. When this 
model is called upon a text or word, the text will run through the pipeline. If the text isn’t tokenized 
it will be tokenized after which different components will be activated. The thing that’s most 
interesting about this pipeline is a tagger which will assign Part-of-Speech tags based on spaCy’s 
English language model. This is done to gain a variety of annotations. The POS tag refers to a label 
which will be assigned to every token in the corpus to indicate the type of said token (is it a verb or 
punctation or adjective) and other grammatical categories. These POS tags can then be used in the 
preprocess to remove unwanted tags. The only tags that I have allowed in my analysis are Noun, Adj, 
Verb and Adv.

In [7]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out
# or even higher

nlp = spacy.load('nl_core_news_lg', disable=['parser', 'ner'])
nlp.max_length = 1322782
data_lemmatized = lemmatization(data_words_nostops, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [8]:
paragraphsDF['content'] = data_lemmatized
newDF = paragraphsDF.drop(paragraphsDF[paragraphsDF['content'].apply(lambda x: len(x)==0)].index)

 **Creating Corpus & BERTopics**
 
 
BERTopic is a smart topic modeling algorithm that utilizes BERT (Bidirectional Encoder Representations from Transformers), a state-of-the-art natural language processing model developed by Google, to create meaningful and accurate topics from a given corpus. Here are some reasons why BERTopic is considered smart to use:

1. Incorporates contextual understanding: BERT is designed to understand the context of text data, which allows BERTopic to create topics that are based on the full context of the documents. This makes it more accurate and meaningful compared to other topic modeling algorithms.

2. Utilizes clustering: BERTopic uses clustering algorithms, such as Hierarchical Density-Based Spatial Clustering of Applications with Noise (HDBSCAN), to group similar documents together and create coherent topics. This clustering approach helps ensure that the topics are not only meaningful but also distinguishable from one another.

3. Customizable: BERTopic is highly customizable and can be tailored to specific needs. For example, users can adjust the number of topics they want to extract, or exclude specific words from the analysis to improve the quality of topics generated.

4. Efficient: BERTopic is designed to be computationally efficient and can process large datasets quickly. It also has the ability to update topics as new documents are added to the corpus, making it a scalable solution for topic modeling.

5. Easy to use: BERTopic is user-friendly and can be implemented with just a few lines of code. The resulting topics can be visualized using a variety of tools, making it easy to interpret and communicate the findings to others.

Overall, BERTopic is a smart choice for topic modeling as it combines the power of BERT with efficient clustering algorithms and customizability to create meaningful and accurate topics from text data.

We select the "dutch" as the main language for our documents. If you want a multilingual model that supports 50+ languages, please select "multilingual" instead.

In [9]:
newDF['corp'] = [','.join(map(str, l)) for l in newDF['content']]
newDF['corp'] = newDF['corp'].str.replace(',',' ', regex=False)

In [10]:
# reset the index of the dataframe
newDF = newDF.reset_index(drop=True)
newDF

Unnamed: 0,title,paragraph_num,paragraph_text,organizations_in_paragraph,actors_in_paragraph,date_published,organizations_list,one_or_more_organizations,numb_unique_organizations,actors_list,one_or_more_actors,numb_unique_actors,both_org_act,content,corp
0,Contaminatie in het vlees van ‘grote grazers’ ...,4,De Voorzitter van de Tweede Kamer der Staten-G...,[],[],2022-12-14 00:00:00,[''],,1,[''],,1,False,[staat],staat
1,Contaminatie in het vlees van ‘grote grazers’ ...,7,: Parnassusplein 5,[],[],2022-12-14 00:00:00,[''],,1,[''],,1,False,[parnassusplein],parnassusplein
2,Contaminatie in het vlees van ‘grote grazers’ ...,13,Correspondentie uitsluitend richten aan het re...,[],[],2022-12-14 00:00:00,[''],,1,[''],,1,False,"[correspondentie, uitsluitend, richten, vermel...",correspondentie uitsluitend richten vermelding
3,Contaminatie in het vlees van ‘grote grazers’ ...,15,Betreft Contaminatie in het vlees van ‘grote g...,[],[],2022-12-14 00:00:00,[''],,1,[''],,1,False,"[betreffen, contaminatie, vlees, groot, grazer...",betreffen contaminatie vlees groot grazer uite...
4,Contaminatie in het vlees van ‘grote grazers’ ...,17,"Geachte voorzitter, Ten behoeve van natuurbehe...",[],[],2022-12-14 00:00:00,[''],,1,[''],,1,False,"[geacht, behoeve, uiterwaarde, specifiek, robu...",geacht behoeve uiterwaarde specifiek robuust r...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153461,Verslag Landbouw- en Visserijraad 17 januari 2022,40,"zeebaars, tong, roggen, tarbot en griet, zeedu...",[],[],2022-01-26 00:00:00,[''],,1,[''],,1,False,"[zeebaars, tong, rogg, tarbot, griet, horsmakr...",zeebaars tong rogg tarbot griet horsmakreel la...
153462,Verslag Landbouw- en Visserijraad 17 januari 2022,41,Voor de bestanden van tong in de Noordzee (–28...,[],[],2022-01-26 00:00:00,[''],,1,[''],,1,False,"[bestand, tong, tarbot, griet, langoustine, ze...",bestand tong tarbot griet langoustine zeeduive...
153463,Verslag Landbouw- en Visserijraad 17 januari 2022,42,Zoals aan uw Kamer gemeld,[],[],2022-01-26 00:00:00,[''],,1,[''],,1,False,[melden],melden
153464,Verslag Landbouw- en Visserijraad 17 januari 2022,46,", was de Nederlandse inzet voor zeebaars om ee...",[],[],2022-01-26 00:00:00,[''],,1,[''],,1,False,"[Nederlands, inzet, toename, realiseren, klein...",Nederlands inzet toename realiseren kleinschal...


We can save the df for further inspection:

In [11]:
newDF.to_csv('actors_organizations_corp_data.csv', index=False)

You can load the processed dataset directly from here:

In [12]:
#newDF = pd.read_csv('actors_organizations_corp_data.csv')

In [14]:
# drop duplicated paragraphs
newDF = newDF.drop_duplicates(subset=['paragraph_text'])

In [15]:
newDF = newDF.reset_index(drop=True)
newDF

Unnamed: 0,title,paragraph_num,paragraph_text,organizations_in_paragraph,actors_in_paragraph,date_published,organizations_list,one_or_more_organizations,numb_unique_organizations,actors_list,one_or_more_actors,numb_unique_actors,both_org_act,content,corp
0,Contaminatie in het vlees van ‘grote grazers’ ...,4,De Voorzitter van de Tweede Kamer der Staten-G...,[],[],2022-12-14 00:00:00,[''],,1,[''],,1,False,[staat],staat
1,Contaminatie in het vlees van ‘grote grazers’ ...,7,: Parnassusplein 5,[],[],2022-12-14 00:00:00,[''],,1,[''],,1,False,[parnassusplein],parnassusplein
2,Contaminatie in het vlees van ‘grote grazers’ ...,13,Correspondentie uitsluitend richten aan het re...,[],[],2022-12-14 00:00:00,[''],,1,[''],,1,False,"[correspondentie, uitsluitend, richten, vermel...",correspondentie uitsluitend richten vermelding
3,Contaminatie in het vlees van ‘grote grazers’ ...,15,Betreft Contaminatie in het vlees van ‘grote g...,[],[],2022-12-14 00:00:00,[''],,1,[''],,1,False,"[betreffen, contaminatie, vlees, groot, grazer...",betreffen contaminatie vlees groot grazer uite...
4,Contaminatie in het vlees van ‘grote grazers’ ...,17,"Geachte voorzitter, Ten behoeve van natuurbehe...",[],[],2022-12-14 00:00:00,[''],,1,[''],,1,False,"[geacht, behoeve, uiterwaarde, specifiek, robu...",geacht behoeve uiterwaarde specifiek robuust r...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83947,Verslag Landbouw- en Visserijraad 17 januari 2022,31,Visserijraad van 17 januari 2022.\nOp 1 januar...,[],[],2022-01-26 00:00:00,[''],,1,[''],,1,False,"[visserijraad, nemen, Frankrijk, voorzittersch...",visserijraad nemen Frankrijk voorzitterschap r...
83948,Verslag Landbouw- en Visserijraad 17 januari 2022,34,Op 7 februari a.s. organiseert het Franse voor...,[],[],2022-01-26 00:00:00,[''],,1,[''],,1,False,"[organiseren, Frans, voorzitterschap, informee...",organiseren Frans voorzitterschap informeel la...
83949,Verslag Landbouw- en Visserijraad 17 januari 2022,39,Visserijraad van december jl. is uw Kamer geïn...,[],[],2022-01-26 00:00:00,[''],,1,[''],,1,False,"[visserijraad, jl, ïnformeren, akkoord, vereni...",visserijraad jl ïnformeren akkoord verenigen k...
83950,Verslag Landbouw- en Visserijraad 17 januari 2022,42,Zoals aan uw Kamer gemeld,[],[],2022-01-26 00:00:00,[''],,1,[''],,1,False,[melden],melden


###### Transformer embedding
BERTopic supports several libraries for encoding our text to dense vector embeddings. If we build poor quality embeddings, nothing we do in the other steps will be able to help us, so it is very important that we choose a suitable embedding model. the Sentence Transformers library provides the most extensive library of high-performing sentence embedding models.They can be found on HuggingFace Hub by searching for “sentence-transformers”. The first result of this search is sentence-transformers/all-MiniLM-L6-v2, this is a popular high-performing model that creates 384-dimensional sentence embeddings.

In [16]:
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

###### UMAP
UMAP is an amazing technique for dimensionality reduction. In BERTopic, it is used to reduce the dimensionality of document embedding into something easier to use with HDBSCAN to create good clusters.

However, it does has a significant number of parameters you could take into account. As exposing all parameters in BERTopic would be difficult to manage, we can instantiate our UMAP model and pass it to BERTopic:

- n_neighbors is the number of neighboring sample points used when making the manifold approximation.By increasing n_neighbors we can preserve more global structures, whereas a lower n_neighbors better preserves local structures, finding a good n_neighbours value allows us to preserve both local and global structures relatively well. 
- n_components refers to the dimensionality of the embeddings after reducing them. A too low dimensionality (n_components) results in a loss of information while a too high dimensionality results in poorer clustering results.
- metric refers to the method used to compute the distances in high dimensional space. 
- low_memory is used when datasets may consume a lot of memory. 

In [17]:
umap_model = UMAP(n_neighbors=15, 
                  n_components=5, 
                  metric='cosine', 
                  low_memory=False)

###### HDBSCAN
After reducing the embeddings with UMAP, we use HDBSCAN to cluster our documents into clusters of similar documents. Similar to UMAP, HDBSCAN has many parameters that could be tweaked to improve the cluster's quality.
- Min_cluster_size is arguably the most important parameter in HDBSCAN. It controls the minimum size of a cluster and thereby the number of clusters that will be generated.
- Metric, like with HDBSCAN is used to calculate the distances.
- Prediction_data, make sure you always set this value to True as it is needed to predict new points later on. 


In [18]:
hdbscan_model = HDBSCAN(min_cluster_size=120, # 100
                        metric='euclidean', 
                        prediction_data=True,
                        min_samples=10)

###### BERTopic model
- The language parameter is used to simplify the selection of models for those who are not familiar with sentence-transformers models.
- Top_n_words refers to the number of words per topic that you want to be extracted.
- The n_gram_range parameter refers to the CountVectorizer used when creating the topic representation.
- min_topic_size is an important parameter! It is used to specify what the minimum size of a topic can be.
- nr_topics can be a tricky parameter. It specifies, after training the topic model, the number of topics that will be reduced.

In [19]:
model = BERTopic(language="dutch",
                 nr_topics="auto",
                 top_n_words = 10, 
                 n_gram_range = (1,1), #unigrams
                 umap_model=umap_model,
                 hdbscan_model=hdbscan_model,
                 embedding_model = embedding_model)
topics, probs = model.fit_transform(newDF['corp'])

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

We can then extract the most and least frequent topics:

In [20]:
model.get_topic_freq()
#44548 outliers

Unnamed: 0,Topic,Count
0,-1,41941
1,0,28043
2,1,1152
3,2,648
4,3,609
5,4,558
6,5,511
7,6,497
8,7,436
9,8,432


In [21]:
model.get_topic(6)

[('plan', 0.09133204304512553),
 ('project', 0.06809770564792807),
 ('planning', 0.03558821280534266),
 ('komen', 0.01308824301289012),
 ('geven', 0.012001880566448768),
 ('aanpak', 0.01153848485143551),
 ('groen', 0.010917154586908846),
 ('waar', 0.010838567862203724),
 ('gaan', 0.00972023207260067),
 ('ambitie', 0.009693827887137867)]

###### **Visualize Topics**
After having trained our `BERTopic` model, we can iteratively go through perhaps a hundred topic to get a good 
understanding of the topics that were extract. However, that takes quite some time and lacks a global representation. 
Instead, we can visualize the topics that were generated in a way very similar to 
[LDAvis](https://github.com/cpsievert/LDAvis):

In [22]:
model.visualize_topics()

In [23]:
model.save("model_nneighbors15_ncomponents5_cluster_size120_unigram")


Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.



###### Reduce topics based on the above  visualization

We can reduce the number of topics after having trained a BERTopic model. The advantage of doing so is that you can decide the number of topics after knowing how many are created. It is difficult to predict before training your model how many topics that are in your documents and how many will be extracted. Instead, we can decide afterward how many topics seem realistic. So the "nr_topics" parameter should be adapted to a certain realistic number.

In [None]:
# choose here how many topics you want
# if you execute this cell, you need to reload the model if want to see all topics again
model.reduce_topics(newDF['corp'], nr_topics=10)

### 1.2. Get value from you model

Load and test the model:

In [24]:
my_model = BERTopic.load("model_nneighbors15_ncomponents5_cluster_size120_unigram")

In [25]:
topics_my_model = my_model.get_topic_freq()
topics_my_model

Unnamed: 0,Topic,Count
0,-1,41941
1,0,28043
2,1,1152
3,2,648
4,3,609
5,4,558
6,5,511
7,6,497
8,7,436
9,8,432


Extract keywords per topic:

In [26]:
# learn how to extract keywords per topic
topic1 = my_model.get_topic(-1)
words1 = [t[0] for t in topic1]
words1

['gebied',
 'europees',
 'goed',
 'gaan',
 'landbouw',
 'komen',
 'groot',
 'geven',
 'lid',
 'duurzaam']

In [27]:
# get a list of all topics
topics_my_model = topics_my_model.reset_index(drop=True)
topics = topics_my_model['Topic'].to_list()

In [28]:
# make a dict of all topics and their keywords
words_per_topic = {}
for x in topics:
    keywords = my_model.get_topic(x)
    words = [t[0] for t in keywords]
    words_per_topic[x] = words

In [29]:
df_keywords = pd.DataFrame.from_dict(words_per_topic, orient='index')
df_keywords.head(12)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
-1,gebied,europees,goed,gaan,landbouw,komen,groot,geven,lid,duurzaam
0,landbouw,gebied,groot,goed,boer,lid,gaan,komen,geven,mogelijk
1,ecologisch,economisch,economie,ecoregeling,circulair,eco,activiteit,vast,ecosysteem,klimaat
2,biologisch,biomassa,biogrondstoffen,landbouw,houtig,product,biotechnologie,duurzaam,lid,europees
3,toelichting,toezegging,toepassing,toekomst,toepassen,toename,toerisme,toekomstvisie,toestaan,toelichten
4,park,westerpark,stad,tuinpark,groen,rust,haven,fietsroute,druk,doorgaan
5,hectare,provincie,nnn,natura,bossenstrategie,grasland,gebied,natuurnetwerk,realiseren,inrichten
6,plan,project,planning,komen,geven,aanpak,groen,waar,gaan,ambitie
7,instrument,instrumentarium,fiscaal,fiscaliteit,evaluatie,regeling,financieel,doel,beleid,provincie
8,financiering,financieel,investering,financieren,sector,instelling,privaat,mobiliseren,investeren,beleidsagenda


In [30]:
my_model.visualize_topics()

In [31]:
# Get the topic of each paragraph
topics, _ = my_model.transform(newDF["corp"])

In [32]:
# Add the topic as a new column to your dataframe
newDF["topic"] = topics

In [33]:
# map the dict with keywords per topic to the final dataframe
newDF['keywords'] = newDF['topic'].map(words_per_topic)
newDF

Unnamed: 0,title,paragraph_num,paragraph_text,organizations_in_paragraph,actors_in_paragraph,date_published,organizations_list,one_or_more_organizations,numb_unique_organizations,actors_list,one_or_more_actors,numb_unique_actors,both_org_act,content,corp,topic,keywords
0,Contaminatie in het vlees van ‘grote grazers’ ...,4,De Voorzitter van de Tweede Kamer der Staten-G...,[],[],2022-12-14 00:00:00,[''],,1,[''],,1,False,[staat],staat,-1,"[gebied, europees, goed, gaan, landbouw, komen..."
1,Contaminatie in het vlees van ‘grote grazers’ ...,7,: Parnassusplein 5,[],[],2022-12-14 00:00:00,[''],,1,[''],,1,False,[parnassusplein],parnassusplein,0,"[landbouw, gebied, groot, goed, boer, lid, gaa..."
2,Contaminatie in het vlees van ‘grote grazers’ ...,13,Correspondentie uitsluitend richten aan het re...,[],[],2022-12-14 00:00:00,[''],,1,[''],,1,False,"[correspondentie, uitsluitend, richten, vermel...",correspondentie uitsluitend richten vermelding,32,"[evaluatie, interview, doelmatigheid, conclusi..."
3,Contaminatie in het vlees van ‘grote grazers’ ...,15,Betreft Contaminatie in het vlees van ‘grote g...,[],[],2022-12-14 00:00:00,[''],,1,[''],,1,False,"[betreffen, contaminatie, vlees, groot, grazer...",betreffen contaminatie vlees groot grazer uite...,-1,"[gebied, europees, goed, gaan, landbouw, komen..."
4,Contaminatie in het vlees van ‘grote grazers’ ...,17,"Geachte voorzitter, Ten behoeve van natuurbehe...",[],[],2022-12-14 00:00:00,[''],,1,[''],,1,False,"[geacht, behoeve, uiterwaarde, specifiek, robu...",geacht behoeve uiterwaarde specifiek robuust r...,-1,"[gebied, europees, goed, gaan, landbouw, komen..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83947,Verslag Landbouw- en Visserijraad 17 januari 2022,31,Visserijraad van 17 januari 2022.\nOp 1 januar...,[],[],2022-01-26 00:00:00,[''],,1,[''],,1,False,"[visserijraad, nemen, Frankrijk, voorzittersch...",visserijraad nemen Frankrijk voorzitterschap r...,-1,"[gebied, europees, goed, gaan, landbouw, komen..."
83948,Verslag Landbouw- en Visserijraad 17 januari 2022,34,Op 7 februari a.s. organiseert het Franse voor...,[],[],2022-01-26 00:00:00,[''],,1,[''],,1,False,"[organiseren, Frans, voorzitterschap, informee...",organiseren Frans voorzitterschap informeel la...,0,"[landbouw, gebied, groot, goed, boer, lid, gaa..."
83949,Verslag Landbouw- en Visserijraad 17 januari 2022,39,Visserijraad van december jl. is uw Kamer geïn...,[],[],2022-01-26 00:00:00,[''],,1,[''],,1,False,"[visserijraad, jl, ïnformeren, akkoord, vereni...",visserijraad jl ïnformeren akkoord verenigen k...,19,"[informatie, ïnformeren, platform, informeren,..."
83950,Verslag Landbouw- en Visserijraad 17 januari 2022,42,Zoals aan uw Kamer gemeld,[],[],2022-01-26 00:00:00,[''],,1,[''],,1,False,[melden],melden,-1,"[gebied, europees, goed, gaan, landbouw, komen..."


# End products data of this notebook:
### 1. newDF - dataframe of documents used for our PowerBI dashboard containing:
     a. split into paragraphs
     b. topics
     c. keywords
     d. actors
     e. organizations
     f. date
### 2. model - trained Bertopic model

In [34]:
newDF.to_csv('newDF_nneighbors15_ncomponents5_cluster_size120_unigram.csv', index=False)

In [39]:
final_data_format = pd.read_csv('newDF_nneighbors15_ncomponents5_cluster_size120_unigram.csv')

We can check the paragraphs of a single topic:

In [41]:
topic_1 = final_data_format.loc[final_data_format['topic'] == 1]
topic_1

Unnamed: 0,title,paragraph_num,paragraph_text,organizations_in_paragraph,actors_in_paragraph,date_published,organizations_list,one_or_more_organizations,numb_unique_organizations,actors_list,one_or_more_actors,numb_unique_actors,both_org_act,content,corp,topic,keywords
218,Verslag van een schriftelijk overleg met de mi...,9,De leden van de vaste commissie voor Economisc...,"['Economische Zaken en Klimaat', 'Landbouw, Na...",[],2022-02-09 00:00:00,"['Economische Zaken en Klimaat', 'Landbouw', '...",True,3,[''],,1,False,"['lid', 'vast', 'economisch', 'zaak', 'klimaat...",lid vast economisch zaak klimaat landbouw same...,1,"['ecologisch', 'economisch', 'economie', 'ecor..."
222,Verslag van een schriftelijk overleg met de mi...,14,De griffier van de vaste commissie voor Econom...,"['Economische Zaken en Klimaat', 'Landbouw, Na...",['Boer'],2022-02-09 00:00:00,"['Economische Zaken en Klimaat', 'Landbouw', '...",True,3,['Boer'],True,1,True,"['griffier', 'vast', 'economisch', 'zaak', 'kl...",griffier vast economisch zaak klimaat landbouw...,1,"['ecologisch', 'economisch', 'economie', 'ecor..."
224,Verslag van een schriftelijk overleg met de mi...,17,De leden van de vaste commissie voor Economisc...,"['Economische Zaken en Klimaat', 'Landbouw, Na...",[],2022-02-09 00:00:00,"['Economische Zaken en Klimaat', 'Landbouw', '...",True,3,[''],,1,False,"['lid', 'vast', 'economisch', 'zaak', 'klimaat...",lid vast economisch zaak klimaat landbouw bela...,1,"['ecologisch', 'economisch', 'economie', 'ecor..."
246,Verslag van een schriftelijk overleg met de mi...,49,"Zo ja, kunt u aangeven waarom u SATL niet de g...","['Economische Zaken en Klimaat', 'Landbouw, Na...","['Adegeest', 'van der Linden']",2022-02-09 00:00:00,"['Economische Zaken en Klimaat', 'Landbouw', '...",True,3,"['Adegeest', 'van der Linden']",True,2,True,"['aangeven', 'satl', 'vragen', 'gegeven', 'ver...",aangeven satl vragen gegeven verstrekken reden...,1,"['ecologisch', 'economisch', 'economie', 'ecor..."
248,Verslag van een schriftelijk overleg met de mi...,51,Aan de Voorzitter van de Eerste Kamer der Stat...,"['Economische Zaken en Klimaat', 'Landbouw, Na...",[],2022-02-09 00:00:00,"['Economische Zaken en Klimaat', 'Landbouw', '...",True,3,[''],,1,False,"['één', 'staat', 'lid', 'vast', 'economisch', ...",één staat lid vast economisch zaak klimaat lan...,1,"['ecologisch', 'economisch', 'economie', 'ecor..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82701,Antwoord op vragen van het lid Bouchallikh ove...,41,"– gelet op de consequenties voor gezondheid, c...",[],[],2022-12-16 00:00:00,[''],,1,[''],,1,False,"['let', 'consequentie', 'cultureel', 'erfgoed'...",let consequentie cultureel erfgoed economie,1,"['ecologisch', 'economisch', 'economie', 'ecor..."
82718,Vaststelling van de begrotingsstaten van het M...,38,Van dit overleg brengen de commissies bijgaand...,"['Landbouw, Natuur en Voedselkwaliteit', 'Land...","['Geurts', 'Mulder', 'Jansma']",2022-04-29 00:00:00,"['Landbouw', 'Natuur en Voedselkwaliteit', 'La...",True,2,"['Geurts', 'Mulder', 'Jansma']",True,3,True,"['overleg', 'brengen', 'commissie', 'bijgaand'...",overleg brengen commissie bijgaand redigeren w...,1,"['ecologisch', 'economisch', 'economie', 'ecor..."
82748,Vaststelling van de begrotingsstaten van het M...,243,Het CDA ziet die grote uitdagingen.\nNiet alle...,[],[],2022-04-29 00:00:00,[''],,1,[''],,1,False,"['zien', 'groot', 'uitdaging', 'wennen', 'rand...",zien groot uitdaging wennen randvoorwaarde soc...,1,"['ecologisch', 'economisch', 'economie', 'ecor..."
82885,Voorlopig verslag,10,COMMISSIE VOOR ECONOMISCHE ZAKEN EN KLIMAAT/LA...,[],[],2021-03-16 00:00:00,[''],,1,[''],,1,False,"['economisch', 'zaak', 'klimaat', 'landbouw']",economisch zaak klimaat landbouw,1,"['ecologisch', 'economisch', 'economie', 'ecor..."


In [42]:
topic_1['paragraph_text'].to_list()

['De leden van de vaste commissie voor Economische Zaken en Klimaat / Landbouw, Natuur en Voedselkwaliteit 1 Samenstelling:',
 'De griffier van de vaste commissie voor Economische Zaken en Klimaat / Landbouw, Natuur en Voedselkwaliteit, De Boer BRIEF VAN DE VOORZITTER VAN DE VASTE COMMISSIE VOOR ECONOMISCHE ZAKEN EN KLIMAAT / LANDBOUW, NATUUR EN VOEDSELKWALITEIT',
 'De leden van de vaste commissie voor Economische Zaken en Klimaat / Landbouw, Natuur en Voedselkwaliteit hebben met belangstelling kennisgenomen van uw brief 3 Kamerstukken I, 2021–2022, 35\xa0334,\xa0AK.',
 'Zo ja, kunt u aangeven waarom u SATL niet de gevraagde gegevens verstrekt?\nBent u het eens met de redenering van Leon Adegeest dat zonder de invoer van stikstofgegevens de daadwerkelijke stikstofdepositie op de Natura 2000-gebieden vanuit Lelystad Airport niet berekend kan worden?\nDe leden van de vaste commissie voor Economische Zaken en Klimaat / Landbouw, Natuur en Voedselkwaliteit zien uw reactie met belangstellin