In [1]:
# Usual imports
import numpy as np
import pandas as pd
from tqdm import tqdm
import string
import matplotlib.pyplot as plt
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
import concurrent.futures
import time
import pyLDAvis.sklearn
from pylab import bone, pcolor, colorbar, plot, show, rcParams, savefig
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
import os



# spaCy based imports
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
!python -m spacy download nl_core_news_sm

C:\Users\krist\anaconda3\python.exe: No module named spacy


In [2]:
df = pd.read_csv('cleaned_test.csv')
df.head()

Unnamed: 0,Date,Title,Numac,Link FR,Link NL,Text,Cleaned Text,Summary_1,Summary_2,Tag text
0,1/14/2020,REGION DE BRUXELLES-CAPITALE\nREGION DE BRUXEL...,2020010053,http://www.ejustice.just.fgov.be/cgi/article.p...,http://www.ejustice.just.fgov.be/cgi/article_b...,NL FR\nbelgiëlex.be - Kruispuntbank Wetgevi...,: 2020010053 BRUSSELS HOOFDSTEDELIJK GEWEST 8...,"Demissionair minister van Financiën, belast me...",,: 2020010053 BRUSSELS HOOFDSTEDELIJK GEWEST ...
1,1/24/2020,MINISTERE DE LA COMMUNAUTE FRANCAISE\n20 DECEM...,2020010214,http://www.ejustice.just.fgov.be/cgi/article.p...,http://www.ejustice.just.fgov.be/cgi/article_b...,NL FR\n\neinde eerste woord laatste woord\nPub...,: 2020010214 MINISTERIE VAN DE FRANSE GEMEENS...,De Franse Gemeenschap en de Executieve van de ...,,: 2020010214 MINISTERIE VAN DE FRANSE GEMEEN...
2,1/28/2020,SERVICE PUBLIC FEDERAL FINANCES\n20 JANVIER 20...,2020040138,http://www.ejustice.just.fgov.be/cgi/article.p...,http://www.ejustice.just.fgov.be/cgi/article_b...,NL FR\nbelgiëlex.be - Kruispuntbank Wetgevi...,: 2020040138 FEDERALE OVERHEIDSDIENST FINANCI...,De ministers van Financiën en de staatssecreta...,,: 2020040138 FEDERALE OVERHEIDSDIENST FINANC...
3,1/28/2020,SERVICE PUBLIC FEDERAL FINANCES\n20 JANVIER 20...,2020020094,http://www.ejustice.just.fgov.be/cgi/article.p...,http://www.ejustice.just.fgov.be/cgi/article_b...,NL FR\nbelgiëlex.be - Kruispuntbank Wetgevi...,: 2020020094 FEDERALE OVERHEIDSDIENST FINANCI...,Het ministerie van Financiën heeft het ministe...,,: 2020020094 FEDERALE OVERHEIDSDIENST FINANC...
4,1/28/2020,SERVICE PUBLIC FEDERAL FINANCES\nAdministratio...,2020010193,http://www.ejustice.just.fgov.be/cgi/article.p...,http://www.ejustice.just.fgov.be/cgi/article_b...,NL FR\n\neinde eerste woord laatste woord\nPub...,: 2020010193 FEDERALE OVERHEIDSDIENST FINANCI...,,,: 2020010193 FEDERALE OVERHEIDSDIENST FINANC...


In [10]:
df.fillna('', inplace=True)

In [6]:
df['Tag text']=""

for idx, row in df.iterrows():

    nlp = spacy.load('nl_core_news_sm')
    text = df.loc[idx, 'Cleaned Text']
    tokenized = nlp(text)
    tokens = [token.text for token in tokenized]

    stopwords = spacy.lang.nl.stop_words.STOP_WORDS

    text_no_stop = [lemma for lemma in tokens if lemma not in stopwords]
    cleaned = ' '.join(text_no_stop)

    df.loc[idx,'Tag text'] = cleaned

In [7]:
doc = nlp(df["Tag text"][3])
spacy.displacy.render(doc, style='ent',jupyter=True)

In [8]:
df.to_excel("cleaned_test.xlsx", index=False)
df.to_csv("cleaned_test.csv", index=False, encoding="utf-8-sig")

In [25]:
# TfidfVectorizer 

from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd


tfidfvectorizer = TfidfVectorizer(analyzer='word')

# convert th documents into a matrix


tfidf_wm = tfidfvectorizer.fit_transform(df['Summary_1'])
tfidfvectorizer.get_feature_names_out()

print(tfidf_wm.shape)


(662, 2847)


In [26]:
NUM_TOPICS = 10

In [27]:
# Latent Dirichlet Allocation Model
lda = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, learning_method='online',verbose=True)
data_lda = lda.fit_transform(tfidf_wm)

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10


In [28]:
# Non-Negative Matrix Factorization Model
nmf = NMF(n_components=NUM_TOPICS)
data_nmf = nmf.fit_transform(tfidf_wm)

In [29]:
# Latent Semantic Indexing Model using Truncated SVD
lsi = TruncatedSVD(n_components=NUM_TOPICS)
data_lsi = lsi.fit_transform(tfidf_wm)

In [30]:
# Functions for printing keywords for each topic
def selected_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]])

In [31]:
# Keywords for topics clustered by Latent Dirichlet Allocation
print("LDA Model:")
selected_topics(lda, tfidfvectorizer)

LDA Model:
Topic 0:
[('kunstendecreet', 0.3096560232349893), ('60', 0.30950662179854055), ('budget', 0.30949704945388845), ('indiendatum', 0.30942431807952003), ('immunoassay', 0.3094026165138431), ('voorschrift', 0.30864244476311237), ('vrijgemaakt', 0.30858330108337423), ('vergoedingsbasis', 0.30856364861163144), ('554982', 0.3085230380324343), ('beoordelingsronde', 0.3084039367718184)]
Topic 1:
[('grondwet', 5.243629527870734), ('hetgeen', 3.474277618501199), ('volgt', 3.148125863272838), ('aangenomen', 3.107679448228397), ('74', 3.097708343784169), ('volksvertegenwoordigers', 3.0235537700529527), ('bekrachtigen', 2.7507475273801005), ('wij', 2.696862790857278), ('kamer', 2.45548029799081), ('uit', 2.251510892800294)]
Topic 2:
[('2245bis', 5.750477604416362), ('mogen', 0.2993756898035802), ('zesde', 0.2992773707787663), ('talen', 0.29905799214920953), ('leger', 0.2989093755055768), ('1938', 0.2988320500831356), ('actief', 0.298826678123876), ('demissionairen', 0.2988229320861248), (

In [32]:
# Keywords for topics clustered by Latent Semantic Indexing
print("NMF Model:")
selected_topics(nmf, tfidfvectorizer)

NMF Model:
Topic 0:
[('van', 1.4763662388640975), ('besluit', 1.061648444023689), ('het', 0.7509423435087368), ('tot', 0.7468664239634208), ('koninklijk', 0.5531625016892053), ('wijziging', 0.5292825163957093), ('de', 0.5114217286713603), ('2020', 0.40406545274217226), ('houdende', 0.3812804296581353), ('december', 0.3331237138584261)]
Topic 1:
[('decreet', 0.7945975013907962), ('van', 0.6179051455482085), ('machten', 0.5930222139806036), ('bijzondere', 0.5777771484112199), ('toekenning', 0.5543738633506361), ('17', 0.5077912145959931), ('gezondheidscrisis', 0.4948120096911782), ('maart', 0.45380592907959255), ('het', 0.4055546745671994), ('aan', 0.3941144079759016)]
Topic 2:
[('zaken', 0.8685888840804307), ('minister', 0.6776660038154957), ('buitenlandse', 0.5897008407079276), ('van', 0.5295988959219371), ('demissionair', 0.4461077799946123), ('ministerie', 0.42132053674421127), ('en', 0.39917922686140145), ('volksgezondheid', 0.3850575353917065), ('sociale', 0.2971249637455836), ('we

In [22]:
# Keywords for topics clustered by Non-Negative Matrix Factorization
print("LSI Model:")
selected_topics(lsi, tfidfvectorizer)

LSI Model:
Topic 0:
[('2020', 0.25176443194233555), ('besluit', 0.22365445769494988), ('artikel', 0.21440110211796196), ('gelet', 0.20391690671228793), ('19', 0.1837195152548483), ('regering', 0.18242355790157552), ('covid', 0.17449653586628358), ('2021', 0.16756424461416844), ('overwegende', 0.13936774991794285), ('gemeenschap', 0.13465952748737206)]
Topic 1:
[('gemeenschap', 0.4624840445280653), ('franse', 0.4529611534163234), ('regering', 0.24807113813336576), ('onderwijs', 0.1370597950515426), ('decreet', 0.1335864523418038), ('machten', 0.13196475319823955), ('bijzondere', 0.13080267830203587), ('ambtenarenzaken', 0.1118268887000394), ('promotie', 0.10161317978075539), ('jeugd', 0.08588954303405519)]
Topic 2:
[('akkoord', 0.33968508821256965), ('bevoegde', 0.22752684698971376), ('autoriteiten', 0.22521447810667175), ('onderling', 0.21599892593233802), ('belgië', 0.21218108943734937), ('overeenkomst', 0.20847508812606907), ('2020', 0.16841733212653828), ('gemeenschap', 0.1457058732