In [1]:
# Usual imports
import numpy as np
import pandas as pd
from tqdm import tqdm
import string
import matplotlib.pyplot as plt
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
import concurrent.futures
import time
import pyLDAvis.sklearn
from pylab import bone, pcolor, colorbar, plot, show, rcParams, savefig
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
import os



# spaCy based imports
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
!python -m spacy download nl_core_news_sm

C:\Users\krist\anaconda3\python.exe: No module named spacy


In [2]:
# Read the CSV export into `df`; the path is relative to the notebook's
# working directory.
# NOTE(review): the preview shows an "Unnamed: 0" column, presumably an index
# that was written out when the file was saved -- confirm, and consider
# `index_col=0` if so.
df = pd.read_csv('KPMG Tax Case - CSV_Summarized.csv')
# Display the first rows as a sanity check of the parsed columns.
df.head()

Unnamed: 0,Date,Title,Numac,Link FR,Link NL,Text,Cleaned Text,Summary_1,Summary_2
0,1/14/2020,REGION DE BRUXELLES-CAPITALE\nREGION DE BRUXEL...,2020010053,http://www.ejustice.just.fgov.be/cgi/article.p...,http://www.ejustice.just.fgov.be/cgi/article_b...,NL FR\nbelgiëlex.be - Kruispuntbank Wetgevi...,: 2020010053 BRUSSELS HOOFDSTEDELIJK GEWEST 8...,"Demissionair minister van Financiën, belast me...",
1,1/24/2020,MINISTERE DE LA COMMUNAUTE FRANCAISE\n20 DECEM...,2020010214,http://www.ejustice.just.fgov.be/cgi/article.p...,http://www.ejustice.just.fgov.be/cgi/article_b...,NL FR\n\neinde eerste woord laatste woord\nPub...,: 2020010214 MINISTERIE VAN DE FRANSE GEMEENS...,De Franse Gemeenschap en de Executieve van de ...,
2,1/28/2020,SERVICE PUBLIC FEDERAL FINANCES\n20 JANVIER 20...,2020040138,http://www.ejustice.just.fgov.be/cgi/article.p...,http://www.ejustice.just.fgov.be/cgi/article_b...,NL FR\nbelgiëlex.be - Kruispuntbank Wetgevi...,: 2020040138 FEDERALE OVERHEIDSDIENST FINANCI...,De ministers van Financiën en de staatssecreta...,
3,1/28/2020,SERVICE PUBLIC FEDERAL FINANCES\n20 JANVIER 20...,2020020094,http://www.ejustice.just.fgov.be/cgi/article.p...,http://www.ejustice.just.fgov.be/cgi/article_b...,NL FR\nbelgiëlex.be - Kruispuntbank Wetgevi...,: 2020020094 FEDERALE OVERHEIDSDIENST FINANCI...,Het ministerie van Financiën heeft het ministe...,
4,1/28/2020,SERVICE PUBLIC FEDERAL FINANCES\nAdministratio...,2020010193,http://www.ejustice.just.fgov.be/cgi/article.p...,http://www.ejustice.just.fgov.be/cgi/article_b...,NL FR\n\neinde eerste woord laatste woord\nPub...,: 2020010193 FEDERALE OVERHEIDSDIENST FINANCI...,,


In [5]:
# Replace every missing value with an empty string, so the text columns
# contain only str values.
df = df.fillna('')

In [6]:
# Load the Dutch pipeline and its stop-word set ONCE. Both are loop-invariant,
# and re-loading the model for every row dominates the runtime of this cell.
# `nlp` is kept as a notebook-level name because later cells reuse it.
nlp = spacy.load('nl_core_news_sm')
stopwords = spacy.lang.nl.stop_words.STOP_WORDS

# Tokenise each cleaned document, drop stop words and re-join with spaces.
# The result is collected in a list and assigned in one go instead of writing
# cell-by-cell through `df.loc`.
# NOTE(review): tokens are surface forms (`token.text`), not lemmas, and the
# stop-word test is case-sensitive, so capitalised stop words are kept --
# confirm this is intended.
tag_texts = []
for text in df['Cleaned Text']:
    tokens = [token.text for token in nlp(text)]
    tag_texts.append(' '.join(tok for tok in tokens if tok not in stopwords))

df['Tag text'] = tag_texts

In [7]:
# Run the pipeline on the stop-word-free text of the row labelled 3 and
# render its named entities inline in the notebook.
sample_text = df.loc[3, "Tag text"]
doc = nlp(sample_text)
spacy.displacy.render(doc, jupyter=True, style='ent')

In [8]:
# Persist the frame (including the new "Tag text" column) in two formats,
# without writing the DataFrame index.
df.to_excel("cleaned_test.xlsx", index=False)
# "utf-8-sig" prepends a byte-order mark to the CSV -- presumably so that
# spreadsheet tools detect the encoding of the accented characters; confirm.
df.to_csv("cleaned_test.csv", index=False, encoding="utf-8-sig")

In [14]:
# TF-IDF document-term matrix

from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level TF-IDF features; all other settings are scikit-learn defaults.
tfidfvectorizer = TfidfVectorizer(analyzer='word')

# Learn the vocabulary from the stop-word-free texts and convert the
# documents into a sparse (n_documents, n_terms) matrix.
tfidf_wm = tfidfvectorizer.fit_transform(df['Tag text'])

print(tfidf_wm.shape)


(662, 20929)


In [15]:
# Number of topics/components requested from each of the LDA, NMF and LSI
# models fitted below.
NUM_TOPICS = 10

In [16]:
# Latent Dirichlet Allocation Model
# `random_state` pins the stochastic online updates so the topics are
# reproducible after a kernel restart.
# NOTE(review): LDA is fitted on TF-IDF weights here; it is usually applied to
# raw term counts (CountVectorizer is already imported) -- confirm intent.
lda = LatentDirichletAllocation(
    n_components=NUM_TOPICS,
    max_iter=10,
    learning_method='online',
    random_state=42,
    verbose=True,
)
data_lda = lda.fit_transform(tfidf_wm)

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10


In [17]:
# Non-Negative Matrix Factorization Model
# `random_state` fixes the randomised parts of the initialisation/solver so
# the factorisation is reproducible after a kernel restart.
nmf = NMF(n_components=NUM_TOPICS, random_state=42)
data_nmf = nmf.fit_transform(tfidf_wm)

In [18]:
# Latent Semantic Indexing Model using Truncated SVD
# The default solver is randomised; `random_state` makes the components
# reproducible after a kernel restart.
lsi = TruncatedSVD(n_components=NUM_TOPICS, random_state=42)
data_lsi = lsi.fit_transform(tfidf_wm)

In [19]:
def selected_topics(model, vectorizer, top_n=10):
    """Print the ``top_n`` highest-weighted terms of every topic in ``model``.

    Parameters
    ----------
    model
        Fitted decomposition exposing ``components_``: an iterable of 1-D
        arrays holding one weight per vocabulary term (one array per topic).
    vectorizer
        Fitted vectorizer exposing ``get_feature_names_out()``; the legacy
        ``get_feature_names()`` is used as a fallback when the newer method
        is not available.
    top_n : int, default 10
        Number of terms printed per topic, in descending weight order.

    Returns
    -------
    None
        For each topic a ``Topic <idx>:`` header and a list of
        ``(term, weight)`` tuples are written to stdout.
    """
    # Resolve the vocabulary once. The original called get_feature_names()
    # again for every printed term, and that method no longer exists in
    # scikit-learn >= 1.2.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        # argsort is ascending; the reversed tail slice yields the indices of
        # the top_n largest weights, largest first.
        top_indices = topic.argsort()[:-top_n - 1:-1]
        print([(feature_names[i], topic[i]) for i in top_indices])

In [20]:
# Keywords for topics clustered by Latent Dirichlet Allocation
# The fitted TF-IDF vectorizer supplies the term for each component index;
# the call previously omitted this required argument.
print("LDA Model:")
selected_topics(lda, tfidfvectorizer)

LDA Model:
Topic 0:
[('2020', 0.14179453026788905), ('bel', 0.13505738861359554), ('nace', 0.13441165345448353), ('code', 0.12862033572749287), ('overeenkomst', 0.12647226743970877), ('besluit', 0.12647093826597902), ('artikel', 0.12532189402951327), ('koninklijk', 0.12477619019517609), ('gelet', 0.12272894541724493), ('19', 0.12181409099928825)]
Topic 1:
[('relatives', 0.437075217998743), ('modifications', 0.4167647881154457), ('titre', 0.39370244012035843), ('ajoutee', 0.18691235448887164), ('confirmation', 0.18611173287188754), ('enregistrement', 0.18458708855188585), ('greffe', 0.18441543557707293), ('divers', 0.18407364166119708), ('arrete', 0.1838190344385638), ('valeur', 0.18334515832649745)]
Topic 2:
[('31122021', 0.5264898734354219), ('1012021', 0.5259777439854704), ('ingraving', 0.37181135772326934), ('804', 0.3394320970155201), ('738', 0.2693299847772257), ('607', 0.2651789993141868), ('336', 0.2568606055355108), ('542', 0.22962457899265418), ('673', 0.22370679364741167), ('

In [21]:
# Keywords for topics clustered by Non-Negative Matrix Factorization
print("NMF Model:")
selected_topics(nmf, tfidfvectorizer)

NMF Model:
Topic 0:
[('2020', 0.8287381780896311), ('koninklijk', 0.7539954346340891), ('besluit', 0.6477211357507594), ('gelet', 0.5738841752955748), ('covid', 0.5695645270746533), ('19', 0.5515411284315654), ('artikel', 0.4963758234238924), ('maatregelen', 0.49610564221999276), ('wet', 0.493910499777956), ('coronavirus', 0.42919595876257094)]
Topic 1:
[('franse', 1.086569535258148), ('gemeenschap', 1.080715028999917), ('regering', 0.5378017744393051), ('2020', 0.32584344479776706), ('onderwijs', 0.3042807081007697), ('bijzondere', 0.29333135605952926), ('machten', 0.2757197296524651), ('decreet', 0.2706490501459327), ('ambtenarenzaken', 0.25345411159343756), ('promotie', 0.23679822139165269)]
Topic 2:
[('akkoord', 0.8202654356889039), ('bevoegde', 0.5487710644367642), ('autoriteiten', 0.5407926888099067), ('belgië', 0.5210108735237939), ('onderling', 0.5163452803920507), ('overeenkomst', 0.4918735911165889), ('2020', 0.44935157386429), ('luxemburg', 0.34190166760335844), ('overleg', 

In [22]:
# Keywords for topics clustered by Latent Semantic Indexing (Truncated SVD)
print("LSI Model:")
selected_topics(lsi, tfidfvectorizer)

LSI Model:
Topic 0:
[('2020', 0.25176443194233555), ('besluit', 0.22365445769494988), ('artikel', 0.21440110211796196), ('gelet', 0.20391690671228793), ('19', 0.1837195152548483), ('regering', 0.18242355790157552), ('covid', 0.17449653586628358), ('2021', 0.16756424461416844), ('overwegende', 0.13936774991794285), ('gemeenschap', 0.13465952748737206)]
Topic 1:
[('gemeenschap', 0.4624840445280653), ('franse', 0.4529611534163234), ('regering', 0.24807113813336576), ('onderwijs', 0.1370597950515426), ('decreet', 0.1335864523418038), ('machten', 0.13196475319823955), ('bijzondere', 0.13080267830203587), ('ambtenarenzaken', 0.1118268887000394), ('promotie', 0.10161317978075539), ('jeugd', 0.08588954303405519)]
Topic 2:
[('akkoord', 0.33968508821256965), ('bevoegde', 0.22752684698971376), ('autoriteiten', 0.22521447810667175), ('onderling', 0.21599892593233802), ('belgië', 0.21218108943734937), ('overeenkomst', 0.20847508812606907), ('2020', 0.16841733212653828), ('gemeenschap', 0.1457058732