In [1]:
import chardet
import pandas as pd

with open('./data/tax.csv', 'rb') as f:
    result = chardet.detect(f.read())  # or readline if the file is large


df = pd.read_csv('./data/tax.csv', encoding=result['encoding'])

df

Unnamed: 0,Date,Title,Numac,Link FR,Link NL,Text
0,1/14/2020,REGION DE BRUXELLES-CAPITALE\nREGION DE BRUXEL...,2020010053,http://www.ejustice.just.fgov.be/cgi/article.p...,http://www.ejustice.just.fgov.be/cgi/article_b...,NL FR\nbelgiëlex.be - Kruispuntbank Wetgevi...
1,1/16/2020,SERVICE PUBLIC FEDERAL INTERIEUR\n10 DECEMBRE ...,2020010044,http://www.ejustice.just.fgov.be/cgi/article.p...,http://www.ejustice.just.fgov.be/cgi/article_b...,NL FR DE\nELI - Navigatie systeem via een Euro...
2,1/16/2020,SERVICE PUBLIC FEDERAL FINANCES\n7 DECEMBRE 20...,2020040052,http://www.ejustice.just.fgov.be/cgi/article.p...,http://www.ejustice.just.fgov.be/cgi/article_b...,NL FR DE\nELI - Navigatie systeem via een Euro...
3,1/24/2020,MINISTERE DE LA COMMUNAUTE FRANCAISE\n20 DECEM...,2020010214,http://www.ejustice.just.fgov.be/cgi/article.p...,http://www.ejustice.just.fgov.be/cgi/article_b...,NL FR\n\neinde eerste woord laatste woord\nPub...
4,1/28/2020,SERVICE PUBLIC FEDERAL FINANCES\n20 JANVIER 20...,2020040138,http://www.ejustice.just.fgov.be/cgi/article.p...,http://www.ejustice.just.fgov.be/cgi/article_b...,NL FR\nbelgiëlex.be - Kruispuntbank Wetgevi...
...,...,...,...,...,...,...
764,4/4/2022,SERVICE PUBLIC FEDERAL FINANCES\n29 MARS 2021....,2022020551,http://www.ejustice.just.fgov.be/cgi/article.p...,http://www.ejustice.just.fgov.be/cgi/article_b...,NL FR DE\nELI - Navigatie systeem via een Euro...
765,4/4/2022,"SERVICE PUBLIC FEDERAL EMPLOI, TRAVAIL ET CONC...",2022201913,http://www.ejustice.just.fgov.be/cgi/article.p...,http://www.ejustice.just.fgov.be/cgi/article_b...,NL FR\nbelgiëlex.be - Kruispuntbank Wetgevi...
766,4/4/2022,SERVICE PUBLIC FEDERAL SECURITE SOCIALE\n23 MA...,2022031428,http://www.ejustice.just.fgov.be/cgi/article.p...,http://www.ejustice.just.fgov.be/cgi/article_b...,NL FR\nbelgiëlex.be - Kruispuntbank Wetgevi...
767,4/4/2022,GOUVERNEMENTS DE COMMUNAUTE ET DE REGION\nREGI...,40002126,http://www.ejustice.just.fgov.be/cgi/article.p...,http://www.ejustice.just.fgov.be/cgi/article_b...,NL


In [2]:
df = df[df["Text"].str.contains("Duitse vertaling")==False]

In [3]:
import nltk
nltk.download('stopwords') 
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mahboubeh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Mahboubeh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# Dataset
from sklearn.datasets import fetch_20newsgroups
# Data manipulation
import numpy as np
import pandas as pd
pd.options.display.max_colwidth = 100
# Text preprocessing and modelling
from nltk.tokenize import RegexpTokenizer
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.pipeline import Pipeline
# Visualisation
#import matplotlib.pyplot as plt
#import seaborn as sns
#sns.set(style='whitegrid', context='talk')
#from wordcloud import WordCloud
#import pyLDAvis
#import pyLDAvis.sklearn
# Warnings
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
# Stopwords
stop_words = stopwords.words("dutch")
stop_words.extend(['bis', 'NL', 'FR','artikel', "januari", "februari", "maart", "april", "mei", "juni", "juli", "augustus", "september", "oktober", "november", "december"])

In [5]:
def preprocess_text(document):
    """Preprocess document into normalised tokens."""
    # Tokenise words into alphabetic tokens with minimum length of 3
    tokeniser = RegexpTokenizer(r'[A-Za-z]{3,}')
    tokens = tokeniser.tokenize(document)
    
    # Tag words with POS tag
    pos_map = {'J': 'a', 'N': 'n', 'R': 'r', 'V': 'v'}
    pos_tags = pos_tag(tokens)
    
    # Lowercase and lemmatise 
    lemmatiser = WordNetLemmatizer()
    lemmas = [lemmatiser.lemmatize(t.lower(), pos=pos_map.get(p[0], 'v')) for t, p in pos_tags]
    
    # Remove stopwords
    keywords= [lemma for lemma in lemmas if lemma not in stop_words]
    return keywords

In [15]:
# raw documents to tf-idf matrix: 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
vectorizer = TfidfVectorizer(analyzer=preprocess_text, 
                             use_idf=True, 
                             smooth_idf=True)
# SVD to reduce dimensionality: 
svd_model = TruncatedSVD(n_components=2,       
                         algorithm='randomized',
                         n_iter=10)
# pipeline of tf-idf + SVD, fit to and applied to documents:
svd_transformer = Pipeline([('tfidf', vectorizer), 
                            ('svd', svd_model)])
svd_matrix = svd_transformer.fit_transform(df['Text'])

terms = vectorizer.get_feature_names() 
for i, comp in enumerate(svd_model.components_): 
    terms_comp = zip(terms, comp) 
    sorted_terms = sorted(terms_comp, key= lambda x:x[1],        
    reverse=True)[:6] 
    print("Topic "+str(i)+": ") 
    for t in sorted_terms: 
        print(t[0]) 
        print(" ")

Topic 0: 
besluit
 
regering
 
meet
 
gelet
 
covid
 
minister
 
Topic 1: 
gemeenschap
 
franse
 
regering
 
decreet
 
onderwijs
 
ambtenarenzaken
 
