In [2]:
import gensim
import pandas as pd
import json
from collections import defaultdict
import re

# Location of the JSON settings file for the Dutch ("nl") run.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
settings_file = "D:/thesis/settings - nl.json"



In [3]:
#Preparation

#Read settings
# Open the file in a context manager so the handle is closed as soon as the
# JSON has been parsed (the previous open(...).read() never closed it).
# Only the top-level "settings" object is kept.
with open(settings_file) as settings_fh:
    settings = json.load(settings_fh)["settings"]

In [4]:
#Preparing corpus and dict for gensim LDA

#Read data
df = pd.read_csv(settings['data_csv'])

#Prepare data

# Punctuation characters to strip before tokenising.
# Written as a raw string: the former non-raw literal relied on invalid escape
# sequences (\^, \*, \-), which emit a SyntaxWarning on Python 3.12+.
# The character class itself is unchanged.
punct = re.compile(r'[.,/#!$%\^&\*;:{}=\-_`~()\'\"]')

# Lower-case each text, remove the punctuation above and split on whitespace.
# df['TEXT'] is overwritten with the resulting lists of tokens.
df['TEXT'] = df['TEXT'].apply(lambda text: punct.sub('', text.lower()).split())

# Token <-> integer id mapping built from all tokenised documents.
dictionary = gensim.corpora.Dictionary(df['TEXT'])

# Bag-of-words corpus: one list of (token_id, count) pairs per document.
corp = [dictionary.doc2bow(tokens) for tokens in df['TEXT']]

#Word and document frequencies per token id
# NOTE(review): this step was originally headed "Remove words that occur in
# many documents", but nothing is removed from `corp` or `dictionary` here;
# only the statistics are computed. Confirm whether a filtering step is missing.
word_freq = defaultdict(int)  # token id -> total occurrences over all documents
doc_freq = defaultdict(int)   # token id -> number of documents containing it

for document in corp:
    # Each document is a list of (token_id, count) pairs in which every token
    # id occurs at most once, so both counters can be updated in one pass.
    for token_id, count in document:
        word_freq[token_id] += count
        doc_freq[token_id] += 1

# Turn the raw document counts into the fraction of documents containing
# each token.
n_docs = len(corp)
for token_id in dictionary:
    doc_freq[token_id] = doc_freq[token_id] / n_docs

In [16]:
#Train the model
# LDA training is stochastic; a fixed random_state seeds the model so that
# re-running the notebook does not produce a different set of topics each time.
# NOTE(review): with several workers the result may still vary slightly between
# runs — confirm if exact reproducibility is required.
lda = gensim.models.ldamulticore.LdaMulticore(
    corpus=corp,          # bag-of-words documents
    id2word=dictionary,   # token id -> token mapping used to label topics
    num_topics=25,
    workers=3,
    iterations=5000,
    random_state=42,
)

In [15]:
# Display the highest-weighted words for up to 25 topics of the trained model.
lda.print_topics(num_topics=25)

[(0,
  '0.078*"de" + 0.033*"in" + 0.031*"van" + 0.028*"het" + 0.018*"europese" + 0.017*"een" + 0.014*"voor" + 0.012*"en" + 0.012*"dat" + 0.011*"unie"'),
 (1,
  '0.034*"in" + 0.034*"procent" + 0.034*"de" + 0.017*"dan" + 0.014*"van" + 0.012*"jaar" + 0.008*"euro" + 0.008*"vorig" + 0.008*"een" + 0.007*"met"'),
 (2,
  '0.044*"de" + 0.018*"dat" + 0.014*"investeringen" + 0.014*"in" + 0.012*"van" + 0.011*"europese" + 0.011*"het" + 0.010*"barnier" + 0.009*"bij" + 0.009*"boeren"'),
 (3,
  '0.083*"de" + 0.028*"het" + 0.027*"een" + 0.026*"van" + 0.026*"en" + 0.019*"is" + 0.019*"europese" + 0.018*"in" + 0.015*"dat" + 0.013*"unie"'),
 (4,
  '0.026*"kok" + 0.014*"mierlo" + 0.010*"charette" + 0.009*"belgie" + 0.008*"regeringsconferentie" + 0.007*"helsinki" + 0.007*"turijn" + 0.006*"santer" + 0.006*"igc" + 0.006*"grootbrittannie"'),
 (5,
  '0.088*"de" + 0.041*"van" + 0.030*"het" + 0.024*"een" + 0.018*"in" + 0.016*"is" + 0.015*"dat" + 0.014*"europese" + 0.011*"unie" + 0.011*"niet"'),
 (6,
  '0.011*"weu"