In [1]:
import gensim
import pandas as pd
import json
from collections import defaultdict
import re

# Path of the JSON settings file read by the notebook.
# NOTE: this is an absolute, machine-specific location; adjust it before
# running the notebook on another machine.
settings_file = "D:/thesis/settings - nl.json"



In [2]:
#Preparation

#Read settings
# A context manager closes the file handle as soon as the JSON is parsed;
# the previous bare open(...).read() never closed it explicitly.
with open(settings_file) as settings_fh:
    # Keep only the value stored under the top-level "settings" key.
    settings = json.load(settings_fh)["settings"]

In [65]:
#Preparing corpus and dict for gensim LDA

#Read data
# Load the documents from the path stored under settings['data_json'] and
# order the rows by index (sort_index() returns the sorted frame, so no
# in-place mutation is needed).
df = pd.read_json(settings['data_json']).sort_index()

#Prepare data
# Punctuation characters that are stripped before tokenising. A raw string is
# used so the regex escapes are not parsed as (invalid) Python string escapes;
# '^' and '*' are literal inside a character class and need no backslash.
punct = re.compile(r'[.,/#!$%^&*;:{}=\-_`~()\'"]')

# Every entry of df['TEXT'] is joined with spaces into a single string
# (assumes each entry is an iterable of strings -- TODO confirm), lower-cased,
# stripped of punctuation and split on whitespace into a list of tokens.
texts = [punct.sub('', ' '.join(text).lower()).split() for text in df['TEXT']]

# Build the gensim dictionary (token <-> integer id mapping) from the tokens.
dictionary = gensim.corpora.Dictionary(texts)

# Bag-of-words form of every document, produced by dictionary.doc2bow.
corp = [dictionary.doc2bow(text) for text in texts]

#Collect per-token statistics that the filtering step relies on:
#  word_freq[token_id] -> summed count of the token over all documents
#  doc_freq[token_id]  -> share of documents in which the token occurs
word_freq = defaultdict(int)
doc_freq = defaultdict(int)

for bow in corp:
    # First pass: add up the counts stored in this document's pairs.
    for token_id, count in bow:
        word_freq[token_id] += count
    # Second pass: one increment per distinct (token_id, count) pair.
    for token_id, _count in set(bow):
        doc_freq[token_id] += 1

# Turn the raw document counts into a fraction of the corpus size.
n_docs = len(corp)
for token_id in dictionary:
    doc_freq[token_id] = doc_freq[token_id] / n_docs

#Drop tokens that occur too rarely overall, or in too few / too many documents
min_count = 5            # minimum summed count a token needs to be kept
freq_range = (0.01,0.8)  # (lowest, highest) doc_freq value a token may have

# Ids of every token that violates at least one of the limits above.
drop_list = [key for key in word_freq
             if (word_freq[key] < min_count)
             or (doc_freq[key] < freq_range[0])
             or (doc_freq[key] > freq_range[1])]

# Set copy for constant-time membership tests; checking against the list
# scanned drop_list once for every (token, count) pair in the corpus.
drop_ids = set(drop_list)

# Remove the dropped ids from every bag-of-words document.
# NOTE(review): `dictionary` itself is not pruned here, so it still contains
# the dropped tokens -- confirm that this is intended.
corp = [[(word, freq) for word, freq in bow if word not in drop_ids]
        for bow in corp]

In [104]:
#Train the model
# Seed for the model's random number generator, so that re-running the
# notebook gives results that are as repeatable as possible (training with
# several workers may still introduce small run-to-run differences).
LDA_SEED = 42

# NOTE(review): in gensim, `iterations` bounds the per-document inference loop,
# whereas `passes` (left at its default here) sets the number of sweeps over
# the corpus -- confirm that raising `iterations` is the intended knob.
lda = gensim.models.ldamulticore.LdaMulticore(corpus = corp,
                                              id2word = dictionary,
                                              num_topics = 20,
                                              workers = 3,
                                              iterations= 10000,
                                              random_state = LDA_SEED)

In [105]:
# Translate the 25 (term id, score) pairs returned for topic 0 back into words.
[dictionary[term_id] for term_id, _score in lda.get_topic_terms(0, 25)]

['die',
 'niet',
 'zijn',
 'aan',
 'ook',
 'er',
 'om',
 'maar',
 'over',
 'als',
 'worden',
 'europa',
 'heeft',
 'tot',
 'bij',
 'nog',
 'landen',
 'dit',
 'meer',
 'moet',
 'door',
 'zal',
 'zich',
 'hebben',
 'kan']