# Topic model

In [13]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
%matplotlib inline

In [14]:
# NLTK stop words: the Dutch stop-word list, used further down by
# remove_stopwords() to filter tokens.
# NOTE(review): presumably needs the NLTK 'stopwords' corpus to be downloaded
# beforehand (nltk.download('stopwords')) — confirm on a fresh environment.
from nltk.corpus import stopwords
stop_words = stopwords.words('dutch')

In [15]:
# Data preprocessing
# Load all utterances; the cells below rely on the columns `agecat` and `speech`.
df = pd.read_excel('all_speech.xlsx')

# Split the utterances by speaker age category.
child = df[df.agecat == 'child']
adult = df[df.agecat == 'adult']

# Documents for the topic model (swap `child` for `adult` to model adult speech).
# Empty speech cells are read as NaN; without the fillna they would later be
# stringified during tokenization and enter the vocabulary as the token "nan".
# fillna keeps one entry per row, so `data` stays aligned with the source frame.
data = child.speech.fillna('').values.tolist() # or: adult
pprint(data[:3])

       Unnamed: 0            speaker agecat  \
40             40  capiteinlatechild  child   
57             57            meisjes  child   
70             70          capitein8  child   
71             71          capitein8  child   
75             75          capitein8  child   
79             79         capitein10  child   
396           396            jongens  child   
397           397             jongen  child   
400           400             jongen  child   
403           403             jongen  child   
404           404             jongen  child   
1255         1255           jonathan  child   
1256         1256           jonathan  child   
1257         1257           jonathan  child   
1258         1258           jonathan  child   
1259         1259           jonathan  child   
1260         1260           jonathan  child   
1261         1261           jonathan  child   
1262         1262           jonathan  child   
1263         1263           jonathan  child   
1264         

In [16]:
def sent_to_words(sentences):
    """Lazily tokenize every sentence into a list of word tokens.

    Each item of ``sentences`` is converted with ``str()`` and handed to
    ``gensim.utils.simple_preprocess``. ``deacc=True`` asks gensim to strip
    accent marks from the tokens (punctuation is already discarded by the
    tokenizer itself, independent of this flag).

    Yields:
        One token list per input sentence, in input order.
    """
    yield from (
        gensim.utils.simple_preprocess(str(text), deacc=True)
        for text in sentences
    )

# Materialize the generator so the tokenized documents can be indexed and reused.
data_words = list(sent_to_words(data))

# Preview the first three tokenized documents.
print(data_words[:3])

[['wat', 'is', 'er', 'wat', 'doen', 'ze'], ['als', 'je', 'je', 'moeder', 'aan', 'het', 'huilen', 'brengt', 'zo', 'erg', 'dat', 'haar', 'tranen', 'op', 'haar', 'borst', 'vallen', 'zou', 'het', 'niet', 'vreemd', 'zijn', 'als', 'je', 'ziek', 'wordt', 'en', 'sterft'], ['agya', 'ee']]


In [20]:
# Function for stop-word removal
def remove_stopwords(texts):
    """Return every document as a token list with stop words removed.

    Args:
        texts: iterable of documents. A document may be an already-tokenized
            list/tuple of words (as produced by ``sent_to_words``), which is
            filtered as-is, or any other object, which is tokenized with
            ``simple_preprocess(str(doc))`` first.

    Returns:
        list[list[str]]: one token list per input document, in input order,
        without the words contained in the module-level ``stop_words``.
    """
    # Build the lookup once: set membership is O(1), whereas the stop-word
    # list would be scanned linearly for every single token.
    stop_set = set(stop_words)

    def _tokens(doc):
        # Token lists must not be round-tripped through str() +
        # simple_preprocess: that re-parses the list's repr, which is slow and
        # silently drops tokens outside simple_preprocess's length limits
        # (e.g. long bigram tokens).
        if isinstance(doc, (list, tuple)):
            return doc
        return simple_preprocess(str(doc))

    return [[word for word in _tokens(doc) if word not in stop_set] for doc in texts]

In [21]:
# Remove stop words
data_words_nostops = remove_stopwords(data_words)

# Create Dictionary (assigns an integer id to every unique token)
id2word = corpora.Dictionary(data_words_nostops)
print(id2word)

# Documents to vectorize; the bag-of-words corpus itself is built from `texts` in a later cell.
# NOTE(review): `data_words_bigrams` is not defined in the visible cells — confirm before switching.
texts = data_words_nostops # or: data_words_bigrams

Dictionary(4509 unique tokens: ['borst', 'brengt', 'erg', 'huilen', 'moeder']...)


In [24]:
# Term frequency: one bag-of-words list of (token_id, count) pairs per document
corpus = list(map(id2word.doc2bow, texts))
print(corpus[:5])

[[], [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1)], [(10, 1), (11, 1)], [(12, 1)], [(10, 1), (11, 1), (12, 1), (13, 1), (14, 2), (15, 1)]]


In [26]:
# Human-readable view of the first ten documents: (token, count) pairs instead of ids
[[(id2word[token_id], count) for token_id, count in bow] for bow in corpus[:10]]

[[],
 [('borst', 1),
  ('brengt', 1),
  ('erg', 1),
  ('huilen', 1),
  ('moeder', 1),
  ('sterft', 1),
  ('tranen', 1),
  ('vallen', 1),
  ('vreemd', 1),
  ('ziek', 1)],
 [('agya', 1), ('ee', 1)],
 [('vader', 1)],
 [('agya', 1),
  ('ee', 1),
  ('vader', 1),
  ('nee', 1),
  ('nooit', 2),
  ('oboroni', 1)],
 [('gebleven', 1), ('mensen', 1), ('waar', 1)],
 [('kapitaan', 1), ('later', 2), ('osofo', 1)],
 [('efua', 1), ('halen', 1), ('manu', 1), ('zoekt', 1)],
 [('wilt', 1), ('wonen', 1)],
 []]

In [27]:
# LDA model
# Train the topic model on the bag-of-words corpus. Parameter notes below
# follow the gensim LdaModel documentation.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,           # bag-of-words documents
                                           id2word=id2word,         # id -> token mapping used to label topics
                                           num_topics=5,            # number of topics to extract
                                           random_state=100,        # fixed seed so runs are repeatable
                                           update_every=1,          # online training (0 would mean batch learning)
                                           chunksize=200,           # documents per training chunk
                                           passes=2,                # passes over the whole corpus; NOTE(review): low — confirm topics are stable
                                           alpha='auto',            # learn the document-topic prior from the corpus
                                           per_word_topics=True)    # also compute the most likely topics per word

In [28]:
# Print the top keywords of every topic (the model is trained with num_topics=5)
pprint(lda_model.print_topics())
# Apply the trained model to the corpus; `doc_lda` is not used in the visible cells.
doc_lda = lda_model[corpus]

[(0,
  '0.048*"niks" + 0.041*"vroeg" + 0.031*"ga" + 0.028*"nee" + 0.024*"hoor" + '
  '0.022*"waarom" + 0.018*"vader" + 0.018*"net" + 0.017*"riep" + 0.017*"zag"'),
 (1,
  '0.082*"we" + 0.029*"nou" + 0.028*"waar" + 0.027*"goed" + 0.026*"weer" + '
  '0.018*"wist" + 0.014*"kom" + 0.013*"heen" + 0.013*"weg" + 0.012*"zit"'),
 (2,
  '0.069*"meester" + 0.037*"moeder" + 0.032*"zeg" + 0.029*"weet" + '
  '0.019*"mamma" + 0.016*"ie" + 0.014*"zeggen" + 0.014*"gaat" + 0.014*"ging" + '
  '0.013*"elkaar"'),
 (3,
  '0.051*"zegt" + 0.042*"wel" + 0.027*"caro" + 0.024*"jij" + 0.022*"mimoen" + '
  '0.019*"ogen" + 0.015*"misschien" + 0.014*"terug" + 0.013*"moeten" + '
  '0.013*"mag"'),
 (4,
  '0.109*"zei" + 0.045*"spiek" + 0.034*"opa" + 0.034*"oma" + 0.023*"keek" + '
  '0.021*"gaan" + 0.019*"polleke" + 0.014*"nooit" + 0.012*"huis" + '
  '0.011*"kijkt"')]


In [29]:
# Visualize the topics
# NOTE(review): newer pyLDAvis releases expose this module as
# `pyLDAvis.gensim_models` instead of `pyLDAvis.gensim` — confirm against the
# installed version.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
# Bare last expression so the notebook renders the interactive view.
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
