# Tweede kamer Topic modeling

In [1]:
import pandas as pd
import numpy as np
import gensim
import plotly.express as px
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
import pickle 
import umap

import spacy
nlp = spacy.load("nl")

# Gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models.ldamodel import LdaModel
import pyLDAvis.gensim

  """


## Import data

In [None]:
## put data chuncks in one zip file
#!cat data/CorpusTweedeKamer* > CorpusTweedeKamer.zip

In [2]:
%%time
tweede_kamer = pd.read_csv("CorpusTweedeKamer.zip")
tweede_kamer = (
    tweede_kamer
    .assign(datum = pd.to_datetime(tweede_kamer.date))
    .assign(speaker = tweede_kamer.speaker.str.lower())
)

CPU times: user 12.9 s, sys: 942 ms, total: 13.8 s
Wall time: 14.1 s


In [3]:
tweede_kamer.shape

(1143366, 12)

In [4]:
tweede_kamer.sample(6)

Unnamed: 0,date,agenda,speechnumber,speaker,party,party.facts.id,chair,terms,text,parliament,iso3country,datum
617465,2010-05-19,,299,elias,VVD,828.0,False,54,Eerst een feitelijke correctie. Ook de VVD hee...,NL-TweedeKamer,NLD,2010-05-19
705660,2012-02-09,,438,karabulut,SP,1363.0,False,135,Het is interessant. In het wetsvoorstel en de ...,NL-TweedeKamer,NLD,2012-02-09
352051,2004-03-17,,273,vendrik,GL,1537.0,False,116,Dan moet ik concluderen dat het antwoord op mi...,NL-TweedeKamer,NLD,2004-03-17
505163,2008-03-27,,329,kamp,VVD,828.0,False,277,"Dit gaat over ook over politie, justitie en do...",NL-TweedeKamer,NLD,2008-03-27
444809,2006-09-28,,441,balkenende,CDA,1157.0,False,20,"U moet niet zeggen dat het onzin is, zo liggen...",NL-TweedeKamer,NLD,2006-09-28
162681,1998-11-19,,476,van den berg,SGP,1178.0,False,32,Ik vind het erg belangrijk dat het ministerie ...,NL-TweedeKamer,NLD,1998-11-19


## TOPIC Modeling with classical LDA

### selection of more recent speeches that are longer

In [7]:
recente_speeches = (
    tweede_kamer
    .query('datum > "2019-01-01"')
    .query('terms > 50')
    .query('terms < 1000')
)
recente_speeches.shape

(20148, 12)

In [8]:
recente_speeches.sample(7)

Unnamed: 0,date,agenda,speechnumber,speaker,party,party.facts.id,chair,terms,text,parliament,iso3country,datum
1114922,2019-03-06,,257,de heer özdil,GL,1537.0,False,92,"En de laatste motie. Motie De Kamer, gehoord d...",NL-TweedeKamer,NLD,2019-03-06
1125454,2019-04-16,,80,de heer van dam,CDA,1157.0,False,235,Wij hebben de laatste tijd in deze Kamer gereg...,NL-TweedeKamer,NLD,2019-04-16
1133702,2019-06-05,,605,mevrouw yeşilgöz-zegerius,VVD,828.0,False,71,Uiteindelijk hebben wij op 31 oktober tot diep...,NL-TweedeKamer,NLD,2019-06-05
1124649,2019-04-10,,696,minister schouten,,,False,949,"Dank u wel, voorzitter. Het klopt dat wij vana...",NL-TweedeKamer,NLD,2019-04-10
1126963,2019-04-18,,325,de heer jasper van dijk,SP,1363.0,False,723,Voorzitter. Er is veel onveiligheid in asielzo...,NL-TweedeKamer,NLD,2019-04-18
1110920,2019-02-13,,409,de heer madlener,PVV,298.0,False,176,"Dank u wel, voorzitter. Ik heb als ex-ondernem...",NL-TweedeKamer,NLD,2019-02-13
1130183,2019-05-15,,382,mevrouw agema,PVV,298.0,False,88,Ik kan toch de minister niet helemaal goed vol...,NL-TweedeKamer,NLD,2019-05-15


In [28]:
type(text)

str

In [None]:
remove punctuation....

In [25]:
test = nlp(recente_speeches.text.values[:10].tolist())

TypeError: Argument 'string' has incorrect type (expected str, got list)

### clean up stopwords

In [17]:
%%time 

from pprint import pprint
from collections import defaultdict

# remove common words and tokenize
nlstop = spacy.lang.nl.stop_words.STOP_WORDS
nlstop.update(['minister', 'voorzitter', 'heer' , 'mevrouw', 'kamer', 'vraag', 'nr.', 'dank'])
print(sorted(nlstop))

texts = [
    [word for word in document.lower().split() if word not in nlstop]
    for document in recente_speeches.text
]

["'t", 'aan', 'aangaangde', 'aangezien', 'achter', 'achterna', 'af', 'afgelopen', 'al', 'aldus', 'alhoewel', 'alle', 'allebei', 'alleen', 'allen', 'alles', 'als', 'altijd', 'ander', 'andere', 'anderen', 'anders', 'anderzijds', 'behalve', 'beide', 'beiden', 'ben', 'beneden', 'bent', 'bepaald', 'beter', 'betere', 'betreffende', 'bij', 'bijna', 'bijvoorbeeld', 'binnen', 'binnenin', 'boven', 'bovenal', 'bovendien', 'bovenstaand', 'buiten', 'daar', 'daarheen', 'daarin', 'daarna', 'daarnet', 'daarom', 'daarop', 'dan', 'dat', 'de', 'den', 'der', 'des', 'deze', 'dezelfde', 'dezen', 'die', 'dien', 'dikwijls', 'dit', 'doch', 'doen', 'doet', 'door', 'doorgaand', 'doorgaans', 'dus', 'echter', 'een', 'eens', 'eerder', 'eerst', 'eerste', 'eersten', 'effe', 'eigen', 'elk', 'elke', 'en', 'enige', 'enkel', 'enkele', 'enz', 'er', 'erdoor', 'etc', 'even', 'eveneens', 'evenwel', 'ff', 'gauw', 'ge', 'gedurende', 'geen', 'gegeven', 'gehad', 'geheel', 'gekund', 'geleden', 'gelijk', 'gemogen', 'geven', 'gewee

### create dict and corpus

In [18]:
%%time 

dictionary = corpora.Dictionary(texts)
print(dictionary)

#### filter out extremes: too little or too much frequencies.....
dictionary.filter_extremes(
    no_below = 10,
    no_above = 0.925
)

print(dictionary)

corpus = [dictionary.doc2bow(text) for text in texts]

Dictionary(101650 unique tokens: ['150', '2018', '2018,', '2019', '2030']...)
Dictionary(13232 unique tokens: ['150', '2018', '2018,', '2019', '2030']...)
CPU times: user 3.18 s, sys: 73.2 ms, total: 3.25 s
Wall time: 3.27 s


In [19]:
%%time
tweedekamer_lda = LdaModel(corpus, num_topics = 16, id2word = dictionary, passes = 10)

CPU times: user 1min 47s, sys: 1 s, total: 1min 48s
Wall time: 1min 49s


In [20]:
lda_topics = tweedekamer_lda.print_topics(num_topics = 16)
lda_topics

[(0,
  '0.013*"3" + 0.013*"staatssecretaris" + 0.009*"migranten" + 0.008*"sea-watch" + 0.008*"gaat" + 0.007*"nijboer" + 0.006*"jaar" + 0.006*"organisaties" + 0.006*"cda" + 0.006*"schip"'),
 (1,
  '0.028*"kinderen" + 0.023*"zorg" + 0.022*"ouders" + 0.018*"jongeren" + 0.013*"onderwijs" + 0.013*"vrouwen" + 0.012*"regeling" + 0.011*"jaar" + 0.010*"inspectie" + 0.009*"medische"'),
 (2,
  '0.030*"wet" + 0.019*"amendement" + 0.019*"raad" + 0.012*"wetsvoorstel" + 0.010*"state" + 0.010*"voorzitter." + 0.009*"dank" + 0.008*"gaat" + 0.006*"advies" + 0.006*"huidige"'),
 (3,
  '0.029*"nederland" + 0.024*"nederlandse" + 0.024*"europese" + 0.015*"landen" + 0.010*"—" + 0.009*"commissie" + 0.009*"voorzitter." + 0.009*"kabinet" + 0.007*"politieke" + 0.007*"box"'),
 (4,
  '0.027*"mensen" + 0.012*"goed" + 0.012*"gaat" + 0.009*"jaar" + 0.009*"staatssecretaris" + 0.008*"heel" + 0.008*"zorg" + 0.007*"groep" + 0.007*"zorgen" + 0.006*"voorzitter."'),
 (5,
  '0.026*"heel" + 0.016*"echt" + 0.015*"gaan" + 0.015*"

In [12]:
### save model corpus and dict
from gensim.test.utils import datapath

# Save model to disk.
temp_file = datapath("model")
tweedekamer_lda.save("lda_2ekamer_model")

pickle.dump(corpus, open("corpus_2ekamer.pck", "wb"))
pickle.dump(dictionary, open ("dict_2ekamer.pck", "wb"))

'/Users/lamlon/anaconda3/lib/python3.7/site-packages/gensim/test/test_data/model'

In [3]:
### Load model and corpus and dict
tweedekamer_lda = LdaModel.load("lda_2ekamer_model")

corpus = pickle.load( open("corpus_2ekamer.pck", "rb"))
dictionary = pickle.load( open ("dict_2ekamer.pck", "rb"))
vis_data = pickle.load(open("lda_visdata.pck","rb"))

In [6]:
pyLDAvis.enable_notebook(local = True)

In [15]:
%%time 
vis_data = pyLDAvis.gensim.prepare(tweedekamer_lda, corpus, dictionary)


CPU times: user 22.4 s, sys: 526 ms, total: 23 s
Wall time: 27.5 s


In [16]:
pickle.dump(vis_data, open("lda_visdata.pck", "wb"))

In [4]:
pyLDAvis.show(vis_data)


Note: if you're in the IPython notebook, pyLDAvis.show() is not the best command
      to use. Consider using pyLDAvis.display(), or pyLDAvis.enable_notebook().
      See more information at http://pyLDAvis.github.io/quickstart.html .

You must interrupt the kernel to end this command

Serving to http://127.0.0.1:8888/    [Ctrl-C to exit]
127.0.0.1 - - [08/Aug/2020 20:42:21] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [08/Aug/2020 20:42:22] "GET /LDAvis.css HTTP/1.1" 200 -
127.0.0.1 - - [08/Aug/2020 20:42:22] "GET /d3.js HTTP/1.1" 200 -
127.0.0.1 - - [08/Aug/2020 20:42:22] "GET /LDAvis.js HTTP/1.1" 200 -

stopping Server...


In [7]:
pyLDAvis.display(vis_data)

## Topic modeling with top2vec