# Tweede kamer Tensorflow Sentence Model

In [1]:
import tensorflow_text
import tensorflow_hub as hub

import pandas as pd
import numpy as np
import gensim
import plotly.express as px
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer

import umap

In [2]:
# Import universal sentence model, multilang ook voor Nederlands
embed_NL = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")

In [4]:
## check it....
embeddings = embed_NL([
    "Ik ga naar huis, doei.",
    "Nou blijf ik op tot 11 uur in de avond"]
)
embeddings

<tf.Tensor: id=14388, shape=(2, 512), dtype=float32, numpy=
array([[-0.07198761,  0.02842309, -0.03948487, ..., -0.04149985,
         0.04083857, -0.02191485],
       [-0.03685803,  0.00436619, -0.04363939, ...,  0.03421106,
         0.04059676, -0.06889848]], dtype=float32)>

## Import data

In [None]:
## put data chuncks in one zip file
!cat data/CorpusTweedeKamer* > CorpusTweedeKamer.zip

In [3]:
%%time
tweede_kamer = pd.read_csv("CorpusTweedeKamer.zip")
tweede_kamer = (
    tweede_kamer
    .assign(datum = pd.to_datetime(tweede_kamer.date))
    .assign(speaker = tweede_kamer.speaker.str.lower())
)

CPU times: user 13.2 s, sys: 890 ms, total: 14.1 s
Wall time: 14.4 s


In [4]:
tweede_kamer.shape

(1143366, 12)

In [8]:
tweede_kamer.sample(10)

Unnamed: 0,date,agenda,speechnumber,speaker,party,party.facts.id,chair,terms,text,parliament,iso3country,datum
274583,2001-12-06,,194,de wit,SP,1363.0,False,27,Maar moet dat niet vaststaan voordat wij hier ...,NL-TweedeKamer,NLD,2001-12-06
958394,2016-10-26,,520,de heer elias,VVD,828.0,False,2,Of houdstermaatschappij.,NL-TweedeKamer,NLD,2016-10-26
356298,2004-04-21,,508,van velzen,SP,1363.0,False,372,Voorzitter. De minister van LNV heeft per brie...,NL-TweedeKamer,NLD,2004-04-21
741743,2012-12-04,,121,anne mulder,VVD,828.0,False,372,"Dat gebeurt al, want sterftecijfers worden ges...",NL-TweedeKamer,NLD,2012-12-04
786064,2013-10-08,,221,klein,other,,False,79,Ik ben in dit verband even de weg kwijt. De he...,NL-TweedeKamer,NLD,2013-10-08
964470,2016-11-16,,276,de heer öztürk,other,,False,5,Dan moet u goed luisteren.,NL-TweedeKamer,NLD,2016-11-16
68623,1996-10-17,,1,rosenmöller,GL,1537.0,False,1352,Voorzitter! De Nederlandse politiek dreigt de ...,NL-TweedeKamer,NLD,1996-10-17
738776,2012-11-21,,291,verheijen,VVD,828.0,False,169,"Het staat verderop in mijn tekst, die ik dan n...",NL-TweedeKamer,NLD,2012-11-21
255890,2001-06-19,,118,voorzitter,other,,True,28,Deze motie is voorgesteld door de leden Passto...,NL-TweedeKamer,NLD,2001-06-19
1084326,2018-10-31,,1056,de heer klaver,GL,1537.0,False,59,Om half één 's nachts zitten er vast hordes me...,NL-TweedeKamer,NLD,2018-10-31


### Embded

In [5]:
recente_speeches = (
    tweede_kamer
    .dropna(subset = ["text"])
    .query('terms > 15')
    .query('terms < 1000')
    .query('datum > "2010-01-01"')
)

In [6]:
recente_speeches = recente_speeches.reset_index(drop = True)

In [7]:
recente_speeches

Unnamed: 0,date,agenda,speechnumber,speaker,party,party.facts.id,chair,terms,text,parliament,iso3country,datum
0,2010-01-12,,1,weekers,VVD,828.0,False,376,Voorzitter. In de week van de val van DSB heef...,NL-TweedeKamer,NLD,2010-01-12
1,2010-01-12,,2,bos,PvdA,1234.0,False,260,Voorzitter. De laatste conclusie van de heer W...,NL-TweedeKamer,NLD,2010-01-12
2,2010-01-12,,3,weekers,VVD,828.0,False,204,Het gaat ons er niet om dat de minister zich b...,NL-TweedeKamer,NLD,2010-01-12
3,2010-01-12,,4,bos,PvdA,1234.0,False,226,Ik zal die zorgen overbrengen aan de heer Sche...,NL-TweedeKamer,NLD,2010-01-12
4,2010-01-12,,5,tony van dijck,PVV,298.0,False,107,Ook de fractie van de Partij voor de Vrijheid ...,NL-TweedeKamer,NLD,2010-01-12
...,...,...,...,...,...,...,...,...,...,...,...,...
468196,2019-07-04,,963,staatssecretaris broekers-knol,,,False,290,Ik ga uit van de gegevens die wij hebben gekre...,NL-TweedeKamer,NLD,2019-07-04
468197,2019-07-04,,965,staatssecretaris broekers-knol,,,False,142,Dan de motie op stuk nr. 2519 van de heer Hidd...,NL-TweedeKamer,NLD,2019-07-04
468198,2019-07-04,,967,de voorzitter,,,True,62,Over exact 60 minuten gaan wij stemmen over de...,NL-TweedeKamer,NLD,2019-07-04
468199,2019-07-04,,968,mevrouw van toorenburg,CDA,1157.0,False,21,Toch nog even — misschien is het allemaal afge...,NL-TweedeKamer,NLD,2019-07-04


In [22]:
%%time

i=0
B = 1500
zz = recente_speeches.iloc[ (0 + (i*B)) : (i+1)*B, :]
out_emb = embed_NL( zz.text.str.lower().values ).numpy()

N = math.ceil(recente_speeches.shape[0] / B)
print(N)

for i in range(1, N) :

     print(i)

     zz = recente_speeches.iloc[ (0 + (i*B)) : (i+1)*B, :]
     
     tmp_emb = embed_NL( zz.text.str.lower().values ).numpy()
     out_emb = np.vstack((out_emb, tmp_emb))


1
2
CPU times: user 3min 9s, sys: 1min 2s, total: 4min 11s
Wall time: 44.7 s


In [25]:
out_emb.shape

(4500, 512)

In [48]:
embeddings_df = pd.DataFrame(TF_query_embeddings)

In [49]:
recente_speeches_met_emb = pd.concat([recente_speeches, embeddings_df], axis=1)

In [50]:
recente_speeches_met_emb

Unnamed: 0,date,agenda,speechnumber,speaker,party,party.facts.id,chair,terms,text,parliament,...,502,503,504,505,506,507,508,509,510,511
0,1998-12-01,,179,vliegenthart,PvdA,1234.0,False,17,Ik ben nog wel 20 à 25 minuten bezig.,NL-TweedeKamer,...,0.037875,-0.025647,-0.012416,0.014532,-0.025066,0.060782,0.015138,-0.100352,0.073999,0.020567
1,2017-12-20,,422,de voorzitter,,,True,36,Ik neem aan dat er geen bezwaar tegen bestaat ...,NL-TweedeKamer,...,-0.054891,-0.031692,-0.014315,-0.012349,0.006541,-0.041419,-0.073185,-0.053635,0.062346,-0.014302
2,2018-02-15,,123,de heer sjoerdsma,D66,45.0,False,34,"Voorzitter, nog één zin. Maar laat ik ook zegg...",NL-TweedeKamer,...,0.017628,-0.078403,-0.019696,-0.080405,-0.049069,0.029821,-0.024627,-0.000960,0.022202,0.080329
3,1995-11-02,,408,van de camp,CDA,1157.0,False,47,Voorzitter! Het uitwerken van die meerdere var...,NL-TweedeKamer,...,0.004823,0.001006,0.012556,-0.039923,-0.015621,0.006838,-0.053398,0.015160,0.018328,0.011306
4,2006-02-01,,269,de wit,SP,1363.0,False,348,Ik heb aandacht gevraagd voor de positie van d...,NL-TweedeKamer,...,0.015038,-0.020210,-0.013434,-0.066156,-0.064930,0.052278,-0.034463,-0.070829,-0.051718,0.076446
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,2015-10-01,,678,wiebes,other,,False,163,De constructie van de postcoderoos is uitgebre...,NL-TweedeKamer,...,0.019693,-0.032995,-0.046715,-0.078461,0.018016,0.028285,-0.015144,-0.027879,0.051958,0.090900
4996,1997-04-23,,85,remkes,VVD,828.0,False,117,"Ja, maar waar het mij om gaat, is dat er onder...",NL-TweedeKamer,...,0.084790,-0.035510,0.038430,-0.052308,-0.019230,-0.067377,0.044026,0.001222,-0.000705,0.060241
4997,1996-06-06,,52,van de vondervoort,PvdA,1234.0,False,497,Ik begrijp dat u over de tekst struikelt. Maar...,NL-TweedeKamer,...,0.076989,-0.064937,-0.014233,-0.040007,-0.040482,-0.063832,-0.025371,-0.072617,-0.038270,0.079579
4998,2001-10-18,,299,ten hoopen,CDA,1157.0,False,72,"Ik heb wel een standpunt, maar als dat had wil...",NL-TweedeKamer,...,0.023364,-0.022795,0.005160,-0.038095,0.050096,-0.036349,-0.007740,-0.008400,0.023201,0.103882


## Import already emnbedded speeches that we saved

In [2]:
import pickle
recente_speeches_met_emb = pickle.load(open("kamer_debatten_recente_speeches_met_emb.pck", "rb"))

In [3]:
recente_speeches_met_emb

Unnamed: 0,date,agenda,speechnumber,speaker,party,party.facts.id,chair,terms,text,parliament,...,502,503,504,505,506,507,508,509,510,511
0,2010-01-12,,1,weekers,VVD,828.0,False,376,Voorzitter. In de week van de val van DSB heef...,NL-TweedeKamer,...,0.053838,-0.041742,-0.032260,-0.067037,-0.022552,-0.023394,-0.065473,-0.069716,0.012569,0.062962
1,2010-01-12,,2,bos,PvdA,1234.0,False,260,Voorzitter. De laatste conclusie van de heer W...,NL-TweedeKamer,...,0.053762,-0.070165,-0.014723,-0.067824,-0.056783,-0.006215,-0.072602,-0.071131,-0.018375,0.080761
2,2010-01-12,,3,weekers,VVD,828.0,False,204,Het gaat ons er niet om dat de minister zich b...,NL-TweedeKamer,...,0.069975,-0.066717,-0.014065,-0.068072,-0.066792,0.006887,-0.068825,-0.071827,0.018592,0.074508
3,2010-01-12,,4,bos,PvdA,1234.0,False,226,Ik zal die zorgen overbrengen aan de heer Sche...,NL-TweedeKamer,...,0.082787,-0.077583,0.064040,-0.065484,-0.048932,-0.053128,-0.070319,-0.073950,0.039685,0.095769
4,2010-01-12,,5,tony van dijck,PVV,298.0,False,107,Ook de fractie van de Partij voor de Vrijheid ...,NL-TweedeKamer,...,-0.019395,-0.025600,-0.051116,-0.067381,-0.071851,-0.021703,-0.082825,-0.032184,-0.049826,0.088014
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
468192,2019-07-04,,963,staatssecretaris broekers-knol,,,False,290,Ik ga uit van de gegevens die wij hebben gekre...,NL-TweedeKamer,...,0.051056,-0.042262,-0.051334,-0.051260,-0.016387,-0.056653,-0.002518,-0.075303,-0.005929,0.087212
468193,2019-07-04,,965,staatssecretaris broekers-knol,,,False,142,Dan de motie op stuk nr. 2519 van de heer Hidd...,NL-TweedeKamer,...,0.059314,-0.056630,-0.031098,-0.041214,-0.001191,0.002085,0.038942,-0.011882,0.044419,0.059911
468194,2019-07-04,,967,de voorzitter,,,True,62,Over exact 60 minuten gaan wij stemmen over de...,NL-TweedeKamer,...,0.008260,-0.023593,0.078514,-0.011184,-0.030405,-0.023197,0.012507,-0.052127,0.023723,0.062293
468195,2019-07-04,,968,mevrouw van toorenburg,CDA,1157.0,False,21,Toch nog even — misschien is het allemaal afge...,NL-TweedeKamer,...,-0.073596,0.022221,-0.085983,-0.041239,0.000995,-0.014434,-0.021121,-0.051808,0.041230,0.124054


## UMAP


In [4]:
recente_speeches_met_emb.iloc[:,12:524]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
0,0.001517,-0.016863,-0.017720,0.036810,-0.043257,0.021977,0.004515,0.042213,-0.022001,0.004596,...,0.053838,-0.041742,-0.032260,-0.067037,-0.022552,-0.023394,-0.065473,-0.069716,0.012569,0.062962
1,0.016520,-0.032066,0.034188,0.010240,-0.082366,0.044600,0.028787,0.016408,0.026870,0.053727,...,0.053762,-0.070165,-0.014723,-0.067824,-0.056783,-0.006215,-0.072602,-0.071131,-0.018375,0.080761
2,0.033603,-0.006131,-0.034597,0.038439,-0.003401,0.022826,-0.005924,0.007572,0.009028,0.048081,...,0.069975,-0.066717,-0.014065,-0.068072,-0.066792,0.006887,-0.068825,-0.071827,0.018592,0.074508
3,0.043181,0.035323,-0.040279,-0.005309,-0.097467,-0.018668,0.027678,0.061460,0.013056,0.040457,...,0.082787,-0.077583,0.064040,-0.065484,-0.048932,-0.053128,-0.070319,-0.073950,0.039685,0.095769
4,-0.049212,0.035122,0.030433,0.015830,0.004795,0.014693,0.019232,0.015539,0.005376,0.013581,...,-0.019395,-0.025600,-0.051116,-0.067381,-0.071851,-0.021703,-0.082825,-0.032184,-0.049826,0.088014
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
468192,-0.028713,0.050518,-0.001817,-0.004725,-0.086307,0.043043,0.078066,-0.013987,-0.030064,0.025046,...,0.051056,-0.042262,-0.051334,-0.051260,-0.016387,-0.056653,-0.002518,-0.075303,-0.005929,0.087212
468193,-0.001933,0.037012,0.052258,0.016521,-0.051501,0.049920,0.035739,-0.053260,0.003325,0.012937,...,0.059314,-0.056630,-0.031098,-0.041214,-0.001191,0.002085,0.038942,-0.011882,0.044419,0.059911
468194,0.001891,0.018683,0.023671,-0.032840,-0.070723,0.032373,0.058054,0.002620,-0.002296,-0.069319,...,0.008260,-0.023593,0.078514,-0.011184,-0.030405,-0.023197,0.012507,-0.052127,0.023723,0.062293
468195,-0.017085,-0.013560,0.038470,-0.057001,0.062642,0.065041,-0.023146,0.025174,-0.066608,-0.052003,...,-0.073596,0.022221,-0.085983,-0.041239,0.000995,-0.014434,-0.021121,-0.051808,0.041230,0.124054


In [26]:
sample = (
    recente_speeches_met_emb
    .query("datum >= '2019-01-01'")
    .sample(15000)
)
sample = sample.reset_index(drop=True)

In [27]:
%%time
matrix = np.array(sample.iloc[:,12:524])
embedding2 = umap.UMAP(n_components = 2, metric = "cosine", n_neighbors = 5 ).fit_transform(matrix)

CPU times: user 25.4 s, sys: 358 ms, total: 25.7 s
Wall time: 24.3 s


In [28]:
tmp = pd.DataFrame(embedding2, columns=["x", "y"])
tweede_kamer_wv = pd.concat(
    [
        sample,
        tmp
    ],
     axis=1
)

In [29]:
tweede_kamer_wv

Unnamed: 0,date,agenda,speechnumber,speaker,party,party.facts.id,chair,terms,text,parliament,...,504,505,506,507,508,509,510,511,x,y
0,2019-06-04,,145,staatssecretaris blokhuis,,,False,150,In mijn argumentatie heb ik daar niks aan toe ...,NL-TweedeKamer,...,-0.034677,-0.082104,-0.043291,-0.066047,-0.020336,-0.098126,0.044645,0.090700,-0.839608,-1.984850
1,2019-06-19,,640,de heer van kent,SP,1363.0,False,80,"De manier waarop je pensioen opbouwt, is iets ...",NL-TweedeKamer,...,-0.044146,-0.027814,-0.008813,-0.013081,0.012956,0.005935,0.053902,0.094718,0.271573,-1.901651
2,2019-02-20,,467,minister bruins,,,False,88,Dan de verplichting van apothekers om cashontv...,NL-TweedeKamer,...,0.029843,-0.031528,-0.014502,0.008386,0.000288,-0.052191,-0.063031,0.077676,-1.021259,-4.329572
3,2019-03-27,,402,de heer futselaar,SP,1363.0,False,34,Met dat laaste ben ik het sowieso eens. Ik wil...,NL-TweedeKamer,...,0.025577,-0.076744,-0.015926,0.009394,-0.068499,-0.060727,-0.035302,0.102352,-2.793454,0.756622
4,2019-03-28,,327,minister rutte,,,False,279,"Ja, potentieel, maar dan kom je natuurlijk ook...",NL-TweedeKamer,...,-0.078285,-0.034149,0.031486,-0.050321,-0.013772,-0.068957,-0.011894,0.084279,-4.362110,-0.522307
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,2019-06-26,,721,de voorzitter,,,True,19,Ik heb ze voor me liggen. Dat is het amendemen...,NL-TweedeKamer,...,-0.005376,0.000433,0.064335,-0.027713,0.013759,0.009157,-0.007386,0.068720,1.251740,2.186965
14996,2019-03-06,,143,de voorzitter,,,True,23,"Nee, zo gaan we het niet doen, meneer Wiersma,...",NL-TweedeKamer,...,0.038230,0.000595,-0.019490,0.004661,0.003805,-0.001934,0.043724,0.045041,5.799386,1.076726
14997,2019-06-11,,131,mevrouw ouwehand,PvdD,1467.0,False,118,"Een laatste vraag, voorzitter. Ik heb in de ve...",NL-TweedeKamer,...,-0.064451,-0.052503,-0.040933,-0.018369,-0.051031,-0.071905,-0.010961,0.088036,0.573588,-2.040182
14998,2019-06-19,,427,de heer van aalst,PVV,298.0,False,136,Daar ben ik het zeker mee eens. Maar vooral al...,NL-TweedeKamer,...,-0.004263,-0.057630,-0.037909,-0.004886,0.034617,-0.072492,0.034394,0.084354,0.218147,-2.363330


In [30]:
plotdata = tweede_kamer_wv.dropna(subset=["speaker", "party"])

fig = px.scatter(
    plotdata,
    x = "x",
    y = "y",
    color = "party",
    hover_name = "text",
    width = 1900, height = 1200
)
fig