# Tweede kamer Tensorflow Sentence Model

In [2]:
import tensorflow_text
import tensorflow_hub as hub

import pandas as pd
import numpy as np
import gensim
import plotly.express as px
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer

import umap

In [3]:
# Import universal sentence model, multilang ook voor Nederlands
embed_NL = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")

In [4]:
## check it....
embeddings = embed_NL([
    "Ik ga naar huis, doei.",
    "Nou blijf ik op tot 11 uur in de avond"]
)
embeddings

<tf.Tensor: id=14388, shape=(2, 512), dtype=float32, numpy=
array([[-0.07198761,  0.02842309, -0.03948487, ..., -0.04149985,
         0.04083857, -0.02191485],
       [-0.03685803,  0.00436619, -0.04363939, ...,  0.03421106,
         0.04059676, -0.06889848]], dtype=float32)>

## Import data

In [5]:
%%time
tweede_kamer = pd.read_csv("CorpusTweedeKamer.zip")
tweede_kamer = (
    tweede_kamer
    .assign(datum = pd.to_datetime(tweede_kamer.date))
    .assign(speaker = tweede_kamer.speaker.str.lower())
)

CPU times: user 15.7 s, sys: 1.43 s, total: 17.2 s
Wall time: 21 s


In [7]:
tweede_kamer.shape

(1143366, 12)

In [8]:
tweede_kamer.sample(10)

Unnamed: 0,date,agenda,speechnumber,speaker,party,party.facts.id,chair,terms,text,parliament,iso3country,datum
274583,2001-12-06,,194,de wit,SP,1363.0,False,27,Maar moet dat niet vaststaan voordat wij hier ...,NL-TweedeKamer,NLD,2001-12-06
958394,2016-10-26,,520,de heer elias,VVD,828.0,False,2,Of houdstermaatschappij.,NL-TweedeKamer,NLD,2016-10-26
356298,2004-04-21,,508,van velzen,SP,1363.0,False,372,Voorzitter. De minister van LNV heeft per brie...,NL-TweedeKamer,NLD,2004-04-21
741743,2012-12-04,,121,anne mulder,VVD,828.0,False,372,"Dat gebeurt al, want sterftecijfers worden ges...",NL-TweedeKamer,NLD,2012-12-04
786064,2013-10-08,,221,klein,other,,False,79,Ik ben in dit verband even de weg kwijt. De he...,NL-TweedeKamer,NLD,2013-10-08
964470,2016-11-16,,276,de heer öztürk,other,,False,5,Dan moet u goed luisteren.,NL-TweedeKamer,NLD,2016-11-16
68623,1996-10-17,,1,rosenmöller,GL,1537.0,False,1352,Voorzitter! De Nederlandse politiek dreigt de ...,NL-TweedeKamer,NLD,1996-10-17
738776,2012-11-21,,291,verheijen,VVD,828.0,False,169,"Het staat verderop in mijn tekst, die ik dan n...",NL-TweedeKamer,NLD,2012-11-21
255890,2001-06-19,,118,voorzitter,other,,True,28,Deze motie is voorgesteld door de leden Passto...,NL-TweedeKamer,NLD,2001-06-19
1084326,2018-10-31,,1056,de heer klaver,GL,1537.0,False,59,Om half één 's nachts zitten er vast hordes me...,NL-TweedeKamer,NLD,2018-10-31


### Embded

In [6]:
recente_speeches = (
    tweede_kamer
    .query('terms > 15')
    .query('terms < 1000')
    .query('datum > "2010-01-01"')
)

In [7]:
recente_speeches = recente_speeches.reset_index(drop = True)

In [16]:
recente_speeches

Unnamed: 0,date,agenda,speechnumber,speaker,party,party.facts.id,chair,terms,text,parliament,iso3country,datum
0,2010-01-12,,1,weekers,VVD,828.0,False,376,Voorzitter. In de week van de val van DSB heef...,NL-TweedeKamer,NLD,2010-01-12
1,2010-01-12,,2,bos,PvdA,1234.0,False,260,Voorzitter. De laatste conclusie van de heer W...,NL-TweedeKamer,NLD,2010-01-12
2,2010-01-12,,3,weekers,VVD,828.0,False,204,Het gaat ons er niet om dat de minister zich b...,NL-TweedeKamer,NLD,2010-01-12
3,2010-01-12,,4,bos,PvdA,1234.0,False,226,Ik zal die zorgen overbrengen aan de heer Sche...,NL-TweedeKamer,NLD,2010-01-12
4,2010-01-12,,5,tony van dijck,PVV,298.0,False,107,Ook de fractie van de Partij voor de Vrijheid ...,NL-TweedeKamer,NLD,2010-01-12
...,...,...,...,...,...,...,...,...,...,...,...,...
468196,2019-07-04,,963,staatssecretaris broekers-knol,,,False,290,Ik ga uit van de gegevens die wij hebben gekre...,NL-TweedeKamer,NLD,2019-07-04
468197,2019-07-04,,965,staatssecretaris broekers-knol,,,False,142,Dan de motie op stuk nr. 2519 van de heer Hidd...,NL-TweedeKamer,NLD,2019-07-04
468198,2019-07-04,,967,de voorzitter,,,True,62,Over exact 60 minuten gaan wij stemmen over de...,NL-TweedeKamer,NLD,2019-07-04
468199,2019-07-04,,968,mevrouw van toorenburg,CDA,1157.0,False,21,Toch nog even — misschien is het allemaal afge...,NL-TweedeKamer,NLD,2019-07-04


In [24]:
maanden = pd.date_range('2018-10-01','2018-12-01', freq='MS').strftime("%Y-%m-%d").tolist()
maanden

['2018-10-01', '2018-11-01', '2018-12-01']

In [9]:
%%time
m = '2019-01-01'
m2 = '2019-02-01'
zz = (
    recente_speeches
    .query(f"datum > '{m}'")
    .query(f"datum < '{m2}'")
)
    

CPU times: user 23.4 ms, sys: 14.4 ms, total: 37.8 ms
Wall time: 36.5 ms


In [10]:
zz

Unnamed: 0,date,agenda,speechnumber,speaker,party,party.facts.id,chair,terms,text,parliament,iso3country,datum
439394,2019-01-15,,1,de voorzitter,,,True,30,We beginnen zoals gebruikelijk op dinsdagmidda...,NL-TweedeKamer,NLD,2019-01-15
439395,2019-01-15,,4,de heer klaver,GL,1537.0,False,327,Voorzitter. Als deze premier naar het grote be...,NL-TweedeKamer,NLD,2019-01-15
439396,2019-01-15,,6,minister rutte,,,False,485,"Voorzitter. Het doel, in de Klimaatwet, waarva...",NL-TweedeKamer,NLD,2019-01-15
439397,2019-01-15,,7,de heer klaver,GL,1537.0,False,276,Ik hoor hier randvoorwaarden. Dat kon je vorig...,NL-TweedeKamer,NLD,2019-01-15
439398,2019-01-15,,8,minister rutte,,,False,275,Op het tweede punt heb ik al geantwoord dat ik...,NL-TweedeKamer,NLD,2019-01-15
...,...,...,...,...,...,...,...,...,...,...,...,...
443211,2019-01-31,,593,mevrouw van brenk,50PLUS,714.0,False,50,Voorzitter. Oud zit fout bij dit kabinet. Er w...,NL-TweedeKamer,NLD,2019-01-31
443212,2019-01-31,,594,de voorzitter,,,True,33,"Dank u wel, mevrouw Van Brenk. Daarmee zijn wi...",NL-TweedeKamer,NLD,2019-01-31
443213,2019-01-31,,600,minister koolmees,,,False,611,"Dank, mevrouw de voorzitter. Ik dank de Kamer ...",NL-TweedeKamer,NLD,2019-01-31
443214,2019-01-31,,602,minister koolmees,,,False,943,Hij gaat er dus mee akkoord. De motie op stuk ...,NL-TweedeKamer,NLD,2019-01-31


In [11]:
%%time
TF_query_embeddings = embed_NL(
     zz.text.str.lower().values   
).numpy()


In [12]:
TF_query_embeddings

array([[ 0.07687331,  0.03715456,  0.03364848, ...,  0.00180932,
         0.01864223,  0.00600021],
       [-0.0514434 , -0.05560552, -0.05491542, ..., -0.07330836,
         0.04419007,  0.0726134 ],
       [-0.03661173,  0.04719664,  0.03342238, ..., -0.07438703,
         0.04382566,  0.07660549],
       ...,
       [-0.05550526,  0.04067192, -0.0279091 , ..., -0.06881995,
         0.04363856,  0.05298588],
       [-0.05947262,  0.06017054, -0.03204539, ..., -0.06721859,
         0.02988186,  0.06628574],
       [-0.009613  , -0.00625878, -0.01120661, ..., -0.01422072,
         0.02342246,  0.06752376]], dtype=float32)

In [48]:
embeddings_df = pd.DataFrame(TF_query_embeddings)

In [49]:
sample_2ekm = pd.concat([sample, embeddings_df], axis=1)

In [50]:
sample_2ekm

Unnamed: 0,date,agenda,speechnumber,speaker,party,party.facts.id,chair,terms,text,parliament,...,502,503,504,505,506,507,508,509,510,511
0,1998-12-01,,179,vliegenthart,PvdA,1234.0,False,17,Ik ben nog wel 20 à 25 minuten bezig.,NL-TweedeKamer,...,0.037875,-0.025647,-0.012416,0.014532,-0.025066,0.060782,0.015138,-0.100352,0.073999,0.020567
1,2017-12-20,,422,de voorzitter,,,True,36,Ik neem aan dat er geen bezwaar tegen bestaat ...,NL-TweedeKamer,...,-0.054891,-0.031692,-0.014315,-0.012349,0.006541,-0.041419,-0.073185,-0.053635,0.062346,-0.014302
2,2018-02-15,,123,de heer sjoerdsma,D66,45.0,False,34,"Voorzitter, nog één zin. Maar laat ik ook zegg...",NL-TweedeKamer,...,0.017628,-0.078403,-0.019696,-0.080405,-0.049069,0.029821,-0.024627,-0.000960,0.022202,0.080329
3,1995-11-02,,408,van de camp,CDA,1157.0,False,47,Voorzitter! Het uitwerken van die meerdere var...,NL-TweedeKamer,...,0.004823,0.001006,0.012556,-0.039923,-0.015621,0.006838,-0.053398,0.015160,0.018328,0.011306
4,2006-02-01,,269,de wit,SP,1363.0,False,348,Ik heb aandacht gevraagd voor de positie van d...,NL-TweedeKamer,...,0.015038,-0.020210,-0.013434,-0.066156,-0.064930,0.052278,-0.034463,-0.070829,-0.051718,0.076446
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,2015-10-01,,678,wiebes,other,,False,163,De constructie van de postcoderoos is uitgebre...,NL-TweedeKamer,...,0.019693,-0.032995,-0.046715,-0.078461,0.018016,0.028285,-0.015144,-0.027879,0.051958,0.090900
4996,1997-04-23,,85,remkes,VVD,828.0,False,117,"Ja, maar waar het mij om gaat, is dat er onder...",NL-TweedeKamer,...,0.084790,-0.035510,0.038430,-0.052308,-0.019230,-0.067377,0.044026,0.001222,-0.000705,0.060241
4997,1996-06-06,,52,van de vondervoort,PvdA,1234.0,False,497,Ik begrijp dat u over de tekst struikelt. Maar...,NL-TweedeKamer,...,0.076989,-0.064937,-0.014233,-0.040007,-0.040482,-0.063832,-0.025371,-0.072617,-0.038270,0.079579
4998,2001-10-18,,299,ten hoopen,CDA,1157.0,False,72,"Ik heb wel een standpunt, maar als dat had wil...",NL-TweedeKamer,...,0.023364,-0.022795,0.005160,-0.038095,0.050096,-0.036349,-0.007740,-0.008400,0.023201,0.103882


## UMAP


In [51]:
sample_2ekm.iloc[:,12:524]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
0,0.000926,-0.040687,-0.023129,-0.005145,-0.110545,-0.010618,-0.021249,0.014477,-0.087705,-0.016819,...,0.037875,-0.025647,-0.012416,0.014532,-0.025066,0.060782,0.015138,-0.100352,0.073999,0.020567
1,-0.076726,0.017157,-0.057032,-0.024102,-0.086373,0.030082,0.009749,0.032965,-0.006094,-0.007308,...,-0.054891,-0.031692,-0.014315,-0.012349,0.006541,-0.041419,-0.073185,-0.053635,0.062346,-0.014302
2,-0.007311,-0.028811,-0.016608,0.011277,-0.021316,0.028592,0.047581,0.024223,0.017517,0.051258,...,0.017628,-0.078403,-0.019696,-0.080405,-0.049069,0.029821,-0.024627,-0.000960,0.022202,0.080329
3,-0.011313,0.000566,-0.007299,-0.006520,0.031624,0.063596,-0.001244,0.022708,-0.056117,-0.080726,...,0.004823,0.001006,0.012556,-0.039923,-0.015621,0.006838,-0.053398,0.015160,0.018328,0.011306
4,-0.035984,-0.013296,0.042556,-0.038282,-0.064348,0.047298,0.069403,-0.021978,-0.013662,0.014092,...,0.015038,-0.020210,-0.013434,-0.066156,-0.064930,0.052278,-0.034463,-0.070829,-0.051718,0.076446
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,-0.052119,0.026219,0.020975,-0.038791,-0.069494,-0.014427,-0.046219,-0.005500,-0.019874,0.053124,...,0.019693,-0.032995,-0.046715,-0.078461,0.018016,0.028285,-0.015144,-0.027879,0.051958,0.090900
4996,-0.043919,0.036142,-0.063115,-0.018882,-0.078779,0.013240,-0.007157,-0.061925,0.067555,0.063422,...,0.084790,-0.035510,0.038430,-0.052308,-0.019230,-0.067377,0.044026,0.001222,-0.000705,0.060241
4997,-0.059360,0.056116,-0.016031,0.026995,-0.070078,0.034754,-0.022925,0.031955,0.027193,0.035765,...,0.076989,-0.064937,-0.014233,-0.040007,-0.040482,-0.063832,-0.025371,-0.072617,-0.038270,0.079579
4998,0.008174,0.024839,-0.021256,-0.004913,-0.092515,0.011708,0.019855,0.040935,-0.012003,-0.042280,...,0.023364,-0.022795,0.005160,-0.038095,0.050096,-0.036349,-0.007740,-0.008400,0.023201,0.103882


In [52]:
%%time
matrix = np.array(sample_2ekm.iloc[:,12:524])
embedding2 = umap.UMAP(n_components = 2, metric = "cosine", n_neighbors = 5 ).fit_transform(matrix)

CPU times: user 10.1 s, sys: 581 ms, total: 10.7 s
Wall time: 10.8 s


In [53]:
tmp = pd.DataFrame(embedding2, columns=["x", "y"])
tweede_kamer_wv = pd.concat(
    [
        sample_2ekm,
        tmp
    ],
     axis=1
)

In [57]:
tweede_kamer_wv

Unnamed: 0,date,agenda,speechnumber,speaker,party,party.facts.id,chair,terms,text,parliament,...,504,505,506,507,508,509,510,511,x,y
0,1998-12-01,,179,vliegenthart,PvdA,1234.0,False,17,Ik ben nog wel 20 à 25 minuten bezig.,NL-TweedeKamer,...,-0.012416,0.014532,-0.025066,0.060782,0.015138,-0.100352,0.073999,0.020567,-2.157726,-2.906723
1,2017-12-20,,422,de voorzitter,,,True,36,Ik neem aan dat er geen bezwaar tegen bestaat ...,NL-TweedeKamer,...,-0.014315,-0.012349,0.006541,-0.041419,-0.073185,-0.053635,0.062346,-0.014302,-3.622990,0.817148
2,2018-02-15,,123,de heer sjoerdsma,D66,45.0,False,34,"Voorzitter, nog één zin. Maar laat ik ook zegg...",NL-TweedeKamer,...,-0.019696,-0.080405,-0.049069,0.029821,-0.024627,-0.000960,0.022202,0.080329,-1.235990,-1.065046
3,1995-11-02,,408,van de camp,CDA,1157.0,False,47,Voorzitter! Het uitwerken van die meerdere var...,NL-TweedeKamer,...,0.012556,-0.039923,-0.015621,0.006838,-0.053398,0.015160,0.018328,0.011306,-0.123754,0.967712
4,2006-02-01,,269,de wit,SP,1363.0,False,348,Ik heb aandacht gevraagd voor de positie van d...,NL-TweedeKamer,...,-0.013434,-0.066156,-0.064930,0.052278,-0.034463,-0.070829,-0.051718,0.076446,-1.839680,5.852355
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,2015-10-01,,678,wiebes,other,,False,163,De constructie van de postcoderoos is uitgebre...,NL-TweedeKamer,...,-0.046715,-0.078461,0.018016,0.028285,-0.015144,-0.027879,0.051958,0.090900,-0.588214,3.456949
4996,1997-04-23,,85,remkes,VVD,828.0,False,117,"Ja, maar waar het mij om gaat, is dat er onder...",NL-TweedeKamer,...,0.038430,-0.052308,-0.019230,-0.067377,0.044026,0.001222,-0.000705,0.060241,0.659127,4.548415
4997,1996-06-06,,52,van de vondervoort,PvdA,1234.0,False,497,Ik begrijp dat u over de tekst struikelt. Maar...,NL-TweedeKamer,...,-0.014233,-0.040007,-0.040482,-0.063832,-0.025371,-0.072617,-0.038270,0.079579,1.138860,1.979780
4998,2001-10-18,,299,ten hoopen,CDA,1157.0,False,72,"Ik heb wel een standpunt, maar als dat had wil...",NL-TweedeKamer,...,0.005160,-0.038095,0.050096,-0.036349,-0.007740,-0.008400,0.023201,0.103882,-0.136727,1.509650


In [58]:
plotdata = tweede_kamer_wv.dropna(subset=["speaker", "party"])

fig = px.scatter(
    plotdata,
    x = "x",
    y = "y",
    color = "party",
    hover_name = "text",
    width = 1900, height = 1200
)
fig

In [56]:
plotdata

Unnamed: 0,date,agenda,speechnumber,speaker,party,party.facts.id,chair,terms,text,parliament,iso3country,datum
0,1994-12-20,,1,marijnissen,SP,1363.0,False,561,Mijnheer de voorzitter! Ik vertel de minister ...,NL-TweedeKamer,NLD,1994-12-20
1,1994-12-20,,2,melkert,PvdA,1234.0,False,706,Mijnheer de voorzitter! Mag ik allereerst de h...,NL-TweedeKamer,NLD,1994-12-20
2,1994-12-20,,3,marijnissen,SP,1363.0,False,304,Mijnheer de voorzitter! Hoewel ik het antwoord...,NL-TweedeKamer,NLD,1994-12-20
3,1994-12-20,,4,melkert,PvdA,1234.0,False,374,Mijnheer de voorzitter! Wat is onrechtvaardig?...,NL-TweedeKamer,NLD,1994-12-20
4,1994-12-20,,5,rosenmöller,GL,1537.0,False,412,Voorzitter! Afgelopen zaterdag stond in NRC Ha...,NL-TweedeKamer,NLD,1994-12-20
...,...,...,...,...,...,...,...,...,...,...,...,...
1143349,2019-07-04,,953,de heer hiddema,FvD,5855.0,False,118,"De volgende motie. Motie De Kamer, gehoord de ...",NL-TweedeKamer,NLD,2019-07-04
1143351,2019-07-04,,955,de heer hiddema,FvD,5855.0,False,5,Dat was mijn maritieme bijdrage.,NL-TweedeKamer,NLD,2019-07-04
1143355,2019-07-04,,959,de heer emiel van dijk,PVV,298.0,False,21,Ik vroeg eigenlijk alleen om een uitspraak van...,NL-TweedeKamer,NLD,2019-07-04
1143358,2019-07-04,,962,de heer van ojik,GL,1537.0,False,65,Ik zou graag weten op basis waarvan de staatss...,NL-TweedeKamer,NLD,2019-07-04
