In [1]:
%matplotlib inline
import pandas as pd
import re
from lxml import etree 
from bz2file import BZ2File
import bz2
import codecs
import xml
import glob
import os
import bs4
from collections import defaultdict, Counter
from bs4 import BeautifulSoup
from pattern.nl import parsetree, pprint
from pattern.metrics import readability
from xml import parsers
import xml.parsers.expat
from xml.etree import cElementTree as ET
from xml.dom.minidom import parse
import matplotlib.pyplot as plt
from IPython.display import display, HTML 

In [29]:
def parse_troonrede(f):
    '''Read a troonrede file, extract all text and output a list of lists, 
    each element is a paragraph containing a list of sentences,
    each sentence is parsed.'''
    soup= BeautifulSoup(open(f).read())
    ourdiv=soup.find('div', id="post-content") 
    ourpars= [parsetree(p.text, lemmata=True, Relations=True) for p in ourdiv.findAll('p')]
    return ourpars

# this applies step 2 to all troonredes 
def parse_corpus(folder):
    alltroonredes= glob.glob(os.path.join(folder, '*.html'))
    troonredes={}
    for troonrede in alltroonredes:
        key= troonrede.split('\\')[1].replace('.html','')
        value= parse_troonrede(troonrede)
        troonredes[key]=value
    return troonredes

parsedtroonredes= parse_corpus('files')
print "Troonrede parsing"

Troonrede parsing


In [27]:
def verwerk_tekst(parsedcorpus):
    woorden_per_paragraaf= [list([ w for s in p for w in s])  for p in parsedcorpus  ]
    return woorden_per_paragraaf

tekst= {k:verwerk_tekst(parsedtroonredes[k]) for k in parsedtroonredes}

In [76]:
'''Functie om het aantal lettergrepen te bepalen van een woord'''
def syllablesCount(word):
    count = 0
    vowels = 'aeiouy'
    word = word.lower().strip(".:;?!")
    if word[0] in vowels:
        count +=1
    for index in range(1,len(word)):
        if word[index] in vowels and word[index-1] not in vowels:
            count +=1
    if count == 0:
        count +=1
    return count

print syllablesCount('.')

IndexError: string index out of range

In [86]:
'''Per jaar de hoeveelheid paragrafen, zinnen en woorden'''
redes = {}
dataPerYear = {}
for troonrede in parsedtroonredes:
    year = troonrede
    paragraphs = 0
    sentences = 0
    words = 0
    syllables = 0
    rede = ""
    for paragraph in parsedtroonredes[troonrede]:
        paragraphs += 1
        sentences += len(paragraph)
        for sentence in paragraph:
            words += len(sentence)
            for word in sentence:
                if word.string == "." or word.string == ",":
                    rede += word.string
                else:
                    rede += " " + word.string
                    try:
                        syllables += syllablesCount(word.string)
                    except:
                        pass
    redes[year] = rede            
    data = [paragraphs,sentences,words,syllables]
    dataPerYear[year] = data
print "dataPerYear bevat per jaar: aantal paragrafen, aantal zinnen, aantal woorden en aantal lettergrepen in de gehele troonrede"
print "redes bevat per jaar de gehele troonrede als tekst"

dataPerYear bevat per jaar: aantal paragrafen, aantal zinnen, aantal woorden en aantal lettergrepen in de gehele troonrede
redes bevat per jaar de gehele troonrede als tekst


In [82]:
'''Bepalen van AVI niveau per jaar, hoe lager de score, hoe moeilijker de tekst'''
for year in dataPerYear:
    gemZinslengte = dataPerYear[year][2]/dataPerYear[year][1]
    gemWoordlengte = dataPerYear[year][3]/dataPerYear[year][2]
    leesindex = 195 - (2*gemZinslengte) - (200/3)*gemWoordlengte
    print year, leesindex

1818 75
1832 59
1833 41
1830 39
1831 39
1836 83
1834 53
1835 57
1838 83
1839 83
1946 77
2014 91
2011 93
2010 97
2013 93
2012 93
1955 83
1954 83
1957 85
1956 87
1959 73
1958 83
1829 49
1828 45
1825 67
1824 75
1827 69
1826 51
1821 67
1820 61
1823 79
1822 75
1923 85
1925 89
1926 87
1927 81
1928 87
1929 87
1878 91
1933 77
1932 81
1931 81
1930 87
1937 67
1936 79
1935 79
1934 75
1939 85
1938 69
1849 83
1848 57
1843 73
1842 79
1841 71
1840 67
1847 75
1846 75
1845 81
1844 85
1876 93
1877 89
1874 89
1875 79
1872 93
1873 89
1870 95
1871 97
1904 89
1900 87
1879 95
1986 91
1987 93
1984 91
1985 89
1982 91
1983 91
1980 91
1981 91
1988 89
1989 89
1898 91
1899 89
1894 77
1895 95
1896 91
1897 93
1891 93
1892 89
1893 97
1964 89
1965 87
1966 89
1967 85
1960 81
1961 83
1962 83
1963 85
1887 93
1886 97
1885 87
1884 79
1883 93
1882 99
1880 97
1979 95
1978 93
1976 91
2002 91
2003 95
2000 97
2001 97
2006 99
2007 99
2004 95
2005 99
2008 103
2009 91


In [83]:
'''Tekst leesbaarheidsniveau per jaar aan de hand van de "Flesch–Kincaid readability test"'''
for rede in redes:
    print rede, readability(redes[rede])

1818 0.216673506422
1832 0.0877921441774
1833 0.0
1830 0.0385644638626
1831 0.0349000552842
1836 0.194669555481
1834 0.0419161691023
1835 0.0735012400398
1838 0.223526926523
1839 0.189330432499
1946 0.175158087918
2014 0.27404701228
2011 0.247524728746
2010 0.228889826625
2013 0.245697736564
2012 0.225965436253
1955 0.196737182796
1954 0.21256373494
1957 0.172356124079
1956 0.225174638755
1959 0.155663668245
1958 0.224419669088
1829 0.0655925905797
1828 0.034945362246
1825 0.166248421053
1824 0.230216402157
1827 0.135810336844
1826 0.0805861583578
1821 0.16936137129
1820 0.17662027881
1823 0.247078812996
1822 0.211082463606
1923 0.152717693498
1925 0.189773118351
1926 0.233841445575
1927 0.188198616378
1928 0.201834639956
1929 0.188345492958
1878 0.26993364259
1933 0.122224719394
1932 0.225510974175
1931 0.193307328487
1930 0.183211176471
1937 0.0588513698993
1936 0.126039739471
1935 0.176198972332
1934 0.14663
1939 0.199912827324
1938 0.11691668318
1849 0.251343395168
1848 0.167328010