In [7]:
%matplotlib inline
import pandas as pd
import re
from lxml import etree 
from bz2file import BZ2File
import bz2
import codecs
import xml
import glob
import os
import bs4
import collections
from collections import defaultdict, Counter
from bs4 import BeautifulSoup
from pattern.nl import parsetree, pprint
from pattern.metrics import readability
from xml import parsers
import xml.parsers.expat
from xml.etree import cElementTree as ET
from xml.dom.minidom import parse
import matplotlib.pyplot as plt
from IPython.display import display, HTML 

In [2]:
def parse_troonrede(f):
    '''Read a troonrede file, extract all text and output a list of lists, 
    each element is a paragraph containing a list of sentences,
    each sentence is parsed.'''
    soup= BeautifulSoup(open(f).read())
    ourdiv=soup.find('div', id="post-content") 
    ourpars= [parsetree(p.text, lemmata=True, Relations=True) for p in ourdiv.findAll('p')]
    return ourpars

# this applies step 2 to all troonredes 
def parse_corpus(folder):
    alltroonredes= glob.glob(os.path.join(folder, '*.html'))
    troonredes={}
    for troonrede in alltroonredes:
        key= troonrede.split('\\')[1].replace('.html','')
        value= parse_troonrede(troonrede)
        troonredes[key]=value
    return troonredes

parsedtroonredes= parse_corpus('files')
print "Troonrede parsing"

Troonrede parsing




 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


In [3]:
def processText(parsedcorpus):
    wordsPerPara= [list([ w for s in p for w in s])  for p in parsedcorpus  ]
    return wordsPerPara

tekst= {k:processText(parsedtroonredes[k]) for k in parsedtroonredes}

In [4]:
'''Functie om het aantal lettergrepen te bepalen van een woord'''
def syllablesCount(word):
    count = 0
    vowels = 'aeiouy'
    word = word.lower().strip(".:;?!")
    if word[0] in vowels:
        count +=1
    for index in range(1,len(word)):
        if word[index] in vowels and word[index-1] not in vowels:
            count +=1
    if count == 0:
        count +=1
    return count

In [5]:
'''Per jaar de hoeveelheid paragrafen, zinnen en woorden'''
redes = {}
dataPerYear = {}
for troonrede in parsedtroonredes:
    year = troonrede
    paragraphs = 0
    sentences = 0
    words = 0
    syllables = 0
    rede = ""
    for paragraph in parsedtroonredes[troonrede]:
        paragraphs += 1
        sentences += len(paragraph)
        for sentence in paragraph:
            words += len(sentence)
            for word in sentence:
                if word.string == "." or word.string == ",":
                    rede += word.string
                else:
                    rede += " " + word.string
                    try:
                        syllables += syllablesCount(word.string)
                    except:
                        pass
    redes[year] = rede            
    data = [paragraphs,sentences,words,syllables]
    dataPerYear[year] = data
print "- dataPerYear bevat per jaar: aantal paragrafen, aantal zinnen, aantal woorden en aantal lettergrepen in de gehele troonrede"
print "- redes bevat per jaar de gehele troonrede als tekst"

- dataPerYear bevat per jaar: aantal paragrafen, aantal zinnen, aantal woorden en aantal lettergrepen in de gehele troonrede
- redes bevat per jaar de gehele troonrede als tekst


In [9]:
'''Bepalen van AVI niveau per jaar, hoe lager de score, hoe moeilijker de tekst'''
for year in sorted(dataPerYear):
    gemZinslengte = dataPerYear[year][2]/dataPerYear[year][1]
    gemWoordlengte = dataPerYear[year][3]/dataPerYear[year][2]
    leesindex = 195 - (2*gemZinslengte) - (200/3)*gemWoordlengte
    print year, leesindex

1818 75
1820 61
1821 67
1822 75
1823 79
1824 75
1825 67
1826 51
1827 69
1828 45
1829 51
1830 39
1831 39
1832 59
1833 41
1834 53
1835 59
1836 83
1838 83
1839 83
1840 67
1841 71
1842 79
1843 75
1844 85
1845 81
1846 75
1847 75
1848 57
1849 83
1870 95
1871 97
1872 93
1873 89
1874 89
1875 79
1876 93
1877 89
1878 91
1879 95
1880 97
1882 99
1883 93
1884 81
1885 87
1886 97
1887 93
1891 93
1892 89
1893 97
1894 77
1895 95
1896 93
1897 93
1898 91
1899 89
1900 87
1904 89
1923 85
1925 89
1926 87
1927 81
1928 87
1929 87
1930 87
1931 81
1932 81
1933 77
1934 75
1935 79
1936 79
1937 67
1938 69
1939 85
1946 77
1954 83
1955 83
1956 87
1957 85
1958 83
1959 73
1960 81
1961 83
1962 85
1963 85
1964 89
1965 87
1966 89
1967 85
1976 91
1978 93
1979 95
1980 91
1981 91
1982 91
1983 91
1984 91
1985 89
1986 91
1987 93
1988 89
1989 89
2000 97
2001 97
2002 91
2003 95
2004 95
2005 99
2006 99
2007 99
2008 103
2009 91
2010 97
2011 95
2012 93
2013 93
2014 93


In [11]:
'''Tekst leesbaarheidsniveau per jaar aan de hand van de "Flesch–Kincaid readability test"'''
for rede in sorted(redes):
    print rede, (readability(redes[rede])*100)

1818 21.9686751337
1820 17.381832088
1821 17.4356530716
1822 21.0736280376
1823 24.2323457591
1824 23.0216402157
1825 16.684184888
1826 8.24225802139
1827 13.5569300653
1828 3.72675768968
1829 6.8566058129
1830 3.78638607595
1831 3.67942741935
1832 9.0654939759
1833 0.0
1834 4.0769539749
1835 7.9655632515
1836 19.4084420156
1838 22.3172774899
1839 18.7728996656
1840 13.709185567
1841 16.7469725527
1842 17.0549591529
1843 12.1424920799
1844 22.7245703226
1845 20.9697295597
1846 12.0201630417
1847 18.5037771084
1848 16.4074150943
1849 25.1052285143
1870 22.3959712644
1871 27.5113567062
1872 22.8651306857
1873 20.9431542331
1874 21.9104210526
1875 17.8167115282
1876 23.0936708408
1877 20.0242608117
1878 26.3679350649
1879 29.7935294282
1880 20.8926818182
1882 25.0127727273
1883 23.7571205242
1884 20.1094603524
1885 18.8962843137
1886 22.0910714286
1887 23.4358093923
1891 17.9484166667
1892 22.6765806452
1893 25.78925
1894 14.2607217391
1895 28.7075780642
1896 25.2864021226
1897 26.3599400