In [None]:
%matplotlib inline
import pandas as pd
import re
from lxml import etree 
from bz2file import BZ2File
import bz2
import codecs
import xml
import glob
import os
import bs4
import collections
import itertools
import math
import networkx as nx
import community
import pickle
from collections import defaultdict, Counter
from bs4 import BeautifulSoup
from pattern.nl import parsetree, pprint, singularize, pluralize
from pattern.metrics import readability
from xml import parsers
import xml.parsers.expat
from xml.etree import cElementTree as ET
from xml.dom.minidom import parse
import matplotlib.pyplot as plt
from IPython.display import display, HTML 

In [None]:
def parse_troonrede(f):
    '''Parse one troonrede HTML file into a list of pattern parse trees.

    The text is taken from the <p> elements inside <div id="post-content">;
    the first and the last <p> of that div are skipped.

    f -- path of the HTML file to read.

    Returns a list with one parsetree() result per remaining paragraph,
    parsed with lemmata and relations enabled.

    Raises ValueError when the file contains no <div id="post-content">.
    '''
    # Read through a context manager so the file handle is always closed.
    with open(f) as handle:
        markup = handle.read()
    # Name the parser explicitly: without it bs4 picks whichever parser is
    # installed (and warns). lxml is already a dependency of this notebook.
    soup = BeautifulSoup(markup, 'lxml')
    ourdiv = soup.find('div', id="post-content")
    if ourdiv is None:
        raise ValueError('no <div id="post-content"> found in %r' % (f,))
    # pattern spells the keyword in lowercase; a capitalised `Relations`
    # is not a recognised option and has no effect.
    return [parsetree(p.text, lemmata=True, relations=True)
            for p in ourdiv.findAll('p')[1:-1]]

# Applies parse_troonrede to every troonrede in a folder.
def parse_corpus(folder):
    '''Parse every *.html file found directly inside `folder`.

    folder -- directory that holds the troonrede HTML files.

    Returns a dict that maps each file's base name, without its
    extension, to the result of parse_troonrede() for that file.
    '''
    troonredes = {}
    for path in glob.glob(os.path.join(folder, '*.html')):
        # os.path works with the separator of the running platform and with
        # nested folders; splitting on a literal backslash only worked for
        # a one-level Windows path.
        key = os.path.splitext(os.path.basename(path))[0]
        troonredes[key] = parse_troonrede(path)
    return troonredes

# Parse the whole corpus once; the result is reused by the later cells.
parsedtroonredes = parse_corpus('files')
# Parenthesised so the line is valid on both Python 2 and Python 3.
print("Troonrede parsing")

In [None]:
def processText(parsedcorpus):
    '''Collect the lemma of every noun in the corpus, grouped per paragraph.

    parsedcorpus -- iterable of parsed paragraphs; each paragraph is
        iterated as sentences, each sentence exposes `.nouns` and each
        noun exposes `.lemma`.

    Returns a list holding one list of lemmas per paragraph.
    '''
    wordsPerPara = []
    for paragraph in parsedcorpus:
        lemmas = []
        for sentence in paragraph:
            lemmas.extend(noun.lemma for noun in sentence.nouns)
        wordsPerPara.append(lemmas)
    return wordsPerPara

# Per troonrede: the noun lemmas of each of its paragraphs.
tekst = dict(
    (name, processText(paragraphs))
    for name, paragraphs in parsedtroonredes.items()
)

In [None]:
# All lemmatized words and how often each one occurs in the corpus.
# The lemma itself is used as the key. Converting it with str() inside a
# bare `except: pass` hid every error, and on Python 2 that includes the
# UnicodeEncodeError raised for unicode lemmas with non-ASCII characters,
# so those words were silently left out of the counts.
lemmaDict = dict(Counter(
    word
    for troonrede in sorted(tekst)
    for paragraph in tekst[troonrede]
    for word in paragraph
))

In [None]:
# Use the dictionary of lemmatized words to build a dictionary that holds
# every possible combination of two words, each starting at a count of 0.
# Non-word characters are stripped first and words of one or two
# characters are dropped.
wordlist = [
    cleaned
    for cleaned in (re.sub(r'[^\w]', '', lemma) for lemma in lemmaDict)
    if len(cleaned) > 2
]
fullCombiDict = dict.fromkeys(itertools.combinations(sorted(wordlist), 2), 0)

In [None]:
# Build the collocation counts: for every troonrede (yearCombiDict) and for
# the corpus as a whole (fullCombiDict), how often two words occur together
# in the same paragraph.

# Start the corpus-wide totals from zero so that running this cell again
# does not add the same paragraphs a second time.
for combi in fullCombiDict:
    fullCombiDict[combi] = 0

yearCombiDict = {}
for troonrede in tekst:
    combiDict = {}
    for paragraph in tekst[troonrede]:
        # Same clean-up as used for the keys of fullCombiDict: strip
        # non-word characters and drop words of one or two characters.
        words = [
            cleaned
            for cleaned in (re.sub(r'[^\w]', '', raw) for raw in paragraph)
            if len(cleaned) > 2
        ]
        for combi in itertools.combinations(sorted(words), 2):
            # Only pairs already known to fullCombiDict are totalled there;
            # the per-troonrede dict records every pair it encounters.
            if combi in fullCombiDict:
                fullCombiDict[combi] += 1
            combiDict[combi] = combiDict.get(combi, 0) + 1
    yearCombiDict[troonrede] = combiDict

In [None]:
def findW(word):
    '''Estimate P(w): the fraction of paragraphs in `tekst` containing `word`.

    word -- the lemma to look for.

    Returns a float; 0.0 when the corpus holds no paragraphs at all.
    '''
    hits = 0
    # The paragraphs are counted during the same pass instead of reading a
    # module-level `paragraphCount`, which no visible cell of this notebook
    # defines; the function therefore also works on a fresh kernel.
    total = 0
    for troonrede in tekst:
        for paragraph in tekst[troonrede]:
            total += 1
            if word in paragraph:
                hits += 1
    if total == 0:
        return 0.0
    return float(hits) / total

def findWC(W,C):
    '''Estimate P(w,c): the fraction of paragraphs in `tekst` that contain
    both `W` and `C`.

    W, C -- the two lemmas to look for.

    Returns a float; 0.0 when the corpus holds no paragraphs at all.
    '''
    hits = 0
    # The paragraphs are counted during the same pass instead of reading a
    # module-level `paragraphCount`, which no visible cell of this notebook
    # defines; the function therefore also works on a fresh kernel.
    total = 0
    for troonrede in tekst:
        for paragraph in tekst[troonrede]:
            total += 1
            if W in paragraph and C in paragraph:
                hits += 1
    if total == 0:
        return 0.0
    return float(hits) / total

def findI(W1,W2):
    '''Sum the positive association of W1 with the other words in lemmaDict.

    For every lemma c in lemmaDict other than W1 and W2, the term
    log(P(W1, c) / (P(W1) * P(c))) is added when it is greater than zero.
    W2 only serves to exclude one word from the sum.

    Returns the sum as a float (0.0 when no term contributes).
    '''
    # P(W1) does not depend on the loop variable, so it is computed once
    # rather than once per lemma.
    p_w1 = findW(W1)
    if p_w1 == 0:
        # Every product P(W1) * P(c) would be zero: nothing can contribute.
        return 0.0
    total = 0
    for word in lemmaDict:
        if word == W1 or word == W2:
            continue
        p_joint = findWC(W1, word)
        if p_joint == 0:
            # No co-occurrence: skip the second corpus scan for P(c).
            continue
        p_word = findW(word)
        if p_w1 * p_word == 0:
            continue
        check = math.log(p_joint / (p_w1 * p_word))
        if check > 0:
            total += check
    return float(total)
                
def score(W1,W2):
    '''Compute the score S(W1, W2).

    The result is min(findI(W1, W2), findI(W2, W1)) / findI(W1, W2).
    It is 0 as soon as one of the two findI() values is 0; findI(W2, W1)
    is only evaluated when findI(W1, W2) is not 0.
    '''
    info_first = findI(W1, W2)
    if info_first != 0:
        info_second = findI(W2, W1)
        if info_second != 0:
            return min(info_first, info_second) / info_first
    return 0

In [None]:
def networkBuilder(limit=100):
    '''Compute the proximity score for collocations that occur in the corpus.

    Walks fullCombiDict and scores every pair whose count is above zero
    and whose two words differ, until `limit` pairs have been scored.

    limit -- maximum number of pairs to score. Defaults to 100, the value
        that used to be hard-coded; None scores every eligible pair.

    Returns a dict that maps each scored pair to its score(); pairs whose
    score is not greater than zero are left out.
    '''
    netwerkDict = {}
    scored = 0
    for combi in fullCombiDict:
        if limit is not None and scored >= limit:
            break
        # Skip pairs that never co-occur and pairs of a word with itself.
        if not fullCombiDict[combi] > 0 or combi[0] == combi[1]:
            continue
        scored += 1
        # Progress output; parenthesised so it is valid on Python 2 and 3.
        print(scored)
        weight = score(combi[0], combi[1])
        if weight > 0:
            netwerkDict[combi] = weight
    return netwerkDict

In [None]:
%timeit netwerkDict = networkBuilder()