In [13]:
import nltk.data
from bs4 import BeautifulSoup
import operator
from nltk.probability import *
from nltk.corpus import stopwords
from nltk.tokenize import TreebankWordTokenizer
import string
import re

# Idea: get the most common words (done), represent which deputees have these words in common
# or which deputees talk the most of certain topics

In [16]:
# a set containing the deputees' names
deputes_list = set()

# this will contain the deputees' speeches (the key being their name)
deputes_paroles = {}

# this will contain their titles (for exemple: Conseiller d'Etat)
deputes_titres = {}

# We compile the regular expression used to remove the name and title from the beginning of each speech
p = re.compile("\.")

def open_memorial(filepath):
    f = open(filepath, 'r')
    html = f.read()
    doc = BeautifulSoup(html, 'html.parser')

    memos = doc.select('.txtMemo p')

    for memo in memos:
        depute = memo.select('a.style_pdg_gras')
        if depute:
            nom_depute = depute[0].string
            deputes_list.add(nom_depute)
            if not nom_depute in deputes_paroles.keys():
                deputes_paroles[nom_depute] = []
            speech = memo.get_text()
            # We remove the name and title: "M. Nom Député (Parti). Bla bla blaaa"
            startpos = re.search(p, speech[3:]).start()
            deputes_paroles[nom_depute].append(speech[startpos+5:])
            if not nom_depute in deputes_titres.keys():
                deputes_titres[nom_depute] = []
                deputes_titres[nom_depute].append(speech[:startpos+4])
            else:
                if speech[:startpos+4] not in deputes_titres[nom_depute]:
                    deputes_titres[nom_depute].append(speech[:startpos+4])

for o in range(0, 19): # year 2016
    open_memorial('memoriaux/memoriaux' + str(o) + '.html')

print("Députés enregistrés:", len(deputes_list))
print("Speeches enregistrés:", sum([len(deputes_paroles[i]) for i in deputes_paroles]))

Députés enregistrés: 96
Speeches enregistrés: 734


In [17]:
# tokenizer = nltk.data.load('tokenizers/punkt/PY3/french.pickle')
tokenizer = TreebankWordTokenizer()
french_stopwords = set(stopwords.words('french'))
french_stopwords.update(['les', 'alors', '(', ')', ',', '-', '.', 'M', 'M.', 'Mme', 'a', 'être'])

remove_punctuation_map = dict((ord(char), None) for char in string.punctuation if char not in ["'"])

def analyze_text(text):
    tokens = tokenizer.tokenize(text) 
    ntokens = [token for token in tokens if token.lower() not in french_stopwords]
    ntokens = [token.translate(remove_punctuation_map) for token in ntokens]
    ntokens = [token for token in ntokens if token != '']
    
    fdist = FreqDist(ntokens)
    sorted_x = sorted(fdist.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_x

# print(analyze_text(deputes_paroles['M. Patrick Lussi'][1])[:10]) # use this line to see a specific speech

# This prints our results before we do the json export:

for key in deputes_paroles.keys():
    blob_text = str.join(' ', deputes_paroles[key])
    nb_speeches = len(deputes_paroles[key])
    if nb_speeches > 2:
        print(str.join('|', deputes_titres[key]), "("+str(nb_speeches)+" prises de parole):")
        result = analyze_text(blob_text)
        
        for word, count in result[:15]:
            if count > 2:
                print(str(count) + "\t" + word)
        print("\n")

M. Romain de Sainte Marie (S).|M. Romain de Sainte Marie (S), rapporteur de deuxième minorité. (15 prises de parole):
23	loi
23	projet
18	plus
17	cette
14	bien
13	Genève
12	canton
12	commission
11	motion
11	qu'il
10	si
10	très
9	matière
8	M
8	part


Mme Emilie Flamand-Lew (Ve).|Mme Emilie Flamand-Lew (Ve), rapporteuse. (8 prises de parole):
11	cette
8	loi
8	police
7	députés
6	Messieurs
6	Mesdames
6	comme
5	ils
5	Monsieur
5	l'annuité
5	président
5	c'est
5	victimes
5	aussi
5	plus


M. Stéphane Florey (UDC). (19 prises de parole):
13	président
13	Merci
13	cette
10	si
10	Conseil
10	Monsieur
9	bien
9	pétition
9	c'est
8	groupe
8	cela
7	d'Etat
7	rapport
7	C'est
7	commission


M. Mathias Buschbeck (Ve).|M. Mathias Buschbeck (Ve), rapporteur de première minorité ad interim.|M. Mathias Buschbeck (Ve), rapporteur de première minorité. (11 prises de parole):
16	cette
14	projet
13	loi
11	si
11	traversée
10	comme
10	plus
8	compromis
8	c'est
8	lac
7	mobilité
7	bien
7	non
7	tout
7	fait


M. François  

In [20]:
# Preparation for our json: we transform the dictionary into a list of dictionaries

results_list = []

for key in deputes_paroles.keys():
    
    blob_text = str.join(' ', deputes_paroles[key])
    nb_speeches = len(deputes_paroles[key])
    if nb_speeches > 2:
        result = analyze_text(blob_text)
        
        name = key
        words = []
        
        for word, count in result[:3]:
            words.append(word)
        
        results_list.append({'name': key, 'words': words})
    
results_list

[{'name': 'M. Romain de Sainte Marie', 'words': ['loi', 'projet', 'plus']},
 {'name': 'Mme Emilie Flamand-Lew', 'words': ['cette', 'loi', 'police']},
 {'name': 'M. Stéphane Florey', 'words': ['président', 'Merci', 'cette']},
 {'name': 'M. Mathias Buschbeck', 'words': ['cette', 'projet', 'loi']},
 {'name': 'M. François  Longchamp', 'words': ['loi', 'Conseil', "C'est"]},
 {'name': 'M. Michel Baud', 'words': ['caméras', "c'est", 'demande']},
 {'name': 'Mme Salika Wenger', 'words': ["c'est", 'faire', 'plus']},
 {'name': 'M. Boris Calame', 'words': ['loi', 'commission', 'Conseil']},
 {'name': 'Mme Isabelle Brunier', 'words': ['bien', 'effet', 'si']},
 {'name': 'M. Thierry Cerutti',
  'words': ['commission', 'députés', 'président']},
 {'name': 'Mme Lydia Schneider Hausser',
  'words': ['président', 'cette', 'Mesdames']},
 {'name': 'Mme Sarah Klopmann', 'words': ['président', 'cette', 'Merci']},
 {'name': 'M. Eric Leyvraz', 'words': ['loi', 'cela', "d'Etat"]},
 {'name': 'M. François Lance', '

In [19]:
# We generate a json file containing the nodes data

import json

data = []    

## return an array with the other nodes to connect to
def import_words(word_list):
    lines = []
    for word in word_list:
        lines.append('Title.' + word)
    return lines

def import_authors(author_list):
    lines = []
    for author in author_list:
        lines.append('Author.' + author)
    return lines

## create the node data
def generate_word(word, deputee):
    timports = []
    timports.extend(import_authors(deputee))
    
    element = {
        "name": "Title." + word,
        "size": 0,
        "imports": import_authors(deputee)
    }
    return element

def generate_author(deputee, words):
    element = {
        "name": "Author." + deputee,
        "size": 0,
        "imports": import_words(words)
    }
    return element

for item in results_list:
    deputee_name = item['name']

    for single_word in item['words']:
        data.append(generate_word(single_word, [deputee_name]))

    deputee = generate_author(deputee_name, item['words'])
    data.append(deputee)

## export
with open('microdata2.json', 'w') as outfile:
    json.dump(data, outfile, indent=4, separators=(',', ': '))

In [28]:
from IPython.core.display import display, HTML
print("Here is the result:")
display(HTML('<iframe src="http://paulronga.ch/test/dataviz.html" width="900" height="900"></iframe>'))

from IPython.display import IFrame
print("Here is the result:")
display(IFrame('http://paulronga.ch/test/dataviz.html', width=900, height=900))


Here is the result:


Here is the result:


In [29]:
HTML('<iframe src="http://paulronga.ch/test/dataviz.html" width="900" height="900"></iframe>')

In [30]:
IFrame('http://paulronga.ch/test/dataviz.html', width=900, height=900)