This notebook aims to ingest the Hansard XML dumps, extract the information I care about, and write it to CSV files.

In [1]:
import os
import re
import sys

from bs4 import BeautifulSoup
import  pandas as pd

In [2]:
# Paths of the Hansard XML dumps to convert, as './data/<file name>'.
# The names are matched on their '.xml' suffix so that files which merely
# contain "xml" somewhere in the name are not picked up, and the list is
# sorted because os.listdir returns entries in arbitrary order (positional
# slices of this list would otherwise differ between runs and machines).
data_files = sorted(
    './data/' + file_name
    for file_name in os.listdir('./data/')
    if file_name.endswith('.xml')
)

In [3]:
class Speech:
    """Class to represent a speech in parliament

    Attributes:
        name  -- text of the element returned by speech.find('name')
        text  -- text of the element returned by speech.find('talk.text')
        party -- text of the element returned by speech.find('party')

    Each attribute is the empty string when the element is not found.
    """

    def __init__(self,speech):
        """Should only be called from the constructor for Debate

        speech -- parsed 'speech' element; it must provide find(tag), and the
                  objects that find returns must provide get_text().
        """
        self.name = self._text_of(speech,'name')
        self.text = self._text_of(speech,'talk.text')
        self.party = self._text_of(speech,'party')

    @staticmethod
    def _text_of(element,tag):
        """Return the text of element.find(tag), or '' when nothing is found."""
        found = element.find(tag)
        # A missing element must not abort the conversion of the whole
        # sitting day, so it is recorded as an empty field instead.
        return '' if found is None else found.get_text()
        
class Debate:
    """One parliamentary debate: its title element and the speeches made in it.

    Attributes:
        title    -- whatever debate.find('title') returns (the element itself,
                    not its text).
                    NOTE(review): Speech and SittingDay store get_text()
                    results instead; confirm the element is what is wanted.
        speeches -- list with one Speech per element of
                    debate.find_all('speech'), in the order returned.
    """

    def __init__(self,debate):
        """Build the debate from its parsed element.

        Intended to be called only from the SittingDay constructor.
        """
        title_element = debate.find('title')
        speech_elements = debate.find_all('speech')
        self.title = title_element
        self.speeches = list(map(Speech, speech_elements))
        
class SittingDay:
    """Class to represent a whole parliamentary sitting day

    Attributes:
        date    -- text of the document's 'date' element
        debates -- list with one Debate per 'debate' element
        chamber -- text of the 'chamber' element, with 'House of Reps'
                   shortened to 'HoR'
    """

    def __init__(self,day):
        """takes a beautifulsoup object"""
        self.date = day.find('date').get_text()
        self.debates = [Debate(debate) for debate in day.find_all('debate')]
        self.chamber = day.find('chamber').get_text()
        # The chamber ends up in the output file name, so use the short code;
        # any other chamber value is kept exactly as found.
        if self.chamber == 'House of Reps':
            self.chamber = 'HoR'

    def write_debates(self,path = '.'):
        """Not implemented yet: currently does nothing."""
        pass

    def write_speeches(self,path='.'):
        """Write all the speeches to a csv file (one file per day)

        The file is called '<date>-<chamber>-speeches.csv' and holds one row
        per speech with the columns name, party, date and text.

        path -- directory to write into; it is created when it does not
                exist yet.
        """
        #ToDo: extend to also include votes
        rows = [
            {'name': speech.name,
             'party': speech.party,
             'date': self.date,
             'text': speech.text}
            for debate in self.debates
            for speech in debate.speeches
        ]
        # Passing the columns explicitly keeps the header (and its order) even
        # when the day contains no speeches at all.
        data_frame = pd.DataFrame(rows, columns=['name','party','date','text'])
        # to_csv does not create directories; exist_ok also covers several
        # worker processes trying to create the same directory at once.
        if path:
            os.makedirs(path, exist_ok=True)
        file_name = self.date + '-' + self.chamber + '-speeches.csv'
        data_frame.to_csv(os.path.join(path, file_name),encoding='utf-8')

```python
# Single-threaded version: convert the selected files one after another,
# writing each day's CSV into the current directory.
for xml_path in data_files[1:128]:
    with open(xml_path,'r') as xml_file:
        soup = BeautifulSoup(xml_file.read(),'xml')
        SittingDay(soup).write_speeches('.')
```

Since building the BeautifulSoup representation of each XML file is somewhat slow, it's worth parallelising the process:

In [None]:
%%time
import multiprocessing

def xml_to_csv(x):
    """Convert one Hansard XML file into a speeches CSV.

    x -- sequence read as x[0], the path of the XML file to parse, and x[1],
         the directory handed to SittingDay.write_speeches. Both values
         arrive in a single argument because the function is applied with
         pool.map.

    Returns the string 'done' once the CSV has been written.
    """
    source_path, output_dir = x[0], x[1]
    with open(source_path,'r') as xml_file:
        soup = BeautifulSoup(xml_file.read(),'xml')
        sitting_day = SittingDay(soup)
        sitting_day.write_speeches(output_dir)
    return 'done'

# One [xml_path, output_dir] job per data file; xml_to_csv reads the pair by
# index. The list is not called `input` so the builtin of that name stays
# usable in the rest of the notebook.
conversion_jobs = [[data_file,'./csv'] for data_file in data_files]

# The with-block shuts the worker processes down as soon as map() has
# returned (or raised), so re-running this cell does not pile up idle pools.
with multiprocessing.Pool(multiprocessing.cpu_count()) as pool:
    results = pool.map(xml_to_csv,conversion_jobs)

# Show how many files were converted instead of a long list of 'done' strings.
len(results)

In [18]:
# Look at how divisions (votes) are recorded in one example sitting day.
example_file = 'data/2016-03-02-HoR.xml'
with open(example_file,'r') as f:
    x= f.read()

# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed, which can differ between machines and from the
# 'xml' parser used for the conversion above.
BeautifulSoup(x,'xml').find_all('division')

[<division>
 <division.header>
 </division.header>
 <division.data>
 <ayes>
 <num.votes>57</num.votes>
 <title>AYES</title>
 <names>
 <name>Albanese, AN</name>
 <name>Bandt, AP</name>
 <name>Bird, SL</name>
 <name>Bowen, CE</name>
 <name>Brodtmann, G</name>
 <name>Burke, AE</name>
 <name>Burke, AS</name>
 <name>Butler, MC</name>
 <name>Butler, TM</name>
 <name>Byrne, AM</name>
 <name>Chalmers, JE</name>
 <name>Champion, ND</name>
 <name>Chesters, LM</name>
 <name>Clare, JD</name>
 <name>Claydon, SC</name>
 <name>Collins, JM</name>
 <name>Conroy, PM</name>
 <name>Danby, M</name>
 <name>Dreyfus, MA</name>
 <name>Elliot, MJ</name>
 <name>Ellis, KM</name>
 <name>Feeney, D</name>
 <name>Ferguson, LDT</name>
 <name>Fitzgibbon, JA</name>
 <name>Giles, AJ</name>
 <name>Gray, G</name>
 <name>Griffin, AP</name>
 <name>Hall, JG (teller)</name>
 <name>Hayes, CP</name>
 <name>Husic, EN</name>
 <name>Jones, SP</name>
 <name>King, CF</name>
 <name>Leigh, AK</name>
 <name>Macklin, JL</name>
 <name>Mac