This notebook ingests the Hansard XML dumps, extracts the information I care about (speeches and division votes), and writes it to CSV files, one speeches file and one divisions file per sitting day.

In [1]:
import os
import re
import sys
import multiprocessing

from bs4 import BeautifulSoup
import  pandas as pd

In [2]:
# Directory holding the raw Hansard XML dumps.
DATA_DIR = './data/'

# Keep only files whose name ends in '.xml'.  A start-anchored regex such as
# '.*xml' would also accept names like 'foo.xml.bak' or 'xmlnotes.txt'.
# Sorting makes the order (and therefore data_files[0]) reproducible, since
# os.listdir returns entries in arbitrary order.
data_files = sorted(
    os.path.join(DATA_DIR, file_name)
    for file_name in os.listdir(DATA_DIR)
    if file_name.endswith('.xml')
)

In [3]:
class Speech:
    """A single speech, reduced to the speaker's name, the text and the party."""

    # (attribute on this object, tag looked up inside the speech element)
    _FIELDS = (
        ('name', 'name'),
        ('text', 'talk.text'),
        ('party', 'party'),
    )

    def __init__(self,speech):
        """Should only be called from the constructor for Debate

        For each field, the first matching tag inside `speech` is looked up
        and its text stored.  A missing tag raises AttributeError, because
        `find` then returns None.
        """
        for attribute, tag_name in self._FIELDS:
            tag = speech.find(tag_name)
            setattr(self, attribute, tag.get_text())

class Division:
    """A recorded vote (division) and the titles of the elements enclosing it.

    Attributes:
        result: stripped text of the 'division.result' tag.
        ayes: names listed under the 'ayes' tag ([] if that tag is absent).
        nays: names listed under the 'nays' tag ([] if that tag is absent).
        parent_title: title text of the division's parent element.
        grandparent_title: title text of the parent's own parent element.
    """

    def __init__(self,division):
        """Extract the result, the voters and the enclosing titles from `division`."""
        self.result = division.find('division.result').get_text().strip()
        self.ayes = self._names_in(division.find('ayes'))
        self.nays = self._names_in(division.find('nays'))
        parent = division.parent
        grandparent = parent.parent
        self.parent_title = parent.title.get_text()
        self.grandparent_title = grandparent.title.get_text()

    @staticmethod
    def _names_in(group):
        """Return the text of every 'name' tag under `group`, or [] if `group` is None.

        Only the "tag is missing" case is treated as an empty list; any other
        failure propagates instead of being silently swallowed.
        """
        if group is None:
            return []
        return [name_tag.get_text() for name_tag in group.find_all('name')]
        
class Subdebate:
    """A subdebate: its title plus the speeches and divisions found inside it."""

    def __init__(self,subdebate):
        """Collect the title, speeches and divisions of `subdebate`.

        `title` is stored exactly as returned by `find('title')` (no text
        extraction is done here).  Every 'speech' and 'division' element
        returned by `find_all` is wrapped in a Speech / Division object.
        """
        self.title = subdebate.find('title')

        speech_tags = subdebate.find_all('speech')
        self.speeches = list(map(Speech, speech_tags))

        division_tags = subdebate.find_all('division')
        self.divisions = list(map(Division, division_tags))

class Debate:
    """Class to represent a debate in parliament"""

    def __init__(self,debate):
        """Should only be called from the constructor for SittingDay

        Stores the result of `find('title')` as `title`, then looks up
        'subdebate.1', 'subdebate.2', ... in turn and wraps each hit in a
        Subdebate, stopping at the first level that is not found.
        """
        self.title = debate.find('title')
        self.subdebates = []

        # NOTE(review): `find` yields a single element per level, so at most
        # one subdebate per level number ends up in the list -- confirm this
        # is intended (the original code carried a FixMe on this attribute).
        level = 1
        while True:
            found = debate.find('subdebate.%d' % level)
            if not found:
                break
            self.subdebates.append(Subdebate(found))
            level += 1
        
        
        
class SittingDay:
    """Class to represent a whole parliamentary sitting day"""

    def __init__(self,day):
        """Build the day from a BeautifulSoup object.

        Stores the text of the first 'date' and 'chamber' tags and wraps every
        'debate' element in a Debate.  The chamber name 'House of Reps' is
        shortened to 'HoR'; any other value is kept unchanged.
        """
        self.date = day.find('date').get_text()
        self.debates = [Debate(debate) for debate in day.find_all('debate')]
        self.chamber = day.find('chamber').get_text()
        if self.chamber == 'House of Reps':
            self.chamber = 'HoR'

    @staticmethod
    def _title_text(title):
        """Return plain text for a title.

        Titles may be stored as tag-like objects (anything with a callable
        `get_text`), as strings, or as None.  Tag-like objects are converted
        to their text so the CSV holds the title itself rather than the
        object's string form; other values are returned unchanged.
        """
        get_text = getattr(title, 'get_text', None)
        return get_text() if callable(get_text) else title

    def _csv_path(self, directory, suffix):
        """Return '<directory>/<date>-<chamber>-<suffix>'."""
        return os.path.join(directory, self.date + '-' + self.chamber + '-' + suffix)

    def write_divisions(self,path = '.'):
        """Write every vote of the day to '<date>-<chamber>-divisions.csv' in `path`.

        One row is written per listed name: all ayes of a division first
        (vote 'aye'), then all nays (vote 'nay').  Each row also carries the
        division's parent and grandparent titles and the sitting date.
        """
        columns = {'name': [], 'vote': [], 'parent_title': [],
                   'grandparent_title': [], 'date': []}
        for debate in self.debates:
            for subdebate in debate.subdebates:
                for division in subdebate.divisions:
                    # Walk each list with its own label.  This keeps every
                    # column the same length even if a name appears in both
                    # lists, and avoids a membership test per member.
                    for label, members in (('aye', division.ayes), ('nay', division.nays)):
                        for member in members:
                            columns['name'].append(member)
                            columns['vote'].append(label)
                            columns['parent_title'].append(division.parent_title)
                            columns['grandparent_title'].append(division.grandparent_title)
                            columns['date'].append(self.date)

        data_frame = pd.DataFrame(columns)
        data_frame.to_csv(self._csv_path(path, 'divisions.csv'),encoding = 'utf-8',index = False)

    def write_speeches(self,path='.'):
        """Write all the debates to a csv file (one file per day)

        The file is '<date>-<chamber>-speeches.csv' in `path`, with one row
        per speech: speaker name, party, date, text, chamber and the titles
        of the debate and subdebate the speech was collected from.
        """
        #ToDo: extend to also include votes
        columns = {'name': [], 'party': [], 'date': [], 'text': [], 'chamber': [],
                   'debate_title': [], 'subdebate_title': []}
        for debate in self.debates:
            debate_title = self._title_text(debate.title)
            for subdebate in debate.subdebates:
                subdebate_title = self._title_text(subdebate.title)
                for speech in subdebate.speeches:
                    columns['name'].append(speech.name)
                    columns['party'].append(speech.party)
                    columns['date'].append(self.date)
                    columns['text'].append(speech.text)
                    columns['chamber'].append(self.chamber)
                    columns['debate_title'].append(debate_title)
                    columns['subdebate_title'].append(subdebate_title)

        data_frame = pd.DataFrame(columns)
        data_frame.to_csv(self._csv_path(path, 'speeches.csv'),encoding='utf-8',index=False)

    def write_proceedings(self,speeches_path='.',divisions_path='.'):
        """Write the speeches CSV to `speeches_path` and the divisions CSV to `divisions_path`."""
        self.write_speeches(speeches_path)
        self.write_divisions(divisions_path)

FixMe:
* ~~Split up debates into subdebates~~
* ~~Add divisions~~
* Add a dictionary that maps MPs' names as listed in divisions to their names as listed elsewhere.

Since building the BeautifulSoup representation of each XML file is somewhat slow, it's worth parallelising the process:

In [4]:
def xml_to_csv(x):
    """Convert one XML file into its speeches and divisions CSV files.

    `x` is a (data_file, speeches_path, divisions_path) sequence packed into
    a single argument, so the function can be handed to `Pool.map`.
    """
    data_file = x[0]
    speeches_path = x[1]
    divisions_path = x[2]
    # Read raw bytes and let the XML parser work out the encoding from the
    # document itself; text mode would decode with the platform default.
    with open(data_file,'rb') as f:
        markup = f.read()
    SittingDay(BeautifulSoup(markup,'xml')).write_proceedings(speeches_path,divisions_path)

(speeches_path,divisions_path) = ('../tidied_parliamentary_data/speeches','../tidied_parliamentary_data/divisions')

# Create both output directories.  An already-existing directory is fine;
# any other failure (e.g. permission denied, or a file in the way) is
# re-raised instead of surfacing later as a confusing write error.
for output_dir in (speeches_path, divisions_path):
    try:
        os.makedirs(output_dir)
    except OSError:
        if not os.path.isdir(output_dir):
            raise

# One argument tuple per XML file, in the shape xml_to_csv expects.
# (The name `input` shadows the builtin but is kept because later cells use it.)
input = [(data_file,speeches_path,divisions_path) for data_file in data_files]

In [5]:
%%time
pool = multiprocessing.Pool(multiprocessing.cpu_count())
pool.map(xml_to_csv,input)

AttributeError: 'Division' object has no attribute 'grantparent_title'

In [6]:
# Sanity check: show the title of the element enclosing the first division
# in the first input file.
df = input[0][0]  # path of the first XML file (not a DataFrame)
# Read bytes and name the parser explicitly so this cell parses the file the
# same way xml_to_csv does, rather than with whichever parser bs4 picks.
with open(df,'rb') as f:
    x= f.read()
soup = BeautifulSoup(x,'xml').find('chamber.xscript')
soup.find('division').parent.title.get_text()

'Second Reading'