In [1]:
# Directory that load_corpus() searches for the Hansard "*.xml" files.
corpus_path = "corpora/hansard/hansard/"

In [2]:
import pandas as pd
import numpy as np
from glob import glob
from os import path
import itertools

In [3]:
import xml.etree.ElementTree as ET

In [4]:
def try_find_text(ele, xpath):
    """Return the text of the first element under ``ele`` matching ``xpath``.

    Returns the empty string when nothing matches, or when the matched
    element has no text, so callers never have to deal with ``None``.
    """
    match = ele.find(xpath)
    # Compare against None explicitly rather than testing the element itself.
    if match is None:
        return ""
    return match.text or ""
    
def nuke_whitespace(string):
    """Collapse every run of whitespace in ``string`` to a single space.

    Leading and trailing whitespace is removed as well, because
    ``str.split()`` with no arguments discards it.
    """
    words = string.split()
    return " ".join(words)

In [5]:
def load_speech(speech_ele):
    """Extract the speaker metadata and text from one speech element.

    Returns a 6-tuple ``(name, electorate, party, ingov, first_speech, text)``.
    ``name``, ``electorate`` and ``party`` are strings ("" when absent).
    ``ingov`` and ``first_speech`` are booleans parsed from the integer text
    of ``in.gov`` / ``first.speech``, or "" when that element is absent or
    empty. ``text`` is the element's own leading text followed by the text
    of its direct ``para`` children, with whitespace normalised.
    """
    def read_flag(xpath):
        # Empty or missing flags stay "" so they can be told apart from False.
        raw = try_find_text(speech_ele, xpath)
        return bool(int(raw)) if raw else ""

    name = try_find_text(speech_ele, ".//name")
    electorate = try_find_text(speech_ele, ".//electorate")
    party = try_find_text(speech_ele, ".//party")
    ingov = read_flag(".//in.gov")
    first_speech = read_flag(".//first.speech")

    # Only the speech's own text and its immediate <para> children are used.
    pieces = [speech_ele.text or ""]
    pieces.extend(para.text or "" for para in speech_ele.findall("./para"))
    text = nuke_whitespace(" ".join(pieces))

    return name, electorate, party, ingov, first_speech, text
        

In [None]:
def load_file(filename):
    """Yield one record per usable speech found in a Hansard XML file.

    Each record is the 9-tuple ``(date, debate_type, debate_title, name,
    electorate, party, ingov, first_speech, text)``. Speeches with no text
    or no party are skipped. The text of the first ``date`` element in the
    file is used for every record.

    This is a generator, so the file is only parsed once iteration starts.
    An ``AttributeError`` is raised at that point if the file has no
    ``date`` element.
    """
    tree = ET.ElementTree(file=filename)
    date = tree.find(".//date").text

    for debate in tree.findall(".//debate"):
        debate_type = try_find_text(debate, ".//type")
        debate_title = try_find_text(debate, ".//title")

        for speech in debate.findall(".//speech"):
            fields = load_speech(speech)
            party, text = fields[2], fields[5]
            # We need to know at least the party and the text.
            if not (text and party):
                continue
            yield (date, debate_type, debate_title) + fields
        

def load_corpus(corpus_path):
    """Load every ``*.xml`` file directly inside ``corpus_path`` into a DataFrame.

    Parameters
    ----------
    corpus_path : str
        Directory containing the XML files (not searched recursively).

    Returns
    -------
    pandas.DataFrame
        One row per record yielded by ``load_file``, with the columns
        ``date, debate_type, debate_title, name, electorate, party, ingov,
        first_speech, text``. The frame is empty, but still has these
        columns, when no XML files are found.
    """
    # glob() makes no ordering guarantee. Sort so that the row order, and
    # therefore the index written out with the frame, is the same on every
    # run and every machine.
    filenames = sorted(glob(path.join(corpus_path, "*.xml")))

    all_speeches = [
        record
        for filename in filenames
        for record in load_file(filename)
    ]

    columns = [
        "date", "debate_type", "debate_title",
        "name", "electorate", "party",
        "ingov", "first_speech", "text",
    ]
    return pd.DataFrame(all_speeches, columns=columns)


In [None]:
# Parse every XML file under corpus_path into one DataFrame
# (one row per speech that has both text and a party).
corpus = load_corpus(corpus_path)

In [None]:
#   AG          - Australian Greens
#   ALP         - Labor
#   AUS         - Australia First??
#   CLP         - Country Liberal Party?
#   IND         - Independent (no party)
#   Ind         - Independent (no party)
#   Ind.        - Independent (no party)
#   LP          - Liberal
#   N/A         - same as UNKNOWN? or guest?
#   NATS        - Nationals
#   NP          - Nationals
#   NPActing    - Nationals
#   Nats        - Nationals
#   NatsWA      - Nationals WA
#   PUP         - Palmer United
#   UNKNOWN


# Variant spellings of a party code mapped to one canonical code.
_PARTY_ALIASES = {}
_PARTY_ALIASES.update(dict.fromkeys("IND Ind Ind.".split(), "IND"))
_PARTY_ALIASES.update(dict.fromkeys("NATS NP NPActing Nats NatsWA".split(), "NATS"))


def get_consistent_name(name):
    """Map a raw party code to its canonical form.

    All Independent spellings become "IND" and all Nationals variants
    become "NATS". A null value (as judged by ``pd.isnull``) becomes
    "NOT SPECIFIED". Any other code is returned unchanged.
    """
    canonical = _PARTY_ALIASES.get(name)
    if canonical is not None:
        return canonical
    return "NOT SPECIFIED" if pd.isnull(name) else name

# Replace each raw party code with its canonical spelling.
corpus["party"] = corpus["party"].apply(get_consistent_name)


In [None]:
def get_meta_party(name):
    """Group a canonical party code into one of three blocs.

    "NATS", "CLP" and "LP" map to "LNC", "ALP" maps to itself, and every
    other code maps to "OTHER".
    """
    coalition_codes = {"NATS", "CLP", "LP"}
    if name in coalition_codes:
        return "LNC"
    return "ALP" if name == "ALP" else "OTHER"
    
# Derive the coarse bloc (LNC / ALP / OTHER) from the party column.
corpus["metaparty"] = corpus["party"].apply(get_meta_party)


In [None]:
def formalise(name):
    """Turn "First Last" into "Last, First, MP".

    ``name`` must consist of exactly two whitespace-separated words;
    anything else raises ``ValueError`` from the tuple unpacking.
    """
    given, family = name.split()
    return "{}, {}, MP".format(family, given)

# Office holders, converted to the "Last, First, MP" form built by formalise().
# NOTE(review): this assumes the corpus `name` column uses exactly that
# form and these given names -- confirm against the data.
prime_ministers = frozenset(
    formalise(person)
    for person in (
        "Bob Hawke", "Paul Keating", "John Howard",
        "Julia Gillard", "Kevin Rudd", "Tony Abbott",
    )
)
opposition_leaders = frozenset(
    formalise(person)
    for person in (
        "John Howard", "Andrew Peacock", "John Hewson", "Alexander Downer",
        "John Howard", "Kim Beazley", "Simon Crean", "Mark Latham",
        "Kim Beazley", "Kevin Rudd", "Brendan Nelson", "Malcolm Turnbull",
        "Tony Abbott", "Chris Bowen", "Bill Shorten",
    )
)

# Membership is tested on the name alone, so a person in either set is
# flagged on every one of their rows, whatever the date of the speech.
corpus["prime_minister"] = corpus["name"].apply(lambda speaker: speaker in prime_ministers)
corpus["opposition_leader"] = corpus["name"].apply(lambda speaker: speaker in opposition_leaders)


In [None]:
# Normalise casing so one debate type is not split across several spellings.
corpus["debate_type"] = corpus["debate_type"].str.upper()

# try_find_text() returns "" (never NaN) when a debate has no <title>, so a
# null check alone never matches. Treat blank titles as missing as well.
title_missing = corpus["debate_title"].isnull() | (corpus["debate_title"].str.strip() == "")
corpus.loc[title_missing, "debate_title"] = "NOT_SPECIFIED"


In [None]:
# Write the processed corpus DataFrame to CSV.
# NOTE(review): to_csv does not create missing directories -- confirm that
# "serialised/" exists before running this cell.
corpus.to_csv("serialised/hansard.csv")

In [None]:
# Stage this notebook for committing.
!git add hansard.ipynb

In [None]:
!git commit -m="made hansard normalise party names, correctly for NATS"

In [None]:
# Push the local commits to the configured remote.
!git push