In [9]:
# Relative path to a Hansard corpus directory.
# NOTE(review): none of the cells visible here read this variable; load_corpus
# is later called with a literal path, so this may be a leftover. Confirm
# before relying on it.
corpus_path = "corpora/hansard/hansard/"

In [10]:
import pandas as pd
import numpy as np
from glob import glob
from os import path
import itertools

In [11]:
import xml.etree.ElementTree as ET

In [12]:
def try_find_text(ele, xpath):
    """Return the text of the first match for ``xpath`` under ``ele``.

    Falls back to the empty string both when nothing matches and when the
    matched element has no text, so the result is always a string.
    """
    match = ele.find(xpath)
    if match is None:
        return ""
    # ``text`` is None for an element without leading text; report that as "".
    return match.text or ""
    
def nuke_whitespace(string):
    """Collapse each run of whitespace in ``string`` to one space and trim the ends."""
    # split() with no argument discards leading/trailing whitespace and never
    # yields empty tokens, so the joined result has no whitespace at its edges.
    words = string.split()
    return " ".join(words)

In [13]:
def load_speech_metadata(speech_ele):
    """Extract the speaker metadata found under a speech element.

    Returns a 5-tuple ``(name, electorate, party, ingov, first_speech)``.
    The first three are strings, "" when the element is absent or empty.
    ``ingov`` and ``first_speech`` are ``bool(int(text))`` when the matching
    element has text and "" otherwise. ``int`` raises ValueError if that text
    is not an integer literal.
    """
    def to_flag(raw):
        # Keep "" as the "not recorded" marker; only real text is parsed.
        if not raw:
            return ""
        return bool(int(raw))

    return (
        try_find_text(speech_ele, ".//name"),
        try_find_text(speech_ele, ".//electorate"),
        try_find_text(speech_ele, ".//party"),
        to_flag(try_find_text(speech_ele, ".//in.gov")),
        to_flag(try_find_text(speech_ele, ".//first.speech")),
    )

def load_speech(speech_ele):
    """Return a speech's metadata followed by its whitespace-normalised text.

    The result is the 6-tuple
    ``(name, electorate, party, ingov, first_speech, text)``.
    """
    name, electorate, party, ingov, first_speech = load_speech_metadata(speech_ele)

    # Two markups are collected so both the pre- and post-2012 layouts work:
    # 'HPS-Normal' spans first, then <para> elements.
    # NOTE(review): only each node's ``.text`` is used, i.e. the text before
    # its first child element; text inside or after nested elements is not
    # collected. Confirm that is intended for this corpus.
    pieces = []
    for query in (".//span[@class='HPS-Normal']", ".//para"):
        for node in speech_ele.findall(query):
            pieces.append(node.text or "")

    text = nuke_whitespace(" ".join(pieces))
    return name, electorate, party, ingov, first_speech, text

def load_file(filename):
    """Lazily yield one record per usable speech in a Hansard XML file.

    Each record is the 9-tuple ``(date, debate_type, debate_title, name,
    electorate, party, ingov, first_speech, text)``. Speeches with an empty
    party or empty text are skipped. Raises AttributeError when the document
    has no ``date`` element, because ``find`` then returns None.
    """
    tree = ET.ElementTree(file=filename)
    sitting_date = tree.find(".//date").text

    for debate_ele in tree.findall(".//debate"):
        kind = try_find_text(debate_ele, ".//type")
        # TODO: make the title lookup work for post-2012 files (something about subdebate titles)
        heading = try_find_text(debate_ele, ".//title")

        for speech_ele in debate_ele.findall(".//speech"):
            name, electorate, party, ingov, first_speech, text = load_speech(speech_ele)
            # A record is only useful when both the party and the text are known.
            if not (text and party):
                continue
            yield sitting_date, kind, heading, name, electorate, party, ingov, first_speech, text
        

In [14]:

def load_corpus(corpus_path):
    """Load every ``*.xml`` file directly inside ``corpus_path`` into one DataFrame.

    Files are read in sorted filename order so the row order, and with it the
    default integer index, is the same on every run and machine. ``glob`` on
    its own returns matches in arbitrary order.

    Parameters
    ----------
    corpus_path : str
        Directory holding the XML transcripts; it is not searched recursively.

    Returns
    -------
    pandas.DataFrame
        One row per record yielded by ``load_file``, with columns date,
        debate_type, debate_title, name, electorate, party, ingov,
        first_speech and text. The frame is empty, but keeps these columns,
        when no file matches.
    """
    # ``glob`` and ``path`` come from the notebook's import cell.
    filenames = sorted(glob(path.join(corpus_path, "*.xml")))

    # from_iterable walks the per-file generators one after another.
    all_speeches = list(
        itertools.chain.from_iterable(load_file(filename) for filename in filenames)
    )
    return pd.DataFrame(
        all_speeches,
        columns=["date", "debate_type", "debate_title", "name", "electorate",
                 "party", "ingov", "first_speech", "text"],
    )


In [15]:
# Only one date range is processed per run; swap which of the two calls is
# commented out to switch corpora.
#corpus = load_corpus("corpora/hansard/hansard_2012_to_2014/")
corpus = load_corpus("corpora/hansard/hansard_1998_to_2012/")
# Preview the first rows instead of echoing the whole frame.
corpus.head(10)

Unnamed: 0,date,debate_type,debate_title,name,electorate,party,ingov,first_speech,text
0,2011-09-14,BILLS,BILLS,"Abbott, Tony, MP",Warringah,LP,,,Let us consider the record of this Prime Minis...
1,2011-09-14,BILLS,BILLS,"Dreyfus, Mark, MP",Isaacs,ALP,,,After decades of parliamentary debate about cl...
2,2011-09-14,BILLS,BILLS,"Hunt, Greg, MP",Flinders,LP,,,"So let us begin with the heart of the matter, ..."
3,2011-09-14,BILLS,BILLS,"Leigh, Andrew, MP",Fraser,ALP,,,One way of regarding climate change mitigation...
4,2011-09-14,BILLS,BILLS,"Macfarlane, Ian, MP",Groom,LP,,,This is a tax which will destroy Australia's c...
5,2011-09-14,BILLS,BILLS,"Neumann, Shayne, MP",Blair,ALP,,,So he is another sceptic. He has gone from bei...
6,2011-09-14,BILLS,BILLS,"Bishop, Julie, MP",Curtin,LP,,,We do not simply oppose it because the Prime M...
7,2011-09-14,BILLS,BILLS,"Perrett, Graham, MP",Moreton,ALP,,,As the largest polluter per person in the worl...
8,2011-09-14,BILLS,BILLS,"Ruddock, Philip, MP",Berowra,LP,,,I am strongly of the view that Australia shoul...
9,2011-09-14,BILLS,BILLS,"Crean, Simon, MP",Hotham,ALP,,,Those successes that I talk about all came fro...


In [16]:
# Tally rows per debate type. The counts show the same label in different
# casings (e.g. 'Bills' and 'BILLS'); the column is upper-cased in a later cell.
list(corpus.debate_type.value_counts().items())

[('Bills', 26506),
 ('BILLS', 6727),
 ('Adjournment', 6234),
 ('Statements by Members', 3928),
 ('Committees', 2976),
 ("Private Members' Business", 2858),
 ('Miscellaneous', 2650),
 ('Matters of Public Importance', 2459),
 ('ADJOURNMENT', 1557),
 ('Constituency Statements', 1320),
 ('Grievance Debate', 1145),
 ('Personal Explanations', 1109),
 ('STATEMENTS BY MEMBERS', 1047),
 ('Condolences', 905),
 ("PRIVATE MEMBERS' BUSINESS", 901),
 ('Ministerial Statements', 848),
 ('CONSTITUENCY STATEMENTS', 792),
 ('COMMITTEES', 779),
 ('MATTERS OF PUBLIC IMPORTANCE', 738),
 ('Motions', 491),
 ('Distinguished Visitors', 432),
 ('Business', 416),
 ("Governor-General's Speech", 415),
 ('CONDOLENCES', 389),
 ('MOTIONS', 372),
 ('Ministerial Arrangements', 358),
 ('Questions Without Notice', 258),
 ('Documents', 255),
 ('STATEMENTS ON INDULGENCE', 244),
 ('Delegation Reports', 239),
 ('BUSINESS', 222),
 ('PRIVATE MEMBERS’ BUSINESS', 221),
 ('GRIEVANCE DEBATE', 198),
 ('MINISTERIAL STATEMENTS', 191),

In [17]:
#   AG          - Australian Greens
#   ALP         - Labor
#   AUS         - Australia First??
#   CLP         - Country Liberal Party?
#   IND         - Independent (no party)
#   Ind         - Independent (no party)
#   Ind.        - Independent (no party)
#   LP          - Liberal
#   N/A         - same as UNKNOWN? or guest?
#   NATS        - Nationals
#   NP          - Nationals
#   NPActing    - Nationals
#   Nats        - Nationals
#   NatsWA      - Nationals WA
#   PUP         - Palmer United
#   UNKNOWN


def get_consistent_name(name):
    """Map the variant spellings of a party label onto one canonical label.

    The independent variants become "IND", the Nationals variants become
    "NATS", a null value becomes "NOT SPECIFIED", and anything else is
    returned unchanged.
    """
    independents = frozenset("IND Ind Ind.".split())
    nationals = frozenset("NATS NP NPActing Nats NatsWA".split())

    if name in independents:
        return "IND"
    if name in nationals:
        return "NATS"
    return "NOT SPECIFIED" if pd.isnull(name) else name

# Overwrite the party column with the canonical labels. Re-running this cell
# is harmless: every canonical label maps to itself.
corpus.party = corpus.party.apply(get_consistent_name)


In [18]:
def get_meta_party(name):
    """Collapse a party label into one of three groups.

    "NATS", "CLP" and "LP" map to "LNC" (presumably the Liberal/National
    coalition), "ALP" stays "ALP", and every other value maps to "OTHER".
    """
    coalition = frozenset("NATS CLP LP".split())
    if name in coalition:
        return "LNC"
    return "ALP" if name == "ALP" else "OTHER"
    
# Add the coarse party grouping as a new column. This relies on the party
# column already holding canonical labels: only "NATS" is recognised here, so
# a raw variant such as "NP" would be grouped as OTHER.
corpus["metaparty"] = corpus.party.apply(get_meta_party)


In [19]:
def formalise(name):
    """Turn ``"First Last"`` into ``"Last, First, MP"``.

    Raises ValueError unless ``name`` splits into exactly two
    whitespace-separated parts.
    """
    given, family = name.split()
    return ", ".join((family, given, "MP"))

# Office holders are listed as "First Last" and converted with formalise() so
# they can be compared directly with the values in the name column.
prime_ministers = frozenset(
    formalise(person)
    for person in (
        "Bob Hawke", "Paul Keating", "John Howard",
        "Julia Gillard", "Kevin Rudd", "Tony Abbott",
    )
)
# Some people are listed more than once below; the frozenset keeps one entry each.
opposition_leaders = frozenset(
    formalise(person)
    for person in (
        "John Howard", "Andrew Peacock", "John Hewson", "Alexander Downer",
        "John Howard", "Kim Beazley", "Simon Crean", "Mark Latham",
        "Kim Beazley", "Kevin Rudd", "Brendan Nelson", "Malcolm Turnbull",
        "Tony Abbott", "Chris Bowen", "Bill Shorten",
    )
)
# Both flags are decided by speaker name alone; the speech date is not
# consulted, so a row is flagged whenever its speaker appears in the set.
corpus["prime_minister"] = corpus.name.apply(lambda speaker: speaker in prime_ministers)
corpus["opposition_leader"] = corpus.name.apply(lambda speaker: speaker in opposition_leaders)


In [20]:
# Normalise casing so that e.g. "Bills" and "BILLS" count as one debate type.
corpus.debate_type = corpus.debate_type.str.upper()
# Give untitled debates a visible placeholder. A missing title is loaded as
# the empty string, not as a null, so "" has to be matched as well; testing
# for nulls alone never fires on a freshly loaded corpus.
corpus.loc[corpus.debate_title.isnull() | (corpus.debate_title == ""), "debate_title"] = "-"
# Merge the typographic-apostrophe spelling into the ASCII one.
corpus.loc[corpus.debate_type=="PRIVATE MEMBERS’ BUSINESS", "debate_type"] = "PRIVATE MEMBERS' BUSINESS"

In [21]:
# Write the cleaned corpus to CSV. The commented-out line is the counterpart
# for the 2012-2014 corpus; keep the active line in step with whichever
# corpus was loaded earlier.
# to_csv is called with its default index setting, so the row index is
# written as an unnamed first column.
#corpus.to_csv("serialised/hansard_2012_2014.csv")
corpus.to_csv("serialised/hansard_1998_2012.csv")

In [22]:
# Shell escape: stage hansard.ipynb for the commit made in the next cell.
!git add hansard.ipynb

In [23]:
# Pass the message as a separate argument to -m. The previous `-m=""` form
# made git record the literal message "=", as the recorded output shows.
!git commit -m "Update Hansard corpus loading notebook"

[master b27da4d] =
 1 file changed, 351 insertions(+), 1402 deletions(-)
 rewrite prototypes/Corpus/hansard.ipynb (84%)


In [24]:
# Shell escape: publish the commit. The recorded output shows this attempt
# was rejected because the remote held commits that were not present locally;
# git's hint suggests integrating them (e.g. with `git pull`) before pushing.
!git push

To git@github.com:oxinabox/phd.git
 ! [rejected]        master -> master (fetch first)
error: failed to push some refs to 'git@github.com:oxinabox/phd.git'
hint: Updates were rejected because the remote contains work that you do
hint: not have locally. This is usually caused by another repository pushing
hint: to the same ref. You may want to first integrate the remote changes
hint: (e.g., 'git pull ...') before pushing again.
hint: See the 'Note about fast-forwards' in 'git push --help' for details.
