In [None]:
# Hansard corpus normalisation: parse Hansard XML files into a DataFrame of
# speeches, normalise party labels, and serialise the result to CSV.

In [None]:
# NOTE(review): no visible cell reads this variable -- load_corpus() is later
# called with a literal path to a different directory. Confirm whether this
# is still needed.
corpus_path = "corpora/hansard/hansard/"

In [None]:
import pandas as pd
import numpy as np
from glob import glob
from os import path
import itertools

In [None]:
import xml.etree.ElementTree as ET

In [None]:
def try_find_text(ele, xpath):
    """Return the text of the first element under ``ele`` matching ``xpath``.

    Falls back to the empty string both when nothing matches and when the
    matched element has no text, so callers always get a ``str`` back.
    """
    match = ele.find(xpath)
    if match is None:
        return ""
    return match.text or ""
    
def nuke_whitespace(string):
    """Collapse every run of whitespace in ``string`` to a single space.

    Leading and trailing whitespace is removed as a consequence of splitting
    on whitespace and re-joining the pieces.
    """
    words = string.split()
    return " ".join(words)

In [190]:
# Scratch exploration of the sub-debate titles in one parsed file.
# NOTE(review): `xml` is not defined in any visible cell -- presumably an
# ElementTree left over from an earlier interactive session; confirm before
# relying on this cell after a kernel restart.
# ".//debate" replaces the absolute "//debate" path, which ElementTree only
# accepts with a FutureWarning.
d=xml.findall(".//debate")[1]
d.findall("./subdebate.1//title")[2].text

  if __name__ == '__main__':


'Low Aromatic Fuel Bill 2013, Public Service Amendment Bill 2013, Water Amendment (Water for the Environment Special Account) Bill 2013, Parliamentary Service Amendment Bill 2013, Social Security and Other Legislation Amendment (Income Support Bonus) Bill 2013, Crimes Legislation Amendment (Slavery, Slavery-like Conditions and People Trafficking) Bill 2013'

In [155]:
def load_speech_metadata(speech_ele):
    """Extract the speaker metadata fields from a speech element.

    Returns the 5-tuple ``(name, electorate, party, ingov, first_speech)``.
    The first three are the text of the first matching descendant, or ""
    when it is absent. ``ingov`` and ``first_speech`` come from the
    ``in.gov`` / ``first.speech`` descendants and are converted with
    ``bool(int(...))``; they stay "" when the element is missing or empty.
    """
    def flag(xpath):
        # Integer text -> bool; empty text is passed through as "".
        raw = try_find_text(speech_ele, xpath)
        if not raw:
            return ""
        return bool(int(raw))

    name = try_find_text(speech_ele, ".//name")
    electorate = try_find_text(speech_ele, ".//electorate")
    party = try_find_text(speech_ele, ".//party")
    return name, electorate, party, flag(".//in.gov"), flag(".//first.speech")

def load_speech(speech_ele):
    """Return the speaker metadata of a speech element followed by its text.

    The result is ``(name, electorate, party, ingov, first_speech, text)``.
    ``text`` is built from the ``.text`` of every ``span`` with class
    ``HPS-Normal`` and every ``para`` beneath the speech, joined with spaces
    and whitespace-normalised. Only each element's own leading text is used.
    """
    name, electorate, party, ingov, first_speech = load_speech_metadata(speech_ele)

    # Two markup variants are collected so that both pre and post 2012 files work.
    span_nodes = speech_ele.findall(".//span[@class='HPS-Normal']")
    para_nodes = speech_ele.findall(".//para")
    fragments = [node.text or "" for node in span_nodes + para_nodes]

    text = nuke_whitespace(" ".join(fragments))
    return name, electorate, party, ingov, first_speech, text

def load_file(filename):
    """Lazily yield one record per usable speech in a single XML file.

    Each record is the tuple ``(date, debate_type, debate_title, name,
    electorate, party, ingov, first_speech, text)``. ``date`` is the text of
    the first ``date`` element in the file; the debate fields come from the
    enclosing ``debate`` element. Speeches with no text or no party are
    skipped.
    """
    tree = ET.parse(filename)
    sitting_date = tree.find(".//date").text

    for debate_ele in tree.findall(".//debate"):
        debate_type = try_find_text(debate_ele, ".//type")
        #TODO: Make this work for Post 2012 (something about subdebate titles)
        debate_title = try_find_text(debate_ele, ".//title")

        for speech_ele in debate_ele.findall(".//speech"):
            name, electorate, party, ingov, first_speech, text = load_speech(speech_ele)
            # we need to at least know the party and the text
            if not (text and party):
                continue
            yield (sitting_date, debate_type, debate_title,
                   name, electorate, party, ingov, first_speech, text)
        

In [156]:

def load_corpus(corpus_path):
    """Load every ``*.xml`` file directly inside ``corpus_path`` into one DataFrame.

    Parameters
    ----------
    corpus_path : str
        Directory containing the XML files (not searched recursively).

    Returns
    -------
    pandas.DataFrame
        One row per record yielded by ``load_file``, with columns ``date``,
        ``debate_type``, ``debate_title``, ``name``, ``electorate``,
        ``party``, ``ingov``, ``first_speech`` and ``text``. The frame is
        empty (but has these columns) when no XML file is found.
    """
    # glob() returns matches in arbitrary, filesystem-dependent order; sorting
    # makes the row order (and any index later written to disk) reproducible.
    filenames = sorted(glob(path.join(corpus_path, "*.xml")))

    # load_file is a generator, so each file is parsed only when the chain
    # reaches it.
    all_speeches = list(itertools.chain.from_iterable(
        load_file(filename) for filename in filenames))

    return pd.DataFrame(
        all_speeches,
        columns=["date", "debate_type", "debate_title", "name", "electorate",
                 "party", "ingov", "first_speech", "text"],
    )


In [192]:
# Load the 2012-2014 Hansard directory into one DataFrame (one row per
# speech), then preview the first rows instead of echoing the whole frame.
corpus = load_corpus("corpora/hansard/hansard_2012_to_2014/")
corpus.head(10)

Unnamed: 0,date,debate_type,debate_title,name,electorate,party,ingov,first_speech,text
0,2013-06-27,MINISTERIAL ARRANGEMENTS,MINISTERIAL ARRANGEMENTS,"Rudd, Kevin, MP",Griffith,ALP,,,I inform the House that last evening the Feder...
1,2013-06-27,BUSINESS,BUSINESS,"Morrison, Scott, MP",Cook,LP,,,This parliament has sought to have this matter...
2,2013-06-27,BUSINESS,BUSINESS,"Albanese, Anthony, MP",Grayndler,ALP,,,The only section of this bill that remains to ...
3,2013-06-27,COMMITTEES,COMMITTEES,"Oakeshott, Robert, MP",Lyne,Ind.,,,Leave granted. In accordance with standing ord...
4,2013-06-27,COMMITTEES,COMMITTEES,"Oakeshott, Robert, MP",Lyne,Ind.,,,Leave granted.
5,2013-06-27,PARLIAMENTARY REPRESENTATION,PARLIAMENTARY REPRESENTATION,"Oakeshott, Robert, MP",Lyne,Ind.,,,I have just come from a morning tea with some ...
6,2013-06-27,COMMITTEES,COMMITTEES,"Marino, Nola, MP",Forrest,LP,,,In accordance with standing order 39(f) the re...
7,2013-06-27,COMMITTEES,COMMITTEES,"Neumann, Shayne, MP",Blair,ALP,,,Question agreed to.
8,2013-06-27,COMMITTEES,COMMITTEES,"Neumann, Shayne, MP",Blair,ALP,,,Question agreed to.
9,2013-06-27,COMMITTEES,COMMITTEES,"Neumann, Shayne, MP",Blair,ALP,,,Question agreed to.


In [197]:
# Number of speeches per debate type, as (debate_type, count) pairs.
list(corpus["debate_type"].value_counts().items())

[('BILLS', 6974),
 ("PRIVATE MEMBERS' BUSINESS", 1579),
 ('STATEMENTS BY MEMBERS', 1545),
 ('ADJOURNMENT', 1520),
 ('CONSTITUENCY STATEMENTS', 1307),
 ('MATTERS OF PUBLIC IMPORTANCE', 980),
 ('COMMITTEES', 744),
 ('CONDOLENCES', 475),
 ('MOTIONS', 430),
 ('STATEMENTS ON INDULGENCE', 398),
 ('BUSINESS', 326),
 ('GRIEVANCE DEBATE', 177),
 ('MINISTERIAL STATEMENTS', 163),
 ("GOVERNOR-GENERAL'S SPEECH", 128),
 ('DOCUMENTS', 100),
 ("AUDITOR-GENERAL'S REPORTS", 61),
 ('PERSONAL EXPLANATIONS', 55),
 ('DELEGATION REPORTS', 46),
 ('PETITIONS', 39),
 ('PARLIAMENTARY OFFICE HOLDERS', 36),
 ('PARLIAMENTARY REPRESENTATION', 33),
 ('STATEMENT BY THE SPEAKER', 21),
 ('PRIVILEGE', 16),
 ('DISTINGUISHED VISITORS', 15),
 ('STATEMENTS', 9),
 ('QUESTIONS WITHOUT NOTICE', 9),
 ('MINISTERIAL ARRANGEMENTS', 8),
 ('PARTY OFFICE HOLDERS', 7),
 ('QUESTIONS WITHOUT NOTICE: ADDITIONAL ANSWERS', 4),
 ('QUESTIONS TO THE SPEAKER', 3),
 ('TARIFF PROPOSALS', 3),
 ('ADDRESS BY THE PRIME MINISTER OF JAPAN', 2),
 ('PARL

In [198]:
#   AG          - Australian Greens
#   ALP         - Labor
#   AUS         - Australia First??
#   CLP         - Country Liberal Party?
#   IND         - Independent (no party)
#   Ind         - Independent (no party)
#   Ind.        - Independent (no party)
#   LP          - Liberal
#   N/A         - same as UNKNOWN? or guest?
#   NATS        - Nationals
#   NP          - Nationals
#   NPActing    - Nationals
#   Nats        - Nationals
#   NatsWA      - Nationals WA
#   PUP         - Palmer United
#   UNKNOWN


def get_consistent_name(name):
    """Map the variant spellings of a party label onto one canonical label.

    All independent spellings become "IND", all Nationals spellings become
    "NATS", a null value becomes "NOT SPECIFIED", and anything else is
    returned unchanged.
    """
    aliases = {
        "IND": frozenset(("IND", "Ind", "Ind.")),
        "NATS": frozenset(("NATS", "NP", "NPActing", "Nats", "NatsWA")),
    }
    for canonical, spellings in aliases.items():
        if name in spellings:
            return canonical

    if pd.isnull(name):
        return "NOT SPECIFIED"
    return name

# Rewrite the party column with the canonical label for every row.
corpus["party"] = corpus["party"].apply(get_consistent_name)


In [199]:
def get_meta_party(name):
    """Bucket a (canonical) party label into one of three groups.

    "ALP" stays "ALP"; "NATS", "CLP" and "LP" are grouped as "LNC"; every
    other label becomes "OTHER".
    """
    if name == "ALP":
        return "ALP"
    lnc_members = frozenset(("NATS", "CLP", "LP"))
    return "LNC" if name in lnc_members else "OTHER"
    
# Derive the coarse party grouping from the party column.
corpus["metaparty"] = corpus["party"].apply(get_meta_party)


In [200]:
def formalise(name):
    """Turn "First Last" into the "Last, First, MP" form.

    ``name`` must consist of exactly two whitespace-separated words;
    anything else raises ``ValueError`` from the unpacking.
    """
    given, family = name.split()
    return "{}, {}, MP".format(family, given)

# Names are converted with formalise() so they can be compared with the
# "Last, First, MP" values held in the corpus `name` column.
# NOTE(review): these flags are not date-aware -- a speaker is flagged whenever
# their name is in the list, regardless of the office held on the speech date.
prime_ministers = frozenset(map(formalise, ("Bob Hawke","Paul Keating","John Howard", "Julia Gillard","Kevin Rudd","Tony Abbott")))
opposition_leaders = frozenset(map(formalise, ("John Howard","Andrew Peacock","John Hewson","Alexander Downer","John Howard","Kim Beazley","Simon Crean","Mark Latham","Kim Beazley","Kevin Rudd","Brendan Nelson","Malcolm Turnbull","Tony Abbott","Chris Bowen","Bill Shorten")))
# Vectorised membership test instead of a per-row Python lambda.
corpus["prime_minister"] = corpus["name"].isin(prime_ministers)
corpus["opposition_leader"] = corpus["name"].isin(opposition_leaders)


In [201]:
# Normalise casing so differently-cased spellings of a debate type collapse
# into one category.
corpus.debate_type = corpus.debate_type.str.upper()
# try_find_text() returns "" (never None) for a missing title, so a null-only
# test never matches; blank titles are treated as missing as well.
title_missing = corpus.debate_title.isnull() | (corpus.debate_title.str.strip() == "")
corpus.loc[title_missing, "debate_title"] = "NOT_SPECIFIED"


In [202]:
# Persist the normalised corpus as CSV.
# NOTE(review): assumes the "serialised/" directory already exists -- nothing
# in the visible cells creates it.
corpus.to_csv("serialised/hansard_2012_2014.csv")

In [203]:
# Stage the notebook file ahead of the commit cell that follows.
!git add hansard.ipynb

In [204]:
# Pass the message as a separate argument: with `-m="..."` git keeps the "="
# as the first character of the commit message.
!git commit -m "made hansard normalise roughly work post 2012"

[master bfc3bad] =made hansard normalise roughly work post 2012
 1 file changed, 1103 insertions(+), 35 deletions(-)


In [205]:
# Push the local commits to the configured remote.
!git push

Counting objects: 5, done.
Delta compression using up to 16 threads.
Compressing objects: 100% (5/5), done.
Writing objects: 100% (5/5), 8.42 KiB | 0 bytes/s, done.
Total 5 (delta 3), reused 0 (delta 0)
To git@github.com:oxinabox/phd.git
   4794fd9..bfc3bad  master -> master
