In [2]:
# Directory holding the Hansard XML files; passed to load_corpus further down.
corpus_path = "corpora/hansard/hansard/"

In [3]:
import pandas as pd
import numpy as np
from glob import glob
from os import path
import itertools

In [4]:
import xml.etree.ElementTree as ET

In [8]:
def try_find_text(ele, xpath):
    """Return the text of the first element under ``ele`` matching ``xpath``.

    Gives back the empty string when nothing matches, or when the matched
    element carries no text.
    """
    match = ele.find(xpath)
    if match is None:
        return ""
    return match.text or ""
    
def nuke_whitespace(string):
    """Collapse every run of whitespace in ``string`` to a single space.

    Leading and trailing whitespace is removed as well.
    """
    # split() with no separator already discards leading/trailing whitespace.
    words = string.split()
    return " ".join(words)

In [9]:
def load_speech(speech_ele):
    """Pull the speaker details and spoken text out of a single speech element.

    Returns the tuple ``(name, electorate, party, ingov, first_speech, text)``.
    ``name``, ``electorate`` and ``party`` are the text of the first matching
    descendant element, or ``""`` when absent.  ``ingov`` and ``first_speech``
    are booleans parsed from integer text, or ``""`` when the element is
    missing or empty.  ``text`` is the element's own leading text followed by
    the text of its direct ``para`` children, with whitespace collapsed.
    """
    def read_flag(xpath):
        # Non-empty flag text is parsed as an integer and then reduced to a bool.
        raw = try_find_text(speech_ele, xpath)
        return bool(int(raw)) if raw else ""

    name = try_find_text(speech_ele, ".//name")
    electorate = try_find_text(speech_ele, ".//electorate")
    party = try_find_text(speech_ele, ".//party")
    ingov = read_flag(".//in.gov")
    first_speech = read_flag(".//first.speech")

    fragments = [speech_ele.text or ""]
    fragments.extend(para.text or "" for para in speech_ele.findall("./para"))
    text = nuke_whitespace(" ".join(fragments))
    return name, electorate, party, ingov, first_speech, text
        

In [10]:
def load_file(filename):
    """Yield one record per usable speech in the XML file ``filename``.

    Each record is the tuple
    ``(date, name, electorate, party, ingov, first_speech, text)``, where
    ``date`` is the text of the first ``date`` element found in the document
    and the remaining fields come from ``load_speech``.  Speeches whose party
    or text is empty are skipped.
    """
    tree = ET.ElementTree(file=filename)
    sitting_date = tree.find(".//date").text
    for speech_ele in tree.findall(".//speech"):
        fields = load_speech(speech_ele)
        name, electorate, party, ingov, first_speech, text = fields
        # We need to know at least the party and the text.
        if not (text and party):
            continue
        yield (sitting_date,) + (name, electorate, party, ingov, first_speech, text)
        

def load_corpus(corpus_path):
    """Load every ``*.xml`` file directly inside ``corpus_path`` into one DataFrame.

    Files are processed in sorted filename order so that the row order (and
    therefore the index) is reproducible; ``glob`` on its own returns matches
    in arbitrary order.

    Parameters
    ----------
    corpus_path : str
        Directory containing the XML files.

    Returns
    -------
    pandas.DataFrame
        One row per record yielded by ``load_file``, with the columns
        ``date``, ``name``, ``electorate``, ``party``, ``ingov``,
        ``first_speech`` and ``text``.  The frame is empty (but still has
        those columns) when no XML file is found.
    """
    filenames = sorted(glob(path.join(corpus_path, "*.xml")))

    # load_file is a generator, so each file is only parsed once the chain reaches it.
    all_speeches = list(
        itertools.chain.from_iterable(load_file(filename) for filename in filenames)
    )
    return pd.DataFrame(
        all_speeches,
        columns=["date", "name", "electorate", "party", "ingov", "first_speech", "text"],
    )


In [11]:
# Parse all XML files under corpus_path into a single DataFrame of speeches.
corpus = load_corpus(corpus_path)

In [12]:
#   AG          - Australian Greens
#   ALP         - Labor
#   AUS         - Australia First??
#   CLP         - Country Liberal Party?
#   IND         - Independent (no party)
#   Ind         - Independent (no party)
#   Ind.        - Independent (no party)
#   LP          - Liberal
#   N/A         - same as UNKNOWN? or guest?
#   NATS        - Nationals
#   NP          - Nationals
#   NPActing    - Nationals
#   Nats        - Nationals
#   NatsWA      - Nationals WA
#   PUP         - Palmer United
#   UNKNOWN


def get_consistent_name(name):
    """Map the spelling variants of a party code onto one canonical code.

    Parameters
    ----------
    name : str or missing value
        Party code as read from the corpus.

    Returns
    -------
    str
        ``"IND"`` for any of the independent spellings, ``"NATS"`` for any of
        the Nationals spellings, ``"NOT SPECIFIED"`` when ``name`` is a
        missing value (``None``/NaN), and ``name`` unchanged otherwise.
    """
    if name in {"IND", "Ind", "Ind."}:
        return "IND"
    if name in {"NATS", "NP", "NPActing", "Nats", "NatsWA"}:
        return "NATS"
    if pd.isnull(name):
        return "NOT SPECIFIED"
    return name

# Replace every party code with its canonical spelling.
corpus["party"] = corpus["party"].apply(get_consistent_name)


In [3]:
import pandas as pd
# Sanity check that pd.isnull reports NaN as missing (avoids the pd.np alias, which newer pandas no longer provides).
pd.isnull(float("nan"))

True

In [13]:
def get_meta_party(name):
    """Collapse a party code into one of three coarse groupings.

    ``NATS``, ``CLP`` and ``LP`` are grouped as ``"LNC"``; ``ALP`` stays
    ``"ALP"``; every other value becomes ``"OTHER"``.
    """
    coalition_codes = {"NATS", "CLP", "LP"}
    if name in coalition_codes:
        return "LNC"
    return "ALP" if name == "ALP" else "OTHER"
    
# Derive the coarse party grouping from the normalised party code.
corpus["metaparty"] = corpus["party"].apply(get_meta_party)


In [14]:
# Preview the first rows only rather than rendering the whole corpus.
corpus.head(10)

Unnamed: 0,date,name,electorate,party,ingov,first_speech,text,metaparty
0,2003-08-12,"Crean, Simon, MP",Hotham,ALP,False,False,Leave granted.,ALP
1,2003-08-12,"Howard, John, MP",Bennelong,LP,True,False,That this House: (1) endorses the Government's...,LNC
2,2003-08-12,"Crean, Simon, MP",Hotham,ALP,False,False,Under very difficult circumstances the Solomon...,ALP
3,2003-08-12,"Downer, Alexander, MP",Mayo,LP,True,False,Ever since its independence the Solomon Island...,LNC
4,2003-08-12,"Rudd, Kevin, MP",Griffith,ALP,False,False,"Finally, we support this mission because it ac...",ALP
5,2003-08-12,"Latham, Mark, MP",Werriwa,ALP,False,False,"Of course you would be aware, Mr Speaker, as a...",ALP
6,2003-08-12,"Latham, Mark, MP",Werriwa,ALP,False,False,It is the usual practice in this House for the...,ALP
7,2003-08-12,"Abbott, Tony, MP",Warringah,LP,True,False,Yet again on this issue the opposition have al...,LNC
8,2003-08-12,"Gillard, Julia, MP",Lalor,ALP,False,False,The obligation on the Prime Minister was quite...,ALP
9,2003-08-12,"Lloyd, Jim, MP",Robertson,LP,True,False,Question agreed to.,LNC


In [20]:
# Persist the party-normalised corpus (including the metaparty column) as CSV.
corpus.to_csv("serialised/hansard.csv")

In [21]:
# Stage hansard.ipynb for the next commit (IPython shell escape).
!git add hansard.ipynb

In [22]:
!git commit -m="made hansard normalise party names"

[master 7eb4983] =made hansard normalise party names
 1 file changed, 269 insertions(+), 121 deletions(-)


In [23]:
# Publish the commit to the configured remote (IPython shell escape).
!git push

Counting objects: 11, done.
Delta compression using up to 16 threads.
Compressing objects: 100% (11/11), done.
Writing objects: 100% (11/11), 14.61 KiB | 0 bytes/s, done.
Total 11 (delta 7), reused 0 (delta 0)
To git@github.com:oxinabox/phd.git
   57ceabc..7eb4983  master -> master
