# Exploring the legislation xml

Here I'll be pulling apart the XML files that we've managed to scrape. The idea is to look for interesting structures that would be worth rolling up into a dataset or turning into worthwhile visualisations.

In [63]:
import os, re
import random
from bs4 import BeautifulSoup, element

In [9]:
# Recursively gather the path of every .xml file under the scraped legislation directory.
xml_filepaths = [
    os.path.join(dirpath, filename)
    for dirpath, _subdirs, filenames in os.walk("../legislation/xml")
    for filename in filenames
    if filename.endswith(".xml")
]

In [10]:
# Peek at the first collected path to sanity-check the directory walk.
xml_filepaths[0]

'../legislation/xml/subscribe/act/public/1998/0118/1.0/096be8ed8009d734.xml'

We pick one of the XML files at random and parse it using the `BeautifulSoup` library.

In [175]:
# Pick one scraped file at random to explore; re-running this cell may select a different file.
xml_file = random.choice(xml_filepaths)
# Read raw bytes so the parser can take the encoding from the XML declaration
# instead of decoding with the platform's default text encoding.
with open(xml_file, "rb") as f:
    soup = BeautifulSoup(f.read(), "lxml-xml")

In [176]:
# Show the first child of the document's first <title> element.
# NOTE(review): soup.find() returns None when nothing matches, which would raise
# here — confirm every scraped file has a <title>.
soup.find('title').contents[0]

'Trade in Endangered Species Amendment Act 1999'

In [177]:
# Grab the first <body> element; the cleaning cells below rewrite this tree in place.
body = soup.find('body')
# Displaying the whole element can produce very long output.
body

<body id="DLM25209" prov-type="section" xml:lang="en-NZ"><prov id="DLM25210" irdnumbering="no" skeleton="no" toc="yes" xml:lang="en-NZ"><label auto.number="no" denominator="yes">1</label><heading xml:lang="en-NZ">Short Title</heading><prov.body><subprov skeleton="no" xml:lang="en-NZ"><label auto.number="no" denominator="no"/><para xml:lang="en-NZ"><text>This Act may be cited as the Trade in Endangered Species Amendment Act 1999, and is part of the <citation jurisdiction="nz"><atidlm:link atidlm:branchLabel="_NOVERSION_" atidlm:key="6609" atidlm:name="Trade in Endangered Species Act 1989" atidlm:state="UNCHANGED" atidlm:xmlId="DLM453602" xmlns:atidlm="http://www.arbortext.com/namespace/atidlm"><atidlm:linkcontent>Trade in Endangered Species Act 1989</atidlm:linkcontent><atidlm:resourcepair atidlm:isIncludeOrRef="1" atidlm:markupType="leg-title" atidlm:seqNum="6397" atidlm:sourceLocator="file:///C:/epicuser/Doctypes/NZAct/fragxyxz_DLM25205.xml" atidlm:sourceName="DLM453602" atidlm:source

In [178]:
# Drop every <cf> element from the body, echoing each one before it is destroyed.
cfTags = body.find_all('cf')
for cf_node in cfTags:
    print(cf_node)
    cf_node.decompose()  # removes the element and its children from the tree

In [179]:
# Drop every <notes> element from the body, echoing each one before it is destroyed.
notesTags = body.find_all('notes')
for notes_node in notesTags:
    print(notes_node)
    notes_node.decompose()  # removes the element and its children from the tree

In [180]:
# Replace each <label> with plain text so the numbering survives body.get_text():
# a non-empty label becomes a blank line, the label text and a trailing space;
# an empty label becomes a single newline.
labels = body.find_all('label')
for label in labels:
    # get_text() instead of contents[0]: string concatenation raised TypeError
    # when the first child was a nested tag, and any children after the first
    # were silently dropped.
    label_text = label.get_text()
    if label_text:
        label.replace_with('\n\n' + label_text + ' ')
    else:
        label.replace_with('\n')
labels

[<label auto.number="no" denominator="yes">1</label>,
 <label auto.number="no" denominator="no"/>,
 <label auto.number="no" denominator="yes">2</label>,
 <label auto.number="no" denominator="no"/>]

In [181]:
# Rules for <def-term> elements:
#   - if the first child is a tag, move it in front of the <def-term>,
#     preceded by a <text> element holding a blank line;
#   - if the first (remaining) child is a string, replace the whole
#     <def-term> with that string wrapped in asterisks.
defTerms = body.find_all('def-term')
for defTerm in defTerms:
    print(defTerm)
    contents = defTerm.contents
    if len(contents) > 0:
        if isinstance(contents[0], element.Tag):
            print(contents)
            content = contents[0].extract()
            newTag = soup.new_tag('text')
            newTag.string = '\n\n'
            defTerm.insert_before(newTag)
            defTerm.insert_before(content)
        # `contents` is the element's live child list, so the extract() above
        # may have emptied it; re-check the length before indexing to avoid an
        # IndexError on a <def-term> whose only child was a tag.
        if len(contents) > 0 and isinstance(contents[0], str):
            defTerm.replace_with('*' + contents[0] + '*')

In [182]:
# For each non-empty <def-para>, lift its first child out and place it in front
# of the element, preceded by a <text> element holding a blank line.
defParas = body.find_all('def-para')
for para_node in defParas:
    print(para_node)
    children = para_node.contents
    if not children:
        continue
    first_child = children[0].extract()
    separator = soup.new_tag('text')
    separator.string = '\n\n'
    para_node.insert_before(separator)
    para_node.insert_before(first_child)
defParas

[]

In [183]:
# Swap every <crosshead> for a blank line, printing the element before and
# after it is detached from the tree.
crossheads = body.find_all('crosshead')
for heading_node in crossheads:
    print(heading_node)
    heading_node.replace_with('\n\n')
    print(heading_node)
crossheads

[]

In [186]:
# Replace each non-empty <eqn-line> with its text set off by blank lines;
# an <eqn-line> with no children is left in place.
eqnLines = body.find_all('eqn-line')
for eqnLine in eqnLines:
    print(eqnLine)
    if len(eqnLine.contents) > 0:
        # get_text() instead of contents[0]: string concatenation raised
        # TypeError when the first child was a nested tag, and any children
        # after the first were silently dropped from the equation.
        eqnLine.replace_with('\n\n ' + eqnLine.get_text() + '\n\n')
eqnLines

[]

In [185]:
# Rules for <variable-def> elements:
#   - if the first child is a tag, move it in front of the <variable-def>,
#     preceded by a <text> element holding a blank line;
#   - if the first (remaining) child is a string, replace the whole
#     <variable-def> with a blank line, that string and a trailing space.
variableDefs = body.find_all('variable-def')
for variableDef in variableDefs:
    print(variableDef)
    contents = variableDef.contents
    if len(contents) > 0:
        if isinstance(contents[0], element.Tag):
            content = contents[0].extract()
            newTag = soup.new_tag('text')
            newTag.string = '\n\n'
            variableDef.insert_before(newTag)
            variableDef.insert_before(content)
        # `contents` is the element's live child list, so the extract() above
        # may have emptied it; re-check the length before indexing to avoid an
        # IndexError on a <variable-def> whose only child was a tag.
        if len(contents) > 0 and isinstance(contents[0], str):
            variableDef.replace_with('\n\n' + contents[0] + ' ')
variableDefs

[]

In [105]:
# Flatten the body tree to one plain-text string.
# NOTE(review): this captures the tree as it is when the cell runs, so it must be
# re-run after the tag-rewriting cells; its execution count is older than theirs.
bodyString = body.get_text()

In [187]:
# Preview the first 500 characters of the flattened text.
print(bodyString[:500])



1 Short Title and commencement

1 This Act may be cited as the Child Support Amendment Act 1993, and shall be read together with and deemed part of the Child Support Act 1991 (hereinafter referred to as the principal Act).

2 This Act shall be deemed to have come into force on the 18th day of December 1991.

2 Custodian in receipt of social security benefit must apply for formula assessment
This section substituted s 9(3) of the principal Act.

3 Automatic applications for formula assessment i
