# Prosecution Project OAI-PMH Demo

In [1]:
# pyoai provides a nice python API for OAI-PMH (http://infrae.com/download/OAI/pyoai)
from oaipmh.client import Client
from oaipmh.metadata import MetadataRegistry, oai_dc_reader
# other libraries we'll use
from datetime import datetime
from collections import defaultdict
from IPython.display import HTML, display
import tabulate

Simple OAI-PMH harvester in Python:

In [2]:
# Harvest every record from the Prosecution Project OAI-PMH endpoint and keep
# each record's Dublin Core metadata as a plain dict in `records`.
URL = 'https://oai.prosecutionproject-test.griffith.edu.au/oai'
records = []

# set up the harvester: the registry maps the 'oai_dc' metadata prefix to the
# reader that parses each record's metadata payload
registry = MetadataRegistry()
registry.registerReader('oai_dc', oai_dc_reader)
client = Client(URL, registry, force_http_get=True)

# track the start time of the harvest
_start = datetime.now()

# fetch all records from oai; each item is indexed below as record[1] to reach
# its metadata object
for record in client.listRecords(metadataPrefix='oai_dc'):
    _metadata = record[1]
    # pyoai yields metadata=None for records the repository has flagged as
    # deleted; calling getMap() on those would raise AttributeError and abort
    # the whole (long-running) harvest, so skip them instead.
    if _metadata is None:
        continue
    records.append(_metadata.getMap())

# record finish time
_finish = datetime.now()

# work out how long the harvester took (wall-clock seconds)
t_d = (_finish - _start).total_seconds()
print("Harvested {} records in {} seconds".format(len(records), t_d))

Harvested 94388 records in 499.207413 seconds


Create a naive top 10 list of the most common offences in the data

In [15]:
# Do a simple occurrence count of offences (without any data cleaning).
#
# Builds:
#   offences     - offence name -> number of records with that offence
#   offpjur      - jurisdiction -> offence name -> number of records
#   jurisdiction - set of every jurisdiction code seen
#   top10        - table rows: [offence, total, count per jurisdiction...]
_UNKNOWN_OFFENCE = '[Unknown Offence]'

offences = defaultdict(int)
offpjur = defaultdict(lambda: defaultdict(int))
jurisdiction = set()
for _rec in records:
    # The offence is taken from the second comma-separated field of the first
    # title. A title without a comma has no such field; count it as unknown
    # rather than letting one malformed record raise IndexError.
    _title_parts = _rec['title'][0].split(',')
    if len(_title_parts) > 1:
        _offence = _title_parts[1].strip().title()
    else:
        _offence = _UNKNOWN_OFFENCE
    # The jurisdiction code is the text after the last ':' of the first
    # identifier, up to the first '/'.
    _jurisdiction = _rec['identifier'][0].split(':')[-1].split('/')[0]
    jurisdiction.add(_jurisdiction)
    offences[_offence] += 1
    offpjur[_jurisdiction][_offence] += 1

# Remove unknown offences from the overall totals. pop() with a default is
# used because `del` raises KeyError when the key is absent: defaultdict only
# auto-creates entries on lookup, not on deletion.
offences.pop(_UNKNOWN_OFFENCE, None)

# fixed column order shared by the table rows and the header
_jurisdictions = sorted(jurisdiction)

# find the top 10 offences across all records and jurisdictions
# (sorted() is stable, so ties keep their first-seen order)
_ranked = sorted(offences.items(), reverse=True, key=lambda x: x[1])[:10]

# Add the counts for each jurisdiction; .get() avoids inserting zero-count
# entries into offpjur while we are only reading from it.
top10 = [
    [_of, _count] + [offpjur[_j].get(_of, 0) for _j in _jurisdictions]
    for _of, _count in _ranked
]

# make it look pretty: drop 'SC' from the jurisdiction codes for the headings
_headers = ['Offence', 'Total'] + [x.replace('SC', '') for x in _jurisdictions]
display(HTML(tabulate.tabulate(top10, tablefmt='html', headers=_headers)))

Offence,Total,NSW,NT,QLD,SA,TAS,VIC,WA
Larceny,11850,952,35,1695,2550,1827,3711,1080
Murder,3426,1005,84,494,190,208,831,614
Forgery,3195,572,4,351,341,457,1139,331
Horse Stealing,2765,689,6,833,169,180,743,145
Housebreaking,2633,8,12,52,345,456,1685,75
False Pretences,2589,216,7,592,396,224,899,255
Perjury,2221,265,11,166,101,330,1250,98
Burglary,1998,211,2,140,152,562,799,132
Manslaughter,1971,674,15,156,118,114,679,215
Stealing,1954,447,62,865,7,3,38,532
