# Install Packages

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Install the third-party packages this notebook imports, using pip under sudo.
!sudo -H pip install elasticsearch numpy pandas requests xmltodict

Collecting elasticsearch
  Downloading elasticsearch-2.3.0-py2.py3-none-any.whl (51kB)
[K    100% |████████████████████████████████| 61kB 3.0MB/s
Collecting xmltodict
  Downloading xmltodict-0.10.2.tar.gz
Collecting urllib3<2.0,>=1.8 (from elasticsearch)
  Downloading urllib3-1.16-py2.py3-none-any.whl (98kB)
[K    100% |████████████████████████████████| 102kB 5.9MB/s
Building wheels for collected packages: xmltodict
  Running setup.py bdist_wheel for xmltodict ... [?25l- done
[?25h  Stored in directory: /root/.cache/pip/wheels/2a/dc/70/da8958d7089d994c8614bc38210f64855f09615e85707bf615
Successfully built xmltodict
Installing collected packages: urllib3, elasticsearch, xmltodict
  Found existing installation: urllib3 1.7.1
[33m    DEPRECATION: Uninstalling a distutils installed project (urllib3) has been deprecated and will be removed in a future version. This is due to the fact that uninstalling a distutils project will only partially uninstall the project.[0m
    Uninstalling 

In [None]:
import codecs
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
import json
import numpy as np
import os
import pandas as pd
import requests
import xml
import xmltodict

# Reload the Metadata

In [None]:
# If we don't have any of the cached data files, download them from S3

# The per-data-set XML files are read from ./iati later on; fetch and unpack
# the archive only when that directory is missing.
if not os.path.isdir('iati'):
    !aws s3 --region us-west-2 cp s3://mdang.w210/iati.tar.gz .
    !tar -zxf iati.tar.gz

# Likewise, fetch and decompress metadata.json only when it is missing.
# NOTE(review): unlike the command above, this one passes no --region flag;
# confirm that is intentional.
if not os.path.isfile('metadata.json'):
    !aws s3 cp s3://mdang.w210/metadata.json.gz .
    !gunzip metadata.json.gz

download: s3://mdang.w210/iati.tar.gz to ./iati.tar.gz
download: s3://mdang.w210/metadata.json.gz to ./metadata.json.gz


In [None]:
# Read the cached registry metadata back from its UTF-8 encoded JSON file.
with codecs.open('metadata.json', mode='r', encoding='utf-8') as metadata_file:
    metadata = json.loads(metadata_file.read())

In [None]:
# Number of entries in the loaded metadata; later cells refer to it as `count`.
count = len(metadata)
count

4126

# Choose a Random Data Set

In [None]:
# Choose a random data set to explore

random_metadata = None

while random_metadata is None:
    index = np.random.randint(count)
    random_metadata = metadata[index]

    if not random_metadata['isopen']:
        random_metadata = None

random_title = random_metadata['title']
random_short_name = random_metadata['name']

random_iati_filename = 'iati/' + random_short_name + '.xml'

print random_title
print 'http://iatiregistry.org/dataset/%s' % random_short_name

European Union Activity File - Nepal
http://iatiregistry.org/dataset/ec-devco-np


In [None]:
# Show the path of the XML file for the chosen data set.
random_iati_filename

u'iati/ec-devco-np.xml'

# Explore the Random Data Set

In [None]:
# Find out how many activities we have

with open(random_iati_filename, 'rb') as random_iati_file:
    activities_xml = xmltodict.parse(random_iati_file, xml_attribs=True)
    activities = activities_xml['iati-activities']['iati-activity']

print len(activities)

258


In [None]:
# Print out one of the activities

print json.dumps(activities[0], indent = 2)

{
  "@default-currency": "EUR",
  "@last-updated-datetime": "2016-04-11T15:10:44",
  "@version": "1.04",
  "@xml:lang": "en",
  "@hierarchy": "1",
  "reporting-org": {
    "@ref": "XI-IATI-EC_DEVCO",
    "@type": "15",
    "#text": "European Commission - Development and Cooperation-EuropeAid"
  },
  "iati-identifier": "EU-1-1998/010-887",
  "title": "COLLEGE PHILOSOPHIQUE DE SHECHEN (SHEDRA) - NEPAL",
  "description": {
    "@type": "1",
    "#text": "Mise en place d'un college destine a l'enseignement de hautniveau des sciences traditionnelles tibetaines,Boudanath,Katmandou"
  },
  "activity-status": {
    "@code": "3",
    "#text": "Completion"
  },
  "activity-date": [
    {
      "@iso-date": "1998-07-01",
      "@type": "start-planned"
    },
    {
      "@iso-date": "1998-07-01",
      "@type": "start-actual"
    },
    {
      "@iso-date": "2003-06-11",
      "@type": "end-planned"
    },
    {
      "@iso-date": "2003-06-11",
      "@type": "end-actual"
    }
  ],
  "contact-in

# Add Random Data Set to Elasticsearch

The cells in this section are based on the Elasticsearch DSL Python tutorial:

* http://elasticsearch-dsl.readthedocs.io/en/latest/

In [None]:
# Create an Elasticsearch connection
#
# Only the low-level client imported at the top of the notebook is used; the
# node is named explicitly instead of relying on the client's default host.

es = Elasticsearch(hosts = ['localhost'])

In [None]:
for activity in activities:
    document_id = '%s.%s' % (random_short_name, activity['iati-identifier'])
    print document_id

    try:
        es.index(index = 'iati-activities', doc_type = 'iatiactivity', id = document_id, body = activity)
    except:
        print 'Failed to add'

ec-devco-np.EU-1-1998/010-887
ec-devco-np.EU-1-2000/2589/0
Failed to add directly
ec-devco-np.EU-1-2004/064-096
ec-devco-np.EU-1-2004/16751/0
Failed to add directly
ec-devco-np.EU-1-2005/094-675
ec-devco-np.EU-1-2005/095-157
ec-devco-np.EU-1-2005/095-462
ec-devco-np.EU-1-2005/103-144
ec-devco-np.EU-1-2005/110-999
ec-devco-np.EU-1-2005/112-787
ec-devco-np.EU-1-2005/112-979
ec-devco-np.EU-1-2005/17237/0
ec-devco-np.EU-1-2005/17635/0
Failed to add directly
ec-devco-np.EU-1-2006/118-620
ec-devco-np.EU-1-2006/118-644
ec-devco-np.EU-1-2006/118-883
ec-devco-np.EU-1-2006/119-337
ec-devco-np.EU-1-2006/119-627
ec-devco-np.EU-1-2006/126-070
ec-devco-np.EU-1-2006/126-110
ec-devco-np.EU-1-2006/126-183
ec-devco-np.EU-1-2006/129-635
ec-devco-np.EU-1-2006/18408/0
Failed to add directly
ec-devco-np.EU-1-2006/18441/0
Failed to add directly
ec-devco-np.EU-1-2007/133-412
ec-devco-np.EU-1-2007/133-415
ec-devco-np.EU-1-2007/133-430
ec-devco-np.EU-1-2007/133-443
ec-devco-np.EU-1-2007/133-463
ec-devco-np.EU-1

Now we can check Elasticsearch and see what the index looks like.

In [None]:
!echo http://$(curl -s http://169.254.169.254/latest/meta-data/public-hostname):5601/

http://ec2-54-149-233-245.us-west-2.compute.amazonaws.com:5601/


As an example of the EDA that is possible with Elasticsearch and Kibana, you can inspect the values of the description field as follows:

1. Navigate to the Discover tab.
2. Select the gear icon to change the Available Fields to only include indexed fields (for visualization)
3. As an example, check the description.#text field

# Add All Data to Elasticsearch

In [None]:
# Choose a random data set to explore

success = 0
failure = 0
processed = 0

for metadata_item in metadata:
    title = metadata_item['title']
    short_name = metadata_item['name']

    # Progress update every 100 XML files so that we know the process
    # is actually still processing its 4000 files.

    processed += 1

    if processed % 100 == 0:
        print 'Processed', processed, 'of', count, 'files'

    iati_filename = 'iati/' + short_name + '.xml'

    # If the data set is not open or we have other errors, we won't
    # have a data file to work with.

    if not os.path.isfile(iati_filename):
        continue

    # Extract the activities from the XML as a dictionary

    activities = []

    had_xml_error = False

    with open(iati_filename, 'rb') as iati_file:
        try:
            activities_xml = xmltodict.parse(iati_file, xml_attribs=True)

            root_item = activities_xml['iati-activities']

            if 'iati-activity' in root_item:
                activities = root_item['iati-activity']
        except:
            had_xml_error = True

    # Build up the set of bulk actions for this specific XML file

    bulk_actions = []

    for activity in activities:
        try:
            document_id = '%s.%s' % (random_short_name, activity['iati-identifier'])
            bulk_actions.append({
                '_index': 'iati-activities',
                '_type': 'iatiactivity',
                '_id': document_id,
                'doc': activity
            })
        except:
            had_xml_error = True

    # Insert the documents in bulk and report if we receive any errors

    result = bulk(es, bulk_actions, stats_only = True, raise_on_error = False)

    if had_xml_error:
        print 'XML handling error processing', short_name

    if result[1] != 0:
        print result[1], 'elasticsearch errors processing',  short_name


1856 elasticsearch errors processing sdc_ch-140402
2097 elasticsearch errors processing sdc_ch-150410
3446 elasticsearch errors processing sdc_ch-160408
68 elasticsearch errors processing ia_nam-o100
XML handling error processing acord-cd
XML handling error processing acord-tz
208 elasticsearch errors processing ia_nam-o29
2 elasticsearch errors processing addinternational-289
4 elasticsearch errors processing addinternational-298
2 elasticsearch errors processing addinternational-798
XML handling error processing addinternational-998
6 elasticsearch errors processing addinternational-activities
2 elasticsearch errors processing addinternational-bd
2 elasticsearch errors processing addinternational-bf
4 elasticsearch errors processing addinternational-gb
XML handling error processing addinternational-in
XML handling error processing addinternational-kh
XML handling error processing addinternational-sd
XML handling error processing addinternational-tz
XML handling error processing addin