# Install Packages

In [None]:
%matplotlib inline

In [None]:
!sudo -H pip install pymongo geocoder



In [None]:
from collections import defaultdict
from datetime import datetime
import functools
import geocoder
import io
import itertools
import json
import os
import pandas
import pickle
from pprint import PrettyPrinter
import pymongo
import re
import requests
import six

# Data Check

In [None]:
# MongoDB endpoint; both values are interpolated into the connection URI in the next cell.
# NOTE(review): the host is a hard-coded EC2 public DNS name — consider reading it from configuration.
mongodb_host = 'ec2-54-213-157-84.us-west-2.compute.amazonaws.com'
mongodb_port = '27017'  # kept as a string because it is only formatted into the URI

In [None]:
# Open a client using a mongodb:// URI built from the configured host and port.
conn=pymongo.MongoClient('mongodb://%s:%s' % (mongodb_host, mongodb_port))

# Check database names to test connection and see what's here
# NOTE(review): database_names() is deprecated in newer PyMongo releases in favour of
# list_database_names() — confirm the installed version before upgrading the driver.
conn.database_names()

[u'iati', u'local']

In [None]:
# Get a handle to the iati database
db = conn.iati

# Get handles to the collections used below, then list the collections that
# exist to check that manual renaming worked

activities=db.activities
activities_metadata=db.activities_metadata
transactions=db.transactions
organizations=db.organizations
organizations_metadata=db.organizations_metadata

db.collection_names()

[u'organizations',
 u'activities',
 u'cleaned_orgs_full',
 u'scores',
 u'activities_metadata',
 u'transactions']

In [None]:
# Document counts for the activities_metadata and activities collections.
print activities_metadata.count(), activities.count()

3562 579320


In [None]:
# Document count for the transactions collection.
print transactions.count()

2251178


In [None]:
# Document counts for the organizations_metadata and organizations collections.
print organizations_metadata.count(), organizations.count()

0 7131


# Trim to Searchable Graph Data

We'll now access the graph database via the REST API.

* http://neo4j-rest-client.readthedocs.io/en/latest/

## Add Organizations

In [None]:
short_names_by_ref = {}

In [None]:
def add_short_name(item):
    """Record the display name(s) of an organization under its ``@ref``.

    The names are read from ``item['#text']`` when present, otherwise from
    ``item['narrative']``, which may be a dict carrying ``#text``, a list of
    such dicts, or a bare value. Every name found is added to the set stored
    in the module-level ``short_names_by_ref`` dict under ``item['@ref']``.

    Items without ``@ref`` or without any usable name are ignored.

    Args:
        item: mapping describing one organization element.

    Returns:
        None. The result is the update of ``short_names_by_ref``.
    """
    global short_names_by_ref

    if '@ref' not in item:
        return

    short_names = []

    if '#text' in item:
        # Fall through to the bookkeeping below; returning here would drop
        # every name that is supplied directly as '#text'.
        short_names = [item['#text']]
    elif 'narrative' in item:
        narrative = item['narrative']

        if isinstance(narrative, dict):
            if '#text' in narrative:
                short_names = [narrative['#text']]
        elif isinstance(narrative, list):
            # Only dict entries can carry '#text'; anything else is skipped.
            short_names = [
                subnarrative['#text']
                for subnarrative in narrative
                if isinstance(subnarrative, dict) and '#text' in subnarrative
            ]
        elif narrative is not None:
            short_names = [narrative]

    if not short_names:
        return

    short_names_by_ref.setdefault(item['@ref'], set()).update(short_names)

In [None]:
# Start from an empty mapping so that re-running this cell never keeps stale names.
short_names_by_ref = {}

# Each organization document may carry one reporting-org (dict) or several (list).
for organization in organizations.find():
    if 'reporting-org' not in organization:
        continue

    reporting_org = organization['reporting-org']

    if type(reporting_org) == dict:
        add_short_name(reporting_org)
    elif type(reporting_org) == list:
        for item in reporting_org:
            add_short_name(item)

In [None]:
len(short_names_by_ref)

71

In [None]:
# Pickle output is a byte stream, so the file is opened in binary mode.
with open('short_names_by_ref.pickle', 'wb') as f:
    pickle.dump(short_names_by_ref, f)

In [None]:
!rm -f short_names.tar.gz
!tar -cf short_names.tar short_names_*.pickle
!gzip short_names.tar

In [None]:
short_names_by_ref

{u'1301': {u'Transparency International Bangladesh'},
 u'21033': {u'Transparency International'},
 u'41119': {u'United Nations Population Fund'},
 u'46002': {u'Groupe de le Banque Africaine de D\xe9veloppement'},
 u'47122': {u'The Global Alliance for Vaccination and Immunisation'},
 u'Amnesty International Nederland': {u'Amnesty International The Netherlands'},
 u'BE-10': {u'Belgian Development Cooperation'},
 u'BE-BCE_KBO-0264814354': {u'Agence Belge de d\xe9veloppement (CTB)',
  u'Belgisch ontwikkelingsagentschap (BTC)'},
 u'CA-3': {u'Affaires \xe9trang\xe8res, Commerce et D\xe9veloppement Canada',
  u'Foreign Affairs, Trade and Development Canada'},
 u'CH-4': {u'Swiss Agency for Development and Cooperation SDC'},
 u'DAC-1601': {u'Bill and Melinda Gates Foundation'},
 u'EIGEN': {u'PAX'},
 u'ES-DIR3-E04585801': {u'ES-DIR3-E04585801-009-001091'},
 u'GB-CHC-1075920': {u'Indigo Trust'},
 u'GB-CHC-1089490': {u'Hope and Homes for Children'},
 u'GB-CHC-1093861': {u'International Medical Cor

In [None]:
!aws s3 cp short_names.tar.gz s3://mdang.w210/ --acl public-read

upload: ./short_names.tar.gz to s3://mdang.w210/short_names.tar.gz


## Add Activities

## Utility Methods

Utility method that will extract text from a node.

In [None]:
def get_text(item):
    """Return the text carried by a node, or '' when there is none.

    Dicts are unwrapped repeatedly: ``#text`` is preferred, then a
    ``narrative`` value that is not a list. Whatever remains is returned only
    if it is a string; every other value (including a dict that could not be
    unwrapped, or a list narrative) yields the empty string.
    """
    node = item

    while type(node) == dict:
        if '#text' in node:
            node = node['#text']
        elif 'narrative' in node and type(node['narrative']) != list:
            node = node['narrative']
        else:
            # Nothing left to unwrap; the dict itself is not text.
            break

    return node if isinstance(node, six.string_types) else ''

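Utility method that removes an attribute from the XML-derived activity and returns its value as a list (a single value is wrapped in a one-element list; a missing attribute gives an empty list).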

In [None]:
def remove_list(activity, attribute):
    """Pop ``attribute`` from ``activity`` and return its value as a list.

    A missing attribute gives ``[]`` and leaves ``activity`` untouched. A
    value that is not already a list is wrapped in a one-element list.
    """
    if attribute not in activity:
        return []

    popped = activity.pop(attribute)

    return popped if type(popped) == list else [popped]

Utility method that removes a list from the XML-derived activity and replaces it with a list of the given attribute's non-empty values, one per item that has it. If no item has a value, the key stays removed.

In [None]:
def replace_list(activity, key, attribute):
    """Replace ``activity[key]`` with the ``attribute`` values of its items.

    The entries under ``key`` are removed via ``remove_list``. Entries that
    are ``None``, lack ``attribute`` or hold an empty string are skipped. The
    key is written back only when at least one value survives; otherwise it
    stays removed.
    """
    entries = remove_list(activity, key)

    if not entries:
        return

    extracted = []

    for entry in entries:
        if entry is None:
            continue
        if attribute not in entry:
            continue
        if entry[attribute] == '':
            continue
        extracted.append(entry[attribute])

    if extracted:
        activity[key] = extracted

Utility method that flattens a nested dictionary into a one-level dictionary.

In [None]:
# Based off of the following StackOverflow post for flattening dictionaries
# http://stackoverflow.com/questions/6027558/flatten-nested-python-dictionaries-compressing-keys

def flatten(d, parent_key=None):
    """Flatten a nested dict into a list of ``(key, value)`` pairs.

    Nested keys are joined to their parent with '_'. In the joined key the
    characters '@', '#' and ':' are dropped and '-' becomes '_'. Dict values
    are flattened recursively, list values are mapped through ``get_text``,
    and all other values are kept as they are.
    """
    substitutions = (('@', ''), ('#', ''), (':', ''), ('-', '_'))
    flattened = []

    for name, value in d.items():
        if parent_key is None:
            key = name
        else:
            key = parent_key + '_' + name

        for old, new in substitutions:
            key = key.replace(old, new)

        if type(value) == dict:
            flattened += flatten(value, key)
        elif type(value) == list:
            flattened.append((key, [get_text(element) for element in value]))
        else:
            flattened.append((key, value))

    return flattened

## Location

Utility methods for extracting location information.

In [None]:
def get_location(location):
    """Return a human-readable name for one location element, or None.

    When the element has a ``point``/``pos`` pair, the coordinates are parsed
    (a decimal comma is accepted) and reverse-geocoded with
    ``geocoder.google``. Geocoding is best effort: unparsable coordinates, a
    failed lookup or a lookup without an address all fall back to the text of
    the element's ``name``.

    Args:
        location: mapping for a single location element.

    Returns:
        The geocoded address, else the text of ``location['name']`` as
        produced by ``get_text``, else None when there is no ``name`` or
        ``location`` is not a dict.
    """
    if not isinstance(location, dict):
        return None

    point = location.get('point')
    pos = point.get('pos') if isinstance(point, dict) else None

    if pos is not None:
        try:
            # split() without an argument tolerates repeated whitespace.
            coordinates = [float(part.replace(',', '.')) for part in pos.split()]
        except (AttributeError, ValueError):
            # Malformed coordinates must not abort the run; use the name instead.
            coordinates = None

        if coordinates:
            try:
                address = geocoder.google(coordinates, method='reverse').address
            except Exception:
                # Deliberate best effort; unlike a bare except this still lets
                # KeyboardInterrupt and SystemExit through.
                address = None

            if address:
                return address

    if 'name' not in location:
        return None

    return get_text(location['name'])

In [None]:
def get_activity_location(activity):
    """Resolve every location of an activity to a name.

    Returns None when the activity has no ``location`` entry or the entry is
    None. Otherwise returns a list with the non-None results of
    ``get_location`` for each location (a single location is treated as a
    one-element list); the list may be empty.
    """
    location = activity.get('location')

    if location is None:
        return None

    candidates = location if type(location) == list else [location]

    resolved = []

    for candidate in candidates:
        name = get_location(candidate)
        if name is not None:
            resolved.append(name)

    return resolved

## Recipient Country

In [None]:
# Country codelist published with the IATI standard (code -> name).
country_url = 'http://iatistandard.org/202/codelists/downloads/clv3/json/en/Country.json'

# requests has no default timeout, so set one; raise_for_status() makes an HTTP
# error fail here instead of surfacing later as a confusing JSON/KeyError.
country_response = requests.get(country_url, timeout=30)
country_response.raise_for_status()
countries_json = country_response.json()

countries = { item['code']: item['name'] for item in countries_json['data'] }

In [None]:
def get_recipient_country(activity):
    """Return the country name for the activity's ``recipient-country``.

    The element's ``@code`` is looked up in the module-level ``countries``
    mapping.

    Args:
        activity: mapping for one activity.

    Returns:
        The country name, or None when the activity has no recipient country,
        the element is not a single dict (for example None or a list of
        several countries), it has no ``@code``, or the code is unknown.
    """
    country = activity.get('recipient-country')

    # Guard against None and other non-dict values, which previously raised
    # TypeError (None) or were silently unmatched (list, string).
    if not isinstance(country, dict):
        return None

    if '@code' not in country:
        return None

    code = country['@code']

    if code in countries:
        return countries[code]

    return None

## Sector

In [None]:
def get_sectors(activity):
    """Return the ``@code`` of every sector on the activity.

    A single sector dict is treated as a one-element list. Sectors without
    ``@code`` are skipped; an activity without ``sector`` gives ``[]``.
    """
    if 'sector' not in activity:
        return []

    raw = activity['sector']
    entries = [raw] if type(raw) == dict else raw

    codes = []

    for entry in entries:
        if '@code' in entry:
            codes.append(entry['@code'])

    return codes

## Organizations

In [None]:
def is_valid_ref_format(ref):
    """Return True when ``ref`` contains a '-' and no space, else False."""
    has_dash = ref.find('-') != -1
    has_space = ref.find(' ') != -1

    return has_dash and not has_space

In [None]:
def add_organization(organization):
    """Return the node for an organization, creating it when needed.

    Nodes live in the module-level ``organization_nodes`` dict, keyed by
    ``@ref``. An existing node is returned as is. A new node ``{'ref': ref}``
    is created only when the ref passes ``is_valid_ref_format``.

    Returns None when the organization has no ``@ref`` or when a new ref has
    an invalid format.
    """
    global organization_nodes

    if '@ref' not in organization:
        return None

    ref = organization['@ref']

    # Known refs are returned without re-validating their format.
    try:
        return organization_nodes[ref]
    except KeyError:
        pass

    if not is_valid_ref_format(ref):
        return None

    node = {'ref': ref}
    organization_nodes[ref] = node

    return node

## Activity Edges

In [None]:
# Top-level activity attributes that are kept on a graph node; every other
# attribute of the source document is dropped.
activity_fields = {
    'iati-identifier',
    'description',
    'reporting-org',
    'participating-org',
    'recipient-country',
    'location',
    'policy-marker',
    'sector',
}

In [None]:
def add_activity_node(activity):
    """Turn one activity document into a graph node plus its relationships.

    The activity is reduced to the attributes in ``activity_fields``, its
    sector, policy-marker, recipient-country, location and description
    attributes are normalised, and the result is stored in the module-level
    ``activity_nodes`` dict under its ``iati-identifier``. The edges produced
    by ``add_activity_relationship`` for every (reporter, participant) pair
    are stored in ``activity_relationships`` under the same key.

    Activities without ``iati-identifier`` and activities whose identifier is
    already stored are ignored.

    Args:
        activity: mapping for one activity; it is not modified.

    Raises:
        Any exception raised while building the relationships, re-raised
        unchanged after the activity has been printed for debugging.
    """
    global activity_fields, activity_nodes

    if 'iati-identifier' not in activity:
        return

    # Work on a trimmed copy so the caller's document stays untouched.
    activity = {
        key: value for key, value in activity.items()
        if key in activity_fields
    }

    activity_key = activity['iati-identifier']

    if activity_key in activity_nodes:
        return

    reporters = remove_list(activity, 'reporting-org')
    participants = remove_list(activity, 'participating-org')

    # NOTE(review): 'activity-date' is not in activity_fields, so it has
    # already been filtered out and this call does nothing — confirm whether
    # the dates were meant to be kept.
    replace_list(activity, 'activity-date', '@iso-date')
    replace_list(activity, 'sector', '@code')
    replace_list(activity, 'policy-marker', '@code')

    recipient_country = get_recipient_country(activity)

    if recipient_country is None:
        activity.pop('recipient-country', None)
    else:
        activity['recipient-country'] = recipient_country

    location = get_activity_location(activity)

    if location is None:
        activity.pop('location', None)
    else:
        activity['location'] = location

    if 'description' in activity:
        description_text = get_text(activity['description']).strip().lower()

        # Bag of lower-case alphabetic tokens; the text is already normalised.
        activity['description'] = set(
            token for token in re.split('[^a-z]+', description_text) if token
        )
        activity['description_raw'] = description_text
    else:
        activity['description'] = set()
        activity['description_raw'] = ''

    try:
        relationships = []

        activity_nodes[activity_key] = activity

        # NOTE(review): with no participating-org the product is empty, so the
        # reporter -> reporter edge is not created either — confirm intent.
        for reporter, participant in itertools.product(reporters, participants):
            add_activity_relationship(activity_key, relationships, reporter, reporter)
            add_activity_relationship(activity_key, relationships, reporter, participant)

        activity_relationships[activity_key] = relationships
    except Exception:
        # 'description' is a set, which json cannot encode on its own; without
        # a fallback the dump itself raises and hides the real error.
        print(json.dumps(
            activity,
            indent=2,
            default=lambda value: list(value) if isinstance(value, (set, frozenset)) else repr(value)
        ))
        # Bare raise keeps the original traceback.
        raise

In [None]:
def add_activity_relationship(activity_key, relationships, reporter, participant):
    """Append a reporter -> participant edge for an activity, once.

    Both organizations are resolved through ``add_organization``; if either
    resolves to None nothing is recorded. Otherwise the reporter is entered
    in the module-level ``publisher_nodes`` dict (if not already there) and an
    edge dict with ``start``, ``activity`` and ``end`` is appended to
    ``relationships`` unless an edge with the same start and end exists.
    """
    global publisher_nodes

    # Resolve both sides before deciding whether the pair is usable.
    source_node = add_organization(reporter)
    target_node = add_organization(participant)

    if source_node is None or target_node is None:
        return

    start_ref = source_node['ref']
    end_ref = target_node['ref']

    publisher_nodes.setdefault(start_ref, source_node)

    already_linked = any(
        edge['start'] == start_ref and edge['end'] == end_ref
        for edge in relationships
    )

    if already_linked:
        return

    relationships.append({
        'start': start_ref,
        'activity': activity_key,
        'end': end_ref
    })

## Load Activities and Organizations

In [None]:
# In-memory stores filled while the activities are processed: publishers and
# organizations keyed by ref, activity nodes and edge lists keyed by
# iati-identifier.
publisher_nodes = {}
organization_nodes = {}
activity_nodes = {}
activity_relationships = {}

# Total used only for the progress messages below.
activities_count = activities.count()

print datetime.now(), 'Started processing'

# The third argument (0) requests an unbuffered file so progress lines reach
# graph_fill.txt immediately (Python 2 behaviour for text-mode files).
with open('graph_fill.txt', 'w', 0) as debug_file:
    for num, activity in enumerate(activities.find()):
        # Report progress every 10000 activities, to the notebook and the log file.
        if num % 10000 == 0:
            print datetime.now(), 'Processed', num, 'of', activities_count
            print >> debug_file, datetime.now(), 'Processed', num, 'of', activities_count

        add_activity_node(activity)

    print datetime.now(), 'Finished processing'
    print >> debug_file, datetime.now(), 'Finished processing'

2016-08-14 01:38:24.105153 Started processing
2016-08-14 01:38:24.113274 Processed 0 of 579320
2016-08-14 01:39:39.455916 Processed 10000 of 579320
2016-08-14 01:40:36.756065 Processed 20000 of 579320
2016-08-14 01:42:22.350465 Processed 30000 of 579320
2016-08-14 01:43:31.377944 Processed 40000 of 579320
2016-08-14 01:45:00.447408 Processed 50000 of 579320
2016-08-14 01:47:37.608630 Processed 60000 of 579320
2016-08-14 01:49:07.913076 Processed 70000 of 579320
2016-08-14 01:50:01.977034 Processed 80000 of 579320
2016-08-14 01:51:29.645964 Processed 90000 of 579320
2016-08-14 01:52:45.046003 Processed 100000 of 579320
2016-08-14 01:54:17.855444 Processed 110000 of 579320
2016-08-14 01:56:08.393541 Processed 120000 of 579320
2016-08-14 01:59:15.942951 Processed 130000 of 579320
2016-08-14 02:00:02.089409 Processed 140000 of 579320
2016-08-14 02:03:03.841892 Processed 150000 of 579320
2016-08-14 02:04:18.308357 Processed 160000 of 579320
2016-08-14 02:06:00.875998 Processed 170000 of 579

In [None]:
len(publisher_nodes)

310

In [None]:
len(organization_nodes)

10500

In [None]:
len(activity_nodes)

559502

In [None]:
len(activity_relationships)

559502

## Confirm the Keys in Our Nodes

In [None]:
# Union of every key that appears on any activity node.
node_keys = set()

for node in activity_nodes.itervalues():
    node_keys.update(node.keys())

In [None]:
node_keys

{u'description',
 'description_raw',
 u'iati-identifier',
 u'location',
 'policy-marker',
 u'recipient-country',
 'sector'}

## Store as Pickled Files

In [None]:
# Pickle output is a byte stream, so the file is opened in binary mode.
with open('graph_publisher_nodes.pickle', 'wb') as f:
    pickle.dump(publisher_nodes, f)

In [None]:
# Pickle output is a byte stream, so the file is opened in binary mode.
with open('graph_organization_nodes.pickle', 'wb') as f:
    pickle.dump(organization_nodes, f)

In [None]:
# Pickle output is a byte stream, so the file is opened in binary mode.
with open('graph_activity_nodes.pickle', 'wb') as f:
    pickle.dump(activity_nodes, f)

In [None]:
# Pickle output is a byte stream, so the file is opened in binary mode.
with open('graph_activity_relationships.pickle', 'wb') as f:
    pickle.dump(activity_relationships, f)

In [None]:
!rm graph_pickle.tar.gz
!tar -cf graph_pickle.tar graph_*.pickle
!gzip graph_pickle.tar

rm: cannot remove ‘graph_pickle.tar.gz’: No such file or directory


In [None]:
!aws s3 cp graph_pickle.tar.gz s3://mdang.w210/ --acl public-read

upload: ./graph_pickle.tar.gz to s3://mdang.w210/graph_pickle.tar.gz
