# Install Packages

In [None]:
%matplotlib inline

In [None]:
!sudo -H pip install pymongo neo4jrestclient



In [None]:
from collections import defaultdict
from datetime import datetime
import functools
import io
import itertools
import json
from neo4jrestclient.client import GraphDatabase, Node, Path
import os
import pandas
import pickle
import pymongo
import re

# Data Check

In [None]:
# Address of the MongoDB server queried by this notebook.
# NOTE(review): the host is hard-coded; consider reading it from configuration instead.
mongodb_host = 'ec2-54-213-2-79.us-west-2.compute.amazonaws.com'
# Kept as a string because it is only interpolated into the connection URI.
mongodb_port = '27017'

In [None]:
# Build a mongodb:// URI from the host and port configured above and connect to it.
conn=pymongo.MongoClient('mongodb://%s:%s' % (mongodb_host, mongodb_port))

# Check database names to test connection and see what's here
# NOTE(review): newer PyMongo releases replace database_names() with
# list_database_names() - confirm against the installed version before upgrading.
conn.database_names()

[u'iati', u'local']

In [None]:
# Handle to the iati database on the connected server
db = conn.iati

# Handles to the collections used in the rest of the notebook

activities=db.activities
activities_metadata=db.activities_metadata
transactions=db.transactions
organizations=db.organizations
organizations_metadata=db.organizations_metadata

# List the collections that actually exist, to check that the manual renaming worked
db.collection_names()

[u'organizations',
 u'activities',
 u'activities_metadata',
 u'transactions',
 u'organizations_metadata']

In [None]:
# Document counts: activities_metadata first, then activities.
print('%s %s' % (activities_metadata.count(), activities.count()))

3562 579320


In [None]:
# Document count of the transactions collection.
print(transactions.count())

1775777


In [None]:
# Document counts: organizations_metadata first, then organizations.
print('%s %s' % (organizations_metadata.count(), organizations.count()))

203 7131


# Insert Into Neo4J

We'll now access the graph database via the REST API.

* http://neo4j-rest-client.readthedocs.io/en/latest/

In [None]:
# Address of the Neo4j server reached through its REST API.
# NOTE(review): the host is hard-coded; consider reading it from configuration instead.
neo4j_host = 'ec2-54-213-2-79.us-west-2.compute.amazonaws.com'
# Kept as a string because it is only interpolated into the REST URL.
neo4j_port = '7474'

In [None]:
# REST client for the graph database, pointed at the /db/data/ endpoint of the server above.
gdb = GraphDatabase('http://%s:%s/db/data/' % (neo4j_host, neo4j_port))

## Add Organizations

In [None]:
# Show one organization document to see which fields are available.
organizations.find_one()

{u'@default-currency': u'USD',
 u'@generated-datetime': u'2016-07-06T01:02:22.395',
 u'@last-updated-datetime': u'2013-11-09T13:02:41.000',
 u'@version': u'1.03',
 u'@w210-key': u'globalgiving-org',
 u'@xml:lang': u'en',
 u'_id': ObjectId('5785d992ea05914e24d33b29'),
 u'document-link': [{u'@format': u'text/html',
   u'@url': u'https://www.globalgiving.org/donate/1284/little-friends-for-peace-inc/',
   u'title': u"Organisation's GlobalGiving Page"},
  {u'@format': u'text/html',
   u'@url': u'http://www.lffp.org',
   u'title': u'Organisation Website'}],
 u'iati-identifier': u'US-GG-1284',
 u'name': u'Little Friends for Peace Inc',
 u'reporting-org': {u'#text': u'GlobalGiving',
  u'@ref': u'US-EIN-300108263',
  u'@type': u'21'}}

## Add Activities

In [None]:
# Label objects used further down to create Organization and Activity nodes.
label_organizations = gdb.labels.create('Organization')
label_activities = gdb.labels.create('Activity')

In [None]:
def remove_list(activity, attribute):
    """Remove ``attribute`` from ``activity`` and return its value as a list.

    The key is deleted from ``activity`` as a side effect. A value that is
    already a list is returned as-is; any other value is wrapped in a
    one-element list. A missing key yields an empty list and leaves
    ``activity`` untouched.
    """
    if attribute in activity:
        popped = activity.pop(attribute)
        return popped if type(popped) is list else [popped]

    return []

In [None]:
# Lookup tables for nodes created so far: activity_nodes is keyed by the activity's
# iati-identifier, organization_nodes by the organization's @ref.
activity_nodes = {}
organization_nodes = {}

In [None]:
def add_organization(organization):
    """Return the graph node for ``organization``, creating it on first use.

    Organizations are identified by their ``'@ref'`` value. A node is created
    through ``label_organizations`` with that value as its ``name`` the first
    time a reference is seen and is remembered in ``organization_nodes``;
    later calls with the same reference return the remembered node.

    Returns ``None`` when ``organization`` has no ``'@ref'`` entry.
    """
    if '@ref' not in organization:
        return None

    ref = organization['@ref']

    # Reuse the node made earlier for this reference, if any.
    if ref in organization_nodes:
        return organization_nodes[ref]

    node = label_organizations.create(name = ref)
    organization_nodes[ref] = node

    return node

In [None]:
def add_activity_relationship(activity_node, relationships, reporter, participant):
    """Link a reporting and a participating organization to an activity node.

    Creates, when not already present in ``relationships``:

    * ``reporter -[Reported]-> activity`` (only when the reporter and the
      participant are different organizations), and
    * ``activity -[Has_Participant]-> participant``.

    The reporter node is also given the ``Publisher`` label.

    Parameters:
        activity_node: graph node of the activity.
        relationships: list of relationships already created for this
            activity; every relationship created here is appended to it so
            that later calls for the same activity can skip duplicates.
        reporter: organization dict of the reporting organization.
        participant: organization dict of the participating organization.

    Nothing is created when either organization cannot be resolved to a node
    (``add_organization`` returned ``None``).
    """
    reporter_node = add_organization(reporter)
    participant_node = add_organization(participant)

    if reporter_node is None or participant_node is None:
        return

    if 'Publisher' not in reporter_node.labels:
        reporter_node.labels.add('Publisher')

    def is_linked(start_node, end_node):
        # True when a relationship start_node -> end_node was already recorded.
        return any(
            relationship.start == start_node and relationship.end == end_node
            for relationship in relationships
        )

    # The previous check compared the reporter with the start of the recorded
    # Has_Participant relationships, which always start at the activity node,
    # so it never matched and duplicates were created. Each edge is now
    # checked against its own endpoints.
    if reporter_node != participant_node and not is_linked(reporter_node, activity_node):
        relationships.append(
            reporter_node.relationships.create('Reported', activity_node))

    if not is_linked(activity_node, participant_node):
        relationships.append(
            activity_node.relationships.create('Has_Participant', participant_node))

In [None]:
def get_text(item):
    """Return the text carried by ``item``.

    Dictionaries are unwrapped through their ``'#text'`` entry, repeatedly if
    that entry is itself a dictionary. A dictionary without ``'#text'`` yields
    an empty string. Any non-dictionary value is returned unchanged.
    """
    while type(item) == dict:
        if '#text' not in item:
            return ''
        item = item['#text']

    return item

In [None]:
# Dictionary flattening adapted from this StackOverflow post:
# http://stackoverflow.com/questions/6027558/flatten-nested-python-dictionaries-compressing-keys

def flatten(d, parent_key=None):
    """Flatten a nested dictionary into a list of ``(key, value)`` pairs.

    Nested keys are joined to their parent key with ``'_'``. In the resulting
    key every ``'@'``, ``'#'`` and ``':'`` is removed and every ``'-'`` becomes
    ``'_'``. Dictionary values are flattened recursively, list values are kept
    as one list with each element passed through ``get_text``, and all other
    values are kept as they are.
    """
    pairs = []

    for key, value in d.items():
        if parent_key is None:
            full_key = key
        else:
            full_key = parent_key + '_' + key

        # Normalise the characters that come from the XML-style field names.
        for old, new in (('@', ''), ('#', ''), (':', ''), ('-', '_')):
            full_key = full_key.replace(old, new)

        if type(value) == dict:
            pairs += flatten(value, full_key)
        elif type(value) == list:
            pairs.append((full_key, [get_text(element) for element in value]))
        else:
            pairs.append((full_key, value))

    return pairs

In [None]:
# Top-level activity keys that add_activity_node keeps; every other key of an
# activity document is dropped before the node is built.
# NOTE(review): 'activity-date' is not listed here, so the 'activity-date'
# handling in add_activity_node never sees that key - confirm whether that is intended.
activity_fields = set([
    'iati-identifier',
    'reporting-org',
    'participating-org',
    'recipient-country',
    'recipient-region',
    'location',
    'policy-marker',
    'sector'
])

In [None]:
def replace_list(activity, key, attribute):
    """Replace ``activity[key]`` with the list of its entries' ``attribute`` values.

    The value stored under ``key`` (a single entry or a list of entries) is
    removed from ``activity``. The ``attribute`` value of every dictionary
    entry that has a non-empty one is collected, and the resulting list is
    stored back under ``key``. Entries that are not dictionaries (``None``,
    plain strings, ...) are skipped.

    When ``key`` is absent nothing happens; when no entry provides a usable
    value the key stays removed.
    """
    values = [
        item[attribute]
        for item in remove_list(activity, key)
        # Only dictionaries can carry the attribute. A plain string would pass
        # an ``attribute in item`` substring test and then fail on indexing.
        if isinstance(item, dict) and item.get(attribute, '') != ''
    ]

    if values:
        activity[key] = values

In [None]:
def get_sectors(activity):
    """Return the ``'@code'`` values of the activity's sector entries.

    ``activity['sector']`` may be a single entry or a list of entries. Only
    dictionary entries that have an ``'@code'`` key contribute; anything else
    (``None``, plain strings, dictionaries without a code) is skipped, in line
    with how ``replace_list`` treats such entries. An activity without a
    ``'sector'`` key yields an empty list.
    """
    sectors = activity.get('sector', [])

    # A lone entry is treated like a one-element list.
    if not isinstance(sectors, list):
        sectors = [sectors]

    return [
        sector['@code']
        for sector in sectors
        if isinstance(sector, dict) and '@code' in sector
    ]

In [None]:
def add_activity_node(activity):
    """Create the graph node for one activity document and link its organizations.

    Only the keys listed in ``activity_fields`` are used. The activity's
    ``'iati-identifier'`` becomes the node ``name``; the remaining kept fields
    are flattened into node properties. Each reporting organization is then
    linked to the activity together with each participating organization via
    ``add_activity_relationship``.

    Documents without an ``'iati-identifier'`` and identifiers that already
    have a node in ``activity_nodes`` are skipped. The created node is
    recorded in ``activity_nodes``.

    Raises:
        Any exception raised while creating the node or its relationships is
        re-raised after the node properties have been printed for debugging.
    """
    if 'iati-identifier' not in activity:
        return

    # Work on a filtered copy so the caller's document is left untouched.
    activity = {
        key : value for key, value in activity.items()
            if key in activity_fields
    }

    activity_key = activity.pop('iati-identifier')

    if activity_key in activity_nodes:
        return

    reporters = remove_list(activity, 'reporting-org')
    participants = remove_list(activity, 'participating-org')

    # NOTE(review): 'activity-date' is not in activity_fields, so it has already
    # been filtered out and this first call currently has nothing to replace.
    replace_list(activity, 'activity-date', '@iso-date')
    replace_list(activity, 'sector', '@code')
    replace_list(activity, 'policy-marker', '@code')

    activity = dict(flatten(activity))

    try:
        relationships = []

        activity_node = label_activities.create(name = activity_key, **activity)
        activity_nodes[activity_key] = activity_node

        for reporter, participant in itertools.product(reporters, participants):
            # The reporter is first linked to the activity as a participant
            # itself, then together with the actual participant.
            add_activity_relationship(activity_node, relationships, reporter, reporter)
            add_activity_relationship(activity_node, relationships, reporter, participant)
    except Exception:
        # default=str keeps the debug dump from failing on values that JSON
        # cannot serialise, which would otherwise hide the original error.
        print(json.dumps(activity, indent=2, default=str))
        # A bare raise keeps the original traceback.
        raise

In [None]:
# Start from empty lookup tables so they match the graph, which is emptied below.
activity_nodes = {}
organization_nodes = {}

# Total number of activity documents; only used in the progress messages.
activities_count = activities.count()

print datetime.now(), 'Clearing existing graph'

# Delete every node together with any relationships attached to it.
query = "MATCH (n) OPTIONAL MATCH (n)-[r]-() DELETE n,r"
result = gdb.query(q=query)

print datetime.now(), 'Started processing'

# Progress is written both to the cell output and to graph_fill.txt. The file is
# opened with a buffering argument of 0 (unbuffered in Python 2).
with open('graph_fill.txt', 'w', 0) as debug_file:
    for num, activity in enumerate(activities.find()):
        # Report progress once every 10000 activities.
        if num % 10000 == 0:
            print datetime.now(), 'Processed', num, 'of', activities_count
            print >> debug_file, datetime.now(), 'Processed', num, 'of', activities_count

        add_activity_node(activity)

    print datetime.now(), 'Finished processing'
    print >> debug_file, datetime.now(), 'Finished processing'

2016-08-01 05:18:50.477990 Clearing existing graph
2016-08-01 05:19:01.746346 Started processing
2016-08-01 05:19:01.754366 Processed 0 of 579320
2016-08-01 05:29:10.136493 Processed 10000 of 579320
2016-08-01 05:38:19.753823 Processed 20000 of 579320
2016-08-01 05:45:16.521487 Processed 30000 of 579320
2016-08-01 05:52:45.864003 Processed 40000 of 579320
2016-08-01 06:00:17.804106 Processed 50000 of 579320
2016-08-01 06:09:16.443076 Processed 60000 of 579320
2016-08-01 06:17:40.131950 Processed 70000 of 579320


## Add Indexes

In [None]:
# Index the organization name and the activity properties that the query below filters on.
# NOTE(review): add_activity_node stores sector and policy_marker as lists - confirm
# that indexes on list-valued properties actually help the predicates used in the query.
result = gdb.query('CREATE INDEX ON :Organization(name)')
result = gdb.query('CREATE INDEX ON :Activity(sector)')
result = gdb.query('CREATE INDEX ON :Activity(policy_marker)')
result = gdb.query('CREATE INDEX ON :Activity(location_description)')

## Run Queries

In [None]:
# Organization pairs (reporter n1, participant n2) connected through an activity
# whose location description starts with 'Ghana' and which carries at least one
# of the listed policy markers and at least one of the listed sector codes.
# Both policy_marker and sector are stored as lists on Activity nodes, so each
# is matched element-wise with ANY(...).
query = '''
MATCH (n1)-[r1:Reported]-(a)-[r2:Has_Participant]-(n2)
WHERE a.location_description STARTS WITH 'Ghana' AND
(
    EXISTS(a.policy_marker) AND
    ANY(m in a.policy_marker WHERE m IN ['02','05','06','07','08'])
)
AND
(
    EXISTS(a.sector) AND
    ANY(s in a.sector WHERE s IN ['14010','14015','14020','14021','14022','14030','14031','14032','14040','14081'])
)
RETURN n1,n2
'''

In [None]:
# Run the query; both returned columns (n1, n2) are requested as Node objects.
results = gdb.query(query, returns=[Node,Node])

In [None]:
# Display the rows attribute of the query result.
results.rows

In [None]:
# Print each result row on its own line.
for result in results:
    print(result)

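This is the example query used in the slides. Note that it filters on properties of the `Reported` relationship (`r`), whereas the graph built in this notebook stores those properties on the `Activity` nodes, so as written it will not match the graph created above: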

    MATCH p=()-[r:Reported]->()
    WHERE r.location_description STARTS WITH 'Ghana' AND
    (
        EXISTS(r.policy_marker) AND
        ANY(m in r.policy_marker WHERE m IN ['02','05','06','07','08'])
    )
    AND
    (
        r.sector_code IN ['14010','14015','14020','14021','14022','14030','14031','14032','14040','14081']
    )
    RETURN p