# Install Packages

In [None]:
%matplotlib inline

In [None]:
!sudo -H pip install pymongo



In [None]:
from collections import defaultdict
from datetime import datetime
import functools
import io
import itertools
import json
import os
import pickle
from pprint import PrettyPrinter
import re

import pandas
import pymongo
from scipy.sparse import dok_matrix

In [None]:
printer = PrettyPrinter()

# Data Check

In [None]:
mongodb_host = 'ec2-54-213-2-79.us-west-2.compute.amazonaws.com'
mongodb_port = '27017'

In [None]:
conn=pymongo.MongoClient('mongodb://%s:%s' % (mongodb_host, mongodb_port))

# Check database names to test connection and see what's here
conn.database_names()

[u'iati', u'local']

In [None]:
# Get a handle to the iati database
db = conn.iati

# Get handles to the activities collections, then list every collection
# in the database to check that the manual renaming worked

activities=db.activities
activities_metadata=db.activities_metadata

db.collection_names()

[u'organizations',
 u'activities',
 u'activities_metadata',
 u'transactions',
 u'organizations_metadata']

In [None]:
print activities_metadata.count(), activities.count()

3562 579320


# Cache in Memory

## Cache Metadata

In [None]:
all_metadata = {}
metadata_count = activities_metadata.count()

print datetime.now(), 'Started processing'

for num, metadata_item in enumerate(activities_metadata.find()):
    if num % 1000 == 0:
        print datetime.now(), 'Processed', num, 'of', metadata_count

    all_metadata[metadata_item['@w210-key']] = metadata_item

print datetime.now(), 'Finished processing'

2016-07-28 06:57:57.528750 Started processing
2016-07-28 06:57:57.537295 Processed 0 of 3562
2016-07-28 06:57:57.632291 Processed 1000 of 3562
2016-07-28 06:57:57.755748 Processed 2000 of 3562
2016-07-28 06:57:57.800810 Processed 3000 of 3562
2016-07-28 06:57:57.802248 Finished processing


In [None]:
len(all_metadata)

3562

## Store Names from Metadata

In [None]:
iati_urls = {}

In [None]:
# For each metadata record that carries an organization entry, remember
# the organization's URL name and title under the record's '@w210-key'.
# Records whose organization is missing or None are left out.

for metadata in activities_metadata.find():
    filename = metadata['@w210-key']
    organization = metadata.get('organization')

    if organization is not None:
        urlname, title = organization['name'], organization['title']
        iati_urls[filename] = (urlname, title)

In [None]:
len(iati_urls)

3550

So we see that 12 of the 3562 metadata files have no organization entry from which we can take a registry URL name. Let's see which ones those are.

In [None]:
set(all_metadata.keys()) - set(iati_urls.keys())

{u'aai-2015',
 u'acord-289',
 u'btc-ctb-all',
 u'dapp-mw-activities',
 u'ddp-activityfile1',
 u'ia_nam-o379',
 u'mamac-activities',
 u'mapaction-gb',
 u'mapaction-mw',
 u'st_vluchteling-activities',
 u'unops-at',
 u'yipl-activities'}

All of these seem to be okay, so we'll go ahead and pickle the `iati_urls` lookup.

In [None]:
# Pickle output is a byte stream, so the file is opened in binary mode;
# text mode is only safe for the default protocol on POSIX systems.
with open('iati_urls.pickle', 'wb') as f:
    pickle.dump(iati_urls, f)

## Cache Activities

Bring all the activities out of MongoDB and into memory so that we can process them much faster. Iterating over the MongoDB collection takes minutes. Iterating over an in-memory version containing only the fields we're interested in takes seconds.

However, we need a lot of memory to fit everything, so we'll only include a few of the fields which might be useful when constructing the graph.

* http://iatistandard.org/202/activity-standard/elements/

In [None]:
all_activities = []
activities_count = activities.count()

fields = set([
    '@w210-key',
    'iati-identifier',
    'other-identifier',
    'reporting-org',
    'participating-org',
    'activity-status',
    'recipient-country',
    'recipient-region',
    'location',
    'sector',
    'transaction'
])

print datetime.now(), 'Started processing'

for num, activity in enumerate(activities.find()):
    if num % 50000 == 0:
        print datetime.now(), 'Processed', num, 'of', activities_count

    activity_copy = { key: value for key, value in activity.iteritems() if key in fields }

    all_activities.append(activity_copy)

print datetime.now(), 'Finished processing'

2016-07-28 07:09:42.107587 Started processing
2016-07-28 07:09:42.114161 Processed 0 of 579320
2016-07-28 07:09:52.733305 Processed 50000 of 579320
2016-07-28 07:10:03.904216 Processed 100000 of 579320
2016-07-28 07:10:15.481560 Processed 150000 of 579320
2016-07-28 07:10:28.677899 Processed 200000 of 579320
2016-07-28 07:10:38.064521 Processed 250000 of 579320
2016-07-28 07:10:47.166304 Processed 300000 of 579320
2016-07-28 07:11:03.271878 Processed 350000 of 579320
2016-07-28 07:11:14.547123 Processed 400000 of 579320
2016-07-28 07:11:28.687019 Processed 450000 of 579320
2016-07-28 07:11:45.847730 Processed 500000 of 579320
2016-07-28 07:11:51.408295 Processed 550000 of 579320
2016-07-28 07:12:02.321137 Finished processing


In [None]:
len(all_activities)

579320

In [None]:
# Map each reporting organization's '@ref' to the set of IATI registry
# publisher URLs of the files in which that organization reports.

iati_urls_by_ref = {}

for activity in all_activities:
    if 'reporting-org' not in activity:
        continue

    filename = activity['@w210-key']

    if filename in iati_urls:
        iati_name = iati_urls[filename][0]
    else:
        # No organization entry for this file: fall back to the part of
        # the file name before the first dash. partition() keeps the
        # whole name when there is no dash, whereas slicing up to
        # find('-') would use -1 and drop the last character.
        iati_name = filename.partition('-')[0]

    iati_url = 'https://www.iatiregistry.org/publisher/%s' % iati_name

    organization = activity['reporting-org']

    if '@ref' not in organization:
        continue

    organization_ref = organization['@ref']

    # Create the set on first sight of a ref, then add the URL to it

    iati_urls_by_ref.setdefault(organization_ref, set()).add(iati_url)

Let's pickle the file.

In [None]:
# Pickle output is a byte stream, so the file is opened in binary mode;
# text mode is only safe for the default protocol on POSIX systems.
with open('iati_urls_by_ref.pickle', 'wb') as f:
    pickle.dump(iati_urls_by_ref, f)

# Check Fields (Summary)

Let's get a sense of how populated each field is in the data.

In [None]:
# Count how many times each of the keys of interest appears
# in our activity data set.

field_counts = { key: 0 for key in fields }
field_keys = field_counts.keys()

print datetime.now(), 'Started processing'

for activity in all_activities:
    for key in field_keys:
        if key in activity:
            field_counts[key] += 1

print datetime.now(), 'Finished processing'

field_counts

2016-07-23 20:30:16.032295 Started processing
2016-07-23 20:30:18.958755 Finished processing


{'@w210-key': 579320,
 'activity-status': 568179,
 'iati-identifier': 578806,
 'location': 267279,
 'other-identifier': 170308,
 'participating-org': 570215,
 'recipient-country': 451309,
 'recipient-region': 193359,
 'reporting-org': 579320,
 'sector': 393588,
 'transaction': 546455}

Based on these counts, we'll prefer anything where we can:

* Include the file name
* Include the participating organization
* Include the reporting organization
* Include the transaction information

# Check Fields (One Record)

In [None]:
activity = all_activities[0]

In [None]:
activity.keys()

[u'sector',
 u'reporting-org',
 u'transaction',
 u'participating-org',
 u'iati-identifier',
 u'recipient-country',
 u'@w210-key',
 u'other-identifier',
 u'activity-status']

In [None]:
activity['iati-identifier']

u'XM-OCHA-FTS-CAR-15/P-HR-RL/94360/R/13864'

In [None]:
# http://iatistandard.org/202/activity-standard/iati-activities/iati-activity/sector/
# Vocabulary: http://iatistandard.org/202/codelists/SectorVocabulary/

activity['sector']

[{u'@code': u'72050',
  u'@vocabulary': u'1',
  u'narrative': u'Relief coordination; protection and support services'},
 {u'@code': u'7',
  u'@vocabulary': u'99',
  u'narrative': u'Protection/Human rights/Rule of law'}]

In [None]:
# Ref: http://iatistandard.org/202/activity-standard/iati-activities/iati-activity/reporting-org/
# Type: http://iatistandard.org/202/codelists/OrganisationType/

activity['reporting-org']

{u'@ref': u'XM-OCHA-FTS',
 u'@secondary-reporter': u'1',
 u'@type': u'40',
 u'narrative': u'UNOCHA-FTS'}

In [None]:
# Ref: http://iatistandard.org/202/activity-standard/iati-activities/iati-activity/participating-org/
# Role: http://iatistandard.org/202/codelists/OrganisationRole/
# Type: http://iatistandard.org/202/codelists/OrganisationType/

activity['participating-org']

[{u'@ref': u'XM-OCHA-FTS13864',
  u'@role': u'4',
  u'narrative': u'Vitalit\xe9 Plus'},
 {u'@ref': u'XM-OCHA-FTS7622',
  u'@role': u'1',
  u'@type': u'40',
  u'narrative': u'Common Humanitarian Fund'}]

In [None]:
# Code: http://iatistandard.org/202/codelists/ActivityStatus/

activity['activity-status']

{u'@code': u'2'}

In [None]:
# Code: http://iatistandard.org/202/codelists/Country/

activity['recipient-country']

{u'@code': u'CF'}

In [None]:
# FinanceType: http://iatistandard.org/202/codelists/FinanceType/
# TransactionType: http://iatistandard.org/202/codelists/TransactionType/

activity['transaction']

{u'@generated-datetime': u'2016-06-25T22:49:09',
 u'@ref': u'248592',
 u'@w210-activity': u'XM-OCHA-FTS-CAR-15/P-HR-RL/94360/R/13864',
 u'@w210-key': u'ocha_fts-central_african_republic_2015',
 u'finance-type': {u'@code': u'110'},
 u'fts:contribution-category': {u'#text': u'Allocation',
  u'@code': u'1',
  u'@xmlns:fts': u'nothing'},
 u'provider-org': {u'@ref': u'XM-OCHA-FTS7622',
  u'narrative': u'Common Humanitarian Fund'},
 u'receiver-org': {u'@ref': u'XM-OCHA-FTS13864',
  u'narrative': u'Vitalit\xe9 Plus'},
 u'transaction-date': {u'@iso-date': u'2015-08-28'},
 u'transaction-type': {u'@code': u'3'},
 u'value': {u'#text': u'231332', u'@value-date': u'2015-08-28'}}

# Find Usable Field

In [None]:
def flatten(item_list):
    """Return the contents of a possibly nested list as one flat list.

    A value whose type is not exactly ``list`` is returned wrapped in a
    single-element list. Nested lists are expanded depth-first, so the
    left-to-right order of the leaf items is preserved.
    """
    if type(item_list) is not list:
        return [item_list]

    flat_items = []

    for entry in item_list:

        # The recursive call yields [entry] for a leaf and the flattened
        # contents for a nested list, so extend() covers both cases.

        flat_items.extend(flatten(entry))

    return flat_items

In [None]:
def get_text(element, attribute):
    """Return the text stored under ``attribute`` of ``element``.

    Parameters:
        element: the node to read from. Anything that is not a dictionary
            (None, a plain string, a list, ...) has no attributes.
        attribute: the key to look up, e.g. '@ref' or 'narrative'.

    Returns:
        * None when ``element`` is not a dictionary or lacks the key.
        * For a dictionary value: its '#text' entry, or None without one.
        * For a list value: a list with one entry per item, using the
          item's '#text' for dictionaries (None if it has none) and the
          item itself otherwise.
        * Any other value unchanged.
    """

    # Guard on the element type first. On a plain string, ``in`` is a
    # substring test, so a string that merely contains the attribute
    # name would pass the membership check and then fail on indexing.

    if not isinstance(element, dict) or attribute not in element:
        return None

    value = element[attribute]

    if isinstance(value, dict):
        return value.get('#text')

    if not isinstance(value, list):
        return value

    return [
        item.get('#text') if isinstance(item, dict) else item
        for item in value
    ]

In [None]:
def get_node_list(parent, field_keys):
    """Walk ``parent`` along ``field_keys`` and return what is found as a list.

    At each step a dictionary is indexed by the key, and a list is
    replaced by the key's value from every non-None item that contains
    the key. An empty list is returned as soon as a key is missing from
    a dictionary, or the current value is neither a dictionary nor a
    list. A final value of None also gives an empty list; any other
    non-list final value is wrapped in a single-element list.
    """
    current = parent

    for key in field_keys:
        if type(current) == dict:

            # Step into the named child, or give up if it is absent

            if key in current:
                current = current[key]
            else:
                return []

        elif type(current) == list:

            # Collect the child from each list item that has one

            current = [
                child[key] for child in current
                if child is not None and key in child
            ]

        else:

            # A scalar cannot be navigated any further

            return []

    if current is None:
        return []

    return current if type(current) == list else [current]

In [None]:
def get_value(activity, field_keys):
    """Collect the ref/narrative pairs found at ``field_keys`` in ``activity``.

    Every node returned by get_node_list() contributes one
    ``{'ref': ..., 'narrative': ...}`` dictionary for each of its
    'narrative' and '@narrative' attributes that has a value, in that
    order. The 'ref' is the node's '@ref' text and may be None.
    """
    found = []

    for node in get_node_list(activity, field_keys):
        ref = get_text(node, '@ref')

        # The name may be stored as a child element or as an attribute

        for name_attribute in ('narrative', '@narrative'):
            narrative = get_text(node, name_attribute)

            if narrative is not None:
                found.append({
                    'ref': ref, 'narrative': narrative
                })

    return found

In [None]:
def get_values(field_path):
    """Gather the ref/narrative records at ``field_path`` from all activities.

    ``field_path`` is a dot-separated list of keys, e.g.
    'transaction.provider-org'. Each record produced by get_value() is
    tagged with a 'file_id' holding the '@w210-key' of the activity it
    came from.
    """
    field_keys = field_path.split('.')

    collected = []

    for activity in all_activities:
        for record in get_value(activity, field_keys):
            record['file_id'] = activity['@w210-key']
            collected.append(record)

    return collected

# Check Data

In [None]:
def get_new_version_values(element_name):
    """Return the records found at ``element_name`` as a pandas DataFrame."""
    records = get_values(element_name)

    return pandas.DataFrame(records)

## Reporting Organizations

In [None]:
reporters = get_new_version_values('reporting-org')

In [None]:
reporters.head()

Unnamed: 0,file_id,narrative,ref
0,ocha_fts-central_african_republic_2015,UNOCHA-FTS,XM-OCHA-FTS
1,ocha_fts-central_african_republic_2015,UNOCHA-FTS,XM-OCHA-FTS
2,ocha_fts-central_african_republic_2015,UNOCHA-FTS,XM-OCHA-FTS
3,ocha_fts-central_african_republic_2015,UNOCHA-FTS,XM-OCHA-FTS
4,ocha_fts-central_african_republic_2015,UNOCHA-FTS,XM-OCHA-FTS


In [None]:
reporters.tail()

Unnamed: 0,file_id,narrative,ref
377877,dfid-998_2,Department for International Development,GB-GOV-1
377878,dfid-998_2,Department for International Development,GB-GOV-1
377879,dfid-998_2,Department for International Development,GB-GOV-1
377880,dfid-998_2,Department for International Development,GB-GOV-1
377881,dfid-998_2,Department for International Development,GB-GOV-1


## Other Identifier

In [None]:
others = get_new_version_values('other-identifier.owner-org')

In [None]:
others.head()

Unnamed: 0,file_id,narrative,ref
0,ocha_fts-central_african_republic_2015,UN OCHA Financial Tracking Service,XM-OCHA-FTS
1,ocha_fts-central_african_republic_2015,UN OCHA Financial Tracking Service,XM-OCHA-FTS
2,ocha_fts-central_african_republic_2015,UN OCHA Financial Tracking Service,XM-OCHA-FTS
3,ocha_fts-central_african_republic_2015,UN OCHA Financial Tracking Service,XM-OCHA-FTS
4,ocha_fts-central_african_republic_2015,UN OCHA Financial Tracking Service,XM-OCHA-FTS


In [None]:
others.tail()

Unnamed: 0,file_id,narrative,ref
112472,dfid-998_2,DFID previous reporting-org identifier,GB-GOV-1
112473,dfid-998_2,DFID,GB-GOV-1
112474,dfid-998_2,DFID previous reporting-org identifier,GB-GOV-1
112475,dfid-998_2,DFID,GB-GOV-1
112476,dfid-998_2,DFID previous reporting-org identifier,GB-GOV-1


## Participating Organizations

In [None]:
participants = get_new_version_values('participating-org')

In [None]:
participants.head()

Unnamed: 0,file_id,narrative,ref
0,ocha_fts-central_african_republic_2015,Vitalité Plus,XM-OCHA-FTS13864
1,ocha_fts-central_african_republic_2015,Common Humanitarian Fund,XM-OCHA-FTS7622
2,ocha_fts-central_african_republic_2015,Norwegian Refugee Council,XM-OCHA-FTS5834
3,ocha_fts-central_african_republic_2015,Common Humanitarian Fund,XM-OCHA-FTS7622
4,ocha_fts-central_african_republic_2015,ACT Alliance / Lutheran World Federation,XM-OCHA-FTS5502


In [None]:
participants.tail()

Unnamed: 0,file_id,narrative,ref
1394942,dfid-998_2,Department for International Development,GB-GOV-1
1394943,dfid-998_2,Other.,52000
1394944,dfid-998_2,UNITED KINGDOM,GB
1394945,dfid-998_2,Department for International Development,GB-GOV-1
1394946,dfid-998_2,"University, college or other teaching institut...",51000


## Transaction Providers

In [None]:
transaction_providers = get_new_version_values('transaction.provider-org')

In [None]:
transaction_providers.head()

Unnamed: 0,file_id,narrative,ref
0,ocha_fts-central_african_republic_2015,Common Humanitarian Fund,XM-OCHA-FTS7622
1,ocha_fts-central_african_republic_2015,Common Humanitarian Fund,XM-OCHA-FTS7622
2,ocha_fts-central_african_republic_2015,Common Humanitarian Fund,XM-OCHA-FTS7622
3,ocha_fts-central_african_republic_2015,Common Humanitarian Fund,XM-OCHA-FTS7622
4,ocha_fts-central_african_republic_2015,Common Humanitarian Fund,XM-OCHA-FTS7622


In [None]:
transaction_providers.tail()

Unnamed: 0,file_id,narrative,ref
496768,dfid-998_2,Department for International Development,GB-GOV-1
496769,dfid-998_2,Department for International Development,GB-GOV-1
496770,dfid-998_2,Department for International Development,GB-GOV-1
496771,dfid-998_2,Department for International Development,GB-GOV-1
496772,dfid-998_2,Department for International Development,GB-GOV-1


## Transaction Receivers

In [None]:
transaction_receivers = get_new_version_values('transaction.receiver-org')

In [None]:
transaction_receivers.head()

Unnamed: 0,file_id,narrative,ref
0,ocha_fts-central_african_republic_2015,Vitalité Plus,XM-OCHA-FTS13864
1,ocha_fts-central_african_republic_2015,Norwegian Refugee Council,XM-OCHA-FTS5834
2,ocha_fts-central_african_republic_2015,ACT Alliance / Lutheran World Federation,XM-OCHA-FTS5502
3,ocha_fts-central_african_republic_2015,Vitalité Plus,XM-OCHA-FTS13864
4,ocha_fts-central_african_republic_2015,Vitalité Plus,XM-OCHA-FTS13864


In [None]:
transaction_receivers.tail()

Unnamed: 0,file_id,narrative,ref
509281,dfid-998_2,Journal Transaction,Excluded
509282,dfid-998_2,Journal Transaction,Excluded
509283,dfid-998_2,Climate Policy Initiative,
509284,dfid-998_2,Emerging Markets Private Equity Association (E...,
509285,dfid-998_2,TRANSITION INTERNATIONAL,52000


# Check Ref-Names

## Check Ref to Name Mapping

In [None]:
def get_names(name_lookup, df):
    """Add the names seen for each ref in ``df`` to ``name_lookup``.

    ``df`` supplies 'file_id', 'ref' and 'narrative' columns, and
    ``name_lookup`` maps a stripped ref to the set of stripped names
    found with it. Rows with a None ref, a None narrative or a blank ref
    are skipped, as are None or blank names. A narrative may be a single
    name or a list of names.
    """
    rows = zip(df['file_id'], df['ref'], df['narrative'])

    for _, raw_ref, raw_names in rows:
        if raw_ref is None or raw_names is None:
            continue

        ref_key = raw_ref.strip()

        if not ref_key:
            continue

        # Treat a single name as a list of one so both shapes share a loop

        candidates = raw_names if type(raw_names) == list else [raw_names]

        for candidate in candidates:
            if candidate is None:
                continue

            name = candidate.strip()

            if name:
                name_lookup[ref_key].add(name)

In [None]:
name_lookup = defaultdict(set)

In [None]:
get_names(name_lookup, reporters)
get_names(name_lookup, others)
get_names(name_lookup, participants)
get_names(name_lookup, transaction_providers)
get_names(name_lookup, transaction_receivers)

In [None]:
len(name_lookup)

8240

In [None]:
# Keep only the refs that were seen with more than one distinct name

multiple_names = [
    {'ref': ref, 'narrative': names}
    for ref, names in name_lookup.iteritems()
    if len(names) > 1
]

In [None]:
len(multiple_names)

1543

In [None]:
pandas.DataFrame(multiple_names).head()

Unnamed: 0,narrative,ref
0,{Fundación Española para la Cooperación Solida...,AC-326
1,{The Campaign for Popular Education (CAMPE) is...,501119
2,{Gana Unnayan Kendra (GUK) as a non-government...,501114
3,"{Shalom Foundation, Nyein is one of the more c...",504764
4,"{Dutch Employers Cooperation Programme, DECP, ...",NL-KVK-27284008


## Check Name to Ref Mapping

In [None]:
def get_refs(ref_lookup, df):
    """Add the refs seen for each name in ``df`` to ``ref_lookup``.

    ``df`` supplies 'file_id', 'ref' and 'narrative' columns, and
    ``ref_lookup`` maps a stripped name to the set of stripped refs
    found with it. Rows with a None ref, a None narrative or a blank ref
    are skipped, as are None or blank names. A narrative may be a single
    name or a list of names.
    """
    rows = zip(df['file_id'], df['ref'], df['narrative'])

    for _, raw_ref, raw_names in rows:
        if raw_ref is None or raw_names is None:
            continue

        ref_value = raw_ref.strip()

        if not ref_value:
            continue

        # Treat a single name as a list of one so both shapes share a loop

        candidates = raw_names if type(raw_names) == list else [raw_names]

        for candidate in candidates:
            if candidate is None:
                continue

            name = candidate.strip()

            if name:
                ref_lookup[name].add(ref_value)

In [None]:
ref_lookup = defaultdict(set)

In [None]:
get_refs(ref_lookup, reporters)
get_refs(ref_lookup, others)
get_refs(ref_lookup, participants)
get_refs(ref_lookup, transaction_providers)
get_refs(ref_lookup, transaction_receivers)

In [None]:
len(ref_lookup)

15994

In [None]:
# Keep only the names that were seen with more than one distinct ref

multiple_refs = [
    {'narrative': name, 'ref': refs}
    for name, refs in ref_lookup.iteritems()
    if len(refs) > 1
]

In [None]:
len(multiple_refs)

908

In [None]:
pandas.DataFrame(multiple_refs).head()

Unnamed: 0,narrative,ref
0,UNDP (Direct Execution),"{03332, 02186, 03235, 00946, 01940, 02975, 004..."
1,SCIDEV.NET,"{23000, 20000}"
2,YoungInnovations Pvt. Ltd.,"{NP-CRO-45995-063-064, NP-CRO-45995/063/064}"
3,Heart to Heart International,"{US-EIN-48-1108359, XM-OCHA-FTS5538}"
4,PRI Head Office,"{GB-COH-04154075-PRI-001, GB-COH-04154075-PPA-..."


## Save the Refs/Names

In [None]:
# Pickle output is a byte stream, so the file is opened in binary mode;
# text mode is only safe for the default protocol on POSIX systems.
with open('lookup_by_ref.pickle', 'wb') as f:
    pickle.dump(name_lookup, f)

In [None]:
# Pickle output is a byte stream, so the file is opened in binary mode;
# text mode is only safe for the default protocol on POSIX systems.
with open('lookup_by_name.pickle', 'wb') as f:
    pickle.dump(ref_lookup, f)

In [None]:
!rm -f lookup.tar.gz
!tar -cf lookup.tar lookup_by_ref.pickle lookup_by_name.pickle
!gzip lookup.tar

In [None]:
!aws s3 cp lookup.tar.gz s3://mdang.w210/ --acl public-read

upload: ./lookup.tar.gz to s3://mdang.w210/lookup.tar.gz


# Create Graph Files

In [None]:
def clean_text(text):
    """Collapse each run of whitespace in ``text`` to one space and trim the ends."""

    # A raw string hands the \s escape to the regex engine untouched,
    # instead of relying on Python leaving an unknown string escape alone.

    return re.sub(r'\s+', ' ', text).strip()

In [None]:
def get_edge_list(
    activity, field_keys, left_element, left_child, right_element, right_child,
    left_reporting_org_fallback, right_reporting_org_fallback):
    """Build the (file, left, right) edges found in one activity.

    Parameters:
        activity: the activity dictionary to inspect.
        field_keys: keys leading from the activity to the nodes that hold
            both ends of an edge; an empty list means the activity itself.
        left_element, right_element: keys, within each node, of the
            elements that form the two ends of an edge.
        left_child, right_child: attribute of each end whose text is used
            as the end's value (for example '@ref' or 'narrative').
        left_reporting_org_fallback, right_reporting_org_fallback:
            attribute of the activity's first reporting-org to use when
            the corresponding end has no text, or None for no fallback.

    Returns:
        A list of ``(activity['@w210-key'], left, right)`` tuples with
        whitespace-normalized values, one for every combination of a
        left and a right value. Combinations with a missing end are
        left out.
    """
    node_list = get_node_list(activity, field_keys)

    return_value = []

    # Look the reporting organization up once, and only when a fallback
    # was requested. An activity without one simply has no fallback,
    # rather than failing on an empty list.

    reporting_org = None

    if left_reporting_org_fallback is not None or right_reporting_org_fallback is not None:
        reporting_orgs = get_node_list(activity, ['reporting-org'])

        if reporting_orgs:
            reporting_org = reporting_orgs[0]

    left_fallback = None

    if left_reporting_org_fallback is not None:
        left_fallback = get_text(reporting_org, left_reporting_org_fallback)

    right_fallback = None

    if right_reporting_org_fallback is not None:
        right_fallback = get_text(reporting_org, right_reporting_org_fallback)

    for node in node_list:

        # Only dictionaries can hold both ends of an edge; this also
        # keeps a None entry from failing the membership tests below.

        if not isinstance(node, dict):
            continue

        if left_element not in node or right_element not in node:
            continue

        left = get_node_list(node, [left_element])
        right = get_node_list(node, [right_element])

        for left_node, right_node in itertools.product(left, right):
            left_list = get_text(left_node, left_child)
            right_list = get_text(right_node, right_child)

            if left_list is None:
                left_list = left_fallback

            if right_list is None:
                right_list = right_fallback

            if left_list is None or right_list is None:
                continue

            # get_text() may return one value or a list of values

            if type(left_list) != list:
                left_list = [left_list]

            if type(right_list) != list:
                right_list = [right_list]

            return_value += [
                (activity['@w210-key'], clean_text(left_value), clean_text(right_value))
                    for left_value, right_value in itertools.product(left_list, right_list)
                        if left_value is not None and right_value is not None
            ]

    return return_value

In [None]:
def get_edges(
    field_path, left_element, left_child, right_element, right_child,
    left_reporting_org_fallback = None, right_reporting_org_fallback = None):
    """Collect the edges produced by get_edge_list() over all activities.

    ``field_path`` is a dot-separated list of keys, or None to read the
    two ends of each edge directly from the activity. The remaining
    arguments are handed to get_edge_list() unchanged.
    """
    field_keys = [] if field_path is None else field_path.split('.')

    all_edges = []

    for activity in all_activities:
        all_edges.extend(get_edge_list(
            activity, field_keys, left_element, left_child, right_element, right_child,
            left_reporting_org_fallback, right_reporting_org_fallback))

    return all_edges

## Graph from Root Elements

In [None]:
root_edges_ref = get_edges(
    None, 'reporting-org', '@ref', 'participating-org', '@ref')

In [None]:
len(root_edges_ref)

1542373

In [None]:
# Despite the .txt extension this file holds a pickle, so it is opened
# in binary mode.
with open('graph_root_ref.txt', 'wb') as f:
    pickle.dump(root_edges_ref, f)

In [None]:
root_edges_narrative = get_edges(
    None, 'reporting-org', 'narrative', 'participating-org', 'narrative')

In [None]:
len(root_edges_narrative)

1565548

In [None]:
# Save the narrative-based edges computed above (not the ref-based ones).
# Despite the .txt extension this file holds a pickle, so it is opened
# in binary mode.
with open('graph_root_narrative.txt', 'wb') as f:
    pickle.dump(root_edges_narrative, f)

## Graph from Transaction Elements

In [None]:
transaction_edges_ref = get_edges(
    'transaction', 'provider-org', '@ref', 'receiver-org', '@ref', '@ref', None)

In [None]:
len(transaction_edges_ref)

436654

In [None]:
# Despite the .txt extension this file holds a pickle, so it is opened
# in binary mode.
with open('graph_transaction_ref.txt', 'wb') as f:
    pickle.dump(transaction_edges_ref, f)

In [None]:
transaction_edges_narrative = get_edges(
    'transaction', 'provider-org', 'narrative', 'receiver-org', 'narrative', 'narrative', None)

In [None]:
len(transaction_edges_narrative)

575730

In [None]:
# Save the narrative-based transaction edges computed above (not the
# root ref edges). Despite the .txt extension this file holds a pickle,
# so it is opened in binary mode.
with open('graph_transaction_narrative.txt', 'wb') as f:
    pickle.dump(transaction_edges_narrative, f)

## Graph from Recipient Country

In [None]:
recipient_edges_ref = get_edges(
    None, 'recipient-country', '@code', 'reporting-org', '@ref')

In [None]:
len(recipient_edges_ref)

462486

In [None]:
# Despite the .txt extension this file holds a pickle, so it is opened
# in binary mode.
with open('graph_country_ref.txt', 'wb') as f:
    pickle.dump(recipient_edges_ref, f)

# Load Graphs

In [None]:
def get_node_id(node_ids, name):
    """Return the numeric ID of ``name``, assigning the next free one if new.

    ``node_ids`` maps names to IDs and is updated in place. A name not
    seen before receives the current size of the dictionary as its ID.
    """

    # setdefault() only stores the candidate ID when the name is absent

    return node_ids.setdefault(name, len(node_ids))

In [None]:
def load_graph(file_name):
    """Load a pickled edge list and turn it into a sparse adjacency matrix.

    Parameters:
        file_name: path of a pickle file holding an iterable of
            ``(activity_file, source_name, target_name)`` tuples.
            Unpickling can run arbitrary code, so this should only be
            used on files produced by this notebook.

    Returns:
        A ``(node_ids, graph_matrix)`` pair. ``node_ids`` maps each node
        name to an integer ID, numbered in order of first appearance.
        ``graph_matrix`` is a square dok_matrix in which entry
        ``[source_id, target_id]`` counts the edges from source to target.
    """
    node_ids = {}

    # Pickle data is a byte stream, so read it in binary mode. The edge
    # list is loaded once and reused for both passes below.

    with open(file_name, 'rb') as graph_file:
        edges = pickle.load(graph_file)

    # First pass: give every node an ID so the matrix size is known

    print('%s Identifying nodes in %s' % (datetime.now(), file_name))

    for activity_file, source_name, target_name in edges:
        node_ids.setdefault(source_name, len(node_ids))
        node_ids.setdefault(target_name, len(node_ids))

    node_count = len(node_ids)

    graph_matrix = dok_matrix((node_count, node_count))

    # Second pass: count the edges between each pair of nodes

    print('%s Building sparse matrix for %s' % (datetime.now(), file_name))

    for activity_file, source_name, target_name in edges:
        source_id = node_ids[source_name]
        target_id = node_ids[target_name]

        graph_matrix[source_id, target_id] += 1

    print('%s Finished processing %s' % (datetime.now(), file_name))

    return node_ids, graph_matrix

# Find Dangling Nodes

In [None]:
def get_dangling_nodes(check_axis, graph_ids, graph):
    """Return the names of the nodes whose sum along ``check_axis`` is zero.

    Parameters:
        check_axis: 0 to sum each column of ``graph``, 1 to sum each row.
        graph_ids: dictionary mapping node names to matrix indices.
        graph: square matrix of edge counts.

    Returns:
        A list of node names, in index order, whose column (axis 0) or
        row (axis 1) adds up to zero.
    """

    # Invert the name -> index mapping so indices can be turned into names

    names_by_index = {}

    for node_name, node_index in graph_ids.iteritems():
        names_by_index[node_index] = node_name

    # Sum along the axis, arrange the totals as one per row, and pick
    # out the entries that are zero

    totals = graph.sum(axis = check_axis)
    node_count = totals.shape[1 - check_axis]
    totals = totals.reshape((node_count, 1))

    dangling = []

    for node_index, total in enumerate(totals):
        if total == 0:
            dangling.append(names_by_index[node_index])

    return dangling

In [None]:
# With the [source, target] layout built by load_graph, a zero column
# sum (axis 0) means no incoming edges, i.e. the node is only a source.
get_source_nodes = functools.partial(get_dangling_nodes, 0)
# A zero row sum (axis 1) means no outgoing edges, i.e. only a sink.
get_sink_nodes = functools.partial(get_dangling_nodes, 1)

# Run against All Graphs

In [None]:
!rm *.ids
!rm *.graph

rm: cannot remove ‘*.ids’: No such file or directory
rm: cannot remove ‘*.graph’: No such file or directory


In [None]:
graph_stats = []
graph_data = []

for file_name in os.listdir('.'):
    if file_name[0:5] != 'graph' or file_name[-4:] != '.txt':
        continue

    # Load the file from cache if we've already done the computation once

    if os.path.isfile(file_name + '.ids') and os.path.isfile(file_name + '.graph'):
        print datetime.now(), 'Loading cached graph for', file_name

        with open(file_name + '.ids', 'r') as id_file:
            graph_ids = pickle.load(id_file)

        with open(file_name + '.graph', 'r') as graph_file:
            graph = pickle.load(graph_file)

    # Otherwise, perform the computation and save the resulting computations

    else:
        graph_ids, graph = load_graph(file_name)

        with open(file_name + '.ids', 'w') as id_file:
            pickle.dump(graph_ids, id_file)

        with open(file_name + '.graph', 'w') as graph_file:
            pickle.dump(graph, graph_file)

    source_nodes = get_source_nodes(graph_ids, graph)
    sink_nodes = get_sink_nodes(graph_ids, graph)

    graph_stats.append({
        'graph file': file_name,
        'total nodes': len(graph_ids),
        'has both edges': len(graph_ids) - len(source_nodes) - len(sink_nodes),
        'has only outgoing edges': len(source_nodes),
        'has only incoming edges': len(sink_nodes)
    })

    graph_data.append({
        'file': file_name,
        'graph': graph,
        'all_nodes': graph_ids,
        'source_nodes': source_nodes,
        'sink_nodes': sink_nodes
    })

2016-07-08 08:51:56.241010 Identifying nodes in graph_root_narrative.txt
2016-07-08 08:52:07.905261 Building sparse matrix for graph_root_narrative.txt
2016-07-08 08:52:42.609892 Finished processing graph_root_narrative.txt
2016-07-08 08:52:43.652097 Loading cached graph for graph_transaction_ref.txt
2016-07-08 08:52:44.041622 Identifying nodes in graph_transaction_narrative.txt
2016-07-08 08:52:55.470379 Building sparse matrix for graph_transaction_narrative.txt
2016-07-08 08:53:30.061735 Finished processing graph_transaction_narrative.txt
2016-07-08 08:53:31.074964 Loading cached graph for graph_root_ref.txt


In [None]:
pandas.DataFrame(graph_stats)

Unnamed: 0,graph file,has both edges,has only incoming edges,has only outgoing edges,total nodes
0,graph_root_narrative.txt,165,14326,26,14517
1,graph_transaction_ref.txt,159,6980,1471,8610
2,graph_transaction_narrative.txt,165,14326,26,14517
3,graph_root_ref.txt,162,14338,29,14529


# Check a Graph

In [None]:
# Pick the graph by file name rather than by position: graph_data is
# filled in os.listdir() order, which is arbitrary, so a fixed index
# can select a different graph from one run to the next.
test_graph = next(
    entry for entry in graph_data if entry['file'] == 'graph_root_ref.txt')

In [None]:
test_graph['file']

'graph_root_ref.txt'

In [None]:
all_nodes = set(test_graph['all_nodes'])
source_nodes = set(test_graph['source_nodes'])
sink_nodes = set(test_graph['sink_nodes'])

Identify all nodes that have both incoming and outgoing edges.

In [None]:
sorted(all_nodes - source_nodes - sink_nodes)

[u'',
 u'21-PK-WWF',
 u'41108',
 u'41119',
 u'41122',
 u'41304',
 u'41AAA',
 u'44000',
 u'46002',
 u'46004',
 u'47045',
 u'47122',
 u'47134',
 u'47135',
 u'BE-10',
 u'BE-BCE_KBO-0264814354',
 u'CA-3',
 u'CA-CRA-89980-1815-RR0001',
 u'CH-4',
 u'DAC-1601',
 u'DE-1',
 u'DK-1',
 u'ES-DIR3-E04585801',
 u'FI-3',
 u'FI-PRO-1498487-2',
 u'GB-1-202615',
 u'GB-2',
 u'GB-3',
 u'GB-6',
 u'GB-7',
 u'GB-CHC-1001349',
 u'GB-CHC-1017255',
 u'GB-CHC-1043843',
 u'GB-CHC-1046001',
 u'GB-CHC-1050327',
 u'GB-CHC-1055436',
 u'GB-CHC-1065705',
 u'GB-CHC-1068839',
 u'GB-CHC-1071659',
 u'GB-CHC-1071886',
 u'GB-CHC-1074937',
 u'GB-CHC-1075920',
 u'GB-CHC-1079358',
 u'GB-CHC-1085096',
 u'GB-CHC-1089879',
 u'GB-CHC-1092236',
 u'GB-CHC-1098106',
 u'GB-CHC-1098752',
 u'GB-CHC-1099776',
 u'GB-CHC-1105489',
 u'GB-CHC-1109789',
 u'GB-CHC-1112734',
 u'GB-CHC-1115109',
 u'GB-CHC-1120413',
 u'GB-CHC-1127488',
 u'GB-CHC-1128267-8',
 u'GB-CHC-1128536',
 u'GB-CHC-1133342',
 u'GB-CHC-1157009',
 u'GB-CHC-1837621',
 u'GB-CHC-2

In [None]:
sorted(source_nodes)

[u'GB-CHC-1029161',
 u'GB-CHC-1089490',
 u'GB-CHC-208724',
 u'GB-CHC-254781',
 u'GB-CHC-261488',
 u'GB-CHC-290836',
 u'GB-CHC-328206',
 u'GB-COH-00637978',
 u'GB-COH-01926828',
 u'GB-COH-02394229',
 u'GB-COH-03877777',
 u'GB-COH-04154075',
 u'GB-COH-294860',
 u'GB-COH-7557881',
 u'GB-SC-044007',
 u'IM-CR-017899B',
 u'MW-CNM-21',
 u'NL-CCI-20081098 ',
 u'NL-KVK-32037590',
 u'NL-KVK-34308169',
 u'NL-KVK-40530953',
 u'NL-KVK-56484038',
 u'NL-KvK-30214009',
 u'NP-CRO-45995/063/064',
 u'UG-NGB-5914-1107',
 u'US-EIN-13-3287064NAM',
 u'US-EIN-941191246',
 u'XM-DAC-918-3',
 u'XM-OCHA-FTS']

In [None]:
sorted(sink_nodes)

[u'\n',
 u' ',
 u' GB-CHC-1112734 ',
 u' GB-COH-7557881',
 u' NL-KVK-51018586 ',
 u' NL-KVK-KVK 41151952',
 u'00001',
 u'00001-07',
 u'00002',
 u'00004',
 u'00005',
 u'00006',
 u'00007',
 u'00008',
 u'00009',
 u'00010',
 u'00011',
 u'00012',
 u'00013',
 u'00014',
 u'00015',
 u'00019',
 u'00020',
 u'00021',
 u'00023',
 u'00024',
 u'00027',
 u'00028',
 u'00029',
 u'00030',
 u'00032',
 u'00039',
 u'00040',
 u'00041',
 u'00042',
 u'00043',
 u'00044',
 u'00045',
 u'00054',
 u'00056',
 u'00057',
 u'00058',
 u'00060',
 u'00062',
 u'00064',
 u'00067',
 u'00068',
 u'00069',
 u'00070',
 u'00071',
 u'00072',
 u'00073',
 u'00077',
 u'00080',
 u'00081',
 u'00082',
 u'00085',
 u'00086',
 u'00087',
 u'00088',
 u'00089',
 u'00092',
 u'00093',
 u'00096',
 u'00097',
 u'00098',
 u'00100',
 u'00101',
 u'00102',
 u'00103',
 u'00105',
 u'00113',
 u'00114',
 u'00115',
 u'00119',
 u'00120',
 u'00121',
 u'00123',
 u'00124',
 u'00125',
 u'00128',
 u'00129',
 u'00130',
 u'00131',
 u'00132',
 u'00133',
 u'00135',