# Install Packages

In [None]:
%matplotlib inline

In [None]:
!sudo -H pip install chardet pandas pymongo xmltodict



In [None]:
import chardet
from datetime import datetime
import functools
import io
import json
import os
import pandas
import pickle
import pprint
import pymongo
import xmltodict

# Reload the Metadata

In [None]:
# If we don't have any of the cached data files, download them from S3

if not os.path.isdir('iati'):
    !aws s3 --region us-west-2 cp s3://mdang.w210/iati.tar.gz .
    !tar -zxf iati.tar.gz

if not os.path.isfile('activities_metadata.json'):
    !aws s3 cp s3://mdang.w210/activities_metadata.json.gz .
    !gunzip activities_metadata.json.gz

if not os.path.isdir('iati-orgs'):
    !aws s3 --region us-west-2 cp s3://mdang.w210/iati-orgs.tar.gz
    !tar -zxf iati-orgs.tar.gz

if not os.path.isfile('organization_metadata.json'):
    !aws s3 cp s3://mdang.w210/organization_metadata.json.gz .

In [None]:
# Read both cached metadata dumps, then index their records by the 'name' field.

with io.open('activities_metadata.json', mode = 'r', encoding = 'utf8') as f:
    activity_metadata_json = json.load(f)

activity_metadata_dict = dict(
    (item['name'], item) for item in activity_metadata_json
)

with io.open('organization_metadata.json', mode = 'r', encoding = 'utf8') as f:
    organization_metadata_json = json.load(f)

organization_metadata_dict = dict(
    (item['name'], item) for item in organization_metadata_json
)

# Putting it all together

Recycling content from earlier to import everything directly to activity level in one go.  This should help deal with the size issue we encountered earlier.

First, I'll manually rename the existing collections to make it easier to keep track of things.  "activities" is now "xml_docs" and "simple_activities" is now "first_pass_activities".  I'll probably drop both of these once I've got the real "activities" collection made (which happens below).

In [None]:
# Open a client with pymongo's default connection settings
conn=pymongo.MongoClient()

# Check database names to test connection and see what's here
# (this is not the last expression of the cell, so its result is not displayed)
conn.database_names()

# Make iati database (this only takes a handle named "iati" on the client)
db = conn.iati

In [None]:
# Drop the collections that the import cells below repopulate, so that
# re-running the notebook does not append to data from a previous run.
db.drop_collection('activities')
db.drop_collection('activities_metadata')
db.drop_collection('organizations')
db.drop_collection('organizations_metadata')
db.drop_collection('transactions')

In [None]:
# Create new activities collection and check that manual renaming worked

# Handles used by the activity import functions
activities=db.activities
activities_metadata=db.activities_metadata

# Handles used by the organization import functions
organizations=db.organizations
organizations_metadata=db.organizations_metadata

# Transactions pulled out of the activity documents are stored separately
transactions=db.transactions

# Last expression of the cell: displays the collection names in the database
db.collection_names()

[]

## Activities

In [None]:
def insert_activity(entries, f, metadata_key, generated_datetime):
    """Tag activity documents and their transactions, then bulk-insert both.

    Args:
        entries: one parsed activity, or a list of them; a non-list value is
            wrapped in a single-element list.
        f: not used by this function; kept in the signature for its callers.
        metadata_key: value stored under '@w210-key' on every activity and
            every transaction.
        generated_datetime: value stored under '@generated-datetime' on every
            activity and every transaction.

    Side effects:
        The activity and transaction mappings are modified in place. The
        activities go to the module-level ``activities`` collection and the
        gathered transactions to ``transactions``; an empty batch is skipped.
    """
    docs = entries if type(entries) is list else [entries]
    collected = []

    for doc in docs:
        if 'transaction' in doc:
            nested = doc['transaction']

            # A single transaction is not wrapped in a list; normalise it.
            if type(nested) is not list:
                nested = [nested]

            for item in nested:
                # Link the transaction back to its activity when it has an id.
                if 'iati-identifier' in doc:
                    item['@w210-activity'] = doc['iati-identifier']

                item[u'@w210-key'] = metadata_key
                item[u'@generated-datetime'] = generated_datetime

            collected.extend(nested)

        doc[u'@w210-key'] = metadata_key
        doc[u'@generated-datetime'] = generated_datetime

    if docs:
        activities.insert_many(docs)

    if collected:
        transactions.insert_many(collected)

In [None]:
def insert_activities(f, i):
    """Store one parsed activities document: file-level metadata plus activities.

    Args:
        f: file object the document was read from; only ``f.name`` is used,
            to derive the metadata key.
        i: mapping produced by parsing the file. It must have an
            'iati-activities' entry; it is modified in place (the activities
            are removed, registry metadata is merged in, '@w210-key' is added).

    Returns:
        None. Nothing is stored when the document holds no 'iati-activity'.
    """
    if 'iati-activity' not in i['iati-activities']:
        return

    # Extract the activities, but store the remainder as metadata we
    # can join with for other purposes.
    entries = i['iati-activities'].pop('iati-activity')

    # The key is the file name without directory or extension. Deriving it
    # from the path (instead of slicing fixed offsets) keeps it correct when
    # the data does not live in a directory called exactly 'iati'.
    metadata_key = os.path.splitext(os.path.basename(f.name))[0]

    if metadata_key in activity_metadata_dict:
        i.update(activity_metadata_dict[metadata_key])

    # Save this alongside each entry

    i['@w210-key'] = metadata_key

    activities_metadata.insert_one(i)

    # None when the file does not state when it was generated.
    generated_datetime = i['iati-activities'].get('@generated-datetime')

    insert_activity(entries, f, metadata_key, generated_datetime)

In [None]:
def import_activity_xml(f, xml):
    """Parse an XML payload and load its activities.

    Parsing happens before the ``try`` block, so a parse error propagates to
    the caller. Any exception raised while storing the parsed document is
    kept in ``activity_failures`` under ``f.name`` instead of being raised.
    """
    parsed = xmltodict.parse(xml)

    # Check for errors in activity structure

    try:
        insert_activities(f, parsed)
    except Exception as error:
        activity_failures[f.name] = error

In [None]:
def import_activity_document(f):
    """Read one downloaded activity file and import it, recording failures.

    The payload is expected to contain the ``iati-activities`` tag. When the
    tag is not found in what was read, the encoding is detected with chardet
    and the payload is decoded before looking again; if the tag is still
    missing the file is recorded as ``'html'``.

    When the import of a not-yet-decoded payload fails, one retry is made
    with the chardet-detected encoding.

    Args:
        f: open file object; ``f.read()`` supplies the payload and ``f.name``
            is the key used in ``activity_xml_failures``.

    Returns:
        None. Failures are stored in ``activity_xml_failures``, either as the
        string ``'html'`` or as the exception that was raised.
    """
    global activity_failures, activity_xml_failures

    # The XML file may be encoded as UTF-16 or really any other encoding.
    # If it doesn't contain the "<iati-activities" tag, then it may be
    # due to an encoding problem.

    xml=f.read()
    decoded = False

    if xml.find('iati-activities') == -1:
        try:
            charset = chardet.detect(xml)['encoding']
            xml = xml.decode(charset)
        except Exception as e:
            # chardet may report no encoding at all and decoding may fail;
            # record that against this file instead of aborting the whole run.
            activity_xml_failures[f.name] = e
            return

        decoded = True

        # If after decoding the file, we still don't have the tag we were
        # looking for, we probably wound up with an HTML file.

        if xml.find('iati-activities') == -1:
            activity_xml_failures[f.name] = 'html'
            return

    try:
        import_activity_xml(f, xml)
        return
    except Exception as e:
        first_error = e

    if decoded:
        # The payload has already been through detect/decode; doing that again
        # on decoded text would only replace the real error with an unrelated
        # one, so report the import error itself.
        activity_xml_failures[f.name] = first_error
        return

    try:
        charset = chardet.detect(xml)['encoding']
        xml = xml.decode(charset)

        import_activity_xml(f, xml)
    except Exception as e:
        activity_xml_failures[f.name] = e

In [None]:
activity_failures={}
activity_xml_failures={}

path_to_data = 'iati'

files = os.listdir(path_to_data)

print datetime.now(), 'Started processing'

for num, i in enumerate(os.listdir(path_to_data)):
    if num % 100 == 0:
        print datetime.now(), 'Processed', num, 'of', len(files)

    if not i.endswith(".xml"):
        continue

    with open(path_to_data+'/'+i) as f:
        import_activity_document(f)

print datetime.now(), 'Finished processing'

2016-07-10 15:59:34.503979 Started processing
2016-07-10 15:59:34.505726 Processed 0 of 3591
2016-07-10 16:01:18.711691 Processed 100 of 3591
2016-07-10 16:02:22.121229 Processed 200 of 3591
2016-07-10 16:03:23.672192 Processed 300 of 3591
2016-07-10 16:04:03.608214 Processed 400 of 3591
2016-07-10 16:05:10.387374 Processed 500 of 3591
2016-07-10 16:06:46.726610 Processed 600 of 3591
2016-07-10 16:08:21.990811 Processed 700 of 3591
2016-07-10 16:09:32.672804 Processed 800 of 3591
2016-07-10 16:10:55.906091 Processed 900 of 3591
2016-07-10 16:12:07.690869 Processed 1000 of 3591
2016-07-10 16:13:17.715977 Processed 1100 of 3591
2016-07-10 16:14:08.524673 Processed 1200 of 3591
2016-07-10 16:15:00.285376 Processed 1300 of 3591
2016-07-10 16:15:52.603794 Processed 1400 of 3591
2016-07-10 16:16:34.878111 Processed 1500 of 3591
2016-07-10 16:17:24.813415 Processed 1600 of 3591
2016-07-10 16:18:31.411999 Processed 1700 of 3591
2016-07-10 16:19:43.130588 Processed 1800 of 3591
2016-07-10 16:20

In [None]:
print activities.count(), transactions.count()
print len(activity_xml_failures)
print len(activity_failures)

579320 2251178
17
0


In [None]:
activities.find_one()

{u'@default-currency': u'USD',
 u'@generated-datetime': u'2016-06-25T22:49:09',
 u'@hierarchy': u'1',
 u'@last-updated-datetime': u'2016-04-29T10:55:00',
 u'@w210-key': u'ocha_fts-central_african_republic_2015',
 u'@xml:lang': u'en',
 u'_id': ObjectId('578270e7b6b1a62fd0a941d4'),
 u'activity-date': [{u'@iso-date': u'2015-01-01', u'@type': u'1'},
  {u'@iso-date': u'2015-12-31', u'@type': u'3'}],
 u'activity-status': {u'@code': u'2'},
 u'budget': [{u'@type': u'1',
   u'period-end': {u'@iso-date': u'2015-12-31'},
   u'period-start': {u'@iso-date': u'2015-01-01'},
   u'value': {u'#text': u'0', u'@value-date': u'2015-01-01'}},
  {u'@type': u'2',
   u'period-end': {u'@iso-date': u'2015-12-31'},
   u'period-start': {u'@iso-date': u'2015-01-01'},
   u'value': {u'#text': u'0', u'@value-date': u'2015-01-01'}}],
 u'description': {u'@type': u'2', u'narrative': u'n/a'},
 u'fts:appeal-title': {u'#text': u'Central African Republic 2015',
  u'@xmlns:fts': u'nothing'},
 u'fts:appeal-type': {u'#text': u

In [None]:
activities_metadata.find_one()

{u'@w210-key': u'ocha_fts-central_african_republic_2015',
 u'_id': ObjectId('578270e7b6b1a62fd0a941d3'),
 u'author': None,
 u'author_email': u'fts@un.org',
 u'extras': [{u'id': u'e906c5e5-a071-40a8-94fd-d931c9ac5cc4',
   u'key': u'activity_count',
   u'package_id': u'20503922-5f25-4f12-aa7a-d5fbe75f4f31',
   u'revision_id': u'c6b7a1f2-6b54-46d1-802f-32ba375ca175',
   u'revision_timestamp': u'2016-05-25T01:10:16.712910',
   u'state': u'active',
   u'value': u'424'},
  {u'id': u'fa8fc149-01d4-4eb9-aef9-df4e0bad332b',
   u'key': u'country',
   u'package_id': u'20503922-5f25-4f12-aa7a-d5fbe75f4f31',
   u'revision_id': u'3bcc55c8-f438-4061-90cf-7899561b3402',
   u'revision_timestamp': u'2015-02-04T15:21:17.601387',
   u'state': u'active',
   u'value': u'CF'},
  {u'id': u'58c0efed-304c-4e90-9333-120f9b02a314',
   u'key': u'data_updated',
   u'package_id': u'20503922-5f25-4f12-aa7a-d5fbe75f4f31',
   u'revision_id': u'446a1905-0baa-4d22-a160-01cfd0235f0f',
   u'revision_timestamp': u'2016-06-0

In [None]:
transactions.find_one()

{u'@generated-datetime': u'2016-06-25T22:49:09',
 u'@ref': u'248592',
 u'@w210-activity': u'XM-OCHA-FTS-CAR-15/P-HR-RL/94360/R/13864',
 u'@w210-key': u'ocha_fts-central_african_republic_2015',
 u'_id': ObjectId('578270e7b6b1a62fd0a94380'),
 u'finance-type': {u'@code': u'110'},
 u'fts:contribution-category': {u'#text': u'Allocation',
  u'@code': u'1',
  u'@xmlns:fts': u'nothing'},
 u'provider-org': {u'@ref': u'XM-OCHA-FTS7622',
  u'narrative': u'Common Humanitarian Fund'},
 u'receiver-org': {u'@ref': u'XM-OCHA-FTS13864',
  u'narrative': u'Vitalit\xe9 Plus'},
 u'transaction-date': {u'@iso-date': u'2015-08-28'},
 u'transaction-type': {u'@code': u'3'},
 u'value': {u'#text': u'231332', u'@value-date': u'2015-08-28'}}

## Organizations

In [None]:
organization_failures={}
organization_xml_failures={}

In [None]:
def import_organization_xml(f, xml):
    """Parse an organisation file and store its metadata and organisations.

    Parsing happens before the ``try`` block, so a parse error propagates to
    the caller. Any exception raised while storing the parsed document is
    kept in ``organization_failures`` under ``f.name`` instead of being
    raised, mirroring how activity imports fill ``activity_failures``.

    Args:
        f: file object the payload was read from; only ``f.name`` is used.
        xml: the XML payload to parse.

    Side effects:
        Inserts the file-level remainder (merged with the registry metadata
        when the key is known) into ``organizations_metadata`` and the tagged
        organisations into ``organizations``.
    """
    i = xmltodict.parse(xml)

    try:
        # Extract the organizations, but store the remainder as metadata we
        # can join with for other purposes.
        entries = i['iati-organisations'].pop('iati-organisation')

        # The key is the file name without directory or extension; deriving it
        # from the path avoids slicing offsets tied to one directory name.
        metadata_key = os.path.splitext(os.path.basename(f.name))[0]

        if metadata_key in organization_metadata_dict:
            i.update(organization_metadata_dict[metadata_key])

        # Save this alongside each entry

        i['@w210-key'] = metadata_key

        organizations_metadata.insert_one(i)

        # None when the file does not state when it was generated.
        generated_datetime = i['iati-organisations'].get('@generated-datetime')

        # A single organisation is not wrapped in a list; normalise it.
        if not isinstance(entries, list):
            entries = [entries]

        for entry in entries:
            entry['@w210-key'] = metadata_key
            entry['@generated-datetime'] = generated_datetime

        organizations.insert_many(entries)
    except Exception as e:
        organization_failures[f.name] = e

In [None]:
def import_organization_document(f):
    """Read one downloaded organisation file and import it, recording failures.

    The payload is expected to contain the ``iati-organisations`` tag. When
    it is not found, a payload containing ``iati-activities`` is recorded as
    ``'activities'``; otherwise the encoding is detected with chardet and the
    payload decoded before looking again, and a payload that still lacks the
    tag is recorded as ``'html'``.

    When the import of a not-yet-decoded payload fails, one retry is made
    with the chardet-detected encoding.

    Args:
        f: open file object; ``f.read()`` supplies the payload and ``f.name``
            is the key used in ``organization_xml_failures``.

    Returns:
        None. Failures are stored in ``organization_xml_failures`` as one of
        the strings above or as the exception that was raised.
    """
    global organization_failures, organization_xml_failures

    # The XML file may be encoded as UTF-16 or really any other encoding.
    # If it doesn't contain the "<iati-organisations" tag, then it may be
    # due to an encoding problem.

    xml=f.read()
    decoded = False

    if xml.find('iati-organisations') == -1:
        if xml.find('iati-activities') != -1:
            organization_xml_failures[f.name] = 'activities'
            return

        try:
            charset = chardet.detect(xml)['encoding']
            xml = xml.decode(charset)
        except Exception as e:
            # chardet may report no encoding at all and decoding may fail;
            # record that against this file instead of letting it escape.
            organization_xml_failures[f.name] = e
            return

        decoded = True

        # If after decoding the file, we still don't have the tag we were
        # looking for, we probably wound up with an HTML file.

        if xml.find('iati-organisations') == -1:
            organization_xml_failures[f.name] = 'html'
            return

    try:
        import_organization_xml(f, xml)
        return
    except Exception as e:
        first_error = e

    if decoded:
        # The payload has already been through detect/decode; doing that again
        # on decoded text would only replace the real error with an unrelated
        # one, so report the import error itself.
        organization_xml_failures[f.name] = first_error
        return

    try:
        charset = chardet.detect(xml)['encoding']
        xml = xml.decode(charset)

        import_organization_xml(f, xml)
    except Exception as e:
        organization_xml_failures[f.name] = e

In [None]:
organization_failures={}
organization_xml_failures={}

path_to_data = 'iati-orgs'

files = os.listdir(path_to_data)

print datetime.now(), 'Started processing'

for num, i in enumerate(os.listdir(path_to_data)):
    if num % 100 == 0:
        print datetime.now(), 'Processed', num, 'of', len(files)

    if not i.endswith(".xml"):
        continue

    with open(path_to_data+'/'+i) as f:
        try:
            import_organization_document(f)
        except:
            print f.name

print datetime.now(), 'Finished processing'

2016-07-10 16:39:53.533116 Started processing
2016-07-10 16:39:53.533762 Processed 0 of 209
2016-07-10 16:39:54.442811 Processed 100 of 209
2016-07-10 16:40:00.209636 Processed 200 of 209
2016-07-10 16:40:00.316879 Finished processing


In [None]:
print organizations.count()
print len(organization_xml_failures)

7131
6


In [None]:
print organizations_metadata.count()

203


In [None]:
organizations_metadata.find_one()

{u'@w210-key': u'womankindworld-org',
 u'_id': ObjectId('57827a59b6b1a62fd0d4805f'),
 u'author': None,
 u'author_email': u'sarahj@womankind.org.uk',
 u'extras': [{u'id': u'712042d0-c6f4-4f5c-b9f6-7455a6e969e2',
   u'key': u'activity_count',
   u'package_id': u'8e839de8-764d-43ca-b34d-32d6e2a28fe4',
   u'revision_id': u'8dc8fab3-0cfb-46f7-b498-42ab1c57b157',
   u'revision_timestamp': u'2015-01-16T14:16:04.818974',
   u'state': u'active',
   u'value': u''},
  {u'id': u'cf6065e0-611f-431c-8d45-87891065a09a',
   u'key': u'activity_period-from',
   u'package_id': u'8e839de8-764d-43ca-b34d-32d6e2a28fe4',
   u'revision_id': u'8dc8fab3-0cfb-46f7-b498-42ab1c57b157',
   u'revision_timestamp': u'2015-01-16T14:16:04.818974',
   u'state': u'active',
   u'value': u''},
  {u'id': u'985e16e9-04ae-4fa8-8114-8fb18ccbf430',
   u'key': u'activity_period-to',
   u'package_id': u'8e839de8-764d-43ca-b34d-32d6e2a28fe4',
   u'revision_id': u'8dc8fab3-0cfb-46f7-b498-42ab1c57b157',
   u'revision_timestamp': u'20

## Backups

In [None]:
def create_backup(collection):
    """Dump one collection of the ``iati`` database and upload the archive to S3.

    Removes the previous local archive and dump files for ``collection``,
    runs ``mongodump`` for it, packs the resulting ``.bson`` and
    ``.metadata.json`` files into ``dump.<collection>.tar.gz`` and copies
    that archive to ``s3://mdang.w210/``.

    Args:
        collection: name of the collection; it is interpolated as-is into
            the shell commands below.
    """
    print 'Removing previous backup...'
    # Clear the old archive and the old dump files for this collection
    !rm dump.{collection}.tar.gz
    !rm dump/iati/{collection}.bson
    !rm dump/iati/{collection}.metadata.json

    print 'Starting mongodump...'
    # The tar step below picks the dump up from dump/iati/
    !mongodump --db iati --collection {collection}

    print 'Creating archive...'
    # Archive first, then compress: gzip turns the .tar into .tar.gz
    !tar -cf dump.{collection}.tar dump/iati/{collection}.bson dump/iati/{collection}.metadata.json
    !gzip dump.{collection}.tar

    print 'Uploading to S3...'
    !aws s3 cp dump.{collection}.tar.gz s3://mdang.w210/

In [None]:
create_backup('activities')
create_backup('activities_metadata')

Removing previous backup...
Starting mongodump...
2016-07-10T16:40:01.081+0000	writing iati.activities to
2016-07-10T16:40:04.082+0000	[#############...........]  iati.activities  322410/579320  (55.7%)
2016-07-10T16:40:06.343+0000	[########################]  iati.activities  579320/579320  (100.0%)
2016-07-10T16:40:06.343+0000	done dumping iati.activities (579320 documents)
Creating archive...
Uploading to S3...
upload: ./dump.activities.tar.gz to s3://mdang.w210/dump.activities.tar.gz
Removing previous backup...
Starting mongodump...
2016-07-10T16:40:50.441+0000	writing iati.activities_metadata to
2016-07-10T16:40:50.488+0000	done dumping iati.activities_metadata (3562 documents)
Creating archive...
Uploading to S3...
upload: ./dump.activities_metadata.tar.gz to s3://mdang.w210/dump.activities_metadata.tar.gz


In [None]:
create_backup('transactions')

Removing previous backup...
Starting mongodump...
2016-07-10T16:40:52.493+0000	writing iati.transactions to
2016-07-10T16:40:55.493+0000	[#########...............]  iati.transactions  934310/2251178  (41.5%)
2016-07-10T16:40:58.493+0000	[###################.....]  iati.transactions  1873682/2251178  (83.2%)
2016-07-10T16:40:59.703+0000	[########################]  iati.transactions  2251178/2251178  (100.0%)
2016-07-10T16:40:59.703+0000	done dumping iati.transactions (2251178 documents)
Creating archive...
Uploading to S3...
upload: ./dump.transactions.tar.gz to s3://mdang.w210/dump.transactions.tar.gz


In [None]:
create_backup('organizations')
create_backup('organizations_metadata')

Removing previous backup...
Starting mongodump...
2016-07-10T16:41:17.548+0000	writing iati.organizations to
2016-07-10T16:41:17.590+0000	done dumping iati.organizations (7131 documents)
Creating archive...
Uploading to S3...
upload: ./dump.organizations.tar.gz to s3://mdang.w210/dump.organizations.tar.gz
Removing previous backup...
Starting mongodump...
2016-07-10T16:41:18.986+0000	writing iati.organizations_metadata to
2016-07-10T16:41:18.989+0000	done dumping iati.organizations_metadata (203 documents)
Creating archive...
Uploading to S3...
upload: ./dump.organizations_metadata.tar.gz to s3://mdang.w210/dump.organizations_metadata.tar.gz


In [None]:
# Keep the XML-level failure log for later inspection. Pickle files are
# opened in binary mode so the data is written unchanged on every platform.
with open("activity_xml_failures",'wb') as f:
    pickle.dump(activity_xml_failures,f)

In [None]:
# Keep the activity-structure failure log for later inspection. Pickle files
# are opened in binary mode so the data is written unchanged on every platform.
with open("activity_failures",'wb') as f:
    pickle.dump(activity_failures,f)

In [None]:
# Keep the organization XML failure log for later inspection. Pickle files
# are opened in binary mode so the data is written unchanged on every platform.
with open("organization_xml_failures",'wb') as f:
    pickle.dump(organization_xml_failures,f)

# Investigating/Fixing Errors

## Activities

In [None]:
p = pprint.PrettyPrinter()
p.pprint(activity_xml_failures)

{'iati/ausgov-889.xml': 'html',
 'iati/ausgov-998.xml': 'html',
 'iati/cafod-multiple.xml': 'html',
 'iati/cafod-nepal.xml': 'html',
 'iati/ciuk-0001.xml': 'html',
 'iati/cordaid-activities.xml': ExpatError('unclosed token: line 2402, column 44',),
 'iati/ec-devco-998.xml': ExpatError('unclosed token: line 1, column 9999982',),
 'iati/gl-3.xml': 'html',
 'iati/plannlno-89.xml': 'html',
 'iati/plannlno-bd.xml': 'html',
 'iati/plannlno-br.xml': 'html',
 'iati/plannlno-ke.xml': 'html',
 'iati/sida-agreements.xml': 'html',
 'iati/sida-ck.xml': 'html',
 'iati/sida-org.xml': 'html',
 'iati/sossaheluk-activities.xml': 'html',
 'iati/utz-org.xml': 'html'}


**Website?**

* ausgov-889.xml
* ausgov-998.xml
* gl-3.xml
* sossaheluk-activities.xml

**Missing (403 Error)**

* cafod-multiple.xml
* cafod-nepal.xml
* plannlno-89.xml
* plannlno-bd.xml
* plannlno-br.xml
* plannlno-ke.xml
* utz-org.xml

**Missing (404 Error)**

* sida-agreements.xml
* sida-ck.xml
* sida-org.xml

**Truncated XML (incomplete file)**

* cordaid-activities.xml (truncated)
* ec-devco-998.xml (truncated)

In [None]:
p.pprint(activity_failures)

{}


This run recorded no activity failures. (The 11 activity failures seen in an earlier run were all empty.)

## Organizations

In [None]:
p.pprint(organization_xml_failures)

{'iati-orgs/af-14.xml': 'activities',
 'iati-orgs/dorcas-org.xml': 'html',
 'iati-orgs/ifad-org.xml': 'activities',
 'iati-orgs/kpmgea-org.xml': 'activities',
 'iati-orgs/somo-org.xml': 'html',
 'iati-orgs/unitedstates-org_peacecorps.xml': 'html'}


In [None]:
p.pprint(organization_failures)

{}
