## Get the metadata for the IRS Form 990 filings

In [None]:
import os, sys
#import requests
#import untangle
import xmltodict
import urllib.request
import pandas as pd
from pymongo import MongoClient
from datetime import datetime, timedelta
from IPython.display import display, HTML
import xml.etree.cElementTree as et

In [None]:
# Year of the IRS 990 index to process; the names below are derived from it.
year = 2017

# Local pickle file name for this year's index.
pkl_name = "./IRS990_%d.pkl" % year

# URL of the JSON index for this year.
sourceURL = 'https://s3.amazonaws.com/irs-form-990/index_%d.json' % year

print('Working on year: %d' % year)

In [None]:
# Load the filing index for the selected year, using the local pickle as a
# cache so the index only has to be downloaded once.
if os.path.isfile(pkl_name):
    # Cached copy found: read it from disk.
    df = pd.read_pickle(pkl_name)
else:
    # No cached copy: download the JSON index.
    df = pd.read_json(sourceURL)
    # The code below assumes every cell of the first column is a dict of
    # filing fields (one dict per filing) -- TODO confirm against the feed.
    filings = df[df.columns[0]]
    # Expand the dicts into columns in a single pass. This builds the same
    # frame as `.apply(lambda x: pd.Series(x))` but avoids constructing one
    # Series per row, which is very slow on a large index.
    df = pd.DataFrame(filings.tolist(), index=filings.index)
    # Save the expanded frame so later runs skip the download.
    df.to_pickle(pkl_name)

# Display the index (notebook cell output).
df

In [None]:
# Example: display the first row of the index to show which fields are
# available for a filing.
df.iloc[0]

## Open MongoDB

In [None]:
# Destructive switch: when True, documents already stored in this year's
# collection are removed before loading (see the check at the end of the cell).
DUMP_DB = True

# Connect to the MongoDB server.
uri = "mongodb://mongo/tweets"
client = MongoClient(uri)

print('List of databases in MongoDB:')
print(client.list_database_names())

# Database used for the filings; the collection is named after the year.
db = client['irs990']
dbYear = db[str(year)]

# Start from a clean collection only when requested, and only when there is
# something to remove. The count is not issued at all unless DUMP_DB is set.
if DUMP_DB:
    if dbYear.count_documents({}) > 0:
        print('Drop database and start over with archive')
        dbYear.drop()

## Read each XML and load into MongoDB

In [None]:

# xmltodict reports malformed XML as ExpatError; imported here so this cell
# does not depend on names that other cells may or may not have imported.
from xml.parsers.expat import ExpatError

# Upper bound on how many filings from the index are loaded in one run.
MAX_FILINGS = 1000

# Download each filing's XML, parse it and upsert its contents into the
# year's collection, keyed by EIN. The bound is clamped to the frame length
# so an index shorter than MAX_FILINGS does not raise IndexError.
for x in range(min(MAX_FILINGS, len(df))):

    # select 990 to download
    case = df.iloc[x]

    # EXAMPLE:
    #
    # EIN                 042662873
    # TaxPeriod           201603
    # DLN                 93493243000066
    # FormType            990
    # URL                 https://s3.amazonaws.com/irs-form-990/...
    # OrganizationName    ELKS BUILDING CORP OF NORWOOD
    # SubmittedOn         2017-01-04
    # ObjectId            201612439349300006
    # LastUpdated         2017-01-11T22:15:15

    # Single-line progress indicator: '\r' rewinds so the next write
    # overwrites this one.
    sys.stdout.write('%s: [%s] %s\r' % (x, case['EIN'], case['OrganizationName']))
    sys.stdout.flush()

    try:
        # The context manager closes the HTTP response even if read() fails.
        with urllib.request.urlopen(case['URL']) as response:
            data = response.read()
    except Exception as e:
        # Best effort: report the failed download and move on.
        print('ERROR ON DOWNLOAD: %s' % e)
        continue

    try:
        xml = xmltodict.parse(data)['Return']['ReturnData']
    except (ExpatError, KeyError, TypeError) as e:
        # Malformed XML or an unexpected layout: skip this filing rather
        # than aborting the whole run, as is done for download errors.
        print('ERROR ON PARSE: %s' % e)
        continue

    # Loop-invariant values for this filing, computed once.
    selector = {"_id": case['EIN']}
    metadata = case.to_dict()

    for doc in xml.keys():
        # Skip the document-count attributes; every other key is stored.
        if doc in ('@documentCnt', '@documentCount'):
            continue

        # Merge this document's fields into the record for the EIN, then
        # (re)apply the index metadata so it is stored alongside them.
        dbYear.update_one(selector, {"$set": xml[doc]}, upsert=True)
        dbYear.update_one(selector, {"$set": metadata}, upsert=True)