## Get the Metadata for the 990 IRS files

In [4]:
import os
#import requests
#import untangle
import xmltodict
import urllib.request
import pandas as pd
from pymongo import MongoClient
from datetime import datetime, timedelta
from IPython.display import display, HTML
import xml.etree.cElementTree as et

In [5]:
year = 2017

sourceURL = 'https://s3.amazonaws.com/irs-form-990/index_%d.json' % year
pkl_name = "./IRS990_%d.pkl" % year

print( 'Working on year: %d' % year)

Working on year: 2017


In [6]:
# Verify if we already have the file
if os.path.isfile( pkl_name ):
    # Read from disk
    df = pd.read_pickle( pkl_name )
else:
    # Download from URL
    df = pd.read_json( sourceURL )
    df = df[df.keys()[0]].apply(lambda x:pd.Series(x))
    # Save version to disk
    df.to_pickle( pkl_name )
    
df

Unnamed: 0,EIN,TaxPeriod,DLN,FormType,URL,OrganizationName,SubmittedOn,ObjectId,LastUpdated
0,042662873,201603,93493243000066,990,https://s3.amazonaws.com/irs-form-990/20161243...,ELKS BUILDING CORP OF NORWOOD,2017-01-04,201612439349300006,2017-01-11T22:15:15
1,042964630,201512,93493243000266,990,https://s3.amazonaws.com/irs-form-990/20161243...,NEIGHBORHOOD OF AFFORDABLE HOUSING INC,2017-01-04,201612439349300026,2017-01-11T22:15:15
2,382912028,201512,93493243003416,990,https://s3.amazonaws.com/irs-form-990/20161243...,RELEAF MICHIGAN INC,2017-01-04,201612439349300341,2017-01-11T22:15:15
3,200509226,201605,93493243005166,990,https://s3.amazonaws.com/irs-form-990/20161243...,ST MICHAEL ALBERTVILLE FOOTBALL BOOSTER CLUB,2017-01-04,201612439349300516,2017-01-11T22:15:15
4,202699020,201512,93493243005466,990,https://s3.amazonaws.com/irs-form-990/20161243...,KARLA SMITH FOUNDATION,2017-01-04,201612439349300546,2017-01-11T22:15:15
5,251525572,201512,93493243006016,990,https://s3.amazonaws.com/irs-form-990/20161243...,ENTERPRISE DEVELOPMENT FUND OF ERIE COUNTY INC,2017-01-04,201612439349300601,2017-01-11T22:15:15
6,810593280,201512,93493243006216,990,https://s3.amazonaws.com/irs-form-990/20161243...,HCANJ FOUNDATION INC,2017-01-04,201612439349300621,2017-01-11T22:15:15
7,561434187,201606,93493243007466,990,https://s3.amazonaws.com/irs-form-990/20161243...,ROSEWOOD VOLUNTEER FIRE DEPT INC,2017-01-04,201612439349300746,2017-01-11T22:15:15
8,936027198,201606,93493243008616,990,https://s3.amazonaws.com/irs-form-990/20161243...,EUGENE ROTARY SCHOLARSHIP FOUNDATION,2017-01-04,201612439349300861,2017-01-11T22:15:15
9,274351424,201512,93491244006016,990PF,https://s3.amazonaws.com/irs-form-990/20161244...,JOHN AND MARGARET SNYDER FOUNDATION,2017-01-04,201612449349100601,2017-01-11T22:15:15


In [7]:
# example
df.iloc[0]

EIN                                                         042662873
TaxPeriod                                                      201603
DLN                                                    93493243000066
FormType                                                          990
URL                 https://s3.amazonaws.com/irs-form-990/20161243...
OrganizationName                        ELKS BUILDING CORP OF NORWOOD
SubmittedOn                                                2017-01-04
ObjectId                                           201612439349300006
LastUpdated                                       2017-01-11T22:15:15
Name: 0, dtype: object

## Open MongoDB

In [8]:
# SET TO TRUE IF YOU WANT TO CLEAN DATABASE
DUMP_DB = False

uri = "mongodb://mongo/tweets"
client = MongoClient(uri)

print( 'List of databases in MongoDB:' )
print(client.list_database_names())

# database
db = client['irs990']

# collection
dbYear = db['%s' % year]

# if we already have documents then drop them and start
# clean on this collection
if DUMP_DB and dbYear.count_documents({}) > 0:
    print( 'Drop database and start over with archive')
    dbYear.drop()

List of databases in MongoDB:
['admin', 'config', 'local']


## Read each XML and load into MongoDB

In [42]:

for x in range( 100 ):
    
    # select 990 to download
    case = df.iloc[x]
    
    # EXAMPLE:
    #
    # EIN                 042662873
    # TaxPeriod           201603
    # DLN                 93493243000066
    # FormType            990
    # URL                 https://s3.amazonaws.com/irs-form-990/...
    # OrganizationName    ELKS BUILDING CORP OF NORWOOD
    # SubmittedOn         2017-01-04
    # ObjectId            201612439349300006
    # LastUpdated         2017-01-11T22:15:15
    
    print( 'Now working on [%s] %s' % (case['EIN'], case['OrganizationName']) )
    
    try:
        print( '\tDownlaod %s' % df.iloc[0]['URL'])
        response = urllib.request.urlopen( df.iloc[0]['URL'] )
        data = response.read()
    except Exception as e:
        print( 'ERROR ON DOWNLOAD: %s' % e )
        continue
        
    xml = xmltodict.parse( data )['Return']['ReturnData']
    temp = dict()
    for doc in xml.keys():
        if doc == '@documentCnt': continue
            
        print('\tAdd to MongoDB :%s' % xml[doc]['@documentId'])
        
        #display(xml[doc])
        dbYear.update_one( {"_id" : case['EIN']}, {"$set":xml[doc]}, upsert=True )
        dbYear.update_one( {"_id" : case['EIN']}, {"$set":df.iloc[x].to_dict()}, upsert=True )

Now working on [042662873] ELKS BUILDING CORP OF NORWOOD
	Downlaod https://s3.amazonaws.com/irs-form-990/201612439349300006_public.xml
	Add to MongoDB :RetDoc2
	Add to MongoDB :RetDoc3
	Add to MongoDB :RetDoc4
Now working on [042964630] NEIGHBORHOOD OF AFFORDABLE HOUSING INC
	Downlaod https://s3.amazonaws.com/irs-form-990/201612439349300006_public.xml
	Add to MongoDB :RetDoc2
	Add to MongoDB :RetDoc3
	Add to MongoDB :RetDoc4
Now working on [382912028] RELEAF MICHIGAN INC
	Downlaod https://s3.amazonaws.com/irs-form-990/201612439349300006_public.xml
	Add to MongoDB :RetDoc2
	Add to MongoDB :RetDoc3
	Add to MongoDB :RetDoc4
Now working on [200509226] ST MICHAEL ALBERTVILLE FOOTBALL BOOSTER CLUB
	Downlaod https://s3.amazonaws.com/irs-form-990/201612439349300006_public.xml
	Add to MongoDB :RetDoc2
	Add to MongoDB :RetDoc3
	Add to MongoDB :RetDoc4
Now working on [202699020] KARLA SMITH FOUNDATION
	Downlaod https://s3.amazonaws.com/irs-form-990/201612439349300006_public.xml
	Add to MongoDB :R

	Add to MongoDB :RetDoc2
	Add to MongoDB :RetDoc3
	Add to MongoDB :RetDoc4
Now working on [223091017] COMMUNITY LIVING CORPORATION
	Downlaod https://s3.amazonaws.com/irs-form-990/201612439349300006_public.xml
	Add to MongoDB :RetDoc2
	Add to MongoDB :RetDoc3
	Add to MongoDB :RetDoc4
Now working on [611339396] SACRED HEART VILLAGE II INC
	Downlaod https://s3.amazonaws.com/irs-form-990/201612439349300006_public.xml
	Add to MongoDB :RetDoc2
	Add to MongoDB :RetDoc3
	Add to MongoDB :RetDoc4
Now working on [760312087] WORKLIFE MINISTRY INC
	Downlaod https://s3.amazonaws.com/irs-form-990/201612439349300006_public.xml
	Add to MongoDB :RetDoc2
	Add to MongoDB :RetDoc3
	Add to MongoDB :RetDoc4
Now working on [481267982] RENEW THE HOPE INC
	Downlaod https://s3.amazonaws.com/irs-form-990/201612439349300006_public.xml
	Add to MongoDB :RetDoc2
	Add to MongoDB :RetDoc3
	Add to MongoDB :RetDoc4
Now working on [300308421] UCC XIX INC
	Downlaod https://s3.amazonaws.com/irs-form-990/201612439349300006_p

	Add to MongoDB :RetDoc2
	Add to MongoDB :RetDoc3
	Add to MongoDB :RetDoc4
Now working on [410128105] CONNEXUS ENERGY
	Downlaod https://s3.amazonaws.com/irs-form-990/201612439349300006_public.xml
	Add to MongoDB :RetDoc2
	Add to MongoDB :RetDoc3
	Add to MongoDB :RetDoc4
Now working on [930811197] OREGON POTTERS ASSOCIATION
	Downlaod https://s3.amazonaws.com/irs-form-990/201612439349300006_public.xml
	Add to MongoDB :RetDoc2
	Add to MongoDB :RetDoc3
	Add to MongoDB :RetDoc4
Now working on [464014717] PTO FARMS
	Downlaod https://s3.amazonaws.com/irs-form-990/201612439349300006_public.xml
	Add to MongoDB :RetDoc2
	Add to MongoDB :RetDoc3
	Add to MongoDB :RetDoc4
Now working on [454711722] RIVER SHOALS OF BALDWIN INC
	Downlaod https://s3.amazonaws.com/irs-form-990/201612439349300006_public.xml
	Add to MongoDB :RetDoc2
	Add to MongoDB :RetDoc3
	Add to MongoDB :RetDoc4
Now working on [364150724] OAK PARK-RIVER FOREST COMMUNITY FOUNDATION
	Downlaod https://s3.amazonaws.com/irs-form-990/201612