## Get the metadata for the IRS Form 990 filings

In [1]:
import os
#import requests
#import untangle
import xmltodict
import urllib.request
import pandas as pd
from pymongo import MongoClient
from datetime import datetime, timedelta
from IPython.display import display, HTML
import xml.etree.cElementTree as et

In [2]:
# Filing year whose IRS index this notebook processes
year = 2017

# Local pickle cache for that year's index, and the public URL it is fetched from
pkl_name = "./IRS990_%d.pkl" % year
sourceURL = 'https://s3.amazonaws.com/irs-form-990/index_%d.json' % year

print( 'Working on year: %d' % year)

Working on year: 2017


In [3]:
# Load the filing index for `year` into `df`, using the pickle at `pkl_name`
# as an on-disk cache so the index is only downloaded once.
if os.path.isfile( pkl_name ):
    # Read from disk.
    # NOTE: unpickling can execute arbitrary code; this is only acceptable
    # because the file is the one written by the `else` branch below.
    df = pd.read_pickle( pkl_name )
else:
    # Download from URL. The first column of the parsed JSON holds one dict
    # per filing (presumably the JSON has a single top-level key - verify
    # against the index file).
    raw_index = pd.read_json( sourceURL )
    records = raw_index[raw_index.keys()[0]]
    # Expand the dicts into columns with a single constructor call; building
    # a pd.Series per row via .apply() is far slower on a full-year index.
    # Column order follows the key order of the records; later cells access
    # columns by name only.
    df = pd.DataFrame( records.tolist() )
    # Save version to disk
    df.to_pickle( pkl_name )

# Show only the first rows instead of rendering the whole index
df.head(10)

Unnamed: 0,DLN,EIN,FormType,LastUpdated,ObjectId,OrganizationName,SubmittedOn,TaxPeriod,URL
0,93493243000066,042662873,990,2017-01-11T22:15:15,201612439349300006,ELKS BUILDING CORP OF NORWOOD,2017-01-04,201603,https://s3.amazonaws.com/irs-form-990/20161243...
1,93493243000266,042964630,990,2017-01-11T22:15:15,201612439349300026,NEIGHBORHOOD OF AFFORDABLE HOUSING INC,2017-01-04,201512,https://s3.amazonaws.com/irs-form-990/20161243...
2,93493243003416,382912028,990,2017-01-11T22:15:15,201612439349300341,RELEAF MICHIGAN INC,2017-01-04,201512,https://s3.amazonaws.com/irs-form-990/20161243...
3,93493243005166,200509226,990,2017-01-11T22:15:15,201612439349300516,ST MICHAEL ALBERTVILLE FOOTBALL BOOSTER CLUB,2017-01-04,201605,https://s3.amazonaws.com/irs-form-990/20161243...
4,93493243005466,202699020,990,2017-01-11T22:15:15,201612439349300546,KARLA SMITH FOUNDATION,2017-01-04,201512,https://s3.amazonaws.com/irs-form-990/20161243...
5,93493243006016,251525572,990,2017-01-11T22:15:15,201612439349300601,ENTERPRISE DEVELOPMENT FUND OF ERIE COUNTY INC,2017-01-04,201512,https://s3.amazonaws.com/irs-form-990/20161243...
6,93493243006216,810593280,990,2017-01-11T22:15:15,201612439349300621,HCANJ FOUNDATION INC,2017-01-04,201512,https://s3.amazonaws.com/irs-form-990/20161243...
7,93493243007466,561434187,990,2017-01-11T22:15:15,201612439349300746,ROSEWOOD VOLUNTEER FIRE DEPT INC,2017-01-04,201606,https://s3.amazonaws.com/irs-form-990/20161243...
8,93493243008616,936027198,990,2017-01-11T22:15:15,201612439349300861,EUGENE ROTARY SCHOLARSHIP FOUNDATION,2017-01-04,201606,https://s3.amazonaws.com/irs-form-990/20161243...
9,93491244006016,274351424,990PF,2017-01-11T22:15:15,201612449349100601,JOHN AND MARGARET SNYDER FOUNDATION,2017-01-04,201512,https://s3.amazonaws.com/irs-form-990/20161244...


In [4]:
# Peek at the first index record to see which fields each filing carries
df.iloc[0, :]

DLN                                                    93493243000066
EIN                                                         042662873
FormType                                                          990
LastUpdated                                       2017-01-11T22:15:15
ObjectId                                           201612439349300006
OrganizationName                        ELKS BUILDING CORP OF NORWOOD
SubmittedOn                                                2017-01-04
TaxPeriod                                                      201603
URL                 https://s3.amazonaws.com/irs-form-990/20161243...
Name: 0, dtype: object

## Open MongoDB

In [5]:
# Destructive switch: when True, documents already stored in this year's
# collection are removed before loading.
DUMP_DB = True

# Connect to the MongoDB server
uri = "mongodb://mongo/tweets"
client = MongoClient(uri)

print( 'List of databases in MongoDB:' )
print(client.list_database_names())

# Database holding the filings, with one collection per filing year
db = client['irs990']
dbYear = db['%s' % year]

# Only query the collection when a reset was requested; drop it if it
# already holds documents so the load starts from an empty collection.
if DUMP_DB:
    if dbYear.count_documents({}) > 0:
        print( 'Drop database and start over with archive')
        dbYear.drop()

List of databases in MongoDB:
['admin', 'config', 'irs990', 'local']
Drop database and start over with archive


## Read each XML and load into MongoDB

In [34]:

# Number of filings (rows of `df`, counted from the top) to load in this run.
N_FILINGS = 112


def _return_documents(return_data):
    """Yield ``(name, fields)`` for each form/schedule found in ``return_data``.

    ``return_data`` is the xmltodict mapping for the ``ReturnData`` element.
    xmltodict represents XML attributes as '@'-prefixed string entries,
    repeated elements as a list and empty elements as None. Only dict values
    can be indexed for '@documentId' and passed to ``$set``, so lists are
    flattened and every non-dict entry (e.g. '@documentCnt') is skipped.
    """
    if not isinstance(return_data, dict):
        return
    for name, value in return_data.items():
        candidates = value if isinstance(value, list) else [value]
        for fields in candidates:
            if isinstance(fields, dict):
                yield name, fields


# Never index past the end of the index frame
for x in range( min( N_FILINGS, len(df) ) ):

    # select 990 to download
    case = df.iloc[x]

    # EXAMPLE:
    #
    # EIN                 042662873
    # TaxPeriod           201603
    # DLN                 93493243000066
    # FormType            990
    # URL                 https://s3.amazonaws.com/irs-form-990/...
    # OrganizationName    ELKS BUILDING CORP OF NORWOOD
    # SubmittedOn         2017-01-04
    # ObjectId            201612439349300006
    # LastUpdated         2017-01-11T22:15:15

    print( 'Now working on [%s] %s' % (case['EIN'], case['OrganizationName']) )

    try:
        print( '\tDownload %s' % case['URL'])
        # Context manager closes the HTTP connection even if read() fails
        with urllib.request.urlopen( case['URL'] ) as response:
            data = response.read()
    except Exception as e:
        print( 'ERROR ON DOWNLOAD: %s' % e )
        continue

    # A malformed or unexpected XML file must not abort the whole run;
    # report it and move on, mirroring the download error handling above.
    try:
        xml = xmltodict.parse( data )['Return']['ReturnData']
    except Exception as e:
        print( 'ERROR ON PARSE: %s' % e )
        continue
    print(x)

    # Merge the fields of every form/schedule in file order. This produces
    # the same final document as issuing one $set per form, but needs a
    # single round trip per filing.
    fields_to_set = dict()
    for doc, content in _return_documents( xml ):
        print('\tAdd to MongoDB :%s' % content.get('@documentId'))
        #display(content)
        fields_to_set.update( content )

    # Nothing is written when the filing contains no form/schedule at all
    if fields_to_set:
        # Index metadata is applied last so it wins over same-named XML fields
        fields_to_set.update( case.to_dict() )
        # NOTE(review): documents are keyed by EIN only, so several filings
        # with the same EIN would be merged into one document - confirm
        # that this is intended.
        dbYear.update_one( {"_id" : case['EIN']}, {"$set": fields_to_set}, upsert=True )

Now working on [042662873] ELKS BUILDING CORP OF NORWOOD
	Download https://s3.amazonaws.com/irs-form-990/201612439349300006_public.xml
0
	Add to MongoDB :RetDoc2
	Add to MongoDB :RetDoc3
	Add to MongoDB :RetDoc4
Now working on [042964630] NEIGHBORHOOD OF AFFORDABLE HOUSING INC
	Download https://s3.amazonaws.com/irs-form-990/201612439349300026_public.xml
1
	Add to MongoDB :RetDoc1038000001
	Add to MongoDB :RetDoc1039100001
	Add to MongoDB :RetDoc1234500001
	Add to MongoDB :RetDoc1040000001
	Add to MongoDB :RetDoc1041900001
	Add to MongoDB :RetDoc1042400001
	Add to MongoDB :RetDoc1042800001
	Add to MongoDB :RetDoc1044400001
	Add to MongoDB :RetDoc1043400001
Now working on [382912028] RELEAF MICHIGAN INC
	Download https://s3.amazonaws.com/irs-form-990/201612439349300341_public.xml
2
	Add to MongoDB :RetDoc3
	Add to MongoDB :RetDoc6
	Add to MongoDB :RetDoc1
	Add to MongoDB :RetDoc4
	Add to MongoDB :RetDoc5
Now working on [200509226] ST MICHAEL ALBERTVILLE FOOTBALL BOOSTER CLUB
	Download ht

29
	Add to MongoDB :IRS990PF
	Add to MongoDB :DepreciationSch
	Add to MongoDB :InvestCorpStockSch
	Add to MongoDB :InvOtherSch2
	Add to MongoDB :OtherExpensesSch
	Add to MongoDB :OtherIncomeSch2
	Add to MongoDB :OthProfFeesSch
	Add to MongoDB :TaxesSchedule
Now working on [460305857] PLATTE AMBULANCE SERVICE
	Download https://s3.amazonaws.com/irs-form-990/201612459349200111_public.xml
30
	Add to MongoDB :RetDoc1234200001
	Add to MongoDB :RetDoc1039100001
	Add to MongoDB :RetDoc1051800001
	Add to MongoDB :RetDoc2278000001
Now working on [546053668] ROTARY CLUB OF WILLIAMSBURG INC
	Download https://s3.amazonaws.com/irs-form-990/201612459349200206_public.xml
31
	Add to MongoDB :RetDoc2
	Add to MongoDB :RetDoc3
	Add to MongoDB :RetDoc4
Now working on [455227971] GULF COAST GUJARATI SAMAJ INC
	Download https://s3.amazonaws.com/irs-form-990/201612459349200321_public.xml
32
	Add to MongoDB :00000001
	Add to MongoDB :00000002
Now working on [270290159] COMMON SENSE ALLIANCE
	Download https://s

57
	Add to MongoDB :IRS990
	Add to MongoDB :990A
	Add to MongoDB :990G
	Add to MongoDB :990O
Now working on [453068553] MILWAUKEE CENTER FOR CHILDREN AND YOUTH INC
	Download https://s3.amazonaws.com/irs-form-990/201602399349300330_public.xml
58
	Add to MongoDB :IRS990
	Add to MongoDB :IRS990ScheduleA
	Add to MongoDB :IRS990ScheduleB
	Add to MongoDB :IRS990ScheduleD
	Add to MongoDB :IRS990ScheduleO
Now working on [841261271] ADOLESCENT COUNSELING EXCHANGE
	Download https://s3.amazonaws.com/irs-form-990/201602399349300410_public.xml
59
	Add to MongoDB :990
	Add to MongoDB :990A
	Add to MongoDB :990D
	Add to MongoDB :990O
Now working on [222209117] EDNA ST VINCENT MILLAY SOCIETY
	Download https://s3.amazonaws.com/irs-form-990/201602399349300605_public.xml
60
	Add to MongoDB :IRS990
	Add to MongoDB :IRS990ScheduleA
	Add to MongoDB :IRS990ScheduleB
	Add to MongoDB :IRS990ScheduleD
	Add to MongoDB :IRS990ScheduleG
	Add to MongoDB :IRS990ScheduleM
	Add to MongoDB :IRS990ScheduleO
Now working 

86
	Add to MongoDB :R000001
	Add to MongoDB :R000002
	Add to MongoDB :R000003
	Add to MongoDB :R000004
	Add to MongoDB :R000005
	Add to MongoDB :R000006
Now working on [522078118] WHARTON HIGH SCHOOL BAND BOOSTERS INC
	Download https://s3.amazonaws.com/irs-form-990/201612509349300101_public.xml
87
	Add to MongoDB :RetDoc2
	Add to MongoDB :RetDoc6
	Add to MongoDB :RetDoc3
	Add to MongoDB :RetDoc4
	Add to MongoDB :RetDoc5
Now working on [161511190] HOLY FAMILY COMMUNICATIONS
	Download https://s3.amazonaws.com/irs-form-990/201612509349300166_public.xml
88
	Add to MongoDB :IRS990
	Add to MongoDB :IRS990ScheduleA
	Add to MongoDB :IRS990ScheduleB
	Add to MongoDB :IRS990ScheduleD
	Add to MongoDB :IRS990ScheduleM
	Add to MongoDB :IRS990ScheduleO
Now working on [721245195] FRIENDS OF ZOO OF ACADIANA
	Download https://s3.amazonaws.com/irs-form-990/201612509349300211_public.xml
89
	Add to MongoDB :RetDoc1038000001
	Add to MongoDB :RetDoc1039100001
	Add to MongoDB :RetDoc1234500001
	Add to MongoDB