## Get the metadata index for the IRS Form 990 filings

In [1]:
import os, sys
#import requests
#import untangle
import xmltodict
import urllib.request
import pandas as pd
from pymongo import MongoClient
from datetime import datetime, timedelta
from IPython.display import display, HTML
import xml.etree.cElementTree as et
from irsx.xmlrunner import XMLRunner
import json

In [2]:
# Index year to process; the source URL and the cache file name are derived from it
year = 2017

# Where the index JSON for that year is published, and the local pickle that caches it
sourceURL = 'https://s3.amazonaws.com/irs-form-990/index_{:d}.json'.format(year)
pkl_name = "./IRS990_{:d}.pkl".format(year)

print('Working on year: {:d}'.format(year))

Working on year: 2017


In [3]:
# Load the filing index for `year`, preferring the pickle cached on disk so the
# index JSON only has to be downloaded once.
if os.path.isfile(pkl_name):
    # Read from disk.
    # NOTE(review): unpickling executes arbitrary code; only load cache files
    # that this notebook wrote itself.
    df = pd.read_pickle(pkl_name)
else:
    # Download from URL. The first column of the parsed JSON holds one record
    # (dict) per filing -- presumably the only top-level key; confirm if the
    # index format changes.
    index_raw = pd.read_json(sourceURL)
    records = index_raw[index_raw.columns[0]].tolist()

    # Build the frame from all records in one call; expanding every record
    # with `.apply(pd.Series)` creates one Series per row and is far slower
    # on an index of this size.
    df = pd.DataFrame(records)

    # Save version to disk
    df.to_pickle(pkl_name)

# Show only a preview instead of dumping the whole index
df.head(10)

Unnamed: 0,DLN,EIN,FormType,LastUpdated,ObjectId,OrganizationName,SubmittedOn,TaxPeriod,URL
0,93493243000066,042662873,990,2017-01-11T22:15:15,201612439349300006,ELKS BUILDING CORP OF NORWOOD,2017-01-04,201603,https://s3.amazonaws.com/irs-form-990/20161243...
1,93493243000266,042964630,990,2017-01-11T22:15:15,201612439349300026,NEIGHBORHOOD OF AFFORDABLE HOUSING INC,2017-01-04,201512,https://s3.amazonaws.com/irs-form-990/20161243...
2,93493243003416,382912028,990,2017-01-11T22:15:15,201612439349300341,RELEAF MICHIGAN INC,2017-01-04,201512,https://s3.amazonaws.com/irs-form-990/20161243...
3,93493243005166,200509226,990,2017-01-11T22:15:15,201612439349300516,ST MICHAEL ALBERTVILLE FOOTBALL BOOSTER CLUB,2017-01-04,201605,https://s3.amazonaws.com/irs-form-990/20161243...
4,93493243005466,202699020,990,2017-01-11T22:15:15,201612439349300546,KARLA SMITH FOUNDATION,2017-01-04,201512,https://s3.amazonaws.com/irs-form-990/20161243...
5,93493243006016,251525572,990,2017-01-11T22:15:15,201612439349300601,ENTERPRISE DEVELOPMENT FUND OF ERIE COUNTY INC,2017-01-04,201512,https://s3.amazonaws.com/irs-form-990/20161243...
6,93493243006216,810593280,990,2017-01-11T22:15:15,201612439349300621,HCANJ FOUNDATION INC,2017-01-04,201512,https://s3.amazonaws.com/irs-form-990/20161243...
7,93493243007466,561434187,990,2017-01-11T22:15:15,201612439349300746,ROSEWOOD VOLUNTEER FIRE DEPT INC,2017-01-04,201606,https://s3.amazonaws.com/irs-form-990/20161243...
8,93493243008616,936027198,990,2017-01-11T22:15:15,201612439349300861,EUGENE ROTARY SCHOLARSHIP FOUNDATION,2017-01-04,201606,https://s3.amazonaws.com/irs-form-990/20161243...
9,93491244006016,274351424,990PF,2017-01-11T22:15:15,201612449349100601,JOHN AND MARGARET SNYDER FOUNDATION,2017-01-04,201512,https://s3.amazonaws.com/irs-form-990/20161244...


In [4]:
# Example: display the first index record to see the fields stored for one filing
df.iloc[0]

DLN                                                    93493243000066
EIN                                                         042662873
FormType                                                          990
LastUpdated                                       2017-01-11T22:15:15
ObjectId                                           201612439349300006
OrganizationName                        ELKS BUILDING CORP OF NORWOOD
SubmittedOn                                                2017-01-04
TaxPeriod                                                      201603
URL                 https://s3.amazonaws.com/irs-form-990/20161243...
Name: 0, dtype: object

## Open MongoDB

In [23]:
# Set to True to empty this year's collection before loading filings again
DUMP_DB = True

uri = "mongodb://mongo/tweets"
client = MongoClient(uri)

print('List of databases in MongoDB:')
print(client.list_database_names())

# Database that holds the filings; each year gets its own collection,
# named after the year
db = client['irs990']
dbYear = db[str(year)]

# When a reset is requested and the collection already has documents,
# drop it so the load starts from scratch (only the collection is dropped,
# even though the message says "database")
if DUMP_DB:
    if dbYear.count_documents({}) > 0:
        print('Drop database and start over with archive')
        dbYear.drop()

List of databases in MongoDB:
['admin', 'config', 'irs990', 'local']
Drop database and start over with archive


## Read each XML and load into MongoDB

In [53]:
# Parse the first MAX_FILINGS filings of the index with IRSx and upsert the
# main form of each filing, together with its index metadata, into `dbYear`
# (one document per filing, keyed by ObjectId).

# Upper bound on the number of index rows handled in one run of this cell
MAX_FILINGS = 500

# Schedule names of the main return forms; every other schedule is skipped
MAIN_FORMS = ('IRS990', 'IRS990EZ', 'IRS990PF')

# Create the parser in this cell so it does not depend on a variable left
# over from an earlier kernel session (it is not defined anywhere else)
xml_runner = XMLRunner()

# Stop at the end of the index when it holds fewer than MAX_FILINGS rows
for x in range(min(MAX_FILINGS, len(df))):

    # select 990 to download
    case = df.iloc[x]

    # EXAMPLE:
    #
    # EIN                 042662873
    # TaxPeriod           201603
    # DLN                 93493243000066
    # FormType            990
    # URL                 https://s3.amazonaws.com/irs-form-990/...
    # OrganizationName    ELKS BUILDING CORP OF NORWOOD
    # SubmittedOn         2017-01-04
    # ObjectId            201612439349300006
    # LastUpdated         2017-01-11T22:15:15

    # Progress line; '\r' returns the cursor to the start of the line
    sys.stdout.write('%s: [%s] %s\r' % (x, case['ObjectId'], case['OrganizationName']))
    sys.stdout.flush()

    try:
        print('\tDownload %s' % case['URL'])

        parsed_filing = xml_runner.run_filing(case['ObjectId'])
        result = parsed_filing.get_result()

    except Exception as e:
        # Best effort: report the failing filing and carry on with the next one
        print('ERROR ON DOWNLOAD: %s' % e)
        continue

    # Index metadata stored alongside the parsed form
    metadata = case.to_dict()

    # `result or []`: a filing without a parsed result is treated as having
    # no schedules instead of raising while iterating, which would abort the run
    for sked in result or []:

        if sked['schedule_name'] not in MAIN_FORMS:
            continue

        print("Schedule: %s" % sked['schedule_name'])

        # TODO: map the required fields into a flat document.
        # Need to check if fields are really normalized; the mappings below
        # appear to only work for ['schedule_name'] == 'IRS990':
        #   Website                       <- ['schedule_parts']['part_0']['WbstAddrssTxt']
        #   Desc                          <- ['schedule_parts']['part_iii']['Dsc']
        #   ActivityOrMissionDesc         <- ['schedule_parts']['part_i']['ActvtyOrMssnDsc']
        #   ZipCode                       <- ['schedule_parts']['part_0']['USAddrss_ZIPCd']
        #   EmployeeCnt                   <- ['schedule_parts']['part_i']['TtlEmplyCnt']
        #   CYTotalRevenueAmt             <- ['schedule_parts']['part_i']['CYTtlRvnAmt']
        #   CYTotalExpensesAmt            <- ['schedule_parts']['part_i']['CYTtlExpnssAmt']
        #   NetAssetsOrFundBalancesEOYAmt <- ['schedule_parts']['part_i']['NtAsstsOrFndBlncsEOYAmt']
        #   GrossReceiptsAmt              <- ['schedule_parts']['part_0']['GrssRcptsAmt']

        # A single upsert writes the schedule and the index metadata together.
        # Metadata is applied last so it wins on a key clash, exactly as the
        # former second update did.
        document = dict(sked)
        document.update(metadata)
        dbYear.update_one({"_id": case['ObjectId']}, {"$set": document}, upsert=True)

	Download https://s3.amazonaws.com/irs-form-990/201612439349300006_public.xml
Schedule: IRS990
	Download https://s3.amazonaws.com/irs-form-990/201612439349300026_public.xml
Schedule: IRS990
	Download https://s3.amazonaws.com/irs-form-990/201612439349300341_public.xml
Schedule: IRS990
	Download https://s3.amazonaws.com/irs-form-990/201612439349300516_public.xml
Schedule: IRS990
	Download https://s3.amazonaws.com/irs-form-990/201612439349300546_public.xml
Schedule: IRS990
	Download https://s3.amazonaws.com/irs-form-990/201612439349300601_public.xml
Schedule: IRS990
	Download https://s3.amazonaws.com/irs-form-990/201612439349300621_public.xml
Schedule: IRS990
	Download https://s3.amazonaws.com/irs-form-990/201612439349300746_public.xml
Schedule: IRS990
	Download https://s3.amazonaws.com/irs-form-990/201612439349300861_public.xml
Schedule: IRS990
	Download https://s3.amazonaws.com/irs-form-990/201612449349100601_public.xml
Schedule: IRS990PF
	Download https://s3.amazonaws.com/irs-form-990/

	Download https://s3.amazonaws.com/irs-form-990/201612469349300501_public.xml
Schedule: IRS990
	Download https://s3.amazonaws.com/irs-form-990/201612469349300601_public.xml
Schedule: IRS990
	Download https://s3.amazonaws.com/irs-form-990/201612509349300101_public.xml
Schedule: IRS990
	Download https://s3.amazonaws.com/irs-form-990/201612509349300166_public.xml
Schedule: IRS990
	Download https://s3.amazonaws.com/irs-form-990/201612509349300211_public.xml
Schedule: IRS990
	Download https://s3.amazonaws.com/irs-form-990/201612509349300356_public.xml
Schedule: IRS990
	Download https://s3.amazonaws.com/irs-form-990/201612509349300476_public.xml
Schedule: IRS990
	Download https://s3.amazonaws.com/irs-form-990/201612509349300501_public.xmlN
Schedule: IRS990
	Download https://s3.amazonaws.com/irs-form-990/201612509349300621_public.xml
Schedule: IRS990
	Download https://s3.amazonaws.com/irs-form-990/201612519349200021_public.xml
Schedule: IRS990EZ
	Download https://s3.amazonaws.com/irs-form-990

	Download https://s3.amazonaws.com/irs-form-990/201622109349300627_public.xml
Schedule: IRS990
	Download https://s3.amazonaws.com/irs-form-990/201602079349301200_public.xml
Schedule: IRS990
	Download https://s3.amazonaws.com/irs-form-990/201632099349301353_public.xml
Schedule: IRS990
	Download https://s3.amazonaws.com/irs-form-990/201632079349301368_public.xml
Schedule: IRS990
	Download https://s3.amazonaws.com/irs-form-990/201632039349300953_public.xml
Schedule: IRS990
	Download https://s3.amazonaws.com/irs-form-990/201602159349300225_public.xml
Schedule: IRS990
	Download https://s3.amazonaws.com/irs-form-990/201602599349100435_public.xml
Schedule: IRS990PF
	Download https://s3.amazonaws.com/irs-form-990/201612539349100406_public.xml
Schedule: IRS990PF
	Download https://s3.amazonaws.com/irs-form-990/201602599349100030_public.xml
Schedule: IRS990PF
	Download https://s3.amazonaws.com/irs-form-990/201602599349100640_public.xml
Schedule: IRS990PF
	Download https://s3.amazonaws.com/irs-for

	Download https://s3.amazonaws.com/irs-form-990/201622579349100412_public.xml
Schedule: IRS990PF
	Download https://s3.amazonaws.com/irs-form-990/201612549349100501_public.xml
Schedule: IRS990PF
	Download https://s3.amazonaws.com/irs-form-990/201612539349100401_public.xml
Schedule: IRS990PF
	Download https://s3.amazonaws.com/irs-form-990/201602589349100750_public.xml
Schedule: IRS990PF
	Download https://s3.amazonaws.com/irs-form-990/201602599349100135_public.xml
Schedule: IRS990PF
	Download https://s3.amazonaws.com/irs-form-990/201602589349100510_public.xml
Schedule: IRS990PF
	Download https://s3.amazonaws.com/irs-form-990/201602599349100420_public.xml
Schedule: IRS990PF
	Download https://s3.amazonaws.com/irs-form-990/201602589349100110_public.xml
Schedule: IRS990PF
	Download https://s3.amazonaws.com/irs-form-990/201602599349100535_public.xml
Schedule: IRS990PF
	Download https://s3.amazonaws.com/irs-form-990/201602589349100015_public.xml
Schedule: IRS990PF
	Download https://s3.amazonaws

Schedule: IRS990PF
	Download https://s3.amazonaws.com/irs-form-990/201602189349300440_public.xml
Schedule: IRS990
	Download https://s3.amazonaws.com/irs-form-990/201642169349301589_public.xml
Schedule: IRS990
	Download https://s3.amazonaws.com/irs-form-990/201632289349203713_public.xml
Schedule: IRS990EZ
	Download https://s3.amazonaws.com/irs-form-990/201632289349203468_public.xml
Schedule: IRS990EZ
	Download https://s3.amazonaws.com/irs-form-990/201632289349203398_public.xml
Schedule: IRS990EZ
	Download https://s3.amazonaws.com/irs-form-990/201632289349204603_public.xml
Schedule: IRS990EZ
	Download https://s3.amazonaws.com/irs-form-990/201632289349203743_public.xml
Schedule: IRS990EZ
	Download https://s3.amazonaws.com/irs-form-990/201632289349203373_public.xml
Schedule: IRS990EZ
	Download https://s3.amazonaws.com/irs-form-990/201632289349203233_public.xml
Schedule: IRS990EZ
	Download https://s3.amazonaws.com/irs-form-990/201632289349203158_public.xml
Schedule: IRS990EZ
	Download https

	Download https://s3.amazonaws.com/irs-form-990/201642259349200549_public.xml
Schedule: IRS990EZ
	Download https://s3.amazonaws.com/irs-form-990/201632309349200418_public.xml
Schedule: IRS990EZ
	Download https://s3.amazonaws.com/irs-form-990/201632289349204273_public.xml
Schedule: IRS990EZ
	Download https://s3.amazonaws.com/irs-form-990/201632289349204258_public.xml
Schedule: IRS990EZ
	Download https://s3.amazonaws.com/irs-form-990/201632289349203703_public.xml
Schedule: IRS990EZ
	Download https://s3.amazonaws.com/irs-form-990/201632289349202983_public.xml
Schedule: IRS990EZ
	Download https://s3.amazonaws.com/irs-form-990/201632289349201408_public.xml
Schedule: IRS990EZ
	Download https://s3.amazonaws.com/irs-form-990/201632289349201353_public.xml
Schedule: IRS990EZ
	Download https://s3.amazonaws.com/irs-form-990/201632289349200228_public.xml
Schedule: IRS990EZ
	Download https://s3.amazonaws.com/irs-form-990/201642289349202399_public.xml
Schedule: IRS990EZ
	Download https://s3.amazonaws