In [1]:
import requests
import pandas as pd
import xml.etree.ElementTree as ET
import warnings
import math
from tqdm import tqdm
from irsx.xmlrunner import XMLRunner
xml_runner = XMLRunner()
warnings.filterwarnings('ignore')

import ipyparallel as ipp
c = ipp.Client()
print(c.ids)
dview = c[:]

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47]


### Prepare UK Data.

```Python
# Compress icnpo training data.
df_icnpo_classifier_training_data=pd.read_csv('../dataset/icnpo_classifier_training_data.csv', sep=',')
df_icnpo_classifier_training_data.to_pickle('../dataset/icnpo_classifier_training_data.pkl.gzip', compression='gzip')
```

### Prepare US Training Data.
Useful links:
- IRS 990 forms on AWS: https://registry.opendata.aws/irs990/
- NCCS Data Archive (data dict, NTEE codes, etc.): https://nccs-data.urban.org/index.php

### TODO:
- [x] Add year of formation. - Do not use information from 990 forms: only Form 990 has "Year of Formation". Use BMF 199508 to dummy-code the organizations.
- [x] Add program descriptions in Part VIII of Form 990.

In [2]:
df_index=pd.DataFrame()
for year in range(2014, 2018):
    try:
        print(str(year), 'started', end='\t')
        df_index_temp=pd.read_csv('https://s3.amazonaws.com/irs-form-990/index_'+str(year)+'.csv', 
                                  error_bad_lines=False, warn_bad_lines=True # Be cautious with bad lines.
                                 )
        df_index_temp['YEAR']=year
        df_index=pd.concat([df_index, df_index_temp], ignore_index=True)
        print('got')
    except:
        print('passed')

2014 started	

b'Skipping line 39569: expected 9 fields, saw 10\n'


got
2015 started	got
2016 started	got
2017 started	got


In [3]:
df_index.groupby(['YEAR','RETURN_TYPE']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,RETURN_ID,FILING_TYPE,EIN,TAX_PERIOD,SUB_DATE,TAXPAYER_NAME,DLN,OBJECT_ID
YEAR,RETURN_TYPE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2014,990,163381,163381,163381,163381,163381,163381,163381,163381
2014,990EO,29466,29466,29466,29466,29466,29466,29466,29466
2014,990EZ,82937,82937,82937,82937,82937,82937,82937,82937
2014,990O,52490,52490,52490,52490,52490,52490,52490,52490
2014,990PF,59254,59254,59254,59254,59254,59254,59254,59254
2015,990,106571,106571,106571,106571,106571,106571,106571,106571
2015,990EO,21399,21399,21399,21399,21399,21399,21399,21399
2015,990EZ,59845,59845,59845,59845,59845,59845,59845,59845
2015,990O,34567,34567,34567,34567,34567,34567,34567,34567
2015,990PF,38650,38650,38650,38650,38650,38650,38650,38650


### Acquiring text data.

In [4]:
df_index.sample(10)

Unnamed: 0,RETURN_ID,FILING_TYPE,EIN,TAX_PERIOD,SUB_DATE,TAXPAYER_NAME,RETURN_TYPE,DLN,OBJECT_ID,YEAR
295849,12138709,EFILE,262037375,201312,12/29/2014,WILMETTE THEATRE EDUCATION PROJECT NFP,990,93493321045374,201423219349304537,2014
1045894,14079227,EFILE,592171928,201512,1/13/2017 9:19:09 AM,AMERICAN ACADEMY OF MATRIMONIAL LAWYERS - FLOR...,990O,93493258005036,201632589349300503,2017
113179,11759970,EFILE,161375637,201312,9/12/2014 11:05:28 PM,HOUSING VISIONS UNLIMITED INC,990,93493226026264,201412269349302626,2014
124050,11786894,EFILE,391870694,201312,9/23/2014 12:12:33 AM,UWMF FACULTY PHYSICIANS VEBA AND TRUST,990O,93493227009084,201432279349300908,2014
1248650,14689227,EFILE,201030449,201612,8/30/2017 10:42:26 AM,SELCAT TITLE HOLDING CO INC,990O,93493124008457,201701249349300845,2017
760849,13294048,EFILE,223460723,201506,03/25/2016,ART PRIDE NEW JERSEY FOUNDATION INC,990,93493030003146,201640309349300314,2016
436555,12640097,EFILE,161553655,201412,7/7/2015,FESSENDEN LAUMER AND DEANGELO ATHLETIC CLUB INC,990EZ,93492117007305,201501179349200730,2015
718567,13195723,EFILE,320357158,201412,02/10/2016,CASA JALISCO EN ESTADOS UNIDOS,990O,93493319007035,201533199349300703,2016
305267,11755930,EFILE,455358161,201312,9/11/2014 2:09:15 PM,WHEEL PASSION INC,990EZ,93492223010624,201422239349201062,2014
260,12051463,EFILE,953555022,201406,12/6/2014 1:15:54 AM,RHF HOUSING INC,990,93493316030574,201423169349303057,2014


In [5]:
done_index_list=[]
dview.execute('import pandas as pd')
dview.execute('from time import sleep')
dview.execute('from irsx.xmlrunner import XMLRunner')
dview.execute('import os')
dview['xml_runner'] = XMLRunner()
dview['df_index'] = df_index
dview['df_index_text_acq'] = pd.DataFrame()

In [37]:
dview['done_index_list'] = list(set(done_index_list))
dview['counter']=0

In [38]:
@dview.parallel(block=True)
def func_text_acq(index):
    global df_index, done_index_list, df_index_text_acq, counter
    
    ###### Define functions ######################################
    def func_irs990_text(obj_id):
        IRS990_result=xml_runner.run_sked(obj_id, 'IRS990').result

        ############## Check filing version.##############
        if IRS990_result!=None:
            IRS990_result=IRS990_result[0]
        else:
            return {'IRS990_p1_ActvtyOrMssnDsc': 'VERSION_NOT_SUPPORTED',
                    'IRS990_p3_MssnDsc': 'VERSION_NOT_SUPPORTED',
                    'IRS990_p3_DscS': 'VERSION_NOT_SUPPORTED',
                    'IRS990ScheduleO_ExplntnTxt': 'VERSION_NOT_SUPPORTED',
                   }
        ##################################################

        ###### IRS990_p1_ActvtyOrMssnDsc #################
        IRS990_p1_ActvtyOrMssnDsc=IRS990_result['schedule_parts']['part_i']['ActvtyOrMssnDsc']
        ##################################################

        ##### IRS990_p3_MssnDsc, IRS990_p3_DscS ##########
        IRS990_p3=IRS990_result['schedule_parts']['part_iii']
        IRS990_p3_MssnDsc='##'.join([IRS990_p3[key] for key in IRS990_p3.keys() if key=='MssnDsc'])
        IRS990_p3_DscS='##'.join([IRS990_p3[key] for key in IRS990_p3.keys() if (key=='Dsc' or '_Dsc' in key)])
        ##################################################

        ###### IRS990ScheduleO_ExplntnTxt #################
#         if 'InfInSkdOPrtIIIInd' in IRS990_p3.keys() and IRS990_p3['InfInSkdOPrtIIIInd']=='X' and 'IRS990ScheduleO' in xml_runner.run_filing(obj_id).list_schedules():
        if 'IRS990ScheduleO' in xml_runner.run_filing(obj_id).list_schedules():
            IRS990ScheduleO=xml_runner.run_sked(obj_id, 'IRS990ScheduleO').result[0]
            IRS990ScheduleO_ExplntnTxt='##'.join([s['ExplntnTxt'] for s in IRS990ScheduleO['groups']['SkdOSpplmntlInfrmtnDtl'] if 'III' in s['FrmAndLnRfrncDsc']])
        else:
            IRS990ScheduleO_ExplntnTxt=''
        ##################################################

        return {'IRS990_p1_ActvtyOrMssnDsc': IRS990_p1_ActvtyOrMssnDsc,
                'IRS990_p3_MssnDsc': IRS990_p3_MssnDsc,
                'IRS990_p3_DscS': IRS990_p3_DscS,
                'IRS990ScheduleO_ExplntnTxt': IRS990ScheduleO_ExplntnTxt,
               }
    
    def func_irs990ez_text(obj_id):
        IRS990EZ_result=xml_runner.run_sked(obj_id, 'IRS990EZ').result

        ############## Check filing version.##############
        if IRS990EZ_result!=None:
            IRS990EZ_result=IRS990EZ_result[0]
        else:
            return {'IRS990EZ_p3_PrmryExmptPrpsTxt': 'VERSION_NOT_SUPPORTED',
                    'IRS990EZ_p3_DscrptnPrgrmSrvcAccmTxt': 'VERSION_NOT_SUPPORTED',
                    'IRS990ScheduleO_ExplntnTxt': 'VERSION_NOT_SUPPORTED',
                   }
        ########################################################

        ###### IRS990EZ_p3_PrmryExmptPrpsTxt #############
        if 'ez_part_iii' in IRS990EZ_result['schedule_parts'].keys():
            IRS990EZ_p3=IRS990EZ_result['schedule_parts']['ez_part_iii']
            IRS990EZ_p3_PrmryExmptPrpsTxt='##'.join([IRS990EZ_p3[key] for key in IRS990EZ_p3.keys() if key=='PrmryExmptPrpsTxt'])
        else:
            IRS990EZ_p3_PrmryExmptPrpsTxt=''
        ##################################################

        ####### IRS990EZ_p3_DscrptnPrgrmSrvcAccmTxt ######
        if 'EZPrgrmSrvcAccmplshmnt' in IRS990EZ_result['groups'].keys():
            IRS990EZ_p3_DscrptnPrgrmSrvcAccmTxt='##'.join([s['DscrptnPrgrmSrvcAccmTxt'] for s in IRS990EZ_result['groups']['EZPrgrmSrvcAccmplshmnt']])
        else:
            IRS990EZ_p3_DscrptnPrgrmSrvcAccmTxt=''
        ##################################################

        ###### IRS990ScheduleO_ExplntnTxt #################
#         if 'InfInSkdOPrtIIIInd' in IRS990EZ_p3.keys() and IRS990EZ_p3['InfInSkdOPrtIIIInd']=='X' and 'IRS990ScheduleO' in xml_runner.run_filing(obj_id).list_schedules():
        if 'IRS990ScheduleO' in xml_runner.run_filing(obj_id).list_schedules():
            IRS990ScheduleO=xml_runner.run_sked(obj_id, 'IRS990ScheduleO').result[0]
            IRS990ScheduleO_ExplntnTxt='##'.join([s['ExplntnTxt'] for s in IRS990ScheduleO['groups']['SkdOSpplmntlInfrmtnDtl'] if 'III' in s['FrmAndLnRfrncDsc']])
        else:
            IRS990ScheduleO_ExplntnTxt=''
        ##################################################

        return {'IRS990EZ_p3_PrmryExmptPrpsTxt': IRS990EZ_p3_PrmryExmptPrpsTxt,
                'IRS990EZ_p3_DscrptnPrgrmSrvcAccmTxt': IRS990EZ_p3_DscrptnPrgrmSrvcAccmTxt,
                'IRS990ScheduleO_ExplntnTxt': IRS990ScheduleO_ExplntnTxt,
               }

    def func_irs990pf_text(obj_id):
        IRS990PF_result=xml_runner.run_sked(obj_id, 'IRS990PF').result

        ############## Check filing version.##############
        if IRS990PF_result!=None:
            IRS990PF_result=IRS990PF_result[0]
        else:
            return {'IRS990PF_p9a_DscrptnTxt': 'VERSION_NOT_SUPPORTED',
                    'IRS990PF_p16b_RltnshpSttmntTxt': 'VERSION_NOT_SUPPORTED',
                   }
        ########################################################

        ##### IRS990PF_p9a_DscrptnTxt ##########################
        if 'pf_part_ixa' in IRS990PF_result['schedule_parts'].keys():
            IRS990PF_p9a=IRS990PF_result['schedule_parts']['pf_part_ixa']
            IRS990PF_p9a_DscrptnTxt='##'.join([IRS990PF_p9a[key] for key in IRS990PF_p9a.keys() if 'Txt' in key])
        else:
            IRS990PF_p9a_DscrptnTxt=''
        ########################################################

        ##### IRS990PF_p16b_RltnshpSttmntTxt ###################
        if 'PFRlnOfActyTAccmOfExmptPrps' in IRS990PF_result['groups']:
            IRS990PF_p16b_RltnshpSttmntTxt='##'.join([s['RltnshpSttmntTxt'] for s in IRS990PF_result['groups']['PFRlnOfActyTAccmOfExmptPrps']])
        else:
            IRS990PF_p16b_RltnshpSttmntTxt=''
        ########################################################

        return {'IRS990PF_p9a_DscrptnTxt':IRS990PF_p9a_DscrptnTxt,
                'IRS990PF_p16b_RltnshpSttmntTxt':IRS990PF_p16b_RltnshpSttmntTxt,
               }
    ###### Define functions ######################################
    
    ###### Run main function ################################
    try:
        if index not in done_index_list:
            obj_id = df_index.loc[index, 'OBJECT_ID']
            return_type=df_index.loc[index, 'RETURN_TYPE']
            if return_type in ['990', '990O']:
                text_dict=func_irs990_text(obj_id)
                df_index.loc[index, 'IRS990_p1_ActvtyOrMssnDsc']=text_dict['IRS990_p1_ActvtyOrMssnDsc']
                df_index.loc[index, 'IRS990_p3_MssnDsc']=text_dict['IRS990_p3_MssnDsc']
                df_index.loc[index, 'IRS990_p3_DscS']=text_dict['IRS990_p3_DscS']
                df_index.loc[index, 'IRS990ScheduleO_ExplntnTxt']=text_dict['IRS990ScheduleO_ExplntnTxt']
            elif return_type in ['990EZ', '990EO']:
                text_dict=func_irs990ez_text(obj_id)
                df_index.loc[index, 'IRS990EZ_p3_PrmryExmptPrpsTxt']=text_dict['IRS990EZ_p3_PrmryExmptPrpsTxt']
                df_index.loc[index, 'IRS990EZ_p3_DscrptnPrgrmSrvcAccmTxt']=text_dict['IRS990EZ_p3_DscrptnPrgrmSrvcAccmTxt']
                df_index.loc[index, 'IRS990ScheduleO_ExplntnTxt']=text_dict['IRS990ScheduleO_ExplntnTxt']
            elif return_type in ['990PF']:
                text_dict=func_irs990pf_text(obj_id)
                df_index.loc[index, 'IRS990PF_p9a_DscrptnTxt']=text_dict['IRS990PF_p9a_DscrptnTxt']
                df_index.loc[index, 'IRS990PF_p16b_RltnshpSttmntTxt']=text_dict['IRS990PF_p16b_RltnshpSttmntTxt']
            df_index_text_acq=pd.concat([df_index_text_acq, df_index.loc[[index]]], ignore_index=True)
            done_index_list+=[index]

            #### update progress file #####
            with open('../../output/pid_log/pid_'+str(os.getpid())+'.log', 'a') as log:
                log.write(str(index)+'\n')
            #### update progress file #####

    ###### Run main function ################################

            #### Sleep 2 seconds every 300 requests, adding ~2.5 hours in total #####
            counter+=1
            if counter%300==0:
                sleep(2)
            else:
                pass
            #### Sleep 2 seconds every 300 requests, adding ~2.5 hours in total #####
    except:
        pass

In [39]:
t=func_text_acq.map(df_index.index)
done_index_list=list(dview.gather('done_index_list')) # Sync done_index_list.

In [40]:
len(set(done_index_list)), len(pd.concat(dview.gather('df_index_text_acq'), ignore_index=True))

(1515976, 1515976)

In [41]:
df_EIN_TXT=pd.concat(dview.gather('df_index_text_acq'), ignore_index=True)

In [42]:
len(df_EIN_TXT), len(df_index)

(1515976, 1515993)

In [54]:
df_bmf_9508=pd.read_csv('https://nccs-data.urban.org/data/bmf/1995/bmf.bm9508.csv')
df_ein_95=df_bmf_9508[['EIN']]
df_ein_95['95_and_before']=1
df_EIN_TXT=df_EIN_TXT.merge(df_ein_95, on='EIN', how='outer')
df_EIN_TXT['95_and_before']=df_EIN_TXT['95_and_before'].fillna(0)
df_EIN_TXT.sample(3)

Unnamed: 0,DLN,EIN,FILING_TYPE,IRS990EZ_p3_DscrptnPrgrmSrvcAccmTxt,IRS990EZ_p3_PrmryExmptPrpsTxt,IRS990PF_p16b_RltnshpSttmntTxt,IRS990PF_p9a_DscrptnTxt,IRS990ScheduleO_ExplntnTxt,IRS990_p1_ActvtyOrMssnDsc,IRS990_p3_DscS,IRS990_p3_MssnDsc,OBJECT_ID,RETURN_ID,RETURN_TYPE,SUB_DATE,TAXPAYER_NAME,TAX_PERIOD,YEAR,95_and_before
239328,93493050000000.0,112911407,EFILE,,,,,VERSION_NOT_SUPPORTED,VERSION_NOT_SUPPORTED,VERSION_NOT_SUPPORTED,VERSION_NOT_SUPPORTED,2.014005e+17,11286589.0,990,4/3/2014,MEDGAR EVERS COLLEGE AUXILIARY ENTERPRISES COR...,201306.0,2014.0,0.0
667206,93493210000000.0,222852643,EFILE,,,,,THE ORGANIZATION ALLOWS FOR STRUCTURED LEARNIN...,THE PROGRAM ALLOWS FOR STRUCTURED LEARNING ENV...,THE ORGANIZATION ALLOWS FOR STRUCTURED LEARNIN...,THE PROGRAM ALLOWS FOR STRUCTURED LEARNING ENV...,2.016221e+17,13995736.0,990,12/13/2016 5:32:45 PM,LOIS LEARNING TREE DAYCARE CENTER,201512.0,2016.0,1.0
641259,93492140000000.0,202035364,EFILE,To provide the public better appreciation of t...,To provide an annual Christmas ballet for the ...,,,,,,,2.015014e+17,12783700.0,990EZ,8/14/2015,GREAT SMOKY MOUNTAIN DANCE THEATRE INC,201412.0,2015.0,0.0


In [55]:
for index in range(0, len(df_EIN_TXT)+1, math.ceil(len(df_EIN_TXT)/20)):
    df_temp=df_EIN_TXT.loc[index:index+math.ceil(len(df_EIN_TXT)/20)-1]
    df_temp.to_pickle('../../dataset/EIN_TXT_2014_18.pkl.gz/EIN_TXT_2014_18.pkl.gz'+'-'+str(df_temp.iloc[0].name)+'-'+str(df_temp.iloc[-1].name), compression='gzip')

### Test files.

In [56]:
import os

In [58]:
df_test=pd.DataFrame()
for file in os.listdir('../../dataset/EIN_TXT_2014_18.pkl.gz/'):
    df_test=pd.concat([df_test, 
                       pd.read_pickle('../../dataset/EIN_TXT_2014_18.pkl.gz/'+file, compression='gzip')
                      ])

In [59]:
df_test.sample(10)

Unnamed: 0,DLN,EIN,FILING_TYPE,IRS990EZ_p3_DscrptnPrgrmSrvcAccmTxt,IRS990EZ_p3_PrmryExmptPrpsTxt,IRS990PF_p16b_RltnshpSttmntTxt,IRS990PF_p9a_DscrptnTxt,IRS990ScheduleO_ExplntnTxt,IRS990_p1_ActvtyOrMssnDsc,IRS990_p3_DscS,IRS990_p3_MssnDsc,OBJECT_ID,RETURN_ID,RETURN_TYPE,SUB_DATE,TAXPAYER_NAME,TAX_PERIOD,YEAR,95_and_before
95660,93493320000000.0,731317252,EFILE,,,,,,SEE ATTACHED NOTE #1.,ADMINISTRATION OF PROGRAM TO DISTRIBUTE GRANT ...,SEE ATTACHED NOTE #1.,2.014132e+17,12041185.0,990,12/4/2014 6:54:08 AM,OKLAHOMA ASSOCIATION OF AREA AGENCIES ON AGING,201312.0,2014.0,1.0
2163638,,752334117,,,,,,,,,,,,,,,,,1.0
2080579,,990192396,,,,,,,,,,,,,,,,,1.0
1738840,,591747141,,,,,,,,,,,,,,,,,1.0
238279,93493130000000.0,942897957,EFILE,,,,,,TO PROVIDE LEADERSHIP ON ISSUES WHICH AFFECT T...,NUTRITION SERVICES TO THE ELDERLY - CONGREGATE...,TO PROVIDE LEADERSHIP ON ISSUES WHICH AFFECT T...,2.017013e+17,14781893.0,990,9/28/2017 10:08:25 AM,AREA 4 AGENCY ON AGING,201606.0,2017.0,1.0
701312,93491170000000.0,800015489,EFILE,,,Investment income (dividends received on inves...,,,,,,2.015317e+17,12656418.0,990PF,7/9/2015,MOHINI JAIN FAMILY FOUNDATION,201412.0,2015.0,0.0
610176,93493160000000.0,454334363,EFILE,,,,,,"As a religious charitable organization, we aim...",WE WERE ABLE TO IMPACT YOUTHS LIVES THROUGH GU...,"As a religious charitable organization,",2.014416e+17,11901292.0,990,10/22/2014 4:35:50 PM,BREAKING THE CHAINS INC,201312.0,2014.0,0.0
2103557,,481064493,,,,,,,,,,,,,,,,,1.0
134909,93491100000000.0,262384012,EFILE,,,,,,,,,2.01611e+17,13731076.0,990PF,08/23/2016,JOHN AND LINDA WIEBE FOUNDATION INC,201512.0,2016.0,0.0
1053596,93493190000000.0,362170155,EFILE,,,,,The mission of Morris Hospital & Healthcare Ce...,MORRIS HOSPITAL & HEALTHCARE CENTERS EXIST TO ...,SHORT TERM GENERAL AND PRIMARY HEALTH CARE FOR...,MORRIS HOSPITAL & HEALTHCARE CENTERS EXIST TO ...,2.014319e+17,11912793.0,990,10/24/2014 7:37:27 PM,MORRIS HOSPITAL,201312.0,2014.0,1.0


In [60]:
len(df_test), len(df_EIN_TXT)

(2371868, 2371868)

### Draft.

In [None]:
for index in df_index.sample(1000).index:
    obj_id=df_index.loc[index, 'OBJECT_ID']
    func_text_acq(index)

In [21]:
IRS990ScheduleO=xml_runner.run_sked(obj_id, 'IRS990ScheduleO').result[0]
xml_runner.run_filing(obj_id).list_schedules()

['ReturnHeader990x', 'IRS990EZ', 'IRS990ScheduleA']

In [20]:
IRS990ScheduleO

{'schedule_name': 'IRS990ScheduleO',
 'groups': {},
 'schedule_parts': {},
 'csv_line_array': []}

In [19]:
IRS990ScheduleO['groups']

{}

In [129]:
xml_runner.run_filing(obj_id).list_schedules()

Filing version 2012v2.1 isn't supported for this operation


['ReturnHeader990x',
 'IRS990',
 'IRS990ScheduleA',
 'IRS990ScheduleB',
 'IRS990ScheduleD',
 'IRS990ScheduleF',
 'IRS990ScheduleO']

In [51]:
obj_id=201623169349100822
xml_runner.run_filing(obj_id).list_schedules()

['ReturnHeader990x', 'IRS990PF', 'IRS990ScheduleB']

In [23]:
IRS990PF_result['schedule_parts'].keys()

dict_keys(['pf_part_0', 'pf_part_i', 'pf_part_ii', 'pf_part_iii', 'pf_part_iv', 'pf_part_v', 'pf_part_vi', 'pf_part_viia', 'pf_part_viib', 'pf_part_viii', 'pf_part_ixb', 'pf_part_x', 'pf_part_xi', 'pf_part_xii', 'pf_part_xiii', 'pf_part_xv', 'pf_part_xvia', 'pf_part_xvii'])