In [2]:
import requests
import pandas as pd
import xml.etree.ElementTree as ET
import warnings
import math
from tqdm import tqdm
from irsx.xmlrunner import XMLRunner
# Driver-side IRSx runner; used by the scratch cells at the bottom of the notebook.
xml_runner = XMLRunner()
# NOTE(review): this silences *every* warning for the whole kernel session,
# including pandas deprecation warnings.
warnings.filterwarnings('ignore')

import ipyparallel as ipp
# Connect to the ipyparallel cluster (assumes one is already running — TODO confirm
# how it is started) and show the engine ids that answered.
c = ipp.Client()
print(c.ids)
# DirectView over all engines; the cells below use it to push state and run work.
dview = c[:]

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47]


### Prepare UK Data.

```Python
# Compress the ICNPO classifier training data (CSV -> gzip-compressed pickle).
df_icnpo_classifier_training_data=pd.read_csv('../dataset/icnpo_classifier_training_data.csv', sep=',')
df_icnpo_classifier_training_data.to_pickle('../dataset/icnpo_classifier_training_data.pkl.gzip', compression='gzip')
```

### Prepare US Training Data.
Useful links:
- IRS 990 forms on AWS: https://registry.opendata.aws/irs990/
- NCCS Data Archive (data dict, NTEE codes, etc.): https://nccs-data.urban.org/index.php

In [3]:
# Download the yearly IRS 990 e-file index CSVs (2014-2018) and stack them into a
# single frame, tagging every row with the index year it came from.
index_frames=[]
for year in range(2014, 2019):
    print(str(year), 'started', end='\t')
    try:
        # NOTE(review): error_bad_lines / warn_bad_lines were deprecated in pandas 1.3
        # and removed in 2.0 (replacement: on_bad_lines='warn'). Kept as-is so the
        # cell keeps working on the pandas version this notebook was written for.
        df_index_temp=pd.read_csv('https://s3.amazonaws.com/irs-form-990/index_'+str(year)+'.csv', 
                                  error_bad_lines=False, warn_bad_lines=True # Be cautious with bad lines.
                                 )
    except Exception as exc:
        # Best effort: a year that cannot be fetched or parsed is skipped, but the
        # reason is reported instead of being silently swallowed. Catching Exception
        # (not a bare except) lets Ctrl-C / kernel interrupt stop the loop.
        print('passed', repr(exc))
        continue
    df_index_temp['YEAR']=year
    index_frames.append(df_index_temp)
    print('got')

# Concatenate once instead of re-copying the growing frame on every iteration;
# fall back to an empty frame when no year could be loaded.
df_index=pd.concat(index_frames, ignore_index=True) if index_frames else pd.DataFrame()

2014 started	

b'Skipping line 39569: expected 9 fields, saw 10\n'


got
2015 started	got
2016 started	got
2017 started	got
2018 started	got


In [4]:
# Non-null counts per column for every (index year, return type) pair.
df_index.groupby(['YEAR','RETURN_TYPE']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,RETURN_ID,FILING_TYPE,EIN,TAX_PERIOD,SUB_DATE,TAXPAYER_NAME,DLN,OBJECT_ID
YEAR,RETURN_TYPE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2014,990,163381,163381,163381,163381,163381,163381,163381,163381
2014,990EO,29466,29466,29466,29466,29466,29466,29466,29466
2014,990EZ,82937,82937,82937,82937,82937,82937,82937,82937
2014,990O,52490,52490,52490,52490,52490,52490,52490,52490
2014,990PF,59254,59254,59254,59254,59254,59254,59254,59254
2015,990,106571,106571,106571,106571,106571,106571,106571,106571
2015,990EO,21399,21399,21399,21399,21399,21399,21399,21399
2015,990EZ,59845,59845,59845,59845,59845,59845,59845,59845
2015,990O,34567,34567,34567,34567,34567,34567,34567,34567
2015,990PF,38650,38650,38650,38650,38650,38650,38650,38650


### Acquiring text data.

In [5]:
# Eyeball ten random index rows (no random_state, so the rows differ on every run).
df_index.sample(10)

Unnamed: 0,RETURN_ID,FILING_TYPE,EIN,TAX_PERIOD,SUB_DATE,TAXPAYER_NAME,RETURN_TYPE,DLN,OBJECT_ID,YEAR
985401,13998026,EFILE,202717744,201509,12/14/2016 8:23:20 AM,COMMUNITY HEALTHCARE CONNECTIONS,990,93493224026056,201602249349302605,2016
362402,11158094,EFILE,205822108,201212,1/16/2014,SARI AND THOMAS TURNER FAMILY FOUNDATION,990PF,93491308002583,201333089349100258,2014
1363365,14824315,EFILE,391279307,201612,10/11/2017 9:08:09 PM,PROJECT HOME INC,990,93493243007297,201742439349300729,2017
412583,12251969,EFILE,626047092,201406,3/10/2015,ROTARY INTERNATIONAL WINCHESTER ROTARY CLUB,990EO,93492048008185,201530489349200818,2015
854619,13757384,EFILE,460356449,201512,08/31/2016,AVON COMMUNITY DEVELOPMENT INC,990EO,93492117003046,201641179349200304,2016
308269,11946573,EFILE,841436447,201312,11/3/2014 2:55:56 PM,DESO FOUNDATION,990PF,93491133020124,201421339349102012,2014
93956,11641212,EFILE,237086112,201306,7/2/2014 11:34:57 PM,CODAC BEHAVIORAL HEALTH SERVICES OF PIMA COUNT...,990,93493133050934,201431339349305093,2014
1128667,14149216,EFILE,203385115,201607,2/14/2017 7:46:56 PM,WHOLE ELEPHANT INSTITUTE INC,990EZ,93492335003036,201633359349200303,2017
1618798,15174159,EFILE,233033396,201706,2/13/2018 1:16:28 PM,GOVERNOR MIFFLIN MUSIC ASSOCIATION CO GOVERNOR...,990EZ,93492355003007,201703559349200300,2018
544861,12883659,EFILE,330325562,201506,9/22/2015,CORONA DEL MAR HIGH SCHOOL BOOSTER,990EZ,93492231007155,201502319349200715,2015


In [6]:
@dview.parallel(block=True)
def func_text_acq(index):
    """Extract the free-text fields of one filing and record them on the engine.

    The function is wrapped by ``dview.parallel(block=True)``. It reads and
    updates the globals ``df_index``, ``done_index_list`` and
    ``df_index_text_acq`` and uses the global ``xml_runner``; all four must
    exist in the namespace where the function runs.

    Parameters
    ----------
    index : label of one row in ``df_index``; the row supplies ``OBJECT_ID``
        and ``RETURN_TYPE``.

    Returns
    -------
    None. The extracted texts are written into ``df_index`` at ``index`` and
    the row is appended to ``df_index_text_acq``.

    Notes
    -----
    ``RETURN_TYPE`` selects the parser: ``990``/``990O`` -> IRS990,
    ``990EZ``/``990EO`` -> IRS990EZ, ``990PF`` -> IRS990PF. Multiple text
    values are joined with ``'##'``. When ``run_sked`` yields ``None`` for the
    main form every text column is set to ``'VERSION_NOT_SUPPORTED'``.
    """

    ###### Define functions ######################################
    # The helpers are deliberately nested: only this function's body is shipped
    # to the engines, so module-level helpers would not exist there.

    def func_schedule_o_text(obj_id):
        """Return the '##'-joined Schedule O explanations whose reference mentions 'III'.

        Schedule O is read whenever the filing lists it (independent of the
        Part III "see Schedule O" indicator). Returns '' when the schedule, its
        parsed result, or the supplemental-information group is absent.
        """
        if 'IRS990ScheduleO' not in xml_runner.run_filing(obj_id).list_schedules():
            return ''
        IRS990ScheduleO_result=xml_runner.run_sked(obj_id, 'IRS990ScheduleO').result
        if not IRS990ScheduleO_result:
            return ''
        # The group can be missing even though a result is returned ('groups' may be {}).
        details=IRS990ScheduleO_result[0]['groups'].get('SkdOSpplmntlInfrmtnDtl', [])
        return '##'.join([s['ExplntnTxt'] for s in details
                          if 'ExplntnTxt' in s and 'III' in s.get('FrmAndLnRfrncDsc', '')])

    def func_irs990_text(obj_id):
        """Collect Part I / Part III mission texts and Schedule O text of a Form 990."""
        IRS990_result=xml_runner.run_sked(obj_id, 'IRS990').result

        ############## Check filing version.##############
        if IRS990_result is None:
            return {'IRS990_p1_ActvtyOrMssnDsc': 'VERSION_NOT_SUPPORTED',
                    'IRS990_p3_MssnDsc': 'VERSION_NOT_SUPPORTED',
                    'IRS990_p3_DscS': 'VERSION_NOT_SUPPORTED',
                    'IRS990ScheduleO_ExplntnTxt': 'VERSION_NOT_SUPPORTED',
                   }
        IRS990_result=IRS990_result[0]
        ##################################################

        # A missing part or field yields '' instead of a KeyError, matching how
        # the EZ and PF helpers treat absent parts.
        schedule_parts=IRS990_result['schedule_parts']

        ###### IRS990_p1_ActvtyOrMssnDsc #################
        IRS990_p1_ActvtyOrMssnDsc=schedule_parts.get('part_i', {}).get('ActvtyOrMssnDsc', '')
        ##################################################

        ##### IRS990_p3_MssnDsc, IRS990_p3_DscS ##########
        IRS990_p3=schedule_parts.get('part_iii', {})
        IRS990_p3_MssnDsc=IRS990_p3.get('MssnDsc', '')
        # Program descriptions live under 'Dsc' and any key containing '_Dsc'.
        IRS990_p3_DscS='##'.join([IRS990_p3[key] for key in IRS990_p3.keys() if (key=='Dsc' or '_Dsc' in key)])
        ##################################################

        return {'IRS990_p1_ActvtyOrMssnDsc': IRS990_p1_ActvtyOrMssnDsc,
                'IRS990_p3_MssnDsc': IRS990_p3_MssnDsc,
                'IRS990_p3_DscS': IRS990_p3_DscS,
                'IRS990ScheduleO_ExplntnTxt': func_schedule_o_text(obj_id),
               }

    def func_irs990ez_text(obj_id):
        """Collect Part III purpose/accomplishment texts and Schedule O text of a Form 990-EZ."""
        IRS990EZ_result=xml_runner.run_sked(obj_id, 'IRS990EZ').result

        ############## Check filing version.##############
        if IRS990EZ_result is None:
            return {'IRS990EZ_p3_PrmryExmptPrpsTxt': 'VERSION_NOT_SUPPORTED',
                    'IRS990EZ_p3_DscrptnPrgrmSrvcAccmTxt': 'VERSION_NOT_SUPPORTED',
                    'IRS990ScheduleO_ExplntnTxt': 'VERSION_NOT_SUPPORTED',
                   }
        IRS990EZ_result=IRS990EZ_result[0]
        ########################################################

        ###### IRS990EZ_p3_PrmryExmptPrpsTxt #############
        IRS990EZ_p3=IRS990EZ_result['schedule_parts'].get('ez_part_iii', {})
        IRS990EZ_p3_PrmryExmptPrpsTxt=IRS990EZ_p3.get('PrmryExmptPrpsTxt', '')
        ##################################################

        ####### IRS990EZ_p3_DscrptnPrgrmSrvcAccmTxt ######
        accomplishments=IRS990EZ_result['groups'].get('EZPrgrmSrvcAccmplshmnt', [])
        # Rows without a description are skipped rather than raising KeyError.
        IRS990EZ_p3_DscrptnPrgrmSrvcAccmTxt='##'.join([s['DscrptnPrgrmSrvcAccmTxt'] for s in accomplishments
                                                       if 'DscrptnPrgrmSrvcAccmTxt' in s])
        ##################################################

        return {'IRS990EZ_p3_PrmryExmptPrpsTxt': IRS990EZ_p3_PrmryExmptPrpsTxt,
                'IRS990EZ_p3_DscrptnPrgrmSrvcAccmTxt': IRS990EZ_p3_DscrptnPrgrmSrvcAccmTxt,
                'IRS990ScheduleO_ExplntnTxt': func_schedule_o_text(obj_id),
               }

    def func_irs990pf_text(obj_id):
        """Collect Part IX-A descriptions and Part XVI-B relationship statements of a Form 990-PF."""
        IRS990PF_result=xml_runner.run_sked(obj_id, 'IRS990PF').result

        ############## Check filing version.##############
        if IRS990PF_result is None:
            return {'IRS990PF_p9a_DscrptnTxt': 'VERSION_NOT_SUPPORTED',
                    'IRS990PF_p16b_RltnshpSttmntTxt': 'VERSION_NOT_SUPPORTED',
                   }
        IRS990PF_result=IRS990PF_result[0]
        ########################################################

        ##### IRS990PF_p9a_DscrptnTxt ##########################
        IRS990PF_p9a=IRS990PF_result['schedule_parts'].get('pf_part_ixa', {})
        IRS990PF_p9a_DscrptnTxt='##'.join([IRS990PF_p9a[key] for key in IRS990PF_p9a.keys() if 'Txt' in key])
        ########################################################

        ##### IRS990PF_p16b_RltnshpSttmntTxt ###################
        relationships=IRS990PF_result['groups'].get('PFRlnOfActyTAccmOfExmptPrps', [])
        # Rows without a statement are skipped rather than raising KeyError.
        IRS990PF_p16b_RltnshpSttmntTxt='##'.join([s['RltnshpSttmntTxt'] for s in relationships
                                                  if 'RltnshpSttmntTxt' in s])
        ########################################################

        return {'IRS990PF_p9a_DscrptnTxt':IRS990PF_p9a_DscrptnTxt,
                'IRS990PF_p16b_RltnshpSttmntTxt':IRS990PF_p16b_RltnshpSttmntTxt,
               }
    ###### Define functions ######################################

    ###### Run main function ################################
    global df_index, done_index_list, df_index_text_acq
    if index not in done_index_list:
        obj_id = df_index.loc[index, 'OBJECT_ID']
        return_type=df_index.loc[index, 'RETURN_TYPE']
        if return_type in ['990', '990O']:
            text_dict=func_irs990_text(obj_id)
        elif return_type in ['990EZ', '990EO']:
            text_dict=func_irs990ez_text(obj_id)
        elif return_type=='990PF':
            text_dict=func_irs990pf_text(obj_id)
        else:
            # Unknown return types get no text columns but are still marked done.
            text_dict={}
        # Each dict key is the destination column name; dicts keep insertion
        # order, so columns are created in the order the helpers list them.
        for column, text in text_dict.items():
            df_index.loc[index, column]=text
        done_index_list.append(index)
    # NOTE(review): the row is appended even when `index` was already processed,
    # so re-mapping the same index duplicates it in the accumulator — confirm intended.
    df_index_text_acq=pd.concat([df_index_text_acq, df_index.loc[[index]]], ignore_index=True)
    ###### Run main function ################################
    ###### Run main function ################################

In [7]:
# Start with a clean "already processed" list on the driver.
done_index_list = []

# Make the modules used by the parallel function importable names on every engine.
for engine_statement in ('import pandas as pd',
                         'from irsx.xmlrunner import XMLRunner'):
    dview.execute(engine_statement)

# Globals each engine needs: the IRSx runner, the filing index, the
# done-list, and an empty frame that accumulates the processed rows.
engine_globals = (
    ('xml_runner', XMLRunner()),
    ('df_index', df_index),
    ('done_index_list', done_index_list),
    ('df_index_text_acq', pd.DataFrame()),
)
for engine_name, engine_value in engine_globals:
    dview[engine_name] = engine_value

In [8]:
# Run the extraction across the engines for a random sample of index rows.
# NOTE(review): no random_state is set, so every run samples different filings, and
# the 10000-1 sample size is unexplained here — TODO confirm why not a round 10000.
t=func_text_acq.map(df_index.sample(10000-1).index)

In [9]:
# Collect the per-engine result frames into one frame with a fresh 0..n-1 index,
# then persist it as (at most) 20 gzip-compressed pickle chunks.
df_EIN_TXT=pd.concat(dview.gather('df_index_text_acq'), ignore_index=True)
chunk_size=math.ceil(len(df_EIN_TXT)/20)
# chunk_size is 0 for an empty frame; range() rejects a zero step, so skip saving.
if chunk_size:
    # Stop at len(df), not len(df)+1: the extra start position produced an empty
    # trailing chunk (and an IndexError on .iloc[0]) whenever the row count was an
    # exact multiple of chunk_size, e.g. 10000 rows.
    for index in range(0, len(df_EIN_TXT), chunk_size):
        df_temp=df_EIN_TXT.iloc[index:index+chunk_size]
        # File name suffix is "<first row label>-<last row label>" of the chunk.
        df_temp.to_pickle('../../dataset/EIN_TXT_2014_18.pkl.gz_sample/EIN_TXT_2014_18.pkl.gz'+'-'+str(df_temp.index[0])+'-'+str(df_temp.index[-1]), compression='gzip')

### Draft (scratch cells used while developing the extraction functions).

In [None]:
# Scratch: call the extraction function one index at a time on a random sample.
# `index` and `obj_id` stay behind as notebook globals after the loop; the scratch
# cells below read `obj_id`.
for index in df_index.sample(1000).index:
    obj_id=df_index.loc[index, 'OBJECT_ID']
    func_text_acq(index)

In [21]:
# Scratch: parse Schedule O for whatever `obj_id` currently holds, then list the
# schedules the filing actually contains.
IRS990ScheduleO=xml_runner.run_sked(obj_id, 'IRS990ScheduleO').result[0]
xml_runner.run_filing(obj_id).list_schedules()

['ReturnHeader990x', 'IRS990EZ', 'IRS990ScheduleA']

In [20]:
# Scratch: show the parsed Schedule O result from the kernel's current state.
IRS990ScheduleO

{'schedule_name': 'IRS990ScheduleO',
 'groups': {},
 'schedule_parts': {},
 'csv_line_array': []}

In [19]:
# Scratch: show only the repeating groups of the parsed Schedule O result.
IRS990ScheduleO['groups']

{}

In [129]:
# Scratch: list the schedules of the filing in `obj_id`. The stored output also
# carries a "Filing version 2012v2.1 isn't supported for this operation" message.
xml_runner.run_filing(obj_id).list_schedules()

Filing version 2012v2.1 isn't supported for this operation


['ReturnHeader990x',
 'IRS990',
 'IRS990ScheduleA',
 'IRS990ScheduleB',
 'IRS990ScheduleD',
 'IRS990ScheduleF',
 'IRS990ScheduleO']

In [51]:
# Scratch: pin one specific filing by object id and list its schedules.
obj_id=201623169349100822
xml_runner.run_filing(obj_id).list_schedules()

['ReturnHeader990x', 'IRS990PF', 'IRS990ScheduleB']

In [23]:
# Scratch: list the parsed 990-PF part names.
# NOTE(review): `IRS990PF_result` is not assigned at notebook level in any visible
# cell (only inside func_irs990pf_text), so this relies on leftover kernel state.
IRS990PF_result['schedule_parts'].keys()

dict_keys(['pf_part_0', 'pf_part_i', 'pf_part_ii', 'pf_part_iii', 'pf_part_iv', 'pf_part_v', 'pf_part_vi', 'pf_part_viia', 'pf_part_viib', 'pf_part_viii', 'pf_part_ixb', 'pf_part_x', 'pf_part_xi', 'pf_part_xii', 'pf_part_xiii', 'pf_part_xv', 'pf_part_xvia', 'pf_part_xvii'])