In [1]:
import warnings
# Suppress every warning for the rest of the session to keep cell output clean.
# NOTE(review): this also hides deprecation warnings raised by libraries used
# further down — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

### Prepare UK Data.

```Python
# Compress the ICNPO classifier training data (CSV -> gzip-compressed pickle).
df_icnpo_classifier_training_data=pd.read_csv('../dataset/icnpo_classifier_training_data.csv', sep=',')
df_icnpo_classifier_training_data.to_pickle('../dataset/icnpo_classifier_training_data.pkl.gzip', compression='gzip')
```

In [7]:
import pandas as pd

### Prepare US Training Data.
Useful links:
- IRS 990 forms on AWS: https://registry.opendata.aws/irs990/
- NCCS Data Archive (data dict, NTEE codes, etc.): https://nccs-data.urban.org/index.php

In [2]:
import requests
import pandas as pd
import xml.etree.ElementTree as ET

In [140]:
# Download the yearly IRS Form 990 e-file index CSVs (2014-2017), tag each row
# with the index year it came from, and combine them into a single `df_index`.
index_frames=[]  # one frame per successfully downloaded year
for year in range(2014, 2018):
    print(str(year), 'started', end='\t')
    try:
        # NOTE: error_bad_lines / warn_bad_lines were deprecated in pandas 1.3 and
        # removed in 2.0; on newer pandas the equivalent is on_bad_lines='warn'.
        df_index_temp=pd.read_csv('https://s3.amazonaws.com/irs-form-990/index_'+str(year)+'.csv',
                                  error_bad_lines=False, warn_bad_lines=True # Be cautious with bad lines.
                                 )
    except Exception as exc:
        # Best-effort download: keep going with the remaining years, but show why
        # this one failed instead of silently hiding the error. Catching Exception
        # (not a bare except) lets KeyboardInterrupt stop the cell.
        print('passed:', repr(exc))
        continue
    df_index_temp['YEAR']=year
    index_frames.append(df_index_temp)
    print('got')

# Concatenate once instead of growing the frame inside the loop; fall back to an
# empty frame when no year could be downloaded (pd.concat rejects an empty list).
df_index=pd.concat(index_frames) if index_frames else pd.DataFrame()

2014 started	

b'Skipping line 39569: expected 9 fields, saw 10\n'


got
2015 started	got
2016 started	got
2017 started	got


In [143]:
# Count the non-null values of every column per (index year, return type) to see
# how many filings of each form type each yearly index contains.
df_index.groupby(['YEAR','RETURN_TYPE']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,RETURN_ID,FILING_TYPE,EIN,TAX_PERIOD,SUB_DATE,TAXPAYER_NAME,DLN,OBJECT_ID
YEAR,RETURN_TYPE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2014,990,163381,163381,163381,163381,163381,163381,163381,163381
2014,990EO,29466,29466,29466,29466,29466,29466,29466,29466
2014,990EZ,82937,82937,82937,82937,82937,82937,82937,82937
2014,990O,52490,52490,52490,52490,52490,52490,52490,52490
2014,990PF,59254,59254,59254,59254,59254,59254,59254,59254
2015,990,106571,106571,106571,106571,106571,106571,106571,106571
2015,990EO,21399,21399,21399,21399,21399,21399,21399,21399
2015,990EZ,59845,59845,59845,59845,59845,59845,59845,59845
2015,990O,34567,34567,34567,34567,34567,34567,34567,34567
2015,990PF,38650,38650,38650,38650,38650,38650,38650,38650


### Acquiring text data.

In [311]:
# Peek at 10 random index rows. No random_state is passed, so a different set of
# rows is shown each time the cell runs.
df_index.sample(10)

Unnamed: 0,RETURN_ID,FILING_TYPE,EIN,TAX_PERIOD,SUB_DATE,TAXPAYER_NAME,RETURN_TYPE,DLN,OBJECT_ID,YEAR
268325,13821844,EFILE,810641294,201512,09/28/2016,FACE IN THE MIRROR FOUNDATION,990,93493137035456,201601379349303545,2016
126339,12790665,EFILE,861120841,201412,8/17/2015,JILL AND ERIK MASCHLER FOUNDATION INC,990PF,93491135000405,201501359349100040,2015
306978,11778333,EFILE,943091689,201312,9/18/2014 6:21:16 PM,OREGON JUNIOR GOLF FUND INC,990,93493225027924,201422259349302792,2014
38715,13150265,EFILE,620423847,201506,01/27/2016,PARIS - HENRY COUNTY CHAMBER OF COMMERCE,990O,93493314013255,201503149349301325,2016
279244,13813207,EFILE,382794339,201512,09/26/2016,INTERNATIONAL UNION OF ELEVATOR LOCAL 85,990O,93493127001596,201641279349300159,2016
204569,14681606,EFILE,251434445,201608,8/28/2017 4:17:03 PM,GAMMA OMEGA CHAPTER HOUSE ASSOCIATION - ALPHA,990O,93493105010197,201741059349301019,2017
35600,13147120,EFILE,271966049,201412,01/26/2016,MASSACHUSETTS YOUTH RUGBY ORGANIZATION INC CO ...,990EZ,93492320058225,201523209349205822,2016
38041,13150013,EFILE,954529368,201412,01/27/2016,ONE INCREDIBLE FAMILY INC,990EZ,93492313019355,201503139349201935,2016
164203,14572262,EFILE,460526297,201512,7/18/2017 5:14:29 PM,RANDOLPH SMOKERS CLUB INC,990EO,93492023006637,201730239349200663,2017
174123,12923134,EFILE,530196573,201409,10/2/2015,AMERICAN COUNCIL ON EDUCATION,990,93493226023595,201542269349302359,2015


In [37]:
from irsx.xmlrunner import XMLRunner
# Single module-level runner; func_irs990_text / func_irs990ez_text and the
# inspection cells below call run_sked / run_filing on this instance.
xml_runner = XMLRunner()

In [286]:
def func_irs990_text(obj_id):
    """Collect the free-text mission / program descriptions of a Form 990 filing.

    Parameters
    ----------
    obj_id
        Object id of the filing; passed unchanged to ``xml_runner.run_sked``.

    Returns
    -------
    dict
        ``IRS990_p1_ActvtyOrMssnDsc``  -- Part I ``ActvtyOrMssnDsc`` value.
        ``IRS990_p3_MssnDsc``          -- Part III ``MssnDsc`` value.
        ``IRS990_p3_DscS``             -- all Part III values whose key is ``Dsc``
                                          or contains ``_Dsc``, joined with ``##``.
        ``IRS990ScheduleO_ExplntnTxt`` -- Schedule O explanations whose reference
                                          mentions Part III, joined with ``##``;
                                          ``''`` when Part III does not carry the
                                          ``InfInSkdOPrtIIIInd == 'X'`` flag.

    Raises
    ------
    KeyError
        If the parsed filing lacks ``part_i`` / ``part_iii`` or their
        ``ActvtyOrMssnDsc`` / ``MssnDsc`` entries.
    """
    import re  # local import so the function does not rely on another cell's imports

    IRS990_result=xml_runner.run_sked(obj_id, 'IRS990').result[0]
    IRS990_p1_ActvtyOrMssnDsc=IRS990_result['schedule_parts']['part_i']['ActvtyOrMssnDsc']
    IRS990_p3=IRS990_result['schedule_parts']['part_iii']
    IRS990_p3_MssnDsc=IRS990_p3['MssnDsc']
    IRS990_p3_DscS='##'.join(value for key, value in IRS990_p3.items() if (key=='Dsc' or '_Dsc' in key))

    IRS990ScheduleO_ExplntnTxt=''
    if IRS990_p3.get('InfInSkdOPrtIIIInd')=='X':
        IRS990ScheduleO=xml_runner.run_sked(obj_id, 'IRS990ScheduleO').result[0]
        # A plain `'III' in ref` test also matches "Part VIII" / "Part XIII".
        # Require that "III" is not embedded in a longer Roman numeral.
        part_iii_ref=re.compile(r'(?<![IVX])III(?![IVX])')
        # Tolerate a Schedule O without the detail group, or a detail row without
        # a reference field, instead of raising KeyError.
        details=IRS990ScheduleO['groups'].get('SkdOSpplmntlInfrmtnDtl', [])
        IRS990ScheduleO_ExplntnTxt='##'.join(
            s['ExplntnTxt'] for s in details
            if part_iii_ref.search(s.get('FrmAndLnRfrncDsc') or '')
        )

    return {'IRS990_p1_ActvtyOrMssnDsc': IRS990_p1_ActvtyOrMssnDsc,
            'IRS990_p3_MssnDsc': IRS990_p3_MssnDsc,
            'IRS990_p3_DscS': IRS990_p3_DscS,
            'IRS990ScheduleO_ExplntnTxt': IRS990ScheduleO_ExplntnTxt,
           }

def func_irs990ez_text(obj_id):
    """Collect the free-text purpose / program descriptions of a Form 990-EZ filing.

    Parameters
    ----------
    obj_id
        Object id of the filing; passed unchanged to ``xml_runner.run_sked``.

    Returns
    -------
    dict
        ``IRS990EZ_p3_PrmryExmptPrpsTxt``       -- ``ez_part_iii`` ``PrmryExmptPrpsTxt`` value.
        ``IRS990EZ_p3_DscrptnPrgrmSrvcAccmTxt`` -- ``DscrptnPrgrmSrvcAccmTxt`` of every
                                                   ``EZPrgrmSrvcAccmplshmnt`` group row,
                                                   joined with ``##`` (``''`` if the
                                                   group is absent).
        ``IRS990ScheduleO_ExplntnTxt``          -- Schedule O explanations whose
                                                   reference mentions Part III, joined
                                                   with ``##``; ``''`` when Part III does
                                                   not carry ``InfInSkdOPrtIIIInd == 'X'``.

    Raises
    ------
    KeyError
        If the parsed filing lacks ``ez_part_iii`` or its ``PrmryExmptPrpsTxt`` entry.
    """
    import re  # local import so the function does not rely on another cell's imports

    IRS990EZ_result=xml_runner.run_sked(obj_id, 'IRS990EZ').result[0]
    IRS990EZ_p3=IRS990EZ_result['schedule_parts']['ez_part_iii']
    IRS990EZ_p3_PrmryExmptPrpsTxt=IRS990EZ_p3['PrmryExmptPrpsTxt']

    # Joining an empty list yields '', which matches the "group absent" case.
    accomplishments=IRS990EZ_result['groups'].get('EZPrgrmSrvcAccmplshmnt', [])
    IRS990EZ_p3_DscrptnPrgrmSrvcAccmTxt='##'.join(s['DscrptnPrgrmSrvcAccmTxt'] for s in accomplishments)

    IRS990ScheduleO_ExplntnTxt=''
    if IRS990EZ_p3.get('InfInSkdOPrtIIIInd')=='X':
        IRS990ScheduleO=xml_runner.run_sked(obj_id, 'IRS990ScheduleO').result[0]
        # A plain `'III' in ref` test also matches "Part VIII" / "Part XIII".
        # Require that "III" is not embedded in a longer Roman numeral.
        part_iii_ref=re.compile(r'(?<![IVX])III(?![IVX])')
        # Tolerate a Schedule O without the detail group, or a detail row without
        # a reference field, instead of raising KeyError.
        details=IRS990ScheduleO['groups'].get('SkdOSpplmntlInfrmtnDtl', [])
        IRS990ScheduleO_ExplntnTxt='##'.join(
            s['ExplntnTxt'] for s in details
            if part_iii_ref.search(s.get('FrmAndLnRfrncDsc') or '')
        )

    return {'IRS990EZ_p3_PrmryExmptPrpsTxt': IRS990EZ_p3_PrmryExmptPrpsTxt,
            'IRS990EZ_p3_DscrptnPrgrmSrvcAccmTxt': IRS990EZ_p3_DscrptnPrgrmSrvcAccmTxt,
            'IRS990ScheduleO_ExplntnTxt': IRS990ScheduleO_ExplntnTxt,
           }

In [290]:
# Spot-check one filing: list which schedules its XML contains.
obj_id=201622029349200602
xml_runner.run_filing(obj_id).list_schedules()

['ReturnHeader990x',
 'IRS990EZ',
 'IRS990ScheduleA',
 'IRS990ScheduleB',
 'IRS990ScheduleO']

In [315]:
# Inspect the parsed Schedule O of one 990EZ filing (the ONE INCREDIBLE FAMILY INC
# row from the sample above) to see the shape of its supplemental-information rows.
xml_runner.run_sked(201503139349201935, 'IRS990ScheduleO').result[0]

{'schedule_name': 'IRS990ScheduleO',
 'groups': {'SkdOSpplmntlInfrmtnDtl': [{'object_id': 201503139349201935,
    'ein': '954529368',
    'FrmAndLnRfrncDsc': 'Form 990EZ, Part I, Line 16',
    'ExplntnTxt': '20 TH ANNIVERSARY CELEBRATION 3449. BANK CHARGES & MERCHANT FEES 507. EASTER BASKET MAKING 4069. HALLOWEEN TRICK-OR-TREAT BAG MAKING 6521. HOMELESS FEEDING & BANNER CREATION 4479. INSURANCE 600. OFFICE SUPPLIES 88. PERMITS - BULK MAILING 220. POTTED-PLANT MAKING & SING-A-LONG 502. PROMOTIONAL ITEMS (20 TH ANNIVERSARY) 300. ROUNDING 2. VALENTINE MAKING 505. WEBSITE EXPENSES 259.'},
   {'object_id': 201503139349201935,
    'ein': '954529368',
    'FrmAndLnRfrncDsc': 'Form 990EZ, Part II, Line 24',
    'ExplntnTxt': 'PROJECT SUPPLIES 729. 968.'}]},
 'schedule_parts': {},
 'csv_line_array': []}