In [1]:
import os
import time
import xml.etree.ElementTree as ET
from collections import OrderedDict

import numpy as np
import pandas as pd

In [2]:
# Data from:
# https://www.irs.gov/charities-non-profits/form-990-series-downloads

In [3]:
# Root directory that holds the download990xml_* folders; the per-folder and
# combined index files are written into this same directory.
base = 'download990xml'

In [4]:
# Collect the extracted download folders under `base`, in sorted order.
# Only directories qualify: an archive or any other plain file that shares the
# 'download990xml' prefix would make the os.listdir() call on it fail when the
# folder contents are indexed.
subs = sorted(
    text for text in os.listdir(base)
    if text.startswith('download990xml')
    and os.path.isdir(os.path.join(base, text))
)

In [5]:
# Build one CSV index per download folder with the columns Year, Form, State
# and Path (path of the XML file relative to `base`).

# Default-namespace declaration removed before parsing so that plain tag paths
# such as 'ReturnHeader/TaxYear' match in Element.find().
NAMESPACE_DECL = b'xmlns="http://www.irs.gov/efile"'

# Fixed column layout: keeps the header identical across folders even when the
# first file lacks a field, and still writes a header for an empty folder.
INDEX_COLUMNS = ['Year', 'Form', 'State', 'Path']


def _find_last(root, *paths):
    """Return the element of the last path in `paths` found under `root`.

    Every path is looked up; when several are present the later one wins.
    Returns None when none of the paths match.
    """
    found = None
    for path in paths:
        element = root.find(path)
        if element is not None:
            found = element
    return found


def _index_record(xml_bytes, path):
    """Extract one index row from the raw bytes of a return file.

    Parameters:
        xml_bytes: complete file content as bytes.
        path: value stored in the 'Path' field of the row.

    Returns:
        OrderedDict with 'Year' (int), 'Form', 'State' for the fields that are
        present in the document, plus 'Path'. Absent fields are left out.

    Raises:
        xml.etree.ElementTree.ParseError if the content is not well-formed XML.
    """
    root = ET.fromstring(xml_bytes.replace(NAMESPACE_DECL, b''))
    record = OrderedDict()

    year = _find_last(root, 'ReturnHeader/TaxYear', 'ReturnHeader/TaxYr')
    if year is not None:
        record['Year'] = int(year.text)

    form = _find_last(root, 'ReturnHeader/ReturnType', 'ReturnHeader/ReturnTypeCd')
    if form is not None:
        record['Form'] = form.text

    state = _find_last(
        root,
        'ReturnHeader/Filer/USAddress/State',
        'ReturnHeader/Filer/USAddress/StateAbbreviationCd',
    )
    if state is not None:
        record['State'] = state.text

    record['Path'] = path
    return record


print('Create an index file for each download folder:\n')
for i, sub in enumerate(subs):
    start = time.time()
    print(f'{i + 1:02} of {len(subs):02}, {sub:<21}', end = '')

    files = [
        name for name in os.listdir(os.path.join(base, sub))
        if name.endswith('public.xml')
    ]

    list0 = []
    for file in files:
        # Read bytes rather than text: the XML parser then decodes using the
        # document's own encoding declaration instead of the platform's
        # default text encoding.
        with open(os.path.join(base, sub, file), 'rb') as f:
            xml_bytes = f.read()
        list0.append(_index_record(xml_bytes, os.path.join(sub, file)))

    df0 = pd.DataFrame(list0, columns=INDEX_COLUMNS)
    # Folder 'download990xml<suffix>' gets the index file 'index<suffix>'.
    index = 'index' + sub.split('download990xml')[1]
    df0.to_csv(os.path.join(base, index), index=False)

    stop = time.time()
    print(f', {(stop - start)/60:.2f} min')

Create an index file for each download folder:

01 of 45, download990xml_2015_1, 1.21 min
02 of 45, download990xml_2015_2, 1.10 min
03 of 45, download990xml_2016_1, 1.25 min
04 of 45, download990xml_2016_2, 1.26 min
05 of 45, download990xml_2016_3, 1.24 min
06 of 45, download990xml_2016_4, 1.26 min
07 of 45, download990xml_2016_5, 1.24 min
08 of 45, download990xml_2016_6, 0.76 min
09 of 45, download990xml_2017_1, 1.25 min
10 of 45, download990xml_2017_2, 1.25 min
11 of 45, download990xml_2017_3, 1.25 min
12 of 45, download990xml_2017_4, 1.25 min
13 of 45, download990xml_2017_5, 1.25 min
14 of 45, download990xml_2017_6, 1.25 min
15 of 45, download990xml_2017_7, 0.51 min
16 of 45, download990xml_2018_1, 1.26 min
17 of 45, download990xml_2018_2, 1.27 min
18 of 45, download990xml_2018_3, 1.27 min
19 of 45, download990xml_2018_4, 1.27 min
20 of 45, download990xml_2018_5, 1.26 min
21 of 45, download990xml_2018_6, 1.24 min
22 of 45, download990xml_2018_7, 0.93 min
23 of 45, download990xml_201

In [6]:
# Collect the per-folder index files in `base` (named 'index' + folder suffix),
# in sorted order.
# The combined index is written to this same directory under the exact name
# 'index', so it is excluded here: otherwise a re-run would feed the previous
# combined file back into the merge and duplicate every row.
indexes = sorted(
    text for text in os.listdir(base)
    if text.startswith('index') and text != 'index'
)

In [7]:
# Merge every per-folder index into the single combined file `base`/index.
# All frames are read first and concatenated once: growing a frame with
# DataFrame.append inside a loop re-copies the accumulated rows on every
# iteration, and DataFrame.append no longer exists in pandas 2.0+.
frames = [pd.read_csv(os.path.join(base, name)) for name in indexes]

# pd.concat raises ValueError on an empty list; an empty frame keeps the cell
# runnable when no per-folder index files are present.
df1 = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

df1.to_csv(os.path.join(base, 'index'), index=False)

In [8]:
# Shell check: show the first lines (header plus first rows) of the combined index.
! head download990xml/index

Year,Form,State,Path
2014.0,990PF,NJ,download990xml_2015_1/201513209349103101_public.xml
2014.0,990,OH,download990xml_2015_1/201503209349307025_public.xml
2014.0,990,NY,download990xml_2015_1/201513179349306981_public.xml
2014.0,990PF,MI,download990xml_2015_1/201503009349100100_public.xml
2014.0,990,VA,download990xml_2015_1/201503149349301075_public.xml
2014.0,990,MN,download990xml_2015_1/201513369349300901_public.xml
2014.0,990,PA,download990xml_2015_1/201513169349303261_public.xml
2014.0,990EZ,MA,download990xml_2015_1/201503209349203800_public.xml
2014.0,990,MO,download990xml_2015_1/201513579349300206_public.xml


In [9]:
# Shell check: show the last lines of the combined index.
! tail download990xml/index

2020.0,990,TN,download990xml_2022/202200289349300010_public.xml
2020.0,990,IL,download990xml_2022/202220539349300112_public.xml
2020.0,990,WI,download990xml_2022/202210469349300126_public.xml
2020.0,990,NY,download990xml_2022/202200759349300400_public.xml
2020.0,990,NC,download990xml_2022/202220249349301007_public.xml
2020.0,990,NC,download990xml_2022/202230359349301528_public.xml
2020.0,990,NY,download990xml_2022/202240749349300524_public.xml
2019.0,990,AZ,download990xml_2022/202200349349300930_public.xml
2020.0,990,PA,download990xml_2022/202230409349300203_public.xml
2020.0,990,CA,download990xml_2022/202220429349302017_public.xml
