# Parsing Statistics of Income (SOI) Tax Tables from the IRS
John Mays | maysj@omb.nyc.gov | Created: 03/11/25 | Last Updated: 03/11/25

Data is from the "Individual income tax returns with exemptions and itemized deductions > Publication 1304" category on the [IRS.gov website](https://www.irs.gov/statistics/soi-tax-stats-individual-statistical-tables-by-size-of-adjusted-gross-income).

In [1]:
import pandas as pd
import re
from pathlib import Path
from tqdm import tqdm # cool arabic word here: taqadum (meaning: progress) = تقدم

In [2]:
# Folder holding the downloaded SOI Excel workbooks; the path is relative to
# the notebook's working directory.
data_directory = Path('../data')

## Collecting all of the files into dataframes:

In [3]:
def collect_files(dir: Path) -> dict:
    """Read every Excel workbook found directly inside ``dir``.

    Files are matched with the glob ``*.xl*`` and loaded with
    ``header=None`` so that no spreadsheet row is consumed as column names;
    columns are therefore labelled 0, 1, 2, ...

    Args:
        dir: Directory to search (not recursive).

    Returns:
        Dict mapping each workbook's file name to its DataFrame. Insertion
        order follows the order in which ``Path.glob`` yields the files.
    """
    sheets = {}
    # Search the directory that was passed in rather than a module-level
    # global, and materialise the matches so tqdm can show a total.
    for sheet_path in tqdm(list(dir.glob('*.xl*'))):
        sheets[sheet_path.name] = pd.read_excel(sheet_path, header=None)
    return sheets

In [4]:
# Load the workbooks into a dict keyed by file name.
sheets = collect_files(data_directory)

  0%|          | 0/2 [00:00<?, ?it/s]

100%|██████████| 2/2 [00:00<00:00, 14.33it/s]


## Finding the Total Returns Cells:

In [5]:
# File names of the loaded workbooks, in dict insertion order.
sheet_names = list(sheets.keys())

In [6]:
# Pick the workbook at position 1 (the second one loaded) to explore below.
sheet = sheets[sheet_names[1]]

In [7]:
def find_total_returns_cells(sheet:pd.DataFrame) -> list:
    """Locate every cell whose text starts with "taxable returns total".

    The match is case-insensitive and tolerates any mix of commas and spaces
    between the three words (e.g. ``"Taxable returns, total"``).

    Args:
        sheet: A worksheet loaded as a DataFrame; cells may hold any type.

    Returns:
        List of ``(row_label, column_label)`` tuples, ordered column by
        column and top to bottom within each column. Empty if nothing
        matches.
    """
    pattern = re.compile(r"^taxable[, ]*returns[, ]*total", flags=re.IGNORECASE)
    indices = []
    for column in sheet.columns:
        # Test cells one at a time instead of using the `.str` accessor:
        # `.str` raises AttributeError on columns that hold no strings
        # (for example an all-numeric or all-NaN column).
        for row_label, value in sheet[column].items():
            if isinstance(value, str) and pattern.match(value):
                indices.append((row_label, column))
    return indices


In [8]:
# (row, column) locations of the cells matching the "taxable returns total" label.
total_returns_cells = find_total_returns_cells(sheet)

In [9]:
# Sanity check: list each match's location together with the text stored there.
for row, col in total_returns_cells:
    matched_text = sheet.iloc[row, col]
    print(f'row: {row}, col: {col} -- {matched_text}')

row: 26, col: 0 -- Taxable returns, total
row: 53, col: 0 -- Taxable returns, total
row: 84, col: 0 -- Taxable returns, total
row: 111, col: 0 -- Taxable returns, total
row: 143, col: 0 -- Taxable returns, total
row: 172, col: 0 -- Taxable returns, total
row: 205, col: 0 -- Taxable returns, total
row: 233, col: 0 -- Taxable returns, total
row: 266, col: 0 -- Taxable returns, total
row: 292, col: 0 -- Taxable returns, total


## Finding Likely Header Rows:
In the data, the header rows are displayed like: (1) (2) (3)... or perhaps (11) (12) (13)...

These can show up in one of two ways when imported into pandas: as negative or positive values.

Either way, each header row is a run of consecutive integers across the columns, with every value differing from its neighbor by exactly 1, so it should be easy to find.


In [10]:
# A row counts as a likely header row when it has more than `threshold`
# neighbouring cell pairs that are both ints and differ by exactly 1
# (ascending or descending).
threshold = 5
likely_header_rows = []

for row_index in sheet.index:
    cells = list(sheet.iloc[row_index, :])
    # Compare each cell with the one immediately to its right.
    unit_steps = sum(
        1
        for left, right in zip(cells, cells[1:])
        if isinstance(left, int)
        and isinstance(right, int)
        and abs(right - left) == 1
    )
    if unit_steps > threshold:
        likely_header_rows.append(row_index)

In [11]:
# Display the row indices flagged as likely header rows.
likely_header_rows

[6, 33, 64, 91, 123, 152, 185, 213, 246, 272]

In [12]:
# Use the first match as the worked example for the cells that follow.
cell_index = total_returns_cells[0]
# Star-unpacking inside a subscript requires Python 3.11+.
# NOTE(review): `cell` is not referenced again in the visible cells.
cell = sheet.iloc[*cell_index]

## Finding the values of the total returns:

In [13]:
# Split the (row, column) location of the label cell into its two parts.
tr_row, tr_column = cell_index

In [14]:
# Candidate value cells: same row as the label, every column whose label is
# greater than the label's column.
possible_numeric_indices = [(tr_row, col) for col in sheet.columns if col > tr_column]

In [15]:
# Maps (row, column) -> the numeric value found at that location.
indices_and_tr_values = {}

In [16]:
for index in possible_numeric_indices:
    value = sheet.iloc[*index]
    if isinstance(value, (int, float)): # then the value is numeric & valid
        indices_and_tr_values[index] = value

In [17]:
# Display the collected location -> value mapping.
indices_and_tr_values

{(26, 1): 37532174,
 (26, 2): 3795660097,
 (26, 3): 33601376,
 (26, 4): 2537788820,
 (26, 5): 30342268,
 (26, 6): 99299119,
 (26, 7): 2988667,
 (26, 8): 42329459,
 (26, 9): 16803873,
 (26, 10): 96091083,
 (26, 11): 18389163,
 (26, 12): 16409845}

## Attaching total return values to numeric headers:

In [18]:
# Key each collected value by the numeric header that sits in the same column
# of the nearest header row above it.
numeric_headers_and_values = {}
for (value_row, value_column), value in indices_and_tr_values.items():
    # Of all header rows strictly above this value, the largest row index is the closest.
    header_rows_above = [row for row in likely_header_rows if row < value_row]
    corresponding_header_row = max(header_rows_above)
    numeric_header = sheet.iloc[corresponding_header_row, value_column]
    numeric_headers_and_values[numeric_header] = value

In [19]:
# Display the numeric header -> value mapping.
numeric_headers_and_values

{-1: 37532174,
 -2: 3795660097,
 -3: 33601376,
 -4: 2537788820,
 -5: 30342268,
 -6: 99299119,
 -7: 2988667,
 -8: 42329459,
 -9: 16803873,
 -10: 96091083,
 -11: 18389163,
 -12: 16409845}

## Associating Numeric Headers with Text Headers:

In [20]:
# NOTE(review): `limit` is not referenced in any of the visible cells, while the
# text-header look-back further down uses a literal 8 — confirm which is intended.
limit = 7

### Associating headers with indices:

In [21]:
# Maps (row, column) of each numeric header cell -> the header number itself.
indices_and_numeric_headers = {}

for header_row_index in likely_header_rows:
    row_cells = sheet.iloc[header_row_index, :]
    for column_label, cell_value in row_cells.items():
        # Only ints below 1000 are treated as header numbers.
        if not isinstance(cell_value, int) or cell_value >= 1000:
            continue
        indices_and_numeric_headers[(header_row_index, column_label)] = cell_value

In [22]:
def assemble_header(segments:list):
    """Build a lower-case, underscore-separated header from cell fragments.

    Non-string entries in ``segments`` are ignored. The remaining strings are
    joined, then broken apart at every square bracket, hyphen, space,
    underscore, backslash, forward slash or newline; the non-empty pieces are
    re-joined with single underscores.

    Returns an empty string when no usable text is present.
    """
    joined = '_'.join(part for part in segments if isinstance(part, str))
    # Splitting on runs of separator characters also collapses repeats.
    words = re.split(r"[\[\]\- _\\/\n]+", joined)
    return '_'.join(word for word in words if word).lower()

In [47]:
# For every numeric header cell, build a text header from the cells that sit
# directly above it in the same column (up to 8 rows, clipped at the sheet top).
numeric_to_text_headers = {}

for (row_pos, col_pos), numeric_header in indices_and_numeric_headers.items():
    window_start = max(row_pos - 8, 0)
    window_stop = max(row_pos, 0)  # exclusive: the numeric header row itself is left out
    cells_above = sheet.iloc[window_start:window_stop, col_pos]
    numeric_to_text_headers[numeric_header] = assemble_header(list(cells_above))

In [48]:
# Display the numeric header -> text header mapping.
numeric_to_text_headers

{-1: 'number_of_returns',
 -2: 'adjusted_gross_income_less_deficit',
 -3: 'salaries_and_wages_number_of_returns',
 -4: 'amount',
 -5: 'taxable_interest_number_of_returns',
 -6: 'amount',
 -7: 'tax_exempt_interest_number_of_returns',
 -8: 'amount',
 -9: 'dividends_number_of_returns',
 -10: 'amount',
 -11: 'state_income_tax_refunds_number_of_returns',
 -12: 'amount',
 -13: 'alimony_received_number_of_returns',
 -14: 'amount',
 -15: 'business_or_profession_net_income_less_loss_number_of_returns',
 -16: 'amount',
 -17: 'sales_of_capital_assets_net_gain_less_loss_number_of_returns',
 -18: 'amount',
 -19: 'sales_of_property_other_than_capital_assets_number_of_returns',
 -20: 'amount',
 -21: 'taxable_ira_distributions_number_of_returns',
 -22: 'amount',
 -23: 'taxable_pensions_and_annuities_number_of_returns',
 -24: 'amount',
 -25: 'rent_and_royalty_net_income_number_of_returns',
 -26: 'amount',
 -27: 'rent_and_royalty_net_loss_number_of_returns',
 -28: 'amount',
 -29: 'farm_rental_net_income

In [49]:
def supplement_inadequate_text_headers(num_to_text_headers: dict) -> dict:
    """Give bare ``'amount'`` headers the prefix of the column before them.

    For each entry whose text header is exactly ``'amount'``, the header of
    the preceding column (numeric header one step closer to zero) is looked
    up. If that header contains ``'number'``, everything before that word is
    prepended, so ``'dividends_number_of_returns'`` followed by ``'amount'``
    yields ``'dividends_amount'``. Entries with no preceding column, or whose
    preceding header lacks ``'number'``, are left untouched.

    Args:
        num_to_text_headers: Mapping of numeric header -> text header.
            Modified in place.

    Returns:
        The same dict object that was passed in.
    """
    # Iterate the dict that was passed in, not a module-level global, and take
    # a snapshot so values can be reassigned safely while looping.
    for num_header, text_header in list(num_to_text_headers.items()):
        if text_header != 'amount':
            continue
        # Step one header toward zero: -4 -> -3, 4 -> 3.
        preceding_col_index = num_header + 1 if num_header < 0 else num_header - 1
        preceding_col_name = num_to_text_headers.get(preceding_col_index)
        if preceding_col_name is None:
            # No preceding column to borrow a prefix from.
            continue
        search_result = re.search('number', preceding_col_name)
        if search_result:
            prefix = preceding_col_name[:search_result.start()]
            num_to_text_headers[num_header] = prefix + text_header
    return num_to_text_headers

In [50]:
# Fill in the bare 'amount' headers; the dict is updated in place and the
# returned dict is displayed.
supplement_inadequate_text_headers(numeric_to_text_headers)

{-1: 'number_of_returns',
 -2: 'adjusted_gross_income_less_deficit',
 -3: 'salaries_and_wages_number_of_returns',
 -4: 'salaries_and_wages_amount',
 -5: 'taxable_interest_number_of_returns',
 -6: 'taxable_interest_amount',
 -7: 'tax_exempt_interest_number_of_returns',
 -8: 'tax_exempt_interest_amount',
 -9: 'dividends_number_of_returns',
 -10: 'dividends_amount',
 -11: 'state_income_tax_refunds_number_of_returns',
 -12: 'state_income_tax_refunds_amount',
 -13: 'alimony_received_number_of_returns',
 -14: 'alimony_received_amount',
 -15: 'business_or_profession_net_income_less_loss_number_of_returns',
 -16: 'business_or_profession_net_income_less_loss_amount',
 -17: 'sales_of_capital_assets_net_gain_less_loss_number_of_returns',
 -18: 'sales_of_capital_assets_net_gain_less_loss_amount',
 -19: 'sales_of_property_other_than_capital_assets_number_of_returns',
 -20: 'sales_of_property_other_than_capital_assets_amount',
 -21: 'taxable_ira_distributions_number_of_returns',
 -22: 'taxable_ira_d