# Parsing Statistics of Income (SOI) Tax Tables from the IRS
John Mays | maysj@omb.nyc.gov | Created: 03/11/25 | Last Updated: 03/11/25

Data is from the "Individual income tax returns with exemptions and itemized deductions > Publication 1304" category on the [IRS.gov website](https://www.irs.gov/statistics/soi-tax-stats-individual-statistical-tables-by-size-of-adjusted-gross-income).

In [1]:
import pandas as pd
import re
from pathlib import Path
from tqdm import tqdm # cool arabic word here: taqadum (meaning: progress) = تقدم

In [2]:
# Folder searched for the SOI Excel workbooks; being a relative path, it is
# resolved against the notebook's current working directory.
data_directory = Path('../data')

## Collecting all of the files into dataframes:

In [3]:
def collect_files(dir: Path) -> dict:
    """Read every Excel workbook found directly inside a directory.

    Args:
        dir: Directory to search (non-recursively) for files matching
            ``*.xl*``. The name shadows the ``dir`` builtin but is kept so
            existing callers are unaffected.

    Returns:
        A dict mapping each file's name to the DataFrame produced by
        ``pd.read_excel(path, header=None)``, i.e. no spreadsheet row is
        promoted to column labels.
    """
    sheets = {}
    # Search the directory passed in by the caller rather than the
    # notebook-level `data_directory`; the matches are materialised into a
    # list so tqdm knows the total and can draw a progress bar.
    for sheet_path in tqdm(list(dir.glob('*.xl*'))):
        sheets[sheet_path.name] = pd.read_excel(sheet_path, header=None)
    return sheets

In [4]:
# Load the Excel workbooks into DataFrames, keyed by file name.
sheets = collect_files(data_directory)

100%|██████████| 2/2 [00:00<00:00, 14.70it/s]


## Finding the Total Returns Cells:

In [10]:
# Iterating a dict yields its keys, i.e. the workbook file names.
sheet_names = list(sheets)

In [11]:
# Use the first loaded workbook as the sheet examined in the cells below.
sheet = sheets[sheet_names[0]]

In [12]:
def find_total_returns_cells(sheet:pd.DataFrame) -> list:
    """Locate every cell whose text starts with "taxable returns total".

    The match is case-insensitive and tolerates any mix of commas and spaces
    between the words (e.g. ``"Taxable returns, total"``).

    Args:
        sheet: DataFrame to scan; cells may hold strings, numbers or NaN.

    Returns:
        A list of ``(row_label, column_label)`` tuples, ordered column by
        column and top to bottom within each column. Empty if nothing matches.
    """
    pattern = re.compile(r"^taxable[, ]*returns[, ]*total", flags=re.IGNORECASE)

    def is_label(cell) -> bool:
        # Only real strings can match; numbers and NaN are simply "no match".
        return isinstance(cell, str) and pattern.match(cell) is not None

    indices = []
    for column in sheet.columns:
        # Test cell by cell instead of using the `.str` accessor, which raises
        # AttributeError on columns that contain no strings (all-numeric or
        # all-NaN columns).
        col_matches = sheet[column].map(is_label).astype(bool)
        indices.extend((r, column) for r in sheet.index[col_matches.to_numpy()])
    return indices


In [13]:
# (row, column) pairs for every cell whose text starts with
# "taxable returns total" (case-insensitive).
total_returns_cells = find_total_returns_cells(sheet)

In [14]:
# Show each match next to the text found there.
# NOTE: the tuples hold index/column *labels* but are used positionally with
# `iloc`; the two coincide here because the sheet was read with header=None
# (default integer row and column labels).
for row, col in total_returns_cells:
    print(f'row: {row}, col: {col} -- {sheet.iloc[row, col]}')

row: 32, col: 0 -- Taxable returns, total


## Finding Likely Header Rows:
In the data, the header rows are displayed like: (1) (2) (3)... or perhaps (11) (12) (13)...

When imported into pandas, these parenthesized labels can come through in one of two ways: as negative integers (e.g. `(1)` read as `-1`) or as positive integers.

Either way, each header row is a run of consecutive integers that increase or decrease by 1 from one column to the next, so it should be easy to find.


In [15]:
likely_header_rows = []
threshold = 5  # a row needs MORE than this many unit steps to qualify

for row_index in sheet.index:
    cells = list(sheet.iloc[row_index, :])
    # Count horizontally adjacent pairs of ints that differ by exactly 1,
    # in either direction, so both (1) (2) (3)... and -1 -2 -3... qualify.
    unit_steps = sum(
        1
        for left, right in zip(cells, cells[1:])
        if isinstance(left, int)
        and isinstance(right, int)
        and abs(right - left) == 1
    )
    if unit_steps > threshold:
        likely_header_rows.append(row_index)

In [16]:
# Display the row indices that were flagged as numeric header rows.
likely_header_rows

[8]

In [17]:
# Take the first matching label cell and read its contents; indexing iloc
# with the (row, column) tuple is the same as unpacking it in the subscript.
cell_index = total_returns_cells[0]
cell = sheet.iloc[cell_index]

## Finding the values of the total returns:

In [18]:
# Split the label cell's position into its row and column.
tr_row, tr_column = cell_index

In [19]:
# Candidate value cells: same row as the label, every column to its right.
possible_numeric_indices = [(tr_row, col) for col in sheet.columns if col > tr_column]

In [30]:
# Maps (row, column) -> the numeric value found at that cell.
indices_and_tr_values = {}

In [31]:
# Keep only the candidate cells that hold a real number.
for index in possible_numeric_indices:
    value = sheet.iloc[index]
    # NaN (an empty cell) is a float and bool is a subclass of int, so both
    # pass a bare isinstance check and have to be excluded explicitly.
    if (
        isinstance(value, (int, float))
        and not isinstance(value, bool)
        and not pd.isna(value)
    ):  # then the value is numeric & valid
        indices_and_tr_values[index] = value

## Attaching total return values to numeric headers:

In [32]:
# Maps each numeric column header, e.g. 1 for "(1)", to the total-returns
# value found in that column.
numeric_headers_and_values = {}
for index, value in indices_and_tr_values.items():
    value_row, value_column = index
    # Closest detected header row above this value, or None if there is none.
    corresponding_header_row = max(
        (row for row in likely_header_rows if row < value_row), default=None
    )
    if corresponding_header_row is None:
        # Nothing to label this value with; without the default, max() would
        # raise an uninformative ValueError on the empty sequence.
        continue
    numeric_header = sheet.iloc[corresponding_header_row, value_column]
    numeric_headers_and_values[numeric_header] = value

In [33]:
# Display the numeric-header -> total-returns-value mapping.
numeric_headers_and_values

{1: 13783612,
 2: 4737746572,
 3: 13783612,
 4: 4784736388,
 5: 11097679,
 6: 2304629042,
 7: 10637239,
 8: 2205369313,
 9: 2721,
 10: 7565,
 11: 2667,
 12: 6394,
 13: 84650,
 14: 187509,
 15: 3228,
 16: 74672,
 17: 181865,
 18: 8656124,
 19: 8346001,
 20: 75637974,
 21: 1725941,
 22: 29355040,
 23: 6063286,
 24: 222577571,
 25: 5776852,
 26: 173186835,
 27: 1890787,
 28: 2575872,
 29: 43472,
 30: 3401942,
 31: 2877489,
 32: 104263506,
 33: 5891527,
 34: 837378355,
 35: 507447,
 36: 26050611,
 37: 2380356,
 38: 105221210,
 39: 3673737,
 40: 167433498,
 41: 1228415,
 42: 60280397,
 43: 641653,
 44: 18008406,
 45: 66931,
 46: 1422010,
 47: 2498188,
 48: 711667120,
 49: 188263,
 50: 28957237,
 51: 178203,
 52: -6114520,
 53: 331424,
 54: 2588016,
 55: 3113328,
 56: 77597406,
 57: 84987,
 58: 32868381,
 59: 110130,
 60: 1716333,
 61: 81772,
 62: 152475,
 63: 11404,
 64: 1151781,
 65: 623646,
 66: 46187767,
 67: 1220221,
 68: 19830594,
 69: 3609379,
 70: 46989816,
 71: 589501631,
 72: 29871

## Associating Numeric Headers with Text Headers:

In [28]:
# NOTE(review): not referenced by any cell visible here. Presumably the number
# of rows above a numeric header row to scan for text header segments (the
# preview slice below spans 7 rows) — confirm.
limit = 7

### Associating headers with indices:

In [54]:
# Maps (header_row, column) -> the numeric header found at that cell.
indices_and_numeric_headers = {}

for header_row_index in likely_header_rows:
    row_cells = sheet.iloc[header_row_index, :]
    # Keep the integer cells below 1000; everything else in the row (NaN,
    # text, large numbers) is not treated as a numeric header.
    indices_and_numeric_headers.update(
        {
            (header_row_index, column_label): cell_value
            for column_label, cell_value in row_cells.items()
            if isinstance(cell_value, int) and cell_value < 1000
        }
    )

In [58]:
# Preview the cells stacked above each numeric header, stopping once
# column 12 has been printed.
for header_index, numeric_header in indices_and_numeric_headers.items():
    header_row, header_col = header_index
    # Rows header_row-8 up to, but not including, header_row-1, clamped at 0.
    rows_above = slice(max(header_row - 8, 0), max(header_row - 1, 0))
    text_header_segments = sheet.iloc[rows_above, header_col].tolist()
    print(text_header_segments)
    if header_col == 12:
        break

[nan, nan, 'Number\nof\nreturns', nan, nan, nan, nan]
[nan, nan, 'Adjusted\ngross income\nless deficit', nan, nan, nan, nan]
[nan, nan, 'Total income', nan, nan, nan, 'Number of\nreturns']
[nan, nan, nan, nan, nan, nan, 'Amount']
[nan, nan, 'Total wages', 'Total [1]', nan, nan, 'Number of\nreturns']
[nan, nan, nan, nan, nan, nan, 'Amount']
[nan, nan, nan, 'Total from\nForm W-2 wages', nan, nan, 'Number of\nreturns']
[nan, nan, nan, nan, nan, nan, 'Amount']
[nan, nan, nan, 'Household employee wages\nnot reported on Form W-2', nan, nan, 'Number of\nreturns']
[nan, nan, nan, nan, nan, nan, 'Amount']
[nan, nan, nan, 'Tip income not reported\non Form W-2 wages', nan, nan, 'Number of\nreturns']
[nan, nan, nan, nan, nan, nan, 'Amount']
