# Extract Spending Data by Fiscal Year

In [23]:
import requests
import xml.etree.ElementTree as ET
import time
import pandas as pd
from pathlib import Path
import xml.dom.minidom
import math
from tqdm import tqdm
import pyarrow as pa
import pyarrow.parquet as pq

ModuleNotFoundError: No module named 'tqdm.notebooks'

In [2]:
# Endpoint that every XML request in this notebook is POSTed to.
API_URL = "https://www.checkbooknyc.com/api"
# NOTE(review): not referenced by any cell visible in this section — confirm it is still needed.
TEMP_XML_FILE = Path("tmp_latest_response.xml")  # same temp file each time
TIMEOUT = (300, 300) # (connect_timeout, read_timeout) passed to every requests call
# Directory that receives the per-year parquet files; created here if it does not exist yet.
OUTPUT_DIR = Path("checkbook_data")
OUTPUT_DIR.mkdir(exist_ok=True)
BATCH_SIZE = 20000  # API's retrieval limit on records per request

In [3]:
# Session setup: bypass Incapsula bot protection using session management + browser-like headers.
# A single requests.Session is shared by every later request so cookies set by
# the first visit are sent along with the API calls.
session = requests.Session()
HEADERS = {
    'content-type': 'application/xml',
    'user-agent': 'mozilla/5.0 (windows nt 10.0; win64; x64) applewebkit/537.36'
}

session.headers.update(HEADERS)

# Visit the main site first, before any API request is made, and pause briefly.
response = session.get('https://www.checkbooknyc.com/', timeout=TIMEOUT)
print(f"main site status: {response.status_code}")
time.sleep(2)

# Add referer/origin headers for all subsequent requests on this session.
# ('content-type' is already present from HEADERS; it is set again here with the same value.)
session.headers.update({
    'content-type': 'application/xml',
    'referer': 'https://www.checkbooknyc.com/',
    'origin': 'https://www.checkbooknyc.com'
})

main site status: 200


## Helpers

In [4]:
def fetch_xml(payload: str, verbose: bool = False) -> ET.Element:
    """POST an XML request to the API and return the parsed response root.

    Parameters
    ----------
    payload : str
        The XML request document, sent as the POST body.
    verbose : bool, default False
        When True, print the response body once before any error handling,
        so the body of a failed request is visible too.

    Returns
    -------
    xml.etree.ElementTree.Element
        Root element of the parsed response.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    xml.etree.ElementTree.ParseError
        If the body is not well-formed XML. The message includes the HTTP
        status and the start of the body to make the failure diagnosable.
    """
    response = session.post(API_URL, data=payload, headers=HEADERS, timeout=TIMEOUT)
    if verbose:
        # Print the body exactly once. response.text never raises on odd bytes,
        # unlike a strict utf-8 decode of response.content.
        print(response.text)
    response.raise_for_status()

    try:
        return ET.fromstring(response.content)
    except ET.ParseError as exc:
        # A bare "syntax error: line 1, column 0" says nothing about what the
        # server actually returned (empty body, HTML challenge page, ...).
        preview = response.text[:200].strip()
        detailed = ET.ParseError(
            f"invalid xml response (HTTP {response.status_code}): {exc}; "
            f"body starts with: {preview!r}"
        )
        # Keep the attributes callers may read from the original ParseError.
        detailed.code = getattr(exc, "code", None)
        detailed.position = getattr(exc, "position", None)
        raise detailed from exc

def get_record_count(xml_root: ET.Element) -> int:
    """Return the record count reported under ``result_records/record_count``.

    Parameters
    ----------
    xml_root : xml.etree.ElementTree.Element
        Root of a parsed API response.

    Returns
    -------
    int
        The reported count, or 0 when the ``record_count`` tag is missing or
        present but empty.

    Raises
    ------
    ValueError
        If the tag contains text that is not an integer.
    """
    # findtext returns None when the tag is absent and "" when it has no text,
    # so both "nothing to report" cases are handled without calling int(None).
    raw_count = xml_root.findtext(".//result_records/record_count")
    if raw_count is None or not raw_count.strip():
        return 0
    return int(raw_count)



In [5]:
# Scratch check: slicing a four-digit year string from index 2 gives its two-digit suffix.
# NOTE(review): `y` is not referenced by any later cell visible here — candidate for removal.
y = "2025"
y[2:]

'25'

## Initial Metadata Request: Determining Pagination

In [6]:
# Fiscal year to extract; interpolated into the seed request's search criteria.
fiscal_year = 2025

In [7]:
# Seed request: asks for a single record (max_records=1) of the chosen fiscal
# year. Its response is only used to read the total record count, which drives
# the pagination math further down.
seed_request=f"""
<request>
  <type_of_data>Spending</type_of_data>
  <records_from>1</records_from>
  <max_records>1</max_records>
  <search_criteria>
   <criteria>
      <name>fiscal_year</name>
      <type>value</type>
      <value>{fiscal_year}</value>
    </criteria>
  </search_criteria>
  <response_columns/>
</request>
"""

In [8]:
# Send the seed request; verbose=True echoes the raw response so its structure can be inspected.
seed_root = fetch_xml(seed_request,verbose=True)

<?xml version="1.0"?>
<response>
  <status>
    <result>success</result>
  </status>
  <request_criteria>
    <request>
      <type_of_data>Spending</type_of_data>
      <records_from>1</records_from>
      <max_records>1</max_records>
      <search_criteria>
        <criteria>
          <name>fiscal_year</name>
          <type>value</type>
          <value>2025</value>
        </criteria>
      </search_criteria>
      <response_columns/>
    </request>
  </request_criteria>
  <result_records>

<?xml version="1.0"?>
<response>
  <status>
    <result>success</result>
  </status>
  <request_criteria>
    <request>
      <type_of_data>Spending</type_of_data>
      <records_from>1</records_from>
      <max_records>1</max_records>
      <search_criteria>
        <criteria>
          <name>fiscal_year</name>
          <type>value</type>
          <value>2025</value>
        </criteria>
      </search_criteria>
      <response_columns/>
    </request>
  </request_criteria>
  <result_records>

In [9]:
# Total number of records the API reports for the seed request's criteria.
retrievable_records = get_record_count(seed_root)
# Requests needed to page through everything at BATCH_SIZE records per request (rounded up).
batches_needed = math.ceil(retrievable_records / BATCH_SIZE)

print(f"""
Seed Request Results for FY {fiscal_year}
=========================================
Total retrievable records: {retrievable_records}
Batches needed: {batches_needed}
""")


Seed Request Results for FY 2025
Total retrievable records: 3157155
Batches needed: 158



## Paginated Fetch and Append

## Fetch and Save FY Spending

### Function and Helpers

In [19]:
# Clear tqdm's registry of live progress-bar instances before defining/running the download.
# NOTE(review): `_instances` is a private tqdm attribute; presumably this is here to drop
# bars left over from an interrupted earlier run of this cell — confirm it is still needed.
tqdm._instances.clear()

def save_parquet_append(df: pd.DataFrame, out_file: Path) -> int:
    """Append ``df`` to the parquet file at ``out_file`` and report the rows added.

    If the file does not exist it is created from ``df`` as-is. Otherwise the
    existing rows and ``df`` are concatenated, exact duplicate rows are dropped
    across the combined frame, and the file is rewritten.

    Every write goes to a sibling ``*.tmp`` file that is then swapped into
    place, so an interrupted write (e.g. a kernel interrupt during a long
    download) cannot leave a truncated parquet file where the accumulated data
    used to be.

    Parameters
    ----------
    df : pandas.DataFrame
        New rows to append.
    out_file : pathlib.Path
        Target parquet file (anything accepted by ``Path`` works).

    Returns
    -------
    int
        Net number of rows the file grew by, never negative. For a new file
        this is ``len(df)``.

    Notes
    -----
    De-duplication compares whole rows, so two genuinely distinct records
    whose columns are all identical are stored once. The first write to a new
    file is not de-duplicated.
    """
    out_file = Path(out_file)
    staging_file = out_file.with_name(out_file.name + ".tmp")

    if not out_file.exists():
        df.to_parquet(staging_file, engine="pyarrow", index=False)
        staging_file.replace(out_file)
        return len(df)

    if df.empty:
        # Nothing to add: skip the full read + rewrite of the existing file.
        return 0

    try:
        # NOTE(review): presumably a workaround for pyarrow complaining that the
        # "pandas.period" extension type is already registered when reading —
        # confirm it is still required with the installed pandas/pyarrow.
        pa.unregister_extension_type("pandas.period")
    except KeyError:
        # The type was not registered; nothing to undo.
        pass

    existing_df = pd.read_parquet(out_file, engine="pyarrow")
    original_count = len(existing_df)

    combined = pd.concat([existing_df, df], ignore_index=True)
    deduped = combined.drop_duplicates()

    # De-duplication can shrink the frame below its previous size; "added" is
    # reported as zero in that case rather than as a negative number.
    actually_added = max(0, len(deduped) - original_count)

    deduped.to_parquet(staging_file, engine="pyarrow", index=False)
    staging_file.replace(out_file)
    return actually_added

        
def parse_transactions(xml_root: ET.Element) -> pd.DataFrame:
    """Flatten every <transaction> descendant of ``xml_root`` into a DataFrame row.

    Each direct child of a transaction becomes a column named after the child's
    tag. The cell value is the child's text with surrounding whitespace
    removed, or an empty string when the child carries no text.
    """
    def _as_row(txn_node):
        # One dict per transaction: tag -> stripped text ("" when text is missing).
        row = {}
        for field in txn_node:
            text = field.text if field.text else ""
            row[field.tag] = text.strip()
        return row

    rows = [_as_row(node) for node in xml_root.findall(".//transaction")]
    return pd.DataFrame(rows)

def download_all_spending(xml_template: str, year: int, total_available: int, verbose: bool = True,
                          max_retries: int = 5):
    """Download all available spending records for the given year, resuming if possible.

    Progress is measured by the number of rows already stored in
    ``OUTPUT_DIR / f"spending_{year}.parquet"``; only the remainder is fetched,
    one batch of at most ``BATCH_SIZE`` records per request.

    Parameters
    ----------
    xml_template : str
        Request template handed to ``download_spending_atomic`` for each batch.
    year : int
        Fiscal year; used for the output file name and the progress-bar label.
    total_available : int
        Total number of records the API reports for this year.
    verbose : bool, default True
        Print a summary line per batch plus start/finish messages.
    max_retries : int, default 5
        Number of consecutive retries allowed after a failed batch before
        giving up. A successful batch resets the count.

    Raises
    ------
    RuntimeError
        If a batch fails (raises, or saves no new records) more than
        ``max_retries`` times in a row. The last underlying error is chained.

    Notes
    -----
    ``download_spending_atomic`` is not defined in the part of the notebook
    visible here; it is presumably the helper that fetches one batch, appends
    it to the parquet file, and returns the number of rows saved — verify
    against its definition.
    """
    out_file = OUTPUT_DIR / f"spending_{year}.parquet"

    # check current progress
    if out_file.exists():
        records_downloaded = len(pd.read_parquet(out_file))
    else:
        records_downloaded = 0

    records_remaining = total_available - records_downloaded

    if records_remaining <= 0:
        if verbose:
            print(f"download already complete: {records_downloaded:,} records")
        return

    if verbose:
        print(f"downloading {records_remaining:,} remaining records (total: {total_available:,})")

    consecutive_failures = 0

    with tqdm(total=records_remaining, desc=f"FY{year}", unit="records", ncols=100) as pbar:
        while records_remaining > 0:
            # calculate batch size for this request
            batch_size = min(BATCH_SIZE, records_remaining)

            try:
                # download one batch
                records_saved = download_spending_atomic(xml_template, year, batch_size, verbose=False)
                if records_saved <= 0:
                    # No progress would leave records_remaining unchanged and
                    # the loop spinning forever; treat it like a failed batch.
                    raise RuntimeError("batch saved no new records")
            except Exception as e:
                consecutive_failures += 1
                tqdm.write(f"batch failed: {e}")
                if consecutive_failures > max_retries:
                    raise RuntimeError(
                        f"giving up on FY{year} after {consecutive_failures} consecutive failed "
                        f"batches with {records_remaining:,} records remaining"
                    ) from e
                tqdm.write("retrying in 5 seconds...")
                time.sleep(5)
                # counters are untouched, so the next iteration retries the same batch
                continue

            consecutive_failures = 0

            # update progress
            records_remaining -= records_saved
            pbar.update(records_saved)

            if verbose:
                total_now = total_available - records_remaining
                tqdm.write(f"batch complete: {records_saved:,} saved, {total_now:,} total, {records_remaining:,} remaining")

            # rate limiting - api requires 1 request per second
            time.sleep(1.1)

    if verbose:
        final_df = pd.read_parquet(out_file)
        print(f"download complete: {len(final_df):,} total records saved")

# Usage: request template for the paginated download. It carries three
# placeholders — {records_from}, {max_records} and {fiscal_year} — and lists the
# eight response columns to retrieve for each transaction.
# NOTE(review): the placeholders are presumably filled per batch by
# download_spending_atomic, which is not visible in this section — confirm.
xml_template = """<request>
  <type_of_data>Spending</type_of_data>
  <records_from>{records_from}</records_from>
  <max_records>{max_records}</max_records>
  <search_criteria>
   <criteria>
      <name>fiscal_year</name>
      <type>value</type>
      <value>{fiscal_year}</value>
    </criteria>
  </search_criteria>
  <response_columns>
    <column>agency</column>
    <column>payee_name</column>
    <column>check_amount</column>
    <column>issue_date</column>
    <column>document_id</column>
    <column>spending_category</column>
    <column>department</column>
    <column>fiscal_year</column>
  </response_columns>
</request>"""

In [20]:
# Download every record counted by the seed request (3+ million for FY 2025).
# The year comes from `fiscal_year` rather than a literal, so the downloaded year
# always matches the year `retrievable_records` was counted for.
download_all_spending(xml_template, fiscal_year, retrievable_records)

downloading 3,117,155 remaining records (total: 3,157,155)


FY2025:   1%|▎                                       | 20000/3117155 [00:04<11:48, 4372.98records/s]

batch complete: 20,000 saved, 60,000 total, 3,097,155 remaining


FY2025:   1%|▌                                       | 40000/3117155 [00:10<13:49, 3710.48records/s]

batch complete: 20,000 saved, 80,000 total, 3,077,155 remaining


FY2025:   2%|▊                                       | 60000/3117155 [00:17<15:00, 3396.27records/s]

batch complete: 20,000 saved, 100,000 total, 3,057,155 remaining


FY2025:   3%|█                                       | 80000/3117155 [00:24<16:12, 3124.45records/s]

batch complete: 20,000 saved, 120,000 total, 3,037,155 remaining


FY2025:   3%|█▎                                     | 100000/3117155 [00:30<15:54, 3159.71records/s]

batch complete: 20,000 saved, 140,000 total, 3,017,155 remaining


FY2025:   4%|█▌                                     | 120000/3117155 [00:36<15:51, 3150.13records/s]

batch complete: 20,000 saved, 160,000 total, 2,997,155 remaining


FY2025:   4%|█▊                                     | 140000/3117155 [00:44<16:30, 3005.43records/s]

batch complete: 20,000 saved, 180,000 total, 2,977,155 remaining


FY2025:   5%|██                                     | 160000/3117155 [00:50<16:09, 3050.41records/s]

batch complete: 20,000 saved, 200,000 total, 2,957,155 remaining


FY2025:   6%|██▎                                    | 180000/3117155 [00:56<15:38, 3129.28records/s]

batch complete: 20,000 saved, 220,000 total, 2,937,155 remaining


FY2025:   6%|██▌                                    | 200000/3117155 [01:02<15:19, 3173.16records/s]

batch complete: 20,000 saved, 240,000 total, 2,917,155 remaining


FY2025:   7%|██▊                                    | 220000/3117155 [01:09<15:50, 3048.62records/s]

batch complete: 20,000 saved, 260,000 total, 2,897,155 remaining


FY2025:   8%|███                                    | 240000/3117155 [01:16<16:14, 2952.05records/s]

batch complete: 20,000 saved, 280,000 total, 2,877,155 remaining


FY2025:   8%|███▎                                   | 260000/3117155 [01:23<15:47, 3016.62records/s]

batch complete: 20,000 saved, 300,000 total, 2,857,155 remaining


FY2025:   9%|███▌                                   | 280000/3117155 [01:29<15:26, 3062.42records/s]

batch complete: 20,000 saved, 320,000 total, 2,837,155 remaining


FY2025:  10%|███▊                                   | 300000/3117155 [01:36<15:25, 3045.14records/s]

batch complete: 20,000 saved, 340,000 total, 2,817,155 remaining


FY2025:  10%|████                                   | 320000/3117155 [01:43<15:35, 2991.07records/s]

batch complete: 20,000 saved, 360,000 total, 2,797,155 remaining


FY2025:  11%|████▎                                  | 340000/3117155 [01:49<15:25, 2999.11records/s]

batch complete: 20,000 saved, 380,000 total, 2,777,155 remaining


FY2025:  12%|████▌                                  | 360000/3117155 [01:56<15:15, 3011.56records/s]

batch complete: 20,000 saved, 400,000 total, 2,757,155 remaining


FY2025:  12%|████▊                                  | 380000/3117155 [02:03<15:39, 2912.05records/s]

batch complete: 20,000 saved, 420,000 total, 2,737,155 remaining


FY2025:  13%|█████                                  | 400000/3117155 [02:10<15:32, 2914.70records/s]

batch complete: 20,000 saved, 440,000 total, 2,717,155 remaining


FY2025:  13%|█████▎                                 | 420000/3117155 [02:19<16:44, 2684.90records/s]

batch complete: 20,000 saved, 460,000 total, 2,697,155 remaining


FY2025:  14%|█████▌                                 | 440000/3117155 [02:26<16:20, 2731.06records/s]

batch complete: 20,000 saved, 480,000 total, 2,677,155 remaining


FY2025:  15%|█████▊                                 | 460000/3117155 [02:33<16:02, 2761.55records/s]

batch complete: 20,000 saved, 500,000 total, 2,657,155 remaining


FY2025:  15%|██████                                 | 480000/3117155 [02:40<15:50, 2773.23records/s]

batch complete: 20,000 saved, 520,000 total, 2,637,155 remaining


FY2025:  16%|██████▎                                | 500000/3117155 [02:47<15:37, 2792.20records/s]

batch complete: 20,000 saved, 540,000 total, 2,617,155 remaining


FY2025:  17%|██████▌                                | 520000/3117155 [02:55<15:45, 2746.57records/s]

batch complete: 20,000 saved, 560,000 total, 2,597,155 remaining


FY2025:  17%|██████▊                                | 540000/3117155 [03:02<15:30, 2770.94records/s]

batch complete: 20,000 saved, 580,000 total, 2,577,155 remaining


FY2025:  18%|███████                                | 560000/3117155 [03:09<15:36, 2731.49records/s]

batch complete: 20,000 saved, 600,000 total, 2,557,155 remaining


FY2025:  19%|███████▎                               | 580000/3117155 [03:18<15:59, 2644.92records/s]

batch complete: 20,000 saved, 620,000 total, 2,537,155 remaining


FY2025:  19%|███████▌                               | 600000/3117155 [03:25<15:47, 2655.79records/s]

batch complete: 20,000 saved, 640,000 total, 2,517,155 remaining


FY2025:  20%|███████▊                               | 620000/3117155 [03:33<15:54, 2615.42records/s]

batch complete: 20,000 saved, 660,000 total, 2,497,155 remaining


FY2025:  21%|████████                               | 640000/3117155 [03:41<16:14, 2541.90records/s]

batch complete: 20,000 saved, 680,000 total, 2,477,155 remaining


FY2025:  21%|████████▎                              | 660000/3117155 [03:49<15:49, 2588.36records/s]

batch complete: 20,000 saved, 700,000 total, 2,457,155 remaining


FY2025:  22%|████████▌                              | 680000/3117155 [03:56<15:40, 2591.59records/s]

batch complete: 20,000 saved, 720,000 total, 2,437,155 remaining


FY2025:  22%|████████▊                              | 700000/3117155 [04:04<15:20, 2626.21records/s]

batch complete: 20,000 saved, 740,000 total, 2,417,155 remaining


FY2025:  23%|█████████                              | 720000/3117155 [04:11<15:09, 2635.07records/s]

batch complete: 20,000 saved, 760,000 total, 2,397,155 remaining


FY2025:  24%|█████████▎                             | 740000/3117155 [04:19<15:21, 2580.47records/s]

batch complete: 20,000 saved, 780,000 total, 2,377,155 remaining


FY2025:  24%|█████████▌                             | 760000/3117155 [04:27<15:01, 2614.38records/s]

batch complete: 20,000 saved, 800,000 total, 2,357,155 remaining


FY2025:  25%|█████████▊                             | 780000/3117155 [04:35<15:03, 2585.87records/s]

batch complete: 20,000 saved, 820,000 total, 2,337,155 remaining


FY2025:  26%|██████████                             | 800000/3117155 [04:42<14:55, 2587.64records/s]

batch complete: 20,000 saved, 840,000 total, 2,317,155 remaining


FY2025:  26%|██████████▎                            | 820000/3117155 [04:50<14:49, 2581.59records/s]

batch complete: 20,000 saved, 860,000 total, 2,297,155 remaining


FY2025:  27%|██████████▌                            | 840000/3117155 [04:59<15:03, 2520.82records/s]

batch complete: 20,000 saved, 880,000 total, 2,277,155 remaining


FY2025:  28%|██████████▊                            | 860000/3117155 [05:07<15:12, 2474.55records/s]

batch complete: 20,000 saved, 900,000 total, 2,257,155 remaining


FY2025:  28%|███████████                            | 880000/3117155 [05:16<15:47, 2361.46records/s]

batch complete: 20,000 saved, 920,000 total, 2,237,155 remaining


FY2025:  29%|███████████▎                           | 900000/3117155 [05:25<15:26, 2394.03records/s]

batch complete: 20,000 saved, 940,000 total, 2,217,155 remaining


FY2025:  30%|███████████▌                           | 920000/3117155 [05:33<15:17, 2395.37records/s]

batch complete: 20,000 saved, 960,000 total, 2,197,155 remaining


FY2025:  30%|███████████▊                           | 940000/3117155 [05:41<15:09, 2393.24records/s]

batch complete: 20,000 saved, 980,000 total, 2,177,155 remaining


FY2025:  31%|████████████                           | 960000/3117155 [05:50<14:59, 2396.94records/s]

batch complete: 20,000 saved, 1,000,000 total, 2,157,155 remaining


FY2025:  31%|████████████▎                          | 980000/3117155 [05:58<14:45, 2413.89records/s]

batch complete: 20,000 saved, 1,020,000 total, 2,137,155 remaining


FY2025:  32%|████████████▏                         | 1000000/3117155 [06:06<14:34, 2421.56records/s]

batch complete: 20,000 saved, 1,040,000 total, 2,117,155 remaining


FY2025:  33%|████████████▍                         | 1020000/3117155 [06:14<14:35, 2395.28records/s]

batch complete: 20,000 saved, 1,060,000 total, 2,097,155 remaining


FY2025:  33%|████████████▋                         | 1040000/3117155 [06:23<14:29, 2389.49records/s]

batch complete: 20,000 saved, 1,080,000 total, 2,077,155 remaining


FY2025:  34%|████████████▉                         | 1060000/3117155 [06:31<14:25, 2377.31records/s]

batch complete: 20,000 saved, 1,100,000 total, 2,057,155 remaining


FY2025:  35%|█████████████▏                        | 1080000/3117155 [06:40<14:26, 2351.95records/s]

batch complete: 20,000 saved, 1,120,000 total, 2,037,155 remaining


FY2025:  35%|█████████████▍                        | 1100000/3117155 [06:49<14:22, 2339.59records/s]

batch complete: 20,000 saved, 1,140,000 total, 2,017,155 remaining


FY2025:  36%|█████████████▋                        | 1120000/3117155 [06:57<14:06, 2360.14records/s]

batch complete: 20,000 saved, 1,160,000 total, 1,997,155 remaining


FY2025:  37%|█████████████▉                        | 1140000/3117155 [07:06<14:05, 2337.90records/s]

batch complete: 20,000 saved, 1,180,000 total, 1,977,155 remaining


FY2025:  37%|██████████████▏                       | 1160000/3117155 [07:14<13:55, 2342.02records/s]

batch complete: 20,000 saved, 1,200,000 total, 1,957,155 remaining


FY2025:  38%|██████████████▍                       | 1180000/3117155 [07:23<13:55, 2319.72records/s]

batch complete: 20,000 saved, 1,220,000 total, 1,937,155 remaining


FY2025:  38%|██████████████▋                       | 1200000/3117155 [07:32<13:52, 2304.23records/s]

batch complete: 20,000 saved, 1,240,000 total, 1,917,155 remaining


FY2025:  39%|██████████████▊                       | 1220000/3117155 [07:42<14:23, 2197.15records/s]

batch complete: 20,000 saved, 1,260,000 total, 1,897,155 remaining


FY2025:  40%|███████████████                       | 1240000/3117155 [07:51<14:06, 2217.26records/s]

batch complete: 20,000 saved, 1,280,000 total, 1,877,155 remaining


FY2025:  40%|███████████████▎                      | 1260000/3117155 [08:00<13:57, 2216.32records/s]

batch complete: 20,000 saved, 1,300,000 total, 1,857,155 remaining


FY2025:  41%|███████████████▌                      | 1280000/3117155 [08:10<14:13, 2152.26records/s]

batch complete: 20,000 saved, 1,320,000 total, 1,837,155 remaining


FY2025:  42%|███████████████▊                      | 1300000/3117155 [08:19<14:06, 2146.63records/s]

batch complete: 20,000 saved, 1,340,000 total, 1,817,155 remaining


FY2025:  42%|████████████████                      | 1320000/3117155 [08:31<15:12, 1970.12records/s]

batch complete: 20,000 saved, 1,360,000 total, 1,797,155 remaining


FY2025:  43%|████████████████▎                     | 1340000/3117155 [08:41<14:41, 2015.25records/s]

batch complete: 20,000 saved, 1,380,000 total, 1,777,155 remaining


FY2025:  44%|████████████████▌                     | 1360000/3117155 [08:50<14:22, 2036.64records/s]

batch complete: 20,000 saved, 1,400,000 total, 1,757,155 remaining


FY2025:  44%|████████████████▊                     | 1380000/3117155 [09:00<14:15, 2031.29records/s]

batch complete: 20,000 saved, 1,420,000 total, 1,737,155 remaining


FY2025:  45%|█████████████████                     | 1400000/3117155 [09:09<13:45, 2080.73records/s]

batch complete: 20,000 saved, 1,440,000 total, 1,717,155 remaining


FY2025:  46%|█████████████████▎                    | 1420000/3117155 [09:19<13:39, 2069.91records/s]

batch complete: 20,000 saved, 1,460,000 total, 1,697,155 remaining


FY2025:  46%|█████████████████▌                    | 1440000/3117155 [09:29<13:41, 2041.56records/s]

batch complete: 20,000 saved, 1,480,000 total, 1,677,155 remaining


FY2025:  47%|█████████████████▊                    | 1460000/3117155 [09:38<13:17, 2077.05records/s]

batch complete: 20,000 saved, 1,500,000 total, 1,657,155 remaining


FY2025:  47%|██████████████████                    | 1480000/3117155 [09:48<12:56, 2107.53records/s]

batch complete: 20,000 saved, 1,520,000 total, 1,637,155 remaining


FY2025:  48%|██████████████████▎                   | 1500000/3117155 [09:57<12:56, 2081.98records/s]

batch complete: 20,000 saved, 1,540,000 total, 1,617,155 remaining


FY2025:  49%|██████████████████▌                   | 1520000/3117155 [10:07<12:40, 2099.28records/s]

batch complete: 20,000 saved, 1,560,000 total, 1,597,155 remaining


FY2025:  49%|██████████████████▊                   | 1540000/3117155 [10:18<13:18, 1976.12records/s]

batch complete: 20,000 saved, 1,580,000 total, 1,577,155 remaining


FY2025:  50%|███████████████████                   | 1560000/3117155 [10:28<13:06, 1980.41records/s]

batch complete: 20,000 saved, 1,600,000 total, 1,557,155 remaining


FY2025:  51%|███████████████████▎                  | 1580000/3117155 [10:38<12:53, 1986.88records/s]

batch complete: 20,000 saved, 1,620,000 total, 1,537,155 remaining


FY2025:  51%|███████████████████▌                  | 1600000/3117155 [10:48<12:28, 2025.59records/s]

batch complete: 20,000 saved, 1,640,000 total, 1,517,155 remaining


FY2025:  52%|███████████████████▋                  | 1620000/3117155 [10:58<12:19, 2023.40records/s]

batch complete: 20,000 saved, 1,660,000 total, 1,497,155 remaining


FY2025:  53%|███████████████████▉                  | 1640000/3117155 [11:07<12:04, 2039.07records/s]

batch complete: 20,000 saved, 1,680,000 total, 1,477,155 remaining


FY2025:  53%|████████████████████▏                 | 1660000/3117155 [11:18<12:06, 2004.52records/s]

batch complete: 20,000 saved, 1,700,000 total, 1,457,155 remaining


FY2025:  54%|████████████████████▍                 | 1680000/3117155 [11:27<11:52, 2017.83records/s]

batch complete: 20,000 saved, 1,720,000 total, 1,437,155 remaining


FY2025:  55%|████████████████████▋                 | 1700000/3117155 [11:38<11:55, 1981.53records/s]

batch complete: 20,000 saved, 1,740,000 total, 1,417,155 remaining


FY2025:  55%|████████████████████▉                 | 1720000/3117155 [11:48<11:38, 2000.31records/s]

batch complete: 20,000 saved, 1,760,000 total, 1,397,155 remaining


FY2025:  56%|█████████████████████▏                | 1740000/3117155 [11:58<11:32, 1988.12records/s]

batch complete: 20,000 saved, 1,780,000 total, 1,377,155 remaining


FY2025:  56%|█████████████████████▍                | 1760000/3117155 [12:08<11:16, 2005.34records/s]

batch complete: 20,000 saved, 1,800,000 total, 1,357,155 remaining


FY2025:  57%|█████████████████████▋                | 1780000/3117155 [12:18<11:15, 1979.93records/s]

batch complete: 20,000 saved, 1,820,000 total, 1,337,155 remaining


FY2025:  58%|█████████████████████▉                | 1800000/3117155 [12:28<11:04, 1982.61records/s]

batch complete: 20,000 saved, 1,840,000 total, 1,317,155 remaining


FY2025:  58%|██████████████████████▏               | 1820000/3117155 [12:39<11:04, 1953.09records/s]

batch complete: 20,000 saved, 1,860,000 total, 1,297,155 remaining


FY2025:  59%|██████████████████████▍               | 1840000/3117155 [12:53<12:02, 1767.31records/s]

batch complete: 20,000 saved, 1,880,000 total, 1,277,155 remaining


FY2025:  60%|██████████████████████▋               | 1860000/3117155 [13:03<11:32, 1814.52records/s]

batch complete: 20,000 saved, 1,900,000 total, 1,257,155 remaining


FY2025:  60%|██████████████████████▉               | 1880000/3117155 [13:13<11:09, 1846.71records/s]

batch complete: 20,000 saved, 1,920,000 total, 1,237,155 remaining


FY2025:  61%|███████████████████████▏              | 1900000/3117155 [13:23<10:47, 1879.19records/s]

batch complete: 20,000 saved, 1,940,000 total, 1,217,155 remaining


FY2025:  62%|███████████████████████▍              | 1920000/3117155 [13:34<10:29, 1901.65records/s]

batch complete: 20,000 saved, 1,960,000 total, 1,197,155 remaining


FY2025:  62%|███████████████████████▋              | 1940000/3117155 [13:44<10:19, 1900.36records/s]

batch complete: 20,000 saved, 1,980,000 total, 1,177,155 remaining


FY2025:  63%|███████████████████████▉              | 1960000/3117155 [13:55<10:15, 1879.05records/s]

batch complete: 20,000 saved, 2,000,000 total, 1,157,155 remaining


FY2025:  64%|████████████████████████▏             | 1980000/3117155 [14:06<10:11, 1861.00records/s]

batch complete: 20,000 saved, 2,020,000 total, 1,137,155 remaining


FY2025:  64%|████████████████████████▍             | 2000000/3117155 [14:18<10:23, 1791.03records/s]

batch complete: 20,000 saved, 2,040,000 total, 1,117,155 remaining


FY2025:  65%|████████████████████████▋             | 2020000/3117155 [14:29<10:06, 1808.50records/s]

batch complete: 20,000 saved, 2,060,000 total, 1,097,155 remaining


FY2025:  65%|████████████████████████▊             | 2040000/3117155 [14:40<09:45, 1839.31records/s]

batch complete: 20,000 saved, 2,080,000 total, 1,077,155 remaining


FY2025:  66%|█████████████████████████             | 2060000/3117155 [14:51<09:40, 1822.49records/s]

batch complete: 20,000 saved, 2,100,000 total, 1,057,155 remaining


FY2025:  67%|█████████████████████████▎            | 2080000/3117155 [15:03<09:43, 1776.10records/s]

batch complete: 20,000 saved, 2,120,000 total, 1,037,155 remaining


FY2025:  67%|█████████████████████████▌            | 2100000/3117155 [15:16<10:00, 1692.71records/s]

batch complete: 20,000 saved, 2,140,000 total, 1,017,155 remaining


FY2025:  68%|█████████████████████████▊            | 2120000/3117155 [15:27<09:34, 1734.63records/s]

batch complete: 20,000 saved, 2,160,000 total, 997,155 remaining


FY2025:  69%|██████████████████████████            | 2140000/3117155 [15:38<09:19, 1747.50records/s]

batch complete: 20,000 saved, 2,180,000 total, 977,155 remaining


FY2025:  69%|██████████████████████████▎           | 2160000/3117155 [15:49<08:59, 1774.34records/s]

batch complete: 20,000 saved, 2,200,000 total, 957,155 remaining


FY2025:  70%|██████████████████████████▌           | 2180000/3117155 [16:00<08:54, 1754.43records/s]

batch complete: 20,000 saved, 2,220,000 total, 937,155 remaining


FY2025:  71%|██████████████████████████▊           | 2200000/3117155 [16:11<08:34, 1782.60records/s]

batch complete: 20,000 saved, 2,240,000 total, 917,155 remaining


FY2025:  71%|███████████████████████████           | 2220000/3117155 [16:23<08:26, 1772.90records/s]

batch complete: 20,000 saved, 2,260,000 total, 897,155 remaining


FY2025:  72%|███████████████████████████▎          | 2240000/3117155 [16:33<08:05, 1806.86records/s]

batch complete: 20,000 saved, 2,280,000 total, 877,155 remaining


FY2025:  73%|███████████████████████████▌          | 2260000/3117155 [16:45<08:03, 1774.36records/s]

batch complete: 20,000 saved, 2,300,000 total, 857,155 remaining


FY2025:  73%|███████████████████████████▊          | 2280000/3117155 [16:56<07:47, 1790.11records/s]

batch complete: 20,000 saved, 2,320,000 total, 837,155 remaining


FY2025:  74%|████████████████████████████          | 2300000/3117155 [17:07<07:38, 1781.00records/s]

batch complete: 20,000 saved, 2,340,000 total, 817,155 remaining


FY2025:  74%|████████████████████████████▎         | 2320000/3117155 [17:18<07:23, 1797.06records/s]

batch complete: 20,000 saved, 2,360,000 total, 797,155 remaining


FY2025:  75%|████████████████████████████▌         | 2340000/3117155 [17:30<07:20, 1763.25records/s]

batch complete: 20,000 saved, 2,380,000 total, 777,155 remaining


FY2025:  76%|████████████████████████████▊         | 2360000/3117155 [17:42<07:17, 1729.37records/s]

batch complete: 20,000 saved, 2,400,000 total, 757,155 remaining


FY2025:  76%|█████████████████████████████         | 2380000/3117155 [17:54<07:08, 1721.45records/s]

batch complete: 20,000 saved, 2,420,000 total, 737,155 remaining


FY2025:  77%|█████████████████████████████▎        | 2400000/3117155 [18:07<07:15, 1648.29records/s]

batch complete: 20,000 saved, 2,440,000 total, 717,155 remaining


FY2025:  78%|█████████████████████████████▌        | 2420000/3117155 [18:20<07:05, 1638.97records/s]

batch complete: 20,000 saved, 2,460,000 total, 697,155 remaining


FY2025:  78%|█████████████████████████████▋        | 2440000/3117155 [18:31<06:50, 1651.16records/s]

batch complete: 20,000 saved, 2,480,000 total, 677,155 remaining


FY2025:  79%|█████████████████████████████▉        | 2460000/3117155 [18:43<06:31, 1680.48records/s]

batch complete: 20,000 saved, 2,500,000 total, 657,155 remaining


FY2025:  80%|██████████████████████████████▏       | 2480000/3117155 [18:54<06:14, 1703.03records/s]

batch complete: 20,000 saved, 2,520,000 total, 637,155 remaining


FY2025:  80%|██████████████████████████████▍       | 2500000/3117155 [19:06<06:01, 1709.47records/s]

batch complete: 20,000 saved, 2,540,000 total, 617,155 remaining


FY2025:  81%|██████████████████████████████▋       | 2520000/3117155 [19:18<05:52, 1693.51records/s]

batch complete: 20,000 saved, 2,560,000 total, 597,155 remaining


FY2025:  81%|██████████████████████████████▉       | 2540000/3117155 [19:29<05:38, 1706.61records/s]

batch complete: 20,000 saved, 2,580,000 total, 577,155 remaining


FY2025:  82%|███████████████████████████████▏      | 2560000/3117155 [19:41<05:28, 1694.87records/s]

batch complete: 20,000 saved, 2,600,000 total, 557,155 remaining


FY2025:  83%|███████████████████████████████▍      | 2580000/3117155 [19:55<05:33, 1609.07records/s]

batch complete: 20,000 saved, 2,620,000 total, 537,155 remaining


FY2025:  83%|███████████████████████████████▋      | 2600000/3117155 [20:08<05:20, 1614.43records/s]

batch complete: 20,000 saved, 2,640,000 total, 517,155 remaining


FY2025:  84%|███████████████████████████████▉      | 2620000/3117155 [20:21<05:16, 1570.23records/s]

batch complete: 20,000 saved, 2,660,000 total, 497,155 remaining


FY2025:  85%|████████████████████████████████▏     | 2640000/3117155 [20:33<05:00, 1588.04records/s]

batch complete: 20,000 saved, 2,680,000 total, 477,155 remaining


FY2025:  85%|████████████████████████████████▍     | 2660000/3117155 [20:45<04:41, 1622.43records/s]

batch complete: 20,000 saved, 2,700,000 total, 457,155 remaining


FY2025:  86%|████████████████████████████████▋     | 2680000/3117155 [20:58<04:30, 1614.02records/s]

batch complete: 20,000 saved, 2,720,000 total, 437,155 remaining


FY2025:  87%|████████████████████████████████▉     | 2700000/3117155 [21:11<04:21, 1593.54records/s]

batch complete: 20,000 saved, 2,740,000 total, 417,155 remaining


FY2025:  87%|█████████████████████████████████▏    | 2720000/3117155 [21:22<04:04, 1622.54records/s]

batch complete: 20,000 saved, 2,760,000 total, 397,155 remaining


FY2025:  88%|█████████████████████████████████▍    | 2740000/3117155 [21:35<03:51, 1625.97records/s]

batch complete: 20,000 saved, 2,780,000 total, 377,155 remaining


FY2025:  89%|█████████████████████████████████▋    | 2760000/3117155 [21:47<03:41, 1615.12records/s]

batch complete: 20,000 saved, 2,800,000 total, 357,155 remaining


FY2025:  89%|█████████████████████████████████▉    | 2780000/3117155 [22:01<03:35, 1567.79records/s]

batch complete: 20,000 saved, 2,820,000 total, 337,155 remaining


FY2025:  90%|██████████████████████████████████▏   | 2800000/3117155 [22:14<03:25, 1546.82records/s]

batch complete: 20,000 saved, 2,840,000 total, 317,155 remaining


FY2025:  90%|██████████████████████████████████▍   | 2820000/3117155 [22:27<03:10, 1558.31records/s]

batch complete: 20,000 saved, 2,860,000 total, 297,155 remaining


FY2025:  91%|██████████████████████████████████▌   | 2840000/3117155 [22:39<02:55, 1579.08records/s]

batch complete: 20,000 saved, 2,880,000 total, 277,155 remaining


FY2025:  92%|██████████████████████████████████▊   | 2860000/3117155 [22:52<02:44, 1565.27records/s]

batch complete: 20,000 saved, 2,900,000 total, 257,155 remaining


FY2025:  92%|███████████████████████████████████   | 2880000/3117155 [23:04<02:30, 1578.37records/s]

batch complete: 20,000 saved, 2,920,000 total, 237,155 remaining


FY2025:  93%|███████████████████████████████████▎  | 2900000/3117155 [23:17<02:17, 1582.72records/s]

batch complete: 20,000 saved, 2,940,000 total, 217,155 remaining


FY2025:  94%|███████████████████████████████████▌  | 2920000/3117155 [23:30<02:04, 1581.13records/s]

batch complete: 20,000 saved, 2,960,000 total, 197,155 remaining


FY2025:  94%|███████████████████████████████████▊  | 2940000/3117155 [23:42<01:51, 1592.15records/s]

batch complete: 20,000 saved, 2,980,000 total, 177,155 remaining


FY2025:  95%|████████████████████████████████████  | 2960000/3117155 [23:55<01:38, 1594.55records/s]

batch complete: 20,000 saved, 3,000,000 total, 157,155 remaining


FY2025:  96%|████████████████████████████████████▎ | 2980000/3117155 [24:07<01:26, 1588.05records/s]

batch complete: 20,000 saved, 3,020,000 total, 137,155 remaining


FY2025:  96%|████████████████████████████████████▌ | 3000000/3117155 [24:19<01:13, 1603.70records/s]

batch complete: 20,000 saved, 3,040,000 total, 117,155 remaining


FY2025:  96%|████████████████████████████████████▌ | 3000000/3117155 [24:22<01:13, 1603.70records/s]

batch failed: invalid xml response: syntax error: line 1, column 0
retrying in 5 seconds...


FY2025:  96%|████████████████████████████████████▌ | 3000000/3117155 [24:28<01:13, 1603.70records/s]

batch failed: invalid xml response: syntax error: line 1, column 0
retrying in 5 seconds...


FY2025:  96%|████████████████████████████████████▌ | 3000000/3117155 [25:05<01:13, 1603.70records/s]

batch failed: invalid xml response: syntax error: line 1, column 0
retrying in 5 seconds...


FY2025:  96%|████████████████████████████████████▌ | 3000000/3117155 [25:13<01:13, 1603.70records/s]

batch failed: invalid xml response: syntax error: line 1, column 0
retrying in 5 seconds...


FY2025:  97%|█████████████████████████████████████▊ | 3020000/3117155 [25:31<02:25, 665.48records/s]

batch complete: 20,000 saved, 3,060,000 total, 97,155 remaining


FY2025:  98%|██████████████████████████████████████ | 3040000/3117155 [25:44<01:36, 798.85records/s]

batch complete: 20,000 saved, 3,080,000 total, 77,155 remaining


FY2025:  98%|██████████████████████████████████████▎| 3060000/3117155 [25:57<01:00, 937.92records/s]

batch complete: 20,000 saved, 3,100,000 total, 57,155 remaining


FY2025:  98%|██████████████████████████████████████▎| 3060000/3117155 [26:01<01:00, 937.92records/s]

batch failed: invalid xml response: syntax error: line 1, column 49
retrying in 5 seconds...


FY2025:  99%|██████████████████████████████████████▌| 3080000/3117155 [26:27<00:44, 826.50records/s]

batch complete: 20,000 saved, 3,120,000 total, 37,155 remaining


FY2025:  99%|██████████████████████████████████████▊| 3100000/3117155 [26:49<00:20, 849.51records/s]

batch complete: 20,000 saved, 3,140,000 total, 17,155 remaining


FY2025: 100%|███████████████████████████████████████| 3117155/3117155 [27:03<00:00, 932.96records/s]

batch complete: 17,155 saved, 3,157,155 total, 0 remaining


FY2025: 100%|██████████████████████████████████████| 3117155/3117155 [27:04<00:00, 1918.78records/s]


download complete: 3,157,155 total records saved


### Function Calls

## Peek in Parquet Files

In [None]:
# Inspect one of the parquet outputs: load it and print a preview, a
# structural summary, and the column index.
# Swap the two assignments below to inspect the main spending file instead.
# file_path = "checkbook_data/spending_2025.parquet"
file_path = "checkbook_data/duplicates_spending_2025.parquet"


# read the Parquet file into a DataFrame
df = pd.read_parquet(file_path, engine="pyarrow")

# look at the first few rows
print(df.head())

# see general info
# DataFrame.info() prints its summary itself and returns None, so it must not
# be wrapped in print() (that would emit an extra "None" line).
df.info()

# get column names
print(df.columns)

In [22]:
# Sanity check on the download: report how many records are in this fiscal
# year's parquet file (if it exists at all).
out_file = OUTPUT_DIR / f"spending_{fiscal_year}.parquet"
if not out_file.exists():
    print("no file created")
else:
    df_check = pd.read_parquet(out_file)
    print(f"file exists: {len(df_check)} records")

# Count the raw XML files matching raw_FY*.xml in the output directory.
xml_files = [*OUTPUT_DIR.glob("raw_FY*.xml")]
print(f"xml files: {len(xml_files)}")

file exists: 3157155 records
xml files: 6


###  API Notes

https://www.checkbooknyc.com/spending-api

- retrieval limit: 20K records per call
- FY25: 3.2M transactions 
- to retrieve all: stagger `records_from` 

3,157,063 transactions <- as of 8/22 from data feeds https://www.checkbooknyc.com/data-feeds 

##### XML request:
- global params: `type_of_data`, `records_from`, `max_records`
- filters: put in `<search_criteria>`
- requested columns: put in `<response_columns>`

##### XML response: 
- `<search_criteria>`: echoes request xml
- `<result_records>`: current result batch based on `records_from` and `max_records`
- `<record_count>`: TOTAL rows
- `<messages>`: errors + status info
- `<status>`: success/failure of request

##### possibly outdated thread on rate limits
https://groups.google.com/g/checkbooknyc/c/hgU1niDG00Y?pli=1

## technical notes

### api integration:
- bypass incapsula bot protection using session mgmt + browser headers
- learn xml format structure requirements of checkbooknyc api
- debug XML templating issues between f-strings and `.format()`

### pagination / data integrity:
- **problem**: deduplication approach failed - legitimate different records marked as duplicates
- **root cause**: overlapping requests (requesting records 1-20000 when file had 1-19815)
- **solution**: range tracking with atomic saves eliminates deduplication need

### download architecture:
- resumable downloads using file length to calculate next offset
- atomic batch operations ensure data consistency if interrupted
- complying with rate limiting + error handling with auto retry
- progress tracking for slow downloads (e.g., full FY)

### key insights
- solve api pagination problem at the request level (proper offsets) instead of data level (deduplication).
- separate concerns: overlapping requests vs. data consistency
- well-structured batch system with proper offsets -> remarkably less data mucking / data handling complexity