# Extract Spending Data by Fiscal Year

In [72]:
import requests
import xml.etree.ElementTree as ET
import time
import pandas as pd
from pathlib import Path
import xml.dom.minidom
import math

In [55]:
API_URL = "https://www.checkbooknyc.com/api"
TEMP_XML_FILE = Path("tmp_latest_sandbox_response.xml")  # same temp file each time
TIMEOUT = (10, 300) # connect_timeout, read_timeout for requests
OUTPUT_DIR = Path("checkbook_data")
OUTPUT_DIR.mkdir(exist_ok=True)
BATCH_SIZE = 20000  # API's retrieval limit on records per request

In [54]:
# session set up
session = requests.Session()
session.headers.update({
    'content-type': 'application/xml',
    'user-agent': 'mozilla/5.0 (windows nt 10.0; win64; x64) applewebkit/537.36'
})

# **** visit main site initially to bypass incapsula ****
response = session.get('https://www.checkbooknyc.com/', timeout=TIMEOUT)
print(f"main site status: {response.status_code}")
time.sleep(2)

session.headers.update({
    'content-type': 'application/xml',
    'referer': 'https://www.checkbooknyc.com/',
    'origin': 'https://www.checkbooknyc.com'
})

main site status: 200


## Helpers

In [62]:
def fetch_xml(payload: str, verbose: bool = False) -> ET.Element:
    response = session.post(API_URL, data=payload, timeout=60)
    response.raise_for_status()

    if verbose:
        xml_str = response.content.decode('utf-8')
        print(xml_str)
        
    return ET.fromstring(response.content)

def parse_transactions(xml_root: ET.Element) -> pd.DataFrame:
    records = []
    for txn in xml_root.findall(".//transaction"):
        row = {child.tag: (child.text or "").strip() for child in txn}
        records.append(row)
    return pd.DataFrame(records)

def get_record_count(xml_root: ET.Element) -> int:
    count_tag = xml_root.find(".//result_records/record_count")
    return int(count_tag.text) if count_tag is not None else 0

def download_transactions(xml_body_template: str, year: int, batches_needed: int):
    """
    """
    offset = 1
    batch = 0

    while True:
        xml_body = xml_body_template.format(records_from=offset, max_records=MAX_RECORDS)
        root = fetch_xml(xml_body)

        if batch == 0:
            total_records = get_record_count(root)
            print(f"Total records for FY{year}: {total_records:,}")

        df = parse_transactions(root)
        if df.empty:
            print(f"Finished FY{year} after {batch} batches")
            break

        # append batch to parquet
        out_file = OUTPUT_DIR / f"spending_{year}.parquet"
        df.to_parquet(out_file, engine="pyarrow", index=False, append=True)
        print(f"Saved batch {batch} ({len(df)} rows), offset={offset}")

        offset += MAX_RECORDS
        batch += 1
        if offset > total_records:
            print(f"All {total_records:,} rows downloaded for FY{year}")
            break

## Initial Metadata Request: Determining Pagination

In [67]:
fiscal_year = 2025

In [68]:
seed_request=f"""
<request>
  <type_of_data>Spending</type_of_data>
  <records_from>1</records_from>
  <max_records>1</max_records>
  <search_criteria>
   <criteria>
      <name>fiscal_year</name>
      <type>value</type>
      <value>{fiscal_year}</value>
    </criteria>
  </search_criteria>
  <response_columns/>
</request>
"""

In [69]:
seed_root = fetch_xml(seed_request,verbose=True)

<?xml version="1.0"?>
<response>
  <status>
    <result>success</result>
  </status>
  <request_criteria>
    <request>
      <type_of_data>Spending</type_of_data>
      <records_from>1</records_from>
      <max_records>1</max_records>
      <search_criteria>
        <criteria>
          <name>fiscal_year</name>
          <type>value</type>
          <value>2025</value>
        </criteria>
      </search_criteria>
      <response_columns/>
    </request>
  </request_criteria>
  <result_records>
    <record_count>3157155</record_count>
    <spending_transactions>
      <transaction>
        <agency>Department of Education</agency>
        <associated_prime_vendor>N/A</associated_prime_vendor>
        <budget_code>4301</budget_code>
        <capital_project/>
        <contract_id/>
        <mocs_registered>No</mocs_registered>
        <contract_purpose/>
        <check_amount>153402038.46</check_amount>
        <department>GE INSTR &amp; SCH LEADERSHIP - PS</department>
        <document

In [73]:
retrievable_records = get_record_count(seed_root)
batches_needed = math.ceil(retrievable_records / BATCH_SIZE)

print(f"""
Seed Request Results for FY {fiscal_year}
=========================================
Total retrievable records: {retrievable_records}
Batches needed: {batches_needed}
""")


Seed Request Results for FY 2025
Total retrievable records: 3157155
Batches needed: 158



In [16]:
# Example XML template (adjust fiscal_year, filters, columns)
records_from=1
max_records=

xml_template=f"""
<request>
  <type_of_data>Spending</type_of_data>
  <records_from>{records_from}</records_from>
  <max_records>{max_records}</max_records>
  <search_criteria>
   <criteria>
      <name>fiscal_year</name>
      <type>value</type>
      <value>2025</value>
    </criteria>
  </search_criteria>
  <response_columns>
    <column>agency</column>
    <column>spending_category</column>
    <column>document_id</column>
    <column>payee_name</column>
    <column>check_amount</column>
    <column>department</column>
    <column>sub_vendor</column>
    <column>sub_contract_reference_id</column>
    <column>fiscal_year</column>
    <column>emerging_business</column>
    <column>mocs_registered</column>
    <column>expense_category</column>
    <column>contract_id</column>
    <column>contract_purpose</column>
    <column>issue_date</column>
    <column>capital_project</column>
    <column>mwbe_category</column>
    <column>associated_prime_vendor</column>
    <column>industry</column>
    <column>woman_owned_business</column>
    <column>budget_code</column>
  </response_columns>
</request>
"""

NameError: name 'records_from' is not defined

In [14]:
download_transactions(xml_template, year=2025)

ParseError: mismatched tag: line 7, column 9 (<string>)

###  API Notes

https://www.checkbooknyc.com/spending-api

- retrieval limit: 20K records per call
- FY25: 3.2M transactions 
- to retrieve all: stagger `records_from` 

3,157,063 transactions <- as of 8/22 from data feeds https://www.checkbooknyc.com/data-feeds 

##### XML request:
- global params: `type_of_data`, `records_from`, `max_records`
- filters: put in `<search_criteria>`
- requested columns: put in `<response_columns>`

##### XML response: 
- `<search_criteria>`: echoes request xml
- `<result_records>`: current result batch based on `records_from` and `max_records`
- `<record_count>`: TOTAL rows
- `<messages>`: errors + status info
- `<status>`: success/failure of request