In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import math
import os

# Site root, the HTML listing page (where the anti-forgery token is read from)
# and the JSON endpoint that the row requests are POSTed to.
BASE_URL = "https://tpr.fmcsa.dot.gov"
PAGE_URL = f"{BASE_URL}/Provider/InReview"
API_URL = f"{BASE_URL}/api/Public/InReviewPublic"

# A single Session is shared by the GET and every POST in this cell, so cookies
# set by the first response are sent with the later requests.
# NOTE(review): presumably the token check depends on such a cookie - confirm.
session = requests.Session()


def get_verification_token(timeout=30):
    """Fetch the HTML page and extract the CSRF __RequestVerificationToken.

    The page at ``PAGE_URL`` is requested through the shared ``session`` and
    parsed with BeautifulSoup's ``html.parser``; the token is the ``value``
    attribute of the ``<input name="__RequestVerificationToken">`` element.

    Args:
        timeout: Seconds handed to ``requests`` as the request timeout, so a
            stalled connection raises instead of blocking forever.

    Returns:
        The non-empty token string.

    Raises:
        requests.HTTPError: If the page responds with an error status.
        ValueError: If the input element is missing, or is present but has no
            (or an empty) ``value`` attribute.
    """
    print("Fetching page to extract CSRF token...")
    r = session.get(PAGE_URL, timeout=timeout)
    r.raise_for_status()

    soup = BeautifulSoup(r.text, "html.parser")
    token_input = soup.find("input", {"name": "__RequestVerificationToken"})
    if token_input is None:
        raise ValueError("Could not find __RequestVerificationToken in page HTML!")

    # .get() instead of ["value"]: an element without the attribute should fail
    # with the same ValueError as a missing element, not a bare KeyError.
    token = token_input.get("value")
    if not token:
        raise ValueError("__RequestVerificationToken input has no value!")

    return token


def fetch_page(start, length, token, timeout=30):
    """Fetch one page of results using the DataTables API.

    Args:
        start: Index of the first row to return (callers in this file pass 0
            for the first page and ``page * page_size`` afterwards).
        length: Number of rows requested.
        token: Value sent in the form body as ``__RequestVerificationToken``.
        timeout: Seconds handed to ``requests`` as the request timeout, so a
            stalled connection raises instead of blocking the paging loop.

    Returns:
        The decoded JSON response body. Callers in this file read its
        ``"recordsTotal"`` and ``"data"`` entries.

    Raises:
        requests.HTTPError: If the endpoint responds with an error status.
    """
    payload = {
        "draw": 1,
        "start": start,
        "length": length,
        "order[0][column]": 2,
        # NOTE(review): "orderable" is documented by DataTables as a
        # columns[i] parameter, not an order[i] one; kept unchanged because
        # the request shape is what the endpoint was observed to accept.
        "order[0][orderable]": "true",
        "order[0][dir]": "desc",
        "columns[0][data]": "Name",
        "columns[1][data]": "City",
        "columns[2][data]": "PhysicalState",
        "search[regex]": "false",
        "__RequestVerificationToken": token,
    }

    headers = {
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "X-Requested-With": "XMLHttpRequest",
    }

    r = session.post(API_URL, data=payload, headers=headers, timeout=timeout)
    r.raise_for_status()
    return r.json()


def main(page_size=10, output_file=None):
    """Download every row from the InReview endpoint and save it as a CSV.

    The first request both returns the first page and reports
    ``recordsTotal``, which determines how many further pages are requested.
    The CSV header is the sorted union of the keys seen across all rows.

    Args:
        page_size: Rows requested per call to ``fetch_page``.
            NOTE(review): whether the endpoint honours values above 10 is not
            visible here - verify before raising it.
        output_file: Destination path. Defaults to
            ``~/Downloads/fmcsa_in_review.csv``.

    Raises:
        ValueError: If ``page_size`` is smaller than 1.
    """
    if page_size < 1:
        raise ValueError("page_size must be a positive integer")
    if output_file is None:
        output_file = os.path.expanduser("~/Downloads/fmcsa_in_review.csv")

    # Create the destination directory up front: a missing folder would
    # otherwise only surface after every page had already been downloaded.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    token = get_verification_token()

    print("Fetching first page to determine total rows...")
    first = fetch_page(start=0, length=page_size, token=token)
    total_records = first["recordsTotal"]
    print(f"Total records: {total_records}")

    total_pages = math.ceil(total_records / page_size)

    # Copy, so growing the accumulator does not mutate the first response.
    all_rows = list(first["data"])

    # Fetch remaining pages (page 0 is already in hand).
    for page in range(1, total_pages):
        print(f"Fetching page {page+1}/{total_pages}...")
        page_json = fetch_page(start=page * page_size, length=page_size, token=token)
        all_rows.extend(page_json["data"])

    print(f"Total rows downloaded: {len(all_rows)}")

    # Rows do not necessarily share the same keys, so the header is the union
    # of all of them; DictWriter fills the gaps with empty strings.
    fieldnames = sorted({key for row in all_rows for key in row})

    with open(output_file, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction="ignore")
        writer.writeheader()
        writer.writerows(all_rows)

    print(f"\nSaved CSV to: {output_file}")


# Entry point: run the full download when this code executes as the top-level
# module.
if __name__ == "__main__":
    main()




Fetching page to extract CSRF token...
Fetching first page to determine total rows...
Total records: 4530
Fetching page 2/453...
Fetching page 3/453...
Fetching page 4/453...
Fetching page 5/453...
Fetching page 6/453...
Fetching page 7/453...
Fetching page 8/453...
Fetching page 9/453...
Fetching page 10/453...
Fetching page 11/453...
Fetching page 12/453...
Fetching page 13/453...
Fetching page 14/453...
Fetching page 15/453...
Fetching page 16/453...
Fetching page 17/453...
Fetching page 18/453...
Fetching page 19/453...
Fetching page 20/453...
Fetching page 21/453...
Fetching page 22/453...
Fetching page 23/453...
Fetching page 24/453...
Fetching page 25/453...
Fetching page 26/453...
Fetching page 27/453...
Fetching page 28/453...
Fetching page 29/453...
Fetching page 30/453...
Fetching page 31/453...
Fetching page 32/453...
Fetching page 33/453...
Fetching page 34/453...
Fetching page 35/453...
Fetching page 36/453...
Fetching page 37/453...
Fetching page 38/453...
Fetching page 

In [2]:
import requests
from bs4 import BeautifulSoup
import csv
import math
import os

# Site root, the HTML listing page (where the anti-forgery token is read from)
# and the JSON endpoint that the row requests are POSTed to.
BASE_URL = "https://tpr.fmcsa.dot.gov"
PAGE_URL = f"{BASE_URL}/Provider/Removed"
API_URL = f"{BASE_URL}/api/Public/RemovedPublic"

# A single Session is shared by the GET and every POST in this cell, so cookies
# set by the first response are sent with the later requests.
# NOTE(review): presumably the token check depends on such a cookie - confirm.
session = requests.Session()


def get_verification_token(timeout=30):
    """Fetch the HTML page and extract the CSRF __RequestVerificationToken.

    The page at ``PAGE_URL`` is requested through the shared ``session`` and
    parsed with BeautifulSoup's ``html.parser``; the token is the ``value``
    attribute of the ``<input name="__RequestVerificationToken">`` element.

    Args:
        timeout: Seconds handed to ``requests`` as the request timeout, so a
            stalled connection raises instead of blocking forever.

    Returns:
        The non-empty token string.

    Raises:
        requests.HTTPError: If the page responds with an error status.
        ValueError: If the input element is missing, or is present but has no
            (or an empty) ``value`` attribute.
    """
    print("Fetching page to extract CSRF token...")
    r = session.get(PAGE_URL, timeout=timeout)
    r.raise_for_status()

    soup = BeautifulSoup(r.text, "html.parser")
    token_input = soup.find("input", {"name": "__RequestVerificationToken"})
    if token_input is None:
        raise ValueError("Could not find __RequestVerificationToken in page HTML!")

    # .get() instead of ["value"]: an element without the attribute should fail
    # with the same ValueError as a missing element, not a bare KeyError.
    token = token_input.get("value")
    if not token:
        raise ValueError("__RequestVerificationToken input has no value!")

    return token


def fetch_page(start, length, token, timeout=30):
    """Fetch one page of results from FMCSA using DataTables parameters.

    Args:
        start: Index of the first row to return (callers in this file pass 0
            for the first page and ``page * page_size`` afterwards).
        length: Number of rows requested.
        token: Value sent in the form body as ``__RequestVerificationToken``.
        timeout: Seconds handed to ``requests`` as the request timeout, so a
            stalled connection raises instead of blocking the paging loop.

    Returns:
        The decoded JSON response body. Callers in this file read its
        ``"recordsTotal"`` and ``"data"`` entries.

    Raises:
        requests.HTTPError: If the endpoint responds with an error status.
    """
    payload = {
        "draw": 1,
        "start": start,
        "length": length,

        # Sorting by PhysicalState descending (same as site)
        "order[0][column]": 2,
        "order[0][dir]": "desc",

        "columns[0][data]": "Name",
        "columns[0][searchable]": "true",
        "columns[0][orderable]": "true",

        "columns[1][data]": "City",
        "columns[1][searchable]": "true",
        "columns[1][orderable]": "true",

        "columns[2][data]": "PhysicalState",
        "columns[2][searchable]": "true",
        "columns[2][orderable]": "true",

        "search[regex]": "false",

        "__RequestVerificationToken": token,
    }

    headers = {
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "X-Requested-With": "XMLHttpRequest",
    }

    r = session.post(API_URL, data=payload, headers=headers, timeout=timeout)
    r.raise_for_status()
    return r.json()


def main(page_size=10, output_file=None):
    """Download every row from the Removed endpoint and save it as a CSV.

    The first request both returns the first page and reports
    ``recordsTotal``, which determines how many further pages are requested.
    The CSV header is the sorted union of the keys seen across all rows.

    Args:
        page_size: Rows requested per call to ``fetch_page``.
            NOTE(review): whether the endpoint honours values above 10 is not
            visible here - verify before raising it.
        output_file: Destination path. Defaults to
            ``~/Downloads/fmcsa_removed_jan_6_2026.csv``; pass a path to avoid
            reusing that dated name on later runs.

    Raises:
        ValueError: If ``page_size`` is smaller than 1.
    """
    if page_size < 1:
        raise ValueError("page_size must be a positive integer")
    if output_file is None:
        output_file = os.path.expanduser("~/Downloads/fmcsa_removed_jan_6_2026.csv")

    # Create the destination directory up front: a missing folder would
    # otherwise only surface after every page had already been downloaded.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    token = get_verification_token()

    print("Fetching first page to determine total rows...")
    first = fetch_page(start=0, length=page_size, token=token)
    total_records = first["recordsTotal"]
    print(f"Total records: {total_records}")

    total_pages = math.ceil(total_records / page_size)

    # Copy, so growing the accumulator does not mutate the first response.
    all_rows = list(first["data"])

    # Fetch the remaining pages (page 0 is already in hand).
    for page in range(1, total_pages):
        print(f"Fetching page {page+1}/{total_pages}...")
        page_json = fetch_page(start=page * page_size, length=page_size, token=token)
        all_rows.extend(page_json["data"])

    print(f"Downloaded {len(all_rows)} total rows.")

    # Rows do not necessarily share the same keys, so the header is the union
    # of all of them; DictWriter fills the gaps with empty strings.
    fieldnames = sorted({key for row in all_rows for key in row})

    with open(output_file, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction="ignore")
        writer.writeheader()
        writer.writerows(all_rows)

    print(f"\nSaved CSV to: {output_file}")


# Entry point: run the full download when this code executes as the top-level
# module.
if __name__ == "__main__":
    main()


Fetching page to extract CSRF token...
Fetching first page to determine total rows...
Total records: 3080
Fetching page 2/308...
Fetching page 3/308...
Fetching page 4/308...
Fetching page 5/308...
Fetching page 6/308...
Fetching page 7/308...
Fetching page 8/308...
Fetching page 9/308...
Fetching page 10/308...
Fetching page 11/308...
Fetching page 12/308...
Fetching page 13/308...
Fetching page 14/308...
Fetching page 15/308...
Fetching page 16/308...
Fetching page 17/308...
Fetching page 18/308...
Fetching page 19/308...
Fetching page 20/308...
Fetching page 21/308...
Fetching page 22/308...
Fetching page 23/308...
Fetching page 24/308...
Fetching page 25/308...
Fetching page 26/308...
Fetching page 27/308...
Fetching page 28/308...
Fetching page 29/308...
Fetching page 30/308...
Fetching page 31/308...
Fetching page 32/308...
Fetching page 33/308...
Fetching page 34/308...
Fetching page 35/308...
Fetching page 36/308...
Fetching page 37/308...
Fetching page 38/308...
Fetching page 