<h1>Matt's eCFR API Demonstrator</h1>
<p>An intro to Jupyter Notebooks</p>
<p><i>Teach yourself Python using the eCFR API</i></p>

In [95]:
# Globals
# Do my imports - os for filepath, datetime for date operations, pandas for data analysis, aiohttp asyncio requests for HTTPS REST API requests, json for, well, javascript object notation, duh
# Note: the nest_asyncio is used to allow asyncio to run in Jupyter Notebooks without blocking the event loop
import datetime as dt
import os as os
import pandas as pd
import aiohttp as aio
import asyncio as asy   
import requests as rq
import json as json
import nest_asyncio
nest_asyncio.apply()

# Set my constants
base_url = "https://www.ecfr.gov/api/"
titles_url = base_url + "versioner/v1/titles.json"
agencies_url = base_url + "admin/v1/agencies.json"
title_and_chapter_versions_url = base_url + "versioner/v1/versions/title-"
data_path = "../data/"
stale_days = 30 # How many days before we consider the data stale

In [96]:
# Define my eCFR API wrapper class
class eCFR_API_Wrapper:
    """
    A wrapper for the eCFR API to fetch and process data.
    """

    def __init__(self, base_url):
        self.base_url = base_url

    def fetch_data(endpoint):
        """
        Fetch data from the eCFR API.
        """
        try:
            response = rq.get(f"{endpoint}")
            if response.status_code == 200:
                return response.json()
            else:
                response.raise_for_status()
        except: 
            print(f"Error fetching data from {endpoint}")
            return None
        
    async def fetch_data_asynch(endpoint):    
        """
        Fetch data from the eCFR API asynchronously.
        """
        session = rq.Session()
        async with session.get(endpoint) as response:
            # Return the JSON content of the response using 'response.json()'
            return await response.json()

In [97]:
class eCFR_HouseKeeping:

    titles_json = None
    agencies_json = None
    slugs = []
    titles_and_chapters = []


    def __init__(self, data_path):
        self.data_path = data_path

    # FETCH DATA METHODS
    def fetch_agencies_data(self, asynch=False):
        """
        Fetch the agencies data from the eCFR API.
        """
        try:           
            if( asynch ): 
                print("Fetching agencies asynchronously from the API...")
                self.agencies_json = eCFR_API_Wrapper.fetch_data_asynch(agencies_url)
                return self.agencies_json
            else:
                # fetch the agencies data from the API
                print("Fetching agencies data from the API...") 
                self.agencies_json = eCFR_API_Wrapper.fetch_data(agencies_url)
                return self.agencies_json
        except Exception as e:
            print(f"Error fetching agencies data: {e}")
            return None    
    
    def fetch_titles_data(self, asynch=False):
        """
        Fetch the titles data from the eCFR API.
        """
        try:
            if( asynch ):
                print("Fetching titles asynchronously from the API...")
                self.titles_json = eCFR_API_Wrapper.fetch_data_asynch(titles_url)    
            else:
                # fetch the titles data from the API
                print("Fetching titles data from the API...")   
                self.titles_json = eCFR_API_Wrapper.fetch_data(titles_url)
            return self.titles_json
        except Exception as e:
            print(f"Error fetching titles data: {e}")
            return None 

    def fetch_full_text_by_title_and_chapter(self, title, chapter, asynch=True):
        """
        Fetch the full text data from the eCFR API.
        """
        my_title_url = title_and_chapter_versions_url + str(title) + ".json?chapter=" + str(chapter) 
        # fetch the text for a single title and chapter 
        try:
            if( asynch):
                print("Fetching title and chapter text asynchronously from the API...")
                title_and_chapter_version_json = eCFR_API_Wrapper.fetch_data_asynch(my_title_url)
            else:
                print("Fetching title and chapter text from the API...")
                title_and_chapter_version_json = eCFR_API_Wrapper.fetch_data(my_title_url)
        except Exception as e:
            return (f"Error fetching title version data: {e}")
        return title_and_chapter_version_json    
    

    
    # LOAD DATA METHODS    
    def load_agency_data(self):
        """
        Load the agency data from the agencies.json file.
        """
        try:
            with open(self.data_path + 'agencies.json', 'r') as myagenciesfile:
                print("Loading agency text from the file...")
                self.agencies_json = json.load(myagenciesfile)
                return self.agencies_json
        except FileNotFoundError as e:
            print(f"No agencies.json found: {e}")
            return None

    def load_title_data(self):
        """
        Load the title data from the titles.json file.
        """
        try:
            with open(self.data_path + 'titles.json', 'r') as mytitlesfile:
                print("Loading title text from the file...")
                self.titles_json = json.load(mytitlesfile)
                return self.titles_json
        except FileNotFoundError as e:
            print(f"No agencies.json found, need to refresh: {e}")
            return None
        
    def load_full_text_data_by_title_and_chapter(self, title,chapter):
        """
        Load the title and chapter versions from an individual file.
        """
        filename = self.data_path + 'versions/title' + str(title) + 'chapter' + str(chapter) + '_versions.json'
        try:
            with open(filename, 'r') as mytextfile:
                print("Loading full text from "+filename+" ...")
                title_and_chapter_full_text = json.load(mytextfile)
                return title_and_chapter_full_text
        except FileNotFoundError:
            print("No "+filename+" text found, need to refresh: {e}")
            return None

    # REFRESH DATA METHODS    
    def is_fresh(self, filename):
        """
        Check if the file is fresh (not older than 30 days).
        """
        try:
            file_path = self.data_path + filename
            if os.path.exists(file_path):
                file_mtime = os.path.getmtime(file_path)
                print(file_path + " created at: ", dt.datetime.fromtimestamp(file_mtime))
                stale_days_ago = dt.datetime.now() - dt.timedelta(days=stale_days)
                return dt.datetime.fromtimestamp(file_mtime) > stale_days_ago
            else:
                return False
        except Exception as e:
            print(f"Error checking freshness of {filename}: {e}")
            return False
            
    def refresh_agencies(self, asynch=False):
        """
        Fetch the agencies data from the eCFR API if file is stale.
        """
        # first, see if the agencies.json file exists and is fresh
        agencyfile = self.data_path + 'agencies.json'
        agencystale = self.is_fresh(agencyfile)
        if( not agencystale ):
            print("Agencies data is fresh, loading from file...")
            return self.load_title_data()
        else:
            print("Agencies data is stale, fetching from API...")
            self.agencies_json = None
        if (agencystale or self.agencies_json == None):
            # fetch the agencies data from the API and write it to the file
            self.fetch_agencies_data(asynch)
            # if we have agencies_json, write it to the file
            if self.agencies_json is None:
                return "Unable to fetch agencies data from the API"
            # attempt to write the agencies_json to the file    
            else: 
                try:    
                    with open(agencyfile, 'w') as myagenciesfile:
                        print("Writing agencies data to agencies.json file...")
                        json.dump(self.agencies_json, myagenciesfile)
                    return self.agencies_json
                except IOError as e:
                    return (f"Error writing to agencies.json: {e}")
        else:
            return None

    def refresh_titles(self, asynch=False):
        """
        Fetch the titles data from the eCFR API.
        """
        # first, see if the titles.json file exists and is fresh    
        title_file = self.data_path + 'titles.json'
        isfresh = self.is_fresh(title_file)
        if isfresh:
            print("Titles data is fresh, loading from file...")
            # if the file is fresh, load it and return
            return self.load_title_data()
        else:
            print("Titles data is stale, fetching from API...")
            self.fetch_titles_data(asynch)
            #we have fresh titles_json, write it to the file
            try:
                with open(title_file, 'w') as mytitlesfile:
                    json.dump(self.titles_json , mytitlesfile)
                return self.titles_json
            except IOError as e:
                return None 

    def refresh_full_text_by_title_and_chapter(self, title, chapter, asynch=False):    
        """
        Fetch the title and chapter full text from the eCFR API.
        """
        # first, see if the title_and_chapter.json file exists and is fresh    
        title_and_chapter_file = self.data_path + '/versions/title'+str(title)+'chapter'+str(chapter)+'_versions.json'
        title_and_chapter_version_json = None
        isfresh = self.is_fresh(title_and_chapter_file)
        if isfresh:
            print("Title "+str(title)+" chapter "+str(chapter)+" data is fresh, loading from file...")
            # if the file is fresh, load it and return
            title_and_chapter_version_json =  self.load_full_text_data_by_title_and_chapter(title,chapter)
        else:
            # file is not fresh, fetch it from the API  
            print("Title and chapter data is stale, fetching from API...")
            title_and_chapter_version_json = self.fetch_full_text_by_title_and_chapter(str(title),str(chapter),asynch)
            #we have fresh titles_json, write it to the file
            try:
                with open(title_and_chapter_file, 'w') as mytitleandchapterfile:
                    json.dump(title_and_chapter_version_json, mytitleandchapterfile)
            except IOError as e:
                return None 
        return title_and_chapter_version_json

    def refresh_all_versions_by_title_and_chapter(self, asynch=True): #use asynch by default
        """
        Fetch all title and chapter versions from the eCFR API.
        """
        self.refresh_agencies()  # Ensure we have the latest agencies data
        # blank the slug array
        self.slugs = []
        for agency in self.agencies_json.get('agencies', []):
            agency_name = agency.get('name', 'Unknown Agency')
            slug = agency.get('slug', 'Unknown Slug')
            # Add the slug to the list of slugs
            if slug not in self.slugs:
                self.slugs.append(slug)
            print(f"Processing agency: {agency_name} with SLUG: {slug}")
            # loop through all the titles, chapters, and parts for each agency
            for cfr_references in agency.get('cfr_references', []):
                title = cfr_references.get('title', 'Unknown Title')
                chapter = cfr_references.get('chapter', None)
                part = cfr_references.get('part', None)
                # fetch the version for each title chapter for this SLUG
                try:
                    self.titles_and_chapters.append(self.refresh_full_text_by_title_and_chapter(str(title), str(chapter), asynch))   
                except Exception as e:
                    print(f"Error fetching versions for title {title} chapter {chapter}: {e}")
                # if not exception, titleXchapterY_versions.json file should be created
            pass
        return "Versions fetched successfully."    



<h2>Unit Tests</h2>

In [98]:
import unittest

class TestECFR(unittest.TestCase):
    def setUp(self):
        self.ecfr_hk = eCFR_HouseKeeping(data_path)
    
    # TEST FETCH DATA METHODS
    def test_fetch_agencies_data(self):
        agencies = self.ecfr_hk.fetch_agencies_data()
        self.assertIsNotNone(agencies)

    def test_fetch_titles_data(self):
        titles = self.ecfr_hk.fetch_titles_data()
        self.assertIsNotNone(titles)

    ''' # IGNORE ASYNCH FETCH TITLES AND AGENCIES - ITS FAST ENOUGH
    async def test_fetch_agencies_data_asynch(self):
        # Use 'asyncio.gather()' to run the tasks concurrently and gather their results
        agencies = await asy.gather(self.ecfr_hk.fetch_agencies_data(asynch=True)  )
        self.assertIsNotNone(agencies) 

    async def test_fetch_titles_data_asynch(self):
        # Use 'asyncio.gather()' to run the tasks concurrently and gather their results
        titles = await asy.gather(self.ecfr_hk.fetch_titles_data(asynch=True)  )
        self.assertIsNotNone(titles) 
    '''

    def test_fetch_full_text_by_title_and_chapter(self):
        title_text = self.ecfr_hk.fetch_full_text_by_title_and_chapter("1", "2", asynch=False)
        self.assertIsNotNone( title_text )    

    # TEST LOAD DATA METHODS
    def test_load_agency_data(self):
        agencies = self.ecfr_hk.load_agency_data()
        self.assertIsNotNone(agencies)

    def test_load_title_data(self):
        titles = self.ecfr_hk.load_title_data()
        self.assertIsNotNone(titles)    

    def test_load_full_text_data_by_title_and_chapter(self):
        title_and_chapter_version_text = self.ecfr_hk.load_full_text_data_by_title_and_chapter("1","III")
        self.assertIsNotNone(title_and_chapter_version_text)    
    
    # TEST REFRESH DATA METHODS
    def test_refresh_agencies(self):
        result = self.ecfr_hk.refresh_agencies()
        self.assertIsNotNone(result)

    def test_refresh_titles(self):
        result = self.ecfr_hk.refresh_titles()
        self.assertIsNotNone(result) 
    
    def test_refresh_title_and_chapter_versions(self):
        result = self.ecfr_hk.refresh_full_text_by_title_and_chapter("1", "III")
        self.assertIsNotNone(result)    

    def test_refresh_all_versions_byslug(self):
        result = self.ecfr_hk.refresh_all_versions_by_title_and_chapter(True)
        self.assertIn("Versions fetched successfully", result)  


unittest.main(argv=[''], verbosity=2, exit=False)


test_fetch_agencies_data (__main__.TestECFR.test_fetch_agencies_data) ... 

ok
test_fetch_full_text_by_title_and_chapter (__main__.TestECFR.test_fetch_full_text_by_title_and_chapter) ... ok
test_fetch_titles_data (__main__.TestECFR.test_fetch_titles_data) ... 

Fetching agencies data from the API...
Fetching title and chapter text from the API...
Fetching titles data from the API...


ok
test_load_agency_data (__main__.TestECFR.test_load_agency_data) ... ok
test_load_full_text_data_by_title_and_chapter (__main__.TestECFR.test_load_full_text_data_by_title_and_chapter) ... ok
test_load_title_data (__main__.TestECFR.test_load_title_data) ... ok
test_refresh_agencies (__main__.TestECFR.test_refresh_agencies) ... ok
test_refresh_all_versions_byslug (__main__.TestECFR.test_refresh_all_versions_byslug) ... 

Loading agency text from the file...
Loading full text from ../data/versions/title1chapterIII_versions.json ...
Loading title text from the file...
../data/../data/agencies.json created at:  2025-08-05 18:45:43.893705
Agencies data is stale, fetching from API...
Fetching agencies data from the API...
Writing agencies data to agencies.json file...
../data/../data/agencies.json created at:  2025-08-05 18:47:05.936704
Agencies data is stale, fetching from API...
Fetching agencies data from the API...
Writing agencies data to agencies.json file...
Processing agency: Administrative Conference of the United States with SLUG: administrative-conference-of-the-united-states
../data/../data//versions/title1chapterIII_versions.json created at:  2025-08-05 18:22:28.652719
Title 1 chapter III data is fresh, loading from file...
Loading full text from ../data/versions/title1chapterIII_versions.json ...
Processing agency: Advisory Council on Historic Preservation with SLUG: advisory-council-on-histor

  def _make_iterencode(markers, _default, _encoder, _indent, _floatstr,
  obj, end = self.scan_once(s, idx)
  print(f"Error fetching versions for title {title} chapter {chapter}: {e}")
  def _iterencode(o, _current_indent_level):
ok
test_refresh_title_and_chapter_versions (__main__.TestECFR.test_refresh_title_and_chapter_versions) ... ok
test_refresh_titles (__main__.TestECFR.test_refresh_titles) ... ok
test_fetch_agencies_data (__main__.TestECFRHouseKeeping.test_fetch_agencies_data) ... ok
test_fetch_full_text_by_title_and_chapter (__main__.TestECFRHouseKeeping.test_fetch_full_text_by_title_and_chapter) ... 

Error fetching versions for title 48 chapter 57: Object of type coroutine is not JSON serializable
Processing agency: United States Agency for Global Media with SLUG: united-states-agency-for-global-media
Title and chapter data is stale, fetching from API...
Fetching title and chapter text asynchronously from the API...
Error fetching versions for title 22 chapter V: Object of type coroutine is not JSON serializable
Title and chapter data is stale, fetching from API...
Fetching title and chapter text asynchronously from the API...
Error fetching versions for title 48 chapter 19: Object of type coroutine is not JSON serializable
Processing agency: Department of Agriculture with SLUG: agriculture-department
Title and chapter data is stale, fetching from API...
Fetching title and chapter text asynchronously from the API...
Error fetching versions for title 2 chapter IV: Object of type coroutine is not JSON serializable
Title and chapter data is stale, fetching from API...
Fetching title a

ok
test_fetch_titles_data (__main__.TestECFRHouseKeeping.test_fetch_titles_data) ... ok
test_load_agency_data (__main__.TestECFRHouseKeeping.test_load_agency_data) ... ok
test_load_full_text_data_by_title_and_chapter (__main__.TestECFRHouseKeeping.test_load_full_text_data_by_title_and_chapter) ... ok
test_load_title_data (__main__.TestECFRHouseKeeping.test_load_title_data) ... ok
test_refresh_agencies (__main__.TestECFRHouseKeeping.test_refresh_agencies) ... 

Fetching titles data from the API...
Loading agency text from the file...
Loading full text from ../data/versions/title1chapterIII_versions.json ...
Loading title text from the file...
../data/../data/agencies.json created at:  2025-08-05 18:47:06.019704
Agencies data is stale, fetching from API...
Fetching agencies data from the API...


ok
test_refresh_title_and_chapter_versions (__main__.TestECFRHouseKeeping.test_refresh_title_and_chapter_versions) ... ok
test_refresh_titles (__main__.TestECFRHouseKeeping.test_refresh_titles) ... ok

----------------------------------------------------------------------
Ran 19 tests in 1.033s

OK


Writing agencies data to agencies.json file...
../data/../data//versions/title1chapterIII_versions.json created at:  2025-08-05 18:22:28.652719
Title 1 chapter III data is fresh, loading from file...
Loading full text from ../data/versions/title1chapterIII_versions.json ...
../data/../data/titles.json created at:  2025-08-05 18:38:29.992709
Titles data is fresh, loading from file...
Loading title text from the file...


<unittest.main.TestProgram at 0x784d945d0470>

<h2>Put classes and constants above<h2>
<h3>Put implementation below</h3>

In [99]:

# initialize variables
titles_json= {'titles':[]}  #the titles_json dictionary to contain data
statusmsg = "initialized"  #use this to track what's going on 
lastbestdate = "1776-07-04"  #when was the eCFR last updated?
today = dt.date.today() #what day is it?

# test successful initialization
assert len(titles_json)>0
assert len(statusmsg)>0
assert str(today)>lastbestdate

print("Welcome to the MRWeCFR with base_url of "+base_url)

# Psuedocode
# 1. Let's see if we have a list of recent titles (less than 30 days old) in our titles.json file
# 2. If we do, let's see if we have the full and amendments of similar age, else, try and refresh titles.json 
# 3. If we are good with source data, skip to analysis, else, try and refresh the eCFR json details 
# 4. Analysis - let's load some basic semantic checks (wordcount, reading level) and some change trackers (count of changes, frequency over time)
# To do the analysis, we're going to have to download 2 things - the full text (for wordcount and reading level) and the amendments (for change tracking).


#let's check the titles.json file for freshness
# ecfr_housekeeping = eCFR_HouseKeeping(data_path)

# implementation to follow






Welcome to the MRWeCFR with base_url of https://www.ecfr.gov/api/
