<h1>Matt's eCFR API Demonstrator</h1>
<p>An intro to Jupyter Notebooks</p>
<p><i>Teach yourself Python using the eCFR API</i></p>

In [None]:
# Globals
# Do my imports - datetime for date operations, pandas for data analysis, aiohttp asyncio requests for HTTPS REST API requests, json for, well, javascript object notation, duh
import datetime as dt
import pandas as pd
import aiohttp as aio
import asyncio as asy   
import requests as rq
import json as json

# Set my constants
base_url = "https://www.ecfr.gov/api/"
titles_url = base_url + "versioner/v1/titles.json"
agencies_url = base_url + "admin/v1/agencies.json"
title_and_chapter_versions_url = base_url + "versioner/v1/versions/title-"
data_path = "../data/"

In [None]:
# Define my eCFR API wrapper class
class eCFR_API_Wrapper:
    """
    A wrapper for the eCFR API to fetch and process data.
    """

    def __init__(self, base_url):
        self.base_url = base_url

    def fetch_data(endpoint):
        """
        Fetch data from the eCFR API.
        """
        try:
            response = rq.get(f"{endpoint}")
            if response.status_code == 200:
                return response.json()
            else:
                response.raise_for_status()
        except: 
            print(f"Error fetching data from {endpoint}")
            return None
        
    def fetch_data_asynch(endpoint):    
        """
        Fetch data from the eCFR API asynchronously.
        """
        async def fetch(session, url):
            async with session.get(url) as response:
                return await response.json()

        async def main():
            async with aio.ClientSession() as session:
                return await fetch(session, endpoint)

        loop = asy.get_event_loop()
        return loop.run_until_complete(main())

In [None]:
class eCFR_HouseKeeping:

    titles_json = None
    agencies_json = None
    slugs = []
    titles_and_chapters = []


    def __init__(self, data_path):
        self.data_path = data_path

    def refresh_titles(self):
        """
        Check the titles.json file for freshness and update if necessary.
        """
        statusmsg = "Titles check completed"
        # Get the current date and calculate the date 30 days ago
        thirtydaysback = dt.datetime.now() - dt.timedelta(days=30)
        
        try:
            with open(self.data_path + 'titles.json', 'r') as mytitlesfile:
                titles_json = json.load(mytitlesfile)
        except FileNotFoundError:
            titles_json = None
            statusmsg = "No titles.json found, need to fetch data"

        if titles_json:
            # We have some data, let's check it
            titles_dict = titles_json.get('titles', [])
            if not titles_dict:
                statusmsg = "No titles found in JSON"
                return statusmsg
            
            lastbestdate = None
            key_to_find = "up_to_date_as_of"
            up_to_date_values = [t[key_to_find] for t in titles_dict if key_to_find in t]
            lastbestdate = max(list(filter(None, up_to_date_values)))
            
            if lastbestdate > str(thirtydaysback):
                statusmsg += f" with good data as of {lastbestdate}"
            else:     
                statusmsg += f" with stale data as of {lastbestdate}. We need to refresh"
                # call refresh API here
                new_titles = eCFR_API_Wrapper.fetch_data(titles_url)
                if new_titles:
                    # we got new data - update the titles_json
                    titles_json = new_titles
                    # and write it to the file
                    with open(self.data_path + 'titles.json', 'w') as mytitlesfile:
                        json.dump(titles_json, mytitlesfile)
                    statusmsg += " and refreshed titles.json"
        else:
            # failed to load useful data - darn.    
            statusmsg += " but data is no good"
    
        return statusmsg
    
    def load_agency_data(self):
        """
        Load the agency data from the agencies.json file.
        """
        try:
            with open(self.data_path + 'agencies.json', 'r') as myagenciesfile:
                self.agencies_json = json.load(myagenciesfile)
                return self.agencies_json
        except FileNotFoundError:
            print("No agencies.json found, need to fetch data from API")
            return None

    def load_title_and_chapter_versions(self, title,chapter):
        """
        Load the title and chapter versions from an individual file.
        """
        try:
            with open(self.data_path + 'title'+title+'chapter'+chapter+"_versions.json", 'r') as mytitlesfile:
                title_and_chapter_versions = json.load(mytitlesfile)
                return title_and_chapter_versions
        except FileNotFoundError:
            print("No title"+title+" chapter"+chapter+" versions found, need to fetch data from API")
            return None



    def refresh_title_and_chapter_versions(self, title, chapter):    
        """
        Refresh the title and chapter version data.
        """
        my_title_url = title_and_chapter_versions_url + title + ".json?chapter=" + chapter 
        # fetch the title and chapter version data from the API
        try:
            title_and_chapter_version_json = eCFR_API_Wrapper.fetch_data_asynch(my_title_url)
        except Exception as e:
            return (f"Error fetching title version data: {e}")
        # if we have version data, write it to the file
        if title_and_chapter_version_json:
            myfile = "title"+title+"chapter"+chapter+"_versions.json"
            try:
                with open(self.data_path + '/versions/'+myfile, 'w') as myversionfile:
                    json.dump(title_and_chapter_version_json, myversionfile)
                return "Version data available, data fetched and written to "+myfile
            except IOError as e:
                return (f"Error writing to versions.json: {e}")
        else:
            return "Unable to fetch versions data from the API"


    def refresh_agencies(self):
        """
        Fetch the agencies data from the eCFR API.
        """
        #see if we already have agencies.json      
        if not self.agencies_json:
            self.agencies_json = self.load_agency_data()
            # we have the agencies data, stop here
            return "Agency data available, no need to refresh"
        else: 
            # fetch the agencies data from the API
            try:
                self.agencies_json = eCFR_API_Wrapper.fetch_data(agencies_url)
            except Exception as e:
                return (f"Error fetching agencies data: {e}")
            # if we have agencies_json, write it to the file
            if self.agencies_json:
                try:
                    with open(self.data_path + 'agencies.json', 'w') as myagenciesfile:
                        json.dump(self.agencies_json, myagenciesfile)
                    return "Agency data available, data fetched and written to agencies.json"
                except IOError as e:
                    return (f"Error writing to agencies.json: {e}")
            else:
                return "Unable to fetch agencies data from the API"

    

    def refresh_fulltext_byslughierarchy(self):
        """
        Fetch the full text of eCFR by SLUG hierarchy.
        """
        self.refresh_agencies()  # Ensure we have the latest agencies data
        # blank the slug array
        self.slugs = []
        for agency in self.agencies_json.get('agencies', []):
            agency_name = agency.get('name', 'Unknown Agency')
            slug = agency.get('slug', 'Unknown Slug')
            self.slugs.append(slug)
            print(f"Processing agency: {agency_name} with SLUG: {slug}")
            # Here we would typically fetch the titles, chapters, and parts for each agency
            # and construct the SLUG hierarchy.
            for cfr_references in agency.get('cfr_references', []):
                title = cfr_references.get('title', 'Unknown Title')
                chapter = cfr_references.get('chapter', None)
                part = cfr_references.get('part', None)
                """ debug print messages
                if part is None:
                    print(f"Processing title/chapter: {title} {chapter} for agency {agency_name}")
                else: 
                    print(f"Processing title/chapter/part: {title} {chapter} {part} for agency {agency_name}")
                """
                # Here you would typically fetch the full text for this SLUG

                # and process it as needed.
        
            pass

        # You would need to implement the logic to fetch and process the full text
        print("Not quite finished implementing full text by SLUG hierarchy...")
    
        return "Full text fetched successfully. "    
    

    def refresh_versions_byslughierarchy(self):
        """
        Fetch the versions of eCFR by SLUG hierarchy.
        """
        self.refresh_agencies()  # Ensure we have the latest agencies data
        # blank the slug array
        self.slugs = []
        for agency in self.agencies_json.get('agencies', []):
            agency_name = agency.get('name', 'Unknown Agency')
            slug = agency.get('slug', 'Unknown Slug')
            self.slugs.append(slug)
            print(f"Processing agency: {agency_name} with SLUG: {slug}")
            # Here we would typically fetch the titles, chapters, and parts for each agency
            # and construct the SLUG hierarchy.
            for cfr_references in agency.get('cfr_references', []):
                title = cfr_references.get('title', 'Unknown Title')
                chapter = cfr_references.get('chapter', None)
                part = cfr_references.get('part', None)
                """ debug print messages
                if part is None:
                    print(f"Processing title/chapter: {title} {chapter} for agency {agency_name}")
                else: 
                    print(f"Processing title/chapter/part: {title} {chapter} {part} for agency {agency_name}")
                """
                # Here you would typically fetch the version for each title chapter for this SLUG
                try:
                    self.refresh_title_and_chapter_versions(str(title), str(chapter))   
                except Exception as e:
                    print(f"Error fetching versions for title {title} chapter {chapter}: {e}")
                # if not exception, titleXchapterY_versions.json file should be created
            pass
        print("Fetching versions by SLUG hierarchy...")
        return "Versions fetched successfully."    

<h2>Unit Tests</h2>

In [38]:
import unittest

class TestECFRHouseKeeping(unittest.TestCase):
    def setUp(self):
        self.ecfr_hk = eCFR_HouseKeeping(data_path)

    def test_refresh_titles(self):
        result = self.ecfr_hk.refresh_titles()
        self.assertIn("Titles check completed", result)

    def test_load_agency_data(self):
        agencies = self.ecfr_hk.load_agency_data()
        self.assertIsNotNone(agencies)

    def test_refresh_title_and_chapter_versions(self):
        result = self.ecfr_hk.refresh_title_and_chapter_versions("1", "I")
        self.assertIn("Version data available", result)

    def test_refresh_agencies(self):
        result = self.ecfr_hk.refresh_agencies()
        self.assertIn("Agency data available", result)

    def test_refresh_fulltext_byslughierarchy(self):
        result = self.ecfr_hk.refresh_fulltext_byslughierarchy()
        self.assertIn("Full text fetched successfully", result)

    def test_refresh_versions_byslughierarchy(self):
        result = self.ecfr_hk.refresh_versions_byslughierarchy()
        self.assertIn("Versions fetched successfully", result)

unittest.main(argv=[''], verbosity=2, exit=False)


test_load_agency_data (__main__.TestECFRHouseKeeping.test_load_agency_data) ... ok
test_refresh_agencies (__main__.TestECFRHouseKeeping.test_refresh_agencies) ... ok
test_refresh_fulltext_byslughierarchy (__main__.TestECFRHouseKeeping.test_refresh_fulltext_byslughierarchy) ... ok
test_refresh_title_and_chapter_versions (__main__.TestECFRHouseKeeping.test_refresh_title_and_chapter_versions) ... ok
test_refresh_titles (__main__.TestECFRHouseKeeping.test_refresh_titles) ... ok
test_refresh_versions_byslughierarchy (__main__.TestECFRHouseKeeping.test_refresh_versions_byslughierarchy) ... 

Processing agency: Administrative Conference of the United States with SLUG: administrative-conference-of-the-united-states
Processing agency: Advisory Council on Historic Preservation with SLUG: advisory-council-on-historic-preservation
Processing agency: Special Inspector General for Afghanistan Reconstruction with SLUG: special-inspector-general-for-afghanistan-reconstruction
Processing agency: African Development Foundation with SLUG: african-development-foundation
Processing agency: United States Agency for Global Media with SLUG: united-states-agency-for-global-media
Processing agency: Department of Agriculture with SLUG: agriculture-department
Processing agency: Air Transportation System Stabilization with SLUG: air-transportation-stabilization-board
Processing agency: American Battle Monuments Commission with SLUG: american-battle-monuments-commission
Processing agency: Appalachian Regional Commission with SLUG: appalachian-regional-commission
Processing agency: Architectural a

ok

----------------------------------------------------------------------
Ran 6 tests in 547.216s

OK


Processing agency: President's Commission on White House Fellowships with SLUG: president's-commission-on-white-house-fellowships
Fetching versions by SLUG hierarchy...


<unittest.main.TestProgram at 0x7ed7537636e0>

<h2>Put classes and constants above<h2>
<h3>Put implementation below</h3>

In [None]:

# initialize variables
titles_json= {'titles':[]}  #the titles_json dictionary to contain data
statusmsg = "initialized"  #use this to track what's going on 
lastbestdate = "1776-07-04"  #when was the eCFR last updated?
today = dt.date.today() #what day is it?

# test successful initialization
assert len(titles_json)>0
assert len(statusmsg)>0
assert str(today)>lastbestdate

print("Welcome to the MRWeCFR with base_url of "+base_url)

# Psuedocode
# 1. Let's see if we have a list of recent titles (less than 30 days old) in our titles.json file
# 2. If we do, let's see if we have the full and amendments of similar age, else, try and refresh titles.json 
# 3. If we are good with source data, skip to analysis, else, try and refresh the eCFR json details 
# 4. Analysis - let's load some basic semantic checks (wordcount, reading level) and some change trackers (count of changes, frequency over time)
# To do the analysis, we're going to have to download 2 things - the full text (for wordcount and reading level) and the amendments (for change tracking).


#let's check the titles.json file for freshness
ecfr_housekeeping = eCFR_HouseKeeping(data_path)
ecfr_housekeeping.refresh_agencies()
ecfr_housekeeping.refresh_titles()
ecfr_housekeeping.refresh_fulltext_byslughierarchy()
ecfr_housekeeping.refresh_versions_byslughierarchy()







Welcome to the MRWeCFR with base_url of https://www.ecfr.gov/api/
Processing agency: Administrative Conference of the United States with SLUG: administrative-conference-of-the-united-states
Processing agency: Advisory Council on Historic Preservation with SLUG: advisory-council-on-historic-preservation
Processing agency: Special Inspector General for Afghanistan Reconstruction with SLUG: special-inspector-general-for-afghanistan-reconstruction
Processing agency: African Development Foundation with SLUG: african-development-foundation
Processing agency: United States Agency for Global Media with SLUG: united-states-agency-for-global-media
Processing agency: Department of Agriculture with SLUG: agriculture-department
Processing agency: Air Transportation System Stabilization with SLUG: air-transportation-stabilization-board
Processing agency: American Battle Monuments Commission with SLUG: american-battle-monuments-commission
Processing agency: Appalachian Regional Commission with SLUG: 

'Versions fetched successfully.'