In [115]:
from datetime import datetime
from typing import Any, Dict, Optional

import pandas as pd
import requests

### Task 1: My approach is to write a function that returns a dictionary with the document information.

In [119]:
def get_presidential_document(
    start_date: str,
    end_date: str,
    nth_document: int,
    timeout: float = 30.0,
) -> Optional[Dict[str, Any]]:
    """
    Fetch the n-th presidential document published within a date range.

    Documents are requested from the Federal Register API sorted oldest
    first, one document per page, so the page number doubles as the
    document's position in the range.

    Args:
        start_date (str): Start date in format 'DD.MM.YYYY' (inclusive).
        end_date (str): End date in format 'DD.MM.YYYY' (inclusive).
        nth_document (int): Position of document to retrieve (1-based index).
        timeout (float): Seconds to wait for the API before giving up.

    Returns:
        dict | None: A dictionary with the keys 'number', 'publication_date',
        'type', 'document_number', 'title', 'abstract' and 'pdf_url', or
        None when no document exists at the requested position. A field the
        API response does not contain is returned as None.

    Raises:
        ValueError: If nth_document is smaller than 1 or a date does not
            match the 'DD.MM.YYYY' format.
        RuntimeError: If the API answers with a status code other than 200.
    """
    # Reject impossible positions up front instead of sending page=0 (or a
    # negative page) to the API.
    if nth_document < 1:
        raise ValueError(f"nth_document must be >= 1, got {nth_document}")

    url = "https://www.federalregister.gov/api/v1/documents"

    # The API filters on dates in YYYY-MM-DD format, so convert from the
    # DD.MM.YYYY input format. strptime raises ValueError on malformed input.
    # Reference:
    # https://stackoverflow.com/questions/502726/converting-date-between-dd-mm-yyyy-and-yyyy-mm-dd
    start = datetime.strptime(start_date, '%d.%m.%Y').strftime('%Y-%m-%d')
    end = datetime.strptime(end_date, '%d.%m.%Y').strftime('%Y-%m-%d')

    params = {
        'conditions[type][]': 'PRESDOCU',  # Filter for only presidential documents
        'conditions[publication_date][gte]': start,  # Greater than or equal to start date
        'conditions[publication_date][lte]': end,    # Less than or equal to end date
        'per_page': 1,  # Only one document per page
        'page': nth_document,  # With one document per page, page N is the N-th document
        'order': 'oldest'  # Sort by oldest first
    }

    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(url, params=params, timeout=timeout)

    # Check if the request was successful
    if response.status_code != 200:
        raise RuntimeError(f"API request failed with status code {response.status_code}")

    # NOTE(review): the API appears to omit the 'results' key altogether when
    # nothing matches, so read it defensively rather than indexing directly.
    results = response.json().get('results') or []
    if not results:
        return None  # No document found at this position

    # With per_page=1 the single match is the first (and only) element.
    doc = results[0]
    return {
        'number': nth_document,
        'publication_date': doc.get('publication_date'),
        'type': doc.get('type'),
        'document_number': doc.get('document_number'),
        'title': doc.get('title'),
        'abstract': doc.get('abstract'),
        'pdf_url': doc.get('pdf_url')
    }

### Task 2: Extract document data for the first 20 documents from 1st December 2010 to 31st December 2010

In [121]:
# Collect the first 20 presidential documents (oldest first) published in
# December 2010, then save them to CSV.
documents = []
fetch_succeeded = True
try:
    for position in range(1, 21):  # document positions 1 to 20
        doc = get_presidential_document('01.12.2010', '31.12.2010', position)
        if doc is None:
            # Positions are consecutive pages, so once one is empty every
            # later position is empty too; stop instead of issuing more requests.
            break
        documents.append(doc)
except Exception as e:
    fetch_succeeded = False
    print(f"Error: {e}")

# Build the DataFrame outside the try block so `df` is always (re)defined by
# this cell; later cells then never fail with NameError or silently show a
# stale frame left over from a previous run.
df = pd.DataFrame(documents)

# Only write the file when the fetch completed, so a partial download is not
# saved as if it were the full result.
if fetch_succeeded:
    try:
        # Save the data to CSV (Excel would work the same way via df.to_excel)
        df.to_csv('presidential_documents.csv', index=False)
        print("\nData saved to presidential_documents.csv")
    except Exception as e:
        print(f"Error: {e}")


Data saved to presidential_documents.csv


In [122]:
# Print a heading, then leave `df` as the cell's last expression so the
# notebook renders the collected documents as a table.
print("\nFirst 20 Presidential Documents (1st Dec 2010 to 31st Dec 2010):")
df


First 20 Presidential Documents (1st Dec 2010 to 31st Dec 2010):


Unnamed: 0,number,publication_date,type,document_number,title,abstract,pdf_url
0,1,2010-12-01,Presidential Document,2010-30299,"Thanksgiving Day, 2010",,https://www.govinfo.gov/content/pkg/FR-2010-12...
1,2,2010-12-27,Presidential Document,2010-32617,"National Mentoring Month, 2011",,https://www.govinfo.gov/content/pkg/FR-2010-12...
