In [None]:
import os

# Placeholders for the record / collection identifiers used by the cells below.
naId = ""
parentNaId = ""

# API key: prefer the API_KEY_NA environment variable so the credential can be
# supplied (or rotated) without editing the notebook.
# FIXME(security): the literal fallback is a secret committed to the notebook;
# rotate it and delete the fallback once the environment variable is in use.
API_KEY_NA = os.environ.get("API_KEY_NA") or "hzj5ASoq1aj5bHbl8XNI3roMecW9oAm7TOBMjkre"

# Define API endpoints; all of them live under the same v2 base URL.
API_BASE_NA = 'https://catalog.archives.gov/api/v2'
API_ENDPOINT_RECORD_NA = f'{API_BASE_NA}/records/search'
API_ENDPOINT_TRANSCRIPT_NA = f'{API_BASE_NA}/transcriptions/naId/'
API_ENDPOINT_COLLECTION_NA = f'{API_BASE_NA}/records/parentNaId/'

In [None]:
import requests
import polars as pl
import datetime as dt

def fetch_transcription_and_contributors(naId, api_key=API_KEY_NA, api_endpoint=API_ENDPOINT_TRANSCRIPT_NA, timeout=30):
    """Fetch the transcription contributions recorded for one naId.

    Sends a GET request to ``api_endpoint + naId`` with the key in the
    ``x-api-key`` header and reads the hits found under ``body.hits.hits``.

    Args:
        naId: Identifier appended to ``api_endpoint`` (converted with ``str()``).
        api_key: Value sent in the ``x-api-key`` header.
        api_endpoint: URL prefix the identifier is appended to.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot block the caller indefinitely.

    Returns:
        A list with one ``{objectId: {"transcription": ..., "contributors_info": ...}}``
        dict per hit, in the order the API returned them, or ``None`` when the
        request fails, the status is not 200, the body is not JSON, or there
        are no hits. Every failure is reported with ``print``.

    Raises:
        KeyError, TypeError: If a hit lacks the expected ``_source.record`` fields.
    """
    try:
        response = requests.get(
            api_endpoint + str(naId),
            headers={'Content-Type': 'application/json', 'x-api-key': api_key},
            timeout=timeout,
        )
    except requests.RequestException as exc:
        # Report network failures the same way as HTTP errors instead of raising.
        print(f"Error: request for naId {naId} failed - {exc}")
        return None

    if response.status_code != 200:
        print(f"Error: {response.status_code} - {response.text}")
        return None

    try:
        data = response.json()
    except ValueError as exc:
        print(f"Error: response for naId {naId} is not valid JSON - {exc}")
        return None

    # Walk body -> hits -> hits, tolerating a missing or null level at each step.
    hits = data
    for key in ('body', 'hits', 'hits'):
        hits = hits.get(key) if isinstance(hits, dict) else None
    if not hits:
        print("No hits found in response.")
        return None

    transcription_objects = []
    for hit in hits:
        record = hit['_source']['record']
        transcription_objects.append({
            record['target']['objectId']: {
                "transcription": record['contribution'],
                "contributors_info": record['contributors'],
            }
        })
    # It's not clear how the API orders the transcription hits; callers have to
    # assume the latest comes first.
    return transcription_objects

In [None]:
# Parent naId of the collection to harvest: later cells append it to
# API_ENDPOINT_COLLECTION_NA and use it in the output parquet file names.
parentNaId = "653144"

In [None]:
# df = pl.DataFrame(
#     {
#         "name": ["Alice Archer", "Ben Brown", "Chloe Cooper", "Daniel Donovan"],
#         "birthdate": [
#             dt.date(1997, 1, 10),
#             dt.date(1985, 2, 15),
#             dt.date(1983, 3, 22),
#             dt.date(1981, 4, 30),
#         ],
#         "weight": [57.9, 72.5, 53.6, 83.1],  # (kg)
#         "height": [1.56, 1.77, 1.65, 1.75],  # (m)
#     }
# )

# print(df)
# Fetch all pages of results and concatenate them into a single DataFrame
all_hits = []
page = 1
limit = 100  # Use a higher limit if the API allows
request_timeout = 30  # seconds; without it a stalled connection blocks the cell forever

# Loop-invariant parts of the request, built once.
collection_url = API_ENDPOINT_COLLECTION_NA + parentNaId
collection_headers = {'Content-Type': 'application/json', 'x-api-key': API_KEY_NA}

while True:
    params = {
        'page': page,
        'limit': limit,
    }
    response = requests.get(
        collection_url,
        headers=collection_headers,
        params=params,
        timeout=request_timeout,
    )
    if response.status_code != 200:
        # Stop paging but keep whatever was collected so far.
        print(f"Error: {response.status_code} - {response.text}")
        break
    # `or {}` / `or []` also cover keys that are present but null.
    body = response.json().get('body') or {}
    hits = (body.get('hits') or {}).get('hits') or []
    if not hits:
        break
    all_hits.extend(hits)
    # NOTE(review): this assumes the server honours `limit`; if it caps the page
    # size below 100, paging would stop after the first page - confirm.
    if len(hits) < limit:
        break  # Last page reached
    page += 1

if all_hits:
    try:
        # strict=False lets polars reconcile hits whose fields differ in type.
        df = pl.DataFrame(all_hits, strict=False)
    except Exception as e:
        # Best effort: fall back to an empty frame so later cells still find `df`.
        print(f"Error creating DataFrame: {e}")
        df = pl.DataFrame()
else:
    print("No hits found in response.")
    df = pl.DataFrame()

In [None]:
# Number of rows in the collected hits frame.
df.height

In [None]:
# Preview the first five rows (five is also the default for head()).
df.head(5)

In [None]:
def _record_title(source):
    """Return ``source['record']['title']``, or None if any level is missing or null."""
    record = source.get('record') if isinstance(source, dict) else None
    return record.get('title') if isinstance(record, dict) else None


# A key-membership check such as `'record' in x` does not help when the key is
# present but its value is null, so the helper checks the values themselves.
df = df.with_columns(
    pl.col('_source').map_elements(_record_title).alias('hit_title')
)

In [None]:
def _record_object_urls(source):
    """Return the ``objectUrl`` of every digital object under ``source['record']``.

    Gives an empty list when the record or its ``digitalObjects`` entry is
    missing or null.
    """
    record = source.get('record') if isinstance(source, dict) else None
    digital_objects = record.get('digitalObjects') if isinstance(record, dict) else None
    return [obj['objectUrl'] for obj in (digital_objects or [])]


# New column: the list of objectUrl values found in each hit's digitalObjects.
# The helper checks values rather than key membership, so a record that is
# present but null yields [] instead of raising.
df = df.with_columns(
    pl.col('_source').map_elements(_record_object_urls).alias('hit_record_urls')
)

In [None]:
def _record_digital_objects(source):
    """Return ``source['record']['digitalObjects']``, or None if the record is missing or null."""
    record = source.get('record') if isinstance(source, dict) else None
    return record.get('digitalObjects') if isinstance(record, dict) else None


# New column: the raw digitalObjects metadata of each hit. A record that is
# present but null yields None instead of raising on `.get`.
df = df.with_columns(
    pl.col('_source').map_elements(_record_digital_objects).alias('hit_digitalObjects_metadata')
)

In [None]:
# Look up the transcriptions for every hit by its `_id`. The fetch function is
# passed directly because it takes the id as its first positional argument.
df = df.with_columns(
    transcription=pl.col('_id').map_elements(fetch_transcription_and_contributors)
)

In [None]:
# Save the dataframe as a snappy-compressed parquet file named after the parent
# naId; the next cell reads this file back.
df.write_parquet(f'{parentNaId}_transcriptions.parquet', compression='snappy')

In [None]:
# Read the parquet file for this parentNaId back from disk; `df` is replaced by
# the on-disk copy.
df = pl.read_parquet(f'{parentNaId}_transcriptions.parquet')

In [None]:
# Give each element of the `transcription` list its own row.
df = df.explode("transcription")

In [None]:
# Drop the rows whose transcription is null (for example, records for which the
# fetch returned None).
df = df.filter(pl.col("transcription").is_not_null())

In [None]:
def _has_transcription_data(entry):
    """True only for a non-empty dict that holds at least one non-None value."""
    if not entry:
        return False
    if not isinstance(entry, dict):
        return False
    for value in entry.values():
        if value is not None:
            return True
    return False


# Filter out rows where the transcription is an empty dict or holds only None values.
df_transcriptions = df.filter(pl.col("transcription").map_elements(_has_transcription_data))

In [None]:
# Number of rows that still carry transcription data.
df_transcriptions.height

In [None]:
# for each item in the transcription column, extract the object_id and contributors_info

In [None]:
import os

def download_mp3(url, dest_dir='./audio', timeout=60):
    """Download ``url`` into ``dest_dir`` and return the path of the saved file.

    The file is named after the last path segment of the URL (any query string
    or fragment is ignored). The body is streamed to a temporary ``.part`` file
    and renamed once complete, so a failed transfer never leaves a truncated
    file under the final name.

    Args:
        url: Address of the file to download.
        dest_dir: Directory the file is written to; created if missing.
        timeout: Seconds passed to ``requests.get``.

    Returns:
        The local file path, or ``None`` when the URL has no file name, the
        request fails, or the server answers with a status other than 200.
        Every failure is reported with ``print``.
    """
    from urllib.parse import urlparse  # local import: only this helper needs it

    filename = urlparse(url).path.rsplit('/', 1)[-1]
    if not filename:
        # e.g. a URL ending in "/": there is nothing to name the file after.
        print(f"Failed to download {url}: URL has no file name")
        return None

    os.makedirs(dest_dir, exist_ok=True)
    filepath = os.path.join(dest_dir, filename)
    partial_path = filepath + '.part'
    try:
        # stream=True avoids holding a whole audio file in memory at once.
        with requests.get(url, stream=True, timeout=timeout) as response:
            if response.status_code != 200:
                print(f"Failed to download {url}: {response.status_code} - {response.text}")
                return None
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=1 << 20):
                    f.write(chunk)
    except requests.RequestException as exc:
        print(f"Failed to download {url}: {exc}")
        if os.path.exists(partial_path):
            os.remove(partial_path)
        return None
    os.replace(partial_path, filepath)
    return filepath
    
# url_test = "https://s3.amazonaws.com/NARAprodstorage/lz/mopix/208a/GENERALa/208-192.mp3"

# # Download a test MP3 file
# downloaded_file = download_mp3(url_test)

def _download_each(urls):
    """Download every truthy URL in ``urls`` and collect what download_mp3 returns."""
    local_paths = []
    for url in urls:
        if url:  # skip None / empty URLs
            local_paths.append(download_mp3(url))
    return local_paths


# Download the audio behind each row's `hit_record_urls` (a list of URLs) and
# record the resulting local paths in a new `audio_filepaths` column.
df_transcriptions = df_transcriptions.with_columns(
    audio_filepaths=pl.col('hit_record_urls').map_elements(_download_each)
)

In [None]:
# Export df_transcriptions to a snappy-compressed parquet file named after the
# parent naId.
df_transcriptions.write_parquet(f'{parentNaId}_transcriptions_with_audio.parquet', compression='snappy')

In [None]:
# df_transcriptions

In [None]:


# Ad-hoc check of the record search endpoint of the National Archives and
# Records Administration (NARA) API.
params = {
    'q': 'world war',
    # 'naID': ["2131159"],
    # NOTE(review): the collection request in this notebook pages with 'limit';
    # confirm that 'rows' is honoured by this endpoint.
    'rows': 10,
}
# Make the API request. The key travels only in the x-api-key header, as in the
# other requests in this notebook, so it does not end up in the request URL.
response = requests.get(
    API_ENDPOINT_RECORD_NA,
    params=params,
    headers={'Content-Type': 'application/json', 'x-api-key': API_KEY_NA},
    timeout=30,  # seconds; avoids hanging the cell on a stalled connection
)
# Check if the request was successful
if response.status_code == 200:
    print("Records fetched successfully:")
    # print(response.json())
else:
    # Print the error message
    print(f"Error: {response.status_code} - {response.text}")

In [None]:
# plan: for each collection, get all the records' naIds, fetch audio + metadata, and check if there are transcriptions
