In [None]:
# use ?fo=json to get metadata of the item

In [None]:
import io
import pandas as pd                     # for reading, manipulating, and displaying data
import requests
from helpers_loc import get_file_stats
# jukebox dataset
DATA_URL = 'https://data.labs.loc.gov/jukebox/' # Base URL of this data package
# Download the file manifest
file_manifest_url = f'{DATA_URL}manifest.json'
response = requests.get(file_manifest_url, timeout=60)
# Stop on an HTTP error instead of trying to parse an error page as JSON
response.raise_for_status()
response_json = response.json()
# The manifest stores column names once ("cols") and one list per file ("rows");
# zip them together so every file becomes a dict keyed by column name
files = [dict(zip(response_json["cols"], row)) for row in response_json["rows"]]

# Summarise the file records with the project helper (turned into a DataFrame in the next cell)
stats = get_file_stats(files)

In [None]:
df = pd.DataFrame(stats)
df

In [None]:
# Download the item-level metadata that accompanies the data package
metadata_url = f'{DATA_URL}metadata.json'
response = requests.get(metadata_url, timeout=60)
# Stop on an HTTP error instead of trying to parse an error page as JSON
response.raise_for_status()
data = response.json()
print(f'Loaded metadata file with {len(data):,} entries.')

In [None]:
# Build the metadata table and list its columns.
# (The original printed the columns of `df`, the file-stats table, by mistake.)
df_metadata = pd.DataFrame(data)
print(', '.join(map(str, df_metadata.columns.to_list())))

In [None]:
df_metadata.head()

In [None]:
# Give every subject of an item its own row so that one subject can be matched exactly
df_metadata_by_subject = df_metadata.explode('Subjects')
df_opera = df_metadata_by_subject.loc[df_metadata_by_subject['Subjects'] == 'Opera']
print(f'Found {len(df_opera):,} items with subject "Opera"')

In [None]:
# create a DataFrame from the file information
# Tabulate the per-file records from the manifest
df_files = pd.DataFrame(files)
# Keep only opera rows that have a matching file record (metadata `Id` == manifest `item_id`)
opera_set_with_audio = df_opera.merge(
    df_files,
    how='inner',
    left_on='Id',
    right_on='item_id',
)
print(f'Found {len(opera_set_with_audio):,} opera items with audio files')

In [None]:
opera_set_with_audio.head()

In [None]:
# object_key contains the path to the audio file
import io

import matplotlib.pyplot as plt         # for displaying data
import numpy as np
from pydub import AudioSegment          # for reading and manipulating audio files
from scipy import signal                # for visualizing audio

item = opera_set_with_audio.iloc[0]
file_url = f'https://{item["object_key"]}'

In [None]:
# Download the audio to memory
response = requests.get(file_url, timeout=60)
# Fail here on an HTTP error rather than handing an error page to the MP3 decoder later
response.raise_for_status()
audio_filestream = io.BytesIO(response.content)

In [None]:
# Read as mp3 and normalise to mono, 1-byte samples at 48 kHz
sample_rate = 48000
sample_width = 1
channels = 1
audio_filestream.seek(0)  # Ensure stream is at the beginning
sound = (
    AudioSegment.from_mp3(audio_filestream)
    .set_channels(channels)
    .set_sample_width(sample_width)
    .set_frame_rate(sample_rate)
)

# Get the first 10 seconds (the slice bound is expressed in milliseconds)
ten_seconds = 10 * 1000
first_10_seconds = sound[:ten_seconds]

# Get the audio samples as a NumPy array
samples = np.array(first_10_seconds.get_array_of_samples())

# Compute the spectrogram
frequencies, times, spectrogram = signal.spectrogram(samples, sample_rate)

# Silent bins have a power of exactly 0, and log(0) is -inf (plus a RuntimeWarning).
# Clamp them to the smallest positive power so every finite value is left untouched.
positive_power = spectrogram[spectrogram > 0]
power_floor = positive_power.min() if positive_power.size else 1.0
log_spectrogram = np.log(np.maximum(spectrogram, power_floor))

# Visualize the results
fig, ax = plt.subplots()
ax.pcolormesh(times, frequencies, log_spectrogram)
ax.set_ylabel('Frequency [Hz]')
ax.set_xlabel('Time [sec]')
plt.show()

In [None]:
# veterans history project dataset
# Kept under its own name so the jukebox DATA_URL used by the cells above is not overwritten.
VHP_COLLECTION_URL = 'https://www.loc.gov/collections/veterans-history-project-collection/'
# fo=json asks loc.gov for the JSON form of the page; fa=online-format:online+text is the
# facet filter carried over from the original query URL.
# (The previous f'{DATA_URL}?fo=json' appended a second "?" to a URL that already had a
# query string and was overwritten on the next line, so it has been dropped.)
web_rep_url = f'{VHP_COLLECTION_URL}?fo=json&fa=online-format:online+text'
# Another loc.gov listing that can be requested the same way:
# web_rep_url = "https://www.loc.gov/search/?q=baseball&fo=json"
# Start with the initial URL
current_url = web_rep_url
# Initialize an empty list to store all results
all_results = []
import time
import requests
# import error type
from requests.exceptions import RequestException
# chunked encoding error
from requests.exceptions import ChunkedEncodingError

In [None]:
# Constants for rate limiting and paging
RATE_LIMIT_DELAY = 3  # Delay in seconds between requests (20 requests per minute)
MAX_ITEMS_PER_PAGE = 1000  # Recommended maximum items per page
MAX_TOTAL_ITEMS = 10  # Maximum items per query !! Set low for testing !!
RETRY_DELAY = 1  # Initial delay in seconds for retries
MAX_RETRIES = 5  # Maximum number of retries for rate-limited requests

In [None]:
# Retry logic with exponential backoff for 429 status codes
def fetch_url_with_rate_limit(url, timeout=60):
    """GET ``url``, retrying with exponential backoff on HTTP 429 and on request errors.

    Up to ``MAX_RETRIES`` attempts are made. The pause before the next attempt is
    ``RETRY_DELAY * 2 ** attempt`` seconds, whichever kind of failure occurred.

    Args:
        url: Address to fetch.
        timeout: Per-request timeout in seconds, passed to ``requests.get``.

    Returns:
        The ``requests.Response`` of the first attempt that succeeds.

    Raises:
        RuntimeError: If every attempt fails. It is chained to the last request
            error, if there was one. ``RuntimeError`` is a subclass of
            ``Exception``, so existing ``except Exception`` handlers still catch it.
    """
    last_error = None
    for attempt in range(MAX_RETRIES):
        try:
            print(f"Attempting to fetch URL: {url}")
            response = requests.get(url, timeout=timeout)
            if response.status_code != 429:
                response.raise_for_status()  # Raise HTTPError for bad responses
                return response
            print("Rate limit exceeded. Pausing...")  # 429 Too Many Requests
        except requests.exceptions.RequestException as req_err:
            print(f"Request error occurred: {req_err}")
            last_error = req_err
        # Back off before the next attempt; sleeping after the final one would only delay the error
        if attempt < MAX_RETRIES - 1:
            time.sleep(RETRY_DELAY * (2 ** attempt))
    raise RuntimeError(f"Failed to fetch {url} after {MAX_RETRIES} retries.") from last_error

# Loop through pages with rate limiting and paging limits.
# Start from a clean slate so that re-running this cell does not append duplicate results
# or resume from an already exhausted URL.
all_results = []
current_url = web_rep_url
total_items_processed = 0
while current_url and total_items_processed < MAX_TOTAL_ITEMS:
    try:
        response = fetch_url_with_rate_limit(current_url)
        data = response.json()

        # Process the results on the current page, keeping no more than the cap still allows
        remaining = MAX_TOTAL_ITEMS - total_items_processed
        results = data.get('results', [])[:remaining]
        all_results.extend(results)
        total_items_processed += len(results)
        print(f"Processed {len(results)} results. Total: {total_items_processed}")

        # Get the URL for the next page
        current_url = data.get('pagination', {}).get('next')
        print(f"Next page URL: {current_url}")

        # An empty page that still advertises a next link would otherwise loop forever
        if not results:
            break

        # Delay to respect rate limits, but only when another request will follow
        if current_url and total_items_processed < MAX_TOTAL_ITEMS:
            time.sleep(RATE_LIMIT_DELAY)
    except Exception as e:
        # Best effort: keep whatever was collected so far and stop paging
        print(f"Error processing page {current_url}: {e}")
        break

# Convert the accumulated results into a DataFrame.
# It is always created (possibly empty) so the following cells never hit an undefined name.
df_results = pd.DataFrame(all_results)
if all_results:
    print("DataFrame created successfully.")
else:
    print("No results found.")


In [None]:
df_results.head()

In [None]:
# save df as parquet file
df_results.to_parquet('veterans_history_project.parquet', index=False)

In [None]:
# `data` is only the JSON object of the last page fetched, so len(data) counts its
# top-level keys. Count the collected result rows instead.
print(f'Loaded web representation with {len(df_results):,} entries.')

In [None]:
# retrieve the df_results DataFrame from the parquet file
df_results = pd.read_parquet('veterans_history_project.parquet')

In [None]:
df_results.head()

In [None]:
df_results.columns
# for each column print the first row data
for col in df_results.columns:
    print(f'{col}: {df_results[col].iloc[0]}')

In [None]:
# Expand every item into one record per attached resource and download the transcript
# text wherever a resource carries a `fulltext_file` URL.

# Item-level columns copied unchanged onto every resource record (order = output column order)
ITEM_COLUMNS = [
    'title', 'description', 'dates', 'language', 'location', 'location_home',
    'location_service', 'partof', 'subject', 'subject_battles', 'subject_branch',
    'subject_conflict', 'subject_entrance', 'subject_format', 'subject_gender',
    'subject_rank', 'subject_status', 'subject_unit', 'subject_race',
]
# Number of items to expand; raise to len(df_results) to process the full result set
MAX_ITEMS_TO_EXPAND = 10


def _fetch_fulltext(url):
    """Return the text behind ``url``, or None when there is no URL or the download fails."""
    if not url:
        return None
    try:
        print(f"Fetching fulltext file: {url}")
        # Reuse the shared helper so retries and 429 backoff behave as they do for paging
        return fetch_url_with_rate_limit(url).text
    except Exception as e:
        # Best effort: one unreachable transcript must not abort the whole run
        print(f"Error fetching fulltext file: {e}")
        return None


l_resource = []
# Never index past the end of the frame when fewer items were retrieved
for n in range(min(MAX_ITEMS_TO_EXPAND, len(df_results))):
    collection_number = df_results['item'].iloc[n]['collection_number']
    item_fields = {col: df_results[col].iloc[n] for col in ITEM_COLUMNS}

    resources = df_results['resources'].iloc[n]
    if not pd.api.types.is_list_like(resources):
        # None / NaN when an item carries no resources
        resources = []

    collection_resource = []
    for resource in resources:
        try:
            fulltext_file_url = resource.get('fulltext_file', None)
            video_url = resource.get('video', None)
            audio_url = resource.get('audio', None)
        except AttributeError:
            # The resource entry is not a mapping; record it with empty fields
            fulltext_file_url = video_url = audio_url = None

        collection_resource.append({
            'collection_number': collection_number,
            'fulltext_file_url': fulltext_file_url,
            # Fetched fresh for every resource, so a failed download yields None and never
            # the transcript of the previous resource
            'fulltext_file_str': _fetch_fulltext(fulltext_file_url),
            'video_url': video_url,
            'audio_url': audio_url,
            **item_fields,
        })
    l_resource.append(collection_resource)

# transform the list of resources into a DataFrame
df_resources = pd.DataFrame([item for sublist in l_resource for item in sublist])

In [None]:
df_resources.head()

In [None]:
# save the DataFrame to a parquet file
df_resources.to_parquet('veterans_history_project_resources.parquet', index=False)

In [None]:
# retrieve the df_resources DataFrame from the parquet file
df_resources = pd.read_parquet('veterans_history_project_resources.parquet')

In [None]:
# helper to inspect raw transcript text structure
# import pprint
# pprint.pprint(df_resources['fulltext_file_str'].iloc[0])

In [None]:
import re
from bs4 import BeautifulSoup

def clean_raw_transcript_str(fulltext_file_str: str) -> str:
    """Flatten a transcript XML document into one cleaned, speaker-tagged string.

    Every ``<sp>`` element becomes ``<speaker>text</speaker> ``, built from the text of
    its ``<speaker>`` child and of its first ``<p>`` child. Afterwards parenthesised,
    bracketed and braced spans, runs of dashes and runs of dots are removed and
    whitespace is collapsed.

    Args:
        fulltext_file_str: Raw XML text of the transcript.

    Returns:
        The cleaned transcript, or ``""`` when the input is missing (``None``, NaN,
        not a string) or blank.
    """
    # Rows without a transcript carry None/NaN; there is nothing to parse for them
    if not isinstance(fulltext_file_str, str) or not fulltext_file_str.strip():
        return ""

    l_transcript_lines = []
    # utilize bs4 xml parser
    soup = BeautifulSoup(fulltext_file_str, 'xml')
    # each sp tag in the document represents a "line" in the transcript
    for sp in soup.find_all('sp'):
        speaker_tag = sp.find('speaker')
        # placeholder speaker tag if not found
        speaker = speaker_tag.get_text(strip=True) if speaker_tag is not None else "speaker_unknown"

        # NOTE(review): only the first <p> of each <sp> is used; confirm whether an <sp>
        # can hold several paragraphs in these transcripts.
        p_tag = sp.find('p')
        # empty text if p tag not found
        spoken_text = p_tag.get_text(strip=True) if p_tag is not None else ""

        l_transcript_lines.append(f"<{speaker}>{spoken_text}</{speaker}> ")

    # merge lines into one string
    transcript_lines = ''.join(l_transcript_lines)

    # remove (), [], {} and anything in between
    transcript_lines_stripped = re.sub(r'\([^)]*\)', '', transcript_lines)
    transcript_lines_stripped = re.sub(r'\[[^]]*\]', '', transcript_lines_stripped)
    # the previous pattern required a ")" right before "}", so plain {...} spans were kept
    transcript_lines_stripped = re.sub(r'\{[^}]*\}', '', transcript_lines_stripped)

    # remove double dashes and ellipsis
    transcript_lines_stripped = re.sub(r'--+', '', transcript_lines_stripped)
    transcript_lines_stripped = re.sub(r'\.{2,}', '', transcript_lines_stripped)

    # clean whitespace
    transcript_lines_stripped = re.sub(r'\s+', ' ', transcript_lines_stripped).strip()

    return transcript_lines_stripped

def remove_speaker_tag(transcript_lines_stripped: str) -> str:
    """Strip every ``<...>`` tag from a cleaned transcript, leaving only the spoken text.

    Args:
        transcript_lines_stripped: Speaker-tagged transcript text.

    Returns:
        The text with all ``<...>`` spans removed. Input that is not a string
        (``None``, NaN, ...) yields ``""`` so the result is always a ``str``.
    """
    if not isinstance(transcript_lines_stripped, str):
        return ""
    # remove <> and anything in between
    return re.sub(r'<[^>]*>', '', transcript_lines_stripped)

In [None]:
df_resources['fulltext_file_str_cleaned'] = df_resources['fulltext_file_str'].apply(clean_raw_transcript_str)
df_resources['transcript_raw_text_only'] = df_resources['fulltext_file_str_cleaned'].apply(remove_speaker_tag)

In [None]:
transcript_gt_sample = df_resources['transcript_raw_text_only'][1]

In [None]:
transcript_gt_sample_120 = """
H. Marie Thomas. I'm speaking with what's your name? John Aaron, Jr. John Aaron, Jr. And you got your information on here, your birth date. 
I've got your address and information. 
I need the birth date. . Okay. August. And where were you born, city of birthplace? Little Rock, Arkansas All right. Okay.
What branch of service were you in? 
I was in the Navy first. You said first? Yes. I was in the Navy from 1941 to 1945. 
Then I went in the Reserve in I went back to school, and then in 1957 I received a commission, second lieutenant in the Army. 
Was that your highest ranking? I retired as a major. Oh, okay. Okay. 
Do you remember your serial number? Which one? You know, your social security number is this this is something you don't give out. Is this  No. It says serial number for service. 
It's different than social security number. Okay. That's fine. What battalion, regiment or division were you in? 
I was in the Navy. The first I was in the Navy.
"""

In [None]:
import os
import soundfile as sf
import librosa
import torch
from transformers import AutoProcessor, AutoModelForCTC

# Path of the sample audio file (despite the name, this is a file, not a directory).
# Set the STT_AUDIO_PATH environment variable to run the notebook on another machine.
AUDIO_DIR = os.environ.get(
    "STT_AUDIO_PATH",
    "/Users/ac/main/amia2025-stt-benchmarking/data/audio/transcript_audio_sample.mp3",
)
# Use small, CPU/MPS friendly model for now
MODEL = "facebook/wav2vec2-base-960h"
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
# keep CPU stable in notebooks
torch.set_num_threads(4)

# Load only the first 120 s, resampled to 16 kHz mono. The unused `sf.info(...)` probe was
# dropped, and mono=True makes a down-mix step for 2-D audio unnecessary.
wave, sr = librosa.load(AUDIO_DIR, sr=16000, mono=True, duration=120)

# set up processor
processor = AutoProcessor.from_pretrained(MODEL)
# set up model
model = AutoModelForCTC.from_pretrained(MODEL).to("cpu").eval()

inputs = processor(wave, sampling_rate=sr, return_tensors="pt")
with torch.no_grad():
    logits = model(inputs.input_values.to("cpu")).logits

# Pick the highest-scoring token per frame and let the processor decode the ids to text
ids = torch.argmax(logits, dim=-1)
hyp_ctc = processor.batch_decode(ids)[0].lower()

print("partial transcription results (Wav2Vec2)")
print(hyp_ctc)

In [None]:
import jiwer

# Normalize reference and hypothesis the same way before scoring.
transform = jiwer.Compose([
    jiwer.ToLowerCase(),
    jiwer.RemovePunctuation(),
    # The reference transcript spans several lines. Turn newlines/tabs into spaces first:
    # RemoveMultipleSpaces only collapses runs of two or more whitespace characters, so a
    # lone newline would otherwise glue two words together and inflate the WER.
    jiwer.RemoveWhiteSpace(replace_by_space=True),
    jiwer.RemoveMultipleSpaces(),
    jiwer.Strip(),
])

In [None]:
ref_norm = transform(transcript_gt_sample_120)
hyp_norm = transform(hyp_ctc)

output = jiwer.process_words(ref_norm, hyp_norm)
print(output)  # a dataclass with attributes
print("WER:", output.wer)
print("Substitutions:", output.substitutions)
print("Deletions:", output.deletions)
print("Insertions:", output.insertions)

In [None]:
# uv run python - <<'PY'
# from faster_whisper import WhisperModel
# root = "/Users/ac/main/amia2025-stt-benchmarking/models/local"
# for name in ["base", "small"]:   # add "medium" later if you want
#     print(f"Downloading {name} to {root} ...")
#     WhisperModel(name, device="cpu", download_root=root)
# print("Done.")
# PY

In [None]:
from pathlib import Path
from tempfile import NamedTemporaryFile
import soundfile as sf
import librosa
from faster_whisper import WhisperModel

ROOT = Path("/Users/ac/main/amia2025-stt-benchmarking/models/local/models--Systran--faster-whisper-small")
# Use the most recently modified snapshot directory of the locally cached model
SNAP = max((ROOT / "snapshots").iterdir(), key=lambda p: p.stat().st_mtime)
LOCAL_MODEL_DIR = str(SNAP)  # folder that contains model.bin
print("Using model dir:", LOCAL_MODEL_DIR)

# load exactly first 120s as mono 16k
wave, _ = librosa.load(AUDIO_DIR, sr=16000, mono=True, duration=120)

model = WhisperModel(LOCAL_MODEL_DIR, device="auto")

# Pass the waveform directly instead of writing it to a temporary WAV file first.
# `segments` is consumed only further down, after the temporary file had already been
# deleted, so the in-memory array removes that hidden ordering dependency.
segments, info = model.transcribe(
    wave,
    beam_size=1,
    temperature=0.0,
    vad_filter=True,
    no_speech_threshold=0.6,
    word_timestamps=False,
    initial_prompt=None,
    suppress_tokens=[-1],  # must be a list of ints (or None)
    condition_on_previous_text=False,
)

# Iterating over `segments` is what produces the text
hyp_whisper = " ".join(s.text.strip().lower() for s in segments)
print(f"[{info.language}] {hyp_whisper[:400]}...")

In [None]:
# function to download audio/ video files from URLs, and store them under veteran_interviews/{idx}
def download_media_files(df, base_dir='/Volumes/KINGSTON/veteran_interviews'):
    """Download one media file per row of ``df`` into ``base_dir/<row index>/``.

    The video is preferred when a row has a ``video_url``; otherwise the ``audio_url``
    is used. Files are saved as ``video.mp4`` or ``audio.mp3``. Rows without either
    URL, and rows whose target file already exists, are skipped. A failed download
    is reported and does not stop the remaining rows.

    Args:
        df: DataFrame with ``video_url`` and/or ``audio_url`` columns.
        base_dir: Directory that receives one sub-directory per row index.

    Returns:
        None. Progress and failures are printed.
    """
    import os

    os.makedirs(base_dir, exist_ok=True)

    for idx, row in df.iterrows():
        # Prefer video if available, otherwise use audio
        video_url = row.get('video_url')
        audio_url = row.get('audio_url')
        if pd.notnull(video_url) and video_url:
            media_type, media_url, ext = 'video', video_url, '.mp4'
        elif pd.notnull(audio_url) and audio_url:
            media_type, media_url, ext = 'audio', audio_url, '.mp3'
        else:
            print(f"No audio or video URL for index {idx}")
            continue

        # One sub-directory per index
        sub_dir = os.path.join(base_dir, str(idx))
        file_path = os.path.join(sub_dir, f'{media_type}{ext}')

        # Check before downloading, so an existing file costs no network traffic
        if os.path.exists(file_path):
            print(f"File already exists for index {idx}: {media_type}{ext}")
            continue

        # Write to a temporary name first so an interrupted download is never mistaken
        # for a finished file on the next run
        partial_path = f'{file_path}.part'
        try:
            with requests.get(media_url, timeout=60, stream=True) as response:
                response.raise_for_status()
                # Previously the file was only written when the sub-directory already
                # existed, so a first run created empty directories
                os.makedirs(sub_dir, exist_ok=True)
                with open(partial_path, 'wb') as f:
                    # Stream in chunks instead of holding the whole file in memory
                    for chunk in response.iter_content(chunk_size=1024 * 1024):
                        f.write(chunk)
            os.replace(partial_path, file_path)
            print(f"Downloaded {media_type} file for index {idx} to {file_path}")
        except Exception as e:
            # Best effort: report the failure and carry on with the next row
            print(f"Failed to download {media_type} file for index {idx}: {e}")
            if os.path.exists(partial_path):
                os.remove(partial_path)

In [None]:
#create splits for df_resources (average size to run processing)
def create_splits(df, split_size=1000):
    """Cut ``df`` into consecutive row chunks of at most ``split_size`` rows.

    Args:
        df: DataFrame to partition.
        split_size: Maximum number of rows per chunk.

    Returns:
        A list of DataFrame slices in original row order; empty when ``df`` has no rows.
    """
    chunk_starts = range(0, len(df), split_size)
    return [df.iloc[start:start + split_size] for start in chunk_starts]
# Create splits of the DataFrame
splits = create_splits(df_resources, split_size=1000)

In [None]:
download_media_files(df_resources)