In [None]:
# veterans history project dataset
import time
import requests
# import error type
from requests.exceptions import RequestException
# chunked encoding error
from requests.exceptions import ChunkedEncodingError
# data manipulation
import pandas as pd
# utilities
from helpers_loc import get_file_stats

In [None]:
# Constants for rate limiting and paging
RATE_LIMIT_DELAY = 3  # Seconds to sleep between successive page requests (3 s spacing = at most 20 requests per minute)
MAX_ITEMS_PER_PAGE = 1000  # Recommended maximum items per page. NOTE(review): not referenced by any cell visible in this notebook - confirm whether the query should use it
MAX_TOTAL_ITEMS = 100000  # Paging stops once at least this many items have been collected; lower it for quick test runs
RETRY_DELAY = 1  # Base delay in seconds for retries; multiplied by 2 ** retries for exponential backoff
MAX_RETRIES = 5  # Maximum number of attempts per URL before the fetch gives up
DATA_URL_BASE = 'https://www.loc.gov/collections/veterans-history-project-collection'  # Base URL the query URLs are built from

In [None]:
# Fetch a URL, retrying rate-limited (429) and failed requests with exponential backoff
def fetch_url_with_rate_limit(rep_url, timeout=60):
    """Fetch ``rep_url`` and return the successful ``requests.Response``.

    HTTP 429 responses and ``requests`` errors (including non-2xx statuses
    raised by ``raise_for_status``) are retried, for at most ``MAX_RETRIES``
    attempts in total. The pause after failed attempt ``k`` (0-based) is
    ``RETRY_DELAY * 2 ** k`` seconds for both kinds of failure, and no pause
    is taken after the final attempt.

    Args:
        rep_url: URL to request.
        timeout: Per-request timeout in seconds passed to ``requests.get``.

    Returns:
        The ``requests.Response`` of the first successful attempt.

    Raises:
        RuntimeError: If every attempt failed. The last request error, if
            any, is attached as the exception's cause.
    """
    last_error = None
    for attempt in range(MAX_RETRIES):
        try:
            print(f"Attempting to fetch URL: {rep_url}")
            response = requests.get(rep_url, timeout=timeout)
            if response.status_code != 429:
                response.raise_for_status()  # Raise HTTPError for bad responses
                return response
            print("Rate limit exceeded. Pausing...")  # 429 Too Many Requests
        except requests.exceptions.RequestException as req_err:
            print(f"Request error occurred: {req_err}")
            last_error = req_err
        # Same backoff schedule for 429s and errors; skip the pointless sleep
        # once there are no attempts left.
        if attempt < MAX_RETRIES - 1:
            time.sleep(RETRY_DELAY * (2 ** attempt))
    raise RuntimeError(f"Failed to fetch {rep_url} after {MAX_RETRIES} retries.") from last_error

In [None]:
# Appending fo=json to a loc.gov URL requests the JSON form of the page, e.g.
# https://www.loc.gov/collections/veterans-history-project-collection?fo=json
# NOTE(review): the fa=online-format:online+text facet presumably restricts the
# results to items that have online text - confirm against the loc.gov API docs.
web_rep_url = DATA_URL_BASE + '?fo=json&fa=online-format:online+text'
print(f"web representation url: {web_rep_url}")

In [None]:
# Page through the search results, following pagination.next until there is no
# next page, MAX_TOTAL_ITEMS has been reached, or a page fails to load.
all_results = []
total_items_processed = 0
# Start with web_rep_url
current_url = web_rep_url
print("Start fetching information from each url:")
while current_url and total_items_processed < MAX_TOTAL_ITEMS:
    try:
        response = fetch_url_with_rate_limit(current_url)
        data = response.json()
        # Process the results on the current page ("results": null is treated as empty)
        results = data.get('results') or []
        all_results.extend(results)
        total_items_processed += len(results)
        print(f"Processed {len(results)} results. Total: {total_items_processed}")

        # Get the URL for the next page ("pagination": null means there is none)
        current_url = (data.get('pagination') or {}).get('next')
        print(f"Next page URL: {current_url}")

        # Delay to respect rate limits; nothing to wait for after the last page
        if current_url:
            time.sleep(RATE_LIMIT_DELAY)
    except Exception as e:
        # Best effort: keep the pages collected so far and stop paging.
        print(f"Error processing page {current_url}: {e}")
        break

# Always (re)define df_results so later cells never pick up a missing or stale
# frame from an earlier run; it is simply empty when nothing was fetched.
df_results = pd.DataFrame(all_results)
if all_results:
    print("DataFrame created successfully.")
else:
    print("No results found.")

In [None]:
# Number of result rows collected into df_results
df_results.shape[0]

In [None]:
# Persist the raw search results as a parquet file (row index not written)
df_results.to_parquet(path='../data/raw/loc/veterans_history_project.parquet', index=False)

In [None]:
# Reload df_results from the parquet file. The path must match the one the
# save cell writes to ('../data/raw/loc/...'); the previous
# '../data/veterans_history_project.parquet' pointed at a different location.
df_results = pd.read_parquet('../data/raw/loc/veterans_history_project.parquet')
df_results.head()

In [None]:
# Show the first row one column at a time to inspect what each column holds
for column_name in df_results:
    first_value = df_results[column_name].iloc[0]
    print(f'{column_name}: {first_value}')

In [None]:
# Build one record per resource of every item in df_results: the resource's
# transcript/video/audio URLs, the downloaded transcript text (when there is
# one), and a copy of the item-level metadata columns.

# Item-level columns copied unchanged onto every resource record, in output order.
ITEM_METADATA_COLUMNS = [
    'title', 'description', 'dates', 'language', 'location', 'location_home',
    'location_service', 'partof', 'subject', 'subject_battles', 'subject_branch',
    'subject_conflict', 'subject_entrance', 'subject_format', 'subject_gender',
    'subject_rank', 'subject_status', 'subject_unit', 'subject_race',
]


def _fetch_fulltext(fulltext_file_url):
    """Return the text at ``fulltext_file_url``, or None if it is missing or unfetchable.

    Delegates to ``fetch_url_with_rate_limit`` so transcript downloads share
    the notebook's retry/backoff behaviour. Any failure yields None, so a
    record can never inherit the transcript of a previously processed resource.
    """
    if not fulltext_file_url:
        return None
    print(f"Fetching fulltext file: {fulltext_file_url}")
    try:
        return fetch_url_with_rate_limit(fulltext_file_url).text
    except Exception as e:
        # Best effort: one bad transcript must not abort the whole run.
        print(f"Error fetching fulltext file: {e}")
        return None


l_resource = []
for _, item_row in df_results.iterrows():
    collection_number = item_row['item']['collection_number']
    item_metadata = {column: item_row[column] for column in ITEM_METADATA_COLUMNS}

    resources = item_row['resources']
    # A missing value (None / NaN) means the item has no resources to expand.
    if resources is None or isinstance(resources, float):
        resources = []

    collection_resource = []
    for resource in resources:
        if isinstance(resource, dict):
            fulltext_file_url = resource.get('fulltext_file', None)
            video_url = resource.get('video', None)
            audio_url = resource.get('audio', None)
        else:
            # Unexpected resource shape: keep a placeholder record with no URLs.
            fulltext_file_url = video_url = audio_url = None

        collection_resource.append({
            'collection_number': collection_number,
            'fulltext_file_url': fulltext_file_url,
            'fulltext_file_str': _fetch_fulltext(fulltext_file_url),
            'video_url': video_url,
            'audio_url': audio_url,
            **item_metadata,
        })
    l_resource.append(collection_resource)

In [None]:
# Flatten the per-item lists of resource records into one list, then build a
# DataFrame with one row per resource
flat_resource_records = []
for collection_records in l_resource:
    flat_resource_records.extend(collection_records)
df_resources = pd.DataFrame(flat_resource_records)
df_resources.head()

In [None]:
import re
from bs4 import BeautifulSoup

def clean_raw_transcript_str(fulltext_file_str: str) -> str:
    """Turn a raw XML transcript into a single speaker-tagged, cleaned string.

    Every ``<sp>`` element is rendered as ``<speaker>text</speaker> `` using
    the text of its first ``<speaker>`` and first ``<p>`` child. The joined
    result then has parenthesised, square-bracketed and curly-braced spans
    removed, runs of dashes and dots removed, and whitespace collapsed.

    Args:
        fulltext_file_str: Raw transcript markup.

    Returns:
        The cleaned transcript; an empty string when the input is not a
        string or contains no ``<sp>`` elements.
    """
    if not isinstance(fulltext_file_str, str):
        return ""

    # utilize bs4 xml parser
    soup = BeautifulSoup(fulltext_file_str, 'xml')

    l_transcript_lines = []
    # each sp tag in the document represents a "line" in the transcript
    for sp in soup.find_all('sp'):
        speaker_tag = sp.find('speaker')
        # placeholder speaker tag if not found
        speaker = speaker_tag.get_text(strip=True) if speaker_tag is not None else "speaker_unknown"

        # NOTE(review): only the first <p> inside each <sp> is used - confirm
        # that transcripts never carry several <p> elements per <sp>.
        p_tag = sp.find('p')
        # empty text if p tag not found
        spoken_text = p_tag.get_text(strip=True) if p_tag is not None else ""

        l_transcript_lines.append(f"<{speaker}>{spoken_text}</{speaker}> ")

    # merge lines into one string
    transcript_lines = ''.join(l_transcript_lines)

    # remove (), [], {} and anything in between
    transcript_lines_stripped = re.sub(r'\([^)]*\)', '', transcript_lines)
    transcript_lines_stripped = re.sub(r'\[[^]]*\]', '', transcript_lines_stripped)
    # The old pattern had a stray "\)" before the closing brace, so it only
    # matched "{...)}" and ordinary "{...}" spans were left in.
    transcript_lines_stripped = re.sub(r'\{[^}]*\}', '', transcript_lines_stripped)

    # remove double dashes and ellipsis
    transcript_lines_stripped = re.sub(r'--+', '', transcript_lines_stripped)
    transcript_lines_stripped = re.sub(r'\.{2,}', '', transcript_lines_stripped)

    # clean whitespace
    return re.sub(r'\s+', ' ', transcript_lines_stripped).strip()

def remove_speaker_tag(transcript_lines_stripped: str) -> str:
    """Strip every ``<...>`` tag from a speaker-tagged transcript.

    Args:
        transcript_lines_stripped: Transcript text containing ``<speaker>``
            style tags.

    Returns:
        The text with each ``<`` ... ``>`` span removed. Non-string input
        gives an empty string, so the result is always a ``str`` as annotated
        (it used to be the boolean ``False``).
    """
    if not isinstance(transcript_lines_stripped, str):
        return ""
    # remove <> and anything in between
    return re.sub(r'<[^>]*>', '', transcript_lines_stripped)
    
print("Extracting raw transcripts from dataframe")
# Cast to str first so every value (including missing ones) can be parsed
raw_transcripts = df_resources['fulltext_file_str'].astype(str)
cleaned_transcripts = raw_transcripts.apply(clean_raw_transcript_str)
df_resources['fulltext_file_str_cleaned'] = cleaned_transcripts
# Second pass: drop the speaker tags to leave plain text only
df_resources['transcript_raw_text_only'] = df_resources['fulltext_file_str_cleaned'].apply(remove_speaker_tag)

In [None]:
# Write the per-resource DataFrame to parquet (row index not written)
df_resources.to_parquet(path='../data/raw/loc/veterans_history_project_resources.parquet', index=False)

In [None]:
# Reload the per-resource DataFrame from its parquet file
df_resources = pd.read_parquet(path='../data/raw/loc/veterans_history_project_resources.parquet')

In [None]:
# function to download audio/video files from URLs, and store them under {base_dir}/{idx}
def download_media_files(df, base_dir='/Volumes/KINGSTON/veteran_interviews'):
    """Download one media file per row of ``df`` into ``base_dir/<index>/``.

    For each row the video URL is preferred over the audio URL; rows with
    neither are skipped. Files are saved as ``video.mp4`` or ``audio.mp3``.
    A file that already exists is skipped before any request is made.
    Downloads are streamed to a temporary ``.part`` file and renamed on
    success, so an interrupted download is never mistaken for a finished one.
    Failures are printed and do not stop the loop.

    Args:
        df: DataFrame whose rows may carry ``video_url`` and ``audio_url``.
        base_dir: Directory that receives one sub-directory per row index.
            Defaults to the location that was previously hard-coded.

    Returns:
        None.
    """
    import os

    os.makedirs(base_dir, exist_ok=True)

    for idx, row in df.iterrows():
        # Prefer video if available, otherwise use audio
        video_url = row.get('video_url')
        audio_url = row.get('audio_url')
        if pd.notnull(video_url) and video_url:
            media_type, media_url, ext = 'video', video_url, '.mp4'
        elif pd.notnull(audio_url) and audio_url:
            media_type, media_url, ext = 'audio', audio_url, '.mp3'
        else:
            print(f"No audio or video URL for index {idx}")
            continue

        # One sub-directory per index
        sub_dir = os.path.join(base_dir, str(idx))
        file_path = os.path.join(sub_dir, f'{media_type}{ext}')

        # Skip before touching the network if the file is already there
        if os.path.exists(file_path):
            print(f"File already exists for index {idx}: {media_type}{ext}")
            continue

        partial_path = file_path + '.part'
        try:
            # Previously a freshly created sub-directory meant the downloaded
            # bytes were never written; create it, then always save.
            os.makedirs(sub_dir, exist_ok=True)
            # Stream in chunks so large media files are not held in memory.
            with requests.get(media_url, timeout=60, stream=True) as response:
                response.raise_for_status()
                with open(partial_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=1024 * 1024):
                        if chunk:
                            f.write(chunk)
            os.replace(partial_path, file_path)
            print(f"Downloaded {media_type} file for index {idx} to {file_path}")
        except Exception as e:
            print(f"Failed to download {media_type} file for index {idx}: {e}")
            # Do not leave a truncated file behind.
            if os.path.exists(partial_path):
                os.remove(partial_path)
            
# Fetch the media file for every resource row
download_media_files(df=df_resources)

In [None]:
# # helper: create splits for df_resources (average size to run processing)
# def create_splits(df, split_size=1000):
#     splits = []
#     for i in range(0, len(df), split_size):
#         splits.append(df.iloc[i:i + split_size])
#     return splits
# # Create splits of the DataFrame
# splits = create_splits(df_resources, split_size=1000)

In [None]:
# helper to inspect raw transcript text structure
# import pprint
# pprint.pprint(df_resources['fulltext_file_str'].iloc[0])

In [None]:
from pprint import pprint

# Pretty-print the plain-text transcript of the row labelled 1
transcript_gt_sample = df_resources.loc[1, 'transcript_raw_text_only']
pprint(transcript_gt_sample)

In [None]:
# Transcript excerpt kept as a literal string.
# NOTE(review): "gt" presumably stands for ground truth and "120" for the
# length of the excerpt (e.g. the first 120 seconds) - confirm with the author.
transcript_gt_sample_120 = """
H. Marie Thomas. I'm speaking with what's your name? John Aaron, Jr. John Aaron, Jr. And you got your information on here, your birth date. 
I've got your address and information. 
I need the birth date. . Okay. August. And where were you born, city of birthplace? Little Rock, Arkansas All right. Okay.
What branch of service were you in? 
I was in the Navy first. You said first? Yes. I was in the Navy from 1941 to 1945. 
Then I went in the Reserve in I went back to school, and then in 1957 I received a commission, second lieutenant in the Army. 
Was that your highest ranking? I retired as a major. Oh, okay. Okay. 
Do you remember your serial number? Which one? You know, your social security number is this this is something you don't give out. Is this  No. It says serial number for service. 
It's different than social security number. Okay. That's fine. What battalion, regiment or division were you in? 
I was in the Navy. The first I was in the Navy.
"""