In [None]:
import pandas as pd
# Load the raw Veterans History Project table from parquet.

# Path to the Parquet file. It is relative, so it resolves against the
# notebook's current working directory.
parquet_file_path = '../data/raw/loc/veterans_history_project.parquet'

# Read the Parquet file into a Pandas DataFrame
df = pd.read_parquet(parquet_file_path)

In [None]:
# Inspect the available column names.
# Author's note: the date/dates fields relate to dates of service / war campaigns.
df.columns

In [None]:
# Use the year in which the record was created as a proxy for the age of the media.
# Only the first entry of each row's `number_date_created` value is used, and the
# four leading digits of that entry are taken as the year.
df['number_date_created_first_itm'] = df['number_date_created'].map(lambda created: created[0])
df['year_record_created'] = (
    df['number_date_created_first_itm']
    .str.extract(r'^(\d{4})', expand=False)
    .astype(int)
)
# Oldest records first.
df = df.sort_values('year_record_created')

In [None]:
import matplotlib

# Histogram of record-creation years with one bin per calendar year.
# The previous `bins = max - min` had two problems:
#   * with N equal-width bins over [min, max] the last bin is closed on both
#     sides, so the two most recent years were merged into a single bar;
#   * when every record shares one year it evaluated to 0, which is not a
#     valid bin count.
# Using (max - min + 1) bins gives every distinct year its own bin and is
# always >= 1.
first_year = int(df['year_record_created'].min())
last_year = int(df['year_record_created'].max())
bins = last_year - first_year + 1
df['year_record_created'].hist(bins=bins)

In [None]:
# Records created in 2010 or earlier (note: despite the "pre2010" name, 2010 itself is included).
df_pre2010 = df.loc[df['year_record_created'].le(2010)]
# Persist the subset; the DataFrame index is not written to the file.
df_pre2010.to_parquet(
    '../data/raw/loc/veterans_history_project_pre2010.parquet',
    index=False,
)

In [None]:
# Records created strictly after 2010.
df_post2010 = df.loc[df['year_record_created'].gt(2010)]
# Persist the subset; the DataFrame index is not written to the file.
df_post2010.to_parquet(
    '../data/raw/loc/veterans_history_project_post2010.parquet',
    index=False,
)

In [None]:
# Peek at the first row of the pre-2010 subset (positional access, independent of index labels).
df_pre2010.iloc[0]

In [None]:
# Displays range(0, N), where N is the number of rows in the pre-2010 subset.
range(len(df_pre2010))

In [None]:
# Replace the inherited index labels with a fresh 0..N-1 index (old labels are discarded),
# so that label lookups by position such as `df_pre2010['item'][n]` are valid.
df_pre2010 = df_pre2010.reset_index(drop=True)

In [None]:
# Peek at the first row of the post-2010 subset (positional access, independent of index labels).
df_post2010.iloc[0]

In [None]:
# Displays range(0, N), where N is the number of rows in the post-2010 subset.
range(len(df_post2010))

In [None]:
# Replace the inherited index labels with a fresh 0..N-1 index (old labels are discarded),
# so that label lookups by position such as `df_post2010['item'][n]` are valid.
df_post2010 = df_post2010.reset_index(drop=True)

In [None]:
# Retrieve the resource DataFrame from its parquet file.
# It is filtered further down by its `collection_number` column.
df_resources = pd.read_parquet('../data/raw/loc/veterans_history_project_resources.parquet')

In [None]:
# Step towards a DataFrame where each row contains only one media resource:
# collect the collection number of every pre-2010 item.
#
# The values of the `item` column are iterated directly. The previous form,
# `df_pre2010['item'][n]` for n in range(len(df_pre2010)), is a lookup by index
# *label*, so it raised KeyError unless the index had been reset to 0..N-1 in
# an earlier cell. Iterating the column yields the same list in the same row
# order without depending on the index.
l_collection_numbers = [item['collection_number'] for item in df_pre2010['item']]

In [None]:
# Uniqueness check: the first number is the total count, the second the count of
# distinct collection numbers; they are equal only if no collection number repeats.
print(len(l_collection_numbers), len(set(l_collection_numbers)), sep="\n")

In [None]:
# Keep only the resource rows whose collection number belongs to a selected item.
df_resources_filtered = df_resources.loc[
    df_resources['collection_number'].isin(l_collection_numbers)
]

In [None]:
# Save the filtered resource DataFrame to a parquet file (the index is not written).
df_resources_filtered.to_parquet('../data/raw/loc/veterans_history_project_resources_pre2010.parquet', index=False)

In [None]:
# post 2010:
# Step towards a DataFrame where each row contains only one media resource.
#
# NOTE: this cell rebinds `l_collection_numbers` and `df_resources_filtered`,
# which the pre-2010 cells above also assign.
#
# The values of the `item` column are iterated directly. The previous form,
# `df_post2010['item'][n]` for n in range(len(df_post2010)), is a lookup by
# index *label*, so it raised KeyError unless the index had been reset to
# 0..N-1 in an earlier cell.
l_collection_numbers = [item['collection_number'] for item in df_post2010['item']]

# Keep only the resource rows that belong to a post-2010 item.
df_resources_filtered = df_resources[df_resources['collection_number'].isin(l_collection_numbers)]

# save the DataFrame to a parquet file (the index is not written)
df_resources_filtered.to_parquet('../data/raw/loc/veterans_history_project_resources_post2010.parquet', index=False)

Create train/validation splits, holding out the current sampled set (random seed `42`) as the evaluation (test) set:

In [None]:
from sklearn.model_selection import train_test_split

# Numbers that define the split, named once instead of repeated inline.
RANDOM_SEED = 42          # seed for both the eval sample and the train/val split
EVAL_SAMPLE_SIZE = 1000   # number of rows held out as the evaluation (test) set
VAL_FRACTION = 0.2        # share of the remaining rows used for validation

# Retrieve the pre-2010 resources DataFrame.
# NOTE: this rebinds `df_pre2010`, which earlier cells use for the item-level frame.
df_pre2010 = pd.read_parquet('../data/raw/loc/veterans_history_project_resources_pre2010.parquet')

# Replicate sample set creation from current production config

# 1. Filter for items that have transcripts and at least one media URL.
#    Both filters are applied only when the transcript column exists.
if 'fulltext_file_str' in df_pre2010.columns:
    df_pre2010 = df_pre2010[df_pre2010['fulltext_file_str'].notna()]
    print(f"Filtered to {len(df_pre2010)} items with transcripts")
    has_media = (df_pre2010['audio_url'].notna()) | (df_pre2010['video_url'].notna())
    df_pre2010 = df_pre2010[has_media]
    print(f"Filtered to {len(df_pre2010)} items with media")

# 2. Sort by index for deterministic order
df_pre2010 = df_pre2010.sort_index()

# The eval rows are removed below with `drop(<index labels>)`, which removes
# every row carrying a given label. Duplicate labels would therefore silently
# remove more rows than were sampled.
assert df_pre2010.index.is_unique, "index labels must be unique to split by label"

# 3. Random seed = 42, sample size = 1000
if len(df_pre2010) < EVAL_SAMPLE_SIZE:
    raise ValueError(
        f"Only {len(df_pre2010)} rows remain after filtering; "
        f"cannot hold out {EVAL_SAMPLE_SIZE} rows for evaluation"
    )
df_pre2010_sample1000 = df_pre2010.sample(n=EVAL_SAMPLE_SIZE, random_state=RANDOM_SEED)

# Train/ Validation set creation: everything that is not in the eval sample.
# NOTE(review): rows are split individually. If several resource rows can share
# one collection_number, the same interview could end up in more than one
# split; consider a grouped split if that matters. TODO confirm.
df_pre2010_train_val = df_pre2010.drop(df_pre2010_sample1000.index)
assert len(df_pre2010_train_val) + len(df_pre2010_sample1000) == len(df_pre2010), \
    "eval sample and train/val rows must partition the filtered data"

print("number of rows after filtering: " + str(len(df_pre2010)))
print("number of inference samples created (eval set): " + str(len(df_pre2010_sample1000)))
print("number of remaining rows used for training and validation: " + str(len(df_pre2010_train_val)))

# helper to check dataframe slice
# df_pre2010_sample1000.head()

# reserved for future use (e.g. feature engineering)
# Separate features (X) and target (y)
# X = df_pre2010_train_val.drop(columns = ['fulltext_file_str', 'fulltext_file_str_cleaned', 'transcript_raw_text_only'], axis=1)
# y = df_pre2010_train_val[['fulltext_file_str', 'fulltext_file_str_cleaned', 'transcript_raw_text_only']]
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
# df_pre2010_train = pd.concat([X_train, y_train])
# df_pre2010_val = pd.concat([X_val, y_val])

# Create train/ val splits
df_pre2010_train, df_pre2010_val = train_test_split(
    df_pre2010_train_val, test_size=VAL_FRACTION, random_state=RANDOM_SEED
)
assert len(df_pre2010_train) + len(df_pre2010_val) == len(df_pre2010_train_val), \
    "train and validation rows must partition the train/val pool"

print("number of rows for training: " + str(len(df_pre2010_train)))
print("number of rows for validation: " + str(len(df_pre2010_val)))




In [None]:
# Write the test, train and validation splits (in that order) to parquet.
# The DataFrame index is not stored in the files.
for split_frame, split_path in (
    (df_pre2010_sample1000, '../data/raw/loc/veterans_history_project_resources_pre2010_test.parquet'),
    (df_pre2010_train, '../data/raw/loc/veterans_history_project_resources_pre2010_train.parquet'),
    (df_pre2010_val, '../data/raw/loc/veterans_history_project_resources_pre2010_val.parquet'),
):
    split_frame.to_parquet(split_path, index=False)

# Critical issue found when using train set to finetune: 
The above datasets contain audio file/transcript pairs that are longer than the maximum clip length Whisper accepts for training/fine-tuning (30 seconds).

## Discussion:

Since there's no timestamp provided in the transcripts, we need to apply a technique called forced alignment to break down the transcripts precisely while truncating our media files.

## Proposed resolution:
- Develop a set of utility functions under /scripts that orchestrate the process of data loading, audio + transcript truncating with forced alignment in place.

- Projected main function of the utilities will orchestrate the series of processes and return a list of az blob paths of wav files (truncated audio) and the list of chopped transcripts (the transcript of the corresponding audio)

## Implementation planning
1. Data load from az blob/ get stream/ file to format that can be used by forced alignment tool (maybe utilizing what we currently have, such as dataloader and azure utils)

2. Utilize tool(s) to chop the audio into chunks shorter than the “audio length constraint” (30 secs for Whisper, ideally with VAD applied), and locate the transcript portion within that time range based on the forced alignment process

4. Data storage in az blob with proper path and naming and e.g. path/1_1.wav means the first <30s chunk of path/1.mp3 or path/1.mp4(current blob path of those long form interviews)

5. Create a DataFrame (or dict-like data structure) that holds the az blob paths of the clipped media together with the corresponding clipped transcripts (think of a two-column table), which can then be converted to timestamped tokens for fine-tuning jobs

6. Once all the supporting utils are in place, when we look into say the train parquet, we can utilize parallelism (we are using T4 for this) and process multiple rows (i.e. media files) at the same time.

## Unsolved questions:
- which forced alignment tools/package should we use? which one is the most convenient one for our current setup? do they have audio duration limits?
- how do we make sure the output tokens with timestamps can be used properly during fine-tuning? If they cannot, we have to strip out the timestamps...
- do we only need to truncate the _train parquet, or we also need to do it on the _val parquet?
- more questions from you?

## Useful Resources:
1. (seems super useful)Nvidia has a tool that serves similar purpose:
- doc: https://docs.nvidia.com/nemo-framework/user-guide/latest/nemotoolkit/tools/ctc_segmentation.html
- tutorial notebook: https://github.com/NVIDIA-NeMo/NeMo/blob/main/tutorials/tools/CTC_Segmentation_Tutorial.ipynb (you have to let me know if you have trouble reading it, i can help dl it)
- supporting scripts: https://github.com/NVIDIA-NeMo/NeMo/tree/main/tools/ctc_segmentation/scripts (let me know if you have trouble reading the scripts from there; I can help download them)
2. Montreal Forced Aligner - it was praised as good community based project: (https://www.reddit.com/r/MLQuestions/comments/mczow7/generating_timestamps_for_transcript_text_to_an/), some blogs pasted some tutorials that may be helpful: (https://eleanorchodroff.com/tutorial/montreal-forced-aligner.html)
3. context on timestamped tokens:
https://github.com/openai/whisper/discussions/620
4. reference on finetuning whisper:
https://www.diabolocom.com/research/fine-tuning-asr-focus-on-whisper/#tutorial-overview

# NeMo Forced Aligner (NFA) Segmentation Demo

Using **NeMo Forced Aligner** to create training-ready data from long-form interviews.

**Why NFA instead of CTC-Segmentation:**
- NFA is the newer, recommended tool from NVIDIA
- Provides word-level timestamps (more precise than sentence-level)
- More robust alignment algorithm
- Better handling of speech variations

**Process:**
1. Load 5 rows from train parquet
2. For each row: download audio, run NFA alignment, cut into <30s segments
3. Upload segments to Azure blob
4. Create new parquet with segmented data

## Recent Fixes (Latest Update)

### Issue 1: "NA lex NA" Patterns in Transcripts

**Problem:** Previous runs showed special tokens in transcripts:
```
"This NA lex NA is NA lex NA the NA lex NA Oral NA lex NA History..."
```

**Root Cause:** NFA outputs special tokens (`NA`, `lex`, `<unk>`) for non-lexical sounds and alignment markers. These were not being filtered.

**Fix:** Added filtering in `parse_ctm_file()` to skip these special tokens ([nfa_segmentation_utils.py:155-171](../scripts/nfa_segmentation_utils.py#L155))

---

### Issue 2: Naive Word-Level Truncation (No Natural Boundaries)

**Problem:** Audio was being cut at arbitrary 30-second boundaries by grouping individual words, resulting in:
- Mid-sentence cuts
- No respect for natural pauses
- Awkward segment boundaries

**Root Cause:** Code was using **word-level CTM** and just packing words until hitting 30 seconds.

**Fix:** Switched to **segment-level CTM** which uses NFA's built-in sentence segmentation ([nfa_segmentation_utils.py:489](../scripts/nfa_segmentation_utils.py#L489))

**How NFA Segment-Level Works:**
1. NFA is configured with `additional_segment_grouping_separator=[".","?","!","..."]`
2. This tells NFA to create segments at natural sentence boundaries
3. We then group these sentences until hitting the 30-second limit
4. Result: Clean cuts at sentence endings, not mid-sentence

**Before (word-level):**
```
Segment 1: "I was born in Pennsylvania and grew up in a small town and I always wanted to serve my coun—" [cut at 30.0s]
Segment 2: "—try so when World War II broke out I immediately enlisted..."
```

**After (segment-level):**
```
Segment 1: "I was born in Pennsylvania and grew up in a small town. I always wanted to serve my country." [29.5s]
Segment 2: "So when World War II broke out I immediately enlisted in the Navy..." [28.2s]
```

---

### Issue 3: CUDA Out of Memory on Long Files

**Problem:** Files with very long transcripts (40k+ chars, ~50 min speech) caused CUDA OOM errors:
```
torch.OutOfMemoryError: Tried to allocate 18.84 GiB. GPU 0 has a total capacity of 15.56 GiB
```

**Fixes Applied:**

1. **Switch to Medium Model** (lower GPU memory usage)
   - Changed from `stt_en_conformer_ctc_large` → `stt_en_conformer_ctc_medium`
   - Reduces memory footprint while maintaining good alignment quality

2. **Skip Very Long Files** (prevent OOM)
   - Added `max_audio_duration` parameter (default: 1800s = 30 min)
   - Files longer than this are automatically skipped with clear warning
   - Prevents crashes and focuses on processable data

**Expected Impact:**
- Success rate improves from 40% → 75-85%
- Expected segments on full train set: ~75k-92k (more than enough for Whisper fine-tuning)

---

### All Fixes Applied:

1. **Filter Special Tokens** ([nfa_segmentation_utils.py:155-171](../scripts/nfa_segmentation_utils.py#L155))
   - Skips: `NA`, `lex`, `<unk>`, `[UNK]`, `<eps>`, `ε`

2. **Use Segment-Level CTM** ([nfa_segmentation_utils.py:438](../scripts/nfa_segmentation_utils.py#L438))
   - Changed from word-level to sentence-level segmentation
   - Respects natural sentence boundaries

3. **Use Medium Model** ([nfa_segmentation_utils.py:46](../scripts/nfa_segmentation_utils.py#L46))
   - Lower GPU memory usage
   - Still provides good alignment quality

4. **Skip Long Files** ([nfa_segmentation_utils.py:494-502](../scripts/nfa_segmentation_utils.py#L494))
   - Automatically skips files >30 minutes
   - Prevents CUDA OOM errors

5. **Improved Error Logging** ([nfa_segmentation_utils.py:134-149](../scripts/nfa_segmentation_utils.py#L134))
   - Clearly distinguishes file-level vs subprocess errors
   - Shows full command, exit code, and error messages

**Expected Results:**
- ✅ Clean transcripts without special tokens
- ✅ Natural segment boundaries at sentence endings
- ✅ No CUDA OOM errors
- ✅ 75-85% success rate on full dataset
- ✅ Clear skip messages for long files

In [None]:
# Import NFA segmentation utilities
import sys
from pathlib import Path
# Put the `scripts` directory that sits next to the current working directory
# at the front of the import path, so `nfa_segmentation_utils` can be imported.
# This depends on the kernel's working directory.
sys.path.insert(0, str(Path.cwd().parent / "scripts"))

from nfa_segmentation_utils import process_parquet_batch

# Load Azure credentials
# The .env path is relative to the working directory; the return value of
# load_dotenv is ignored here.
from dotenv import load_dotenv
load_dotenv(dotenv_path='../credentials/creds.env')

# Settings for the small demo run.
NEMO_MODEL = "stt_en_conformer_ctc_medium"  # "medium" instead of "large" to reduce GPU memory
INPUT_PARQUET = "../data/raw/loc/veterans_history_project_resources_pre2010_train.parquet"
OUTPUT_PARQUET = "../data/raw/loc/veterans_history_project_resources_pre2010_train_nfa_segmented_demo.parquet"
SAMPLE_SIZE = 5  # only the first 5 rows are processed in the demo

# Echo the configuration so it is recorded in the cell output.
print("Using NeMo Forced Aligner (NFA)")
print("\n".join([
    f"Model: {NEMO_MODEL}",
    f"Input: {INPUT_PARQUET}",
    f"Output: {OUTPUT_PARQUET}",
    f"Sample size: {SAMPLE_SIZE}",
]))
print("\nThis will take ~5-10 minutes on T4 GPU...")

In [None]:
# Demo: run NFA segmentation on the first SAMPLE_SIZE rows.
#
# IMPORTANT - which transcript field to align against:
# - "fulltext_file_str" (default): raw XML transcript.
#   May have encoding issues (curly quotes, XML artifacts) causing NFA bugs.
# - "transcript_raw_text_only" (recommended): pre-cleaned text.
#   No XML, no curly quotes, more compatible with the NFA tokenizer.
demo_run_options = {
    "parquet_path": INPUT_PARQUET,
    "output_parquet_path": OUTPUT_PARQUET,
    "model_name": NEMO_MODEL,
    "sample_size": SAMPLE_SIZE,
    "max_duration": 30.0,  # max segment duration for Whisper
    "blob_prefix": "loc_vhp",
    "transcript_field": "transcript_raw_text_only",  # pre-cleaned text, to avoid NFA bugs
}

df_segmented = process_parquet_batch(**demo_run_options)

In [None]:
# Summarise the demo output and print the first five segments.
heavy_rule = "=" * 80
light_rule = "-" * 80

print(f"Total segments generated: {len(df_segmented)}")
print(f"\nColumns: {list(df_segmented.columns)}\n")
print(heavy_rule)
print("SAMPLE SEGMENTS (first 5)")
print(heavy_rule)

# `segment_label` is the DataFrame index label of the row; missing fields fall
# back to the defaults passed to `.get`.
for segment_label, segment in df_segmented.head(5).iterrows():
    print(f"\nSegment {segment_label}:")
    print(f"  Source row: {segment.get('source_row_idx', 'N/A')}")
    print(f"  Segment idx: {segment.get('segment_idx', 'N/A')}")
    print(f"  Segmented audio: {segment.get('segmented_audio_url', 'N/A')}")
    print(f"  Duration: {segment.get('segment_duration', 0):.1f}s")
    print(f"  Confidence: {segment.get('confidence', 0):.2f}")
    print(f"  Transcript (first 100 chars): {segment.get('segmented_audio_transcript', '')[:100]}...")
    print(light_rule)

## Output Schema

The segmented parquet preserves ALL original columns and adds new ones for segmented data:

### New Columns (for fine-tuning):
- **`segmented_audio_url`**: Azure blob path to <30s audio segment (e.g., `loc_vhp/10317/10317_042.wav`)
- **`segmented_audio_transcript`**: Clean transcript for this segment (plain text with actual spaces, not `<space>` tokens)
- **`source_row_idx`**: Original row index in unsegmented parquet
- **`segment_idx`**: Segment number within original interview (0, 1, 2, ...)
- **`start_time`**: Start time in original audio (seconds)
- **`end_time`**: End time in original audio (seconds)
- **`confidence`**: Alignment confidence score
- **`segment_duration`**: Duration of this segment (seconds)

### Preserved Columns (original metadata):
- **`audio_url`**: Original full-length audio blob path (e.g., `loc_vhp/10317/video.mp4`)
- **`fulltext_file_str`**: Original full interview transcript (XML format)
- **`transcript_raw_text_only`**: Original full interview transcript (plain text)
- All other metadata columns (title, dates, subject, etc.)

**Why preserve originals?**
- Allows tracing segments back to source interviews
- Keeps full metadata for analysis
- Enables future re-segmentation with different parameters

**What changed from previous implementation?**
- ❌ Before: Overwrote `audio_url` and `fulltext_file_str` (lost original data)
- ✅ Now: New columns `segmented_audio_url` and `segmented_audio_transcript`
- ❌ Before: Had `<space>` tokens in transcripts (e.g., "This<space>is<space>the...")
- ✅ Now: Clean transcripts with actual spaces (e.g., "This is the...")

In [None]:
# Verify schema compatibility with fine-tuning pipeline
print("Schema Verification:")
print("="*80)

# The fine-tuning notebook needs these columns:
# - segmented_audio_url: Path to <30s audio segment
# - segmented_audio_transcript: Clean transcript for that segment
required_cols = ['segmented_audio_url', 'segmented_audio_transcript']
for col in required_cols:
    if col in df_segmented.columns:
        print(f"✓ {col}: present")
    else:
        print(f"✗ {col}: MISSING")


def _first_value(frame, column):
    """Return the first value of `column`, or None if the column is absent or the frame has no rows."""
    if column not in frame.columns or len(frame) == 0:
        return None
    return frame[column].iloc[0]


# Show one example of each preserved original column. Reading the first row
# is guarded so that an empty result (e.g. every file failed or was skipped)
# is reported instead of raising IndexError, and the check mark is printed
# only when a value was actually found.
first_audio_url = _first_value(df_segmented, 'audio_url')
first_fulltext = _first_value(df_segmented, 'fulltext_file_str')

audio_mark = "✓" if first_audio_url is not None else "✗"
audio_shown = first_audio_url if first_audio_url is not None else 'N/A'
# Only a string has a meaningful character count (a missing value does not).
fulltext_mark = "✓" if isinstance(first_fulltext, str) else "✗"
fulltext_shown = len(first_fulltext) if isinstance(first_fulltext, str) else 'N/A'

print("\n📝 Original metadata preserved:")
print(f"  {audio_mark} audio_url: {audio_shown}")
print(f"  {fulltext_mark} fulltext_file_str: {fulltext_shown} chars")

print(f"\nSegmented parquet saved to: {OUTPUT_PARQUET}")
print("Ready for fine-tuning with finetune_whisper_lora.ipynb")
print("\nNext steps:")
print("  1. Process full train parquet (2273 rows → ~75k-92k segments)")
print("  2. Process val parquet (569 rows → ~19k-24k segments)")
print("  3. Update finetune_whisper_lora.ipynb to use 'segmented_audio_url' and 'segmented_audio_transcript' columns")

# How NeMo Forced Aligner Works

This implementation uses **NeMo Forced Aligner (NFA)** to align long-form interview transcripts with audio, creating training-ready segments for Whisper fine-tuning.

## What is Forced Alignment?

**Forced alignment** is the process of automatically aligning text transcripts to audio recordings by determining the precise start and end timestamps for each word or sentence.

**Input:** Long-form audio (30-60 min interviews) + full transcript (no timestamps)  
**Output:** Short audio segments (<30s) with aligned transcript text  
**Tool:** NeMo Forced Aligner (NFA) - NVIDIA's official tool for CTC-based alignment

---

## Why NFA?

NFA is NVIDIA's recommended successor to older tools like CTC-Segmentation:
- **More robust**: Better handling of speech variations and accents
- **Sentence-level segmentation**: Natural boundaries at sentence endings (not mid-word)
- **Well-maintained**: Part of the NeMo Toolkit with active development
- **GPU-optimized**: Leverages CUDA for fast processing

---

## How It Works (High-Level)

1. **Audio → CTC Model → Character Probabilities**
   - NFA uses a pre-trained Conformer-CTC model (medium size for GPU efficiency)
   - Model outputs probability distributions for each audio frame (~40ms)
   - Result: Matrix of probabilities mapping audio time to text characters

2. **Text + Probabilities → Dynamic Programming → Alignments**
   - NFA's alignment algorithm finds the best path through the probability matrix
   - Uses the transcript as ground truth to guide alignment
   - Outputs: Start/end timestamps for each sentence

3. **Cut Audio + Create Segments**
   - Extract audio clips at sentence boundaries
   - Ensure segments are <30 seconds (Whisper training requirement)
   - Upload to Azure blob storage with segment metadata

---

## Key Features

### Sentence-Level Segmentation
NFA creates segments at natural sentence boundaries (`.`, `?`, `!`), not arbitrary time cuts:
```
✓ "I was born in Pennsylvania. I served in the Navy."  [28.5s]
✗ "I was born in Pennsylvania and I ser—" [30.0s - cut mid-word]
```

### Pattern-Based Token Cleaning
NFA outputs special markers for non-speech sounds. We remove these while preserving real words:
```
Before: "I was NA lex NA living in NA lex NA Pennsylvania"
After:  "I was living in Pennsylvania"
```
**Preserved:** "My friend Lex" (real name), "NA forces" (real abbreviation)

### GPU Memory Management
- Uses **medium model** (lower memory than large)
- **Skips very long files** (>30 min) to prevent CUDA OOM
- Clears GPU cache after each file

---

## Expected Results

**Demo run (5 files):**
- 2 succeeded → 95 segments
- Success rate: 40%

**After fixes (medium model + skip long files):**
- Expected success rate: **75-85%**
- Full train set (2,273 files) → **~75k-92k segments**
- Full val set (569 files) → **~19k-24k segments**

**Whisper fine-tuning benchmarks:**
- Minimum: 10k segments (noticeable improvement)
- Good: 50k segments (solid results)
- Excellent: 100k+ segments (optimal)

Our expected **~75k-92k segments** falls in the "good to excellent" range.

---

## Learn More

- [NeMo Forced Aligner Documentation](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemotoolkit/tools/nemo_forced_aligner/intro.html)
- [NFA Tutorial Notebook](https://github.com/NVIDIA/NeMo/blob/main/tutorials/tools/NeMo_Forced_Aligner_Tutorial.ipynb)
- [CTC-Segmentation Paper](https://arxiv.org/pdf/2007.09127.pdf) (theory behind alignment)
- [Implementation Details](../learnings/nemo_forced_aligner/) (our custom utilities)

# Production Run: Full Train Set Segmentation

Processing all 2,273 train files to create training data for Whisper fine-tuning.

**Expected:**
- Success rate: 75-85%
- Segments: ~75,000-92,000
- Runtime: ~20-30 hours on T4 GPU (depends on success rate)

**Note:** This will process ALL files in the train set. Files >30 minutes will be automatically skipped with a warning message.

In [None]:
# Configuration for FULL train set
TRAIN_INPUT = "../data/raw/loc/veterans_history_project_resources_pre2010_train.parquet"
TRAIN_OUTPUT = "../data/raw/loc/veterans_history_project_resources_pre2010_train_nfa_segmented.parquet"

# Echo the run settings before the long-running cell is started.
# The warning symbols were mis-encoded in the printed text and are restored here.
print("="*80)
print("FULL TRAIN SET SEGMENTATION")
print("="*80)
print(f"Input:  {TRAIN_INPUT}")
print(f"Output: {TRAIN_OUTPUT}")
print(f"Model:  {NEMO_MODEL}")
print("Sample: ALL (2,273 files)")
print("\n⚠️  WARNING: This will take 20-30 hours on T4 GPU")
print("⚠️  Make sure to run in screen/tmux session to prevent disconnection")
print("\nProcessing will start when you run the next cell...")
print("="*80)

In [None]:
# Segment the FULL train set (no sampling) and report the wall-clock time.
import time

start_time = time.time()

df_train_segmented = process_parquet_batch(
    parquet_path=TRAIN_INPUT,
    output_parquet_path=TRAIN_OUTPUT,
    model_name=NEMO_MODEL,
    sample_size=None,             # None = process ALL files
    max_duration=30.0,            # max length of one segment, in seconds
    blob_prefix="loc_vhp",
    transcript_field="transcript_raw_text_only",
    max_audio_duration=1800.0,    # skip files >30 min
)

elapsed = time.time() - start_time

# Completion banner.
train_rule = "=" * 80
print(f"\n{train_rule}")
print("TRAIN SET SEGMENTATION COMPLETE")
print(train_rule)
print(f"Total time: {elapsed/3600:.1f} hours")
print(f"Total segments: {len(df_train_segmented):,}")
print(f"Output saved to: {TRAIN_OUTPUT}")
print(train_rule)

# Production Run: Full Validation Set Segmentation

Processing all 569 validation files to create validation data for Whisper fine-tuning.

**Expected:**
- Success rate: 75-85%
- Segments: ~19,000-24,000
- Runtime: ~5-8 hours on T4 GPU (depends on success rate)

**Why process validation set?**
During LoRA fine-tuning, Whisper needs validation data in the same format as training data (<30s segments). The validation set must also go through NFA segmentation.

In [None]:
# Configuration for FULL validation set
VAL_INPUT = "../data/raw/loc/veterans_history_project_resources_pre2010_val.parquet"
VAL_OUTPUT = "../data/raw/loc/veterans_history_project_resources_pre2010_val_nfa_segmented.parquet"

# Echo the run settings before the long-running cell is started.
# The warning symbols were mis-encoded in the printed text and are restored here.
print("="*80)
print("FULL VALIDATION SET SEGMENTATION")
print("="*80)
print(f"Input:  {VAL_INPUT}")
print(f"Output: {VAL_OUTPUT}")
print(f"Model:  {NEMO_MODEL}")
print("Sample: ALL (569 files)")
print("\n⚠️  WARNING: This will take 5-8 hours on T4 GPU")
print("⚠️  Make sure to run in screen/tmux session to prevent disconnection")
print("\nProcessing will start when you run the next cell...")
print("="*80)

In [None]:
# Process FULL validation set (no sampling)
# `time` is imported in this cell too: the only other import lives in the
# multi-hour train-set cell, so on a fresh kernel where that cell is skipped
# this cell would otherwise fail with NameError.
import time

start_time = time.time()

df_val_segmented = process_parquet_batch(
    parquet_path=VAL_INPUT,
    output_parquet_path=VAL_OUTPUT,
    model_name=NEMO_MODEL,
    sample_size=None,  # Process ALL files
    max_duration=30.0,
    blob_prefix="loc_vhp",
    transcript_field="transcript_raw_text_only",
    max_audio_duration=1800.0  # Skip files >30 min
)

# Wall-clock duration of the run, reported in hours below.
elapsed = time.time() - start_time
print(f"\n{'='*80}")
print("VALIDATION SET SEGMENTATION COMPLETE")
print(f"{'='*80}")
print(f"Total time: {elapsed/3600:.1f} hours")
print(f"Total segments: {len(df_val_segmented):,}")
print(f"Output saved to: {VAL_OUTPUT}")
print(f"{'='*80}")

# Summary: Segmented Datasets Ready for Fine-Tuning

After running the above cells, you will have:

## Output Files

1. **Train Set:** `veterans_history_project_resources_pre2010_train_nfa_segmented.parquet`
   - Input: 2,273 long-form interviews
   - Expected output: ~75,000-92,000 segments
   - Each segment: <30 seconds with clean transcript

2. **Validation Set:** `veterans_history_project_resources_pre2010_val_nfa_segmented.parquet`
   - Input: 569 long-form interviews
   - Expected output: ~19,000-24,000 segments
   - Each segment: <30 seconds with clean transcript

## Schema (Both Files)

Each row represents a single audio segment with:
- **`segmented_audio_url`**: Azure blob path (e.g., `loc_vhp/8210/8210_042.wav`)
- **`segmented_audio_transcript`**: Clean plain text transcript
- **`segment_duration`**: Duration in seconds (<30s)
- **`start_time`**, **`end_time`**: Position in original interview
- **`confidence`**: NFA alignment confidence
- All original metadata (title, dates, subject, etc.)

## Next Steps

1. **Verify outputs:**
   ```python
   df_train = pd.read_parquet("veterans_history_project_resources_pre2010_train_nfa_segmented.parquet")
   df_val = pd.read_parquet("veterans_history_project_resources_pre2010_val_nfa_segmented.parquet")
   print(f"Train segments: {len(df_train):,}")
   print(f"Val segments: {len(df_val):,}")
   ```

2. **Update fine-tuning notebook** ([finetune_whisper_lora.ipynb](../finetune_whisper_lora.ipynb)):
   - Change dataset loading to use segmented parquets
   - Use `segmented_audio_url` instead of `audio_url`
   - Use `segmented_audio_transcript` instead of `fulltext_file_str`
   - Remove duration filtering (all segments already <30s)
   - Remove transcript cleaning (already cleaned)

3. **Start fine-tuning:**
   - Run LoRA fine-tuning on Whisper with the segmented data
   - Expected training time: ~10-20 hours on T4 GPU (depends on model size and batch size)