In [None]:
import pandas as pd
# Load the raw Veterans History Project export into memory.

# Relative path to the raw parquet file; every other file written by this
# notebook lives in the same ../data/raw/loc/ directory.
parquet_file_path = '../data/raw/loc/veterans_history_project.parquet'

# Read the whole file into `df`, the base DataFrame the following cells work on.
df = pd.read_parquet(parquet_file_path)

In [None]:
# column names checker
# note: the date/dates fields are related to dates of service / war campaigns
df.columns

In [None]:
# Use the year a record was created as a proxy for the age of its media.
# 1) keep only the first entry of each `number_date_created` value
first_created = df['number_date_created'].apply(lambda created: created[0])
df['number_date_created_first_itm'] = first_created
# 2) the four leading digits of that entry become the integer year
leading_year = df['number_date_created_first_itm'].str.extract(r'^(\d{4})', expand=False)
df['year_record_created'] = leading_year.astype(int)
# 3) order the records from the oldest to the newest creation year
df = df.sort_values('year_record_created')

In [None]:
import matplotlib
# Histogram of record-creation years with exactly one bin per calendar year.
# A bin count of (max - min) over the default [min, max] range would fold the
# last two years into one closed bin, and would be 0 (an error) when every
# record shares the same year. Using (max - min + 1) bins over [min, max + 1]
# gives each year its own unit-wide bin and always at least one bin.
year_min = int(df['year_record_created'].min())
year_max = int(df['year_record_created'].max())
bins = year_max - year_min + 1
# `range` is forwarded by pandas to matplotlib's hist
df['year_record_created'].hist(bins=bins, range=(year_min, year_max + 1))

In [None]:
# Records created in 2010 or earlier (note: 2010 itself is included here).
created_up_to_2010 = df['year_record_created'] <= 2010
df_pre2010 = df[created_up_to_2010]
# Persist the subset; the DataFrame index is not written.
df_pre2010.to_parquet(
    '../data/raw/loc/veterans_history_project_pre2010.parquet',
    index=False,
)

In [None]:
# Records created strictly after 2010.
created_after_2010 = df['year_record_created'] > 2010
df_post2010 = df[created_after_2010]
# Persist the subset; the DataFrame index is not written.
df_post2010.to_parquet(
    '../data/raw/loc/veterans_history_project_post2010.parquet',
    index=False,
)

In [None]:
# Inspect the first row (by position) of the pre-2010 subset.
df_pre2010.iloc[0]

In [None]:
# Display the positional range 0..len-1 of the pre-2010 subset (shows its row count).
range(len(df_pre2010))

In [None]:
# Swap the index labels inherited from the sorted/filtered `df` for a fresh
# 0..n-1 index so that label-based and positional lookups agree.
df_pre2010 = df_pre2010.reset_index(drop=True)

In [None]:
# Inspect the first row (by position) of the post-2010 subset.
df_post2010.iloc[0]

In [None]:
# Display the positional range 0..len-1 of the post-2010 subset (shows its row count).
range(len(df_post2010))

In [None]:
# Swap the index labels inherited from the sorted/filtered `df` for a fresh
# 0..n-1 index so that label-based and positional lookups agree.
df_post2010 = df_post2010.reset_index(drop=True)

In [None]:
# retrieve resource DataFrame from the parquet file
# (its `collection_number` column is what the later cells filter on)
df_resources = pd.read_parquet('../data/raw/loc/veterans_history_project_resources.parquet')

In [None]:
# reconstruct the dataframe where each row contains only one media resource
# Step 1: gather the collection number stored in every pre-2010 `item` entry.
# Iterating over the column values directly (rather than looking rows up by
# the integer label n) keeps this cell correct whatever the DataFrame's index
# looks like, so it no longer depends on the index having been reset first.
l_collection_numbers = [
    item['collection_number'] for item in df_pre2010['item']
]

In [None]:
# Uniqueness check: the two printed counts match only when no collection
# number occurs more than once.
n_collected = len(l_collection_numbers)
print(n_collected)
n_distinct = len(set(l_collection_numbers))
print(n_distinct)

In [None]:
# Keep only the resource rows whose collection number was gathered above.
in_selected_collections = df_resources['collection_number'].isin(l_collection_numbers)
df_resources_filtered = df_resources[in_selected_collections]

In [None]:
# save the filtered resource DataFrame to the pre-2010 resources parquet file
# (index=False: the DataFrame index is not written to the file)
df_resources_filtered.to_parquet('../data/raw/loc/veterans_history_project_resources_pre2010.parquet', index=False)

In [None]:
# post 2010:
# reconstruct the dataframe where each row contains only one media resource
# NOTE: this cell rebinds `l_collection_numbers` and `df_resources_filtered`,
# so after it runs both names refer to the post-2010 data.
# Iterate over the column values directly (not by integer label n) so the
# result does not depend on the DataFrame's index having been reset first.
l_collection_numbers = [
    item['collection_number'] for item in df_post2010['item']
]
df_resources_filtered = df_resources[df_resources['collection_number'].isin(l_collection_numbers)]
# save the DataFrame to a parquet file (index is not written)
df_resources_filtered.to_parquet('../data/raw/loc/veterans_history_project_resources_post2010.parquet', index=False)

Create train/validation splits, holding out the current sampled set (random seed `42`) as the evaluation (test) set:

In [None]:
from sklearn.model_selection import train_test_split

# Split parameters, named so the whole recipe can be read (and tuned) in one place.
EVAL_SAMPLE_SIZE = 1000  # rows held out as the evaluation (test) set
RANDOM_SEED = 42         # seed for both the eval sampling and the train/val split
VAL_FRACTION = 0.2       # share of the non-eval rows that goes to validation

# Retrieve the pre-2010 resources dataframe.
# NOTE: from here on `df_pre2010` holds resource rows (one media resource per
# row) read back from disk, not the item-level frame built earlier.
df_pre2010 = pd.read_parquet('../data/raw/loc/veterans_history_project_resources_pre2010.parquet')

# Replicate sample set creation from current production config

# 1. Filter for items that have transcripts, then for items that have audio
#    or video. Both filters run only when the transcript column is present.
if 'fulltext_file_str' in df_pre2010.columns:
    df_pre2010 = df_pre2010[df_pre2010['fulltext_file_str'].notna()]
    print(f"Filtered to {len(df_pre2010)} items with transcripts")
    has_media = (df_pre2010['audio_url'].notna()) | (df_pre2010['video_url'].notna())
    df_pre2010 = df_pre2010[has_media]
    print(f"Filtered to {len(df_pre2010)} items with media")

# 2. Sort by index for deterministic order
df_pre2010 = df_pre2010.sort_index()

# 3. Random seed = RANDOM_SEED, sample size = EVAL_SAMPLE_SIZE
#    (the variable name keeps its historical "1000" suffix; other cells use it)
df_pre2010_sample1000 = df_pre2010.sample(n=EVAL_SAMPLE_SIZE, random_state=RANDOM_SEED)

# Train/ Validation set creation: everything that was not sampled for eval.
df_pre2010_train_val = df_pre2010.drop(df_pre2010_sample1000.index)

# `drop` removes rows by index label, so duplicated labels would silently
# remove extra rows. Make sure eval + remainder is an exact partition.
assert len(df_pre2010_sample1000) + len(df_pre2010_train_val) == len(df_pre2010), \
    "eval sample and train/val remainder do not partition the filtered rows"

print(f"number of rows after filtering: {len(df_pre2010)}")
print(f"number of inference samples created (eval set): {len(df_pre2010_sample1000)}")
print(f"number of remaining rows used for training and validation: {len(df_pre2010_train_val)}")

# helper to check dataframe slice
# df_pre2010_sample1000.head()

# reserved for future use (e.g. feature engineering)
# Separate features (X) and target (y)
# X = df_pre2010_train_val.drop(columns=['fulltext_file_str', 'fulltext_file_str_cleaned', 'transcript_raw_text_only'])
# y = df_pre2010_train_val[['fulltext_file_str', 'fulltext_file_str_cleaned', 'transcript_raw_text_only']]
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=VAL_FRACTION, random_state=RANDOM_SEED)
# (re-joining features and targets must be column-wise, hence axis=1)
# df_pre2010_train = pd.concat([X_train, y_train], axis=1)
# df_pre2010_val = pd.concat([X_val, y_val], axis=1)

# Create train/ val splits
df_pre2010_train, df_pre2010_val = train_test_split(
    df_pre2010_train_val, test_size=VAL_FRACTION, random_state=RANDOM_SEED
)

print(f"number of rows for training: {len(df_pre2010_train)}")
print(f"number of rows for validation: {len(df_pre2010_val)}")



In [None]:
# Write the test, train and validation frames to their parquet files, in that
# order; the DataFrame index is not written.
split_outputs = [
    (df_pre2010_sample1000, '../data/raw/loc/veterans_history_project_resources_pre2010_test.parquet'),
    (df_pre2010_train, '../data/raw/loc/veterans_history_project_resources_pre2010_train.parquet'),
    (df_pre2010_val, '../data/raw/loc/veterans_history_project_resources_pre2010_val.parquet'),
]
for split_frame, split_path in split_outputs:
    split_frame.to_parquet(split_path, index=False)

# Critical issue found when using train set to finetune:
The datasets above contain audio/transcript pairs that exceed the maximum clip length Whisper accepts for training/fine-tuning (30 seconds).

## Discussion:

Since there's no timestamp provided in the transcripts, we need to apply a technique called forced alignment to break down the transcripts precisely while truncating our media files.

## Proposed resolution:
- Develop a set of utility functions under /scripts that orchestrate the process of data loading, audio + transcript truncating with forced alignment in place.

- Projected main function of the utilities will orchestrate the series of processes and return a list of az blob paths of wav files (truncated audio) and the list of chopped transcripts (the transcript of the corresponding audio)

## Implementation planning
1. Data load from az blob/ get stream/ file to format that can be used by forced alignment tool (maybe utilizing what we currently have, such as dataloader and azure utils)

2. Utilize tool(s) to chop audio to be chunks less than the “audio length constraint”(30 secs for whisper, with VAD applied ideally), and locate transcript portion within that time range based on forced alignment process

3. Store the chunks in Azure blob storage with a consistent path and naming scheme, e.g. `path/1_1.wav` is the first <30s chunk of `path/1.mp3` or `path/1.mp4` (the current blob path of the long-form interview).

4. Create a DataFrame (or dict-like structure) that holds the Azure blob path of each clipped media file alongside its corresponding clipped transcript (picture a two-column table); this can then be converted to timestamped tokens for fine-tuning jobs.

5. Once all the supporting utilities are in place, process a parquet (say, the train parquet) with parallelism (we are using a T4 for this), handling multiple rows (i.e. media files) at the same time.

## Unsolved questions:
- which forced alignment tools/package should we use? which one is the most convenient one for our current setup? do they have audio duration limits?
- how to make sure the output tokens with timestamps can be used properly during fine-tuning? if not, we have to strip out the timestamps...
- do we only need to truncate the _train parquet, or we also need to do it on the _val parquet?
- more questions from you?

## Useful Resources:
1. (seems super useful) NVIDIA has a tool that serves a similar purpose:
- doc: https://docs.nvidia.com/nemo-framework/user-guide/latest/nemotoolkit/tools/ctc_segmentation.html
- tutorial notebook: https://github.com/NVIDIA-NeMo/NeMo/blob/main/tutorials/tools/CTC_Segmentation_Tutorial.ipynb (let me know if you have trouble reading it; I can help download it)
- supporting scripts: https://github.com/NVIDIA-NeMo/NeMo/tree/main/tools/ctc_segmentation/scripts (let me know if you have trouble reading the scripts there; I can help download them)
2. Montreal Forced Aligner - it was praised as good community based project: (https://www.reddit.com/r/MLQuestions/comments/mczow7/generating_timestamps_for_transcript_text_to_an/), some blogs pasted some tutorials that may be helpful: (https://eleanorchodroff.com/tutorial/montreal-forced-aligner.html)
3. context on timestamped tokens:
https://github.com/openai/whisper/discussions/620
4. reference on finetuning whisper:
https://www.diabolocom.com/research/fine-tuning-asr-focus-on-whisper/#tutorial-overview

# CTC Segmentation Demo

Using NeMo CTC Segmentation to create training-ready data from long-form interviews.

**Process:**
1. Load 5 rows from train parquet
2. For each row: download audio, run CTC alignment, cut into <30s segments
3. Upload segments to Azure blob
4. Create new parquet with segmented data

In [None]:
# Import segmentation utilities
import sys
import warnings
from pathlib import Path

# Make ../scripts importable. The membership check keeps the cell idempotent:
# re-running it does not push another copy of the path onto sys.path.
SCRIPTS_DIR = str(Path.cwd().parent / "scripts")
if SCRIPTS_DIR not in sys.path:
    sys.path.insert(0, SCRIPTS_DIR)

from ctc_segmentation_utils import process_parquet_batch

# Load Azure credentials
from dotenv import load_dotenv
CREDS_ENV_PATH = '../credentials/creds.env'
# load_dotenv reports False when it found nothing to load (e.g. the file is
# missing or empty); surface that here instead of continuing silently.
if not load_dotenv(dotenv_path=CREDS_ENV_PATH):
    warnings.warn(
        f"No environment variables were loaded from {CREDS_ENV_PATH}; "
        "steps that need credentials may fail."
    )

# Configuration
NEMO_MODEL = "stt_en_conformer_ctc_large"  # Will auto-download on first run
INPUT_PARQUET = "../data/raw/loc/veterans_history_project_resources_pre2010_train.parquet"
OUTPUT_PARQUET = "../data/raw/loc/veterans_history_project_resources_pre2010_train_segmented_demo.parquet"
SAMPLE_SIZE = 5  # Process first 5 rows for demo

print(f"Model: {NEMO_MODEL}")
print(f"Input: {INPUT_PARQUET}")
print(f"Output: {OUTPUT_PARQUET}")
print(f"Sample size: {SAMPLE_SIZE}")
print("\nThis will take ~5-10 minutes on T4 GPU...")

In [None]:
# Run CTC segmentation on the demo sample (SAMPLE_SIZE rows of INPUT_PARQUET).
# The arguments are collected first so the run configuration can be inspected.
segmentation_args = {
    "parquet_path": INPUT_PARQUET,
    "output_parquet_path": OUTPUT_PARQUET,
    "model_name": NEMO_MODEL,
    "sample_size": SAMPLE_SIZE,
    "max_duration": 30.0,    # Max segment duration for Whisper
    "min_confidence": -2.0,  # CTC confidence threshold
    "blob_prefix": "loc_vhp",
}
df_segmented = process_parquet_batch(**segmentation_args)

In [None]:
# Summarise the segmentation output, then print the first five segments.
heavy_rule = "=" * 80
light_rule = "-" * 80

print(f"Total segments generated: {len(df_segmented)}")
print(f"\nColumns: {list(df_segmented.columns)}\n")
print(heavy_rule)
print("SAMPLE SEGMENTS (first 5)")
print(heavy_rule)

preview = df_segmented.head(5)
for seg_label, segment in preview.iterrows():
    # .get() falls back to the given default only when the column is absent.
    print(f"\nSegment {seg_label}:")
    print(f"  Source row: {segment.get('source_row_idx', 'N/A')}")
    print(f"  Segment idx: {segment.get('segment_idx', 'N/A')}")
    print(f"  Audio URL: {segment.get('audio_url', 'N/A')}")
    print(f"  Duration: {segment.get('segment_duration', 0):.1f}s")
    print(f"  Confidence: {segment.get('confidence', 0):.2f}")
    print(f"  Text (first 100 chars): {segment.get('fulltext_file_str', '')[:100]}...")
    print(light_rule)

In [None]:
# Report whether the columns the fine-tuning pipeline expects are present.
# This only prints a status line per column; it does not raise when one is missing.
print("Schema Verification:")
print("=" * 80)

required_cols = ['audio_url', 'fulltext_file_str']
available_cols = set(df_segmented.columns)
for col in required_cols:
    status_line = f"✓ {col}: present" if col in available_cols else f"✗ {col}: MISSING"
    print(status_line)

print(f"\nSegmented parquet saved to: {OUTPUT_PARQUET}")
print("Ready for fine-tuning with finetune_whisper_lora.ipynb")
print("\nNext steps:")
print("  1. Process full train parquet (2273 rows → ~100k segments)")
print("  2. Process val parquet (569 rows → ~25k segments)")
print("  3. Use segmented parquets in fine-tuning notebooks")