# ParlaMint Data Processing Pipeline (Google Colab Version)

⚡ **Optimized for GPU execution on Google Colab**

Automated pipeline for processing ParlaMint 5.0 data with GPU acceleration.

## Setup: Mount Google Drive & Install Packages

**Before running:**
1. Upload your data folder to Google Drive
2. Update `BASE_DATA_DIR` path below to match your Drive location
3. Enable GPU: Runtime → Change runtime type → Hardware accelerator → GPU

In [1]:
# Mount Google Drive at /content/drive so the data folders configured later in
# this notebook are reachable from the runtime.
# NOTE(review): `google.colab` is presumably only importable inside a Colab
# runtime -- confirm before running this cell elsewhere.
from google.colab import drive
drive.mount('/content/drive')

print("‚úÖ Google Drive mounted at /content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
‚úÖ Google Drive mounted at /content/drive


In [2]:
# Install required packages.
# The leading "!" is an IPython shell escape, so this cell only runs inside a
# notebook; "-q" keeps pip's output short.
!pip install -q sentence-transformers scikit-learn

print("‚úÖ Packages installed")

‚úÖ Packages installed


In [None]:
# Report which CUDA device (if any) torch can see, plus its total memory in GB.
import torch

if not torch.cuda.is_available():
    print("‚ö†Ô∏è  WARNING: No GPU detected!")
else:
    gpu_name = torch.cuda.get_device_name(0)
    total_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"‚úÖ GPU available: {gpu_name}")
    print(f"   Memory: {total_gb:.1f} GB")

In [None]:
# Configuration
import os
import pandas as pd
import numpy as np
import torch
import warnings
import pickle
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity

# Show every column when a DataFrame is displayed, and silence all warnings
# for the rest of the session.
pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')

# Root of the data on the mounted Drive; update this to match your own layout.
BASE_DATA_DIR = "/content/drive/MyDrive/thesis/data"
# Intermediate per-step pickles are written here.
CHECKPOINT_DIR = os.path.join(BASE_DATA_DIR, "checkpoints")
# Final per-country outputs are written here.
OUTPUT_DIR = os.path.join(BASE_DATA_DIR, "processed")
# Create both folders up front; exist_ok makes re-running the cell harmless.
os.makedirs(CHECKPOINT_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Per-country settings, keyed by country code.
#   name            - label used in progress messages
#   bilingual       - when True, 'native_path' is loaded in addition to 'english_path'
#   english_path    - folder containing the year sub-folders of the English corpus
#   native_path     - same for the original-language corpus (bilingual entries only)
#   native_keywords - lowercase substrings searched for in chairperson speeches of
#                     the native text; None when there is no native corpus
# The keyword lists must contain real Unicode characters (ä, č, ć): a
# mis-encoded entry such as a mojibake form of "nächster" can never match text.
CONFIG = {
    'AT': {
        'name': 'Austria',
        'bilingual': True,
        'english_path': os.path.join(BASE_DATA_DIR, "AT", "ParlaMint-AT-en.ana", "ParlaMint-AT-en.txt"),
        'native_path': os.path.join(BASE_DATA_DIR, "AT", "ParlaMint-AT", "ParlaMint-AT.txt"),
        'native_keywords': ['tagesordnung', 'tagesordnungspunkt', 'punkt', 'verhandlung', 'behandlung', 'nächster', 'weiter', 'fortsetzen']
    },
    'HR': {
        'name': 'Croatia',
        'bilingual': True,
        'english_path': os.path.join(BASE_DATA_DIR, "HR", "ParlaMint-HR-en.ana", "ParlaMint-HR-en.txt"),
        'native_path': os.path.join(BASE_DATA_DIR, "HR", "ParlaMint-HR", "ParlaMint-HR.txt"),
        'native_keywords': ['dnevni', 'red', 'točka', 'tačka', 'sljedeći', 'sljedeće', 'prijedlog', 'zakon', 'tema', 'nastavljamo', 'prelazimo']
    },
    'GB': {
        'name': 'Great Britain',
        'bilingual': False,
        'english_path': os.path.join(BASE_DATA_DIR, "GB", "ParlaMint-GB", "ParlaMint-GB.txt"),
        'native_keywords': None
    }
}

# Lowercase substrings searched for in chairperson speeches of the English text.
ENGLISH_KEYWORDS = ['agenda', 'proceed', 'point', 'item', 'topic', 'next', 'following', 'move on']

print(f"‚úÖ Config loaded | Data: {BASE_DATA_DIR} | GPU batch: 128")

## Step 1: Data Loading

Load parliamentary speeches from year-based folder structure.

In [None]:
def load_parlamint_data(parent_folder):
    """Read every speech found under the sub-folders of ``parent_folder``.

    Each sub-folder is scanned for ``*-meta.tsv`` files (``*-ana-meta.tsv`` is
    ignored). A metadata file is used only when a sibling ``.txt`` file with
    the same base name exists; that file holds one ``ID<TAB>text`` pair per
    line. The text is attached to the metadata rows through their ``ID``
    column and rows without non-blank text are dropped.

    Returns a single DataFrame with a ``Text`` column, or ``None`` when the
    folder is missing, has no sub-folders, or yields no speeches. A file that
    fails to load is reported and skipped.
    """
    if not os.path.exists(parent_folder):
        return None

    year_folders = sorted(
        entry for entry in os.listdir(parent_folder)
        if os.path.isdir(os.path.join(parent_folder, entry))
    )
    if not year_folders:
        return None

    print(f"  Loading {len(year_folders)} years: {year_folders[0]}-{year_folders[-1]}")

    frames = []
    for year in year_folders:
        year_dir = os.path.join(parent_folder, year)

        for meta_name in os.listdir(year_dir):
            # Only plain metadata files; the "-ana" variants are skipped.
            if not meta_name.endswith('-meta.tsv') or meta_name.endswith('-ana-meta.tsv'):
                continue

            speech_file = os.path.join(year_dir, meta_name.replace('-meta.tsv', '') + '.txt')
            if not os.path.exists(speech_file):
                continue

            try:
                meta = pd.read_csv(
                    os.path.join(year_dir, meta_name),
                    sep='\t',
                    encoding='utf-8',
                    index_col=False,
                )

                # Lines that do not split into exactly "ID<TAB>text" are ignored.
                with open(speech_file, encoding='utf-8') as handle:
                    fields = (row.strip().split('\t', 1) for row in handle)
                    id_to_text = {pair[0]: pair[1] for pair in fields if len(pair) == 2}

                meta['Text'] = meta['ID'].map(id_to_text)
                has_text = meta['Text'].notnull() & (meta['Text'].str.strip() != '')
                meta = meta[has_text]

                if len(meta) > 0:
                    frames.append(meta)
            except Exception as e:
                print(f"    Error {meta_name}: {e}")

    if not frames:
        return None

    df_all = pd.concat(frames, ignore_index=True)
    print(f"  ‚úÖ {len(df_all):,} speeches")
    return df_all


# Step 1 driver: build `raw_data` (country code -> DataFrame with Text_English
# and Text_Native columns) and record each country's 'mode' in CONFIG.
checkpoint_file = os.path.join(CHECKPOINT_DIR, 'step1_raw_data.pkl')

if not os.path.exists(checkpoint_file):
    print("üîÑ Loading data...")
    raw_data = {}

    for code, config in CONFIG.items():
        print(f"\n{config['name']} ({code})")

        df_english = load_parlamint_data(config['english_path'])
        # The native corpus is only looked up for bilingual entries.
        df_native = load_parlamint_data(config['native_path']) if config['bilingual'] else None

        if df_english is None and df_native is None:
            print("  ‚ùå No data")
            continue

        if df_english is not None:
            df = df_english.copy().rename(columns={'Text': 'Text_English'})
            if df_native is not None:
                # English rows are the base; native text is attached by speech ID.
                native_text = df_native[['ID', 'Text']].rename(columns={'Text': 'Text_Native'})
                df = df.merge(native_text, on='ID', how='left')
                config['mode'] = 'bilingual'
            else:
                df['Text_Native'] = None
                config['mode'] = 'english_only'
        else:
            df = df_native.copy().rename(columns={'Text': 'Text_Native'})
            df['Text_English'] = None
            config['mode'] = 'native_only'

        raw_data[code] = df
        print(f"  ‚úÖ {config['mode'].upper()}: {len(df):,} speeches")

    with open(checkpoint_file, 'wb') as f:
        pickle.dump(raw_data, f)
else:
    print("üìÇ Loading checkpoint...")
    with open(checkpoint_file, 'rb') as f:
        raw_data = pickle.load(f)

    # 'mode' is not stored in the checkpoint, so derive it from the columns.
    for code, df in raw_data.items():
        has_en = 'Text_English' in df.columns and df['Text_English'].notna().any()
        has_nat = 'Text_Native' in df.columns and df['Text_Native'].notna().any()
        if has_en and has_nat:
            CONFIG[code]['mode'] = 'bilingual'
        elif has_en:
            CONFIG[code]['mode'] = 'english_only'
        else:
            CONFIG[code]['mode'] = 'native_only'

print(f"\n‚úÖ Loaded: {list(raw_data.keys())}")

## Step 2: Speech Embeddings (GPU Optimized)

Generate BGE-m3 embeddings with GPU acceleration.

**GPU optimizations:**
- Batch size: 128 (vs 16 on CPU)
- Reduced garbage collection frequency
- Partial checkpoint every 5,000 speeches, so an interrupted run can resume

In [None]:
def split_token_windows(token_ids, chunk_size, stride):
    """Cut ``token_ids`` into overlapping windows that cover it exactly once.

    Windows start at 0, ``stride``, 2*``stride``, ... and are at most
    ``chunk_size`` long. Generation stops with the first window that reaches
    the end of the sequence, so no trailing window is ever fully contained in
    the one before it (which would over-weight the tail when window embeddings
    are averaged).

    Raises:
        ValueError: if ``chunk_size`` or ``stride`` is not positive.
    """
    if chunk_size <= 0 or stride <= 0:
        raise ValueError("chunk_size and stride must be positive")

    windows = []
    start = 0
    while True:
        windows.append(token_ids[start:start + chunk_size])
        if start + chunk_size >= len(token_ids):
            break
        start += stride
    return windows


def add_speech_embeddings(df, text_column, checkpoint_prefix=''):
    """Embed every value of ``df[text_column]`` with the BAAI/bge-m3 model.

    Values are converted with ``astype(str)`` before encoding. A text of up to
    MAX_TOKENS tokens is encoded directly; a longer one is cut into overlapping
    token windows whose embeddings are averaged.

    Args:
        df: DataFrame holding the texts.
        text_column: name of the column to embed.
        checkpoint_prefix: when non-empty, progress is pickled to
            ``<CHECKPOINT_DIR>/<checkpoint_prefix>_partial.pkl`` roughly every
            5,000 texts and that file is removed on completion. An existing
            file of that name is used to resume.

    Returns:
        A copy of ``df`` with a ``Speech_Embeddings`` column (one vector per row).
    """
    from sentence_transformers import SentenceTransformer

    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Number of texts handled between progress-bar updates / checkpoint checks.
    batch_size = 128 if device == "cuda" else 16

    model = SentenceTransformer("BAAI/bge-m3", device=device)
    tokenizer = model.tokenizer

    MAX_TOKENS, CHUNK_SIZE, STRIDE = 8192, 8000, 6000
    texts = df[text_column].astype(str).values

    # Check for partial checkpoint.
    # NOTE: pickle runs arbitrary code on load -- only load files this notebook wrote.
    partial_checkpoint = os.path.join(CHECKPOINT_DIR, f'{checkpoint_prefix}_partial.pkl')
    embeddings = []
    if os.path.exists(partial_checkpoint):
        with open(partial_checkpoint, 'rb') as f:
            embeddings = pickle.load(f)
        if len(embeddings) > len(texts):
            # The file cannot belong to this input; resuming would leave more
            # embeddings than rows and break the column assignment below.
            print(f"  Ignoring stale partial checkpoint ({len(embeddings):,} embeddings for {len(texts):,} texts)")
            embeddings = []
        else:
            progress = len(embeddings) / len(texts) * 100 if len(texts) else 100.0
            print(f"  üìÇ Resuming from {len(embeddings):,}/{len(texts):,} ({progress:.1f}%)")
    start_idx = len(embeddings)

    checkpoint_interval = 5000  # Save every 5k speeches
    last_checkpoint = start_idx

    with tqdm(total=len(texts), initial=start_idx, desc=f"Embed {text_column}", unit="speech") as pbar:
        for i in range(start_idx, len(texts), batch_size):
            batch_texts = texts[i:i+batch_size]
            batch_embeddings = []

            for text in batch_texts:
                token_ids = tokenizer.encode(text, add_special_tokens=False)

                if len(token_ids) <= MAX_TOKENS:
                    emb = model.encode([text], convert_to_tensor=False, show_progress_bar=False)[0]
                else:
                    # Too long for one pass: embed overlapping windows and average them.
                    chunks = [tokenizer.decode(window, skip_special_tokens=True)
                              for window in split_token_windows(token_ids, CHUNK_SIZE, STRIDE)]
                    emb = np.mean(model.encode(chunks, convert_to_tensor=False, show_progress_bar=False), axis=0)

                batch_embeddings.append(emb)

            embeddings.extend(batch_embeddings)
            pbar.update(len(batch_texts))

            # Save checkpoint every 5k speeches
            if checkpoint_prefix and len(embeddings) - last_checkpoint >= checkpoint_interval:
                with open(partial_checkpoint, 'wb') as f:
                    pickle.dump(embeddings, f)
                last_checkpoint = len(embeddings)

    # Clean up partial checkpoint when done
    if checkpoint_prefix and os.path.exists(partial_checkpoint):
        os.remove(partial_checkpoint)

    df_result = df.copy()
    df_result['Speech_Embeddings'] = embeddings

    if device == "cuda":
        torch.cuda.empty_cache()

    return df_result


# Step 2 driver: add Speech_Embeddings_English / Speech_Embeddings_Native to
# every country, saving the checkpoint as soon as one language is finished.
checkpoint_file = os.path.join(CHECKPOINT_DIR, 'step2_speech_embeddings.pkl')

processed_data = {}
if os.path.exists(checkpoint_file):
    print("üìÇ Loading checkpoint...")
    with open(checkpoint_file, 'rb') as f:
        processed_data = pickle.load(f)

# (label, text column, embedding column, suffix of the partial-checkpoint name)
language_columns = (
    ('English', 'Text_English', 'Speech_Embeddings_English', 'en'),
    ('Native', 'Text_Native', 'Speech_Embeddings_Native', 'native'),
)

for idx, (code, df) in enumerate(raw_data.items(), 1):
    config = CONFIG[code]
    print(f"\n[{idx}/{len(raw_data)}] {config['name']} ({code})")

    # Continue from the checkpointed frame when this country was started before.
    df_emb = processed_data.get(code, df.copy())

    for label, text_col, emb_col, suffix in language_columns:
        has_text = text_col in df.columns and df[text_col].notna().any()
        if not has_text:
            df_emb[emb_col] = None
            continue

        already_embedded = emb_col in df_emb.columns and df_emb[emb_col].notna().any()
        if already_embedded:
            print(f"  ‚úÖ {label}: cached")
            continue

        df_temp = add_speech_embeddings(df, text_col, f'step2_{code}_{suffix}')
        df_emb[emb_col] = df_temp['Speech_Embeddings']

        # Save immediately after this language completes
        processed_data[code] = df_emb
        with open(checkpoint_file, 'wb') as f:
            pickle.dump(processed_data, f)
        print(f"  ‚úÖ {label}: {df_emb[emb_col].notna().sum():,} (saved)")

    processed_data[code] = df_emb

# Final cleanup: the step 1 checkpoint is dropped once every country is present.
if len(processed_data) == len(raw_data):
    step1_checkpoint = os.path.join(CHECKPOINT_DIR, 'step1_raw_data.pkl')
    if os.path.exists(step1_checkpoint):
        os.remove(step1_checkpoint)

## Step 3: Segmentation & Segment IDs

Find segment boundaries using automatic parameter optimization and multi-signal detection.

In [None]:
def detect_boundaries_by_keywords(texts, roles, keywords):
    """Return the positions of chairperson speeches that mention a keyword.

    A position qualifies when ``str(role)`` contains ``'Chairperson'`` and the
    lowercased ``str(text)`` contains at least one entry of ``keywords`` as a
    substring. Positions are returned in ascending order.
    """
    hits = []
    for position, (text, role) in enumerate(zip(texts, roles)):
        if 'Chairperson' not in str(role):
            continue
        if any(kw in str(text).lower() for kw in keywords):
            hits.append(position)
    return hits


def detect_boundaries_by_similarity(embeddings, window_size=1):
    """Return positions where the text before and after differs most.

    For each candidate position the mean of the ``window_size`` embeddings
    before it is compared (cosine similarity) with the mean of the
    ``window_size`` embeddings starting at it. The "drop" is
    ``1 - similarity``; positions whose drop is at or above the 75th
    percentile of all drops are returned. An input shorter than
    ``2 * window_size + 1`` yields an empty list.
    """
    total = len(embeddings)
    if total < 2 * window_size + 1:
        return []

    positions = []
    drops = []
    for pos in range(window_size, total - window_size + 1):
        before = embeddings[max(0, pos - window_size):pos]
        after = embeddings[pos:min(total, pos + window_size)]
        if len(before) == 0 or len(after) == 0:
            continue

        left = np.mean(before, axis=0).reshape(1, -1)
        right = np.mean(after, axis=0).reshape(1, -1)
        positions.append(pos)
        drops.append(1 - cosine_similarity(left, right)[0][0])

    if not drops:
        return []

    cutoff = np.percentile(drops, 75)
    return [pos for pos, drop in zip(positions, drops) if drop >= cutoff]


def combine_boundaries(keyword_boundaries, similarity_boundaries, min_distance=3):
    """Merge both boundary lists, giving keyword boundaries priority.

    Every keyword boundary is kept. A similarity boundary is added only when
    it is at least ``min_distance`` positions away from every keyword
    boundary. The result is sorted and free of duplicates.
    """
    merged = set(keyword_boundaries)
    merged.update(
        candidate
        for candidate in similarity_boundaries
        if all(abs(candidate - anchor) >= min_distance for anchor in keyword_boundaries)
    )
    return sorted(merged)


def evaluate_segmentation(embeddings, boundaries, texts, roles, keywords):
    """Score a segmentation; returns ``(score, details)``.

    The score is the equal-weight average of two parts:

    * keyword score - share of ``boundaries`` that are also keyword boundaries;
    * semantic score - average of *coherence* (mean pairwise cosine similarity
      inside each segment longer than one item) and *separation* (one minus the
      mean cosine similarity between such a segment and the non-empty segment
      that follows it).

    With no boundaries the result is ``(0.0, {'error': 'No boundaries'})``.
    ``boundaries`` is concatenated with lists, so it must itself be a list.
    """
    if len(boundaries) == 0:
        return 0.0, {'error': 'No boundaries'}

    keyword_boundaries = detect_boundaries_by_keywords(texts, roles, keywords)
    matched = sum(1 for b in boundaries if b in keyword_boundaries)
    keyword_score = matched / len(boundaries)

    breaks = [0] + boundaries + [len(embeddings)]
    spans = list(zip(breaks[:-1], breaks[1:]))

    coherence_scores = []
    separation_scores = []
    for idx, (lo, hi) in enumerate(spans):
        segment = embeddings[lo:hi]
        if len(segment) <= 1:
            continue
        coherence_scores.append(cosine_similarity(segment).mean())

        # The last segment has no successor to be separated from.
        if idx + 1 < len(spans):
            next_lo, next_hi = spans[idx + 1]
            following = embeddings[next_lo:next_hi]
            if len(following) > 0:
                separation_scores.append(1 - cosine_similarity(segment, following).mean())

    coherence = np.mean(coherence_scores) if coherence_scores else 0
    separation = np.mean(separation_scores) if separation_scores else 0
    semantic_score = (coherence + separation) / 2

    segment_lengths = [hi - lo for lo, hi in spans]
    details = {
        'keyword_score': keyword_score,
        'semantic_score': semantic_score,
        'coherence': coherence,
        'separation': separation,
        'avg_length': np.mean(segment_lengths),
        'num_segments': len(segment_lengths),
        'num_boundaries': len(boundaries)
    }
    return (keyword_score * 0.5 + semantic_score * 0.5), details


def optimize_window_size(embeddings, texts, roles, keywords):
    """Try window sizes 1-10 and pick the one with the best segmentation score.

    Returns ``(window, score, trials)`` where ``trials`` holds one dict per
    window size tried. When no window scores above zero the result is
    ``(5, 0.0, trials)``.
    """
    keyword_hits = detect_boundaries_by_keywords(texts, roles, keywords)

    trials = []
    for window in range(1, 11):
        candidates = combine_boundaries(
            keyword_hits,
            detect_boundaries_by_similarity(embeddings, window),
        )

        if len(candidates) == 0:
            trials.append({'window': window, 'score': 0.0})
        else:
            score, stats = evaluate_segmentation(embeddings, candidates, texts, roles, keywords)
            trials.append({'window': window, 'score': score, **stats})

    scored = [trial for trial in trials if trial['score'] > 0]
    if not scored:
        return 5, 0.0, trials

    best = max(scored, key=lambda trial: trial['score'])
    print(f"  üîç Optimal window={best['window']} (score={best['score']:.3f})")
    return best['window'], best['score'], trials


def create_segments(dataset, embedding_col, text_col, keywords, text_id_col='Text_ID'):
    """Split every session of ``dataset`` into consecutive segments.

    A window size is first chosen with ``optimize_window_size`` on a random
    sample of sessions (20% of them, between 10 and 50, never more than
    exist); when the sample has fewer than 50 rows the default window 5 is
    used. Each session is then cut at the combined keyword and similarity
    boundaries. Sessions with fewer than 5 speeches become a single segment.

    Args:
        dataset: DataFrame with ``text_id_col``, ``embedding_col``,
            ``text_col`` and ``Speaker_role`` columns.
        embedding_col: column holding one embedding vector per speech.
        text_col: column holding the speech text (used for keyword matching).
        keywords: substrings passed to ``detect_boundaries_by_keywords``.
        text_id_col: column identifying the session of each speech. Rows whose
            value is missing belong to no session and get no segment.

    Returns:
        ``(segments, stats)``. Each segment is a dict with ``Text_ID``,
        ``Segment_ID`` (``"<session>_seg_<n>"``, n starting at 1) and the
        inclusive, session-relative ``Start_Index`` / ``End_Index``. ``stats``
        holds ``optimal_window``, ``total_segments``, ``segment_lengths`` and
        ``avg_length``.
    """
    print(f"\n{'='*60}\nSEGMENTING: {text_col}\n{'='*60}")

    # Optimize window size on a random sample of sessions.
    all_sessions = dataset[text_id_col].unique()
    # np.random.choice(..., replace=False) raises ValueError when asked for
    # more items than the population holds, so cap at the number of sessions.
    sample_size = min(max(10, int(len(all_sessions) * 0.2)), 50, len(all_sessions))
    sample_sessions = np.random.choice(all_sessions, sample_size, replace=False)
    sample_data = dataset[dataset[text_id_col].isin(sample_sessions)]

    if len(sample_data) >= 50:
        optimal_window, _, _ = optimize_window_size(
            np.array(sample_data[embedding_col].tolist()),
            sample_data[text_col].values,
            sample_data['Speaker_role'].values,
            keywords
        )
    else:
        optimal_window = 5
        print(f"  ‚ö†Ô∏è Using default window=5")

    # Segment all sessions
    all_segments = []
    stats = {'optimal_window': optimal_window, 'total_segments': 0, 'segment_lengths': []}

    # One pass over the frame instead of one boolean filter per session;
    # sort=False keeps sessions in order of first appearance.
    sessions = dataset.groupby(text_id_col, sort=False)
    for session_id, session in tqdm(sessions, total=sessions.ngroups, desc="Segmenting"):
        session = session.reset_index(drop=True)

        if len(session) < 5:
            all_segments.append({
                'Text_ID': session_id,
                'Segment_ID': f"{session_id}_seg_1",
                'Start_Index': 0,
                'End_Index': len(session) - 1
            })
            stats['total_segments'] += 1
            stats['segment_lengths'].append(len(session))
            continue

        embeddings = np.array(session[embedding_col].tolist())
        keyword_boundaries = detect_boundaries_by_keywords(session[text_col].values, session['Speaker_role'].values, keywords)
        similarity_boundaries = detect_boundaries_by_similarity(embeddings, optimal_window)
        # A boundary at position 0 (e.g. the chairperson opening the session
        # with a keyword) or at the very end would create an empty segment.
        combined_boundaries = [
            b for b in combine_boundaries(keyword_boundaries, similarity_boundaries)
            if 0 < b < len(session)
        ]

        breaks = [0] + combined_boundaries + [len(session)]
        for seg_idx in range(len(breaks) - 1):
            start, end = breaks[seg_idx], breaks[seg_idx + 1] - 1
            stats['segment_lengths'].append(end - start + 1)
            all_segments.append({
                'Text_ID': session_id,
                'Segment_ID': f"{session_id}_seg_{seg_idx + 1}",
                'Start_Index': start,
                'End_Index': end
            })
            stats['total_segments'] += 1

    # Avoid a NaN average when the dataset contains no sessions at all.
    stats['avg_length'] = np.mean(stats['segment_lengths']) if stats['segment_lengths'] else 0.0
    print(f"  ‚úÖ {stats['total_segments']:,} segments | avg={stats['avg_length']:.1f} speeches/segment")

    return all_segments, stats


def add_segment_ids_to_df(df, segments, text_id_col='Text_ID'):
    """Return a copy of ``df`` with a ``Segment_ID`` column.

    Args:
        df: DataFrame with a ``text_id_col`` column.
        segments: dicts with ``Text_ID``, ``Segment_ID`` and the inclusive
            ``Start_Index`` / ``End_Index``. The indices count rows *within*
            the session named by ``Text_ID``, in the order they appear in ``df``.
        text_id_col: column identifying the session of each row.

    Rows not covered by any segment receive ``"<text id>_seg_0"``.
    """
    df = df.copy()

    # Row positions of every session, computed once. Building a boolean mask
    # over the whole frame for each segment is quadratic, and writing through
    # index labels hits the wrong rows whenever labels repeat.
    row_positions = df.groupby(text_id_col, sort=False).indices
    segment_ids = np.full(len(df), None, dtype=object)

    for seg in segments:
        positions = row_positions.get(seg['Text_ID'])
        if positions is None or len(positions) <= seg['Start_Index']:
            continue
        covered = positions[seg['Start_Index']:seg['End_Index'] + 1]
        segment_ids[covered] = seg['Segment_ID']

    df['Segment_ID'] = segment_ids

    missing_mask = df['Segment_ID'].isna()
    if missing_mask.any():
        df.loc[missing_mask, 'Segment_ID'] = df.loc[missing_mask, text_id_col] + '_seg_0'

    return df


# Step 3 driver: add Segment_ID_English / Segment_ID_Native to every country.
#
# The step 3 checkpoint is rewritten after each country, so it can be
# incomplete after an interrupted run. It is therefore only a starting point:
# any country still present in the step 2 checkpoint but missing from it is
# processed now. The step 2 checkpoint is deleted only after all are done.
checkpoint_file = os.path.join(CHECKPOINT_DIR, 'step3_segmentation.pkl')
step2_checkpoint = os.path.join(CHECKPOINT_DIR, 'step2_speech_embeddings.pkl')

# NOTE: pickle runs arbitrary code on load -- only load files this notebook wrote.
if os.path.exists(checkpoint_file):
    print("üìÇ Loading checkpoint...")
    with open(checkpoint_file, 'rb') as f:
        final_data = pickle.load(f)
else:
    final_data = {}

if os.path.exists(step2_checkpoint):
    with open(step2_checkpoint, 'rb') as f:
        processed_data = pickle.load(f)

    pending = [code for code in processed_data if code not in final_data]

    for idx, code in enumerate(pending, 1):
        df = processed_data[code]
        config = CONFIG[code]
        print(f"\n[{idx}/{len(pending)}] {config['name']} ({code})")

        df_final = df.copy()

        if 'Speech_Embeddings_English' in df.columns and df['Speech_Embeddings_English'].notna().any():
            segments_en, stats_en = create_segments(df, 'Speech_Embeddings_English', 'Text_English', ENGLISH_KEYWORDS)
            df_final['Segment_ID_English'] = add_segment_ids_to_df(df, segments_en)['Segment_ID']
            optimal_window = stats_en['optimal_window']
        else:
            df_final['Segment_ID_English'] = None
            optimal_window = 5

        if 'Speech_Embeddings_Native' in df.columns and df['Speech_Embeddings_Native'].notna().any():
            # create_segments picks its own window for the native text; the
            # English value is reported for comparison only.
            print(f"  English window={optimal_window}; Native window is optimized separately")
            # `.get(key, default)` would return None when the key exists with
            # the value None, so fall back explicitly.
            native_keywords = config.get('native_keywords') or ENGLISH_KEYWORDS
            segments_native, _ = create_segments(df, 'Speech_Embeddings_Native', 'Text_Native', native_keywords)
            df_final['Segment_ID_Native'] = add_segment_ids_to_df(df, segments_native)['Segment_ID']
        else:
            df_final['Segment_ID_Native'] = None

        final_data[code] = df_final

        with open(checkpoint_file, 'wb') as f:
            pickle.dump(final_data, f)

    os.remove(step2_checkpoint)
elif not final_data:
    raise FileNotFoundError(
        f"Neither {checkpoint_file} nor {step2_checkpoint} exists; run Step 2 first."
    )

## Step 4: Segment Embeddings (GPU Optimized)

Generate embeddings for segments with GPU acceleration.

In [None]:
def split_token_windows(token_ids, chunk_size, stride):
    """Cut ``token_ids`` into overlapping windows that cover it exactly once.

    Windows start at 0, ``stride``, 2*``stride``, ... and are at most
    ``chunk_size`` long. Generation stops with the first window that reaches
    the end of the sequence, so no trailing window is ever fully contained in
    the one before it (which would over-weight the tail when window embeddings
    are averaged).

    Raises:
        ValueError: if ``chunk_size`` or ``stride`` is not positive.
    """
    if chunk_size <= 0 or stride <= 0:
        raise ValueError("chunk_size and stride must be positive")

    windows = []
    start = 0
    while True:
        windows.append(token_ids[start:start + chunk_size])
        if start + chunk_size >= len(token_ids):
            break
        start += stride
    return windows


def add_segment_embeddings(df, text_col, segment_col):
    """Embed the concatenated text of every segment with BAAI/bge-m3.

    The texts of all rows sharing a ``segment_col`` value are joined with a
    single space (after ``astype(str)``) and embedded. A joined text of up to
    MAX_TOKENS tokens is encoded directly; a longer one is cut into overlapping
    token windows whose embeddings are averaged.

    Args:
        df: DataFrame holding ``text_col`` and ``segment_col``.
        text_col: column with the speech texts.
        segment_col: column with the segment identifier of each row.

    Returns:
        A copy of ``df`` with a ``Segment_Embeddings_<text_col>`` column in
        which every row carries the embedding of its segment.
    """
    from sentence_transformers import SentenceTransformer

    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Number of segments handled between progress-bar updates.
    batch_size = 128 if device == "cuda" else 16

    model = SentenceTransformer("BAAI/bge-m3", device=device)
    tokenizer = model.tokenizer
    MAX_TOKENS, CHUNK_SIZE, STRIDE = 8192, 8000, 6000

    segment_texts = df.groupby(segment_col)[text_col].apply(lambda x: ' '.join(x.astype(str)))
    texts = segment_texts.tolist()
    embeddings = []

    with tqdm(total=len(texts), desc="Segment Embed", unit="seg") as pbar:
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i+batch_size]
            batch_emb = []

            for text in batch:
                tokens = tokenizer.encode(text, add_special_tokens=False)
                if len(tokens) <= MAX_TOKENS:
                    emb = model.encode([text], convert_to_tensor=False, show_progress_bar=False)[0]
                else:
                    # Too long for one pass: embed overlapping windows and average them.
                    chunks = [tokenizer.decode(window, skip_special_tokens=True)
                              for window in split_token_windows(tokens, CHUNK_SIZE, STRIDE)]
                    emb = np.mean(model.encode(chunks, convert_to_tensor=False, show_progress_bar=False), axis=0)
                batch_emb.append(emb)

            embeddings.extend(batch_emb)
            pbar.update(len(batch))

    # Segment id -> embedding, broadcast back onto every row of that segment.
    emb_map = dict(zip(segment_texts.index.tolist(), embeddings))
    df = df.copy()
    df[f'Segment_Embeddings_{text_col}'] = df[segment_col].map(emb_map)

    if device == "cuda":
        torch.cuda.empty_cache()

    return df


# Step 4 driver: add Segment_Embeddings_English / Segment_Embeddings_Native to
# every country, rewriting the step 4 checkpoint after each country.
checkpoint_file = os.path.join(CHECKPOINT_DIR, 'step4_segment_embeddings.pkl')
step3_checkpoint = os.path.join(CHECKPOINT_DIR, 'step3_segmentation.pkl')

# Resume from this step's own checkpoint when present, else start from step 3.
if os.path.exists(checkpoint_file):
    print("üìÇ Loading checkpoint...")
    resume_path = checkpoint_file
else:
    resume_path = step3_checkpoint

with open(resume_path, 'rb') as f:
    final_data = pickle.load(f)

# (label, text column, segment id column)
segment_languages = (
    ('English', 'Text_English', 'Segment_ID_English'),
    ('Native', 'Text_Native', 'Segment_ID_Native'),
)

for idx, (code, df) in enumerate(final_data.items(), 1):
    config = CONFIG[code]
    print(f"\n[{idx}/{len(final_data)}] {config['name']} ({code})")

    for label, text_col, segment_col in segment_languages:
        emb_col = f'Segment_Embeddings_{label}'

        has_segments = segment_col in df.columns and df[segment_col].notna().any()
        if not has_segments:
            continue

        if emb_col in df.columns and df[emb_col].notna().any():
            print(f"  ‚úÖ {label}: cached")
            continue

        df = add_segment_embeddings(df, text_col, segment_col)
        # The helper names its column after the text column; use the shorter name.
        df = df.rename(columns={f'Segment_Embeddings_{text_col}': emb_col})
        print(f"  ‚úÖ {label}: {df[emb_col].notna().sum():,}")

    final_data[code] = df

    with open(checkpoint_file, 'wb') as f:
        pickle.dump(final_data, f)

if os.path.exists(step3_checkpoint):
    os.remove(step3_checkpoint)

## Final Verification & Save

Verify all processed data and save to Google Drive.

In [None]:
print("üìä FINAL VERIFICATION")
print("="*60)

for code, df in final_data.items():
    config = CONFIG[code]
    mode = config.get('mode', 'unknown').upper()
    print(f"\n{config['name']} ({code}) - {mode}:")
    print(f"  Speeches: {len(df):,}")
    print(f"  Sessions: {df['Text_ID'].nunique():,}")

    has_english_emb = 'Speech_Embeddings_English' in df.columns and df['Speech_Embeddings_English'].notna().any()
    has_native_emb = 'Speech_Embeddings_Native' in df.columns and df['Speech_Embeddings_Native'].notna().any()

    if has_english_emb:
        sample_emb = df[df['Speech_Embeddings_English'].notna()]['Speech_Embeddings_English'].iloc[0]
        print(f"  ‚úÖ English speech embeddings: {sample_emb.shape}")
    if has_native_emb:
        sample_emb = df[df['Speech_Embeddings_Native'].notna()]['Speech_Embeddings_Native'].iloc[0]
        print(f"  ‚úÖ Native speech embeddings: {sample_emb.shape}")

    if 'Segment_ID_English' in df.columns and df['Segment_ID_English'].notna().any():
        print(f"  ‚úÖ English segments: {df['Segment_ID_English'].nunique():,}")
    if 'Segment_ID_Native' in df.columns and df['Segment_ID_Native'].notna().any():
        print(f"  ‚úÖ Native segments: {df['Segment_ID_Native'].nunique():,}")

print(f"\n‚úÖ All processing complete!")

In [None]:
print("üíæ SAVING FINAL DATA\n" + "="*60)

for code, df in final_data.items():
    output_path = os.path.join(OUTPUT_DIR, f"{code}_speeches_processed.pkl")
    df.to_pickle(output_path)
    
    n_seg = df['Segment_ID_English'].nunique() if 'Segment_ID_English' in df.columns else 0
    print(f"‚úÖ {CONFIG[code]['name']}: {len(df):,} speeches | {n_seg:,} segments")

step4_checkpoint = os.path.join(CHECKPOINT_DIR, 'step4_segment_embeddings.pkl')
if os.path.exists(step4_checkpoint):
    os.remove(step4_checkpoint)

print(f"\n{'='*60}\n‚úÖ COMPLETE\n{'='*60}\nüìÅ {OUTPUT_DIR}")