# ParlaMint Data Processing Pipeline

**✨ Works locally or on Google Colab** - automatically detects the environment

## Quick Start Guide

### Option 1: Local Execution
1. Download ParlaMint 5.0 from [CLARIN.SI](https://www.clarin.si/repository/xmlui/handle/11356/2006)
2. Extract to a local folder
3. Update `LOCAL_DATA_DIR` in the configuration cell
4. Run all cells

### Option 2: Google Colab (GPU Recommended)
1. Upload data to Google Drive
2. Open this notebook in Colab
3. Runtime → Change runtime type → GPU
4. Update `COLAB_DATA_DIR` in configuration cell
5. Run all cells (will auto-mount Drive)

**Data Structure:**
```
data_folder/
├── AT/
│   ├── ParlaMint5.0-AT-en.ana/ParlaMint-AT-en.txt/
│   └── ParlaMint-AT/ParlaMint-AT.txt/              (optional)
├── HR/
│   └── ...
└── GB/
    └── ...
```

In [None]:
# Environment Detection & Setup
#
# Decides whether the notebook runs inside Google Colab. In Colab it mounts
# Google Drive, installs the extra packages and reports the GPU; locally it
# only prints a status line.
import os
import subprocess
import sys

# True when the `google.colab` package is already loaded in this interpreter.
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    print("üåê Running in Google Colab")

    # Mount Google Drive
    from google.colab import drive
    drive.mount('/content/drive')

    # Install packages with the interpreter that runs this kernel. An argument
    # list avoids shell parsing, and the exit status is checked so that a
    # failed install is visible here instead of as a later ImportError.
    print("üì¶ Installing packages...")
    pip_result = subprocess.run(
        [sys.executable, '-m', 'pip', 'install', '-q',
         'sentence-transformers', 'scikit-learn'],
        check=False,
    )
    if pip_result.returncode != 0:
        print(f"WARNING: pip install exited with status {pip_result.returncode}; "
              "later imports may fail")

    # Verify GPU
    import torch
    if torch.cuda.is_available():
        print(f"‚úÖ GPU: {torch.cuda.get_device_name(0)}")
        print(f"   Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
    else:
        print("‚ö†Ô∏è  No GPU detected - processing will be slower")
else:
    print("üíª Running locally")

print("‚úÖ Environment ready")

In [None]:
import pandas as pd
import numpy as np
import torch
import gc
import warnings
import pickle
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity

pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')

# === CONFIGURATION ===
# Update these paths based on your setup
LOCAL_DATA_DIR = r"data folder"  # ← For local execution
COLAB_DATA_DIR = "/content/drive/MyDrive/thesis/data"  # ← For Colab

# Auto-select based on environment
BASE_DATA_DIR = COLAB_DATA_DIR if IN_COLAB else LOCAL_DATA_DIR
CHECKPOINT_DIR = os.path.join(BASE_DATA_DIR, "checkpoints")
OUTPUT_DIR = os.path.join(BASE_DATA_DIR, "processed")

os.makedirs(CHECKPOINT_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Country configurations.
#   name            - display name used in progress messages
#   bilingual       - whether a native-language corpus is configured via 'native_path'
#   english_path    - folder holding the English corpus
#   native_path     - folder holding the native-language corpus (bilingual entries only)
#   native_keywords - lowercase native-language cue words, or None when there are none
# The keywords are compared against lowercased speech text, so they must be
# stored as correctly encoded Unicode; mis-decoded spellings never match.
CONFIG = {
    'AT': {
        'name': 'Austria',
        'bilingual': True,
        'english_path': os.path.join(BASE_DATA_DIR, "AT", "ParlaMint5.0-AT-en.ana", "ParlaMint-AT-en.txt"),
        'native_path': os.path.join(BASE_DATA_DIR, "AT", "ParlaMint-AT", "ParlaMint-AT.txt"),
        'native_keywords': ['tagesordnung', 'tagesordnungspunkt', 'punkt', 'verhandlung',
                            'behandlung', 'nächster', 'weiter', 'fortsetzen']
    },
    'HR': {
        'name': 'Croatia',
        'bilingual': True,
        'english_path': os.path.join(BASE_DATA_DIR, "HR", "ParlaMint5.0-HR-en.ana", "ParlaMint-HR-en.txt"),
        'native_path': os.path.join(BASE_DATA_DIR, "HR", "ParlaMint-HR", "ParlaMint-HR.txt"),
        'native_keywords': ['dnevni', 'red', 'točka', 'tačka', 'sljedeći', 'sljedeće',
                            'prijedlog', 'zakon', 'tema', 'nastavljamo', 'prelazimo']
    },
    'GB': {
        'name': 'Great Britain',
        'bilingual': False,
        'english_path': os.path.join(BASE_DATA_DIR, "GB", "ParlaMint-GB", "ParlaMint-GB.txt"),
        'native_keywords': None
    }
}

ENGLISH_KEYWORDS = ['agenda', 'proceed', 'point', 'item', 'topic', 'next', 'following', 'move on']

# Environment-specific settings: larger batches when a GPU is available,
# and more frequent speech checkpoints on Colab.
BATCH_SIZE_SPEECH = 128 if (IN_COLAB and torch.cuda.is_available()) else 64 if torch.cuda.is_available() else 16
BATCH_SIZE_SEGMENT = 32 if torch.cuda.is_available() else 8
CHECKPOINT_INTERVAL_SPEECH = 5000 if IN_COLAB else 10000
CHECKPOINT_INTERVAL_SEGMENT = 1000

print(f"‚úÖ Configuration loaded")
print(f"üìç Environment: {'Colab (GPU)' if IN_COLAB else 'Local'}")
print(f"üìÇ Data directory: {BASE_DATA_DIR}")
print(f"üéØ Batch sizes: Speech={BATCH_SIZE_SPEECH}, Segment={BATCH_SIZE_SEGMENT}")
print(f"üíæ Checkpoints: {CHECKPOINT_DIR}")

## Step 1: Data Loading

Load parliamentary speeches from year-based folder structure.

In [None]:
def load_parlamint_data(parent_folder):
    """Load ParlaMint speeches and their metadata from per-year subfolders.

    Every immediate subdirectory of ``parent_folder`` is treated as a year
    folder. Inside it, each ``<base>-meta.tsv`` file (files ending in
    ``-ana-meta.tsv`` are skipped) is paired with ``<base>.txt``, whose lines
    have the form ``<ID><TAB><text>``. The text is attached to the metadata
    rows through the ``ID`` column and rows without non-blank text are dropped.

    Year folders and metadata files are both processed in sorted name order,
    because ``os.listdir`` returns entries in arbitrary order and the row
    order of the result should be reproducible across machines.

    Args:
        parent_folder: Directory that contains the year subfolders.

    Returns:
        A DataFrame with the metadata columns plus a ``Text`` column, or
        ``None`` when the path does not exist, contains no subfolders, or
        yields no speeches. Files that fail to parse are reported and skipped.
    """
    print(f"Loading from: {parent_folder}")

    if not os.path.exists(parent_folder):
        print(f"  ‚ö†Ô∏è Path not found: {parent_folder}")
        return None

    year_folders = sorted(
        entry for entry in os.listdir(parent_folder)
        if os.path.isdir(os.path.join(parent_folder, entry))
    )

    if not year_folders:
        print(f"  ‚ö†Ô∏è No year folders found")
        return None

    print(f"  Loading {len(year_folders)} years: {year_folders[0]}-{year_folders[-1]}")

    meta_suffix = '-meta.tsv'
    df_list = []

    for year_folder in year_folders:
        folder_path = os.path.join(parent_folder, year_folder)
        meta_files = sorted(
            name for name in os.listdir(folder_path)
            if name.endswith(meta_suffix) and not name.endswith('-ana-meta.tsv')
        )

        for meta_file in meta_files:
            # Strip only the trailing suffix to get the shared base name.
            base = meta_file[:-len(meta_suffix)]
            txt_path = os.path.join(folder_path, base + '.txt')

            if not os.path.exists(txt_path):
                continue

            # Best effort: one malformed file must not abort the whole load.
            try:
                df_meta = pd.read_csv(os.path.join(folder_path, meta_file), sep='\t',
                                      encoding='utf-8', index_col=False)

                text_map = {}
                with open(txt_path, encoding='utf-8') as f:
                    for line in f:
                        parts = line.strip().split('\t', 1)
                        if len(parts) == 2:
                            text_map[parts[0]] = parts[1]

                df_meta['Text'] = df_meta['ID'].map(text_map)
                df_meta = df_meta[df_meta['Text'].notnull() & (df_meta['Text'].str.strip() != '')]

                if len(df_meta) > 0:
                    df_list.append(df_meta)
            except Exception as e:
                print(f"    Error {meta_file}: {e}")

    if not df_list:
        return None

    df_all = pd.concat(df_list, ignore_index=True)
    print(f"  ‚úÖ {len(df_all):,} speeches")
    return df_all


# Step 1 driver: build `raw_data` (country code -> DataFrame), either from the
# step-1 checkpoint or by reading the corpora configured in CONFIG. In both
# cases CONFIG[code]['mode'] records which text columns carry data.
checkpoint_file = os.path.join(CHECKPOINT_DIR, 'step1_raw_data.pkl')

if not os.path.exists(checkpoint_file):
    print("üîÑ Loading data from source...")
    raw_data = {}

    for code, config in CONFIG.items():
        print(f"\n{config['name']} ({code})")

        df_english = load_parlamint_data(config['english_path'])
        # The native corpus is only read for countries flagged as bilingual.
        if config['bilingual']:
            df_native = load_parlamint_data(config['native_path'])
        else:
            df_native = None

        if df_english is None and df_native is None:
            print("  ‚ùå No data")
            continue

        if df_english is None:
            # Only the native corpus loaded: keep an empty English column.
            df = df_native.copy().rename(columns={'Text': 'Text_Native'})
            df['Text_English'] = None
            config['mode'] = 'native_only'
        else:
            df = df_english.copy().rename(columns={'Text': 'Text_English'})
            if df_native is None:
                df['Text_Native'] = None
                config['mode'] = 'english_only'
            else:
                # English rows are the base; native text is attached by speech ID.
                native_text = df_native[['ID', 'Text']].rename(columns={'Text': 'Text_Native'})
                df = df.merge(native_text, on='ID', how='left')
                config['mode'] = 'bilingual'

        raw_data[code] = df
        print(f"  ‚úÖ {config['mode'].upper()}: {len(df):,} speeches")

    with open(checkpoint_file, 'wb') as f:
        pickle.dump(raw_data, f)
else:
    print("üìÇ Loading from checkpoint...")
    with open(checkpoint_file, 'rb') as f:
        raw_data = pickle.load(f)

    # The mode is not stored in the checkpoint, so derive it from which text
    # columns contain at least one value. No entry exists for "neither".
    mode_by_presence = {
        (True, True): 'bilingual',
        (True, False): 'english_only',
        (False, True): 'native_only',
    }
    for code, df in raw_data.items():
        has_en = 'Text_English' in df.columns and df['Text_English'].notna().any()
        has_nat = 'Text_Native' in df.columns and df['Text_Native'].notna().any()
        detected_mode = mode_by_presence.get((bool(has_en), bool(has_nat)))
        if detected_mode is not None:
            CONFIG[code]['mode'] = detected_mode

    print(f"‚úÖ Loaded {len(raw_data)} countries")

print(f"\n‚úÖ Loaded: {list(raw_data.keys())}")

## Step 2: Speech Embeddings

Generate BGE-m3 embeddings for each speech with automatic GPU optimization.

In [None]:
def add_speech_embeddings(df, text_column, checkpoint_prefix=''):
    """Embed every speech in ``df[text_column]`` with the BAAI/bge-m3 model.

    Texts of at most ``MAX_TOKENS`` tokens are encoded directly. Longer texts
    are cut into windows of ``CHUNK_SIZE`` tokens that start every ``STRIDE``
    tokens (so neighbouring windows overlap) and the window embeddings are
    averaged.

    Args:
        df: DataFrame holding the speeches.
        text_column: Column with the text. Values are cast with
            ``astype(str)``, so missing values are embedded as their string
            representation.
        checkpoint_prefix: When non-empty, partial results are written to
            ``CHECKPOINT_DIR/<prefix>_partial.pkl`` at intervals and resumed
            on the next call. When empty, no checkpoint is read or written.

    Returns:
        A copy of ``df`` with a ``Speech_Embeddings`` column holding one
        vector per row.
    """
    from sentence_transformers import SentenceTransformer

    device = "cuda" if torch.cuda.is_available() else "cpu"

    model = SentenceTransformer("BAAI/bge-m3", device=device)
    tokenizer = model.tokenizer

    MAX_TOKENS, CHUNK_SIZE, STRIDE = 8192, 8000, 6000
    texts = df[text_column].astype(str).values

    # Check for partial checkpoint. It is only consulted when a prefix was
    # given: without one nothing is ever written, so a file named
    # "_partial.pkl" could only be a leftover from something else.
    partial_checkpoint = os.path.join(CHECKPOINT_DIR, f'{checkpoint_prefix}_partial.pkl')
    embeddings = []
    if checkpoint_prefix and os.path.exists(partial_checkpoint):
        # NOTE: pickle can execute code on load; only use checkpoints you created.
        with open(partial_checkpoint, 'rb') as f:
            saved = pickle.load(f)
        if len(saved) <= len(texts):
            embeddings = list(saved)
        else:
            # More saved vectors than rows: the file belongs to other data.
            print(f"  Ignoring stale partial checkpoint ({len(saved):,} > {len(texts):,} rows)")
    start_idx = len(embeddings)
    if start_idx:
        print(f"  üìÇ Resuming from {start_idx:,}/{len(texts):,} ({start_idx/len(texts)*100:.1f}%)")

    with tqdm(total=len(texts), initial=start_idx, desc=f"Embed {text_column}", unit="speech") as pbar:
        for i in range(start_idx, len(texts), BATCH_SIZE_SPEECH):
            batch_texts = texts[i:i+BATCH_SIZE_SPEECH]
            batch_embeddings = []

            for text in batch_texts:
                token_ids = tokenizer.encode(text, add_special_tokens=False)

                if len(token_ids) <= MAX_TOKENS:
                    emb = model.encode([text], convert_to_tensor=False, show_progress_bar=False)[0]
                else:
                    # Overlapping token windows, decoded back to text for encoding.
                    chunks = [tokenizer.decode(token_ids[start:min(start + CHUNK_SIZE, len(token_ids))],
                                               skip_special_tokens=True)
                              for start in range(0, len(token_ids), STRIDE)]
                    emb = np.mean(model.encode(chunks, convert_to_tensor=False, show_progress_bar=False), axis=0)

                batch_embeddings.append(emb)

            embeddings.extend(batch_embeddings)
            pbar.update(len(batch_texts))

            # Checkpoint at intervals: true for the first batch that crosses
            # each multiple of CHECKPOINT_INTERVAL_SPEECH.
            if checkpoint_prefix and len(embeddings) % CHECKPOINT_INTERVAL_SPEECH < BATCH_SIZE_SPEECH:
                with open(partial_checkpoint, 'wb') as f:
                    pickle.dump(embeddings, f)

            # GPU cleanup
            if device == "cuda" and i % (BATCH_SIZE_SPEECH * 10) == 0:
                torch.cuda.empty_cache()
                gc.collect()

    # Clean up partial checkpoint
    if checkpoint_prefix and os.path.exists(partial_checkpoint):
        os.remove(partial_checkpoint)

    df_result = df.copy()
    df_result['Speech_Embeddings'] = embeddings

    if device == "cuda":
        torch.cuda.empty_cache()
    gc.collect()

    return df_result


# Step 2 driver: add speech embeddings for every country in `raw_data`.
#
# The checkpoint is rewritten after each country, so an existing file may
# cover only some of them (for example after an interrupted run). Countries
# that are in `raw_data` but missing from the checkpoint are therefore still
# processed instead of being silently dropped.
checkpoint_file = os.path.join(CHECKPOINT_DIR, 'step2_speech_embeddings.pkl')

if os.path.exists(checkpoint_file):
    print("üìÇ Loading from checkpoint...")
    with open(checkpoint_file, 'rb') as f:
        processed_data = pickle.load(f)
    print(f"‚úÖ Loaded {len(processed_data)} countries")

    # `raw_data` is deleted once a run finishes, so it can be absent when this
    # cell is executed again in the same session.
    pending_codes = [code for code in globals().get('raw_data', {}) if code not in processed_data]
    if pending_codes:
        print(f"Resuming: {len(pending_codes)} countries still need speech embeddings: {pending_codes}")
    run_cleanup = bool(pending_codes)
else:
    processed_data = {}
    pending_codes = list(raw_data)
    run_cleanup = True

for idx, code in enumerate(pending_codes, 1):
    df = raw_data[code]
    config = CONFIG[code]
    print(f"\n[{idx}/{len(pending_codes)}] {config['name']} - {config['mode'].upper()}")

    df_emb = df.copy()

    # (column suffix, text column, modes that carry this language, checkpoint tag)
    for label, text_col, modes, tag in (
        ('English', 'Text_English', ('bilingual', 'english_only'), 'en'),
        ('Native', 'Text_Native', ('bilingual', 'native_only'), 'native'),
    ):
        target_col = f'Speech_Embeddings_{label}'
        if config['mode'] in modes and df[text_col].notna().any():
            df_temp = add_speech_embeddings(df, text_col, f'step2_{code}_{tag}')
            df_emb[target_col] = df_temp['Speech_Embeddings']
        else:
            df_emb[target_col] = None

    processed_data[code] = df_emb

    # Save checkpoint after each country
    with open(checkpoint_file, 'wb') as f:
        pickle.dump(processed_data, f)

if run_cleanup:
    # Cleanup: the step-1 checkpoint and the raw frames are no longer needed.
    step1_checkpoint = os.path.join(CHECKPOINT_DIR, 'step1_raw_data.pkl')
    if os.path.exists(step1_checkpoint):
        os.remove(step1_checkpoint)

    del raw_data
    gc.collect()

print(f"\n‚úÖ Speech embeddings complete")

## Step 3: Segmentation & Segment IDs

Find segment boundaries using automatic parameter optimization and multi-signal detection.

**Strategy:** Favor over-segmentation (easier to merge similar segments later than to split under-segmented ones)

In [None]:
def detect_boundaries_by_keywords(texts, roles, keywords):
    """Return the positions of chairperson speeches that mention a keyword.

    A position qualifies when ``'Chairperson'`` occurs in the string form of
    its role and at least one keyword occurs in the lowercased string form of
    its text. Keywords are matched as plain substrings.
    """
    hits = []
    for position, (speech, speaker_role) in enumerate(zip(texts, roles)):
        if 'Chairperson' not in str(speaker_role):
            continue
        lowered = str(speech).lower()
        for keyword in keywords:
            if keyword in lowered:
                hits.append(position)
                break
    return hits


def detect_boundaries_by_similarity(embeddings, window_size=1, percentile=95):
    """Return split positions where neighbouring windows are least similar.

    For each candidate position the mean embedding of the ``window_size``
    items before it is compared, by cosine similarity, with the mean of the
    ``window_size`` items starting at it. Positions whose drop
    (``1 - similarity``) reaches the given percentile of all drops are
    returned. Fewer than ``2 * window_size + 1`` embeddings yield ``[]``.
    """
    total = len(embeddings)
    if total < 2 * window_size + 1:
        return []

    positions, drops = [], []
    for split in range(window_size, total - window_size + 1):
        left = embeddings[max(0, split - window_size):split]
        right = embeddings[split:min(total, split + window_size)]

        # Nothing to compare when either side is empty.
        if len(left) == 0 or len(right) == 0:
            continue

        left_mean = np.mean(left, axis=0).reshape(1, -1)
        right_mean = np.mean(right, axis=0).reshape(1, -1)
        similarity = cosine_similarity(left_mean, right_mean)[0][0]
        positions.append(split)
        drops.append(1 - similarity)

    if not drops:
        return []

    cutoff = np.percentile(drops, percentile)
    return [pos for pos, drop in zip(positions, drops) if drop >= cutoff]


def combine_boundaries(keyword_boundaries, similarity_boundaries, min_distance=3, require_similarity_match=True):
    """Merge keyword and similarity boundaries into one sorted list.

    With ``require_similarity_match`` every similarity boundary is kept and a
    keyword boundary is kept only if some similarity boundary lies within
    ``min_distance`` of it. Otherwise every keyword boundary is kept and a
    similarity boundary is added only if no keyword boundary is closer than
    ``min_distance``.
    """
    merged = set()

    if require_similarity_match:
        merged.update(similarity_boundaries)
        for kw_pos in keyword_boundaries:
            if any(abs(kw_pos - sim_pos) <= min_distance for sim_pos in similarity_boundaries):
                merged.add(kw_pos)
    else:
        merged.update(keyword_boundaries)
        for sim_pos in similarity_boundaries:
            too_close = any(abs(sim_pos - kw_pos) < min_distance for kw_pos in keyword_boundaries)
            if not too_close:
                merged.add(sim_pos)

    return sorted(merged)


def evaluate_segmentation(embeddings, boundaries, texts, roles, keywords):
    """Score a segmentation by keyword agreement and embedding structure.

    Returns ``(score, details)``. The score is the equal-weight average of
    the share of boundaries that coincide with keyword boundaries and a
    semantic score, which is the mean of within-segment coherence and
    separation from the following segment. With no boundaries the result is
    ``(0.0, {'error': 'No boundaries'})``.
    """
    if len(boundaries) == 0:
        return 0.0, {'error': 'No boundaries'}

    keyword_hits = detect_boundaries_by_keywords(texts, roles, keywords)
    matched = sum(1 for b in boundaries if b in keyword_hits)
    keyword_score = matched / len(boundaries)

    # Consecutive (start, stop) pairs covering the whole sequence.
    edges = [0] + boundaries + [len(embeddings)]
    spans = list(zip(edges[:-1], edges[1:]))

    coherence_scores, separation_scores = [], []
    for k, (lo, hi) in enumerate(spans):
        current = embeddings[lo:hi]
        # Segments of one item or fewer contribute to neither score.
        if len(current) <= 1:
            continue
        coherence_scores.append(cosine_similarity(current).mean())
        if k + 1 < len(spans):
            next_lo, next_hi = spans[k + 1]
            following = embeddings[next_lo:next_hi]
            if len(following) > 0:
                separation_scores.append(1 - cosine_similarity(current, following).mean())

    coherence = np.mean(coherence_scores) if coherence_scores else 0
    separation = np.mean(separation_scores) if separation_scores else 0
    semantic_score = (coherence + separation) / 2

    segment_lengths = [hi - lo for lo, hi in spans]

    combined_score = keyword_score * 0.5 + semantic_score * 0.5
    details = {
        'keyword_score': keyword_score,
        'semantic_score': semantic_score,
        'coherence': coherence,
        'separation': separation,
        'avg_length': np.mean(segment_lengths),
        'num_segments': len(segment_lengths),
        'num_boundaries': len(boundaries)
    }
    return combined_score, details


def optimize_window_size(embeddings, texts, roles, keywords, percentile=95):
    """Try window sizes 1..10 and return the best-scoring one.

    Returns ``(window, score, results)`` where ``results`` holds one dict per
    tried window. When no window achieves a positive score the fallback is
    ``(5, 0.0, results)``.
    """
    keyword_boundaries = detect_boundaries_by_keywords(texts, roles, keywords)

    results = []
    for window in range(1, 11):
        candidate = combine_boundaries(
            keyword_boundaries,
            detect_boundaries_by_similarity(embeddings, window, percentile),
            require_similarity_match=True,
        )

        if len(candidate) == 0:
            results.append({'window': window, 'score': 0.0})
        else:
            score, stats = evaluate_segmentation(embeddings, candidate, texts, roles, keywords)
            results.append({'window': window, 'score': score, **stats})

    scored = [entry for entry in results if entry['score'] > 0]
    if not scored:
        return 5, 0.0, results

    # max() keeps the first of equally scored entries, i.e. the smallest window.
    best = max(scored, key=lambda entry: entry['score'])
    print(f"  üîç Optimal window={best['window']} (score={best['score']:.3f}, avg_len={best['avg_length']:.1f})")
    return best['window'], best['score'], results


def create_segments(dataset, embedding_col, text_col, keywords, text_id_col='Text_ID', percentile=95):
    """Split every session of ``dataset`` into segments of consecutive speeches.

    A window size is first tuned on a random sample of sessions (20% of them,
    at least 10 and at most 50, never more than exist). Each session is then
    cut at the combined keyword and similarity boundaries. Sessions with
    fewer than five speeches become a single segment.

    Args:
        dataset: DataFrame with one row per speech, including a
            ``Speaker_role`` column.
        embedding_col: Column holding one embedding vector per speech.
        text_col: Column holding the speech text used for keyword matching.
        keywords: Lowercase cue words passed to the keyword detector.
        text_id_col: Column identifying the session of each speech.
        percentile: Percentile of similarity drops used as boundary threshold.

    Returns:
        ``(segments, stats)``. ``segments`` is a list of dicts with keys
        ``Text_ID``, ``Segment_ID``, ``Start_Index`` and ``End_Index``
        (inclusive positions within the session). ``stats`` holds
        ``optimal_window``, ``total_segments``, ``segment_lengths`` and
        ``avg_length``.

    Note:
        The sample is drawn with the global NumPy random state, so results
        vary between runs unless the caller seeds it.
    """
    print(f"\n{'='*60}\nSEGMENTING: {text_col}\n{'='*60}")

    # Optimize window size
    all_sessions = dataset[text_id_col].unique()
    # Sampling without replacement raises if more items are requested than
    # exist, so the sample size is capped at the number of sessions.
    sample_size = min(max(10, int(len(all_sessions) * 0.2)), 50, len(all_sessions))
    if sample_size:
        sample_sessions = np.random.choice(all_sessions, sample_size, replace=False)
    else:
        sample_sessions = all_sessions
    sample_data = dataset[dataset[text_id_col].isin(sample_sessions)]

    if len(sample_data) >= 50:
        optimal_window, _, _ = optimize_window_size(
            np.array(sample_data[embedding_col].tolist()),
            sample_data[text_col].values,
            sample_data['Speaker_role'].values,
            keywords, percentile
        )
    else:
        optimal_window = 5
        print(f"  ‚ö†Ô∏è Using default window=5")

    # Segment all sessions
    all_segments = []
    stats = {'optimal_window': optimal_window, 'total_segments': 0, 'segment_lengths': []}

    # One grouping pass instead of filtering the whole frame per session;
    # sort=False keeps sessions in order of first appearance.
    sessions = dataset.groupby(text_id_col, sort=False)
    for session_id, session in tqdm(sessions, desc="Segmenting"):
        session = session.reset_index(drop=True)

        if len(session) < 5:
            all_segments.append({
                'Text_ID': session_id, 'Segment_ID': f"{session_id}_seg_1",
                'Start_Index': 0, 'End_Index': len(session) - 1
            })
            stats['total_segments'] += 1
            stats['segment_lengths'].append(len(session))
            continue

        embeddings = np.array(session[embedding_col].tolist())
        keyword_boundaries = detect_boundaries_by_keywords(session[text_col].values,
                                                           session['Speaker_role'].values, keywords)
        similarity_boundaries = detect_boundaries_by_similarity(embeddings, optimal_window, percentile)
        combined_boundaries = combine_boundaries(keyword_boundaries, similarity_boundaries,
                                                 require_similarity_match=True)

        # A boundary at position 0 (keyword hit on the opening speech) would
        # create an empty first segment, so only interior positions are kept.
        interior = [b for b in combined_boundaries if 0 < b < len(session)]

        breaks = [0] + interior + [len(session)]
        for seg_idx in range(len(breaks) - 1):
            start, end = breaks[seg_idx], breaks[seg_idx + 1] - 1
            stats['segment_lengths'].append(end - start + 1)
            all_segments.append({
                'Text_ID': session_id,
                'Segment_ID': f"{session_id}_seg_{seg_idx + 1}",
                'Start_Index': start, 'End_Index': end
            })
            stats['total_segments'] += 1

    # Avoid taking the mean of an empty list when the dataset has no sessions.
    stats['avg_length'] = np.mean(stats['segment_lengths']) if stats['segment_lengths'] else 0.0
    print(f"  ‚úÖ {stats['total_segments']:,} segments | avg={stats['avg_length']:.1f} speeches/segment")

    return all_segments, stats


def add_segment_ids_to_df(df, segments, text_id_col='Text_ID'):
    """Return a copy of ``df`` with a ``Segment_ID`` column filled from ``segments``.

    Args:
        df: DataFrame with one row per speech.
        segments: Iterable of dicts with keys ``Text_ID``, ``Segment_ID``,
            ``Start_Index`` and ``End_Index``; the indices are inclusive
            positions within the rows of that session, in row order.
        text_id_col: Column identifying the session of each row.

    Returns:
        The copied DataFrame. Rows not covered by any segment get
        ``<session id>_seg_0``. Segments whose session is unknown or whose
        start lies beyond the session's rows are ignored.
    """
    df = df.copy()
    df['Segment_ID'] = None

    # Row labels per session, computed once. Building a boolean mask over the
    # whole frame for every segment would cost O(segments x rows).
    session_rows = df.groupby(text_id_col, sort=False).groups

    for seg in segments:
        rows = session_rows.get(seg['Text_ID'])
        if rows is None or len(rows) <= seg['Start_Index']:
            continue
        df.loc[rows[seg['Start_Index']:seg['End_Index'] + 1], 'Segment_ID'] = seg['Segment_ID']

    # Fallback id for rows that no segment covered.
    missing_mask = df['Segment_ID'].isna()
    if missing_mask.any():
        df.loc[missing_mask, 'Segment_ID'] = df.loc[missing_mask, text_id_col] + '_seg_0'

    return df


# Step 3 driver: assign English and native segment ids for every country.
#
# The checkpoint is rewritten after each country, so an existing file may
# cover only some of them (for example after an interrupted run). Countries
# present in `processed_data` but missing from the checkpoint are still
# segmented instead of being silently dropped.
checkpoint_file = os.path.join(CHECKPOINT_DIR, 'step3_segmentation.pkl')

if os.path.exists(checkpoint_file):
    print("üìÇ Loading from checkpoint...")
    with open(checkpoint_file, 'rb') as f:
        final_data = pickle.load(f)
    print(f"‚úÖ Loaded {len(final_data)} countries")

    # `processed_data` is deleted once a run finishes, so it can be absent
    # when this cell is executed again in the same session.
    pending_codes = [code for code in globals().get('processed_data', {}) if code not in final_data]
    if pending_codes:
        print(f"Resuming: {len(pending_codes)} countries still need segmentation: {pending_codes}")
    run_cleanup = bool(pending_codes)
else:
    final_data = {}
    pending_codes = list(processed_data)
    run_cleanup = True

for idx, code in enumerate(pending_codes, 1):
    df = processed_data[code]
    config = CONFIG[code]
    print(f"\n[{idx}/{len(pending_codes)}] {config['name']} - {config['mode'].upper()}")

    df_final = df.copy()

    # English segmentation
    if config['mode'] in ['bilingual', 'english_only'] and df['Speech_Embeddings_English'].notna().any():
        segments_en, stats_en = create_segments(df, 'Speech_Embeddings_English', 'Text_English',
                                                ENGLISH_KEYWORDS)
        df_temp = add_segment_ids_to_df(df, segments_en)
        df_final['Segment_ID_English'] = df_temp['Segment_ID']
        optimal_window = stats_en['optimal_window']
    else:
        df_final['Segment_ID_English'] = None
        optimal_window = 5

    # Native segmentation
    if config['mode'] in ['bilingual', 'native_only'] and df['Speech_Embeddings_Native'].notna().any():
        # `or` also covers an entry that is present but set to None, which a
        # dict.get default would not replace.
        native_keywords = config.get('native_keywords') or ENGLISH_KEYWORDS
        segments_native, _ = create_segments(df, 'Speech_Embeddings_Native', 'Text_Native',
                                             native_keywords)
        df_temp = add_segment_ids_to_df(df, segments_native)
        df_final['Segment_ID_Native'] = df_temp['Segment_ID']
    else:
        df_final['Segment_ID_Native'] = None

    final_data[code] = df_final

    # Save after each country
    with open(checkpoint_file, 'wb') as f:
        pickle.dump(final_data, f)

if run_cleanup:
    # Cleanup: the step-2 checkpoint and its frames are no longer needed.
    step2_checkpoint = os.path.join(CHECKPOINT_DIR, 'step2_speech_embeddings.pkl')
    if os.path.exists(step2_checkpoint):
        os.remove(step2_checkpoint)

    del processed_data
    gc.collect()

print(f"\n‚úÖ Segmentation complete")

## Step 4: Segment Embeddings

Generate embeddings for each segment (concatenated speeches).

In [None]:
def add_segment_embeddings(df, text_col, segment_col, checkpoint_prefix=''):
    """Embed each segment (its speeches joined by spaces) with BAAI/bge-m3.

    Segment texts of at most ``MAX_TOKENS`` tokens are encoded directly.
    Longer ones are cut into windows of ``CHUNK_SIZE`` tokens starting every
    ``STRIDE`` tokens, encoded four windows at a time, and averaged. Segments
    longer than ``10 * MAX_TOKENS`` tokens, and segments whose encoding
    raises, get an all-zero vector; both cases are reported.

    Args:
        df: DataFrame with one row per speech.
        text_col: Column holding the speech text.
        segment_col: Column holding the segment id of each speech.
        checkpoint_prefix: When non-empty, partial results are written to
            ``CHECKPOINT_DIR/<prefix>_partial.pkl`` at intervals and resumed
            on the next call. When empty, no checkpoint is read or written.

    Returns:
        A copy of ``df`` with a ``Segment_Embeddings_<text_col>`` column that
        maps each row's segment id to its embedding.
    """
    from sentence_transformers import SentenceTransformer

    device = "cuda" if torch.cuda.is_available() else "cpu"

    model = SentenceTransformer("BAAI/bge-m3", device=device)
    tokenizer = model.tokenizer
    MAX_TOKENS, CHUNK_SIZE, STRIDE = 8192, 8000, 6000
    # Size of the zero placeholder; taken from the model, 1024 as fallback.
    embedding_dim = model.get_sentence_embedding_dimension() or 1024

    # Concatenate speeches by segment
    segment_texts = df.groupby(segment_col)[text_col].apply(lambda x: ' '.join(x.astype(str)))
    texts = segment_texts.tolist()
    segment_ids = segment_texts.index.tolist()

    # Check for partial checkpoint. It is only consulted when a prefix was
    # given, and only trusted when it holds exactly the leading segment ids,
    # because resuming relies on position.
    partial_checkpoint = os.path.join(CHECKPOINT_DIR, f'{checkpoint_prefix}_partial.pkl')
    emb_map = {}
    if checkpoint_prefix and os.path.exists(partial_checkpoint):
        # NOTE: pickle can execute code on load; only use checkpoints you created.
        with open(partial_checkpoint, 'rb') as f:
            saved = pickle.load(f)
        if len(saved) <= len(texts) and all(sid in saved for sid in segment_ids[:len(saved)]):
            emb_map = saved
        else:
            print("  Ignoring stale partial checkpoint (segment ids do not match)")
    start_idx = len(emb_map)
    if start_idx:
        print(f"  üìÇ Resuming from {start_idx:,}/{len(texts):,}")

    n_oversized = 0
    n_failed = 0

    with tqdm(total=len(texts), initial=start_idx, desc=f"Segment Embed", unit="seg") as pbar:
        for i in range(start_idx, len(texts), BATCH_SIZE_SEGMENT):
            batch_texts = texts[i:i+BATCH_SIZE_SEGMENT]
            batch_ids = segment_ids[i:i+BATCH_SIZE_SEGMENT]
            batch_emb = []

            for seg_id, text in zip(batch_ids, batch_texts):
                try:
                    tokens = tokenizer.encode(text, add_special_tokens=False)

                    if len(tokens) > MAX_TOKENS * 10:
                        # Too long to embed; keep a placeholder and count it.
                        n_oversized += 1
                        batch_emb.append(np.zeros(embedding_dim))
                        continue

                    if len(tokens) <= MAX_TOKENS:
                        emb = model.encode([text], convert_to_tensor=False, show_progress_bar=False)[0]
                    else:
                        chunks = [tokenizer.decode(tokens[start:min(start + CHUNK_SIZE, len(tokens))],
                                                   skip_special_tokens=True)
                                  for start in range(0, len(tokens), STRIDE)]

                        # Process chunks in sub-batches
                        chunk_embeddings = []
                        for chunk_idx in range(0, len(chunks), 4):
                            chunk_batch = chunks[chunk_idx:chunk_idx+4]
                            chunk_emb = model.encode(chunk_batch, convert_to_tensor=False,
                                                     show_progress_bar=False)
                            chunk_embeddings.extend(chunk_emb)

                            if device == "cuda":
                                torch.cuda.empty_cache()

                        emb = np.mean(chunk_embeddings, axis=0)

                    batch_emb.append(emb)
                except Exception as e:
                    # Best effort: keep going with a placeholder, but report
                    # the failure so zero vectors are not mistaken for data.
                    n_failed += 1
                    tqdm.write(f"  Embedding failed for segment {seg_id}: {e}")
                    batch_emb.append(np.zeros(embedding_dim))

            # Update map
            for seg_id, emb in zip(batch_ids, batch_emb):
                emb_map[seg_id] = emb

            pbar.update(len(batch_texts))

            # GPU cleanup
            if device == "cuda":
                torch.cuda.empty_cache()

            # Checkpoint: true for the first batch that crosses each multiple
            # of CHECKPOINT_INTERVAL_SEGMENT.
            if checkpoint_prefix and len(emb_map) % CHECKPOINT_INTERVAL_SEGMENT < BATCH_SIZE_SEGMENT:
                with open(partial_checkpoint, 'wb') as f:
                    pickle.dump(emb_map, f)
                gc.collect()

    if n_oversized or n_failed:
        print(f"  {n_oversized:,} oversized and {n_failed:,} failed segments received zero vectors")

    # Clean up partial checkpoint
    if checkpoint_prefix and os.path.exists(partial_checkpoint):
        os.remove(partial_checkpoint)

    df = df.copy()
    df[f'Segment_Embeddings_{text_col}'] = df[segment_col].map(emb_map)

    if device == "cuda":
        torch.cuda.empty_cache()
    gc.collect()

    return df


# Step 4 driver: add segment embeddings for every country and language.
#
# The checkpoint is rewritten after each language of each country, so an
# existing file may be incomplete (for example after an interrupted run).
# The loop below therefore always runs; work that is already present in the
# loaded data is skipped by the per-column checks.
checkpoint_file = os.path.join(CHECKPOINT_DIR, 'step4_segment_embeddings.pkl')

if os.path.exists(checkpoint_file):
    print("üìÇ Loading from checkpoint...")
    with open(checkpoint_file, 'rb') as f:
        segment_data = pickle.load(f)
    print(f"‚úÖ Loaded {len(segment_data)} countries")

    # Continue from the checkpointed state instead of the step-3 result.
    final_data = segment_data

for idx, (code, df) in enumerate(final_data.items(), 1):
    config = CONFIG[code]
    print(f"\n[{idx}/{len(final_data)}] {config['name']} - {config['mode'].upper()}")

    # (column suffix, checkpoint tag) for each language
    for label, tag in (('English', 'en'), ('Native', 'native')):
        segment_col = f'Segment_ID_{label}'
        embedding_col = f'Segment_Embeddings_{label}'
        speech_col = f'Speech_Embeddings_{label}'

        # Nothing to embed when this language has no segment ids.
        if not (segment_col in df.columns and df[segment_col].notna().any()):
            continue
        # Already embedded in an earlier (possibly interrupted) run.
        if embedding_col in df.columns and df[embedding_col].notna().any():
            continue

        df = add_segment_embeddings(df, f'Text_{label}', segment_col, f'step4_{code}_{tag}')
        df = df.rename(columns={f'Segment_Embeddings_Text_{label}': embedding_col})

        # Drop speech embeddings to save memory
        if speech_col in df.columns:
            df = df.drop(columns=[speech_col])

        final_data[code] = df

        # Save immediately
        with open(checkpoint_file, 'wb') as f:
            pickle.dump(final_data, f)

        print(f"  ‚úÖ {label}: saved")

    final_data[code] = df

# Cleanup: the step-3 checkpoint is superseded once every country is done.
step3_checkpoint = os.path.join(CHECKPOINT_DIR, 'step3_segmentation.pkl')
if os.path.exists(step3_checkpoint):
    os.remove(step3_checkpoint)

print(f"\n‚úÖ Segment embeddings complete")

## Final Verification & Save

Verify all processed data and save to output directory.

In [None]:
print("üìä FINAL VERIFICATION")
print("="*60)

for code, df in final_data.items():
    config = CONFIG[code]
    mode = config.get('mode', 'unknown').upper()
    print(f"\n{config['name']} ({code}) - {mode}:")
    print(f"  Speeches: {len(df):,}")
    print(f"  Sessions: {df['Text_ID'].nunique():,}")
    
    if 'Segment_ID_English' in df.columns and df['Segment_ID_English'].notna().any():
        print(f"  ‚úÖ English segments: {df['Segment_ID_English'].nunique():,}")
    if 'Segment_ID_Native' in df.columns and df['Segment_ID_Native'].notna().any():
        print(f"  ‚úÖ Native segments: {df['Segment_ID_Native'].nunique():,}")
    
    # Check segment embeddings
    if 'Segment_Embeddings_English' in df.columns and df['Segment_Embeddings_English'].notna().any():
        sample = df[df['Segment_Embeddings_English'].notna()]['Segment_Embeddings_English'].iloc[0]
        print(f"  ‚úÖ English segment embeddings: {sample.shape}")
    if 'Segment_Embeddings_Native' in df.columns and df['Segment_Embeddings_Native'].notna().any():
        sample = df[df['Segment_Embeddings_Native'].notna()]['Segment_Embeddings_Native'].iloc[0]
        print(f"  ‚úÖ Native segment embeddings: {sample.shape}")

print(f"\n‚úÖ All processing complete!")

In [None]:
print("üíæ SAVING FINAL DATA")
print("="*60)

for code, df in final_data.items():
    config = CONFIG[code]
    output_path = os.path.join(OUTPUT_DIR, f"{code}_speeches_processed.pkl")
    
    df.to_pickle(output_path)
    
    n_seg = df['Segment_ID_English'].nunique() if 'Segment_ID_English' in df.columns else 0
    print(f"\n‚úÖ {config['name']} ({code}):")
    print(f"   üìÇ {output_path}")
    print(f"   üìä {len(df):,} speeches | {n_seg:,} segments")

# Delete Step 4 checkpoint to save space
step4_checkpoint = os.path.join(CHECKPOINT_DIR, 'step4_segment_embeddings.pkl')
if os.path.exists(step4_checkpoint):
    os.remove(step4_checkpoint)

print(f"\n{'='*60}")
print(f"‚úÖ PROCESSING COMPLETE")
print(f"{'='*60}")
print(f"\nüìÅ Output: {OUTPUT_DIR}")
print(f"üåê Environment: {'Colab' if IN_COLAB else 'Local'}")
print(f"\nüìã Ready for next steps:")
print(f"   ‚Ä¢ Topic modeling")
print(f"   ‚Ä¢ ParlaCAP comparison")
print(f"   ‚Ä¢ LIWC analysis")