# Parlamint Data Processing Pipeline

Automated pipeline for processing ParlaMint 5.0 data. 

Takes the raw corpus files and builds one large dataframe per country, adding computed columns for speech embeddings, segment IDs, and segment embeddings.

## Quick Start Guide

### 1. Download Data

Download ParlaMint 5.0 corpus from [CLARIN.SI](https://www.clarin.si/repository/xmlui/):
- **English (machine-translated)**: [Link](https://www.clarin.si/repository/xmlui/handle/11356/2006) - Universal, works for all countries
- **Native languages**: [Link](https://www.clarin.si/repository/xmlui/handle/11356/2004) - Optional for bilingual analysis

**Minimum requirement:** At least one language version per country.

### 2. Extract & Organize

Extract downloaded files to `BASE_DATA_DIR` (configure in next cell):

```
data folder/
├── AT/
│   ├── ParlaMint5.0-AT-en.ana/ParlaMint-AT-en.txt/  (optional)
│   └── ParlaMint-AT/ParlaMint-AT.txt/              (optional)
├── HR/
│   ├── ParlaMint5.0-HR-en.ana/ParlaMint-HR-en.txt/  (optional)
│   └── ParlaMint-HR/ParlaMint-HR.txt/              (optional)
└── GB/
    └── ParlaMint-GB/ParlaMint-GB.txt/              (required)
```

### 3. Run All Cells

The notebook automatically:
- Detects available data (English-only / Native-only / Bilingual)
- Optimizes segmentation parameters per country
- Creates checkpoints for recovery from interruptions

**Adding a new country or native language:** Add an entry to `CONFIG` in the next cell with its data paths and native-language keywords (used for boundary detection).

In [4]:
import os
import pandas as pd
import numpy as np
import torch
import gc
import warnings
import pickle
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity

pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')

# === UNIFIED CONFIGURATION ===
# Root folder that holds one sub-folder per country code (AT/, HR/, GB/, ...).
BASE_DATA_DIR = r"data folder"  # ← CHANGE THIS to your data location
# Intermediate pickles written by each pipeline step.
CHECKPOINT_DIR = os.path.join(BASE_DATA_DIR, "checkpoints")
# Destination for the final processed output.
OUTPUT_DIR = os.path.join(BASE_DATA_DIR, "processed")

os.makedirs(CHECKPOINT_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Country configurations.
#   name            - display name used in progress output
#   bilingual       - when True, 'native_path' is loaded in addition to 'english_path'
#   english_path    - folder with the primary (English) corpus, organised in year folders
#   native_path     - folder with the native-language corpus (bilingual countries only)
#   native_keywords - lower-case substrings that mark agenda transitions in the native
#                     language; None when there is no separate native corpus
# The keywords must be stored as real Unicode text (ä, č, ć, ...): they are matched as
# substrings of the lower-cased speech text, so a mis-encoded keyword never matches.
CONFIG = {
    'AT': {
        'name': 'Austria',
        'bilingual': True,
        'english_path': os.path.join(BASE_DATA_DIR, "AT", "ParlaMint5.0-AT-en.ana", "ParlaMint-AT-en.txt"),
        'native_path': os.path.join(BASE_DATA_DIR, "AT", "ParlaMint-AT", "ParlaMint-AT.txt"),
        'native_keywords': ['tagesordnung', 'tagesordnungspunkt', 'punkt', 'verhandlung',
                           'behandlung', 'nächster', 'weiter', 'fortsetzen']
    },
    'HR': {
        'name': 'Croatia',
        'bilingual': True,
        'english_path': os.path.join(BASE_DATA_DIR, "HR", "ParlaMint5.0-HR-en.ana", "ParlaMint-HR-en.txt"),
        'native_path': os.path.join(BASE_DATA_DIR, "HR", "ParlaMint-HR", "ParlaMint-HR.txt"),
        'native_keywords': ['dnevni', 'red', 'točka', 'tačka', 'sljedeći', 'sljedeće',
                           'prijedlog', 'zakon', 'tema', 'nastavljamo', 'prelazimo']
    },
    'GB': {
        'name': 'Great Britain',
        'bilingual': False,
        'english_path': os.path.join(BASE_DATA_DIR, "GB", "ParlaMint-GB", "ParlaMint-GB.txt"),
        'native_keywords': None
    }
}

# Keywords for detecting agenda item transitions (used for English and as fallback)
ENGLISH_KEYWORDS = ['agenda', 'proceed', 'point', 'item', 'topic', 'next', 'following', 'move on']

print("✅ Configuration loaded")
print(f"📍 Data directory: {BASE_DATA_DIR}")
# Build the country summary first: nesting the quoted dict lookups directly inside
# the f-string below would clash with its own quotes on older Python versions.
country_list = ', '.join(f"{code} ({cfg['name']})" for code, cfg in CONFIG.items())
print(f"📊 Countries: {country_list}")
print(f"💾 Checkpoints: {CHECKPOINT_DIR}")
print("\nℹ️  Segmentation parameters auto-optimized per country")

‚úÖ Configuration loaded
üìç Data directory: data folder
üìä Countries: AT (Austria), HR (Croatia), GB (Great Britain)
üíæ Checkpoints: data folder\checkpoints

‚ÑπÔ∏è  Segmentation parameters auto-optimized per country


## Step 1: Data Loading

Load parliamentary speeches from year-based folder structure.

In [5]:
def load_parlamint_data(parent_folder):
    """Load ParlaMint speeches from a folder of year sub-folders.

    Every ``<base>-meta.tsv`` file (``-ana-meta.tsv`` files are ignored) is
    paired with ``<base>.txt`` in the same folder.  The text file is read as
    ``ID<TAB>text`` lines and joined to the metadata through the ``ID`` column
    as a new ``Text`` column.  Rows without text, or with whitespace-only
    text, are dropped.

    Args:
        parent_folder: Directory whose immediate sub-directories are the
            year folders.

    Returns:
        One DataFrame with all non-empty speeches (fresh 0..n-1 index), or
        ``None`` when the path does not exist, has no sub-folders, or yields
        no usable rows.  Files that fail to load are reported and skipped.
    """
    meta_suffix = '-meta.tsv'
    print(f"Loading from: {parent_folder}")

    if not os.path.exists(parent_folder):
        print(f"  ‚ö†Ô∏è Path not found: {parent_folder}")
        return None

    year_folders = sorted(
        entry for entry in os.listdir(parent_folder)
        if os.path.isdir(os.path.join(parent_folder, entry))
    )
    if not year_folders:
        print(f"  ‚ö†Ô∏è No year folders found")
        return None

    print(f"  Found {len(year_folders)} year folders: {year_folders[0]} to {year_folders[-1]}")

    df_list = []
    for year_folder in year_folders:
        folder_path = os.path.join(parent_folder, year_folder)
        # os.listdir order is arbitrary; sort so the row order of the result is
        # the same on every platform and every run.
        meta_files = sorted(
            name for name in os.listdir(folder_path)
            if name.endswith(meta_suffix) and not name.endswith('-ana-meta.tsv')
        )

        for meta_file in meta_files:
            # Strip only the trailing suffix (str.replace would also remove an
            # occurrence in the middle of the name).
            base = meta_file[:-len(meta_suffix)]
            meta_path = os.path.join(folder_path, meta_file)
            txt_path = os.path.join(folder_path, base + '.txt')

            # Best effort by design: one unreadable file must not abort the
            # whole corpus load, so the error is reported and the file skipped.
            try:
                df_meta = pd.read_csv(meta_path, sep='\t', encoding='utf-8', index_col=False)

                text_map = {}
                with open(txt_path, encoding='utf-8') as f:
                    for line in f:
                        parts = line.strip().split('\t', 1)
                        if len(parts) == 2:
                            text_map[parts[0]] = parts[1]

                df_meta['Text'] = df_meta['ID'].map(text_map)
                has_text = df_meta['Text'].notnull() & (df_meta['Text'].str.strip() != '')
                df_meta = df_meta[has_text]

                if len(df_meta) > 0:  # Only keep non-empty dataframes
                    df_list.append(df_meta)
            except Exception as e:
                print(f"    Error: {meta_file}: {e}")

    if not df_list:
        print(f"  ‚ö†Ô∏è No valid data loaded")
        return None

    # Every frame in df_list is non-empty, so the concatenation is non-empty too.
    df_all = pd.concat(df_list, ignore_index=True)
    print(f"  Loaded {len(df_all):,} speeches")
    return df_all


# Load all countries.
# Result: raw_data maps country code -> DataFrame that always has both a
# 'Text_English' and a 'Text_Native' column (one of them all-None when that
# language is unavailable), and CONFIG[code]['mode'] is set to 'bilingual',
# 'english_only' or 'native_only' for the later steps.
checkpoint_file = os.path.join(CHECKPOINT_DIR, 'step1_raw_data.pkl')

if os.path.exists(checkpoint_file):
    print("üìÇ Loading from checkpoint...")
    # NOTE: pickle executes arbitrary code on load - only use checkpoint files
    # written by this notebook.
    with open(checkpoint_file, 'rb') as f:
        raw_data = pickle.load(f)
    print(f"‚úÖ Loaded {len(raw_data)} countries")
    print("\n‚ö†Ô∏è  NOTE: If you added/removed language data:")
    print(f"   Delete checkpoints: rm -rf {CHECKPOINT_DIR}/*")
    print("   Then rerun this cell")

    # Reconstruct mode (needed for steps 2-4): it lives in CONFIG, which is not
    # part of the checkpoint, so derive it from which text columns hold data.
    for code, df in raw_data.items():
        has_en = 'Text_English' in df.columns and df['Text_English'].notna().any()
        has_nat = 'Text_Native' in df.columns and df['Text_Native'].notna().any()
        if has_en and has_nat:
            CONFIG[code]['mode'] = 'bilingual'
        elif has_en:
            CONFIG[code]['mode'] = 'english_only'
        elif has_nat:
            CONFIG[code]['mode'] = 'native_only'
        else:
            CONFIG[code]['mode'] = 'unknown'
else:
    print("üîÑ Loading data from source...")
    raw_data = {}

    for code, config in CONFIG.items():
        print(f"\n{'='*60}")
        print(f"{config['name']} ({code})")
        print(f"{'='*60}")

        df_english = load_parlamint_data(config['english_path'])
        # Non-bilingual entries may omit 'native_path' entirely.
        native_path = config.get('native_path') if config['bilingual'] else None
        df_native = load_parlamint_data(native_path) if native_path else None

        # Start with base dataframe (prefer English for metadata)
        if df_english is not None:
            df = df_english.rename(columns={'Text': 'Text_English'})
            has_english = True
        elif df_native is not None:
            df = df_native.rename(columns={'Text': 'Text_Native'})
            has_english = False
        else:
            print(f"  ‚ùå No data found - skipping")
            continue

        # Add missing language columns (always have both columns)
        if has_english and df_native is not None:
            # Merge native text.  The column is renamed BEFORE merging: the base
            # frame no longer has a 'Text' column, so merge suffixes would never
            # be applied and the native text would arrive as plain 'Text',
            # leaving 'Text_Native' missing.
            native_text = df_native[['ID', 'Text']].rename(columns={'Text': 'Text_Native'})
            df = df.merge(native_text, on='ID', how='left')
            config['mode'] = 'bilingual'
            print(f"  ‚úÖ BILINGUAL mode ({len(df):,} speeches)")
        elif has_english:
            # Only English available
            df['Text_Native'] = None
            config['mode'] = 'english_only'
            print(f"  ‚úÖ ENGLISH ONLY mode ({len(df):,} speeches)")
        else:
            # Only Native available
            df['Text_English'] = None
            config['mode'] = 'native_only'
            print(f"  ‚úÖ NATIVE ONLY mode ({len(df):,} speeches)")

        raw_data[code] = df

    if not raw_data:
        raise ValueError("No data loaded. Check your data paths in CONFIG.")

    with open(checkpoint_file, 'wb') as f:
        pickle.dump(raw_data, f)
    print(f"\nüíæ Checkpoint saved")

print(f"\n‚úÖ Data loaded: {list(raw_data.keys())}")

üîÑ Loading data from source...

Austria (AT)
Loading from: data folder\AT\ParlaMint5.0-AT-en.ana\ParlaMint-AT-en.txt
  ‚ö†Ô∏è Path not found: data folder\AT\ParlaMint5.0-AT-en.ana\ParlaMint-AT-en.txt
Loading from: data folder\AT\ParlaMint-AT\ParlaMint-AT.txt
Austria (AT)
Loading from: data folder\AT\ParlaMint5.0-AT-en.ana\ParlaMint-AT-en.txt
  ‚ö†Ô∏è Path not found: data folder\AT\ParlaMint5.0-AT-en.ana\ParlaMint-AT-en.txt
Loading from: data folder\AT\ParlaMint-AT\ParlaMint-AT.txt
  Found 27 year folders: 1996 to 2022

  Found 27 year folders: 1996 to 2022
  Loaded 231,759 speeches
  Loaded 231,759 speeches
  ‚úÖ NATIVE ONLY mode (231,759 speeches)

Croatia (HR)
Loading from: data folder\HR\ParlaMint5.0-HR-en.ana\ParlaMint-HR-en.txt
  ‚ö†Ô∏è Path not found: data folder\HR\ParlaMint5.0-HR-en.ana\ParlaMint-HR-en.txt
Loading from: data folder\HR\ParlaMint-HR\ParlaMint-HR.txt
  Found 20 year folders: 2003 to 2022
  ‚úÖ NATIVE ONLY mode (231,759 speeches)

Croatia (HR)
Loading from: data 

## Step 2: Speech Embeddings

Generate 1024-dimensional BGE-m3 embeddings for each speech. The model has a token limit of 8192, so longer speeches are split into overlapping chunks whose embeddings are averaged.

In [None]:
def _embed_text_with_chunking(model, tokenizer, text, max_tokens, chunk_size, stride):
    """Embed one text; texts longer than max_tokens are split into overlapping
    token windows whose embeddings are averaged."""
    token_ids = tokenizer.encode(text, add_special_tokens=False)

    if len(token_ids) <= max_tokens:
        return model.encode([text], convert_to_tensor=False, show_progress_bar=False)[0]

    chunks = []
    for start in range(0, len(token_ids), stride):
        end = min(start + chunk_size, len(token_ids))
        chunks.append(tokenizer.decode(token_ids[start:end], skip_special_tokens=True))
        if end == len(token_ids):
            # The text is fully covered.  Another window would lie entirely
            # inside this one and only give the tail extra weight in the mean.
            break
    return np.mean(model.encode(chunks, convert_to_tensor=False, show_progress_bar=False), axis=0)


def add_speech_embeddings(df, text_column='Text', checkpoint_prefix=''):
    """Generate BGE-m3 embeddings for speeches with 10% checkpoint intervals.

    Args:
        df: DataFrame with one speech per row.
        text_column: Column holding the speech text.  Values are converted
            with ``astype(str)`` before embedding.
        checkpoint_prefix: When non-empty, progress is saved roughly every
            10% to ``<CHECKPOINT_DIR>/<prefix>_partial.pkl``.  If that file
            already exists when the function starts, the embeddings stored in
            it are reused and processing resumes after them.  The file is
            removed once all speeches are embedded.

    Returns:
        A copy of ``df`` with a ``Speech_Embeddings`` column (one embedding
        vector per row).
    """
    from sentence_transformers import SentenceTransformer

    print(f"\n{'='*60}")
    print(f"Generating Speech Embeddings ({text_column})")
    print(f"{'='*60}")

    device = "cuda" if torch.cuda.is_available() else "cpu"
    batch_size = 64 if device == "cuda" else 16
    print(f"Device: {device}, Batch size: {batch_size}")

    model = SentenceTransformer("BAAI/bge-m3", device=device)
    tokenizer = model.tokenizer

    # Optimized chunking: 25% overlap with maximum token size
    MAX_TOKENS = 8192
    CHUNK_SIZE = 8000
    STRIDE = 6000

    # NOTE(review): astype(str) turns missing values into the literal strings
    # 'None'/'nan', which are then embedded like ordinary text - confirm that
    # this is intended for rows without text in this language.
    texts = df[text_column].astype(str).values
    total = len(texts)
    checkpoint_interval = max(1, total // 10)  # Every 10%

    partial_checkpoint = (
        os.path.join(CHECKPOINT_DIR, f'{checkpoint_prefix}_partial.pkl')
        if checkpoint_prefix else None
    )

    # Resume from an earlier interrupted run.  The checkpoint is only valid for
    # the same data in the same order; delete it if the input changed.
    # NOTE: pickle executes arbitrary code on load - only use files written by
    # this notebook.
    embeddings = []
    if partial_checkpoint and os.path.exists(partial_checkpoint):
        try:
            with open(partial_checkpoint, 'rb') as f:
                restored = pickle.load(f)
        except (pickle.UnpicklingError, EOFError, OSError) as e:
            print(f"  Partial checkpoint unreadable ({e}); starting from scratch")
            restored = None
        if isinstance(restored, list) and len(restored) <= total:
            embeddings = restored
            print(f"  Resuming from checkpoint: {len(embeddings):,}/{total:,} already embedded")

    last_checkpoint = len(embeddings)  # Track last checkpoint position
    # Free GPU memory about every 1000 speeches, counted in whole batches
    # (a test like `i % 1000 == 0` only fires when i is also a multiple of the
    # batch size, i.e. far less often than intended).
    cleanup_every = max(1, 1000 // batch_size)

    with tqdm(total=total, initial=len(embeddings), desc="Embedding", unit="speech") as pbar:
        for batch_no, i in enumerate(range(len(embeddings), total, batch_size)):
            batch_texts = texts[i:i+batch_size]
            embeddings.extend(
                _embed_text_with_chunking(model, tokenizer, text, MAX_TOKENS, CHUNK_SIZE, STRIDE)
                for text in batch_texts
            )
            pbar.update(len(batch_texts))

            # Checkpoint every 10%
            if partial_checkpoint and len(embeddings) - last_checkpoint >= checkpoint_interval:
                # Write to a temp file and swap it in, so an interruption during
                # the write cannot leave a truncated checkpoint behind.
                tmp_file = partial_checkpoint + '.tmp'
                with open(tmp_file, 'wb') as f:
                    pickle.dump(embeddings, f)
                os.replace(tmp_file, partial_checkpoint)
                progress = int((len(embeddings) / total) * 100)
                print(f"  üíæ Checkpoint saved: {progress}% complete ({len(embeddings):,}/{total:,})")
                last_checkpoint = len(embeddings)

            if device == "cuda" and batch_no % cleanup_every == 0:
                torch.cuda.empty_cache()
                gc.collect()

    # Clean up partial checkpoint
    if partial_checkpoint and os.path.exists(partial_checkpoint):
        os.remove(partial_checkpoint)

    df_result = df.copy()
    df_result['Speech_Embeddings'] = embeddings

    if device == "cuda":
        torch.cuda.empty_cache()
    gc.collect()

    return df_result


# Process all countries.
# Result: processed_data maps country code -> DataFrame with the extra columns
# 'Speech_Embeddings_English' and 'Speech_Embeddings_Native' (None where the
# language is unavailable).
checkpoint_file = os.path.join(CHECKPOINT_DIR, 'step2_speech_embeddings.pkl')

processed_data = {}
if os.path.exists(checkpoint_file):
    print("üìÇ Loading from checkpoint...")
    # NOTE: pickle executes arbitrary code on load - only use checkpoint files
    # written by this notebook.
    with open(checkpoint_file, 'rb') as f:
        processed_data = pickle.load(f)
    print(f"‚úÖ Loaded {len(processed_data)} countries")

# The checkpoint is rewritten after every country, so after an interruption it
# may hold only some of them.  Process whatever is still missing instead of
# treating an existing checkpoint as complete.
pending_codes = [code for code in raw_data if code not in processed_data]

for idx, (code, df) in enumerate(raw_data.items(), 1):
    if code in processed_data:
        continue

    config = CONFIG[code]
    print(f"\n[{idx}/{len(raw_data)}] {config['name']} - {config['mode'].upper()}")

    df_emb = df.copy()

    # English embeddings (only if data exists)
    if config['mode'] in ['bilingual', 'english_only'] and df['Text_English'].notna().any():
        df_temp = add_speech_embeddings(df, 'Text_English', f'step2_{code}_en')
        df_emb['Speech_Embeddings_English'] = df_temp['Speech_Embeddings']
    else:
        df_emb['Speech_Embeddings_English'] = None

    # Native embeddings (only if data exists)
    if config['mode'] in ['bilingual', 'native_only'] and df['Text_Native'].notna().any():
        df_temp = add_speech_embeddings(df, 'Text_Native', f'step2_{code}_native')
        df_emb['Speech_Embeddings_Native'] = df_temp['Speech_Embeddings']
    else:
        df_emb['Speech_Embeddings_Native'] = None

    processed_data[code] = df_emb

    # Save after every country so a crash loses at most one country of work.
    with open(checkpoint_file, 'wb') as f:
        pickle.dump(processed_data, f)

# The step-1 checkpoint is superseded once this step has produced new results.
step1_checkpoint = os.path.join(CHECKPOINT_DIR, 'step1_raw_data.pkl')
if pending_codes and os.path.exists(step1_checkpoint):
    os.remove(step1_checkpoint)

print(f"\n‚úÖ Speech embeddings complete")


[1/3] Austria - NATIVE_ONLY

Generating Speech Embeddings (Text_Native)
Device: cpu, Batch size: 16

Generating Speech Embeddings (Text_Native)
Device: cpu, Batch size: 16


Embedding:   0%|          | 64/231759 [02:20<115:50:49,  1.80s/speech]Token indices sequence length is longer than the specified maximum sequence length for this model (9284 > 8192). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (9284 > 8192). Running this sequence through the model will result in indexing errors
Embedding:   0%|          | 80/231759 [08:17<592:19:08,  9.20s/speech]

## Step 3: Segmentation & Segment IDs

Find segment boundaries using **automatic parameter optimization** and multi-signal detection.

**How it works:**
1. **Auto-optimize parameters** for each country using silhouette scoring
   - `window_size`: 3-15 speeches (context window for similarity comparison)
   - `min_segment_size`: 5-50 speeches (minimum speeches per topic)

2. **Multi-signal boundary detection:**
   - **Keyword detection** (weight: 3.0): Chairperson announcements ("next agenda item")
   - **Similarity drop** (weight: 2.0): Semantic shift between speech windows
   - **Distance spike** (weight: 1.0): Large embedding jumps between consecutive speeches

3. **Validation:**
   - Must be a local maximum (highest score within a ±5 speech window)
   - Must be ≥ min_segment_size speeches away from any other boundary
   - Adaptive threshold (top 20% of boundary scores)

**Strategy:** Prefer over-segmentation (easier to merge similar segments later than to split under-segmented ones)

**Session handling:**
- Each `Text_ID` (parliamentary sitting) is segmented independently
- Brief sessions (< min_segment_size) → 1 segment
- Regular sessions → Multiple segments based on detected boundaries

In [None]:
def optimize_segmentation_params(embeddings, min_window=3, max_window=15, min_seg=5, max_seg=50):
    """
    Optimize WINDOW_SIZE and MIN_SEGMENT_SIZE using silhouette score.

    Candidate values are window sizes ``min_window..max_window`` (step 2) and
    minimum segment sizes ``min_seg..max_seg`` (step 5).  Each candidate
    segment size is scored by clustering the embeddings into
    ``len(sample) // min_segment_size`` agglomerative clusters and taking the
    silhouette score; the best-scoring combination is returned.

    NOTE: the score is computed from the segment size only, so it is the same
    for every admissible window size; because ties keep the first candidate,
    the smallest window that fits the sample is the one returned.

    Args:
        embeddings: 2-D array-like, one embedding per row.  More than 1000
            rows are randomly subsampled to 1000 for efficiency.
        min_window, max_window: Inclusive range of window sizes to try.
        min_seg, max_seg: Inclusive range of minimum segment sizes to try.

    Returns:
        dict with keys ``'window_size'`` and ``'min_segment_size'``.  Falls
        back to ``{'window_size': 7, 'min_segment_size': 10}`` when no
        combination could be evaluated.
    """
    best_score = -1
    best_params = {'window_size': 7, 'min_segment_size': 10}

    embeddings = np.asarray(embeddings)

    # A window needs at least 2 * window rows; if even the smallest window does
    # not fit, nothing can be evaluated.
    if len(embeddings) < min_window * 2:
        return best_params

    # Imported after the cheap fallback so that path does not need scikit-learn.
    from sklearn.metrics import silhouette_score
    from sklearn.cluster import AgglomerativeClustering

    # Sample if too large (for efficiency)
    if len(embeddings) > 1000:
        indices = np.random.choice(len(embeddings), 1000, replace=False)
        sample_embs = embeddings[indices]
    else:
        sample_embs = embeddings
    n_samples = len(sample_embs)

    # Distinct names for the candidates: reusing `min_seg` as the loop variable
    # would overwrite the parameter, so every window after the first would only
    # try the last segment size.
    window_candidates = range(min_window, max_window + 1, 2)
    segment_candidates = range(min_seg, max_seg + 1, 5)

    # The score depends only on the cluster count, so compute it once per count
    # instead of once per (window, segment size) pair.
    score_by_cluster_count = {}

    for window in window_candidates:
        if n_samples < window * 2:
            continue

        for segment_size in segment_candidates:
            # Quick clustering to evaluate
            n_clusters = max(2, n_samples // segment_size)
            if n_clusters >= n_samples:
                continue

            if n_clusters not in score_by_cluster_count:
                try:
                    labels = AgglomerativeClustering(n_clusters=n_clusters).fit_predict(sample_embs)
                    if len(np.unique(labels)) > 1:
                        score_by_cluster_count[n_clusters] = silhouette_score(sample_embs, labels)
                    else:
                        score_by_cluster_count[n_clusters] = None
                except ValueError:
                    # scikit-learn rejected this configuration; skip it.
                    score_by_cluster_count[n_clusters] = None

            score = score_by_cluster_count[n_clusters]
            if score is not None and score > best_score:
                best_score = score
                best_params = {'window_size': window, 'min_segment_size': segment_size}

    return best_params


def detect_boundaries_robust(embeddings, texts, roles, keywords, window_size, min_segment_size):
    """
    Robust boundary detection using multiple weighted signals.

    Strategy: Prefer over-segmentation to under-segmentation.
    - Over-segmented topics can be merged in post-processing
    - Under-segmented topics are difficult to split without re-running pipeline

    Every position gets a boundary score made of:
    - +3.0 when the speaker role contains 'Chairperson' and the lower-cased
      text contains any keyword (substring match),
    - (1 - cosine similarity) * 2.0 between the mean embedding of the
      ``window_size`` speeches before and after the position,
    - min(Euclidean distance to the previous embedding / 10, 1.0).

    A position becomes a boundary when its score is above the 80th percentile
    of all scores, is the maximum within +/-5 positions, lies at least
    ``min_segment_size`` from both ends, and is at least ``min_segment_size``
    away from every higher-scoring boundary already accepted.

    Args:
        embeddings: 2-D array-like, one embedding per speech, in session order.
        texts: Speech texts aligned with ``embeddings``.
        roles: Speaker roles aligned with ``embeddings``.
        keywords: Lower-case keyword substrings; ``None`` or empty disables
            the keyword signal.
        window_size: Number of speeches on each side for the similarity signal.
        min_segment_size: Minimum distance between boundaries and from the
            session edges.

    Returns:
        Sorted list of boundary positions (index of the first speech of each
        new segment); empty when the session has fewer than
        ``2 * min_segment_size`` speeches.
    """
    embeddings = np.asarray(embeddings)
    n = len(embeddings)
    if n == 0 or n < min_segment_size * 2:
        return []

    # Country configs may carry `None` for the keywords; treat it as "none".
    keywords = keywords or ()

    boundary_scores = np.zeros(n)

    # === Signal 1: Keyword Detection (Strong Signal) ===
    for i, (text, role) in enumerate(zip(texts, roles)):
        if 'Chairperson' in str(role):
            text_lower = str(text).lower()
            if any(kw in text_lower for kw in keywords):
                boundary_scores[i] += 3.0  # High weight

    # === Signal 2: Cosine Similarity Drop (Medium Signal) ===
    if window_size > 0 and n > window_size * 2:
        for i in range(window_size, n - window_size):
            # Compare windows before and after position i (both are always
            # full windows because of the loop bounds).
            mean_before = np.mean(embeddings[i - window_size:i], axis=0)
            mean_after = np.mean(embeddings[i:i + window_size], axis=0)

            # Plain cosine similarity; a zero vector is treated as similarity 0.
            norm_product = np.linalg.norm(mean_before) * np.linalg.norm(mean_after)
            sim = float(np.dot(mean_before, mean_after) / norm_product) if norm_product > 0 else 0.0

            # Invert similarity to boundary score (low similarity = high boundary score)
            boundary_scores[i] += (1 - sim) * 2.0  # Medium weight

    # === Signal 3: Embedding Distance Spike (Weak Signal) ===
    if n > 1:
        step_distances = np.linalg.norm(np.diff(embeddings, axis=0), axis=1)
        boundary_scores[1:] += np.minimum(step_distances / 10.0, 1.0)  # Cap at 1.0

    # === Find peaks in boundary scores ===
    # Adaptive threshold: only the top 20% of scores are candidates
    threshold = np.percentile(boundary_scores, 80)

    candidates = []
    for i in range(min_segment_size, n - min_segment_size):
        if boundary_scores[i] > threshold:
            # Check if it's a local maximum
            neighbourhood = boundary_scores[max(0, i - 5):min(n, i + 6)]
            if boundary_scores[i] == np.max(neighbourhood):
                candidates.append((i, boundary_scores[i]))

    # Strongest candidates first, then enforce the minimum distance constraint
    candidates.sort(key=lambda x: x[1], reverse=True)

    validated = []
    for pos, _score in candidates:
        if all(abs(pos - v) >= min_segment_size for v in validated):
            validated.append(pos)

    return sorted(validated)


def create_segments(dataset, embedding_col, text_col, keywords, text_id_col='Text_ID',
                   optimize_params=True, optimized_params=None):
    """
    Create segments with automatic parameter optimization.

    Design philosophy: Favor over-segmentation
    - Better to split a long debate into multiple segments
    - Post-processing can merge similar adjacent segments
    - Under-segmentation is difficult to fix retroactively

    Each session (rows sharing the same ``text_id_col`` value) is segmented
    independently.  Sessions shorter than ``min_segment_size`` become a single
    segment.

    Args:
        dataset: DataFrame with one speech per row, in speaking order.  Must
            contain ``text_id_col``, ``embedding_col``, ``text_col`` and a
            ``'Speaker_role'`` column.
        embedding_col: Column holding one embedding vector per speech.
        text_col: Column holding the speech text.
        keywords: Keywords forwarded to the boundary detector.
        text_id_col: Column identifying the session.
        optimize_params: When True (and ``optimized_params`` is not given),
            parameters are optimized on the first session if it has at least
            50 speeches; otherwise defaults are used.
        optimized_params: Optional dict with {'window_size': int, 'min_segment_size': int}
                         If provided, skips optimization step

    Returns:
        ``(segments, stats)``.  ``segments`` is a list of dicts with keys
        ``'Text_ID'``, ``'Segment_ID'``, ``'Start_Index'`` and ``'End_Index'``
        (inclusive positions within the session).  ``stats`` holds the counts
        ``'total_segments'``, ``'keyword_boundaries'``,
        ``'similarity_boundaries'`` and, under ``'optimized_params'``, the
        parameters that were actually used.
    """
    all_segments = []
    stats = {
        'total_segments': 0,
        'keyword_boundaries': 0,
        'similarity_boundaries': 0,
        'optimized_params': None
    }

    # Reasonable defaults, used whenever optimization is skipped or impossible
    window_size = 7
    min_segment_size = 10

    # Use provided params or optimize
    if optimized_params:
        window_size = optimized_params['window_size']
        min_segment_size = optimized_params['min_segment_size']
        print(f"Using provided params: window_size={window_size}, min_segment_size={min_segment_size}")
    elif optimize_params and len(dataset) > 0:
        print("üîç Optimizing segmentation parameters...")
        # The first session in the data serves as the optimization sample.
        sample_session = dataset[text_id_col].iloc[0]
        sample_data = dataset[dataset[text_id_col] == sample_session]
        sample_embs = np.array(sample_data[embedding_col].tolist())

        if len(sample_embs) >= 50:
            optimal = optimize_segmentation_params(sample_embs)
            window_size = optimal['window_size']
            min_segment_size = optimal['min_segment_size']
            print(f"‚úÖ Optimized: window_size={window_size}, min_segment_size={min_segment_size}")
        else:
            print(f"‚ö†Ô∏è Sample too small, using safe defaults: window_size={window_size}, min_segment_size={min_segment_size}")

    # Always report the parameters actually used, including the defaults, so
    # callers can reuse them for a second language.
    stats['optimized_params'] = {'window_size': window_size, 'min_segment_size': min_segment_size}

    # Process each session.  A single groupby pass (in order of first
    # appearance) replaces one full-table boolean mask per session.
    sessions = dataset.groupby(text_id_col, sort=False)
    for session_id, session_rows in tqdm(sessions, total=sessions.ngroups,
                                         desc="Segmenting", unit="session"):
        session = session_rows.reset_index(drop=True)

        if len(session) < min_segment_size:
            all_segments.append({
                'Text_ID': session_id,
                'Segment_ID': f"{session_id}_seg_1",
                'Start_Index': 0,
                'End_Index': len(session) - 1
            })
            stats['total_segments'] += 1
            continue

        embeddings = np.array(session[embedding_col].tolist())
        texts = session[text_col].values
        roles = session['Speaker_role'].values

        # Detect boundaries
        boundaries = detect_boundaries_robust(
            embeddings, texts, roles, keywords, window_size, min_segment_size
        )

        # Count boundary types (approximate: a boundary on a chairperson speech
        # is counted as keyword-driven even if the keyword signal did not fire)
        for b in boundaries:
            if 'Chairperson' in str(roles[b]):
                stats['keyword_boundaries'] += 1
            else:
                stats['similarity_boundaries'] += 1

        # Create segments: each boundary starts a new segment
        breaks = [0] + boundaries + [len(session)]
        for seg_no, (start, next_start) in enumerate(zip(breaks[:-1], breaks[1:]), 1):
            all_segments.append({
                'Text_ID': session_id,
                'Segment_ID': f"{session_id}_seg_{seg_no}",
                'Start_Index': start,
                'End_Index': next_start - 1
            })
            stats['total_segments'] += 1

    return all_segments, stats


def add_segment_ids_to_df(df, segments, text_id_col='Text_ID'):
    """Map segment IDs to dataframe rows.

    Args:
        df: DataFrame with one speech per row; rows of a session must be in
            the same relative order that was used to compute the segments.
        segments: Iterable of dicts with keys ``'Text_ID'``, ``'Segment_ID'``,
            ``'Start_Index'`` and ``'End_Index'``, where the indices are
            inclusive positions within the session.
        text_id_col: Column identifying the session.

    Returns:
        A copy of ``df`` with a ``Segment_ID`` column.  Rows not covered by
        any segment get ``<text id>_seg_0``.
    """
    df = df.copy()

    # Row positions of every session, computed once.  Looking sessions up here
    # avoids scanning the whole frame for each segment, and assigning by
    # position (not by index label) stays correct when labels are duplicated.
    positions_by_session = df.groupby(text_id_col, sort=False).indices

    segment_ids = np.full(len(df), None, dtype=object)
    for seg in segments:
        positions = positions_by_session.get(seg['Text_ID'])
        if positions is None or len(positions) <= seg['Start_Index']:
            continue
        covered = positions[seg['Start_Index']:seg['End_Index'] + 1]
        segment_ids[covered] = seg['Segment_ID']

    # Fill missing
    missing = pd.isna(segment_ids)
    if missing.any():
        segment_ids[missing] = (df[text_id_col][missing] + '_seg_0').to_numpy()

    df['Segment_ID'] = segment_ids
    return df


# Process segmentation.
# Result: final_data maps country code -> DataFrame with the extra columns
# 'Segment_ID_English' and 'Segment_ID_Native' (None where the language has
# no embeddings).
checkpoint_file = os.path.join(CHECKPOINT_DIR, 'step3_segmentation.pkl')

if os.path.exists(checkpoint_file):
    print("üìÇ Loading from checkpoint...")
    # NOTE: pickle executes arbitrary code on load - only use checkpoint files
    # written by this notebook.
    with open(checkpoint_file, 'rb') as f:
        final_data = pickle.load(f)
    print(f"‚úÖ Loaded {len(final_data)} countries")
else:
    final_data = {}

    for idx, (code, df) in enumerate(processed_data.items(), 1):
        config = CONFIG[code]
        print(f"\n[{idx}/{len(processed_data)}] {config['name']} - {config['mode'].upper()}")

        df_final = df.copy()

        # Parameters found on the English text are reused for the native text so
        # both segmentations of a bilingual country are comparable.  None means
        # "nothing found yet".
        optimized_params = None

        # English segmentation (only if embeddings exist)
        if config['mode'] in ['bilingual', 'english_only'] and df['Speech_Embeddings_English'].notna().any():
            segments_en, stats_en = create_segments(df, 'Speech_Embeddings_English', 'Text_English',
                                                     ENGLISH_KEYWORDS, optimize_params=True)
            df_temp = add_segment_ids_to_df(df, segments_en)
            df_final['Segment_ID_English'] = df_temp['Segment_ID']
            print(f"‚úÖ English: {stats_en['total_segments']:,} segments | params: {stats_en['optimized_params']}")
            optimized_params = stats_en['optimized_params']
        else:
            df_final['Segment_ID_English'] = None

        # Native segmentation (only if embeddings exist)
        if config['mode'] in ['bilingual', 'native_only'] and df['Speech_Embeddings_Native'].notna().any():
            # `.get(key, default)` would return None when the key exists with a
            # None value, so fall back explicitly.
            native_keywords = config.get('native_keywords') or ENGLISH_KEYWORDS
            # Without English parameters (native-only data) optimize on the
            # native embeddings instead of silently using fixed defaults.
            segments_native, _ = create_segments(df, 'Speech_Embeddings_Native', 'Text_Native',
                                                 native_keywords,
                                                 optimize_params=optimized_params is None,
                                                 optimized_params=optimized_params)
            df_temp = add_segment_ids_to_df(df, segments_native)
            df_final['Segment_ID_Native'] = df_temp['Segment_ID']
            print(f"‚úÖ Native: {len(segments_native):,} segments")
        else:
            df_final['Segment_ID_Native'] = None

        final_data[code] = df_final

    with open(checkpoint_file, 'wb') as f:
        pickle.dump(final_data, f)

    # The step-2 checkpoint is superseded by the one just written.
    step2_checkpoint = os.path.join(CHECKPOINT_DIR, 'step2_speech_embeddings.pkl')
    if os.path.exists(step2_checkpoint):
        os.remove(step2_checkpoint)

print(f"\n‚úÖ Segmentation complete")

## Step 4: Segment Embeddings

Generate embeddings for each segment (concatenated speeches).

In [None]:
def _iter_chunk_bounds(n_tokens, chunk_size, stride):
    """Yield ``(start, end)`` token windows that together cover ``n_tokens`` tokens.

    Each window is at most ``chunk_size`` tokens long and starts ``stride``
    tokens after the previous one. Iteration stops as soon as a window reaches
    the end of the sequence, so no trailing window lies entirely inside the
    previous one (which would otherwise be counted twice when averaging).
    """
    start = 0
    while True:
        end = min(start + chunk_size, n_tokens)
        yield start, end
        if end >= n_tokens:
            return
        start += stride


def _load_partial_embeddings(path, segment_ids):
    """Return embeddings saved at ``path`` if they match a prefix of ``segment_ids``.

    Anything unreadable, in an unexpected format, or saved for a different
    list of segments yields an empty list so the caller starts from scratch.
    """
    try:
        # NOTE: pickle is only safe here because the file is written by this pipeline.
        with open(path, 'rb') as f:
            saved = pickle.load(f)
    except (OSError, EOFError, pickle.UnpicklingError,
            AttributeError, ImportError, IndexError, ValueError):
        return []

    if not isinstance(saved, dict):
        return []
    saved_ids = saved.get('segment_ids')
    saved_embeddings = saved.get('embeddings')
    if not isinstance(saved_ids, list) or not isinstance(saved_embeddings, list):
        return []

    done = len(saved_embeddings)
    if done == 0 or done != len(saved_ids) or done > len(segment_ids):
        return []
    # Only resume when the checkpoint belongs to exactly these segments.
    if saved_ids != segment_ids[:done]:
        return []
    return saved_embeddings


def _save_partial_embeddings(path, segment_ids, embeddings):
    """Write the embeddings computed so far to ``path`` without leaving a torn file."""
    tmp_path = path + '.tmp'
    payload = {
        'segment_ids': segment_ids[:len(embeddings)],
        'embeddings': embeddings,
    }
    with open(tmp_path, 'wb') as f:
        pickle.dump(payload, f)
    # Swap the finished file into place so an interrupt mid-write cannot corrupt the checkpoint.
    os.replace(tmp_path, path)


def add_segment_embeddings(df, text_col, segment_col, checkpoint_prefix=''):
    """Embed each segment's concatenated speeches and attach the vectors to ``df``.

    All non-missing values of ``text_col`` that share a ``segment_col`` value
    are joined with spaces and encoded with the ``BAAI/bge-m3`` sentence
    transformer. Texts longer than ``MAX_TOKENS`` tokens are split into
    overlapping token windows whose embeddings are averaged.

    Args:
        df: DataFrame containing ``text_col`` and ``segment_col``.
        text_col: Column with the speech text to concatenate per segment.
        segment_col: Column with each row's segment ID. Rows with a missing ID
            belong to no group and therefore receive no embedding.
        checkpoint_prefix: When non-empty, progress is written to
            ``CHECKPOINT_DIR/<prefix>_partial.pkl`` roughly every 10% of the
            segments, and an interrupted run resumes from that file. The file
            is removed once all segments are embedded.

    Returns:
        A copy of ``df`` with an added ``Segment_Embeddings_<text_col>`` column
        holding the embedding of the row's segment.
    """
    from sentence_transformers import SentenceTransformer

    print(f"\n{'='*60}")
    print(f"Segment Embeddings ({segment_col})")
    print(f"{'='*60}")

    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Texts are encoded one at a time; batch_size only sets how often progress
    # is reported and checkpoints are considered.
    batch_size = 64 if device == "cuda" else 16

    # Chunking for over-long segments: consecutive windows share
    # CHUNK_SIZE - STRIDE = 2000 tokens (25% of a window).
    # NOTE(review): MAX_TOKENS presumably mirrors the model's context limit; the
    # count below excludes special tokens - confirm against the model card.
    MAX_TOKENS = 8192
    CHUNK_SIZE = 8000
    STRIDE = 6000

    # Concatenate speeches by segment; missing text is skipped rather than
    # being embedded as the literal string "nan".
    segment_texts = df.groupby(segment_col)[text_col].apply(
        lambda x: ' '.join(x.dropna().astype(str))
    )
    segment_ids = segment_texts.index.tolist()
    texts = segment_texts.tolist()

    total = len(texts)
    checkpoint_interval = max(1, total // 10)  # Every 10%
    print(f"Processing {total:,} segments...")

    partial_path = (
        os.path.join(CHECKPOINT_DIR, f'{checkpoint_prefix}_partial.pkl')
        if checkpoint_prefix else None
    )

    # Pick up where an interrupted run left off, if its checkpoint matches.
    embeddings = []
    if partial_path and os.path.exists(partial_path):
        embeddings = _load_partial_embeddings(partial_path, segment_ids)
        if embeddings:
            print(f"  Resuming from checkpoint: {len(embeddings):,}/{total:,} segments already embedded")
    resume_at = len(embeddings)
    last_checkpoint = resume_at  # Track last checkpoint position

    # Skip the (expensive) model load when nothing is left to embed.
    model = tokenizer = None
    if resume_at < total:
        model = SentenceTransformer("BAAI/bge-m3", device=device)
        tokenizer = model.tokenizer

    with tqdm(total=total, initial=resume_at, desc="Embedding", unit="segment") as pbar:
        for i in range(resume_at, total, batch_size):
            batch = texts[i:i+batch_size]
            batch_emb = []

            for text in batch:
                tokens = tokenizer.encode(text, add_special_tokens=False)
                if len(tokens) <= MAX_TOKENS:
                    emb = model.encode([text], convert_to_tensor=False, show_progress_bar=False)[0]
                else:
                    chunks = [
                        tokenizer.decode(tokens[start:end], skip_special_tokens=True)
                        for start, end in _iter_chunk_bounds(len(tokens), CHUNK_SIZE, STRIDE)
                    ]
                    emb = np.mean(model.encode(chunks, convert_to_tensor=False, show_progress_bar=False), axis=0)
                batch_emb.append(emb)

            embeddings.extend(batch_emb)
            pbar.update(len(batch))

            # Checkpoint once at least 10% more segments are done since the last save
            if partial_path and len(embeddings) - last_checkpoint >= checkpoint_interval:
                _save_partial_embeddings(partial_path, segment_ids, embeddings)
                progress = int((len(embeddings) / total) * 100)
                print(f"  üíæ Checkpoint saved: {progress}% complete ({len(embeddings):,}/{total:,})")
                last_checkpoint = len(embeddings)

    # Clean up partial checkpoint (and any leftover temp file from an interrupted save)
    if partial_path:
        for stale in (partial_path, partial_path + '.tmp'):
            if os.path.exists(stale):
                os.remove(stale)

    # Map back to dataframe
    emb_map = dict(zip(segment_ids, embeddings))
    df = df.copy()
    df[f'Segment_Embeddings_{text_col}'] = df[segment_col].map(emb_map)

    # Drop the model references first; otherwise its GPU memory is still in
    # use and empty_cache() has nothing to release.
    del model, tokenizer
    gc.collect()
    if device == "cuda":
        torch.cuda.empty_cache()

    return df


# Step 4 driver: reuse the saved Step 4 result when it exists; otherwise embed
# the segments of every country, save the result and drop the Step 3 checkpoint.
checkpoint_file = os.path.join(CHECKPOINT_DIR, 'step4_segment_embeddings.pkl')

if not os.path.exists(checkpoint_file):
    for idx, (code, df) in enumerate(final_data.items(), 1):
        config = CONFIG[code]
        print(f"\n[{idx}/{len(final_data)}] {config['name']} - {config['mode'].upper()}")

        # One entry per language, English first:
        # (single-language mode, text column, segment-ID column,
        #  checkpoint prefix, column produced by the helper, final column name)
        language_jobs = (
            ('english_only', 'Text_English', 'Segment_ID_English', f'step4_{code}_en',
             'Segment_Embeddings_Text_English', 'Segment_Embeddings_English'),
            ('native_only', 'Text_Native', 'Segment_ID_Native', f'step4_{code}_native',
             'Segment_Embeddings_Text_Native', 'Segment_Embeddings_Native'),
        )

        for solo_mode, text_col, segment_col, prefix, raw_col, final_col in language_jobs:
            language_enabled = config['mode'] in ('bilingual', solo_mode)
            # Embed only when the language is enabled and at least one segment ID exists
            if language_enabled and df[segment_col].notna().any():
                df = add_segment_embeddings(df, text_col, segment_col, prefix)
                df = df.rename(columns={raw_col: final_col})
            else:
                df[final_col] = None

        final_data[code] = df

    with open(checkpoint_file, 'wb') as out:
        pickle.dump(final_data, out)

    # Step 3 output is now superseded by the Step 4 checkpoint
    step3_checkpoint = os.path.join(CHECKPOINT_DIR, 'step3_segmentation.pkl')
    if os.path.exists(step3_checkpoint):
        os.remove(step3_checkpoint)
else:
    print("üìÇ Loading from checkpoint...")
    with open(checkpoint_file, 'rb') as src:
        final_data = pickle.load(src)
    print(f"‚úÖ Loaded {len(final_data)} countries")

print(f"\n‚úÖ Segment embeddings complete")

## Final Verification

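Print a per-country summary of the processed data: speech and session counts, speech-embedding shapes, and the number of segments per language.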

In [None]:
# Per-country summary of what the pipeline produced.
print("üìä FINAL VERIFICATION")
print("=" * 60)

# Closing line printed for each recognised processing mode
mode_notes = {
    'BILINGUAL': f"  üìö Languages: English + Native",
    'ENGLISH_ONLY': f"  üìù Language: English only",
    'NATIVE_ONLY': f"  üìù Language: Native only",
}

for code, df in final_data.items():
    config = CONFIG[code]
    mode = config.get('mode', 'unknown').upper()
    print(f"\n{config['name']} ({code}) - {mode}:")
    print(f"  Speeches: {len(df):,}")
    print(f"  Sessions: {df['Text_ID'].nunique():,}")

    # Speech embeddings: report the shape of the first non-null vector per language
    if 'Speech_Embeddings_English' in df.columns:
        english_present = df['Speech_Embeddings_English'].notna()
        if english_present.any():
            sample_emb = df.loc[english_present, 'Speech_Embeddings_English'].iloc[0]
            print(f"  ‚úÖ English speech embeddings: {sample_emb.shape}")
    if 'Speech_Embeddings_Native' in df.columns:
        native_present = df['Speech_Embeddings_Native'].notna()
        if native_present.any():
            sample_emb = df.loc[native_present, 'Speech_Embeddings_Native'].iloc[0]
            print(f"  ‚úÖ Native speech embeddings: {sample_emb.shape}")

    # Segments: number of distinct segment IDs per language
    if 'Segment_ID_English' in df.columns:
        english_ids = df['Segment_ID_English']
        if english_ids.notna().any():
            print(f"  ‚úÖ English segments: {english_ids.nunique():,}")
    if 'Segment_ID_Native' in df.columns:
        native_ids = df['Segment_ID_Native']
        if native_ids.notna().any():
            print(f"  ‚úÖ Native segments: {native_ids.nunique():,}")

    # Mode-specific details (nothing is printed for an unrecognised mode)
    note = mode_notes.get(mode)
    if note is not None:
        print(note)

print(f"\n‚úÖ All processing complete!")
print(f"\nüíæ Data available in 'final_data' dictionary")
print(f"   Access via: final_data['AT'], final_data['HR'], final_data['GB']")

In [None]:
# Write one pickle per country to OUTPUT_DIR, then drop the Step 4 checkpoint.
print("üíæ SAVING FINAL PROCESSED DATA")
print("="*60)

# Ensure the destination exists before the first write
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Save each country's processed data (one file per country with all columns)
for code, df in final_data.items():
    config = CONFIG[code]
    output_path = os.path.join(OUTPUT_DIR, f"{code}_speeches_processed.pkl")

    # Save the complete dataframe
    df.to_pickle(output_path)

    # Segment IDs are stored per language (Segment_ID_English / Segment_ID_Native);
    # Step 3 does not add a bare 'Segment_ID' column to these frames, so count
    # whichever Segment_ID* columns are present instead of indexing a missing one.
    segment_cols = [c for c in df.columns if str(c).startswith('Segment_ID')]
    segment_summary = ', '.join(f"{c}={df[c].nunique():,}" for c in segment_cols) or 'none'

    print(f"\n‚úÖ Saved {config['name']} ({code}):")
    print(f"   üìÇ File: {output_path}")
    print(f"   üìä {len(df):,} speeches | segments: {segment_summary}")
    print(f"   üìã {len(df.columns)} columns:")

    # List key columns
    key_cols = [c for c in df.columns if 'Embedding' in c or 'Segment_ID' in c]
    for col in key_cols:
        print(f"      - {col}")

# Delete Step 4 checkpoint to save space (the per-country files now hold the data)
step4_checkpoint = os.path.join(CHECKPOINT_DIR, 'step4_segment_embeddings.pkl')
if os.path.exists(step4_checkpoint):
    os.remove(step4_checkpoint)
    print(f"\nüóëÔ∏è  Deleted Step 4 checkpoint (final data saved in {OUTPUT_DIR})")

print(f"\n{'='*60}")
print(f"‚úÖ ALL PROCESSING COMPLETE")
print(f"{'='*60}")
print(f"\nüìÅ Final outputs saved to: {OUTPUT_DIR}")
print(f"   - {len(final_data)} country files ({', '.join(final_data.keys())})")
print(f"\nüìã Ready for next steps:")
print(f"   ‚Ä¢ Topic modeling (topic_modelling.ipynb)")
print(f"   ‚Ä¢ Visualization (visualization.ipynb)")