# Multi-Audio Speaker Identification & Analysis

This notebook processes multiple audio files to:
1. **Transcribe and diarize** speakers in each audio file
2. **Identify speakers** using AI-powered analysis
3. **Compare speaker patterns** across different audio files
4. **Extract topics** and speaking characteristics per speaker

---

In [1]:
# ============================================================
# IMPORTS AND SETUP
# ============================================================
import whisperx
import gc
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import os
from dotenv import load_dotenv
load_dotenv()

# Audio processing
import librosa
import numpy as np
import torch

# AI and NLP
from openai import OpenAI
from keybert import KeyBERT
from sklearn.feature_extraction.text import CountVectorizer

# Visualization
from wordcloud import WordCloud
import warnings
warnings.filterwarnings('ignore')

print("All libraries imported successfully!")

  from .autonotebook import tqdm as notebook_tqdm


All libraries imported successfully!


In [2]:
# ============================================================
# CONFIGURATION AND HELPER FUNCTIONS
# ============================================================

# Audio files to process
DATA_DIR = Path("../data")
AUDIO_FILES = {
    "US_Debate": DATA_DIR / "US_DebateAudio.wav",
    "Irish_Presidential": DATA_DIR / "IrishPresidentalElection.wav", 
    "Sample_Audio": DATA_DIR / "sampleaudio.wav"
}

# Device setup
device = "cuda" if torch.cuda.is_available() else "cpu"
batch_size = 16
compute_type = "float16" if device == "cuda" else "int8"

print(f"Using device: {device}")
print(f"Audio files found:")
for name, path in AUDIO_FILES.items():
    exists = "YES" if path.exists() else "NO"
    print(f"  {exists} {name}: {path.name}")

Using device: cpu
Audio files found:
  YES US_Debate: US_DebateAudio.wav
  YES Irish_Presidential: IrishPresidentalElection.wav
  YES Sample_Audio: sampleaudio.wav


In [3]:
# ============================================================
# CONTEXT DETECTION AND HELPER FUNCTIONS
# ============================================================

def detect_context(full_text):
    """Detect the type of conversation from transcript text."""
    text = full_text.lower()
    
    # Political debates/discussions
    if any(w in text for w in ["president", "administration", "policy", "senator", "congress", "election"]):
        if "irish" in text or "ireland" in text:
            return "Irish political debate/election"
        return "US political debate"
    
    # Economic discussions
    if any(w in text for w in ["budget", "inflation", "economy", "financial", "market"]):
        return "Economic/financial discussion"
    
    # Interviews/podcasts
    if any(w in text for w in ["host", "welcome", "interview", "guest", "today we have"]):
        return "Interview/podcast"
    
    # Educational/academic
    if any(w in text for w in ["research", "study", "analysis", "data", "methodology"]):
        return "Academic/educational discussion"
    
    return "General conversation"

def chunk_text(text, max_len=1200):
    """Limit text length for AI processing efficiency."""
    return text[:max_len]

def clean_text(text):
    """Clean transcript text by removing filler words and normalizing."""
    import re
    text = text.lower()
    
    # Remove common filler words
    filler_words = ["um", "uh", "like", "you know", "yeah", "ok", "right", "well"]
    for fw in filler_words:
        text = re.sub(rf'\b{re.escape(fw)}\b', '', text)
    
    # Remove extra spaces and normalize
    text = re.sub(r'\s+', ' ', text).strip()
    return text

print("Helper functions defined successfully!")

Helper functions defined successfully!


In [5]:
# ============================================================
# WHISPERX SETUP AND AUDIO PROCESSING
# ============================================================

# Initialize WhisperX models
print("Loading WhisperX models...")

# Load ASR model
model = whisperx.load_model("large-v2", device, compute_type=compute_type)
print("ASR model loaded")

# Load alignment model
model_a, metadata = whisperx.load_align_model(language_code="en", device=device)
print("Alignment model loaded")

# Load diarization model (you'll need to set your HuggingFace token)
# Get your token from: https://huggingface.co/settings/tokens
HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN") or "YOUR_HF_TOKEN_HERE"
try:
    diarize_model = whisperx.DiarizationPipeline(use_auth_token=HF_TOKEN, device=device)
    print("Diarization model loaded")
except AttributeError:
    # For newer versions of whisperx, use this approach
    from pyannote.audio import Pipeline
    diarize_model = Pipeline.from_pretrained(
        "pyannote/speaker-diarization-3.1",
        use_auth_token=HF_TOKEN
    )
    if device == "cuda":
        diarize_model = diarize_model.to(torch.device("cuda"))
    print("Diarization model loaded (pyannote)")

print("All WhisperX models ready!")

Loading WhisperX models...
2025-11-24 20:01:44 - whisperx.asr - INFO - No language specified, language will be detected for each audio file (increases inference time)
2025-11-24 20:01:44 - whisperx.vads.pyannote - INFO - Performing voice activity detection using Pyannote...
2025-11-24 20:01:44 - whisperx.asr - INFO - No language specified, language will be detected for each audio file (increases inference time)
2025-11-24 20:01:44 - whisperx.vads.pyannote - INFO - Performing voice activity detection using Pyannote...


Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.5.5. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint c:\Users\norak\SpeakSense\venv\Lib\site-packages\whisperx\assets\pytorch_model.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.4.0. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.8.0+cpu. Bad things might happen unless you revert torch to 1.x.
ASR model loaded
ASR model loaded
Alignment model loaded
Alignment model loaded
Diarization model loaded (pyannote)
All WhisperX models ready!
Diarization model loaded (pyannote)
All WhisperX models ready!


In [6]:
# ============================================================
# AUDIO PROCESSING FUNCTION
# ============================================================

def process_audio_file(audio_path, file_label):
    """
    Complete audio processing pipeline:
    1. Load and transcribe audio
    2. Perform speaker diarization
    3. Assign speakers to segments
    4. Return processed results
    """
    print(f"\n{'='*60}")
    print(f"PROCESSING: {file_label}")
    print(f"File: {audio_path}")
    print(f"{'='*60}")
    
    try:
        # Step 1: Load audio
        print("Loading audio...")
        audio = whisperx.load_audio(str(audio_path))
        
        # Step 2: Transcribe
        print("Transcribing...")
        result = model.transcribe(audio, batch_size=batch_size)
        
        # Step 3: Align whisper output
        print("Aligning transcript...")
        result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)
        
        # Step 4: Diarize speakers
        print("Performing speaker diarization...")
        try:
            # Try whisperx approach first
            diarize_segments = diarize_model(audio)
        except:
            # Use pyannote approach
            import torchaudio
            waveform, sample_rate = torchaudio.load(str(audio_path))
            diarize_segments = diarize_model({"waveform": waveform, "sample_rate": sample_rate})
        
        # Step 5: Assign word speakers
        print("Assigning speakers to segments...")
        result = whisperx.assign_word_speakers(diarize_segments, result)
        
        # Step 6: Create summary stats
        speakers = sorted(list(set(seg.get('speaker', 'UNKNOWN') for seg in result['segments'] if seg.get('speaker'))))
        total_duration = max(seg.get('end', 0) for seg in result['segments'])
        
        print(f"Processing complete!")
        print(f"  - Duration: {total_duration/60:.1f} minutes")
        print(f"  - Speakers detected: {len(speakers)}")
        print(f"  - Segments: {len(result['segments'])}")
        
        return {
            'label': file_label,
            'result': result,
            'speakers': speakers,
            'duration': total_duration,
            'audio_path': audio_path
        }
        
    except Exception as e:
        print(f"Error processing {file_label}: {str(e)}")
        return None

print("Audio processing function ready!")

Audio processing function ready!


In [7]:
# ============================================================
# AI-POWERED SPEAKER IDENTIFICATION
# ============================================================

# Setup OpenAI client
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    print("OPENAI_API_KEY not found. Speaker identification will be limited.")
    client = None
else:
    client = OpenAI(api_key=api_key)
    print("OpenAI client initialized")

def suggest_speaker_identity(speaker_text, global_context="auto", audio_label=""):
    """
    Advanced speaker identification using complete transcript analysis.
    Analyzes full speaking patterns, vocabulary, and context clues.
    """
    if not client:
        return "Unknown Speaker"
    
    # Auto-detect context
    if global_context == "auto":
        global_context = detect_context(speaker_text)
    
    # Prepare analysis prompt
    prompt = f"""
You are an expert in political speech analysis and speaker identification.

CONTEXT: {global_context}
AUDIO SOURCE: {audio_label}

TASK: Analyze this speaker's complete transcript and identify who they most likely are.

FULL SPEAKER TRANSCRIPT:
{speaker_text[:2000]}

ANALYSIS INSTRUCTIONS:
1. Look for distinctive speech patterns, phrases, and policy positions
2. Identify characteristic vocabulary and speaking style
3. Note any specific names, titles, or references mentioned
4. Consider the political context and likely participants
5. If this appears to be a well-known political figure, provide their name
6. If uncertain, provide a descriptive role (Moderator, Candidate A, Journalist, etc.)

Respond with ONLY the most likely identity or role:
"""

    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            max_tokens=50,
            temperature=0.1,
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        print(f"Error in speaker identification: {e}")
        return "Unknown Speaker"

def analyze_speakers_in_audio(processed_audio):
    """
    Analyze and identify all speakers in a processed audio file.
    """
    if not processed_audio:
        return None
    
    result = processed_audio['result']
    speakers = processed_audio['speakers']
    label = processed_audio['label']
    
    print(f"\n{'='*60}")
    print(f"SPEAKER IDENTIFICATION: {label}")
    print(f"{'='*60}")
    
    speaker_analysis = {}
    
    for speaker in speakers:
        print(f"\nAnalyzing {speaker}...")
        
        # Gather all text for this speaker
        speaker_segments = [seg for seg in result['segments'] if seg.get('speaker') == speaker]
        speaker_text = " ".join([seg.get('text', '').strip() for seg in speaker_segments])
        
        # Get speaker statistics
        word_count = len(speaker_text.split())
        segment_count = len(speaker_segments)
        
        # Get AI identification
        identified_name = suggest_speaker_identity(
            speaker_text, 
            global_context="auto",
            audio_label=label
        )
        
        speaker_analysis[speaker] = {
            'identified_as': identified_name,
            'text': speaker_text,
            'word_count': word_count,
            'segment_count': segment_count,
            'clean_text': clean_text(speaker_text)
        }
        
        print(f"  {speaker} â†’ {identified_name}")
        print(f"  Word count: {word_count:,}")
        print(f"  Segments: {segment_count}")
    
    return speaker_analysis

print("Speaker identification functions ready!")

OpenAI client initialized
Speaker identification functions ready!


In [None]:
# ============================================================
# PROCESS ALL AUDIO FILES
# ============================================================

# Store results for all audio files
all_results = {}
all_speaker_analyses = {}

print("Starting multi-audio processing...")
print(f"Found {len(AUDIO_FILES)} audio files to process")

for audio_name, audio_path in AUDIO_FILES.items():
    if audio_path.exists():
        # Process audio file
        processed = process_audio_file(audio_path, audio_name)
        
        if processed:
            all_results[audio_name] = processed
            
            # Analyze speakers in this audio
            speaker_analysis = analyze_speakers_in_audio(processed)
            if speaker_analysis:
                all_speaker_analyses[audio_name] = speaker_analysis
        
        # Clean up GPU memory
        if device == "cuda":
            torch.cuda.empty_cache()
            gc.collect()
    else:
        print(f"Audio file not found: {audio_path}")

print(f"\n{'='*70}")
print("MULTI-AUDIO PROCESSING COMPLETE")
print(f"{'='*70}")
print(f"Successfully processed: {len(all_results)} audio files")
for name in all_results.keys():
    print(f" {name}")

Starting multi-audio processing...
Found 3 audio files to process

PROCESSING: US_Debate
File: ..\data\US_DebateAudio.wav
Loading audio...
Transcribing...
Transcribing...
2025-11-24 20:02:57 - whisperx.asr - INFO - Detected language: en (0.99) in first 30s of audio
2025-11-24 20:02:57 - whisperx.asr - INFO - Detected language: en (0.99) in first 30s of audio
Aligning transcript...
Aligning transcript...
Performing speaker diarization...
Performing speaker diarization...
Assigning speakers to segments...
Error processing US_Debate: 'e'

PROCESSING: Irish_Presidential
File: ..\data\IrishPresidentalElection.wav
Loading audio...
Assigning speakers to segments...
Error processing US_Debate: 'e'

PROCESSING: Irish_Presidential
File: ..\data\IrishPresidentalElection.wav
Loading audio...
Transcribing...
Transcribing...
2025-11-24 20:32:31 - whisperx.asr - INFO - Detected language: en (0.99) in first 30s of audio
2025-11-24 20:32:31 - whisperx.asr - INFO - Detected language: en (0.99) in first 