# Full Voice Assistant Pipeline Integration

This notebook integrates all three services — automatic speech recognition (ASR), a large language model (LLM), and text-to-speech (TTS) — and measures end-to-end performance.

**Pipeline Flow:**
```
Audio Input → ASR → Text → LLM → Response → TTS → Audio Output
```

**Measurements:**
- Individual service timings
- Total pipeline latency
- GPU memory usage
- Real-time performance analysis

## 1. Setup and Imports

In [None]:
import torch
import numpy as np
import sounddevice as sd
import soundfile as sf
from transformers import (
    AutoModelForSpeechSeq2Seq, 
    AutoProcessor, 
    AutoModelForCausalLM, 
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline
)
from TTS.api import TTS
from pathlib import Path
from datetime import datetime
import time
import pandas as pd
from IPython.display import Audio, display
import warnings
warnings.filterwarnings('ignore')

# Banner plus a short report on the PyTorch / CUDA environment.
print(
    "=" * 70,
    "  VOICE ASSISTANT PIPELINE - FULL INTEGRATION TEST",
    "=" * 70,
    sep="\n",
)
print()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

# Device name and memory figures are only queried when CUDA is present.
if torch.cuda.is_available():
    print(
        f"GPU: {torch.cuda.get_device_name(0)}",
        f"Total GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB",
        f"Available Memory: {(torch.cuda.get_device_properties(0).total_memory - torch.cuda.memory_allocated()) / 1024**3:.2f} GB",
        sep="\n",
    )

## 2. Utility Functions

In [None]:
def get_gpu_memory():
    """Return PyTorch's CUDA memory counters, converted from bytes to GiB.

    Returns:
        Dict with ``allocated_gb``, ``reserved_gb`` and ``peak_gb`` (the
        maximum allocated so far). All three are 0 when CUDA is unavailable.
    """
    if not torch.cuda.is_available():
        return {"allocated_gb": 0, "reserved_gb": 0, "peak_gb": 0}

    bytes_per_gib = 1024**3
    return {
        "allocated_gb": torch.cuda.memory_allocated() / bytes_per_gib,
        "reserved_gb": torch.cuda.memory_reserved() / bytes_per_gib,
        "peak_gb": torch.cuda.max_memory_allocated() / bytes_per_gib,
    }

def print_stage_header(stage_name, stage_num, total_stages):
    """Print a ``[n/total] name`` title framed by two 70-character rules.

    Args:
        stage_name: Text shown after the progress counter.
        stage_num: Position of this stage.
        total_stages: Total number of stages.
    """
    rule = "=" * 70
    title = f"  [{stage_num}/{total_stages}] {stage_name}"
    # The leading empty item yields the blank line before the top rule.
    print("", rule, title, rule, sep="\n")

def record_audio(duration=5, sample_rate=16000):
    """Capture mono microphone audio and save it to a timestamped WAV file.

    Args:
        duration: Recording length in seconds.
        sample_rate: Sampling rate used for both the capture and the saved file.

    Returns:
        Name of the written file, ``pipeline_input_<YYYYmmdd_HHMMSS>.wav``.
    """
    print(f"\n🎤 Recording for {duration} seconds...")
    print("Speak now!\n")

    frame_count = int(duration * sample_rate)
    samples = sd.rec(frame_count, samplerate=sample_rate, channels=1, dtype='float32')
    # Wait for the capture to finish before the buffer is written out.
    sd.wait()

    filename = datetime.now().strftime("pipeline_input_%Y%m%d_%H%M%S.wav")
    sf.write(filename, samples, sample_rate)

    print(f"✓ Recording saved: {filename}\n")
    return filename

## 3. Load ASR Model (Whisper Small)

In [None]:
print_stage_header("Loading ASR Model", 1, 3)

# Half precision on GPU, full precision on CPU. Later cells reuse `device`.
if torch.cuda.is_available():
    device, torch_dtype = "cuda", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

print("Loading Whisper Small...")
start_time = time.time()

asr_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    "openai/whisper-small",
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
asr_model.to(device)

# The processor supplies both the tokenizer and the feature extractor.
asr_processor = AutoProcessor.from_pretrained("openai/whisper-small")

asr_pipe = pipeline(
    "automatic-speech-recognition",
    model=asr_model,
    tokenizer=asr_processor.tokenizer,
    feature_extractor=asr_processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)

load_time = time.time() - start_time
asr_memory = get_gpu_memory()

print(
    f"\n✓ ASR loaded in {load_time:.2f}s",
    f"GPU Memory: {asr_memory['allocated_gb']:.3f} GB",
    sep="\n",
)

## 4. Load LLM Model (Qwen2.5-7B-Instruct 4-bit)

In [None]:
print_stage_header("Loading LLM Model", 2, 3)

print("Loading Qwen2.5-7B-Instruct with 4-bit quantization...")
start_time = time.time()

# 4-bit NF4 weights with double quantization; compute dtype is float16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

# `trust_remote_code=True` is deliberately not passed: it allows Python code
# hosted in the Hub repository to run locally, and this checkpoint uses an
# architecture that ships with Transformers, so the flag only adds risk.
llm_model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-7B-Instruct",
    quantization_config=quantization_config,
    device_map="auto",
)

llm_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct")

# generate_response() pads its input, so make sure a pad token exists.
if llm_tokenizer.pad_token is None:
    llm_tokenizer.pad_token = llm_tokenizer.eos_token

load_time = time.time() - start_time
llm_memory = get_gpu_memory()

print(f"\n✓ LLM loaded in {load_time:.2f}s")
print(f"GPU Memory: {llm_memory['allocated_gb']:.3f} GB")

## 5. Load TTS Model (XTTS-v2)

In [None]:
print_stage_header("Loading TTS Model", 3, 3)

print("Loading XTTS-v2...")
start_time = time.time()

# Placed on the device chosen in the ASR cell.
tts_model = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)

load_time = time.time() - start_time
tts_memory = get_gpu_memory()

print(
    f"\n✓ TTS loaded in {load_time:.2f}s",
    f"Total GPU Memory: {tts_memory['allocated_gb']:.3f} GB",
    sep="\n",
)

# Closing banner: the allocation reported here was sampled after the last model load.
print(
    "",
    "=" * 70,
    "  ALL MODELS LOADED SUCCESSFULLY",
    "=" * 70,
    "",
    f"Total GPU Memory Usage: {tts_memory['allocated_gb']:.3f} GB",
    sep="\n",
)

## 6. Define Pipeline Functions

In [None]:
def transcribe_audio(audio_path, language="spanish"):
    """Step 1: ASR - Audio to Text.

    Runs the module-level ``asr_pipe`` on the given audio and times the call.

    Args:
        audio_path: Input handed directly to ``asr_pipe`` (the notebook
            passes a WAV file path).
        language: Language forwarded to generation through ``generate_kwargs``.

    Returns:
        Dict with ``text`` (the transcription), ``time`` (wall-clock seconds
        spent in the pipeline call) and ``memory`` (the ``peak_gb`` value
        reported by ``get_gpu_memory`` after the call).
    """
    # Resetting peak statistics requires a CUDA device. Skip it on CPU-only
    # hosts so the "cpu" fallback keeps working; get_gpu_memory() then
    # reports zeros.
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()
    start_time = time.time()

    result = asr_pipe(
        audio_path,
        generate_kwargs={"language": language, "task": "transcribe"}
    )

    inference_time = time.time() - start_time
    memory = get_gpu_memory()

    return {
        "text": result["text"],
        "time": inference_time,
        "memory": memory["peak_gb"]
    }


def generate_response(text, system_prompt=None, max_tokens=200):
    """Step 2: LLM - Text to Response.

    Builds a system + user chat prompt, samples a completion from the
    module-level ``llm_model`` and times the ``generate`` call.

    Args:
        text: User message (the ASR transcription in this notebook).
        system_prompt: System message; when ``None`` a Spanish voice-assistant
            prompt is used.
        max_tokens: Upper bound on newly generated tokens.

    Returns:
        Dict with ``text`` (decoded reply, stripped), ``time`` (seconds spent
        in ``generate``), ``tokens`` (number of generated token ids),
        ``tokens_per_sec`` (0 when the measured time is 0) and ``memory``
        (the ``peak_gb`` value from ``get_gpu_memory``).
    """
    if system_prompt is None:
        system_prompt = "Eres un asistente de voz útil y amigable. Responde de manera concisa, clara y en español."

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text}
    ]

    # Render the chat as a string first; it is tokenized in the next step.
    chat_text = llm_tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    inputs = llm_tokenizer(chat_text, return_tensors="pt", padding=True, truncation=True)
    inputs = inputs.to(llm_model.device)

    # Resetting peak statistics requires a CUDA device; skip it on CPU-only
    # hosts, where get_gpu_memory() reports zeros.
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()
    start_time = time.time()

    with torch.no_grad():
        outputs = llm_model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=llm_tokenizer.pad_token_id,
            eos_token_id=llm_tokenizer.eos_token_id,
        )

    inference_time = time.time() - start_time

    # The output sequence starts with the prompt; keep only what was generated.
    generated_ids = outputs[0][inputs.input_ids.shape[1]:]
    response = llm_tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

    memory = get_gpu_memory()
    num_tokens = len(generated_ids)

    return {
        "text": response,
        "time": inference_time,
        "tokens": num_tokens,
        "tokens_per_sec": num_tokens / inference_time if inference_time > 0 else 0,
        "memory": memory["peak_gb"]
    }


def synthesize_speech(text, output_path, speaker_wav=None, language="es"):
    """Step 3: TTS - Text to Audio.

    Writes synthesized speech for ``text`` to ``output_path`` using the
    module-level ``tts_model`` and times the call.

    Args:
        text: Text to speak.
        output_path: Destination audio file path.
        speaker_wav: Speaker reference forwarded to ``tts_to_file`` (optional).
        language: Language code forwarded to ``tts_to_file``.

    Returns:
        Dict with ``output_path``, ``time`` (synthesis seconds),
        ``audio_duration`` (seconds of audio written), ``rtf`` (synthesis time
        divided by audio duration; 0 when the duration is 0) and ``memory``
        (the ``peak_gb`` value from ``get_gpu_memory``).
    """
    # Resetting peak statistics requires a CUDA device; skip it on CPU-only
    # hosts, where get_gpu_memory() reports zeros.
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()
    start_time = time.time()

    tts_model.tts_to_file(
        text=text,
        file_path=output_path,
        speaker_wav=speaker_wav,
        language=language
    )

    inference_time = time.time() - start_time

    # Read the file back to measure how much audio was produced.
    audio_data, sample_rate = sf.read(output_path)
    audio_duration = len(audio_data) / sample_rate

    memory = get_gpu_memory()

    return {
        "output_path": output_path,
        "time": inference_time,
        "audio_duration": audio_duration,
        "rtf": inference_time / audio_duration if audio_duration > 0 else 0,
        "memory": memory["peak_gb"]
    }


# Confirmation printed once the cell defining the three stage functions has run.
print("✓ Pipeline functions defined")

## 7. Full Pipeline Test Function

In [None]:
def run_full_pipeline(audio_input, speaker_reference=None, test_name="Test"):
    """
    Push one recording through ASR → LLM → TTS and report how it performed.

    Each stage's result is printed as it completes, followed by a timing and
    peak-memory summary for the whole run.

    Args:
        audio_input: Path to input audio file
        speaker_reference: Path to speaker reference for TTS (optional)
        test_name: Label shown in the banner and stored with the results

    Returns:
        Dictionary with the transcription, the generated response, the output
        audio path, per-stage timing/memory figures, and the total,
        processing and overhead times.
    """
    rule = "=" * 70
    print("\n" + rule)
    print(f"  RUNNING FULL PIPELINE: {test_name}")
    print(rule)

    started = time.time()

    # Stage 1: speech -> text
    print("\n[1/3] 🎤 ASR: Transcribing audio...")
    asr = transcribe_audio(audio_input)
    print(f"      Text: {asr['text']}")
    print(f"      Time: {asr['time']:.2f}s")

    # Stage 2: text -> reply
    print("\n[2/3] 🤖 LLM: Generating response...")
    llm = generate_response(asr["text"])
    print(f"      Response: {llm['text']}")
    print(f"      Time: {llm['time']:.2f}s ({llm['tokens_per_sec']:.1f} tokens/s)")

    # Stage 3: reply -> speech, written to a timestamped file
    print("\n[3/3] 🔊 TTS: Synthesizing speech...")
    output_path = datetime.now().strftime("pipeline_output_%Y%m%d_%H%M%S.wav")
    tts = synthesize_speech(llm["text"], output_path, speaker_reference)
    print(f"      Output: {output_path}")
    print(f"      Time: {tts['time']:.2f}s (RTF: {tts['rtf']:.2f}x)")

    # Overhead is whatever wall-clock time the three stage timers did not cover.
    total_time = time.time() - started
    processing_time = asr["time"] + llm["time"] + tts["time"]

    results = {
        "test_name": test_name,
        "transcription": asr["text"],
        "asr_time": asr["time"],
        "asr_memory": asr["memory"],
        "response": llm["text"],
        "llm_time": llm["time"],
        "llm_tokens": llm["tokens"],
        "llm_tokens_per_sec": llm["tokens_per_sec"],
        "llm_memory": llm["memory"],
        "output_audio": output_path,
        "tts_time": tts["time"],
        "tts_audio_duration": tts["audio_duration"],
        "tts_rtf": tts["rtf"],
        "tts_memory": tts["memory"],
        "total_time": total_time,
        "processing_time": processing_time,
        "overhead_time": total_time - processing_time,
    }

    print("\n" + rule)
    print("  PIPELINE SUMMARY")
    print(rule)

    print(f"\n⏱️  Timings:")
    timing_rows = (
        ("ASR:", "asr_time"),
        ("LLM:", "llm_time"),
        ("TTS:", "tts_time"),
        ("Overhead:", "overhead_time"),
    )
    for label, key in timing_rows:
        seconds = results[key]
        # Labels are left-padded to 13 columns so the numbers line up.
        print(f"   {label:<13}{seconds:6.2f}s ({seconds / total_time * 100:5.1f}%)")
    print(f"   {'─'*30}")
    print(f"   TOTAL:       {total_time:6.2f}s")

    print(f"\n💾 Peak Memory:")
    peaks = {
        "ASR:": results["asr_memory"],
        "LLM:": results["llm_memory"],
        "TTS:": results["tts_memory"],
    }
    for label, peak in peaks.items():
        print(f"   {label:<13}{peak:.3f} GB")
    print(f"   Max:         {max(peaks.values()):.3f} GB")

    return results

## 8. Test Pipeline with Audio Input

You can either:
1. Record audio now
2. Use an existing audio file

In [None]:
# Input selection -- choose ONE of the two options below.
#
# Option 1: capture a fresh clip from the microphone (uncomment to use)
# test_audio = record_audio(duration=5)

# Option 2: point at an audio file that already exists
test_audio = "test_input.wav"  # Replace with your audio file

# Optional speaker reference for TTS
speaker_reference = None  # Replace with your speaker reference if available
# speaker_reference = "my_voice_reference.wav"

print(f"\nUsing audio file: {test_audio}")
if not speaker_reference:
    print("⚠️  No speaker reference - TTS may require one. Add path to speaker_reference variable.")
else:
    print(f"Using speaker reference: {speaker_reference}")

## 9. Run Single Pipeline Test

In [None]:
# One end-to-end run on the selected file, skipped when the file is missing.
if not Path(test_audio).exists():
    print(f"\n⚠️  Audio file not found: {test_audio}")
    print("Please record audio or provide a valid file path.")
else:
    result = run_full_pipeline(
        audio_input=test_audio,
        speaker_reference=speaker_reference,
        test_name="Single Test",
    )

    # Show an inline player for the synthesized reply (no autoplay).
    print("", "=" * 70, "🔊 Playing output audio...", "=" * 70, sep="\n")
    display(Audio(result["output_audio"], autoplay=False))

## 10. Multiple Pipeline Tests

In [None]:
# Run multiple tests to get average performance
num_tests = 3
all_results = []

print(f"\n{'='*70}")
print(f"  RUNNING {num_tests} PIPELINE TESTS")
print(f"{'='*70}")

for i in range(num_tests):
    print(f"\n\n{'#'*70}")
    print(f"#  TEST {i+1}/{num_tests}")
    print(f"{'#'*70}")
    
    # Record audio for each test
    test_audio = record_audio(duration=5)
    
    # Run pipeline
    result = run_full_pipeline(
        audio_input=test_audio,
        speaker_reference=speaker_reference,
        test_name=f"Test {i+1}"
    )
    
    all_results.append(result)
    
    # Brief pause between tests
    if i < num_tests - 1:
        print("\n⏸️  Pausing 3 seconds before next test...")
        time.sleep(3)

print("\n\n" + "="*70)
print("  ALL TESTS COMPLETE")
print("="*70)

## 11. Performance Analysis

In [None]:
# Aggregate the per-run result dicts collected in `all_results` and print
# summary statistics. `df` is left in the namespace for the cell that follows.
if all_results:
    # One row per pipeline run; columns are the keys of each result dict.
    df = pd.DataFrame(all_results)
    
    print("\n" + "="*70)
    print("  PERFORMANCE ANALYSIS")
    print("="*70)
    
    # Mean and standard deviation of each timing column across runs.
    print("\n📊 Average Timings:")
    print(f"   ASR:         {df['asr_time'].mean():6.2f}s  (±{df['asr_time'].std():.2f}s)")
    print(f"   LLM:         {df['llm_time'].mean():6.2f}s  (±{df['llm_time'].std():.2f}s)")
    print(f"   TTS:         {df['tts_time'].mean():6.2f}s  (±{df['tts_time'].std():.2f}s)")
    print(f"   Overhead:    {df['overhead_time'].mean():6.2f}s  (±{df['overhead_time'].std():.2f}s)")
    print(f"   {'─'*30}")
    print(f"   TOTAL:       {df['total_time'].mean():6.2f}s  (±{df['total_time'].std():.2f}s)")
    
    # Per-stage values are means over runs; "Max Peak" is the largest single
    # value across all three memory columns and all runs.
    print("\n💾 Memory Usage:")
    print(f"   ASR Peak:    {df['asr_memory'].mean():.3f} GB")
    print(f"   LLM Peak:    {df['llm_memory'].mean():.3f} GB")
    print(f"   TTS Peak:    {df['tts_memory'].mean():.3f} GB")
    print(f"   Max Peak:    {df[['asr_memory', 'llm_memory', 'tts_memory']].max().max():.3f} GB")
    
    print("\n🚀 Performance Metrics:")
    print(f"   LLM Speed:   {df['llm_tokens_per_sec'].mean():.1f} tokens/s")
    print(f"   TTS RTF:     {df['tts_rtf'].mean():.2f}x (lower is better)")
    
    # Shares are relative to the summed stage times only; overhead is excluded.
    print("\n📈 Time Distribution:")
    total_proc_time = df[['asr_time', 'llm_time', 'tts_time']].sum(axis=1).mean()
    print(f"   ASR:         {(df['asr_time'].mean() / total_proc_time * 100):.1f}%")
    print(f"   LLM:         {(df['llm_time'].mean() / total_proc_time * 100):.1f}%")
    print(f"   TTS:         {(df['tts_time'].mean() / total_proc_time * 100):.1f}%")
    
    # Detailed results table
    print("\n" + "="*70)
    print("Detailed Results:")
    print("="*70)
    # Work on a copy so renaming the columns for display leaves `df` unchanged.
    display_df = df[['test_name', 'asr_time', 'llm_time', 'tts_time', 'total_time']].copy()
    display_df.columns = ['Test', 'ASR (s)', 'LLM (s)', 'TTS (s)', 'Total (s)']
    print(display_df.to_string(index=False))
else:
    print("\n⚠️  No results to analyze. Run tests first.")

## 12. Conclusions and Recommendations

In [None]:
# Turn the collected results into a verdict on latency, memory and the
# slowest stage.
if all_results:
    # Rebuild the frame here so this cell works on its own.
    df = pd.DataFrame(all_results)
    avg_total = df['total_time'].mean()
    max_memory = df[['asr_memory', 'llm_memory', 'tts_memory']].max().max()

    # Pass/fail flags using the same limits as the "GOOD" branches below.
    latency_ok = avg_total < 15
    memory_ok = max_memory < 12

    print("\n" + "="*70)
    print("  CONCLUSIONS & RECOMMENDATIONS")
    print("="*70)

    print("\n✅ Configuration Tested:")
    print("   - ASR: Whisper Small (244M params)")
    print("   - LLM: Qwen2.5-7B-Instruct (4-bit quantization)")
    print("   - TTS: XTTS-v2")

    print(f"\n📊 Results:")
    print(f"   - Average Response Time: {avg_total:.1f}s")
    print(f"   - Peak GPU Memory: {max_memory:.1f} GB")

    print("\n💡 Real-time Performance:")
    if avg_total < 10:
        print("   ✅ EXCELLENT - Suitable for interactive voice assistant")
    elif avg_total < 15:
        print("   ✅ GOOD - Acceptable for voice assistant use")
    else:
        print("   ⚠️  SLOW - May need optimization for real-time use")

    print("\n💾 Memory Efficiency:")
    if max_memory < 10:
        print("   ✅ EXCELLENT - Fits comfortably within 12GB VRAM")
    elif max_memory < 12:
        print("   ✅ GOOD - Within 12GB VRAM limit")
    else:
        print("   ⚠️  OVER LIMIT - Exceeds 12GB VRAM constraint")

    print("\n🎯 Bottleneck Analysis:")
    # The stage with the highest mean time across runs.
    slowest_stage = df[['asr_time', 'llm_time', 'tts_time']].mean().idxmax()
    stage_names = {'asr_time': 'ASR', 'llm_time': 'LLM', 'tts_time': 'TTS'}
    print(f"   Slowest Stage: {stage_names[slowest_stage]}")

    if slowest_stage == 'llm_time':
        print("   Recommendation: LLM is the bottleneck")
        print("   - Reduce max_new_tokens for faster responses")
        print("   - Consider using smaller model or better quantization")
    elif slowest_stage == 'asr_time':
        print("   Recommendation: ASR is the bottleneck")
        print("   - Consider using Distil-Whisper for faster transcription")
    else:
        print("   Recommendation: TTS is the bottleneck")
        print("   - Consider caching common responses")
        print("   - Optimize audio chunk processing")

    # The closing verdict must agree with the findings printed above, so it
    # is only positive when both the latency and memory checks passed.
    print("\n" + "="*70)
    if latency_ok and memory_ok:
        print("✅ This configuration is suitable for a voice assistant!")
    else:
        print("⚠️  This configuration needs optimization before voice assistant use.")
    print("="*70)
else:
    print("\n⚠️  No results to analyze. Run tests first.")

## 13. Cleanup

In [None]:
# Free memory
import gc

# Drop the model references by name. pop() tolerates names that are already
# gone, so the cell can be re-run (or run after a skipped load cell) without
# raising NameError.
for _model_name in (
    "asr_model", "asr_processor", "asr_pipe",
    "llm_model", "llm_tokenizer",
    "tts_model",
):
    globals().pop(_model_name, None)

# Collect unreachable objects first: empty_cache() can only release blocks
# that no live tensor still occupies.
gc.collect()

if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print(f"\n✓ Cleanup complete")
    print(f"GPU Memory after cleanup: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")

## Summary

This notebook tested the complete voice assistant pipeline:

**Pipeline:** Audio → ASR → LLM → TTS → Audio

**Key Metrics:**
- End-to-end latency
- Individual service timings
- GPU memory usage
- Performance bottlenecks

**Configuration:**
- ASR: Whisper Small (~2GB VRAM)
- LLM: Qwen2.5-7B 4-bit (~5GB VRAM)
- TTS: XTTS-v2 (~2GB VRAM)
- **Total: ~9-10GB VRAM** ✅

This setup provides a good balance of quality, speed, and memory efficiency for a voice assistant application!