# Speech Analysis Pipeline Demo

This notebook demonstrates the complete speech analysis pipeline for detecting potential cognitive issues through speech patterns.

In [None]:
import sys
sys.path.append('../src')

from audio_processor import AudioProcessor
from transcriber import Transcriber
from analyzer import SpeechAnalyzer

import glob
import os
import pandas as pd
import matplotlib.pyplot as plt

## 1. Load Audio Files

In [None]:
# Collect every .wav recording under the data directory.
# glob returns matches in arbitrary, filesystem-dependent order, so the paths
# are sorted to make the processing order reproducible across runs and machines.
audio_files = sorted(glob.glob('../data/audio/*.wav'))
print(f"Found {len(audio_files)} audio files")

## 2. Process Audio Files

In [None]:
# Build the three pipeline stages once; they are reused for every recording.
audio_processor = AudioProcessor()
transcriber = Transcriber()
analyzer = SpeechAnalyzer()

# One feature record (a dict) per audio file, in the order the files are visited.
features_list = []

for audio_file in audio_files:
    file_name = os.path.basename(audio_file)
    print(f"Processing {file_name}...")

    # Acoustic side: load the recording, then derive pauses and pitch statistics.
    signal, sample_rate = audio_processor.load_and_preprocess(audio_file)
    pauses = audio_processor.extract_pauses(signal, sample_rate)
    pitch_stats = audio_processor.compute_pitch_stats(signal, sample_rate)

    # Text side: transcribe the same file and analyse the transcript.
    transcript = transcriber.transcribe(audio_file)
    transcript_analysis = transcriber.analyze_transcript(transcript)

    # Length of the loaded signal divided by its rate.
    # NOTE(review): presumably samples / (samples per second) = seconds — confirm
    # against AudioProcessor.load_and_preprocess.
    duration = len(signal) / sample_rate
    speech_rate = audio_processor.compute_speech_rate(
        transcript_analysis['word_count'], duration
    )

    # Gather everything the later analysis and report cells read for this file.
    features_list.append({
        'file': file_name,
        'pause_count': len(pauses),
        'hesitation_count': transcript_analysis['hesitation_count'],
        'speech_rate': speech_rate,
        'pitch_stats': pitch_stats,
        'transcript': transcript,
        'incomplete_sentences': transcript_analysis['incomplete_sentences'],
    })

## 3. Analyze Results

In [None]:
# Convert the per-file feature records into the matrix the analyzer consumes.
feature_matrix = analyzer.prepare_features(features_list)

# Fit the anomaly detector and obtain its labels; the report cell pairs these
# labels with features_list entry by entry.
labels = analyzer.fit_predict(feature_matrix)

# visualize_results returns two figures.
# NOTE(review): presumably a PCA and a t-SNE projection, judging by the names.
figures = analyzer.visualize_results(feature_matrix, labels)
pca_fig, tsne_fig = figures

# Render the figures.
plt.show()

## 4. Generate Report

In [None]:
def _report_row(feats, label):
    """Flatten one feature record and its anomaly label into a report row."""
    row = {
        'File': feats['file'],
        'Pauses': feats['pause_count'],
        'Hesitations': feats['hesitation_count'],
        'Speech Rate': feats['speech_rate'],
        'Pitch Std': feats['pitch_stats']['pitch_std'],
        'Incomplete Sentences': feats['incomplete_sentences'],
    }
    # A label of -1 is reported as "At Risk"; every other label as "Normal".
    row['Risk Label'] = 'At Risk' if label == -1 else 'Normal'
    return row


# One table row per recording, pairing each feature record with its label.
results_df = pd.DataFrame(
    [_report_row(feats, label) for feats, label in zip(features_list, labels)]
)

# Bare expression so the notebook renders the table as the cell output.
results_df