In [None]:
# MARKDOWN
# Voice-Based Cognitive Decline Pattern Detection

This notebook demonstrates a comprehensive pipeline for analyzing speech patterns to detect potential cognitive decline indicators. It processes audio samples, extracts linguistic and acoustic features, and applies unsupervised machine learning to identify anomalies.

## Project Overview

**Objective:** Build a proof-of-concept pipeline that uses raw voice data to detect cognitive stress or decline indicators using NLP and audio feature extraction.

**Features Analyzed:**
- Pauses per sentence
- Hesitation markers (uh, um, etc.)
- Word recall issues (compared to baseline text)
- Speech rate and pitch variability
- Naming task performance
- Sentence completion quality

**ML Approach:** Unsupervised clustering and anomaly detection to identify abnormal speech patterns.

# CODE
# Import necessary libraries
import sys
import os
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Audio, display

# Add the src directory to the path
sys.path.append('../src')

# Import our custom modules
from audio_processor import AudioProcessor
from transcriber import Transcriber
from analyzer import SpeechAnalyzer
from word_recall import WordRecallDetector
from sentence_completion import SentenceCompletionDetector

# Set up visualization style.
# matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*';
# older releases only know the sheet as 'seaborn-whitegrid'. Use whichever name
# this installation provides so the setup cell runs on both.
for _style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
else:
    # Neither bundled sheet is available; fall back to seaborn's own whitegrid theme.
    sns.set_style('whitegrid')
sns.set_palette("viridis")

# MARKDOWN
## 1. Environment Setup

First, we need to set up our environment and initialize the components of our pipeline. This includes:
- Audio processor for feature extraction
- Transcriber for speech-to-text and linguistic analysis
- Speech analyzer for ML-based pattern detection

# CODE
# Credentials: the Groq API key is taken from the process environment.
import os
# To set the key from inside the notebook instead, uncomment the next line:
# os.environ["GROQ_API_KEY"] = "your_api_key_here"

groq_api_key = os.getenv("GROQ_API_KEY")
if groq_api_key is None or groq_api_key == "":
    # Fail early: the transcriber below is constructed with this key.
    raise ValueError("GROQ_API_KEY environment variable is not set. Please set it before running the notebook.")

# Build the pipeline components used by every later cell.
audio_processor = AudioProcessor()
transcriber = Transcriber(groq_api_key)
analyzer = SpeechAnalyzer()

# Register the default target words for the naming task.
transcriber.set_naming_targets(
    [
        "apple",
        "banana",
        "car",
        "dog",
        "elephant",
        "flower",
        "guitar",
        "house",
        "ice cream",
        "jacket",
    ]
)

print("Environment setup complete!")

# MARKDOWN
## 2. Load Audio Files

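We'll load audio files (`.wav` and `.mp3`) from the `../data/audio` directory, relative to this notebook. At least 5 samples are recommended for a meaningful analysis; with more than 10 samples everything still works, but processing takes longer.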

# CODE
# Get list of audio files (both WAV and MP3).
# glob returns paths in an order that depends on the filesystem; sorting makes
# the sample order - and every table and plot built from it - reproducible.
audio_files = sorted(
    glob.glob('../data/audio/*.wav') + glob.glob('../data/audio/*.mp3')
)
print(f"Found {len(audio_files)} audio files:")

# Display available audio files, numbered from 1
for position, audio_path in enumerate(audio_files, start=1):
    print(f"{position}. {os.path.basename(audio_path)}")

# Check if we have enough files (advisory only: nothing is enforced here)
if len(audio_files) < 5:
    print("\nWARNING: For a comprehensive analysis, it's recommended to have at least 5 audio samples.")
elif len(audio_files) > 10:
    print("\nNote: You have more than 10 audio samples. This is fine, but processing might take longer.")

# MARKDOWN
## 3. Audio Preprocessing and Feature Extraction

For each audio file, we'll:
1. Load and preprocess the audio (resample, normalize)
2. Extract acoustic features (pauses, pitch statistics)
3. Transcribe the speech
4. Analyze the transcript for linguistic features

# CODE
# Reference passage handed to the transcript analysis for word recall (optional)
baseline_text = """
The quick brown fox jumps over the lazy dog. She sells seashells by the seashore.
The rain in Spain stays mainly in the plain. How much wood would a woodchuck chuck.
"""

# One feature record per audio file, in the same order as audio_files
features_list = []

for audio_file in audio_files:
    file_name = os.path.basename(audio_file)
    print(f"\nProcessing {file_name}...")

    # Acoustic side: load the signal, then derive pauses and pitch statistics
    signal, sample_rate = audio_processor.load_and_preprocess(audio_file)
    pause_segments = audio_processor.extract_pauses(signal, sample_rate)
    pitch_summary = audio_processor.compute_pitch_stats(signal, sample_rate)

    # Inline player limited to the first 3 seconds (or the whole clip if shorter)
    print("Audio sample (first 3 seconds):")
    preview_end = min(3 * sample_rate, len(signal))
    display(Audio(signal[:preview_end], rate=sample_rate))

    # Linguistic side: transcribe, then analyze the transcript against the baseline
    transcript = transcriber.transcribe(audio_file)
    print(f"Transcript: {transcript}")
    analysis = transcriber.analyze_transcript(transcript, baseline_text)

    # Speech rate from the word count and the clip length (sample count / sample rate)
    clip_length = len(signal) / sample_rate
    speech_rate = audio_processor.compute_speech_rate(analysis['word_count'], clip_length)

    risk_score = transcriber.get_cognitive_risk_score(analysis)

    # Bucket the score: higher scores map to lower risk levels
    if risk_score > 0.7:
        risk_level = 'Low'
    elif risk_score > 0.4:
        risk_level = 'Medium'
    else:
        risk_level = 'High'

    # NOTE(review): the stored completion_score defaults to 0 when the key is
    # missing, while the sentence_structure indicator below defaults to 1, so a
    # missing score is recorded as 0 yet labelled 'Normal' - confirm which is intended.
    if analysis.get('hesitation_rate', 0) < 0.1:
        hesitation_label = 'Normal'
    else:
        hesitation_label = 'Elevated'

    if 80 <= speech_rate <= 160:
        rate_label = 'Normal'
    else:
        rate_label = 'Abnormal'

    if analysis.get('completion_score', 1) > 0.8:
        structure_label = 'Normal'
    else:
        structure_label = 'Impaired'

    assessment = {
        'risk_level': risk_level,
        'indicators': {
            'hesitation_frequency': hesitation_label,
            'speech_rate': rate_label,
            'sentence_structure': structure_label,
        },
    }

    # Assemble the record consumed by the analysis cells further down
    features = {
        'file': file_name,
        'transcript': transcript,
        'pause_count': len(pause_segments),
        'hesitation_count': analysis['hesitation_count'],
        'speech_rate': speech_rate,
        'pitch_stats': pitch_summary,
        'word_recall': analysis.get('word_recall', {}),
        'naming_task': analysis.get('naming_task', {}),
        'sentence_completion': analysis.get('sentence_completion', {}),
        'completion_score': analysis.get('completion_score', 0),
        'cognitive_risk_score': risk_score,
        'cognitive_assessment': assessment,
    }
    features_list.append(features)

    print(f"Processed {file_name} - Risk Score: {risk_score:.2f} ({risk_level} risk)")

print("\nFeature extraction complete!")

# MARKDOWN
## 4. Feature Analysis and Visualization

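Now we'll analyze the extracted features to identify patterns and potential indicators of cognitive decline. Note that the score used in this notebook is oriented so that **higher is better**: a score above 0.7 is labelled Low risk, above 0.4 Medium risk, and anything else High risk.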

# CODE
# Flatten the per-file feature records into one row per audio file
basic_features_df = pd.DataFrame([
    {
        'File': f['file'],
        'Pauses': f['pause_count'],
        'Hesitations': f['hesitation_count'],
        'Speech Rate': f['speech_rate'],
        'Pitch Mean': f['pitch_stats']['pitch_mean'],
        'Pitch Std': f['pitch_stats']['pitch_std'],
        'Completion Score': f['completion_score'],
        'Risk Score': f['cognitive_risk_score'],
        'Risk Level': f['cognitive_assessment']['risk_level']
    }
    for f in features_list
])

# Display the basic features.
# A bare expression only renders when it is the last statement of a cell, so the
# table has to be shown explicitly here, before the plotting code.
print("Basic Speech Features:")
display(basic_features_df)

# Visualize key metrics on a 2x2 grid
fig, axs = plt.subplots(2, 2, figsize=(15, 10))

# (axes, dataframe column, bar color, panel title, y-axis label) for the three
# single-color panels; the risk panel is drawn separately because its bars are
# colored per row.
bar_panels = [
    (axs[0, 0], 'Speech Rate', 'skyblue', 'Speech Rate (words/min)', 'Words per minute'),
    (axs[0, 1], 'Pauses', 'lightgreen', 'Pause Count', 'Number of pauses'),
    (axs[1, 0], 'Hesitations', 'salmon', 'Hesitation Markers', 'Number of hesitations'),
]
for ax, column, bar_color, title, ylabel in bar_panels:
    ax.bar(basic_features_df['File'], basic_features_df[column], color=bar_color)
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.tick_params(axis='x', rotation=45)

# Risk score panel: one color per risk level, red for anything other than Low/Medium
risk_palette = {'Low': 'green', 'Medium': 'orange'}
colors = [risk_palette.get(level, 'red') for level in basic_features_df['Risk Level']]
risk_ax = axs[1, 1]
risk_ax.bar(basic_features_df['File'], basic_features_df['Risk Score'], color=colors)
risk_ax.set_title('Cognitive Risk Score (higher is better)')
risk_ax.set_ylabel('Score (0-1)')
risk_ax.set_ylim(0, 1)
risk_ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# MARKDOWN
## 5. Unsupervised Machine Learning Analysis

We'll now apply unsupervised machine learning techniques to identify patterns and anomalies in the speech data.

# CODE
# Prepare feature matrix for ML analysis
feature_matrix = analyzer.prepare_features(features_list)

# Perform anomaly detection and clustering
labels, anomaly_scores = analyzer.fit_predict(feature_matrix)

# Add ML results to our dataframe (later cells read these columns)
basic_features_df['Cluster'] = labels
basic_features_df['Anomaly Score'] = anomaly_scores
# A label of -1 is treated as an anomaly. np.asarray keeps the comparison
# element-wise even if `labels` comes back as a plain Python list, where
# `labels == -1` would collapse to a single False for every row.
basic_features_df['Is Anomaly'] = np.asarray(labels) == -1

# Display updated dataframe with ML results.
# Shown explicitly because a bare expression in the middle of a cell is not rendered.
print("Speech Features with ML Analysis:")
display(basic_features_df)

# Create visualizations of the ML results
# NOTE(review): the two return values are passed to imshow below, so they are
# presumably image arrays - confirm against analyzer.visualize_results.
pca_fig, tsne_fig = analyzer.visualize_results(feature_matrix, labels)

# Show both projections stacked vertically, without axis decorations
projection_fig, (pca_ax, tsne_ax) = plt.subplots(2, 1, figsize=(12, 10))
projection_panels = (
    (pca_ax, pca_fig, "PCA Visualization of Speech Features"),
    (tsne_ax, tsne_fig, "t-SNE Visualization of Speech Features"),
)
for ax, image, title in projection_panels:
    ax.imshow(image)
    ax.set_title(title)
    ax.axis('off')

plt.tight_layout()
plt.show()

# MARKDOWN
## 6. Correlation Analysis

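Let's examine the correlations between the speech features to see which ones are most strongly associated with the computed cognitive risk score. With only a handful of samples these correlations are indicative at best and should not be read as evidence of predictive power.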

# CODE
# Numeric columns that take part in the correlation analysis
numeric_columns = [
    'Pauses',
    'Hesitations',
    'Speech Rate',
    'Pitch Mean',
    'Pitch Std',
    'Completion Score',
    'Risk Score',
    'Anomaly Score',
]
corr_features = basic_features_df[numeric_columns]

# Pairwise correlation between the selected columns
corr_matrix = corr_features.corr()

# Annotated heatmap with the color scale pinned to the full [-1, 1] range
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    fmt='.2f',
    ax=heatmap_ax,
)
heatmap_ax.set_title('Correlation Between Speech Features')
plt.tight_layout()
plt.show()

# MARKDOWN
## 7. Batch Analysis Summary

Now we'll generate a comprehensive summary of our analysis across all audio samples.

# CODE
# Run the analyzer over the full set of feature records
batch_analysis = analyzer.analyze_batch(features_list)

# Counts are printed as-is; averages are printed with two decimals
count_fields = (
    ("Sample Count", 'sample_count'),
    ("Anomaly Count", 'anomaly_count'),
    ("Cluster Count", 'cluster_count'),
)
average_fields = (
    ("Average Anomaly Score", 'avg_anomaly_score', ""),
    ("Average Speech Rate", 'avg_speech_rate', " words/min"),
    ("Average Pause Count", 'avg_pause_count', ""),
    ("Average Hesitation Count", 'avg_hesitation_count', ""),
    ("Average Cognitive Risk Score", 'avg_cognitive_risk_score', ""),
)

print("Summary Statistics:")
for label, key in count_fields:
    print(f"{label}: {batch_analysis[key]}")
for label, key, unit in average_fields:
    print(f"{label}: {batch_analysis[key]:.2f}{unit}")

# Rank every feature by the absolute value of its correlation with the risk score
feature_importance = corr_matrix['Risk Score'].abs().sort_values(ascending=False)
print("\nFeature Importance (correlation with Risk Score):")
ranked_features = (
    (name, strength)
    for name, strength in feature_importance.items()
    if name != 'Risk Score'  # skip the score's correlation with itself
)
for name, strength in ranked_features:
    print(f"- {name}: {strength:.2f}")

# MARKDOWN
## 8. Conclusions and Next Steps

Based on our analysis, we can draw the following conclusions:

1. **Most Insightful Features:**
   - [Fill in based on your results]
   - [e.g., "Speech rate showed the strongest correlation with cognitive risk scores"]

2. **ML Methods Used:**
   - DBSCAN clustering for identifying groups of similar speech patterns
   - Anomaly detection for identifying outliers that may indicate cognitive concerns
   - Dimensionality reduction (PCA and t-SNE) for visualization

3. **Potential Next Steps:**
   - Collect larger dataset with confirmed cognitive condition samples
   - Implement supervised learning with labeled data
   - Refine feature extraction for better sensitivity
   - Conduct clinical validation studies

This proof-of-concept demonstrates the potential of speech analysis for cognitive assessment, but further development and validation would be needed for clinical applications.

# CODE
# Export results to CSV for further analysis
basic_features_df.to_csv('../data/speech_analysis_results.csv', index=False)
print("Results exported to '../data/speech_analysis_results.csv'")

# Generate a simple HTML report
from html import escape
from IPython.display import HTML

# Header: sample count, batch summary and the opening of the per-sample table
html_report = f"""
<h1>Speech Analysis Report</h1>
<p>Analysis of {len(features_list)} audio samples</p>

<h2>Summary Statistics</h2>
<ul>
    <li>Sample Count: {batch_analysis['sample_count']}</li>
    <li>Anomaly Count: {batch_analysis['anomaly_count']}</li>
    <li>Average Risk Score: {batch_analysis['avg_cognitive_risk_score']:.2f}</li>
</ul>

<h2>Individual Sample Results</h2>
<table border="1">
    <tr>
        <th>File</th>
        <th>Risk Score</th>
        <th>Risk Level</th>
        <th>Anomaly</th>
    </tr>
"""

# One table row per analyzed file
for _, row in basic_features_df.iterrows():
    color = "green" if row['Risk Level'] == "Low" else "orange" if row['Risk Level'] == "Medium" else "red"
    # File names come from disk and may contain characters such as '&' or '<';
    # escape every text value so it cannot break or inject markup.
    file_cell = escape(str(row['File']))
    level_cell = escape(str(row['Risk Level']))
    anomaly_cell = "Yes" if row['Is Anomaly'] else "No"
    html_report += f"""
    <tr>
        <td>{file_cell}</td>
        <td>{row['Risk Score']:.2f}</td>
        <td style="color:{color}">{level_cell}</td>
        <td>{anomaly_cell}</td>
    </tr>
    """

html_report += """
</table>
<p>This report was generated automatically by the Speech Analysis Pipeline.</p>
"""

# Last expression of the cell: renders the report inline
HTML(html_report)