# EchoQuality Interactive Visual Analysis

This notebook provides interactive analysis and visualization of EchoQuality inference results.

## Features:
- Interactive score distribution analysis
- Folder-by-folder performance comparison
- Quality assessment breakdown
- Error analysis and debugging
- Custom filtering and exploration

## Requirements:
Make sure you have run inference first:
```bash
make inference
```

In [None]:
# Import required libraries
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import ipywidgets as widgets
from IPython.display import display, HTML
import warnings
# Silence library warnings so they do not clutter the rendered cell outputs.
warnings.filterwarnings(action='ignore')

# Plot appearance. The palette is applied after the matplotlib style on
# purpose: both touch the colour cycle, and the later call wins.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# pandas display: show every column, auto-detect width, truncate long cells.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.width', None),
    ('display.max_colwidth', 50),
):
    pd.set_option(option_name, option_value)

print("Libraries imported successfully!")

## 1. Load and Explore Data

In [None]:
# Load the inference summary that the rest of the notebook is built on.
results_dir = '../results/inference_output'
summary_path = os.path.join(results_dir, 'summary.json')

if not os.path.exists(summary_path):
    print(f"❌ Summary file not found: {summary_path}")
    print("Please run 'make inference' first to generate results.")
    # Stop here: every later cell reads `summary_data`, so carrying on would
    # only resurface as a confusing NameError further down the notebook.
    raise FileNotFoundError(f"Summary file not found: {summary_path}")

# Read as UTF-8 explicitly rather than relying on the platform default encoding.
with open(summary_path, 'r', encoding='utf-8') as f:
    summary_data = json.load(f)

print("✅ Data loaded successfully!")
print(f"📊 Found {summary_data['total_folders']} folders with {summary_data['total_files']} total files")
print(f"📈 Overall pass rate: {summary_data['overall_pass_rate']:.2f}%")

In [None]:
# Flatten the nested summary into two tables:
#   df_results - one row per analysed file
#   df_folders - one row per folder with at least one processed file
# The column lists are fixed up front so that both frames keep their schema
# even when no rows are collected; otherwise `df_results['score']` would raise
# a KeyError on an empty run.
result_columns = ['folder', 'file_id', 'score', 'status', 'assessment', 'path']
folder_columns = ['folder', 'pass_rate', 'num_files', 'num_processed', 'pass_count', 'fail_count']

all_results = []
folder_stats = []

for folder_result in summary_data['folder_results']:
    folder_name = folder_result['folder']

    # Folders where nothing was processed are left out of the folder table.
    if folder_result['num_processed'] > 0:
        folder_stats.append({
            'folder': folder_name,
            'pass_rate': folder_result['pass_rate'],
            'num_files': folder_result['num_files'],
            'num_processed': folder_result['num_processed'],
            'pass_count': folder_result['pass_count'],
            'fail_count': folder_result['fail_count']
        })

    # The per-file 'results' mapping is optional in a folder entry.
    for file_id, result in folder_result.get('results', {}).items():
        all_results.append({
            'folder': folder_name,
            'file_id': file_id,
            'score': result['score'],
            'status': result['status'],
            'assessment': result['assessment'],
            'path': result['path']
        })

# Convert to DataFrames (one construction per table, explicit schema).
df_results = pd.DataFrame(all_results, columns=result_columns)
df_folders = pd.DataFrame(folder_stats, columns=folder_columns)

print("📋 Created DataFrames:")
print(f"   - Individual results: {len(df_results)} records")
print(f"   - Folder statistics: {len(df_folders)} folders")

# Display basic statistics; skipped when there is nothing to aggregate so the
# cell does not print a wall of NaN values.
print("\n📊 Quick Statistics:")
if df_results.empty:
    print("   - No per-file results found; score statistics are unavailable.")
else:
    print(f"   - Mean score: {df_results['score'].mean():.4f}")
    print(f"   - Median score: {df_results['score'].median():.4f}")
    print(f"   - Score std: {df_results['score'].std():.4f}")
    print(f"   - Pass rate: {(df_results['status'] == 'PASS').mean() * 100:.2f}%")

## 2. Interactive Data Overview

In [None]:
# Run-level totals from the summary file, shown as a (Metric, Value) table.
summary_rows = [
    ('Total Folders', summary_data['total_folders']),
    ('Total Files', summary_data['total_files']),
    ('Processed Files', summary_data['total_processed']),
    ('Pass Count', summary_data['total_pass']),
    ('Fail Count', summary_data['total_fail']),
    # The pass rate is pre-formatted as text, so the Value column is mixed-type.
    ('Overall Pass Rate', f"{summary_data['overall_pass_rate']:.2f}%"),
]
summary_stats = pd.DataFrame(summary_rows, columns=['Metric', 'Value'])

display(HTML("<h3>📊 Overall Statistics</h3>"))
display(summary_stats)

In [None]:
# Interactive histogram of all quality scores with three reference lines:
# the pass/fail threshold, the mean and the median.
mean_score = df_results['score'].mean()
median_score = df_results['score'].median()

score_histogram = go.Histogram(
    x=df_results['score'],
    nbinsx=50,
    name='Score Distribution',
    opacity=0.7,
    marker_color='skyblue'
)
fig = go.Figure(data=[score_histogram])

# (x position, dash style, colour, annotation) for each vertical marker,
# listed in the order they are drawn.
reference_lines = [
    (0.5, "dash", "red", "Pass/Fail Threshold (0.5)"),
    (mean_score, "dot", "green", f"Mean: {mean_score:.3f}"),
    (median_score, "dot", "orange", f"Median: {median_score:.3f}"),
]
for line_x, line_dash, line_color, line_label in reference_lines:
    fig.add_vline(x=line_x, line_dash=line_dash, line_color=line_color,
                  annotation_text=line_label)

histogram_layout = {
    'title': 'Interactive Quality Score Distribution',
    'xaxis_title': 'Quality Score',
    'yaxis_title': 'Frequency',
    'width': 800,
    'height': 500,
}
fig.update_layout(**histogram_layout)

fig.show()

## 3. Folder Performance Analysis

In [None]:
# One bubble per folder: processed-file count on x, pass rate on y, bubble
# size driven by the folder's total file count.
scatter_axis_labels = {
    'num_processed': 'Number of Processed Files',
    'pass_rate': 'Pass Rate (%)',
    'num_files': 'Total Files'
}

fig = px.scatter(
    df_folders,
    x='num_processed',
    y='pass_rate',
    size='num_files',
    hover_data=['folder', 'pass_count', 'fail_count'],
    title='Folder Performance: Pass Rate vs Number of Processed Files',
    labels=scatter_axis_labels,
)
fig.update_layout(width=800, height=600)

fig.show()

In [None]:
# Rank folders by pass rate and show the ten best and the ten worst.
ranking_columns = ['folder', 'pass_rate', 'num_processed', 'pass_count', 'fail_count']

top_folders = df_folders.nlargest(n=10, columns='pass_rate')
bottom_folders = df_folders.nsmallest(n=10, columns='pass_rate')

ranking_sections = (
    ("<h3>🏆 Top 10 Performing Folders</h3>", top_folders),
    ("<h3>⚠️ Bottom 10 Performing Folders</h3>", bottom_folders),
)
for section_heading, ranked_folders in ranking_sections:
    display(HTML(section_heading))
    display(ranked_folders[ranking_columns])

## 4. Quality Assessment Analysis

In [None]:
# How often each assessment label occurs, as a donut chart and as a table.
assessment_counts = df_results['assessment'].value_counts()
assessment_labels = assessment_counts.index
assessment_values = assessment_counts.values

assessment_donut = go.Pie(labels=assessment_labels, values=assessment_values, hole=0.3)
fig = go.Figure(data=[assessment_donut])
fig.update_layout(title="Quality Assessment Distribution", width=600, height=500)

fig.show()

# Same numbers in tabular form, with each label's share of all results.
assessment_share = assessment_values / len(df_results) * 100
assessment_stats = pd.DataFrame({
    'Assessment': assessment_labels,
    'Count': assessment_values,
    'Percentage': assessment_share.round(2)
})

display(HTML("<h3>📋 Assessment Breakdown</h3>"))
display(assessment_stats)

## 5. Interactive Filtering and Exploration

In [None]:
# Controls for the interactive explorer: a score window, a status picker and
# a folder picker.
available_folders = sorted(df_results['folder'].unique())

score_range = widgets.FloatRangeSlider(
    description='Score Range:',
    min=0.0,
    max=1.0,
    step=0.01,
    value=[0.0, 1.0],
    readout=True,
    readout_format='.2f',
    orientation='horizontal',
    continuous_update=False,  # fire only when the handle is released
    disabled=False
)

status_filter = widgets.SelectMultiple(
    description='Status:',
    options=['PASS', 'FAIL'],
    value=['PASS', 'FAIL'],
    disabled=False
)

folder_filter = widgets.SelectMultiple(
    description='Folders:',
    options=available_folders,
    value=available_folders[:5],  # Default to first 5 folders
    rows=10,
    disabled=False
)

def update_analysis(score_range_val, status_val, folder_val):
    """Filter the per-file results and summarise the matching subset.

    Reads the notebook-level ``df_results`` frame.

    Args:
        score_range_val: ``(low, high)`` pair; a row is kept when
            ``low <= score <= high``.
        status_val: Collection of status labels to keep.
        folder_val: Collection of folder names to keep.

    Prints the number of matching files. When at least one file matches it
    also prints the pass rate and mean score, shows a score histogram
    coloured by status, and displays the first ten matching rows. When
    nothing matches it reports that and returns without plotting.
    """
    # Filter data based on selections
    selection_mask = (
        (df_results['score'] >= score_range_val[0]) &
        (df_results['score'] <= score_range_val[1]) &
        (df_results['status'].isin(status_val)) &
        (df_results['folder'].isin(folder_val))
    )
    filtered_df = df_results[selection_mask]
    match_count = len(filtered_df)

    print(f"📊 Filtered Results: {match_count} files")

    # Bail out before computing statistics: on an empty selection the pass
    # rate and mean are NaN and the histogram would be an empty figure.
    if match_count == 0:
        print("❌ No data matches the current filters")
        return

    print(f"📈 Pass Rate: {(filtered_df['status'] == 'PASS').mean() * 100:.2f}%")
    print(f"📉 Mean Score: {filtered_df['score'].mean():.4f}")

    # Create filtered visualization
    fig = px.histogram(filtered_df, x='score', color='status',
                      title=f'Filtered Score Distribution ({match_count} files)',
                      nbins=30)
    fig.show()

    # Show sample of filtered data
    display(HTML("<h4>📋 Sample of Filtered Data</h4>"))
    display(filtered_df.head(10))

# Bind each filter control to the matching keyword argument of
# `update_analysis`; the function is re-run whenever a control changes.
filter_controls = {
    'score_range_val': score_range,
    'status_val': status_filter,
    'folder_val': folder_filter,
}
interactive_widget = widgets.interactive(update_analysis, **filter_controls)

display(HTML("<h3>🔍 Interactive Data Explorer</h3>"))
display(interactive_widget)

## 6. Score Analysis by Folder

In [None]:
# Score spread per folder, limited to the 15 folders with the most processed
# files so the x axis stays readable.
top_folders_by_count = df_folders.nlargest(15, 'num_processed')['folder'].tolist()
in_top_folders = df_results['folder'].isin(top_folders_by_count)
filtered_for_boxplot = df_results.loc[in_top_folders]

fig = px.box(
    filtered_for_boxplot,
    x='folder',
    y='score',
    color='status',
    title='Score Distribution by Folder (Top 15 by File Count)',
)
fig.update_xaxes(tickangle=45)  # slant the folder names so they do not overlap
fig.update_layout(width=1000, height=600)
fig.show()

## 7. Error Analysis

In [None]:
# Collect one record per (folder, error type) with a non-zero count. Folder
# entries without an 'error_stats' section are skipped.
error_analysis = [
    {
        'folder': folder_entry['folder'],
        'error_type': error_type,
        'count': count
    }
    for folder_entry in summary_data['folder_results']
    if 'error_stats' in folder_entry
    for error_type, count in folder_entry['error_stats']['error_counts'].items()
    if count > 0
]

if not error_analysis:
    print("✅ No errors found in the analysis!")
else:
    df_errors = pd.DataFrame(error_analysis)

    # Total occurrences per error type across folders, most frequent first.
    error_totals_by_type = df_errors.groupby('error_type')['count'].sum()
    error_summary = error_totals_by_type.sort_values(ascending=False)

    fig = px.bar(x=error_summary.index, y=error_summary.values,
                 title='Error Type Distribution Across All Folders')
    fig.update_layout(xaxis_title='Error Type', yaxis_title='Count')
    fig.show()

    display(HTML("<h3>⚠️ Error Summary</h3>"))
    # Share of each error type relative to all counted errors.
    error_share = error_summary.values / error_summary.sum() * 100
    error_summary_df = pd.DataFrame({
        'Error Type': error_summary.index,
        'Total Count': error_summary.values,
        'Percentage': error_share.round(2)
    })
    display(error_summary_df)

## 8. Custom Analysis Functions

In [None]:
def analyze_folder(folder_name):
    """Print and plot a per-folder report.

    Selects the rows of the notebook-level ``df_results`` whose ``folder``
    equals ``folder_name``. If none match, a not-found message is printed and
    the function returns. Otherwise it prints the file count, pass rate, mean
    score and score range, shows a score histogram coloured by status, and
    prints the count and share of each assessment label.
    """
    folder_data = df_results.loc[df_results['folder'] == folder_name]
    file_count = len(folder_data)

    if not file_count:
        print(f"❌ No data found for folder: {folder_name}")
        return

    folder_scores = folder_data['score']
    is_pass = folder_data['status'] == 'PASS'

    print(f"📁 Analysis for folder: {folder_name}")
    print(f"📊 Total files: {file_count}")
    print(f"📈 Pass rate: {is_pass.mean() * 100:.2f}%")
    print(f"📉 Mean score: {folder_scores.mean():.4f}")
    print(f"📊 Score range: {folder_scores.min():.4f} - {folder_scores.max():.4f}")

    # Histogram of this folder's scores, split by status.
    folder_histogram = px.histogram(
        folder_data,
        x='score',
        color='status',
        title=f'Score Distribution for {folder_name}',
        nbins=20,
    )
    folder_histogram.show()

    # Count and percentage for every assessment label in this folder.
    print(f"\n📋 Assessment Breakdown:")
    for assessment, count in folder_data['assessment'].value_counts().items():
        percentage = (count / file_count) * 100
        print(f"   {assessment}: {count} ({percentage:.1f}%)")

# Single-folder analysis controls: a folder dropdown, a trigger button and a
# dedicated output area that receives the report.
folder_selector = widgets.Dropdown(
    options=sorted(df_results['folder'].unique()),
    description='Select Folder:',
    disabled=False
)

analyze_button = widgets.Button(description="Analyze Folder")

# Prints, figures and tables produced inside a widget callback are not
# reliably attached to the cell's output area (JupyterLab routes them to the
# log console), so they are captured in an Output widget shown under the
# controls.
analysis_output = widgets.Output()

def on_analyze_click(b):
    """Run `analyze_folder` for the selected folder and show the report.

    Args:
        b: The button instance passed in by ipywidgets; unused.
    """
    # Replace the previous report instead of appending to it. With wait=True
    # the old content stays visible until the new output arrives. This is
    # called outside the `with` block to avoid re-entering the Output context.
    analysis_output.clear_output(wait=True)
    with analysis_output:
        analyze_folder(folder_selector.value)

analyze_button.on_click(on_analyze_click)

display(HTML("<h3>🔍 Single Folder Analysis</h3>"))
display(widgets.VBox([
    widgets.HBox([folder_selector, analyze_button]),
    analysis_output
]))

## 9. Export and Save Results

In [None]:
# Save processed data for further analysis
output_dir = '../analysis_output'
os.makedirs(output_dir, exist_ok=True)

# Save DataFrames
df_results.to_csv(os.path.join(output_dir, 'all_results.csv'), index=False)
df_folders.to_csv(os.path.join(output_dir, 'folder_statistics.csv'), index=False)

# Save summary statistics.
# pandas reductions return NumPy scalars. Some of them (e.g. numpy.int64 from
# min()/max() on an integer-typed score column) are rejected by json.dump, so
# every aggregate is converted to a built-in float before serialising.
score_column = df_results['score']
summary_stats_dict = {
    'total_files_analyzed': len(df_results),
    'total_folders_analyzed': len(df_folders),
    'overall_pass_rate': float((df_results['status'] == 'PASS').mean() * 100),
    'mean_score': float(score_column.mean()),
    'median_score': float(score_column.median()),
    'score_std': float(score_column.std()),
    'min_score': float(score_column.min()),
    'max_score': float(score_column.max())
}

# Write as UTF-8 explicitly rather than relying on the platform default encoding.
with open(os.path.join(output_dir, 'analysis_summary.json'), 'w', encoding='utf-8') as f:
    json.dump(summary_stats_dict, f, indent=2)

print(f"✅ Analysis results saved to: {output_dir}")
print("📁 Files created:")
print("   - all_results.csv: Individual file results")
print("   - folder_statistics.csv: Folder-level statistics")
print("   - analysis_summary.json: Summary statistics")

## 10. Generate Static Visualizations

In [None]:
# Build a 2x2 matplotlib summary figure, save it as a PNG in `output_dir`,
# and show it inline.
plt.style.use('seaborn-v0_8')
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('EchoQuality Analysis Summary', fontsize=16)

# Name the four panels so the code below reads by purpose, not by grid index.
(ax_scores, ax_status), (ax_top, ax_scatter) = axes

# 1. Score distribution, with the 0.5 threshold and the mean marked.
ax_scores.hist(df_results['score'], bins=50, alpha=0.7, color='skyblue', edgecolor='black')
ax_scores.axvline(0.5, color='red', linestyle='--', label='Threshold')
ax_scores.axvline(df_results['score'].mean(), color='green', linestyle='--', label='Mean')
ax_scores.set(title='Score Distribution', xlabel='Quality Score', ylabel='Frequency')
ax_scores.legend()
ax_scores.grid(True, alpha=0.3)

# 2. Share of each status value.
status_counts = df_results['status'].value_counts()
ax_status.pie(status_counts.values, labels=status_counts.index, autopct='%1.1f%%', startangle=90)
ax_status.set_title('Pass/Fail Distribution')

# 3. Ten folders with the highest pass rate; long names are cut to 20
#    characters plus an ellipsis so the tick labels fit.
top_10_folders = df_folders.nlargest(10, 'pass_rate')
y_pos = range(len(top_10_folders))
short_folder_labels = [
    (folder_label[:20] + '...') if len(folder_label) > 20 else folder_label
    for folder_label in top_10_folders['folder']
]
ax_top.barh(y_pos, top_10_folders['pass_rate'], color='lightgreen', alpha=0.7)
ax_top.set_yticks(y_pos)
ax_top.set_yticklabels(short_folder_labels, fontsize=8)
ax_top.set(title='Top 10 Folders by Pass Rate', xlabel='Pass Rate (%)')
ax_top.grid(True, alpha=0.3)

# 4. Pass rate against the number of processed files, one point per folder.
ax_scatter.scatter(df_folders['num_processed'], df_folders['pass_rate'], alpha=0.6)
ax_scatter.set(
    xlabel='Number of Processed Files',
    ylabel='Pass Rate (%)',
    title='Pass Rate vs File Count',
)
ax_scatter.grid(True, alpha=0.3)

static_summary_path = os.path.join(output_dir, 'static_summary.png')
plt.tight_layout()
plt.savefig(static_summary_path, dpi=300, bbox_inches='tight')
plt.show()

print(f"📊 Static visualization saved to: {static_summary_path}")