## Evaluation

In [1]:
import yaml
import os
import pandas as pd
import numpy as np
import metric_fid_utils
import metric_csim_utils
from collections import defaultdict

I0000 00:00:1756404308.851346 1604986 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1756404309.842673 1605629 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 NVIDIA 570.133.20), renderer: NVIDIA RTX A6000/PCIe/SSE2
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
  from .autonotebook import tqdm as notebook_tqdm
W0000 00:00:1756404309.856084 1605564 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
I0000 00:00:1756404309.860539 1604986 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
W0000 00:00:1756404309.902308 1605583 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
I0000 00:00:1756404309.902591 1605637 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 NVIDIA 570.133.20), renderer: NVIDIA RTX A6000/PCIe/SSE2
W0000 00:00:1756404309

### MUSETALK

#### Evaluation for All RAW Data

In [3]:
# --- Load autoreload extension ---
%reload_ext autoreload
# Load test configuration
config_path = "/home/is/lathifgalih-k/research_naist/multimodal-clone/clone-video/muse-exp/MuseTalk/configs/inference/test_raw.yaml"
with open(config_path, 'r') as file:
    config = yaml.safe_load(file)

# Base paths
video_input_base = "/home/is/lathifgalih-k/research_naist/multimodal-clone/clone-video/muse-exp/MuseTalk/data/video"
results_base = "/home/is/lathifgalih-k/research_naist/multimodal-clone/clone-video/muse-exp/MuseTalk/results/test_raw/v15/RAW"

# Function to extract speaker and TTS info from audio path
def extract_info_from_audio_path(audio_path):
    filename = os.path.basename(audio_path)
    
    # Extract TTS method
    if filename.startswith("CV_"):
        tts_method = "CosyVoice"
    elif filename.startswith("XT_"):
        tts_method = "XTTSv2"
    elif filename.startswith("YT_"):
        tts_method = "YourTTS"
    else:
        tts_method = "Unknown"
    
    # Extract speaker ID
    parts = filename.split("_")
    if len(parts) >= 2:
        speaker_id = parts[1]  # A002, A006, S001, etc.
    else:
        speaker_id = "Unknown"
    
    return speaker_id, tts_method

# Function to generate output video filename
def generate_output_filename(video_path, audio_path):
    video_filename = os.path.splitext(os.path.basename(video_path))[0]
    audio_filename = os.path.splitext(os.path.basename(audio_path))[0]
    return f"{video_filename}_{audio_filename}.mp4"

# Initialize results storage
results = []
fid_scores = defaultdict(list)
csim_scores = defaultdict(list)
speaker_scores = defaultdict(lambda: defaultdict(list))

print("Starting evaluation...")
print("=" * 50)

# Process each task
for task_name, task_config in config.items():
    video_path = task_config['video_path']
    audio_path = task_config['audio_path']
    
    # Extract information
    speaker_id, tts_method = extract_info_from_audio_path(audio_path)
    
    # Generate output video path
    output_filename = generate_output_filename(video_path, audio_path)
    gen_video_path = os.path.join(results_base, output_filename)
    
    # Check if files exist
    if not os.path.exists(video_path):
        print(f"Warning: Original video not found: {video_path}")
        continue
    
    if not os.path.exists(gen_video_path):
        print(f"Warning: Generated video not found: {gen_video_path}")
        continue
    
    print(f"Processing {task_name}: {speaker_id} - {tts_method}")
    
    try:
        # Calculate FID
        fid_score = metric_fid_utils.calculate_fid(video_path, gen_video_path)
        
        # Calculate CSIM
        csim_score, similarities = metric_csim_utils.calculate_csim(video_path, gen_video_path)
        
        # Store results
        result = {
            'task': task_name,
            'speaker_id': speaker_id,
            'tts_method': tts_method,
            'fid_score': fid_score,
            'csim_score': csim_score,
            'csim_std': np.std(similarities),
            'original_video': video_path,
            'generated_video': gen_video_path
        }
        results.append(result)
        
        # Store for averaging
        fid_scores[tts_method].append(fid_score)
        csim_scores[tts_method].append(csim_score)
        speaker_scores[speaker_id]['fid'].append(fid_score)
        speaker_scores[speaker_id]['csim'].append(csim_score)
        speaker_scores[speaker_id]['tts_methods'].append(tts_method)
        
        print(f"  FID: {fid_score:.4f}, CSIM: {csim_score:.4f}")
        
    except Exception as e:
        print(f"Error processing {task_name}: {str(e)}")
        continue

print("\n" + "=" * 50)
print("EVALUATION COMPLETE")
print("=" * 50)

# Convert to DataFrame for easier analysis
df = pd.DataFrame(results)

# Display individual results
print("\nIndividual Results:")
print(df[['task', 'speaker_id', 'tts_method', 'fid_score', 'csim_score']].to_string(index=False))

# Calculate averages per TTS method
print("\n" + "=" * 50)
print("AVERAGES PER TTS METHOD")
print("=" * 50)

for tts_method in ['CosyVoice', 'XTTSv2', 'YourTTS']:
    if tts_method in fid_scores and len(fid_scores[tts_method]) > 0:
        avg_fid = np.mean(fid_scores[tts_method])
        std_fid = np.std(fid_scores[tts_method])
        avg_csim = np.mean(csim_scores[tts_method])
        std_csim = np.std(csim_scores[tts_method])
        count = len(fid_scores[tts_method])
        
        print(f"\n{tts_method} ({count} samples):")
        print(f"  FID:  {avg_fid:.4f} ± {std_fid:.4f}")
        print(f"  CSIM: {avg_csim:.4f} ± {std_csim:.4f}")

# Calculate averages per speaker
print("\n" + "=" * 50)
print("AVERAGES PER SPEAKER")
print("=" * 50)

for speaker_id in sorted(speaker_scores.keys()):
    if len(speaker_scores[speaker_id]['fid']) > 0:
        avg_fid = np.mean(speaker_scores[speaker_id]['fid'])
        std_fid = np.std(speaker_scores[speaker_id]['fid'])
        avg_csim = np.mean(speaker_scores[speaker_id]['csim'])
        std_csim = np.std(speaker_scores[speaker_id]['csim'])
        count = len(speaker_scores[speaker_id]['fid'])
        
        # Get unique TTS methods for this speaker
        tts_methods = list(set(speaker_scores[speaker_id]['tts_methods']))
        
        print(f"\n{speaker_id} ({count} samples, TTS: {', '.join(tts_methods)}):")
        print(f"  FID:  {avg_fid:.4f} ± {std_fid:.4f}")
        print(f"  CSIM: {avg_csim:.4f} ± {std_csim:.4f}")

# Create summary tables
print("\n" + "=" * 50)
print("SUMMARY TABLES")
print("=" * 50)

# TTS Method Summary
tts_summary = []
for tts_method in ['CosyVoice', 'XTTSv2', 'YourTTS']:
    if tts_method in fid_scores and len(fid_scores[tts_method]) > 0:
        tts_summary.append({
            'TTS_Method': tts_method,
            'Count': len(fid_scores[tts_method]),
            'FID_Mean': np.mean(fid_scores[tts_method]),
            'FID_Std': np.std(fid_scores[tts_method]),
            'CSIM_Mean': np.mean(csim_scores[tts_method]),
            'CSIM_Std': np.std(csim_scores[tts_method])
        })

tts_df = pd.DataFrame(tts_summary)
print("\nTTS Method Summary:")
print(tts_df.round(4).to_string(index=False))

# Speaker Summary
speaker_summary = []
for speaker_id in sorted(speaker_scores.keys()):
    if len(speaker_scores[speaker_id]['fid']) > 0:
        speaker_summary.append({
            'Speaker_ID': speaker_id,
            'Count': len(speaker_scores[speaker_id]['fid']),
            'FID_Mean': np.mean(speaker_scores[speaker_id]['fid']),
            'FID_Std': np.std(speaker_scores[speaker_id]['fid']),
            'CSIM_Mean': np.mean(speaker_scores[speaker_id]['csim']),
            'CSIM_Std': np.std(speaker_scores[speaker_id]['csim'])
        })

speaker_df = pd.DataFrame(speaker_summary)
print("\nSpeaker Summary:")
print(speaker_df.round(4).to_string(index=False))

# Overall statistics
print("\n" + "=" * 50)
print("OVERALL STATISTICS")
print("=" * 50)

all_fid = [score for scores in fid_scores.values() for score in scores]
all_csim = [score for scores in csim_scores.values() for score in scores]

if len(all_fid) > 0:
    print(f"Total samples: {len(all_fid)}")
    print(f"Overall FID:  {np.mean(all_fid):.4f} ± {np.std(all_fid):.4f}")
    print(f"Overall CSIM: {np.mean(all_csim):.4f} ± {np.std(all_csim):.4f}")

# Save results to CSV
output_path = "/home/is/lathifgalih-k/research_naist/multimodal-clone/clone-video/muse-exp/evaluation_results_raw.csv"
df.to_csv(output_path, index=False)
print(f"\nDetailed results saved to: {output_path}")

print("\nEvaluation completed successfully!")

Starting evaluation...
Processing task_0: A017 - CosyVoice
Extracted 150 frames from /home/is/lathifgalih-k/research_naist/multimodal-clone/clone-video/muse-exp/MuseTalk/data/video/als-input/A017_02_BBP_NORMAL_color_25fps_6sec.mp4
Extracted 123 frames from /home/is/lathifgalih-k/research_naist/multimodal-clone/clone-video/muse-exp/MuseTalk/results/test_raw/v15/RAW/A017_02_BBP_NORMAL_color_25fps_6sec_CV_A017_BBP_cloned.mp4
Resampling frames to 123
FID score: 13.045662879943848
Extracting embeddings from real video...
Processed 30 frames...
Processed 60 frames...
Processed 90 frames...
Processed 120 frames...
Processed 150 frames...
Extracted 150 face embeddings from /home/is/lathifgalih-k/research_naist/multimodal-clone/clone-video/muse-exp/MuseTalk/data/video/als-input/A017_02_BBP_NORMAL_color_25fps_6sec.mp4
Extracting embeddings from generated video...
Processed 30 frames...
Processed 60 frames...
Processed 90 frames...
Processed 120 frames...
Extracted 123 face embeddings from /home/

#### Evaluation for VCTK data

In [4]:
# --- Load autoreload extension ---
%reload_ext autoreload
# Load test configuration
config_path = "/home/is/lathifgalih-k/research_naist/multimodal-clone/clone-video/muse-exp/MuseTalk/configs/inference/test_vctk.yaml"
with open(config_path, 'r') as file:
    config = yaml.safe_load(file)

# Base paths
video_input_base = "/home/is/lathifgalih-k/research_naist/multimodal-clone/clone-video/muse-exp/MuseTalk/data/video"
results_base = "/home/is/lathifgalih-k/research_naist/multimodal-clone/clone-video/muse-exp/MuseTalk/results/test_vctk/v15/VCTK"

# Function to extract speaker and TTS info from audio path
def extract_info_from_audio_path(audio_path):
    filename = os.path.basename(audio_path)
    
    # Extract TTS method
    if filename.startswith("CV_"):
        tts_method = "CosyVoice"
    elif filename.startswith("XT_"):
        tts_method = "XTTSv2"
    elif filename.startswith("YT_"):
        tts_method = "YourTTS"
    else:
        tts_method = "Unknown"
    
    # Extract speaker ID
    parts = filename.split("_")
    if len(parts) >= 2:
        speaker_id = parts[1]  # A002, A006, S001, etc.
    else:
        speaker_id = "Unknown"
    
    return speaker_id, tts_method

# Function to generate output video filename
def generate_output_filename(video_path, audio_path):
    video_filename = os.path.splitext(os.path.basename(video_path))[0]
    audio_filename = os.path.splitext(os.path.basename(audio_path))[0]
    return f"{video_filename}_{audio_filename}.mp4"

# Initialize results storage
results = []
fid_scores = defaultdict(list)
csim_scores = defaultdict(list)
speaker_scores = defaultdict(lambda: defaultdict(list))

print("Starting evaluation...")
print("=" * 50)

# Process each task
for task_name, task_config in config.items():
    video_path = task_config['video_path']
    audio_path = task_config['audio_path']
    
    # Extract information
    speaker_id, tts_method = extract_info_from_audio_path(audio_path)
    
    # Generate output video path
    output_filename = generate_output_filename(video_path, audio_path)
    gen_video_path = os.path.join(results_base, output_filename)
    
    # Check if files exist
    if not os.path.exists(video_path):
        print(f"Warning: Original video not found: {video_path}")
        continue
    
    if not os.path.exists(gen_video_path):
        print(f"Warning: Generated video not found: {gen_video_path}")
        continue
    
    print(f"Processing {task_name}: {speaker_id} - {tts_method}")
    
    try:
        # Calculate FID
        fid_score = metric_fid_utils.calculate_fid(video_path, gen_video_path)
        
        # Calculate CSIM
        csim_score, similarities = metric_csim_utils.calculate_csim(video_path, gen_video_path)
        
        # Store results
        result = {
            'task': task_name,
            'speaker_id': speaker_id,
            'tts_method': tts_method,
            'fid_score': fid_score,
            'csim_score': csim_score,
            'csim_std': np.std(similarities),
            'original_video': video_path,
            'generated_video': gen_video_path
        }
        results.append(result)
        
        # Store for averaging
        fid_scores[tts_method].append(fid_score)
        csim_scores[tts_method].append(csim_score)
        speaker_scores[speaker_id]['fid'].append(fid_score)
        speaker_scores[speaker_id]['csim'].append(csim_score)
        speaker_scores[speaker_id]['tts_methods'].append(tts_method)
        
        print(f"  FID: {fid_score:.4f}, CSIM: {csim_score:.4f}")
        
    except Exception as e:
        print(f"Error processing {task_name}: {str(e)}")
        continue

print("\n" + "=" * 50)
print("EVALUATION COMPLETE")
print("=" * 50)

# Convert to DataFrame for easier analysis
df = pd.DataFrame(results)

# Display individual results
print("\nIndividual Results:")
print(df[['task', 'speaker_id', 'tts_method', 'fid_score', 'csim_score']].to_string(index=False))

# Calculate averages per TTS method
print("\n" + "=" * 50)
print("AVERAGES PER TTS METHOD")
print("=" * 50)

for tts_method in ['CosyVoice', 'XTTSv2', 'YourTTS']:
    if tts_method in fid_scores and len(fid_scores[tts_method]) > 0:
        avg_fid = np.mean(fid_scores[tts_method])
        std_fid = np.std(fid_scores[tts_method])
        avg_csim = np.mean(csim_scores[tts_method])
        std_csim = np.std(csim_scores[tts_method])
        count = len(fid_scores[tts_method])
        
        print(f"\n{tts_method} ({count} samples):")
        print(f"  FID:  {avg_fid:.4f} ± {std_fid:.4f}")
        print(f"  CSIM: {avg_csim:.4f} ± {std_csim:.4f}")

# Calculate averages per speaker
print("\n" + "=" * 50)
print("AVERAGES PER SPEAKER")
print("=" * 50)

for speaker_id in sorted(speaker_scores.keys()):
    if len(speaker_scores[speaker_id]['fid']) > 0:
        avg_fid = np.mean(speaker_scores[speaker_id]['fid'])
        std_fid = np.std(speaker_scores[speaker_id]['fid'])
        avg_csim = np.mean(speaker_scores[speaker_id]['csim'])
        std_csim = np.std(speaker_scores[speaker_id]['csim'])
        count = len(speaker_scores[speaker_id]['fid'])
        
        # Get unique TTS methods for this speaker
        tts_methods = list(set(speaker_scores[speaker_id]['tts_methods']))
        
        print(f"\n{speaker_id} ({count} samples, TTS: {', '.join(tts_methods)}):")
        print(f"  FID:  {avg_fid:.4f} ± {std_fid:.4f}")
        print(f"  CSIM: {avg_csim:.4f} ± {std_csim:.4f}")

# Create summary tables
print("\n" + "=" * 50)
print("SUMMARY TABLES")
print("=" * 50)

# TTS Method Summary
tts_summary = []
for tts_method in ['CosyVoice', 'XTTSv2', 'YourTTS']:
    if tts_method in fid_scores and len(fid_scores[tts_method]) > 0:
        tts_summary.append({
            'TTS_Method': tts_method,
            'Count': len(fid_scores[tts_method]),
            'FID_Mean': np.mean(fid_scores[tts_method]),
            'FID_Std': np.std(fid_scores[tts_method]),
            'CSIM_Mean': np.mean(csim_scores[tts_method]),
            'CSIM_Std': np.std(csim_scores[tts_method])
        })

tts_df = pd.DataFrame(tts_summary)
print("\nTTS Method Summary:")
print(tts_df.round(4).to_string(index=False))

# Speaker Summary
speaker_summary = []
for speaker_id in sorted(speaker_scores.keys()):
    if len(speaker_scores[speaker_id]['fid']) > 0:
        speaker_summary.append({
            'Speaker_ID': speaker_id,
            'Count': len(speaker_scores[speaker_id]['fid']),
            'FID_Mean': np.mean(speaker_scores[speaker_id]['fid']),
            'FID_Std': np.std(speaker_scores[speaker_id]['fid']),
            'CSIM_Mean': np.mean(speaker_scores[speaker_id]['csim']),
            'CSIM_Std': np.std(speaker_scores[speaker_id]['csim'])
        })

speaker_df = pd.DataFrame(speaker_summary)
print("\nSpeaker Summary:")
print(speaker_df.round(4).to_string(index=False))

# Overall statistics
print("\n" + "=" * 50)
print("OVERALL STATISTICS")
print("=" * 50)

all_fid = [score for scores in fid_scores.values() for score in scores]
all_csim = [score for scores in csim_scores.values() for score in scores]

if len(all_fid) > 0:
    print(f"Total samples: {len(all_fid)}")
    print(f"Overall FID:  {np.mean(all_fid):.4f} ± {np.std(all_fid):.4f}")
    print(f"Overall CSIM: {np.mean(all_csim):.4f} ± {np.std(all_csim):.4f}")

# Save results to CSV
output_path = "/home/is/lathifgalih-k/research_naist/multimodal-clone/clone-video/muse-exp/evaluation_results_vctk.csv"
df.to_csv(output_path, index=False)
print(f"\nDetailed results saved to: {output_path}")

print("\nEvaluation completed successfully!")

Starting evaluation...
Processing task_0: A002 - CosyVoice
Extracted 150 frames from /home/is/lathifgalih-k/research_naist/multimodal-clone/clone-video/muse-exp/MuseTalk/data/video/als-input/A002_02_BBP_NORMAL_color_25fps_6sec.mp4
Extracted 84 frames from /home/is/lathifgalih-k/research_naist/multimodal-clone/clone-video/muse-exp/MuseTalk/results/test_vctk/v15/VCTK/A002_02_BBP_NORMAL_color_25fps_6sec_CV_A002_VCTK_cloned.mp4
Resampling frames to 84
FID score: 29.996950149536133
Extracting embeddings from real video...
Processed 30 frames...
Processed 60 frames...
Processed 90 frames...
Processed 120 frames...
Processed 150 frames...
Extracted 150 face embeddings from /home/is/lathifgalih-k/research_naist/multimodal-clone/clone-video/muse-exp/MuseTalk/data/video/als-input/A002_02_BBP_NORMAL_color_25fps_6sec.mp4
Extracting embeddings from generated video...
Processed 30 frames...
Processed 60 frames...
Extracted 84 face embeddings from /home/is/lathifgalih-k/research_naist/multimodal-clon

### EDTALK

#### Experiment 1 File Evaluation

##### 1. FID



In [5]:
# --- Load autoreload extension ---
%reload_ext autoreload

import metric_fid_utils

real_video_path = "/home/is/lathifgalih-k/research_naist/multimodal-clone/clone-video/muse-exp/MuseTalk/data/video/als-input/A002_02_BBP_NORMAL_color_25fps_6sec.mp4"
gen_video_path = "/home/is/lathifgalih-k/research_naist/multimodal-clone/clone-video/edtalk-exp/EDTalk/results/test_raw/v15/RAW/A002_02_BBP_NORMAL_color_25fps_6sec_CV_A002_BBP_cloned_512.mp4"

fid_score = metric_fid_utils.calculate_fid(real_video_path, gen_video_path)
print("Final FID score:", fid_score)

Extracted 150 frames from /home/is/lathifgalih-k/research_naist/multimodal-clone/clone-video/muse-exp/MuseTalk/data/video/als-input/A002_02_BBP_NORMAL_color_25fps_6sec.mp4
Extracted 127 frames from /home/is/lathifgalih-k/research_naist/multimodal-clone/clone-video/edtalk-exp/EDTalk/results/test_raw/v15/RAW/A002_02_BBP_NORMAL_color_25fps_6sec_CV_A002_BBP_cloned_512.mp4
Resampling frames to 127
FID score: 86.91779327392578
Final FID score: 86.91779327392578


##### 2. CSIM

In [6]:
# --- Load autoreload extension ---
%reload_ext autoreload
import numpy as np
import metric_csim_utils

real_video_path = "/home/is/lathifgalih-k/research_naist/multimodal-clone/clone-video/muse-exp/MuseTalk/data/video/als-input/A002_02_BBP_NORMAL_color_25fps_6sec.mp4"
gen_video_path = "/home/is/lathifgalih-k/research_naist/multimodal-clone/clone-video/edtalk-exp/EDTalk/results/test_raw/v15/RAW/A002_02_BBP_NORMAL_color_25fps_6sec_CV_A002_BBP_cloned_512.mp4"

# Calculate CSIM
csim_score, similarities = metric_csim_utils.calculate_csim(real_video_path, gen_video_path)
print(f"Final CSIM score: {csim_score:.4f}")
print(f"CSIM standard deviation: {np.std(similarities):.4f}")

Extracting embeddings from real video...
Processed 30 frames...
Processed 60 frames...
Processed 90 frames...
Processed 120 frames...
Processed 150 frames...
Extracted 150 face embeddings from /home/is/lathifgalih-k/research_naist/multimodal-clone/clone-video/muse-exp/MuseTalk/data/video/als-input/A002_02_BBP_NORMAL_color_25fps_6sec.mp4
Extracting embeddings from generated video...
Processed 30 frames...
Processed 60 frames...
Processed 90 frames...
Processed 120 frames...
Extracted 127 face embeddings from /home/is/lathifgalih-k/research_naist/multimodal-clone/clone-video/edtalk-exp/EDTalk/results/test_raw/v15/RAW/A002_02_BBP_NORMAL_color_25fps_6sec_CV_A002_BBP_cloned_512.mp4
Resampling embeddings to 127 frames
Average CSIM score: 0.8761
Final CSIM score: 0.8761
CSIM standard deviation: 0.0293


#### Evaluation for All RAW Data

In [7]:
# --- Load autoreload extension ---
%reload_ext autoreload

import os
import pandas as pd
import numpy as np
import metric_fid_utils
import metric_csim_utils
from collections import defaultdict
import re

# Base paths
video_input_base = "/home/is/lathifgalih-k/research_naist/multimodal-clone/clone-video/muse-exp/MuseTalk/data/video"
edtalk_results_base = "/home/is/lathifgalih-k/research_naist/multimodal-clone/clone-video/edtalk-exp/EDTalk/results"

def extract_info_from_edtalk_filename(filename):
    """Extract speaker ID and TTS method from EDTalk output filename"""
    # Example: A002_02_BBP_NORMAL_color_25fps_6sec_CV_A002_BBP_cloned_512.mp4
    
    # Extract TTS method
    if "_CV_" in filename:
        tts_method = "CosyVoice"
    elif "_XT_" in filename:
        tts_method = "XTTSv2"
    elif "_YT_" in filename:
        tts_method = "YourTTS"
    else:
        tts_method = "Unknown"
    
    # Extract speaker ID (first part before underscore)
    speaker_id = filename.split('_')[0]  # A002, A006, S001, etc.
    
    return speaker_id, tts_method

def find_original_video(speaker_id, video_input_base):
    """Find the original video file for a given speaker ID"""
    # Check both ALS and Stroke folders
    video_folders = ["als-input", "stroke-input"]
    
    for folder in video_folders:
        folder_path = os.path.join(video_input_base, folder)
        if not os.path.exists(folder_path):
            continue
            
        for video_file in os.listdir(folder_path):
            if video_file.startswith(speaker_id) and video_file.endswith('.mp4'):
                return os.path.join(folder_path, video_file)
    
    return None

# Initialize results storage
results = []
fid_scores = defaultdict(list)
csim_scores = defaultdict(list)
speaker_scores = defaultdict(lambda: defaultdict(list))

print("Starting EDTalk RAW evaluation...")
print("=" * 50)

# Process RAW data
raw_results_dir = os.path.join(edtalk_results_base, "test_raw/v15/RAW")

if os.path.exists(raw_results_dir):
    for filename in os.listdir(raw_results_dir):
        # Only process 512 resolution videos
        if not (filename.endswith('_512.mp4')):
            continue
        
        gen_video_path = os.path.join(raw_results_dir, filename)
        
        # Extract information from filename
        speaker_id, tts_method = extract_info_from_edtalk_filename(filename)
        
        # Find corresponding original video
        original_video_path = find_original_video(speaker_id, video_input_base)
        
        if not original_video_path:
            print(f"Warning: Original video not found for speaker {speaker_id}")
            continue
        
        if not os.path.exists(original_video_path):
            print(f"Warning: Original video file does not exist: {original_video_path}")
            continue
        
        print(f"Processing RAW: {speaker_id} - {tts_method}")
        
        try:
            # Calculate FID
            fid_score = metric_fid_utils.calculate_fid(original_video_path, gen_video_path)
            
            # Calculate CSIM
            csim_score, similarities = metric_csim_utils.calculate_csim(original_video_path, gen_video_path)
            
            # Store results
            result = {
                'dataset': 'RAW',
                'speaker_id': speaker_id,
                'tts_method': tts_method,
                'fid_score': fid_score,
                'csim_score': csim_score,
                'csim_std': np.std(similarities),
                'original_video': original_video_path,
                'generated_video': gen_video_path,
                'filename': filename
            }
            results.append(result)
            
            # Store for averaging
            key = f"RAW_{tts_method}"
            fid_scores[key].append(fid_score)
            csim_scores[key].append(csim_score)
            speaker_scores[f"RAW_{speaker_id}"]['fid'].append(fid_score)
            speaker_scores[f"RAW_{speaker_id}"]['csim'].append(csim_score)
            speaker_scores[f"RAW_{speaker_id}"]['tts_methods'].append(tts_method)
            
            print(f"  FID: {fid_score:.4f}, CSIM: {csim_score:.4f}")
            
        except Exception as e:
            print(f"Error processing {filename}: {str(e)}")
            continue

print("\n" + "=" * 50)
print("RAW EVALUATION COMPLETE")
print("=" * 50)

# Convert to DataFrame for easier analysis
df_raw = pd.DataFrame([r for r in results if r['dataset'] == 'RAW'])

# Display individual results for RAW
print("\nRAW Individual Results:")
if len(df_raw) > 0:
    print(df_raw[['speaker_id', 'tts_method', 'fid_score', 'csim_score']].to_string(index=False))
else:
    print("No RAW results found")

# Calculate averages per TTS method for RAW
print("\n" + "=" * 50)
print("RAW AVERAGES PER TTS METHOD")
print("=" * 50)

for tts_method in ['CosyVoice', 'XTTSv2', 'YourTTS']:
    key = f"RAW_{tts_method}"
    if key in fid_scores and len(fid_scores[key]) > 0:
        avg_fid = np.mean(fid_scores[key])
        std_fid = np.std(fid_scores[key])
        avg_csim = np.mean(csim_scores[key])
        std_csim = np.std(csim_scores[key])
        count = len(fid_scores[key])
        
        print(f"\n{tts_method} ({count} samples):")
        print(f"  FID:  {avg_fid:.4f} ± {std_fid:.4f}")
        print(f"  CSIM: {avg_csim:.4f} ± {std_csim:.4f}")

# Calculate averages per speaker for RAW
print("\n" + "=" * 50)
print("RAW AVERAGES PER SPEAKER")
print("=" * 50)

raw_speaker_keys = [k for k in speaker_scores.keys() if k.startswith('RAW_')]
for speaker_key in sorted(raw_speaker_keys):
    speaker_id = speaker_key.replace('RAW_', '')
    if len(speaker_scores[speaker_key]['fid']) > 0:
        avg_fid = np.mean(speaker_scores[speaker_key]['fid'])
        std_fid = np.std(speaker_scores[speaker_key]['fid'])
        avg_csim = np.mean(speaker_scores[speaker_key]['csim'])
        std_csim = np.std(speaker_scores[speaker_key]['csim'])
        count = len(speaker_scores[speaker_key]['fid'])
        
        # Get unique TTS methods for this speaker
        tts_methods = list(set(speaker_scores[speaker_key]['tts_methods']))
        
        print(f"\n{speaker_id} ({count} samples, TTS: {', '.join(tts_methods)}):")
        print(f"  FID:  {avg_fid:.4f} ± {std_fid:.4f}")
        print(f"  CSIM: {avg_csim:.4f} ± {std_csim:.4f}")

print("\nRAW evaluation completed!")

Starting EDTalk RAW evaluation...
Processing RAW: S005 - CosyVoice
Extracted 150 frames from /home/is/lathifgalih-k/research_naist/multimodal-clone/clone-video/muse-exp/MuseTalk/data/video/stroke-input/S005_02_BBP_NORMAL_color_25fps_6sec.mp4
Extracted 124 frames from /home/is/lathifgalih-k/research_naist/multimodal-clone/clone-video/edtalk-exp/EDTalk/results/test_raw/v15/RAW/S005_02_BBP_NORMAL_color_25fps_6sec_CV_S005_BBP_cloned_512.mp4
Resampling frames to 124
FID score: 48.843788146972656
Extracting embeddings from real video...
Processed 30 frames...
Processed 60 frames...
Processed 90 frames...
Processed 120 frames...
Processed 150 frames...
Extracted 150 face embeddings from /home/is/lathifgalih-k/research_naist/multimodal-clone/clone-video/muse-exp/MuseTalk/data/video/stroke-input/S005_02_BBP_NORMAL_color_25fps_6sec.mp4
Extracting embeddings from generated video...
Processed 30 frames...
Processed 60 frames...
Processed 90 frames...
Processed 120 frames...
Extracted 124 face embe

#### Evaluation for VCTK Data

In [8]:
print("\nStarting EDTalk VCTK evaluation...")
print("=" * 50)

# Process VCTK data
vctk_results_dir = os.path.join(edtalk_results_base, "test_vctk/v15/VCTK")

if os.path.exists(vctk_results_dir):
    for filename in os.listdir(vctk_results_dir):
        # Only process 512 resolution videos
        if not (filename.endswith('_512.mp4')):
            continue
        
        gen_video_path = os.path.join(vctk_results_dir, filename)
        
        # Extract information from filename
        speaker_id, tts_method = extract_info_from_edtalk_filename(filename)
        
        # Find corresponding original video
        original_video_path = find_original_video(speaker_id, video_input_base)
        
        if not original_video_path:
            print(f"Warning: Original video not found for speaker {speaker_id}")
            continue
        
        if not os.path.exists(original_video_path):
            print(f"Warning: Original video file does not exist: {original_video_path}")
            continue
        
        print(f"Processing VCTK: {speaker_id} - {tts_method}")
        
        try:
            # Calculate FID
            fid_score = metric_fid_utils.calculate_fid(original_video_path, gen_video_path)
            
            # Calculate CSIM
            csim_score, similarities = metric_csim_utils.calculate_csim(original_video_path, gen_video_path)
            
            # Store results
            result = {
                'dataset': 'VCTK',
                'speaker_id': speaker_id,
                'tts_method': tts_method,
                'fid_score': fid_score,
                'csim_score': csim_score,
                'csim_std': np.std(similarities),
                'original_video': original_video_path,
                'generated_video': gen_video_path,
                'filename': filename
            }
            results.append(result)
            
            # Store for averaging
            key = f"VCTK_{tts_method}"
            fid_scores[key].append(fid_score)
            csim_scores[key].append(csim_score)
            speaker_scores[f"VCTK_{speaker_id}"]['fid'].append(fid_score)
            speaker_scores[f"VCTK_{speaker_id}"]['csim'].append(csim_score)
            speaker_scores[f"VCTK_{speaker_id}"]['tts_methods'].append(tts_method)
            
            print(f"  FID: {fid_score:.4f}, CSIM: {csim_score:.4f}")
            
        except Exception as e:
            print(f"Error processing {filename}: {str(e)}")
            continue

print("\n" + "=" * 50)
print("VCTK EVALUATION COMPLETE")
print("=" * 50)

# Convert to DataFrame for easier analysis
df_vctk = pd.DataFrame([r for r in results if r['dataset'] == 'VCTK'])

# Display individual results for VCTK
print("\nVCTK Individual Results:")
if len(df_vctk) > 0:
    print(df_vctk[['speaker_id', 'tts_method', 'fid_score', 'csim_score']].to_string(index=False))
else:
    print("No VCTK results found")

# Calculate averages per TTS method for VCTK
print("\n" + "=" * 50)
print("VCTK AVERAGES PER TTS METHOD")
print("=" * 50)

for tts_method in ['CosyVoice', 'XTTSv2', 'YourTTS']:
    key = f"VCTK_{tts_method}"
    if key in fid_scores and len(fid_scores[key]) > 0:
        avg_fid = np.mean(fid_scores[key])
        std_fid = np.std(fid_scores[key])
        avg_csim = np.mean(csim_scores[key])
        std_csim = np.std(csim_scores[key])
        count = len(fid_scores[key])
        
        print(f"\n{tts_method} ({count} samples):")
        print(f"  FID:  {avg_fid:.4f} ± {std_fid:.4f}")
        print(f"  CSIM: {avg_csim:.4f} ± {std_csim:.4f}")

# Calculate averages per speaker for VCTK
print("\n" + "=" * 50)
print("VCTK AVERAGES PER SPEAKER")
print("=" * 50)

vctk_speaker_keys = [k for k in speaker_scores.keys() if k.startswith('VCTK_')]
for speaker_key in sorted(vctk_speaker_keys):
    speaker_id = speaker_key.replace('VCTK_', '')
    if len(speaker_scores[speaker_key]['fid']) > 0:
        avg_fid = np.mean(speaker_scores[speaker_key]['fid'])
        std_fid = np.std(speaker_scores[speaker_key]['fid'])
        avg_csim = np.mean(speaker_scores[speaker_key]['csim'])
        std_csim = np.std(speaker_scores[speaker_key]['csim'])
        count = len(speaker_scores[speaker_key]['fid'])
        
        # Get unique TTS methods for this speaker
        tts_methods = list(set(speaker_scores[speaker_key]['tts_methods']))
        
        print(f"\n{speaker_id} ({count} samples, TTS: {', '.join(tts_methods)}):")
        print(f"  FID:  {avg_fid:.4f} ± {std_fid:.4f}")
        print(f"  CSIM: {avg_csim:.4f} ± {std_csim:.4f}")

# Create comprehensive summary tables
print("\n" + "=" * 50)
print("COMPREHENSIVE SUMMARY TABLES")
print("=" * 50)

# Convert all results to DataFrame
df_all = pd.DataFrame(results)

# Add disease category column
def get_disease_category(speaker_id):
    if speaker_id.startswith('A'):
        return 'ALS'
    elif speaker_id.startswith('S'):
        return 'Stroke'
    else:
        return 'Unknown'

df_all['disease_category'] = df_all['speaker_id'].apply(get_disease_category)

# Combined TTS Method Summary
combined_tts_summary = []
for dataset in ['RAW', 'VCTK']:
    for tts_method in ['CosyVoice', 'XTTSv2', 'YourTTS']:
        key = f"{dataset}_{tts_method}"
        if key in fid_scores and len(fid_scores[key]) > 0:
            combined_tts_summary.append({
                'Dataset': dataset,
                'TTS_Method': tts_method,
                'Count': len(fid_scores[key]),
                'FID_Mean': np.mean(fid_scores[key]),
                'FID_Std': np.std(fid_scores[key]),
                'CSIM_Mean': np.mean(csim_scores[key]),
                'CSIM_Std': np.std(csim_scores[key])
            })

combined_tts_df = pd.DataFrame(combined_tts_summary)
print("\nCombined TTS Method Summary:")
print(combined_tts_df.round(4).to_string(index=False))

# ALS/Stroke Analysis
print("\n" + "=" * 50)
print("ALS vs STROKE ANALYSIS")
print("=" * 50)

# Overall ALS vs Stroke statistics
als_results = [r for r in results if r['speaker_id'].startswith('A')]
stroke_results = [r for r in results if r['speaker_id'].startswith('S')]

if len(als_results) > 0:
    als_fid = [r['fid_score'] for r in als_results]
    als_csim = [r['csim_score'] for r in als_results]
    print(f"\nALS Patients (Overall):")
    print(f"  Total samples: {len(als_results)}")
    print(f"  FID:  {np.mean(als_fid):.4f} ± {np.std(als_fid):.4f}")
    print(f"  CSIM: {np.mean(als_csim):.4f} ± {np.std(als_csim):.4f}")

if len(stroke_results) > 0:
    stroke_fid = [r['fid_score'] for r in stroke_results]
    stroke_csim = [r['csim_score'] for r in stroke_results]
    print(f"\nStroke Patients (Overall):")
    print(f"  Total samples: {len(stroke_results)}")
    print(f"  FID:  {np.mean(stroke_fid):.4f} ± {np.std(stroke_fid):.4f}")
    print(f"  CSIM: {np.mean(stroke_csim):.4f} ± {np.std(stroke_csim):.4f}")

# ALS vs Stroke per Dataset
for dataset in ['RAW', 'VCTK']:
    dataset_als = [r for r in results if r['dataset'] == dataset and r['speaker_id'].startswith('A')]
    dataset_stroke = [r for r in results if r['dataset'] == dataset and r['speaker_id'].startswith('S')]
    
    print(f"\n{dataset} Dataset - Disease Category Analysis:")
    
    if len(dataset_als) > 0:
        als_fid = [r['fid_score'] for r in dataset_als]
        als_csim = [r['csim_score'] for r in dataset_als]
        print(f"  ALS ({len(dataset_als)} samples):")
        print(f"    FID:  {np.mean(als_fid):.4f} ± {np.std(als_fid):.4f}")
        print(f"    CSIM: {np.mean(als_csim):.4f} ± {np.std(als_csim):.4f}")
    
    if len(dataset_stroke) > 0:
        stroke_fid = [r['fid_score'] for r in dataset_stroke]
        stroke_csim = [r['csim_score'] for r in dataset_stroke]
        print(f"  Stroke ({len(dataset_stroke)} samples):")
        print(f"    FID:  {np.mean(stroke_fid):.4f} ± {np.std(stroke_fid):.4f}")
        print(f"    CSIM: {np.mean(stroke_csim):.4f} ± {np.std(stroke_csim):.4f}")

# ALS vs Stroke per TTS Method
print("\n" + "=" * 50)
print("ALS vs STROKE PER TTS METHOD")
print("=" * 50)

for tts_method in ['CosyVoice', 'XTTSv2', 'YourTTS']:
    tts_als = [r for r in results if r['tts_method'] == tts_method and r['speaker_id'].startswith('A')]
    tts_stroke = [r for r in results if r['tts_method'] == tts_method and r['speaker_id'].startswith('S')]
    
    print(f"\n{tts_method}:")
    
    if len(tts_als) > 0:
        als_fid = [r['fid_score'] for r in tts_als]
        als_csim = [r['csim_score'] for r in tts_als]
        print(f"  ALS ({len(tts_als)} samples):")
        print(f"    FID:  {np.mean(als_fid):.4f} ± {np.std(als_fid):.4f}")
        print(f"    CSIM: {np.mean(als_csim):.4f} ± {np.std(als_csim):.4f}")
    
    if len(tts_stroke) > 0:
        stroke_fid = [r['fid_score'] for r in tts_stroke]
        stroke_csim = [r['csim_score'] for r in tts_stroke]
        print(f"  Stroke ({len(tts_stroke)} samples):")
        print(f"    FID:  {np.mean(stroke_fid):.4f} ± {np.std(stroke_fid):.4f}")
        print(f"    CSIM: {np.mean(stroke_csim):.4f} ± {np.std(stroke_csim):.4f}")

# Create comprehensive summary table for research paper
print("\n" + "=" * 50)
print("RESEARCH PAPER SUMMARY TABLE")
print("=" * 50)

# Create a comprehensive summary for the research table
research_summary = []

# For each dataset and TTS combination
for dataset in ['RAW', 'VCTK']:
    for tts_method in ['CosyVoice', 'XTTSv2', 'YourTTS']:
        # Overall for this combination
        combo_results = [r for r in results if r['dataset'] == dataset and r['tts_method'] == tts_method]
        
        if len(combo_results) > 0:
            # ALS results
            als_combo = [r for r in combo_results if r['speaker_id'].startswith('A')]
            # Stroke results  
            stroke_combo = [r for r in combo_results if r['speaker_id'].startswith('S')]
            
            # Add ALS row
            if len(als_combo) > 0:
                als_fid = [r['fid_score'] for r in als_combo]
                als_csim = [r['csim_score'] for r in als_combo]
                research_summary.append({
                    'Dataset': dataset,
                    'TTS_Method': tts_method,
                    'Disease': 'ALS',
                    'Count': len(als_combo),
                    'FID_Mean': np.mean(als_fid),
                    'FID_Std': np.std(als_fid),
                    'CSIM_Mean': np.mean(als_csim),
                    'CSIM_Std': np.std(als_csim)
                })
            
            # Add Stroke row
            if len(stroke_combo) > 0:
                stroke_fid = [r['fid_score'] for r in stroke_combo]
                stroke_csim = [r['csim_score'] for r in stroke_combo]
                research_summary.append({
                    'Dataset': dataset,
                    'TTS_Method': tts_method,
                    'Disease': 'Stroke',
                    'Count': len(stroke_combo),
                    'FID_Mean': np.mean(stroke_fid),
                    'FID_Std': np.std(stroke_fid),
                    'CSIM_Mean': np.mean(stroke_csim),
                    'CSIM_Std': np.std(stroke_csim)
                })

research_df = pd.DataFrame(research_summary)
print("\nComprehensive Research Summary (Dataset x TTS x Disease):")
print(research_df.round(4).to_string(index=False))

# Save extended results
research_output_path = "/home/is/lathifgalih-k/research_naist/multimodal-clone/clone-video/edtalk-exp/evaluation_results_edtalk_detailed.csv"
research_df.to_csv(research_output_path, index=False)
print(f"\nResearch summary saved to: {research_output_path}")

# Overall statistics per dataset
print("\n" + "=" * 50)
print("OVERALL STATISTICS PER DATASET")
print("=" * 50)

for dataset in ['RAW', 'VCTK']:
    dataset_results = [r for r in results if r['dataset'] == dataset]
    if len(dataset_results) > 0:
        dataset_fid = [r['fid_score'] for r in dataset_results]
        dataset_csim = [r['csim_score'] for r in dataset_results]
        
        print(f"\n{dataset} Dataset:")
        print(f"  Total samples: {len(dataset_results)}")
        print(f"  FID:  {np.mean(dataset_fid):.4f} ± {np.std(dataset_fid):.4f}")
        print(f"  CSIM: {np.mean(dataset_csim):.4f} ± {np.std(dataset_csim):.4f}")

# Save results to CSV
output_path = "/home/is/lathifgalih-k/research_naist/multimodal-clone/clone-video/edtalk-exp/evaluation_results_edtalk.csv"
df_all.to_csv(output_path, index=False)
print(f"\nDetailed EDTalk results saved to: {output_path}")

print("\nEDTalk evaluation completed successfully!")


Starting EDTalk VCTK evaluation...
Processing VCTK: A008 - YourTTS
Extracted 150 frames from /home/is/lathifgalih-k/research_naist/multimodal-clone/clone-video/muse-exp/MuseTalk/data/video/als-input/A008_02_BBP_NORMAL_color_25fps_6sec.mp4
Extracted 79 frames from /home/is/lathifgalih-k/research_naist/multimodal-clone/clone-video/edtalk-exp/EDTalk/results/test_vctk/v15/VCTK/A008_02_BBP_NORMAL_color_25fps_6sec_YT_A008_cloned_512.mp4
Resampling frames to 79
FID score: 34.023529052734375
Extracting embeddings from real video...
Processed 30 frames...
Processed 60 frames...
Processed 90 frames...
Processed 120 frames...
Processed 150 frames...
Extracted 150 face embeddings from /home/is/lathifgalih-k/research_naist/multimodal-clone/clone-video/muse-exp/MuseTalk/data/video/als-input/A008_02_BBP_NORMAL_color_25fps_6sec.mp4
Extracting embeddings from generated video...
Processed 30 frames...
Processed 60 frames...
Extracted 79 face embeddings from /home/is/lathifgalih-k/research_naist/multimo

In [9]:
# Short Disease-based Evaluation Script
import pandas as pd
import numpy as np
import os
import glob

def load_evaluation_results():
    """Load all existing evaluation CSV files"""
    results = {}
    
    # MuseTalk results
    musetalk_raw = "/home/is/lathifgalih-k/research_naist/multimodal-clone/clone-video/muse-exp/evaluation_results_raw.csv"
    musetalk_vctk = "/home/is/lathifgalih-k/research_naist/multimodal-clone/clone-video/muse-exp/evaluation_results_vctk.csv"
    
    # EDTalk results
    edtalk_results = "/home/is/lathifgalih-k/research_naist/multimodal-clone/clone-video/edtalk-exp/evaluation_results_edtalk.csv"
    
    # Load MuseTalk RAW
    if os.path.exists(musetalk_raw):
        df = pd.read_csv(musetalk_raw)
        df['model'] = 'MuseTalk'
        df['dataset'] = 'RAW'
        results['musetalk_raw'] = df
    
    # Load MuseTalk VCTK
    if os.path.exists(musetalk_vctk):
        df = pd.read_csv(musetalk_vctk)
        df['model'] = 'MuseTalk'
        df['dataset'] = 'VCTK'
        results['musetalk_vctk'] = df
    
    # Load EDTalk (contains both RAW and VCTK)
    if os.path.exists(edtalk_results):
        df = pd.read_csv(edtalk_results)
        df['model'] = 'EDTalk'
        results['edtalk'] = df
    
    return results

def get_disease_category(speaker_id):
    """Determine disease category from speaker ID"""
    if speaker_id.startswith('A'):
        return 'ALS'
    elif speaker_id.startswith('S'):
        return 'Stroke'
    else:
        return 'Unknown'

def calculate_disease_metrics(df, model_name, dataset_name):
    """Calculate FID and CSIM metrics per disease category"""
    summary_data = []
    
    # Add disease category
    df['disease'] = df['speaker_id'].apply(get_disease_category)
    
    # TTS methods to evaluate
    tts_methods = ['CosyVoice', 'XTTSv2', 'YourTTS']
    
    for disease in ['ALS', 'Stroke']:
        disease_data = df[df['disease'] == disease]
        
        if len(disease_data) > 0:
            for tts_method in tts_methods:
                tts_data = disease_data[disease_data['tts_method'] == tts_method]
                
                if len(tts_data) > 0:
                    summary_data.append({
                        'Model': model_name,
                        'Dataset': dataset_name,
                        'TTS_Method': tts_method,
                        'Disease': disease,
                        'Count': len(tts_data),
                        'FID_Mean': tts_data['fid_score'].mean(),
                        'FID_Std': tts_data['fid_score'].std(),
                        'CSIM_Mean': tts_data['csim_score'].mean(),
                        'CSIM_Std': tts_data['csim_score'].std()
                    })
    
    return summary_data

def main_evaluation():
    """Main function to generate disease-based evaluation"""
    print("=" * 80)
    print("DISEASE-BASED EVALUATION (ALS vs STROKE)")
    print("=" * 80)
    
    # Load all results
    results = load_evaluation_results()
    
    all_summaries = []
    
    # Process MuseTalk RAW
    if 'musetalk_raw' in results:
        print("\nProcessing MuseTalk RAW...")
        summaries = calculate_disease_metrics(results['musetalk_raw'], 'MuseTalk', 'RAW')
        all_summaries.extend(summaries)
        print(f"Generated {len(summaries)} entries")
    
    # Process MuseTalk VCTK
    if 'musetalk_vctk' in results:
        print("\nProcessing MuseTalk VCTK...")
        summaries = calculate_disease_metrics(results['musetalk_vctk'], 'MuseTalk', 'VCTK')
        all_summaries.extend(summaries)
        print(f"Generated {len(summaries)} entries")
    
    # Process EDTalk (both RAW and VCTK)
    if 'edtalk' in results:
        df_edtalk = results['edtalk']
        
        # EDTalk RAW
        df_raw = df_edtalk[df_edtalk['dataset'] == 'RAW']
        if len(df_raw) > 0:
            print("\nProcessing EDTalk RAW...")
            summaries = calculate_disease_metrics(df_raw, 'EDTalk', 'RAW')
            all_summaries.extend(summaries)
            print(f"Generated {len(summaries)} entries")
        
        # EDTalk VCTK
        df_vctk = df_edtalk[df_edtalk['dataset'] == 'VCTK']
        if len(df_vctk) > 0:
            print("\nProcessing EDTalk VCTK...")
            summaries = calculate_disease_metrics(df_vctk, 'EDTalk', 'VCTK')
            all_summaries.extend(summaries)
            print(f"Generated {len(summaries)} entries")
    
    # Create final summary DataFrame
    if all_summaries:
        final_df = pd.DataFrame(all_summaries)
        
        # Sort for better readability
        final_df = final_df.sort_values(['Model', 'Dataset', 'Disease', 'TTS_Method'])
        
        print("\n" + "=" * 80)
        print("FINAL RESEARCH TABLE - DISEASE-BASED METRICS")
        print("=" * 80)
        print(final_df.round(4).to_string(index=False))
        
        # Save to CSV
        output_path = "/home/is/lathifgalih-k/research_naist/multimodal-clone/clone-video/disease_based_evaluation_summary.csv"
        final_df.to_csv(output_path, index=False)
        print(f"\nResults saved to: {output_path}")
        
        # Generate summary statistics for research paper
        print("\n" + "=" * 80)
        print("RESEARCH PAPER SUMMARY")
        print("=" * 80)
        
        for model in ['MuseTalk', 'EDTalk']:
            model_data = final_df[final_df['Model'] == model]
            if len(model_data) > 0:
                print(f"\n{model}:")
                
                for dataset in ['RAW', 'VCTK']:
                    dataset_data = model_data[model_data['Dataset'] == dataset]
                    if len(dataset_data) > 0:
                        print(f"  {dataset} Dataset:")
                        
                        for disease in ['ALS', 'Stroke']:
                            disease_data = dataset_data[dataset_data['Disease'] == disease]
                            if len(disease_data) > 0:
                                avg_fid = disease_data['FID_Mean'].mean()
                                avg_csim = disease_data['CSIM_Mean'].mean()
                                total_samples = disease_data['Count'].sum()
                                
                                print(f"    {disease} (Total: {total_samples} samples):")
                                print(f"      Avg FID across TTS:  {avg_fid:.4f}")
                                print(f"      Avg CSIM across TTS: {avg_csim:.4f}")
        
        # Create table format for easy copy-paste
        print("\n" + "=" * 80)
        print("TABLE FORMAT FOR RESEARCH PAPER")
        print("=" * 80)
        
        # Group by Model and Dataset for table format
        table_data = []
        for model in ['MuseTalk', 'EDTalk']:
            for dataset in ['RAW', 'VCTK']:
                model_dataset_data = final_df[(final_df['Model'] == model) & (final_df['Dataset'] == dataset)]
                
                if len(model_dataset_data) > 0:
                    # Calculate average across all TTS methods for each disease
                    als_data = model_dataset_data[model_dataset_data['Disease'] == 'ALS']
                    stroke_data = model_dataset_data[model_dataset_data['Disease'] == 'Stroke']
                    
                    row = {
                        'Dataset': dataset,
                        'Model': model,
                        'ALS_FID': als_data['FID_Mean'].mean() if len(als_data) > 0 else 0,
                        'ALS_CSIM': als_data['CSIM_Mean'].mean() if len(als_data) > 0 else 0,
                        'ALS_Count': als_data['Count'].sum() if len(als_data) > 0 else 0,
                        'Stroke_FID': stroke_data['FID_Mean'].mean() if len(stroke_data) > 0 else 0,
                        'Stroke_CSIM': stroke_data['CSIM_Mean'].mean() if len(stroke_data) > 0 else 0,
                        'Stroke_Count': stroke_data['Count'].sum() if len(stroke_data) > 0 else 0
                    }
                    table_data.append(row)
        
        table_df = pd.DataFrame(table_data)
        if len(table_df) > 0:
            print("\nSIMPLIFIED TABLE (Averaged across TTS methods):")
            print(table_df.round(4).to_string(index=False))
            
            # Save simplified table
            simple_output_path = "/home/is/lathifgalih-k/research_naist/multimodal-clone/clone-video/simplified_disease_table.csv"
            table_df.to_csv(simple_output_path, index=False)
            print(f"\nSimplified table saved to: {simple_output_path}")
    
    else:
        print("No evaluation results found!")

# Run the evaluation
main_evaluation()

DISEASE-BASED EVALUATION (ALS vs STROKE)

Processing MuseTalk RAW...
Generated 1 entries

Processing MuseTalk VCTK...
Generated 6 entries

Processing EDTalk RAW...
Generated 6 entries

Processing EDTalk VCTK...
Generated 6 entries

FINAL RESEARCH TABLE - DISEASE-BASED METRICS
   Model Dataset TTS_Method Disease  Count  FID_Mean  FID_Std  CSIM_Mean  CSIM_Std
  EDTalk     RAW  CosyVoice     ALS      9   82.2521  87.5959     0.8655    0.0660
  EDTalk     RAW     XTTSv2     ALS      9   78.0505  82.9109     0.8661    0.0681
  EDTalk     RAW    YourTTS     ALS      9   81.4656  84.8039     0.8709    0.0645
  EDTalk     RAW  CosyVoice  Stroke     11   68.8020  20.5776     0.8896    0.0371
  EDTalk     RAW     XTTSv2  Stroke     11   68.6904  19.6504     0.8876    0.0370
  EDTalk     RAW    YourTTS  Stroke     11   70.3209  21.2369     0.8914    0.0384
  EDTalk    VCTK  CosyVoice     ALS      9   77.7140  79.6911     0.8467    0.0762
  EDTalk    VCTK     XTTSv2     ALS      9   79.8080  82.38

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['disease'] = df['speaker_id'].apply(get_disease_category)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['disease'] = df['speaker_id'].apply(get_disease_category)
