In [1]:
# load df from pickle file
import pandas as pd
# Locations of the pickled tables (absolute paths on the analysis machine).
vtc_pickle_path = '/home/nele_pauline_suffo/outputs/vtc/quantex_share_vtc_output.pkl'
annotations_pickle_path = '/home/nele_pauline_suffo/ProcessedData/annotations_superannotate/quantex_share_annotations.pkl'

# NOTE: unpickling can execute arbitrary code, so only load files you trust.
vtc_share_output = pd.read_pickle(vtc_pickle_path)
annotations_output = pd.read_pickle(annotations_pickle_path)

In [14]:
# Preview the first five rows of the annotation table.
annotations_output.head(n=5)

Unnamed: 0,audio_file_name,Utterance_Start,Utterance_Duration,Voice_type,Utterance_End
0,442279,0.942,1.243,FEM,2.185
1,442279,2.245,1.129,KCHI,3.374
2,442279,3.714,1.919,FEM,5.633
3,442279,5.65,3.044,KCHI,8.694
4,442279,8.829,1.184,FEM,10.013


In [15]:
# Preview the first five rows of the VTC prediction table.
vtc_share_output.head(n=5)

Unnamed: 0,audio_file_name,Utterance_Start,Utterance_Duration,Voice_type,Utterance_End
0,100898,0.231,4.411,SPEECH,4.642
1,100898,0.251,1.453,KCHI,1.704
2,100898,1.011,0.255,CHI,1.266
3,100898,1.851,1.774,FEM,3.625
4,100898,3.731,0.78,KCHI,4.511


In [None]:
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support

def time_overlap(start_pred, end_pred, start_annot, end_annot, tolerance=0.0):
    """
    Tell whether a predicted interval and an annotated interval intersect.

    The predicted interval is widened by `tolerance` (a collar) on both sides
    before the comparison. Intervals are treated as closed, so two intervals
    that merely touch at an endpoint count as overlapping.

    Returns True when the (widened) prediction overlaps the annotation,
    False otherwise.
    """
    # Widen the prediction by the collar on each side.
    widened_start = start_pred - tolerance
    widened_end = end_pred + tolerance

    # The prediction finishes strictly before the annotation begins.
    if widened_end < start_annot:
        return False

    # The prediction begins strictly after the annotation finishes.
    if widened_start > end_annot:
        return False

    return True


# Evaluate the VTC predictions against the annotations, per voice type.
#
# Matching rule (segment level):
#   * a prediction is a true positive when it overlaps at least one annotation
#     of the SAME voice type in the same audio file, otherwise it is a false
#     positive;
#   * an annotation is a false negative when no prediction of the same voice
#     type in the same audio file overlaps it.
# Precision is therefore measured over predictions and recall over annotations.
# Only audio files present in both tables are evaluated.

# Give every segment a stable id. The merge below pairs each prediction with
# every annotation of its file, so ids are required to count a segment once
# regardless of how many partners it was paired with.
predictions = vtc_share_output.assign(pred_id=range(len(vtc_share_output)))
annotations = annotations_output.assign(annot_id=range(len(annotations_output)))

# Merge predictions and annotations on `audio_file_name`
merged = pd.merge(
    predictions,
    annotations,
    on='audio_file_name',
    how='left',  # Keeps all predictions, adds NaN for missing annotations
    suffixes=('_pred', '_annot')
)

# Drop rows where there are no annotations available
paired_data = merged.dropna(subset=['Utterance_Start_annot'])  # Keeps only matched rows

# Ensure all unique voice types are included (missing labels are ignored)
all_classes = (
    set(paired_data['Voice_type_pred'].dropna().unique())
    | set(paired_data['Voice_type_annot'].dropna().unique())
)

# Per-class segment totals, each prediction / annotation counted exactly once
pred_totals = paired_data.drop_duplicates('pred_id')['Voice_type_pred'].value_counts()
annot_totals = paired_data.drop_duplicates('annot_id')['Voice_type_annot'].value_counts()

# Only pairs whose voice types agree can be matches; test those for overlap
same_class_pairs = paired_data[paired_data['Voice_type_pred'] == paired_data['Voice_type_annot']]
is_match = pd.Series(
    [
        time_overlap(start_pred, end_pred, start_annot, end_annot)
        for start_pred, end_pred, start_annot, end_annot in zip(
            same_class_pairs['Utterance_Start_pred'],
            same_class_pairs['Utterance_End_pred'],
            same_class_pairs['Utterance_Start_annot'],
            same_class_pairs['Utterance_End_annot'],
        )
    ],
    index=same_class_pairs.index,
    dtype=bool,
)
matched_pairs = same_class_pairs[is_match]

# Number of distinct predictions / annotations that found a partner, per class
matched_pred_totals = matched_pairs.drop_duplicates('pred_id')['Voice_type_pred'].value_counts()
matched_annot_totals = matched_pairs.drop_duplicates('annot_id')['Voice_type_annot'].value_counts()

# Initialize placeholders for binary metrics per class
results = {}

# Sorted so that the report order is the same on every run.
# NOTE: a class that occurs on one side only (e.g. predicted but never
# annotated) gets F1 = 0 and lowers the macro average.
for voice_type in sorted(all_classes, key=str):
    n_pred = int(pred_totals.get(voice_type, 0))
    n_annot = int(annot_totals.get(voice_type, 0))

    # Matched predictions drive precision, matched annotations drive recall
    tp_pred = int(matched_pred_totals.get(voice_type, 0))
    tp_annot = int(matched_annot_totals.get(voice_type, 0))
    fp = n_pred - tp_pred    # predictions without a same-class overlapping annotation
    fn = n_annot - tp_annot  # annotations without a same-class overlapping prediction

    # Compute precision, recall, F1 score for the current class
    precision = tp_pred / (tp_pred + fp) if tp_pred + fp > 0 else 0.0
    recall = tp_annot / (tp_annot + fn) if tp_annot + fn > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0

    # Store results
    results[voice_type] = {'precision': precision, 'recall': recall, 'f1': f1}

# Calculate macro F1 score (0.0 when there was nothing to evaluate)
f1_scores = [class_metrics['f1'] for class_metrics in results.values()]
macro_f1 = sum(f1_scores) / len(f1_scores) if f1_scores else 0.0

# Output results
print("Class-wise Metrics:")
for voice_type, metrics in results.items():
    print(f"{voice_type}: Precision={metrics['precision']:.3f}, Recall={metrics['recall']:.3f}, F1={metrics['f1']:.3f}")

print(f"\nMacro F1 Score: {macro_f1:.3f}")

  from scipy.sparse.base import spmatrix
