# Audio Feature Analyses SIM 1

In [1]:
import librosa
import numpy as np
import pandas as pd
from scipy.signal import correlate
from sklearn.neighbors import KNeighborsClassifier

from scripts.load_data import check_and_load
from scripts.extract_audio_features import extract_zcr, extract_loudness, extract_rhythm, create_target_variable
from scripts.nested_cv import partition_feature_df, nested_cross_validation

## Data Loading

In [2]:
# Input/output locations for the full-length episodes.
# To run on the trimmed clips instead, point all four paths at
# "../ground_truth_data/trimmed_videos" (with "/frames" and "/audio" appended
# for the two output directories).
data_path = "../ground_truth_data"
annotations_path = "../ground_truth_data"
frames_output_dir = "../ground_truth_data/frames"
audio_output_dir = "../ground_truth_data/audio"

# Video file name -> ground-truth annotation CSV.
# Insertion order matters: the position of a video in this dict is later used
# as its video index.
muppet_files = dict([
    ("Muppets-02-01-01.avi", "GroundTruth_Muppets-02-01-01.csv"),
    ("Muppets-02-04-04.avi", "GroundTruth_Muppets-02-04-04.csv"),
    ("Muppets-03-04-03.avi", "GroundTruth_Muppets-03-04-03.csv"),
])

In [3]:
# Load annotations, audio and frames for every configured video.
# TODO: add code to overwrite frames or audio in case only one of them exists.
annotations, audio_data, frames = check_and_load(
    data_path,
    frames_output_dir,
    audio_output_dir,
    annotations_path,
    muppet_files,
)

Frames and audio are already extracted.
Loading audio segments...
Loaded 3 audio files.
Loaded audio segments for 3 videos.
Loaded frames for 3 videos.
Number of videos with frames: 3
Video 0 has 38681 frames.
Video 1 has 38706 frames.
Video 2 has 38498 frames.


## Feature Engineering

Loudness

    - Description: Measures overall signal strength through RMS of sample amplitudes.
    - Use Case: Effective when differentiating characters based on the volume or energy of their speech or sound.

Fundamental Frequency

    - Description: Estimates the pitch of the audio signal via the zero-crossing rate (ZCR), which serves as a coarse proxy for the fundamental frequency.
    - Use Case: Useful for identifying characters with distinct pitch or tonal qualities in their voices (e.g., Kermit's high-pitched voice).

Rhythm Detection

    - Description: Uses autocorrelation to find repeating patterns across frames. Statistical moments indicate the presence of rhythm.
    - Use Case: Ideal for characters with unique speech cadences or rhythmic patterns (e.g., the conversational style of Waldorf and Statler).

In [4]:
# Run the three extractors on the loaded audio, in the order ZCR -> loudness -> rhythm.
zcr_features, loudness_features, rhythm_features = (
    extract_zcr(audio_data),
    extract_loudness(audio_data),
    extract_rhythm(audio_data),
)

In [5]:
# Flatten the per-video feature sequences into one row per (video, frame).
# Rows are gathered in a separate list and converted once at the end, so
# `extracted_features_df` is only ever bound to a DataFrame.
feature_columns = ["video_idx", "frame_idx", "loudness_rms", "zcr", "rhythm"]
feature_records = []

# audio_data stays in the zip so the iteration length is unchanged; its entries
# themselves are not needed here.
for video_idx, (_, zcr, loudness, rhythm) in enumerate(
    zip(audio_data, zcr_features, loudness_features, rhythm_features)
):
    # Skip a video when any extractor produced None; video_idx still advances,
    # so the remaining videos keep their original index.
    if zcr is None or loudness is None or rhythm is None:
        continue

    # Use the shortest sequence so every row carries all three features.
    num_frames = min(len(zcr), len(loudness), len(rhythm))
    feature_records.extend(
        {
            "video_idx": video_idx,
            "frame_idx": frame_idx,
            "loudness_rms": loudness[frame_idx],
            "zcr": zcr[frame_idx],
            "rhythm": rhythm[frame_idx],
        }
        for frame_idx in range(num_frames)
    )

# Passing the column list keeps the expected schema even when no rows were collected.
extracted_features_df = pd.DataFrame(feature_records, columns=feature_columns)
print(extracted_features_df.shape)

(115885, 5)


In [6]:
# Print the feature-table shape per video. The row counts are expected to equal
# the frame counts reported while loading:
#   video 0: 38681, video 1: 38706, video 2: 38498.
for vid in extracted_features_df["video_idx"].unique():
    is_current_video = extracted_features_df["video_idx"] == vid
    rows_of_video = extracted_features_df[is_current_video]
    print(f"Video {vid}: {rows_of_video.shape}")


Video 0: (38681, 5)
Video 1: (38706, 5)
Video 2: (38498, 5)


## Model Prep

In [7]:
# Map each video file name to its position in `muppet_files`.
# NOTE(review): this assumes the per-video features were produced in the same
# order as `muppet_files` - confirm against check_and_load.
video_idx_map = {filename: idx for idx, filename in enumerate(muppet_files)}

label_columns = ["Kermit", "Audio_StatlerWaldorf"]
ground_truth_columns = ["video_idx", "frame_idx", *label_columns]

# Build the label table column-wise per video instead of iterating row by row:
# this avoids the per-row overhead of iterrows() and keeps the original column
# dtypes (iterrows() converts each row to a single common dtype).
# A missing annotation column or an unknown file name still raises KeyError.
per_video_labels = [
    annotation_df[["Frame_number", *label_columns]]
    .rename(columns={"Frame_number": "frame_idx"})
    .assign(video_idx=video_idx_map[video_filename])
    for video_filename, annotation_df in annotations.items()
]

if per_video_labels:
    ground_truth_df = pd.concat(per_video_labels, ignore_index=True)[ground_truth_columns]
else:
    # pd.concat refuses an empty list; fall back to an empty table with the expected columns.
    ground_truth_df = pd.DataFrame(columns=ground_truth_columns)


In [8]:
# Row/column counts of the label table and the feature table, in that order.
for table in (ground_truth_df, extracted_features_df):
    print(table.shape)

(115885, 4)
(115885, 5)


In [9]:
# Merge features with ground truth
feature_df = pd.merge(extracted_features_df, ground_truth_df, on=['video_idx', 'frame_idx'], how='left')
feature_df.shape

(115885, 7)

In [10]:
# Unused manual split points (video index -> frame index), kept for reference:
# split_points = {
#     0: 19716,  # Video 0
#     1: 19719,  # Video 1
#     2: 19432, # Video 2 
# }

# Partition the merged table; expects feature_df to contain the video_idx and
# frame_idx columns.
# NOTE(review): judging by the outputs of the following cells, the returned frame
# carries an extra `fold` column and split_overview lists the label totals per
# fold - confirm against partition_feature_df.
# Caution: feature_df is rebound to the result, so re-running this cell feeds the
# already partitioned frame back into the function.
feature_df, split_overview = partition_feature_df(feature_df)

In [11]:
# Preview the first five rows of the partitioned table.
feature_df.head(5)

Unnamed: 0,video_idx,frame_idx,loudness_rms,zcr,rhythm,Kermit,Audio_StatlerWaldorf,fold
0,0,0,0.0,0.0,0.0,0,0,0-A
1,0,1,0.0,0.0,0.0,0,0,0-A
2,0,2,0.0,0.0,0.0,0,0,0-A
3,0,3,0.0,0.0,0.0,0,0,0-A
4,0,4,0.0,0.0,0.0,0,0,0-A


In [12]:
# Display the results
print(split_overview)

feature_df['fold'].unique()

   video_idx fold  Kermit  Audio_StatlerWaldorf
0          0  0-A    2916                  1015
1          0  0-B    2533                   399
2          1  1-A    3925                   542
3          1  1-B    8327                   282
4          2  2-A    5231                   171
5          2  2-B   10277                   308


array(['0-A', '0-B', '1-A', '1-B', '2-A', '2-B'], dtype=object)

In [13]:
# Check for invalid values
print("NaN in features:", feature_df[['loudness_rms', 'rhythm', 'zcr']].isnull().sum())
print()
print("Infinite values in features:", np.isinf(feature_df[['loudness_rms', 'rhythm', 'zcr']]).sum())


NaN in features: loudness_rms    0
rhythm          0
zcr             0
dtype: int64

Infinite values in features: loudness_rms    0
rhythm          0
zcr             0
dtype: int64


In [14]:
# # Add the target variable to feature_df
# feature_df['target'] = create_target_variable(feature_df)

# print("Unique target values:", feature_df['target'].unique())
# Note: Statler/Waldorf and Kermit are never labelled 1 on the same frame, so the target only takes the values [0, 1, 2].

## Nested CV - KNN

### Kermit

In [16]:
# KNeighborsClassifier is imported in the notebook's import cell at the top.

# Hyper-parameter candidates handed to the nested cross-validation.
param_grid = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

# Model inputs and the binary label to predict.
train_cols = ['loudness_rms', 'rhythm', 'zcr']
target_col = 'Kermit'

# The estimator class itself (not an instance) is passed on, so no KNN object
# needs to be created in this cell.
results_kermit, summary_kermit, best_model_kermit = nested_cross_validation(
    feature_df, train_cols, target_col, KNeighborsClassifier, param_grid
)

Outer Fold: 0-A
  Inner Validation Fold: 0-B
  Inner Validation Fold: 1-A
  Inner Validation Fold: 1-B
  Inner Validation Fold: 2-A
  Inner Validation Fold: 2-B
Metrics for Fold 0-A: {'outer_fold': '0-A', 'accuracy': 0.6859403530127814, 'precision': np.float64(0.7534524776829667), 'recall': np.float64(0.6859403530127814), 'f1': np.float64(0.7152562584077999), 'roc_auc': np.float64(0.5273406472499836)}
Outer Fold: 0-B
  Inner Validation Fold: 0-A
  Inner Validation Fold: 1-A
  Inner Validation Fold: 1-B
  Inner Validation Fold: 2-A
  Inner Validation Fold: 2-B
Metrics for Fold 0-B: {'outer_fold': '0-B', 'accuracy': 0.686527814394938, 'precision': np.float64(0.7720901048139849), 'recall': np.float64(0.686527814394938), 'f1': np.float64(0.7230335517999853), 'roc_auc': np.float64(0.5170890064200269)}
Outer Fold: 1-A
  Inner Validation Fold: 0-A
  Inner Validation Fold: 0-B
  Inner Validation Fold: 1-B
  Inner Validation Fold: 2-A
  Inner Validation Fold: 2-B
Metrics for Fold 1-A: {'outer_f

In [17]:
# Best estimator per outer fold from the Kermit run. The notebook never defines
# a plain `best_model`; the nested CV result is stored as `best_model_kermit`.
best_model_kermit

{'0-A': KNeighborsClassifier(metric='euclidean', n_neighbors=3, weights='distance'),
 '0-B': KNeighborsClassifier(metric='euclidean', n_neighbors=3, weights='distance'),
 '1-A': KNeighborsClassifier(metric='euclidean', n_neighbors=3, weights='distance'),
 '1-B': KNeighborsClassifier(metric='euclidean', n_neighbors=3, weights='distance'),
 '2-A': KNeighborsClassifier(metric='euclidean', n_neighbors=3, weights='distance'),
 '2-B': KNeighborsClassifier(metric='euclidean', n_neighbors=3, weights='distance')}

In [53]:
#  TODO:
#     - rewrite this so that a clean train/test split is created right away
#     - run once for Kermit and once for the old men (presumably Statler & Waldorf)
#     - add the features from SIM 2
#     - for each target, compare KNN (hedger) vs RF (separator)



In [None]:
# Hyper-parameter candidates handed to the nested cross-validation.
param_grid = {
    'n_neighbors': [3, 5, 7, 9],  # Ensure these are smaller than the smallest fold size
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']  # Supported metrics
}

# Same model inputs as for Kermit, but predicting the Statler/Waldorf audio label.
train_cols = ['loudness_rms', 'rhythm', 'zcr']
target_col = 'Audio_StatlerWaldorf'

# The estimator class itself (not an instance) is passed on, so no KNN object
# needs to be created in this cell.
results_statler, summary_statler, best_models_statler = nested_cross_validation(
    feature_df, train_cols, target_col, KNeighborsClassifier, param_grid
)

## Evaluation