# Audio Feature Analyses SIM 1

In [1]:
import librosa
import numpy as np
import pandas as pd


from scripts.load_data import check_and_load
from scripts.extract_audio_features import extract_zcr, extract_loudness, extract_rhythm, create_target_variable
from scipy.signal import correlate

## Data Loading

In [2]:
# # Define paths
# data_path = "../ground_truth_data/trimmed_videos"
# frames_output_dir = "../ground_truth_data/trimmed_videos/frames"
# audio_output_dir = "../ground_truth_data/trimmed_videos/audio"
# annotations_path = "../ground_truth_data/trimmed_videos"

data_path = "../ground_truth_data"
frames_output_dir = "../ground_truth_data/frames"
audio_output_dir = "../ground_truth_data/audio"
annotations_path = "../ground_truth_data"

muppet_files = {
    "Muppets-02-01-01.avi": "GroundTruth_Muppets-02-01-01.csv",
    "Muppets-02-04-04.avi": "GroundTruth_Muppets-02-04-04.csv",
    "Muppets-03-04-03.avi": "GroundTruth_Muppets-03-04-03.csv",
}

In [3]:
# TODO: add code to overwrite frames or audio in case only one exists
annotations, audio_data, frames = check_and_load(data_path, frames_output_dir, audio_output_dir, annotations_path, muppet_files)

Frames and/or audio not extracted. Running setup script...
Loaded annotations for 'Muppets-02-01-01.avi'.
Loaded annotations for 'Muppets-02-04-04.avi'.
Loaded annotations for 'Muppets-03-04-03.avi'.


Extracting frames from Muppets-02-01-01.avi:   7%|▋         | 2854/38681 [00:53<16:20, 36.55it/s] 

KeyboardInterrupt: 

## Feature Engineering

Loudness

    - Description: Measures overall signal strength through RMS of sample amplitudes.
    - Use Case: Effective when differentiating characters based on the volume or energy of their speech or sound.

Fundamental Frequency

    - Description: Extracts the pitch of the audio signal using methods like zero-crossing rate (ZCR).
    - Use Case: Useful for identifying characters with distinct pitch or tonal qualities in their voices (e.g., Kermit's high-pitched voice).

Rhythm Detection

    - Description: Uses autocorrelation to find repeating patterns across frames. Statistical moments indicate the presence of rhythm.
    - Use Case: Ideal for characters with unique speech cadences or rhythmic patterns (e.g., the conversational style of Waldorf and Statler).

In [4]:
# Calculate ZCR features
zcr_features = extract_zcr(audio_data)

# Extract loudness features
loudness_features = extract_loudness(audio_data)

# Extract rhythm features
rhythm_features = extract_rhythm(audio_data)

In [5]:
extracted_features_df = []

for video_idx, (audio_entry, zcr, loudness, rhythm) in enumerate(zip(audio_data, zcr_features, loudness_features, rhythm_features)):
    if zcr is not None and loudness is not None and rhythm is not None:
        num_frames = min(len(zcr), len(loudness), len(rhythm))
        for frame_idx in range(num_frames):
            extracted_features_df.append({
                "video_idx": video_idx,
                "frame_idx": frame_idx,
                "loudness_rms": loudness[frame_idx],
                "zcr": zcr[frame_idx],
                "rhythm": rhythm[frame_idx]
            })

extracted_features_df = pd.DataFrame(extracted_features_df)
print(extracted_features_df.shape)

(115852, 5)


## Model Prep

In [6]:
# Create a mapping from filenames to video indices
video_idx_map = {filename: idx for idx, filename in enumerate(muppet_files.keys())}

# Prepare ground truth data with corrected video_idx
ground_truth_data = []
for video_filename, annotation_df in annotations.items():
    video_idx = video_idx_map[video_filename]  # Map video filename to its index
    for _, row in annotation_df.iterrows():
        ground_truth_data.append({
            'video_idx': video_idx,  # Use mapped video index
            'frame_idx': row['Frame_number'],  # Assuming Frame_number exists
            'Kermit': row['Kermit'],  # Assuming Kermit is a column in the annotation
            'Audio_StatlerWaldorf': row['Audio_StatlerWaldorf']  # Assuming this column exists
        })

# Create a DataFrame for ground truth
ground_truth_df = pd.DataFrame(ground_truth_data)


In [7]:
# Merge features with ground truth
feature_df = pd.merge(extracted_features_df, ground_truth_df, on=['video_idx', 'frame_idx'], how='left')

In [8]:
# Add the target variable to feature_df
feature_df['target'] = create_target_variable(feature_df)

In [9]:
# Example Data (Replace with your feature_df data)
X = feature_df[['loudness_rms', 'rhythm', 'zcr']].values
y = feature_df['target'].values  # Replace 'Kermit' with your target variable

In [10]:
# Split into train and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1120)


In [11]:
print("Unique target values:", feature_df['target'].unique())

Unique target values: [0 1 2]


In [12]:
# Check for missing values
print(feature_df.isnull().sum())


video_idx               0
frame_idx               0
loudness_rms            0
zcr                     0
rhythm                  0
Kermit                  0
Audio_StatlerWaldorf    0
target                  0
dtype: int64


In [13]:
# Check for invalid values
print("NaN in features:", feature_df[['loudness_rms', 'rhythm', 'zcr']].isnull().sum())
print("Infinite values in features:", np.isinf(feature_df[['loudness_rms', 'rhythm', 'zcr']]).sum())


NaN in features: loudness_rms    0
rhythm          0
zcr             0
dtype: int64
Infinite values in features: loudness_rms    0
rhythm          0
zcr             0
dtype: int64


In [32]:
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    average_precision_score,
    accuracy_score,
    roc_auc_score,
    classification_report
)
from sklearn.linear_model import LogisticRegression


## KNN

In [24]:
# Define KNN model
knn = KNeighborsClassifier()

# Define parameter grid
param_grid = {
    'n_neighbors': [3, 5, 7, 9],  # Ensure these are smaller than the smallest fold size
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']  # Supported metrics
}


In [None]:
# Use StratifiedKFold for balanced splits
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1120)

grid_search = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    scoring='recall_weighted',  # Adjust scoring metric as needed
    cv=cv,
    n_jobs=-1,
    verbose=1
)

# Perform grid search
grid_search.fit(X_train, y_train)

# Print the best parameters and the corresponding score
print("Best Parameters:", grid_search.best_params_)
print("Best F1 Score:", grid_search.best_score_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best Parameters: {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'uniform'}
Best F1 Score: 0.6598352300117785


In [26]:
# Evaluate on the test set
best_knn = grid_search.best_estimator_
y_pred = best_knn.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.69      0.90      0.78     23868
           1       0.33      0.11      0.17     10049
           2       0.00      0.00      0.00       839

    accuracy                           0.65     34756
   macro avg       0.34      0.34      0.32     34756
weighted avg       0.57      0.65      0.59     34756



In [None]:
# Get the best estimator from the grid search
best_knn = grid_search.best_estimator_
# Predict probabilities for all classes
y_proba = best_knn.predict_proba(X_test)

# Calculate the AUC for multiclass
auc = roc_auc_score(y_test, y_proba, multi_class='ovr')  # 'ovr' or 'ovo'

# Print the AUC
print("Multiclass AUC Score:", auc)

Multiclass AUC Score: 0.5321061417251541


In [17]:

# Convert labels for the first character
y_test_kermit = (y_test == 1) | (y_test == 3)
y_pred_kermit = (y_pred == 1) | (y_pred == 3)

# Convert labels for the second character
y_test_wald_stat = (y_test == 2) | (y_test == 3)
y_pred_wald_stat = (y_pred == 2) | (y_pred == 3)


In [18]:
# Compute metrics for the first character
accuracy_kermit = accuracy_score(y_test_kermit, y_pred_kermit)
precision_kermit = precision_score(y_test_kermit, y_pred_kermit)
recall_kermit = recall_score(y_test_kermit, y_pred_kermit)
f1_kermit = f1_score(y_test_kermit, y_pred_kermit)
map_kermit = average_precision_score(y_test_kermit, y_pred_kermit)

print(accuracy_kermit)

0.6756243526297617


In [19]:

# Compute metrics for the second character
accuracy_wald_stat = accuracy_score(y_test_wald_stat, y_pred_wald_stat)
precision_wald_stat = precision_score(y_test_wald_stat, y_pred_wald_stat)
recall_wald_stat = recall_score(y_test_wald_stat, y_pred_wald_stat)
f1_wald_stat = f1_score(y_test_wald_stat, y_pred_wald_stat)
map_wald_stat = average_precision_score(y_test_wald_stat, y_pred_wald_stat)


In [20]:
# Compute metrics for the general classifier as a whole
accuracy_general = accuracy_score(y_test, y_pred)
print(f"general accuracy: {accuracy_general}")

precision_general = precision_score(y_test, y_pred, average="weighted")
print(f"general precision: {precision_general}")

recall_general = recall_score(y_test, y_pred, average="weighted")
print(f"general recall: {recall_general}")

f1_general = f1_score(y_test, y_pred, average="weighted")
print(f"general f1: {f1_general}")



general accuracy: 0.6535849925192773
general precision: 0.5687129153956213
general recall: 0.6535849925192773
general f1: 0.58670871905626


## Logistic Regression

In [33]:
# Initialize Logistic Regression model
log_reg = LogisticRegression(solver='liblinear', random_state=42)

# Train the model
log_reg.fit(X_train, y_train)


In [35]:

# Predict probabilities for the test set
y_proba = log_reg.predict_proba(X_test)  # Probabilities for the positive class

# Calculate AUC for multiclass classification
auc = roc_auc_score(y_test, y_proba, multi_class='ovr')  # Use 'ovo' if desired
print("Multiclass AUC Score:", auc)


# Classification Report
y_pred = log_reg.predict(X_test)
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Multiclass AUC Score: 0.5491168375765431

Classification Report:
               precision    recall  f1-score   support

           0       0.69      1.00      0.81     23868
           1       0.19      0.00      0.00     10049
           2       0.00      0.00      0.00       839

    accuracy                           0.69     34756
   macro avg       0.29      0.33      0.27     34756
weighted avg       0.53      0.69      0.56     34756



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:

# Cross-Validated AUC
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_auc = cross_val_score(log_reg, X, y, cv=cv, scoring='roc_auc')
print("\nCross-Validated AUC Scores:", cv_auc)
print("Mean Cross-Validated AUC:", np.mean(cv_auc))

## Random Forest