# Audio Feature Analysis SIM 2 - Detecting Pigs and the Swedish Chef

In [61]:
import librosa
import numpy as np
import pandas as pd
from scripts.load_data import check_and_load
from scripts.nested_cv import partition_feature_df
from scripts.sim2_extract_audio_features import extract_mfcc, extract_log_mel_spectrogram, extract_yin

In [2]:
data_path = "../ground_truth_data"
frames_output_dir = "../ground_truth_data/frames"
audio_output_dir = "../ground_truth_data"
annotations_path = "../ground_truth_data"

muppet_files = {
    "Muppets-02-01-01.avi": "GroundTruth_Muppets-02-01-01.csv",
    "Muppets-02-04-04.avi": "GroundTruth_Muppets-02-04-04.csv",
    "Muppets-03-04-03.avi": "GroundTruth_Muppets-03-04-03.csv",
}

annotations, audio_data, frames = check_and_load(data_path, frames_output_dir, audio_output_dir, annotations_path, muppet_files)

Frames and audio are already extracted.
Loading audio segments...
Loaded 3 audio files.
Loaded audio segments for 3 videos.
Loaded frames for 3 videos.
Number of videos with frames: 3
Video 0 has 38681 frames.
Video 1 has 38706 frames.
Video 2 has 38498 frames.


## MFCC

In [4]:
mfcc_features  = extract_mfcc(audio_data)

## Log Mel spectrogram

In [8]:
log_mel_features = extract_log_mel_spectrogram(audio_data)

## Probabilistic YIN pitch extraction

In [None]:
extract = False

if extract == True:
    yin_extracted = extract_yin(audio_data)
        
    out = pd.DataFrame()
    video_id_mapper = {"Muppets-02-01-01.wav":0, "Muppets-03-04-03.wav":2, "Muppets-02-04-04.wav":1}
    
    for audio_entry in audio_data:
        audio_file = audio_entry.get('audio_file', 'Unknown file')
        video_id = video_id_mapper.get(audio_file)
        
        df = pd.DataFrame(yin_extracted[audio_file]).T
        df = df.rename(columns={0:"yin", 1:"voiced_flag", 2:"voiced_prob"})
        df = df.reset_index(names="frame_idx")
        df["video_idx"] = video_id
        
        out = pd.concat([out, df], axis=0)
        
    return out

    output_yin.to_csv("../model_data/features/yin_features.csv", index=False)
    
else: 
    print("Set extract to True to extract features")

Processing audio entries:   0%|          | 0/3 [00:00<?, ?it/s]

Processing audio entries: 100%|██████████| 3/3 [17:40<00:00, 353.57s/it]


## Extract features

In [None]:
output_yin = output_yin.fillna(0)
output_yin = output_yin[["yin", "video_id", "frame_id"]]

In [56]:
# Initialize the final list to hold feature rows
extracted_features_df = []

# Loop through the data for each video
for video_idx, (audio_entry, mfcc, log_mel) in enumerate(
        zip(audio_data, mfcc_features, log_mel_features)):
    
    num_frames = mfcc.shape[1]-2  # -2 accounts for edge frames in MFCC
    print(num_frames)

    for frame_idx in range(num_frames):
        feature_row = {
            "video_idx": video_idx,
            "frame_idx": frame_idx
        }

        # Add MFCC features
        mfcc_values = {f"mfcc_{i+1}": mfcc[i, frame_idx] for i in range(mfcc.shape[0])}
        feature_row.update(mfcc_values)
        
        # Add Log Mel features
        log_mel_values = {f"log_mel_{i+1}": log_mel[i, frame_idx] for i in range(log_mel.shape[0])}
        feature_row.update(log_mel_values)

        # Append the row to the list
        extracted_features_df.append(feature_row)

# Convert the list of features into a DataFrame
extracted_features_df = pd.DataFrame(extracted_features_df) 
extracted_features_df = pd.merge(extracted_features_df, output_yin, on = ["frame_idx", "video_idx"], how="left")

# Display the shape of the combined DataFrame
print(extracted_features_df.shape)

38681
38498
38706
(115885, 63)


In [57]:
extracted_features_df.columns

Index(['video_idx', 'frame_idx', 'mfcc_1', 'mfcc_2', 'mfcc_3', 'mfcc_4',
       'mfcc_5', 'mfcc_6', 'mfcc_7', 'mfcc_8', 'mfcc_9', 'mfcc_10', 'mfcc_11',
       'mfcc_12', 'mfcc_13', 'mfcc_14', 'mfcc_15', 'mfcc_16', 'mfcc_17',
       'mfcc_18', 'mfcc_19', 'mfcc_20', 'log_mel_1', 'log_mel_2', 'log_mel_3',
       'log_mel_4', 'log_mel_5', 'log_mel_6', 'log_mel_7', 'log_mel_8',
       'log_mel_9', 'log_mel_10', 'log_mel_11', 'log_mel_12', 'log_mel_13',
       'log_mel_14', 'log_mel_15', 'log_mel_16', 'log_mel_17', 'log_mel_18',
       'log_mel_19', 'log_mel_20', 'log_mel_21', 'log_mel_22', 'log_mel_23',
       'log_mel_24', 'log_mel_25', 'log_mel_26', 'log_mel_27', 'log_mel_28',
       'log_mel_29', 'log_mel_30', 'log_mel_31', 'log_mel_32', 'log_mel_33',
       'log_mel_34', 'log_mel_35', 'log_mel_36', 'log_mel_37', 'log_mel_38',
       'log_mel_39', 'log_mel_40', 'yin'],
      dtype='object')

In [60]:
extracted_features_df.to_csv("../model_data/features/sim2_audio_features.csv")

In [44]:
output_yin = output_yin.fillna(0)
output_yin = output_yin[["yin", "video_id", "frame_id"]]

In [55]:
log_mel_features[0].shape

(40, 38683)

# Pigs classification

# Cook classification

In [18]:
gt_211_nointro.loc[gt_211_nointro["Audio_Cook"] == 1]

NameError: name 'gt_211_nointro' is not defined

In [24]:
audio_path = "../ground_truth_data/Muppets-02-01-01.wav"

# Frame numbers for the range
frame_start = 28965
frame_end = 28980
fps = 25

# Convert frame numbers to timestamps (in seconds)
timestamp_start = frame_start / fps
timestamp_end = frame_end / fps

# Load audio file using librosa
audio, sr = librosa.load(audio_path, sr=None)  # Use the original sampling rate

# Convert timestamps to samples
start_sample = int(timestamp_start * sr)
end_sample = int(timestamp_end * sr)

# Extract the audio segment
audio_segment = audio[start_sample:end_sample]

# Plot the waveform of the extracted audio segment
plt.figure(figsize=(10, 4))
plt.plot(np.linspace(timestamp_start, timestamp_end, len(audio_segment)), audio_segment)
plt.title(f"Audio Waveform: {timestamp_start}s to {timestamp_end}s")
plt.xlabel("Time (seconds)")
plt.ylabel("Amplitude")
plt.grid(True)
plt.show()

NameError: name 'plt' is not defined