## 1. Import required Libs

In [9]:
import re
import os
import csv
import librosa
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.signal import butter, filtfilt
from IPython.display import Audio
from pyloudnorm import Meter, normalize

## 2. Define some required functions 

In [4]:
def extract_student_id(filename):
    pattern = r"\d{9}"
    match = re.search(pattern, filename)
    if match:
        return match.group(0)
    else:
        return None



def denoise_speech_bandpass(audio_data, sr, lowcut, highcut, order=5):
  nyq = 0.5 * sr
  low = lowcut / nyq
  high = highcut / nyq
  b, a = butter(order, [low, high], btype='band')
  denoised_audio = filtfilt(b, a, audio_data)
  return denoised_audio


def remove_silence(audio_data, sr, threshold=0.05, frame_length=4096, hop_length=512):
    rms = librosa.feature.rms(y=audio_data, frame_length=frame_length, hop_length=hop_length)[0]
    frames_above_threshold = np.where(rms > threshold)[0]
    non_silent_segments = []
    for i in range(len(frames_above_threshold)):
        start_sample = frames_above_threshold[i] * hop_length
        end_sample = min((frames_above_threshold[i] + 1) * hop_length, len(audio_data))  # Prevent exceeding audio length
        non_silent_segments.append(audio_data[start_sample:end_sample])
    if len(non_silent_segments) > 0:
      non_silent_audio = np.concatenate(non_silent_segments)
    else:
      non_silent_audio = audio_data
    return non_silent_audio


def normalize_audio(audio_data, sr,  target_lufs=-14):
  meter = Meter(sr) # create BS.1770 meter
  loudness = meter.integrated_loudness(audio_data)
  loudness_normalized_audio = normalize.loudness(audio_data, loudness, target_lufs)
  return loudness_normalized_audio


def preprocess_audio(audio_data, sr=22050):
  y_denoised = denoise_speech_bandpass(audio_data, sr, lowcut=100, highcut=8000, order=6)
  y_normalized = normalize_audio(y_denoised, sr, -14)
  non_silent_audio = remove_silence(y_normalized, sr, threshold=0.05, frame_length=4096, hop_length=512)
  return non_silent_audio

## 3. Extract features

In [15]:
def extract_features(segment, target_sr):
    mfcc = librosa.feature.mfcc(y=segment, sr=target_sr, n_mfcc=13)
    mfcc = np.mean(mfcc.T, axis=0)
    spectral_contrast = librosa.feature.spectral_contrast(y=segment, sr=target_sr)
    spectral_contrast = np.mean(spectral_contrast, axis = 1)
    zero_crossing_rate = librosa.feature.zero_crossing_rate(y=segment)
    zero_crossing_rate = np.mean(zero_crossing_rate)
    rms = np.mean(librosa.feature.rms(y=segment).T, axis=0)
    spectral_centroid = librosa.feature.spectral_centroid(y=segment, sr=target_sr)
    spectral_centroid = np.mean(spectral_centroid)
    spectral_bandwidth = librosa.feature.spectral_bandwidth(y=segment, sr=target_sr)
    spectral_bandwidth = np.mean(spectral_bandwidth)

    feature_data = {
        'Mfcc_' + str(i+1): mfcc[i] for i in range(len(mfcc))
    }
    
    feature_data.update({
        'Spectral_Contrast_' + str(i+1): spectral_contrast[i] for i in range(len(spectral_contrast))
    })
    
    feature_data['Zero_Crossing_Rate'] = zero_crossing_rate
    feature_data['RMS_Energy'] = rms
    feature_data['Spectral_Centroid'] = spectral_centroid
    feature_data['Spectral_Bandwidth'] = spectral_bandwidth

    return feature_data

def process_audio(audio_directory, output_file, target_sr=22050, segment_duration=4.0):
    np.random.seed(42)
    student_info = []
    for filename in os.listdir(audio_directory):
        if 'female' in filename.lower():
            label = 'female'
        elif 'male' in filename.lower():
            label = 'male'
        else:
            continue
        student_id = extract_student_id(filename)
        student_info.append((student_id, label, filename))
    df_info = pd.DataFrame(student_info, columns=['student_id', 'gender', 'filename'])
    male_students = df_info[df_info['gender'] == 'male']['student_id'].unique()
    female_students = df_info[df_info['gender'] == 'female']['student_id'].unique()
    min_count = min(len(male_students), len(female_students))
    balanced_male_students = np.random.choice(male_students, min_count, replace=False)
    balanced_female_students = np.random.choice(female_students, min_count, replace=False)
    balanced_student_ids = np.concatenate((balanced_male_students, balanced_female_students))
    df_balanced = df_info[df_info['student_id'].isin(balanced_student_ids)]
    feature_columns = ['filename', 'label', 'student_id'] + \
                      ['Mfcc_' + str(i) for i in range(1, 14)] + \
                      ['Spectral_Contrast_' + str(i) for i in range(1, 8)] + \
                      ['Zero_Crossing_Rate', 'RMS_Energy', 'Spectral_Centroid', 'Spectral_Bandwidth']
    with open(output_file, mode='w', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=feature_columns)
        writer.writeheader()
        for _, row in df_balanced.iterrows():
            label = row['gender']
            student_id = row['student_id']
            filename = row['filename']
            y, sr = librosa.load(os.path.join(audio_directory, filename), offset=1.0, duration=500.0, sr=target_sr)
            y = preprocess_audio(y, sr=sr)
            segment_length = int(segment_duration * sr)
            num_segments = len(y) // segment_length
            for i in range(num_segments):
                start = i * segment_length
                end = start + segment_length
                segment = y[start:end]
                feature_data = extract_features(segment, target_sr)
                feature_data['filename'] = filename + f'_{i}'
                feature_data['label'] = label
                feature_data['student_id'] = student_id
                writer.writerow(feature_data)

In [16]:
import warnings
import pyloudnorm

warnings.filterwarnings("ignore", category=UserWarning, module='pyloudnorm')

output_file = 'final_Audio_dataset.csv'
process_audio('../Data/raw', output_file)

In [17]:
dataset = pd.read_csv(output_file)

print(dataset.head())

                      filename label  student_id      Mfcc_1      Mfcc_2  \
0  hw1_q2_610399205_male.mp3_0  male   610399205  -94.829977  122.175083   
1  hw1_q2_610399205_male.mp3_1  male   610399205 -123.197408  131.659141   
2  hw1_q2_610399205_male.mp3_2  male   610399205 -122.910702  141.014834   
3  hw1_q2_610399205_male.mp3_3  male   610399205 -140.025639  126.584783   
4  hw1_q2_610399205_male.mp3_4  male   610399205 -127.307332  122.204455   

      Mfcc_3     Mfcc_4     Mfcc_5     Mfcc_6     Mfcc_7  ...  \
0 -68.000374  47.507958 -14.407363 -13.427573 -23.064053  ...   
1 -61.479495  44.792372  -7.927916 -10.119547 -21.518692  ...   
2 -67.712628  39.928565 -14.785694  -7.251495 -24.902940  ...   
3 -53.874062  40.963797  -5.784167  -5.752144 -22.515332  ...   
4 -61.316071  54.882303  -0.544779 -11.455723 -28.317545  ...   

   Spectral_Contrast_2  Spectral_Contrast_3  Spectral_Contrast_4  \
0            16.537672            17.337876            15.638074   
1            16.