In [35]:
import numpy as np
import matplotlib.pyplot as plt
import os
import librosa
from IPython.display import Audio
import pandas as pd

In [36]:
!pip install pyloudnorm



In [37]:
!pip install gdown
import gdown



In [38]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [39]:
# Define audio directory
audio_directory = '/content/drive/My Drive/Audio_project/HW1_M'  # Replace with your directory
print("Ok :)")

Ok :)


In [40]:
import re
def extract_student_id(filename):
    pattern = r"\d{9}"
    match = re.search(pattern, filename)
    if match:
        return match.group(0)
    else:
        return None

In [41]:
from scipy.signal import butter, filtfilt

def denoise_speech_bandpass(audio_data, sr, lowcut, highcut, order=5):
  """Denoises speech using a bandpass filter.

  Args:
    audio_data: The audio data as a NumPy array.
    sr: The sampling rate of the audio data.
    lowcut: The lower cutoff frequency of the bandpass filter.
    highcut: The upper cutoff frequency of the bandpass filter.
    order: The order of the bandpass filter.

  Returns:
    The denoised audio data as a NumPy array.
  """

  # Calculate Nyquist frequency
  nyq = 0.5 * sr

  # Normalize cutoff frequencies
  low = lowcut / nyq
  high = highcut / nyq

  # Design bandpass filter
  b, a = butter(order, [low, high], btype='band')

  # Apply filter
  denoised_audio = filtfilt(b, a, audio_data)

  return denoised_audio

In [42]:
def remove_silence(audio_data, sr, threshold=0.05, frame_length=4096, hop_length=512):
    # Calculate RMS energy for each frame
    rms = librosa.feature.rms(y=audio_data, frame_length=frame_length, hop_length=hop_length)[0]

    # Find frames above the threshold
    frames_above_threshold = np.where(rms > threshold)[0]

    # Initialize an empty list to store non-silent segments
    non_silent_segments = []

    # Iterate over frames above the threshold
    for i in range(len(frames_above_threshold)):
        # Get start and end samples for the current segment
        start_sample = frames_above_threshold[i] * hop_length
        end_sample = min((frames_above_threshold[i] + 1) * hop_length, len(audio_data))  # Prevent exceeding audio length

        # Append the current non-silent segment to the list
        non_silent_segments.append(audio_data[start_sample:end_sample])

    # Concatenate all non-silent segments
    if len(non_silent_segments) > 0:
      non_silent_audio = np.concatenate(non_silent_segments)
    else:
      non_silent_audio = audio_data

    return non_silent_audio

In [43]:
from pyloudnorm import Meter, normalize

def normalize_audio(audio_data, sr,  target_lufs=-14):
  """Normalizes all audio files in a directory to a target LUFS level."""
  # measure the loudness first
  meter = Meter(sr) # create BS.1770 meter
  loudness = meter.integrated_loudness(audio_data)

  # loudness normalize audio to target_lufs dB
  loudness_normalized_audio = normalize.loudness(audio_data, loudness, target_lufs)
  return loudness_normalized_audio

In [44]:
def preprocess_audio(audio_data, sr=22050):
  y_denoised = denoise_speech_bandpass(audio_data, sr, lowcut=100, highcut=8000, order=6)
  y_normalized = normalize_audio(y_denoised, sr, -14)
  non_silent_audio = remove_silence(y_normalized, sr, threshold=0.05, frame_length=4096, hop_length=512)
  return non_silent_audio

In [45]:
import csv

def extract_features(segment, target_sr):

    mfcc = librosa.feature.mfcc(y=segment, sr=target_sr, n_mfcc=13)
    mfcc = np.mean(mfcc.T, axis=0)

    spectral_contrast = librosa.feature.spectral_contrast(y=segment, sr=target_sr)
    spectral_contrast = np.mean(spectral_contrast, axis = 1)

    zero_crossing_rate = librosa.feature.zero_crossing_rate(y=segment)
    zero_crossing_rate = np.mean(zero_crossing_rate)

    rms = np.mean(librosa.feature.rms(y=segment).T, axis=0)

    spectral_centroid = librosa.feature.spectral_centroid(y=segment, sr=target_sr)
    spectral_centroid = np.mean(spectral_centroid)

    spectral_bandwidth = librosa.feature.spectral_bandwidth(y=segment, sr=target_sr)
    spectral_bandwidth = np.mean(spectral_bandwidth)

    feature_data = {
        'Mfcc_' + str(i+1): mfcc[i] for i in range(len(mfcc))
    }
    feature_data.update({
        'Spectral_Contrast_' + str(i+1): spectral_contrast[i] for i in range(len(spectral_contrast))
    })
    feature_data['Zero_Crossing_Rate'] = zero_crossing_rate
    feature_data['RMS_Energy'] = rms
    feature_data['Spectral_Centroid'] = spectral_centroid
    feature_data['Spectral_Bandwidth'] = spectral_bandwidth

    return feature_data

def process_audio(audio_directory, output_file, target_sr=22050, segment_duration=4.0):
    np.random.seed(42)
    student_info = []

    for filename in os.listdir(audio_directory):
        if 'female' in filename.lower():
            label = 'female'
        elif 'male' in filename.lower():
            label = 'male'
        else:
            continue

        student_id = extract_student_id(filename)
        student_info.append((student_id, label, filename))

    df_info = pd.DataFrame(student_info, columns=['student_id', 'gender', 'filename'])

    male_students = df_info[df_info['gender'] == 'male']['student_id'].unique()
    female_students = df_info[df_info['gender'] == 'female']['student_id'].unique()

    min_count = min(len(male_students), len(female_students))

    balanced_male_students = np.random.choice(male_students, min_count, replace=False)
    balanced_female_students = np.random.choice(female_students, min_count, replace=False)

    balanced_student_ids = np.concatenate((balanced_male_students, balanced_female_students))
    df_balanced = df_info[df_info['student_id'].isin(balanced_student_ids)]

    feature_columns = ['filename', 'label', 'student_id'] + \
                      ['Mfcc_' + str(i) for i in range(1, 14)] + \
                      ['Spectral_Contrast_' + str(i) for i in range(1, 8)] + \
                      ['Zero_Crossing_Rate', 'RMS_Energy', 'Spectral_Centroid', 'Spectral_Bandwidth']

    with open(output_file, mode='w', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=feature_columns)
        writer.writeheader()

        for _, row in df_balanced.iterrows():
            label = row['gender']
            student_id = row['student_id']
            filename = row['filename']

            y, sr = librosa.load(os.path.join(audio_directory, filename), offset=1.0, duration=500.0, sr=target_sr)
            y = preprocess_audio(y, sr=sr)
            segment_length = int(segment_duration * sr)
            num_segments = len(y) // segment_length

            for i in range(num_segments):
                start = i * segment_length
                end = start + segment_length
                segment = y[start:end]

                feature_data = extract_features(segment, target_sr)
                feature_data['filename'] = filename + f'_{i}'
                feature_data['label'] = label
                feature_data['student_id'] = student_id

                writer.writerow(feature_data)




In [46]:
import warnings
import pyloudnorm

# Disable the normalization user warning
warnings.filterwarnings("ignore", category=UserWarning, module='pyloudnorm')

In [None]:
# Example usage
output_file = '/content/drive/My Drive/final_Audio_dataset.csv'
process_audio(audio_directory, output_file)



In [53]:
# Load the resulting CSV file into a DataFrame
dataset = pd.read_csv(output_file)

# Print the first few rows of the dataset
print(dataset.head())

                      filename label  student_id      Mfcc_1      Mfcc_2  \
0  HW1_Q5_810801072_Male.mp3_0  male   810801072 -155.217852  135.871079   
1  HW1_Q5_810801072_Male.mp3_1  male   810801072 -177.114558  140.304623   
2  HW1_Q5_810801072_Male.mp3_2  male   810801072 -162.584909  120.209218   
3  HW1_Q5_810801072_Male.mp3_3  male   810801072 -165.845303  145.862184   
4  HW1_Q5_810801072_Male.mp3_4  male   810801072 -178.106893  132.620852   

      Mfcc_3     Mfcc_4    Mfcc_5     Mfcc_6     Mfcc_7  ...  \
0 -13.262353  47.794728  7.207893   7.300228 -22.951777  ...   
1 -21.115352  46.488005 -3.399340  16.619354 -25.741743  ...   
2 -14.282396  57.224400 -7.945799  16.344686 -23.463047  ...   
3 -10.069458  37.615732 -4.063423  11.771095 -19.396108  ...   
4 -11.592609  59.768651  1.988449   9.041938 -14.379663  ...   

   Spectral_Contrast_2  Spectral_Contrast_3  Spectral_Contrast_4  \
0            15.134972            17.070269            14.433772   
1            14.806266

In [None]:
from IPython.display import Audio
filename = '/content/drive/MyDrive/Audio_project/HW1_M/HW1-Q1-1-810103226-male.mp3'

y, sr = librosa.load(filename, offset=1.0, duration=500.0)
y = preprocess_audio(y)

print(sr)

display(Audio(y[0: 5*sr], rate=sr))

In [None]:
data = extract_features(y[0:5*sr], sr)