In [None]:
import os
import librosa
import numpy as np
import pandas as pd

def augment_audio(y, sr, noise_factor, stretch_factor, n_steps, rng=None):
    """
    Create and return 3 augmented versions of an input audio signal:
    1. One with added noise (standard-normal noise scaled by noise_factor)
    2. One with time stretching
    3. One with a pitch shift

    Args:
        y: audio time series of the original file (NumPy array)
        sr: sampling rate of the original audio file
        noise_factor: scales the amplitude of the random noise that is added
            to the original time series
        stretch_factor: forwarded to librosa.effects.time_stretch as `rate`.
            Following librosa's convention, stretch_factor > 1 gives a sped up
            (shorter) version and stretch_factor < 1 gives a slowed down
            (longer) version.
        n_steps: steps for pitch shifting (n_steps < 0 for a lower pitched
            version, n_steps > 0 for a higher pitched version)
        rng: optional numpy.random.Generator used to draw the noise, which
            makes the noisy version reproducible. When omitted, the noise is
            drawn from NumPy's global random state (np.random.randn), exactly
            as before.

    Returns:
        Tuple (y_noise, y_stretch, y_shift) in that order.
    """
    # Noisy version. The noise matches the full shape of y (not just len(y)),
    # so multi-channel input gets independent noise on every sample.
    noise_shape = np.shape(y)
    if rng is None:
        noise = np.random.randn(*noise_shape)
    else:
        noise = rng.standard_normal(noise_shape)
    y_noise = y + noise_factor * noise

    # Stretched version
    y_stretch = librosa.effects.time_stretch(y, rate=stretch_factor)

    # Shifted version
    y_shift = librosa.effects.pitch_shift(y, sr=sr, n_steps=n_steps)

    return y_noise, y_stretch, y_shift

def extract_features(y, sr, file_name, genre, n_mfcc=20):
    """
    Extract the mean and variance of various audio features and return them in a list with the file name and genre.

    Args:
        y: audio time series
        sr: sampling rate of y
        file_name: label stored as the first element of the returned list
        genre: label stored as the last element of the returned list
        n_mfcc: number of MFCC coefficients to compute (default 20). The
            returned list contains n_mfcc means followed by n_mfcc variances.

    Returns:
        A flat list laid out as
        [file_name,
         (mean, var) of chroma_stft, rms, spectral centroid, spectral
         bandwidth, spectral rolloff, zero crossing rate, harmonic component,
         percussive component,
         tempo,
         per-coefficient MFCC means, per-coefficient MFCC variances,
         genre].
    """
    # Extract features from an audio file
    chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
    rms = librosa.feature.rms(y=y)
    spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
    spectral_bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr)
    rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
    zcr = librosa.feature.zero_crossing_rate(y)
    harmony, perceptr = librosa.effects.hpss(y)
    tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)

    # Depending on the librosa version, beat_track may hand the tempo back as
    # an array instead of a plain number. Store a Python float so the value
    # ends up as a numeric cell rather than an array object. If several tempo
    # values are returned, only the first one is kept.
    tempo = float(np.ravel(tempo)[0])

    # Convert into a list of features (to be converted to a dataframe later)
    summarised = (chroma_stft, rms, spectral_centroid, spectral_bandwidth,
                  rolloff, zcr, harmony, perceptr)
    features = [file_name]
    for values in summarised:
        features.extend((np.mean(values), np.var(values)))
    features.append(tempo)
    # One mean and one variance per MFCC coefficient (row of the mfcc matrix)
    features.extend(np.mean(mfcc, axis=1))
    features.extend(np.var(mfcc, axis=1))
    features.append(genre)

    return features

In [None]:
# Initialize the DataFrame columns as the features and labels
columns = ['filename', 'chroma_stft_mean', 'chroma_stft_var', 'rms_mean', 'rms_var', 
           'spectral_centroid_mean', 'spectral_centroid_var', 
           'spectral_bandwidth_mean', 'spectral_bandwidth_var', 
           'rolloff_mean', 'rolloff_var', 'zero_crossing_rate_mean', 'zero_crossing_rate_var', 
           'harmony_mean', 'harmony_var', 'perceptr_mean', 'perceptr_var', 
           'tempo'] + [f'mfcc{i}_mean' for i in range(1, 21)] + [f'mfcc{i}_var' for i in range(1, 21)] + ['genre']
features_list = []

# Define the path to the audio files folder. Setting the AUDIO_FILES_PATH
# environment variable overrides the default below, so the notebook can be run
# on a machine with a different directory layout without editing this cell.
audio_files_path = os.environ.get(
    'AUDIO_FILES_PATH',
    '/Users/nikanhassanipak/Library/Mobile Documents/com~apple~CloudDocs/Georgia Tech/Spring 2024/CS 4641/Project/All_Audio_Files',
)

# Seed NumPy's global random state (the source of the noise used for the noisy
# augmentation) so that re-running this cell reproduces the same features.
AUGMENTATION_SEED = 42
np.random.seed(AUGMENTATION_SEED)

# Loop through each song in the audio files folder, extracting the features for both the original and augmented versions of the song.
# os.listdir returns entries in arbitrary order, so sort them to keep the row order stable between runs.
for file in sorted(os.listdir(audio_files_path)):
    if file.endswith('.wav'):
        file_path = os.path.join(audio_files_path, file)
        y, sr = librosa.load(file_path)
        
        # The genre is the part of the file name before the first '.'
        genre = file.split('.')[0]

        # Extract the features from the original version of the audio file and append it to the list
        features_list.append(extract_features(y, sr, file, genre))
        
        # Extract the features from the augmented versions of the audio file and append them to the list.
        # noise_factor=0.005 adds only minimal random noise; stretch_factor=0.8 is used as librosa's
        # time-stretch rate, and a rate below 1 yields a slowed down version; n_steps=-1 yields a
        # lower pitched version.
        y_noise, y_stretch, y_shift = augment_audio(y, sr, noise_factor=0.005, stretch_factor=0.8, n_steps=-1)
        features_list.append(extract_features(y_noise, sr, file + '_noise', genre))
        features_list.append(extract_features(y_stretch, sr, file + '_stretch', genre))
        features_list.append(extract_features(y_shift, sr, file + '_shift', genre))

In [None]:
# Regex capturing the base name of each song: everything before '.wav'.
# Augmented rows carry a suffix after '.wav' (e.g. '_noise'), so they map to
# the same base name as the original recording.
pattern = r'^(.+)\.wav'

# Build the DataFrame from features_list and attach the extracted base name
# as an additional 'base_name' column.
features_df = pd.DataFrame(features_list, columns=columns).assign(
    base_name=lambda frame: frame['filename'].str.extract(pattern)[0]
)

In [None]:
# Persist features_df as the 'Original_format' csv: the pure form of the data,
# containing every extracted feature. The DataFrame index is not written.
filename = 'Original_format.csv'
features_df.to_csv(path_or_buf=filename, index=False)