# Data Exploration

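This notebook explores the ADReSS 2020 audio data: it collects the train/test recordings and their labels, augments and segments the training audio, and extracts MFCC features for exploration and visualization.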

In [1]:
import os
import pandas as pd
import numpy as np
import librosa
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm

In [2]:
# Dataset locations, expressed relative to the notebook's working directory
ADReSS2020_DATAPATH = "../data/ADReSS-IS2020-data"
ADReSS2020_TRAINPATH, ADReSS2020_TESTPATH = (
    os.path.join(ADReSS2020_DATAPATH, split_dir) for split_dir in ("train", "test")
)

# Names of the audio sub-folders found inside a split folder
CHUNK_WAVE_NAME = "Normalised_audio-chunks"
FULL_WAVE_NAME = "Full_wave_enhanced_audio"

In [3]:
# Inspect the entries of the training split folder (order is filesystem-dependent)
os.listdir(ADReSS2020_TRAINPATH)

['Normalised_audio-chunks',
 'cd_meta_data.txt',
 'cc_meta_data.txt',
 'README.md',
 'transcription',
 'Full_wave_enhanced_audio']

In [4]:
# Function to get file paths and labels
def _find_wave_dir(split_folder_path, wave_dir_name):
    """Return the path of ``wave_dir_name`` inside the split folder, or None if absent."""
    # os.listdir raises if the split folder itself is missing, so a wrong path
    # is reported instead of silently producing an empty dataset.
    if wave_dir_name in os.listdir(split_folder_path):
        candidate = os.path.join(split_folder_path, wave_dir_name)
        if os.path.isdir(candidate):
            return candidate
    return None


def get_audio_files_and_labels(dataset_path, split_folder_path, split, full_wave_name=None):
    """Collect the full-recording ``.wav`` paths of one split together with their labels.

    Parameters
    ----------
    dataset_path : str
        Dataset root folder. For ``split='test'`` the label file
        ``2020Labels.txt`` is read from this folder.
    split_folder_path : str
        Folder of the split that contains the full-recording sub-folder.
    split : str
        ``'train'``: labels come from the sub-folder names (``cc`` -> 0, ``cd`` -> 1).
        ``'test'``: labels come from the ``Label`` column of ``2020Labels.txt``,
        looked up by the file name up to its first ``.``.
        Any other value yields two empty lists.
    full_wave_name : str, optional
        Name of the sub-folder holding the full recordings. Defaults to the
        notebook-level ``FULL_WAVE_NAME``.

    Returns
    -------
    tuple[list, list]
        ``(audio_files, labels)``; both lists always have the same length and
        are ordered by sorted folder/file name.

    Raises
    ------
    KeyError
        If a test recording has no matching ``ID`` in the label file.
    """
    if full_wave_name is None:
        full_wave_name = FULL_WAVE_NAME

    audio_files = []
    labels = []

    if split == 'train':
        label_codes = {'cc': 0, 'cd': 1}
        wave_dir = _find_wave_dir(split_folder_path, full_wave_name)
        if wave_dir is not None:
            # Sorted so the result does not depend on filesystem listing order.
            for label in sorted(os.listdir(wave_dir)):
                label_path = os.path.join(wave_dir, label)
                # Folders that are not a known label are skipped entirely;
                # appending their files without a label would misalign the lists.
                if label not in label_codes or not os.path.isdir(label_path):
                    continue
                for file_name in sorted(os.listdir(label_path)):
                    if file_name.endswith('.wav'):
                        audio_files.append(os.path.join(label_path, file_name))
                        labels.append(label_codes[label])

    elif split == 'test':
        test_df = pd.read_csv(os.path.join(dataset_path, '2020Labels.txt'),
                              delimiter=';', skipinitialspace=True)
        # skipinitialspace only removes padding after the delimiter, so headers
        # and IDs may still carry trailing spaces; normalise both.
        test_df.columns = test_df.columns.str.strip()
        label_by_id = {}
        ids = test_df['ID'].astype(str).str.strip()
        for audio_id, label in zip(ids, test_df['Label'].to_numpy()):
            # Keep the first occurrence if an ID is listed more than once.
            label_by_id.setdefault(audio_id, label)

        wave_dir = _find_wave_dir(split_folder_path, full_wave_name)
        if wave_dir is not None:
            for file_name in sorted(os.listdir(wave_dir)):
                if file_name.endswith('.wav'):
                    audio_id = file_name.split('.')[0]
                    if audio_id not in label_by_id:
                        raise KeyError(
                            f"No label found for recording '{file_name}' in 2020Labels.txt"
                        )
                    audio_files.append(os.path.join(wave_dir, file_name))
                    labels.append(label_by_id[audio_id])

    return audio_files, labels

In [5]:
# Collect file paths and labels for both splits
train_audio_files, train_labels = get_audio_files_and_labels(
    ADReSS2020_DATAPATH, ADReSS2020_TRAINPATH, split='train'
)
test_audio_files, test_labels = get_audio_files_and_labels(
    ADReSS2020_DATAPATH, ADReSS2020_TESTPATH, split='test'
)

# Sanity check: each split should have as many labels as files
tuple(map(len, (train_audio_files, train_labels, test_audio_files, test_labels)))

(108, 108, 48, 48)

In [6]:
# Data Augmentation Functions
def add_noise(data, noise_factor=0.005, rng=None):
    """Return a copy of ``data`` with additive Gaussian noise.

    Parameters
    ----------
    data : array-like
        Audio samples; the input is not modified.
    noise_factor : float
        Scale applied to standard-normal noise before it is added.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness, for reproducible augmentation. When omitted the
        legacy global NumPy state is used, so ``np.random.seed`` still applies.

    Returns
    -------
    numpy.ndarray
        Noisy samples with the same shape as ``data``. Floating-point inputs
        keep their dtype; other dtypes are promoted to float by the addition.
    """
    samples = np.asarray(data)
    if rng is None:
        noise = np.random.standard_normal(samples.shape)
    else:
        noise = rng.standard_normal(samples.shape)
    augmented_data = samples + noise_factor * noise
    # The noise is float64; cast back so float32 audio is not silently doubled
    # in memory and all augmented variants share one dtype.
    if np.issubdtype(samples.dtype, np.floating):
        augmented_data = augmented_data.astype(samples.dtype, copy=False)
    return augmented_data

def pitch_shift(data, sampling_rate, pitch_factor=2):
    """Pitch-shift ``data`` with ``librosa.effects.pitch_shift``.

    ``sampling_rate`` is forwarded as librosa's ``sr`` and ``pitch_factor`` as
    its ``n_steps`` argument.
    """
    shift_kwargs = {"sr": sampling_rate, "n_steps": pitch_factor}
    return librosa.effects.pitch_shift(data, **shift_kwargs)

def time_stretch(data, rate=0.8):
    """Time-stretch ``data`` with ``librosa.effects.time_stretch``.

    ``rate`` is forwarded unchanged as librosa's ``rate`` argument.
    """
    stretched = librosa.effects.time_stretch(data, rate=rate)
    return stretched


In [7]:
# Load one recording and collect it together with its augmented variants
def load_and_augment_audio(file_path, label, audio_data, audio_labels):
    """Load ``file_path`` and append the original plus four augmentations.

    The recording is loaded with ``sr=None``. Five arrays are appended to
    ``audio_data`` in this order: original, noisy, pitch-shifted,
    time-stretched at rate 0.9, time-stretched at rate 1.1. ``label`` is
    appended to ``audio_labels`` once per array. Both lists are modified
    in place and nothing is returned.
    """
    waveform, native_sr = librosa.load(file_path, sr=None)
    variants = (
        waveform,
        add_noise(waveform),
        pitch_shift(waveform, native_sr),
        time_stretch(waveform, 0.9),
        time_stretch(waveform, 1.1),
    )
    audio_data.extend(variants)
    audio_labels.extend([label] * len(variants))


In [8]:
# # Prepare testing data
# audio_data = []
# audio_labels = []

# for file_path, label in tqdm(zip(test_audio_files, test_labels), total=len(test_audio_files)):
#     load_and_augment_audio(file_path, label, audio_data, audio_labels)


In [9]:
BATCH_SIZE = 10  # Adjust as needed

def process_batches(files, labels, batch_size=BATCH_SIZE):
    """Yield augmented audio and labels for consecutive batches of files.

    Parameters
    ----------
    files : sequence
        Audio file paths.
    labels : sequence
        One label per entry of ``files``.
    batch_size : int
        Number of files loaded per batch; the last batch may be smaller.

    Yields
    ------
    tuple[list, list]
        ``(batch_audio_data, batch_audio_labels)`` as filled by
        ``load_and_augment_audio`` for every file of the batch.

    Raises
    ------
    ValueError
        If ``files`` and ``labels`` differ in length or ``batch_size`` is
        smaller than 1. Because this is a generator, the error surfaces when
        iteration starts.
    """
    # zip() would otherwise silently drop the unmatched tail.
    if len(files) != len(labels):
        raise ValueError(
            f"files and labels must have the same length, got {len(files)} and {len(labels)}"
        )
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    for start in range(0, len(files), batch_size):
        batch_files = files[start:start + batch_size]
        batch_labels = labels[start:start + batch_size]
        batch_audio_data = []
        batch_audio_labels = []
        for file_path, label in tqdm(zip(batch_files, batch_labels), total=len(batch_files),
                                     desc=f"Processing batch {start // batch_size + 1}"):
            # Appends the recording and its augmented variants to the batch lists.
            load_and_augment_audio(file_path, label, batch_audio_data, batch_audio_labels)
        yield batch_audio_data, batch_audio_labels

# Run the batched loader over the training files and gather everything
# into flat notebook-level lists.
audio_data, audio_labels = [], []

for augmented_batch, augmented_batch_labels in process_batches(train_audio_files, train_labels):
    # Each batch could also be post-processed or saved here to limit memory use
    audio_data += augmented_batch
    audio_labels += augmented_batch_labels
    print(f"Processed batch with {len(augmented_batch)} items")

Processing batch 1: 100%|██████████| 10/10 [00:18<00:00,  1.83s/it]


Processed batch with 50 items


Processing batch 2: 100%|██████████| 10/10 [00:14<00:00,  1.49s/it]


Processed batch with 50 items


Processing batch 3: 100%|██████████| 10/10 [00:19<00:00,  1.93s/it]


Processed batch with 50 items


Processing batch 4: 100%|██████████| 10/10 [00:15<00:00,  1.56s/it]


Processed batch with 50 items


Processing batch 5: 100%|██████████| 10/10 [00:16<00:00,  1.63s/it]


Processed batch with 50 items


Processing batch 6: 100%|██████████| 10/10 [00:22<00:00,  2.24s/it]


Processed batch with 50 items


Processing batch 7: 100%|██████████| 10/10 [00:19<00:00,  1.92s/it]


Processed batch with 50 items


Processing batch 8: 100%|██████████| 10/10 [00:24<00:00,  2.50s/it]


Processed batch with 50 items


Processing batch 9: 100%|██████████| 10/10 [00:22<00:00,  2.25s/it]


Processed batch with 50 items


Processing batch 10: 100%|██████████| 10/10 [00:20<00:00,  2.05s/it]


Processed batch with 50 items


Processing batch 11: 100%|██████████| 8/8 [00:19<00:00,  2.42s/it]

Processed batch with 40 items





In [10]:
# Segment the audio data into fixed-length segments (25 seconds by default)
def segment_audio(data, sr, segment_length=25):
    """Split ``data`` into consecutive, non-overlapping segments of equal length.

    Parameters
    ----------
    data : sequence
        Audio samples (anything supporting ``len`` and slicing).
    sr : int or float
        Sample rate in samples per second.
    segment_length : int or float
        Segment duration in seconds; fractional values are allowed.

    Returns
    -------
    list
        Slices of ``data``, each ``round(sr * segment_length)`` samples long.
        A trailing remainder shorter than one segment is dropped, so input
        shorter than one segment yields an empty list.

    Raises
    ------
    ValueError
        If the segment size works out to fewer than one sample.
    """
    # range() needs an integer step; rounding lets float rates/durations work.
    segment_samples = int(round(sr * segment_length))
    if segment_samples <= 0:
        raise ValueError(
            f"sr * segment_length must be at least one sample, got {sr} * {segment_length}"
        )
    last_start = len(data) - segment_samples
    return [
        data[start:start + segment_samples]
        for start in range(0, last_start + 1, segment_samples)
    ]

In [11]:
segmented_data = []
segmented_labels = []

# `audio_data` holds the augmented TRAINING recordings, so the sample rate is
# read from a training file, and only once rather than on every iteration.
# NOTE(review): assumes all training recordings share this sample rate — confirm.
sr = librosa.get_samplerate(train_audio_files[0])

for data, label in zip(audio_data, audio_labels):
    segments = segment_audio(data, sr)
    segmented_data.extend(segments)
    # Every segment inherits the label of the recording it was cut from
    segmented_labels.extend([label] * len(segments))


In [12]:
# Feature extraction with customizable window size and hop length
def extract_features(data, sr, n_mfcc=13, window_size=2048, hop_length=512):
    """Return the MFCCs of ``data`` averaged over the first axis of their transpose.

    ``window_size`` is passed to ``librosa.feature.mfcc`` as ``n_fft``;
    ``n_mfcc`` and ``hop_length`` are forwarded unchanged.
    """
    mfcc_matrix = librosa.feature.mfcc(
        y=data,
        sr=sr,
        n_mfcc=n_mfcc,
        n_fft=window_size,
        hop_length=hop_length,
    )
    transposed = mfcc_matrix.T
    return np.mean(transposed, axis=0)

In [13]:
features = []
custom_window_size = 1024
custom_hop_length = 256

# Read the sample rate once; querying the file for every segment repeats the
# same file access without changing the result.
# NOTE(review): assumes all segments share the first training file's rate — confirm.
sr = librosa.get_samplerate(train_audio_files[0])

for segment in segmented_data:
    mfccs = extract_features(segment, sr, window_size=custom_window_size, hop_length=custom_hop_length)
    features.append(mfccs)

X_train = np.array(features)
y_train = np.array(segmented_labels)

# Each feature row must have exactly one label
assert len(X_train) == len(y_train), "feature/label count mismatch"


In [14]:
# Display the shapes of the feature matrix and the label vector
X_train.shape, y_train.shape

((1308, 13), (1308,))