In [1]:
import wave
import pandas as pd
import soundfile as sf

# files
import os

# progress
from tqdm import tqdm

# windowing
import math

# download kaggle dir
import shutil
from IPython.display import FileLink



# Lift pandas' display limits so frames are shown without row, column,
# or cell-width truncation.
for display_option in ("display.max_rows", "display.max_columns", "display.max_colwidth"):
    pd.set_option(display_option, None)

### **Building data frames of sample path and length**

In [3]:
# function
# input : path_to_wav_file
# output : length_in_seconds


def get_wav_duration(filepath):
    """Return the length of an audio file in whole seconds.

    Parameters
    ----------
    filepath : path of the audio file, handed to ``sf.info``.

    Returns
    -------
    The ``duration`` attribute reported by ``sf.info`` after passing it
    through the built-in ``round()`` (no digits argument).
    """
    duration_seconds = sf.info(filepath).duration
    return round(duration_seconds)
    
    
# filepath = '/kaggle/input/dementia-audio-data/denoised_cn/denoised_cn/no_interv/adrso014-new.wav'
# time = get_wav_duration(filepath)
# print(time)

In [4]:
# function
# input : path_to_directory_of_audio_samples
# output : dictionary_of_path_n_lengthSecs



def build_df(directory):
    """Collect the path and duration of every regular file directly inside a directory.

    Parameters
    ----------
    directory : directory to scan; entries that are not regular files
        (e.g. sub-directories) are ignored.

    Returns
    -------
    dict with two index-aligned lists, in ``os.listdir`` order:
        'sample_path' : full path of each file
        'sample_len'  : value returned by ``get_wav_duration`` for that file
    """
    candidates = (os.path.join(directory, entry) for entry in os.listdir(directory))
    file_paths = [candidate for candidate in candidates if os.path.isfile(candidate)]

    return {
        'sample_path': file_paths,
        'sample_len': [get_wav_duration(path) for path in file_paths],
    }

In [5]:
# data frame for AD subjects

# One row per recording of the AD group: file path and rounded length in seconds.
ad_dir = '/kaggle/input/dementia-audio-data/denoised_ad/denoised_ad/no_interv'
ad_records = build_df(ad_dir)
ad_data = pd.DataFrame(ad_records)

In [6]:
# data frame for normal subjects

# One row per recording of the normal (CN) group: file path and rounded length in seconds.
cn_dir = '/kaggle/input/dementia-audio-data/denoised_cn/denoised_cn/no_interv'
cn_records = build_df(cn_dir)
cn_data = pd.DataFrame(cn_records)

### **Deciding optimal window size**

In [7]:
cn_data.describe()

Unnamed: 0,sample_len
count,78.0
mean,62.74359
std,24.644715
min,22.0
25%,46.25
50%,60.0
75%,75.0
max,154.0


In [8]:
ad_data.describe()

Unnamed: 0,sample_len
count,86.0
mean,75.813953
std,39.85961
min,26.0
25%,49.5
50%,70.0
75%,84.5
max,253.0


### The shortest recording in each group is just over 20 seconds (22 s for CN, 26 s for AD), so let's use 20-second windows: every recording then yields at least one segment.

### **Windowing**

#### **Add labels to indicate Dementia**

In [9]:
# 'AD' is the class label: 1 for recordings of dementia (AD) subjects,
# 0 for recordings of normal (CN) subjects. Both frames previously received 1,
# which made the two groups indistinguishable by label.
ad_data.loc[:, 'AD'] = 1
cn_data.loc[:, 'AD'] = 0

In [10]:
ad_data.head()

Unnamed: 0,sample_path,sample_len,AD
0,/kaggle/input/dementia-audio-data/denoised_ad/denoised_ad/no_interv/adrso055-new.wav,90,1
1,/kaggle/input/dementia-audio-data/denoised_ad/denoised_ad/no_interv/adrso192-new.wav,101,1
2,/kaggle/input/dementia-audio-data/denoised_ad/denoised_ad/no_interv/adrso068-new.wav,48,1
3,/kaggle/input/dementia-audio-data/denoised_ad/denoised_ad/no_interv/adrso250-new.wav,75,1
4,/kaggle/input/dementia-audio-data/denoised_ad/denoised_ad/no_interv/adrso244-new.wav,74,1


Plan:
1. Cut each recording into 20-second parts.
    1. Assign each part the same label as its parent recording.
    2. If the residual part is shorter than 20 seconds, discard it for now; whether that is the right choice will be decided later.
    (Use only the new data frame.)

#### **Create new directories to store upsampled sets**

In [11]:
# Root folders that will receive the segmented recordings of each group.
output_dir_for_cn = '/kaggle/working/up_sampled/cn'
output_dir_for_ad = '/kaggle/working/up_sampled/ad'

# exist_ok=True makes re-running this cell harmless.
for segment_root in (output_dir_for_cn, output_dir_for_ad):
    os.makedirs(segment_root, exist_ok=True)


In [12]:
def generate_segments(directory, output_dir, duration_per_part):
    """Split every audio file in ``directory`` into consecutive fixed-length segments.

    For each regular file, a sub-directory named after the file (final
    extension removed) is created inside ``output_dir`` and the segments are
    written there as ``part_1.wav``, ``part_2.wav``, ... A trailing remainder
    shorter than ``duration_per_part`` is discarded.

    Parameters
    ----------
    directory : directory holding the source audio files; entries that are
        not regular files are skipped.
    output_dir : directory under which one sub-directory per source file
        is created.
    duration_per_part : length of each segment in seconds; must be positive.

    Raises
    ------
    ValueError
        If ``duration_per_part`` is not positive.
    """
    if duration_per_part <= 0:
        raise ValueError(
            f"duration_per_part must be positive, got {duration_per_part!r}"
        )

    for filename in tqdm(os.listdir(directory)):
        filepath = os.path.join(directory, filename)

        # Skip sub-directories and anything else that is not a regular file.
        if not os.path.isfile(filepath):
            continue

        # splitext removes only the final extension, so names containing
        # several dots (or none at all) still map to a unique, valid folder.
        stem = os.path.splitext(filename)[0]
        output_sub_dir = os.path.join(output_dir, stem)
        os.makedirs(output_sub_dir, exist_ok=True)

        data, samplerate = sf.read(filepath)

        samples_per_part = math.ceil(duration_per_part * samplerate)

        # Floor division keeps only complete segments; the short tail is dropped.
        num_parts = len(data) // samples_per_part

        for i in range(num_parts):
            start_sample = i * samples_per_part
            part_data = data[start_sample:start_sample + samples_per_part]
            part_path = os.path.join(output_sub_dir, f"part_{i+1}.wav")
            sf.write(part_path, part_data, samplerate)

            

In [13]:
# Cut the normal (CN) recordings into 20-second parts.
cn_dir = '/kaggle/input/dementia-audio-data/denoised_cn/denoised_cn/no_interv'
output_dir = '/kaggle/working/up_sampled/cn'
duration_per_part = 20

generate_segments(
    directory=cn_dir,
    output_dir=output_dir,
    duration_per_part=duration_per_part,
)

100%|██████████| 78/78 [00:07<00:00, 10.59it/s]


In [14]:
# Cut the AD recordings into 20-second parts.
ad_dir = '/kaggle/input/dementia-audio-data/denoised_ad/denoised_ad/no_interv'
output_dir = '/kaggle/working/up_sampled/ad'
duration_per_part = 20

generate_segments(
    directory=ad_dir,
    output_dir=output_dir,
    duration_per_part=duration_per_part,
)

100%|██████████| 86/86 [00:08<00:00,  9.71it/s]


#### **Download the segmented audio directories**

In [16]:
# Zip the output directory and expose the archive for download

# Directory holding the segmented audio that should be downloaded
output_dir_path = '/kaggle/working/up_sampled'

# shutil.make_archive() appends the format's extension itself, so the base
# name must NOT end in '.zip' (otherwise the file is created as '*.zip.zip').
zip_base_path = '/kaggle/working/up_sampled'

# make_archive() returns the path of the archive it created; reusing that
# return value keeps the download link pointing at the file that really exists.
zip_file_path = shutil.make_archive(zip_base_path, 'zip', output_dir_path)

# Generate a download link for the ZIP file
FileLink(zip_file_path)
