These scripts convert the txt annotation files of the training data into one common format, let's call it 'tpi'.
0 - exhale, 1 - inhale, 2 - silence
After every class there is one number saying how many milliseconds that class lasts.
For example, for the following csv file:
1,1000
0,2500
we have a recording with 1 second of inhale followed by 2.5 seconds of exhale, a 3.5-second recording in total.
For every csv file we have a corresponding .wav file with the same name.

# Master thesis dataset preparation

In [None]:
# Besides running master-thesis-data-filter.py (which deletes all wav files without corresponding txt files, and the unnecessary png files), we also need the conversion below.

import os

# Function that converts txt file to csv file in our wanted format
def convert_to_csv_master(txt_file, output_file):
    """Convert a master-thesis annotation txt file to the common csv format.

    Each useful input line has three whitespace-separated fields,
    ``<label> <start> <end>``, where the label is ``wydech`` (exhale) or
    ``wdech`` (inhale) and start/end are integers. Lines that do not have
    exactly three fields are skipped.

    The output file gets one ``<class>,<duration>`` row per segment, with
    class 0 = exhale, 1 = inhale, 2 = silence. A silence row is inserted
    whenever an event starts after the end of the previous recognised event.
    Durations are ``end - start`` in the same unit as the input
    (presumably audio samples - confirm against the annotation tool).

    Args:
        txt_file: Path of the annotation file to read.
        output_file: Path of the csv file to write (overwritten if present).

    Raises:
        ValueError: If a three-field line has a non-integer start or end.
    """
    with open(txt_file, 'r') as f:
        lines = f.readlines()

    label_to_class = {'wydech': 0, 'wdech': 1}
    events = []
    last_end = 0

    for line in lines:
        parts = line.strip().split()

        # Anything that is not "<label> <start> <end>" is not an annotation
        if len(parts) != 3:
            continue

        label, start, end = parts[0], int(parts[1]), int(parts[2])

        if label not in label_to_class:
            # Unknown label: do not advance last_end, so the span it covers is
            # emitted as silence before the next recognised event instead of
            # silently disappearing from the timeline.
            continue

        if start > last_end:  # Silence between events
            events.append((2, start - last_end))
        events.append((label_to_class[label], end - start))
        last_end = end

    with open(output_file, 'w') as f:
        for event_class, duration in events:
            f.write(f'{event_class},{duration}\n')

# Function that changes txt files to csv files in our wanted format in specified directory
def process_directory_master(directory):
    """Convert every txt/wav pair in *directory* to numbered csv/wav pairs.

    For each ``<name>.txt`` that has a ``<name>.wav`` next to it, the txt file
    is converted with ``convert_to_csv_master`` into ``data-master<i>.csv``
    and the wav file is renamed to ``data-master<i>.wav``. ``i`` starts at 1
    and grows by one per converted pair. Txt files without a matching wav
    file are left untouched; the txt files themselves are never removed.

    Args:
        directory: Path of the directory to process.
    """
    i = 1  # Counter for output files

    # os.listdir returns entries in arbitrary order; sorting makes the
    # numbering of the output files reproducible between runs.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.txt'):
            continue

        txt_file = os.path.join(directory, filename)
        # Swap only the trailing extension; str.replace would also rewrite a
        # '.txt' that happens to occur in the middle of the name.
        wav_file = os.path.join(directory, filename[:-len('.txt')] + '.wav')

        if not os.path.exists(wav_file):
            continue

        output_csv = os.path.join(directory, f'data-master{i}.csv')
        output_wav = os.path.join(directory, f'data-master{i}.wav')

        convert_to_csv_master(txt_file, output_csv)
        os.rename(wav_file, output_wav)

        i += 1

# Process dataset directory: convert every txt/wav pair found in
# ../data-master-thesis (path is relative to the current working directory)
process_directory_master('../data-master-thesis')

# Script to delete unnecessary txt files that are left after our scripts

In [None]:
import os

# Function that removes all txt files in specified directory
def remove_txt_files(directory):
    """Delete every entry of *directory* whose name ends with '.txt'.

    The path of each deleted file is printed after it has been removed.

    Args:
        directory: Path of the directory to clean up.
    """
    txt_names = [name for name in os.listdir(directory) if name.endswith('.txt')]
    for name in txt_names:
        target = os.path.join(directory, name)
        os.remove(target)
        print(f'Deleted: {target}')

# Delete all txt files in specified directory.
# NOTE(review): this targets ../data-repo, the same directory whose txt files
# process_directory_repo converts further down - presumably this cell is meant
# to be run only after that conversion; confirm before running cells in order.
remove_txt_files('../data-repo')

# It turns out that we also need a script to convert the durations in our csv files from frames (samples) to milliseconds

In [None]:
import os
import pandas as pd
import wave

# Function that converts samples to milliseconds
def convert_samples_to_ms(csv_file, wav_file):
    """Rewrite the durations of *csv_file* from samples to milliseconds.

    The sample rate is read from *wav_file*. The second column of the
    header-less csv is replaced by ``value / sample_rate * 1000`` and the
    result is written back to the same csv file, so the values come out as
    floats. A csv file with no rows is left unchanged.

    Args:
        csv_file: Path of the header-less ``<class>,<duration>`` csv file;
            it is overwritten in place.
        wav_file: Path of the wav file the csv belongs to.
    """
    # Open wav file and get sample rate
    with wave.open(wav_file, 'r') as wf:
        sample_rate = wf.getframerate()

    try:
        df = pd.read_csv(csv_file, header=None)
    except pd.errors.EmptyDataError:
        # A recording without any annotated event yields an empty csv;
        # there is nothing to convert, so keep the file as it is.
        return

    df[1] = df[1] / sample_rate * 1000

    # Save processed data to the same csv file
    df.to_csv(csv_file, header=False, index=False)

# Function that processes all csv files in specified directory
def process_directory(directory):
    """Convert the durations of every csv/wav pair in *directory* to ms.

    For each ``<name>.csv`` that has a ``<name>.wav`` next to it,
    ``convert_samples_to_ms`` is called on the pair. Csv files without a
    matching wav file are skipped.

    Args:
        directory: Path of the directory to process.
    """
    for filename in os.listdir(directory):
        if not filename.endswith('.csv'):
            continue

        csv_file = os.path.join(directory, filename)
        # Swap only the trailing extension; str.replace would also rewrite a
        # '.csv' that happens to occur in the middle of the name.
        wav_file = os.path.join(directory, filename[:-len('.csv')] + '.wav')

        if os.path.exists(wav_file):
            convert_samples_to_ms(csv_file, wav_file)

# Process files in the directory ../data-master-thesis.
# The csv files are rewritten in place, so running this cell more than once
# would divide the durations by the sample rate again.
process_directory('../data-master-thesis')

# Repo dataset preparation

In [None]:
import os

# Function to map the class values
def map_class(value):
    """Translate a class id of the repo dataset into our class id.

    Their 0 (silence) becomes 2, their 1 and 3 (inhale) become 1 and their
    2 and 4 (exhalation) become 0. Any other value yields ``None``.
    """
    label_groups = (
        ((0,), 2),     # Their silence
        ((1, 3), 1),   # Their inhale
        ((2, 4), 0),   # Their exhalation
    )
    for source_labels, target in label_groups:
        if value in source_labels:
            return target
    return None

# Function that converts txt file to csv file in our wanted format
def convert_to_csv_repo(txt_file, output_file):
    """Convert a repo-dataset annotation txt file to the common csv format.

    Each useful input line has three tab-separated fields,
    ``<start>\\t<end>\\t<class>``, with start/end parsed as floats
    (presumably seconds, since the difference is multiplied by 1000 to get
    milliseconds) and the class parsed as an integer that is translated with
    ``map_class``. Lines that do not have exactly three fields are skipped.

    The output file gets one ``<class>,<duration in ms>`` row per kept line.
    Lines with a class ``map_class`` does not know, with a negative duration
    or with a NaN duration are reported with ``print`` and left out.

    Args:
        txt_file: Path of the annotation file to read.
        output_file: Path of the csv file to write (overwritten if present).

    Raises:
        ValueError: If a three-field line has a non-numeric start, end or class.
    """
    with open(txt_file, 'r') as f:
        lines = f.readlines()

    events = []

    for line in lines:
        parts = line.strip().split('\t')

        # Anything that is not "<start> <end> <class>" is not an annotation
        if len(parts) != 3:
            continue

        start, end, event = float(parts[0]), float(parts[1]), int(parts[2])

        event = map_class(event)
        if event is None:
            # Without this guard the row would be written as "None,<duration>"
            print(f"Unknown class: {parts[2]} for line: {line.strip()}")
            continue

        try:
            # round() instead of int(): float error such as 199.99999999999997
            # must become 200 ms, not be truncated to 199 ms.
            duration = round((end - start) * 1000)
        except ValueError as e:  # raised for NaN
            print(e)
            continue

        if duration < 0:
            print(f"Negative duration: {duration} for line: {line.strip()}")
            continue

        events.append((event, duration))

    with open(output_file, 'w') as f:
        for event_class, duration in events:
            f.write(f'{event_class},{duration}\n')

# Function that changes txt files to csv files in our wanted format in specified directory
def process_directory_repo(directory):
    """Convert every txt/wav pair in *directory* to numbered csv/wav pairs.

    For each ``<name>.txt`` that has a ``<name>.wav`` next to it, the txt file
    is converted with ``convert_to_csv_repo`` into ``data-repo<i>.csv`` and
    the wav file is renamed to ``data-repo<i>.wav``. ``i`` starts at 1 and
    grows by one per converted pair. Txt files without a matching wav file
    are left untouched; the txt files themselves are never removed here.

    Args:
        directory: Path of the directory to process.
    """
    i = 1  # Counter for output files

    # os.listdir returns entries in arbitrary order; sorting makes the
    # numbering of the output files reproducible between runs.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.txt'):
            continue

        txt_file = os.path.join(directory, filename)
        # Swap only the trailing extension; str.replace would also rewrite a
        # '.txt' that happens to occur in the middle of the name.
        wav_file = os.path.join(directory, filename[:-len('.txt')] + '.wav')

        if not os.path.exists(wav_file):
            continue

        print(f'Processing: {txt_file}')
        output_csv = os.path.join(directory, f'data-repo{i}.csv')
        output_wav = os.path.join(directory, f'data-repo{i}.wav')

        convert_to_csv_repo(txt_file, output_csv)
        os.rename(wav_file, output_wav)

        i += 1

# Process dataset directory: convert every txt/wav pair found in ../data-repo
# (path is relative to the current working directory)
process_directory_repo('../data-repo')

## That's all we need to prepare the dataset for training - the only thing left is to process our data (using breating_sequence_creator.py)