In [1]:
import os
import pandas as pd

def _parse_segment_folder(folder_name):
    """Return ``(duration, overlap)`` parsed from a segment folder name.

    The name is split on ``'_'``; the second token must be a decimal
    integer (the duration) and the folder counts as an overlap variant when
    any token equals ``'overlap'``. Returns ``None`` when the name does not
    have that shape.
    """
    parts = folder_name.split('_')
    # isdecimal() rather than isdigit(): isdigit() also accepts characters
    # such as superscript digits, which int() rejects with ValueError.
    if len(parts) < 2 or not parts[1].isdecimal():
        return None
    return int(parts[1]), 'overlap' in parts


def create_duration_csvs(original_segmented_dir='Datasets',
                         csv_output_dir=os.path.join('final_dataset', 'CSVs')):
    """Merge per-segment CSV files into one CSV per duration/overlap group.

    Walks ``<original_segmented_dir>/<class>/<segment_folder>/*.csv``, puts
    each file into one of six groups (5s, 10s, 15s, each with and without
    overlap) based on the segment folder name, concatenates the contents of
    every file in a group, and writes ``<group>.csv`` into
    ``csv_output_dir``. Groups with no input files produce no output file.

    Each row gets four extra columns describing where it came from:
    ``filename``, ``class`` (the class directory name), ``duration`` and
    ``overlap``. Columns with those names already present in an input CSV
    are overwritten.

    Args:
        original_segmented_dir: Root directory holding one sub-directory per
            class. Defaults to ``'Datasets'``.
        csv_output_dir: Directory the combined CSVs are written to; created
            if it does not exist. Defaults to ``'final_dataset/CSVs'``.

    Returns:
        A dict mapping each group name that was written (e.g. ``'5s'``,
        ``'10s_overlap'``) to the path of its output CSV.

    Raises:
        FileNotFoundError: If ``original_segmented_dir`` does not exist
            (raised by ``os.listdir``).
    """
    os.makedirs(csv_output_dir, exist_ok=True)

    # The six target groups; folders that map to any other key are ignored.
    duration_groups = {
        '5s': [], '5s_overlap': [],
        '10s': [], '10s_overlap': [],
        '15s': [], '15s_overlap': [],
    }

    # Phase 1: scan for CSV paths + metadata. Listings are sorted because
    # os.listdir() returns entries in arbitrary order, which would otherwise
    # make the concatenation order depend on the filesystem.
    for class_name in sorted(os.listdir(original_segmented_dir)):
        class_path = os.path.join(original_segmented_dir, class_name)
        if not os.path.isdir(class_path):
            continue

        for segmented_folder in sorted(os.listdir(class_path)):
            folder_path = os.path.join(class_path, segmented_folder)
            if not os.path.isdir(folder_path):
                continue

            parsed = _parse_segment_folder(segmented_folder)
            if parsed is None:
                continue
            duration, overlap = parsed
            group_key = f"{duration}s{'_overlap' if overlap else ''}"
            if group_key not in duration_groups:
                continue

            for file in sorted(os.listdir(folder_path)):
                if file.endswith('.csv'):
                    duration_groups[group_key].append({
                        'path': os.path.join(folder_path, file),
                        'filename': file,
                        'class': class_name,
                        'duration': duration,
                        'overlap': overlap
                    })

    # Phase 2: read, tag and concatenate the actual CSV contents per group.
    written = {}
    for group_name, entries in duration_groups.items():
        dfs = []
        for entry in entries:
            df = pd.read_csv(entry['path'])
            df['filename'] = entry['filename']
            df['class'] = entry['class']
            df['duration'] = entry['duration']
            df['overlap'] = entry['overlap']
            dfs.append(df)
        if not dfs:
            # pd.concat() raises on an empty list, and an empty group has
            # nothing to write anyway.
            continue

        combined = pd.concat(dfs, ignore_index=True)
        combined = combined.sort_values(by=['class', 'filename'])
        out_path = os.path.join(csv_output_dir, f"{group_name}.csv")
        combined.to_csv(out_path, index=False)
        written[group_name] = out_path
        print(f"Created {group_name}.csv with {len(combined)} rows")

    return written

if __name__ == "__main__":
    # Script entry point: build the grouped CSVs, then print the closing
    # summary. The summary text is fixed and always names all six files.
    create_duration_csvs()
    closing_lines = (
        "\nProcess completed! 6 CSV files created:",
        "5s.csv, 5s_overlap.csv, 10s.csv, 10s_overlap.csv, 15s.csv, 15s_overlap.csv",
    )
    print(*closing_lines, sep="\n")


Created 5s.csv with 16200 rows
Created 5s_overlap.csv with 32400 rows
Created 10s.csv with 16200 rows
Created 10s_overlap.csv with 30600 rows
Created 15s.csv with 16200 rows
Created 15s_overlap.csv with 29700 rows

Process completed! 6 CSV files created:
5s.csv, 5s_overlap.csv, 10s.csv, 10s_overlap.csv, 15s.csv, 15s_overlap.csv
