# 📁 02_clean_audio_segments.ipynb

# 🎯 Step 2: Clean Uploaded Audio Segments (Filter by Duration)

"""
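This notebook filters the uploaded ".wav" audio segments,
keeping only the ones whose duration is between 1 and 5 seconds (inclusive).
Files outside that range are permanently deleted from the segments folder;
files that cannot be read are reported and left in place.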
"""

In [None]:
# 📂 Mount Google Drive at a named mount point
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# 📦 Libraries
import os
import librosa

In [None]:
# 📂 Folder holding the uploaded audio segments, built from its path components
segments_folder = os.path.join("/content", "drive", "MyDrive", "audio_segments")

In [None]:
# 🧹 Filter files based on duration (1-5 seconds)
# For every ".wav" entry in segments_folder:
#   - duration within [1, 5] seconds (inclusive) -> name is recorded in filtered_files
#   - any other duration                         -> file is permanently deleted
#   - file cannot be loaded / removed            -> error is printed, file is left as-is
filtered_files = []

for filename in os.listdir(segments_folder):
    # Only ".wav" entries are inspected; everything else is ignored untouched.
    if not filename.endswith(".wav"):
        continue

    filepath = os.path.join(segments_folder, filename)
    try:
        # sr=None keeps the file's native sample rate. Only the clip length is
        # needed here, so resampling to a fixed rate would be wasted work and
        # would round the sample count, slightly skewing durations that sit
        # right on the 1 s / 5 s boundaries.
        y, sr = librosa.load(filepath, sr=None)
        duration = librosa.get_duration(y=y, sr=sr)

        if 1 <= duration <= 5:
            filtered_files.append(filename)
        else:
            os.remove(filepath)  # ❌ Delete unsuitable files (irreversible)

    except Exception as e:
        # Best-effort cleanup: one unreadable file must not abort the whole pass.
        print(f"⚠️ Error processing {filename}: {e}")

print(f"✅ Total kept files (1-5 sec): {len(filtered_files)}")