# 📁 05_split_final_data.ipynb

# 🎯 Step 5: Copy Final Audio Files and Split into Train/Test Sets

"""
This notebook:
1. Selects audio files based on balanced_data.csv.
2. Copies them into a new folder called 'audio_final'.
3. Splits them into train and test sets (stratified by label) and copies the audio into 'audio_train' and 'audio_test'.
4. Creates separate CSV files for train and test.
Test set size: ~40% of total samples.
"""

In [None]:
# 📂 Mount Google Drive
# Every path used later in this notebook lives under this mount point.
from google.colab import drive

drive.mount(mountpoint="/content/drive")

In [None]:
# 📦 Libraries
import os
import shutil
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# 📂 Paths
# Input: the CSV listing the selected files and the folder holding them.
csv_path = "/content/drive/MyDrive/balanced_data.csv"
segments_folder = "/content/drive/MyDrive/audio_segments"

# Output: destination folder for the selected audio files.
output_final_folder = "/content/drive/MyDrive/audio_final"

# exist_ok=True makes re-running this cell safe when the folder already exists.
os.makedirs(
    output_final_folder,
    exist_ok=True,
)

In [None]:
# 📖 Load balanced CSV
df = pd.read_csv(csv_path)

# 📋 Copy selected files
# Remember which filenames have no audio file in `segments_folder`, so the
# final message is accurate and `df` can be restricted to files that exist.
missing_selected_files = []
for filename in df['filename']:
    src = os.path.join(segments_folder, filename)
    dst = os.path.join(output_final_folder, filename)
    if os.path.exists(src):
        shutil.copy(src, dst)
    else:
        missing_selected_files.append(filename)
        print(f"⚠️ File not found: {filename}")

if missing_selected_files:
    # Drop rows whose audio is missing: later cells build the train/test
    # split and CSVs from `df`, and copying a missing file would raise.
    df = df[~df['filename'].isin(missing_selected_files)]
    print(
        f"⚠️ {len(missing_selected_files)} file(s) not found and removed from the "
        f"selection; {len(df)} row(s) remain, copied to {output_final_folder}"
    )
else:
    print(f"✅ All selected files copied to {output_final_folder}")

In [None]:
# ✂️ Split into train and test
# Stratify on the label column so both splits keep the same class proportions;
# the fixed random_state makes the 60/40 split reproducible across runs.
labels = df['label']
train_df, test_df = train_test_split(
    df,
    test_size=0.4,
    stratify=labels,
    random_state=42,
)

print(f"✅ Train samples: {len(train_df)}")
print(f"✅ Test samples: {len(test_df)}")

In [None]:
# 📂 Create folders
# One destination folder per split; both are created if they do not exist yet.
train_folder = "/content/drive/MyDrive/audio_train"
test_folder = "/content/drive/MyDrive/audio_test"

for split_folder in (train_folder, test_folder):
    os.makedirs(split_folder, exist_ok=True)

In [None]:
# 📋 Copy files to train and test folders
# Both splits are copied the same way, so walk (split frame, destination) pairs
# instead of repeating the loop. A missing source file is reported and skipped,
# matching the earlier copy step, rather than aborting the cell mid-copy.
missing_split_files = []
for split_df, split_folder in ((train_df, train_folder), (test_df, test_folder)):
    for filename in split_df['filename']:
        src = os.path.join(output_final_folder, filename)
        dst = os.path.join(split_folder, filename)
        if os.path.exists(src):
            shutil.copy(src, dst)
        else:
            missing_split_files.append(filename)
            print(f"⚠️ File not found: {filename}")

# Only claim success when every listed file was actually copied.
if missing_split_files:
    print(f"⚠️ {len(missing_split_files)} file(s) could not be copied to the train/test folders.")
else:
    print("✅ Train and Test audio files copied.")

In [None]:
# 💾 Save CSVs
# Each split is written next to the audio folders; index=False keeps the
# DataFrame index out of the file.
csv_outputs = (
    (train_df, "/content/drive/MyDrive/train.csv"),
    (test_df, "/content/drive/MyDrive/test.csv"),
)
for split_df, split_csv_path in csv_outputs:
    split_df.to_csv(split_csv_path, index=False)

print("✅ train.csv and test.csv created.")