**Description of this code (note: this code is AI-generated)**

---


This is the final step in data processing. The code reads the .mp3 files produced by the Demucs separation step and uses librosa to convert all 66k* clips to log-mel spectrograms, then stacks each clip's game and streamer audio channels together. To speed things up, input and output are handled in batches, with checkpoints so that the script can resume if Colab closes unexpectedly partway through.

In [None]:
# Install the system and Python packages used by the cells below.
# The `!` prefix runs each line as a shell command from the notebook.
!apt-get update -qq
# ffmpeg is presumably the decoder behind librosa's .mp3 loading — TODO confirm.
!apt-get install -y ffmpeg
!pip install librosa numpy torch tqdm
print("‚úÖ Ready for Librosa!")

In [None]:
import os
import zipfile
import shutil
import glob
import uuid
import warnings
import numpy as np
import torch
import librosa
from tqdm import tqdm
import json
from datetime import datetime

# ================= CONFIGURATION =================
# Drive folders read by process_dataset() for the "Positive" and "Negative"
# labels, and the Drive folder under which the spectrogram shards are written.
POSITIVE_INPUT = "/content/drive/MyDrive/Separated_Audio"
NEGATIVE_INPUT = "/content/drive/MyDrive/separated_audio_negative"
OUTPUT_ROOT = "/content/drive/MyDrive/Mel_Spectrograms"

# Audio parameters used by audio_to_mel(): SAMPLE_RATE is the rate requested
# from librosa.load; N_FFT, HOP_LENGTH and N_MELS are passed straight to
# librosa.feature.melspectrogram.
SAMPLE_RATE = 22050
N_FFT = 1024
HOP_LENGTH = 512
N_MELS = 80

# Number of buffered clips that triggers writing an output shard zip.
SHARD_SIZE = 2048

# Game folders mapping
# Key: sub-folder name looked up inside each input folder.
# Value: game name used for output folders, checkpoint keys and file names.
GAME_FOLDERS = {
    "Valorant_Separated": "Valorant",
    "CS2_Separated": "CS2",
    "Apex_Separated": "Apex"
}

# Local temp folders
# TEMP_EXTRACT receives the contents of each input zip; TEMP_BUFFER collects
# the .pt files waiting to be zipped into a shard.
TEMP_EXTRACT = "/content/temp_extract"
TEMP_BUFFER = "/content/temp_buffer"
# Filled in by setup(); while it is None the checkpoint helpers do no file I/O.
CHECKPOINT_FILE = None

# Silences every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

# ================= SETUP =================
def setup():
    global CHECKPOINT_FILE

    print("üöÄ Mel Spectrogram Converter")
    print("=" * 60)

    # Create output folders
    for label in ["Positive", "Negative"]:
        for game in GAME_FOLDERS.values():
            folder = os.path.join(OUTPUT_ROOT, label, game)
            os.makedirs(folder, exist_ok=True)
            print(f"üìÅ {label}/{game}")

    # Temp folders
    os.makedirs(TEMP_EXTRACT, exist_ok=True)
    os.makedirs(TEMP_BUFFER, exist_ok=True)

    CHECKPOINT_FILE = os.path.join(OUTPUT_ROOT, "mel_checkpoint.json")

    print("=" * 60)
    return True

# ================= CHECKPOINT =================
def load_checkpoint():
    """Load the converter checkpoint, or build a fresh one.

    A fresh checkpoint is returned when ``CHECKPOINT_FILE`` is unset, the
    file does not exist, or the file cannot be read or parsed. In the last
    case a warning is printed so a silent restart from zero is noticeable.
    Keys missing from a loaded checkpoint are filled in with their defaults.

    Returns:
        dict with the keys ``processed_zips`` (list of zip keys),
        ``shard_counts`` (label -> game -> shard count), ``total_processed``
        and ``total_errors``.
    """
    fresh = {
        "processed_zips": [],
        # Derived from GAME_FOLDERS so the keys always match what
        # process_dataset() looks up.
        "shard_counts": {
            label: {game: 0 for game in GAME_FOLDERS.values()}
            for label in ("Positive", "Negative")
        },
        "total_processed": 0,
        "total_errors": 0
    }

    if not (CHECKPOINT_FILE and os.path.exists(CHECKPOINT_FILE)):
        return fresh

    try:
        with open(CHECKPOINT_FILE, 'r') as f:
            cp = json.load(f)
    except (OSError, ValueError) as e:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
        print(f"‚ö†Ô∏è Checkpoint unreadable ({e}); starting from scratch")
        return fresh

    if not isinstance(cp, dict):
        print("‚ö†Ô∏è Checkpoint has an unexpected format; starting from scratch")
        return fresh

    # Backfill anything an older or hand-edited checkpoint lacks.
    for key, default in fresh.items():
        cp.setdefault(key, default)
    for label, games in fresh["shard_counts"].items():
        per_label = cp["shard_counts"].setdefault(label, {})
        for game in games:
            per_label.setdefault(game, 0)

    print(f"üì• Checkpoint: {cp.get('total_processed', 0)} clips done")
    return cp

def save_checkpoint(cp):
    """Persist the checkpoint dict to ``CHECKPOINT_FILE`` (best effort).

    The data is written to a sibling ``.tmp`` file and then moved over the
    real checkpoint, so an interruption mid-write cannot leave a truncated
    checkpoint behind. Failures are reported but never raised, because a
    missed save must not abort the conversion.

    Args:
        cp: Checkpoint dict; a ``last_save`` timestamp is added to it.
    """
    if not CHECKPOINT_FILE:
        return

    cp["last_save"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    tmp_path = CHECKPOINT_FILE + ".tmp"
    try:
        with open(tmp_path, 'w') as f:
            json.dump(cp, f, indent=2)
        # Swap the finished file in; the previous checkpoint stays intact
        # until this point.
        os.replace(tmp_path, CHECKPOINT_FILE)
    except (OSError, TypeError, ValueError) as e:
        print(f"‚ö†Ô∏è Could not save checkpoint: {e}")
        try:
            os.remove(tmp_path)
        except OSError:
            pass

# ================= AUDIO PROCESSING =================
def audio_to_mel(audio_path):
    """Convert an audio file to a log mel spectrogram tensor.

    The file is loaded as mono at ``SAMPLE_RATE``, turned into a mel
    spectrogram with ``N_FFT`` / ``HOP_LENGTH`` / ``N_MELS``, and
    log-scaled as ``log(mel + 1e-9)``.

    Args:
        audio_path: Path of the audio file to load.

    Returns:
        Float tensor of shape (1, n_mels, time), or None when the file
        holds no samples or any step fails (the failure is reported).
    """
    try:
        # Load audio
        y, sr = librosa.load(audio_path, sr=SAMPLE_RATE, mono=True)

        if len(y) == 0:
            return None

        # Create mel spectrogram
        mel_spec = librosa.feature.melspectrogram(
            y=y,
            sr=sr,
            n_fft=N_FFT,
            hop_length=HOP_LENGTH,
            n_mels=N_MELS
        )

        # Convert to tensor and log scale; the epsilon keeps log() finite
        # where the spectrogram is exactly zero.
        mel_tensor = torch.from_numpy(mel_spec).float()
        log_mel = torch.log(mel_tensor + 1e-9)

        # Shape: (1, n_mels, time)
        return log_mel.unsqueeze(0)

    except Exception as e:
        # One bad clip must not stop the batch, but say which one failed
        # instead of dropping it silently.
        tqdm.write(f"   ‚ùå Audio error ({audio_path}): {e}")
        return None

def process_clip(clip_folder, temp_vocals, temp_no_vocals):
    """Build the spectrograms for one clip folder.

    The folder must contain both ``vocals.mp3`` and ``no_vocals.mp3``.
    ``temp_vocals`` and ``temp_no_vocals`` are accepted but not used.

    Returns: dict with 'vocals' and 'no_vocals' tensors, or None when a
    file is missing or either conversion fails.
    """
    tracks = (
        ("vocals", os.path.join(clip_folder, "vocals.mp3")),
        ("no_vocals", os.path.join(clip_folder, "no_vocals.mp3")),
    )

    # Both tracks are required.
    if not all(os.path.exists(track_path) for _, track_path in tracks):
        return None

    # Convert both tracks before looking at either result.
    spectrograms = {key: audio_to_mel(track_path) for key, track_path in tracks}

    if any(spec is None for spec in spectrograms.values()):
        return None

    return spectrograms

# ================= SHARDING =================
def flush_buffer(buffer_path, output_folder, shard_id):
    """Zip all .pt files in the buffer into one shard.

    The archive is first written under a ``.part`` name and only renamed
    to ``shard_<id>_<n>clips.zip`` once complete, so a failed or
    interrupted write never leaves a truncated ``*.zip`` in the output
    folder. Buffered files are deleted only after the shard is in place.

    Args:
        buffer_path: Folder holding the buffered .pt files.
        output_folder: Folder that receives the shard zip.
        shard_id: Number used in the shard's file name.

    Returns:
        True when a shard was written, False when the buffer was empty or
        writing failed (buffered files are then left untouched).
    """
    # Sorted so the archive layout does not depend on directory order.
    files = sorted(glob.glob(os.path.join(buffer_path, "*.pt")))
    if not files:
        return False

    zip_name = f"shard_{shard_id}_{len(files)}clips.zip"
    zip_path = os.path.join(output_folder, zip_name)
    partial_path = zip_path + ".part"

    try:
        with zipfile.ZipFile(partial_path, 'w', zipfile.ZIP_DEFLATED) as zf:
            for f in files:
                zf.write(f, os.path.basename(f))
        os.replace(partial_path, zip_path)
    except Exception as e:
        tqdm.write(f"   ‚ùå Shard error: {e}")
        # Drop the incomplete archive; the buffer still holds every clip.
        try:
            if os.path.exists(partial_path):
                os.remove(partial_path)
        except OSError:
            pass
        return False

    # Cleanup buffer (the shard is safely written at this point)
    for f in files:
        try:
            os.remove(f)
        except OSError as e:
            tqdm.write(f"   ‚ùå Could not remove {os.path.basename(f)}: {e}")

    size_mb = os.path.getsize(zip_path) / (1024**2)
    tqdm.write(f"   üì¶ Shard {shard_id}: {len(files)} clips ({size_mb:.1f} MB)")
    return True

# ================= MAIN PROCESSING =================
def _commit_buffer(game_buffer, output_folder, checkpoint, label, game_name,
                   pending_zips, pending_clips):
    """Write the buffered clips to a shard, then record their source zips.

    The zips are added to ``checkpoint["processed_zips"]`` only after the
    shard has been written, so an interruption can never mark a zip as done
    while its clips exist only in the local buffer.

    Returns:
        True when progress was recorded, False when the shard could not be
        written (nothing is recorded and the buffer is left for a retry).
    """
    if pending_clips > 0:
        shard_id = checkpoint["shard_counts"][label][game_name]
        if not flush_buffer(game_buffer, output_folder, shard_id):
            return False
        checkpoint["shard_counts"][label][game_name] = shard_id + 1

    checkpoint["processed_zips"].extend(pending_zips)
    checkpoint["total_processed"] += pending_clips
    save_checkpoint(checkpoint)
    return True


def process_dataset(input_folder, label, checkpoint):
    """Process all games in a dataset folder (Positive or Negative).

    For every game in ``GAME_FOLDERS`` each batch zip is extracted, every
    clip folder holding ``vocals.mp3`` and ``no_vocals.mp3`` is converted
    with ``process_clip`` and saved as a .pt file in a local buffer, and
    the buffer is zipped into shards in ``OUTPUT_ROOT/<label>/<game>``.

    Resume safety: shards are written at batch-zip boundaries, once at
    least ``SHARD_SIZE`` clips are buffered, and a zip is marked processed
    only after the shard containing its clips has been written. A shard may
    therefore hold somewhat more than ``SHARD_SIZE`` clips, but an
    interrupted run neither loses nor duplicates clips when it is resumed.

    Args:
        input_folder: Folder containing one sub-folder per game.
        label: "Positive" or "Negative"; used for output paths and
            checkpoint keys.
        checkpoint: Checkpoint dict, updated in place and saved as work
            completes.
    """

    print(f"\n{'=' * 60}")
    print(f"üéÆ Processing {label} Dataset")
    print(f"üìÇ {input_folder}")
    print(f"{'=' * 60}")

    if not os.path.exists(input_folder):
        print(f"‚ùå Folder not found: {input_folder}")
        return

    total_clips = 0
    total_errors = 0
    # Set copy for fast "already done?" checks; the list stored in the
    # checkpoint remains what gets saved.
    done_zips = set(checkpoint["processed_zips"])

    # Process each game folder
    for game_folder, game_name in GAME_FOLDERS.items():
        game_path = os.path.join(input_folder, game_folder)

        if not os.path.exists(game_path):
            print(f"‚ö†Ô∏è {game_name}: folder not found, skipping")
            continue

        # Find all batch zips
        batch_zips = sorted(glob.glob(os.path.join(game_path, "*.zip")))

        if not batch_zips:
            print(f"‚ö†Ô∏è {game_name}: no zip files found")
            continue

        print(f"\nüéÆ {game_name}: {len(batch_zips)} batch zips")

        # Output folder for this game
        output_folder = os.path.join(OUTPUT_ROOT, label, game_name)
        os.makedirs(output_folder, exist_ok=True)

        # Buffer for this game. Anything left in it belongs to zips that
        # were never marked processed, so starting empty loses nothing.
        game_buffer = os.path.join(TEMP_BUFFER, f"{label}_{game_name}")
        if os.path.exists(game_buffer):
            shutil.rmtree(game_buffer)
        os.makedirs(game_buffer)

        extract_path = os.path.join(TEMP_EXTRACT, f"{label}_{game_name}")

        buffer_count = 0      # clips in the buffer, not yet in a shard
        pending_zips = []     # fully read zips whose clips are still buffered
        game_clips = 0        # clips that made it into a shard
        game_errors = 0

        # Process each batch zip
        for batch_zip in tqdm(batch_zips, desc=f"[{game_name}]"):
            zip_key = f"{label}_{game_name}_{os.path.basename(batch_zip)}"

            # Skip if already processed
            if zip_key in done_zips:
                continue

            # Extract batch zip into a clean folder
            if os.path.exists(extract_path):
                shutil.rmtree(extract_path)
            os.makedirs(extract_path)

            written = []      # .pt files produced from this zip
            zip_ok = False

            try:
                with zipfile.ZipFile(batch_zip, 'r') as zf:
                    zf.extractall(extract_path)

                # Find all clip folders (folders containing both tracks)
                clip_folders = [
                    root
                    for root, _dirs, files in os.walk(extract_path)
                    if "vocals.mp3" in files and "no_vocals.mp3" in files
                ]

                tqdm.write(f"   üìÇ {os.path.basename(batch_zip)}: {len(clip_folders)} clips")

                # Process each clip
                for clip_folder in clip_folders:
                    clip_name = os.path.basename(clip_folder)

                    result = process_clip(clip_folder, None, None)
                    if result is None:
                        game_errors += 1
                        continue

                    # Save as single .pt file with both tensors; the random
                    # suffix keeps equal clip names from different zips apart.
                    pt_filename = f"{game_name}_{clip_name}_{uuid.uuid4().hex[:8]}.pt"
                    pt_path = os.path.join(game_buffer, pt_filename)

                    # Registered before saving so a half-written file is
                    # also cleaned up if the save fails.
                    written.append(pt_path)
                    torch.save({
                        "vocals": result["vocals"],
                        "no_vocals": result["no_vocals"],
                        "clip_name": clip_name,
                        "game": game_name,
                        "label": label
                    }, pt_path)

                zip_ok = True

            except zipfile.BadZipFile:
                tqdm.write(f"   ‚ùå Corrupted zip: {os.path.basename(batch_zip)}")
                game_errors += 1
            except Exception as e:
                tqdm.write(f"   ‚ùå Error: {e}")
                game_errors += 1
            finally:
                # Cleanup extract folder
                if os.path.exists(extract_path):
                    shutil.rmtree(extract_path, ignore_errors=True)

            if not zip_ok:
                # The zip will be retried on a later run; remove its partial
                # output so the retry does not produce duplicates.
                for stale in written:
                    if os.path.exists(stale):
                        os.remove(stale)
                continue

            buffer_count += len(written)
            pending_zips.append(zip_key)

            # Commit when the buffer is full, or right away when this zip
            # contributed nothing that could be lost.
            if buffer_count == 0 or buffer_count >= SHARD_SIZE:
                if _commit_buffer(game_buffer, output_folder, checkpoint,
                                  label, game_name, pending_zips, buffer_count):
                    done_zips.update(pending_zips)
                    game_clips += buffer_count
                    pending_zips = []
                    buffer_count = 0

        # Flush remaining buffer
        if pending_zips:
            if _commit_buffer(game_buffer, output_folder, checkpoint,
                              label, game_name, pending_zips, buffer_count):
                done_zips.update(pending_zips)
                game_clips += buffer_count
            else:
                print(f"‚ö†Ô∏è {game_name}: final shard not written; "
                      f"{len(pending_zips)} zips will be retried on the next run")

        # Cleanup game buffer
        if os.path.exists(game_buffer):
            shutil.rmtree(game_buffer, ignore_errors=True)

        shard_id = checkpoint["shard_counts"][label][game_name]
        print(f"   ‚úÖ {game_name}: {game_clips} clips, {shard_id} shards" +
              (f", {game_errors} errors" if game_errors else ""))

        total_clips += game_clips
        total_errors += game_errors

    # total_processed is already updated shard by shard in _commit_buffer.
    checkpoint["total_errors"] += total_errors
    save_checkpoint(checkpoint)

    print(f"\n‚úÖ {label} Complete: {total_clips} clips processed")

# ================= MAIN =================
def run_pipeline():
    """Run the mel spectrogram conversion from start to finish.

    Prints the settings, prepares folders via ``setup``, loads the
    checkpoint, processes the Positive and then the Negative dataset,
    removes the local temp folders and prints a per-game shard summary.
    """
    rule = "=" * 60

    print(rule)
    print("üéµ MEL SPECTROGRAM CONVERTER")
    print(rule)
    print(f"‚è∞ Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"üéµ Sample Rate: {SAMPLE_RATE} Hz")
    print(f"üéµ Mel Bins: {N_MELS}")
    print(f"üì¶ Shard Size: {SHARD_SIZE} clips")
    print(rule)

    if not setup():
        return

    checkpoint = load_checkpoint()

    # Positive first, then Negative.
    for source_dir, dataset_label in ((POSITIVE_INPUT, "Positive"),
                                      (NEGATIVE_INPUT, "Negative")):
        process_dataset(source_dir, dataset_label, checkpoint)

    # Remove local scratch folders.
    for scratch_dir in (TEMP_EXTRACT, TEMP_BUFFER):
        if os.path.exists(scratch_dir):
            shutil.rmtree(scratch_dir, ignore_errors=True)

    # Summary
    print("\n" + rule)
    print("üéâ COMPLETE!")
    print(rule)
    print(f"‚úÖ Total Clips: {checkpoint['total_processed']}")
    print(f"‚ùå Total Errors: {checkpoint['total_errors']}")
    print(f"‚è∞ Finished: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    print("\nüìä Shard Summary:")
    for dataset_label in ("Positive", "Negative"):
        print(f"\n   {dataset_label}:")
        for game_name in GAME_FOLDERS.values():
            shard_total = checkpoint["shard_counts"][dataset_label][game_name]
            game_dir = os.path.join(OUTPUT_ROOT, dataset_label, game_name)
            if not os.path.exists(game_dir):
                continue
            shard_paths = glob.glob(os.path.join(game_dir, "*.zip"))
            size_mb = sum(os.path.getsize(p) for p in shard_paths) / (1024**2)
            print(f"      {game_name}: {shard_total} shards ({size_mb:.1f} MB)")

# ================= RUN =================
# Start the conversion only when this code runs as the main module.
if __name__ == "__main__":
    run_pipeline()

In [None]:
import os
import zipfile
import shutil
import glob
import torch
from tqdm import tqdm
import json
from datetime import datetime

# ================= CONFIGURATION =================

# INPUT_ROOT is searched for shard zips under <label>/<game>; the stacked
# shards are written to the same layout under OUTPUT_ROOT.
INPUT_ROOT = "/content/drive/MyDrive/Mel_Spectrograms"
OUTPUT_ROOT = "/content/drive/MyDrive/Stacked_Tensors"

# Number of buffered clips that triggers writing an output shard zip.
SHARD_SIZE = 2048

# Structure
# Every LABELS x GAMES combination is one input/output folder pair.
LABELS = ["Positive", "Negative"]
GAMES = ["Valorant", "CS2", "Apex"]

# Temp folders
# TEMP_EXTRACT receives the contents of each input shard; TEMP_BUFFER holds
# the stacked .pt files waiting to be zipped.
TEMP_EXTRACT = "/content/temp_stack_extract"
TEMP_BUFFER = "/content/temp_stack_buffer"
# Filled in by setup(); while it is None the checkpoint helpers do no file I/O.
CHECKPOINT_FILE = None

# ================= SETUP =================
def setup():
    global CHECKPOINT_FILE

    print("üîó Tensor Stacker")
    print("=" * 60)

    # Create output folders
    for label in LABELS:
        for game in GAMES:
            folder = os.path.join(OUTPUT_ROOT, label, game)
            os.makedirs(folder, exist_ok=True)

    os.makedirs(TEMP_EXTRACT, exist_ok=True)
    os.makedirs(TEMP_BUFFER, exist_ok=True)

    CHECKPOINT_FILE = os.path.join(OUTPUT_ROOT, "stack_checkpoint.json")

    print(f"üìÇ Input: {INPUT_ROOT}")
    print(f"üìÇ Output: {OUTPUT_ROOT}")
    print("=" * 60)
    return True

# ================= CHECKPOINT =================
def load_checkpoint():
    """Load the stacker checkpoint, or build a fresh one.

    A fresh checkpoint is returned when ``CHECKPOINT_FILE`` is unset, the
    file does not exist, or it cannot be read or parsed (a warning is
    printed in that case). Keys missing from a loaded checkpoint are filled
    in with their defaults so later lookups cannot fail on them.

    Returns:
        dict with the keys ``processed_shards`` (list of shard keys),
        ``shard_counts`` (label -> game -> shard count) and
        ``total_processed``.
    """
    fresh = {
        "processed_shards": [],
        "shard_counts": {label: {game: 0 for game in GAMES} for label in LABELS},
        "total_processed": 0
    }

    if not (CHECKPOINT_FILE and os.path.exists(CHECKPOINT_FILE)):
        return fresh

    try:
        with open(CHECKPOINT_FILE, 'r') as f:
            cp = json.load(f)
    except (OSError, ValueError) as e:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
        print(f"‚ö†Ô∏è Checkpoint unreadable ({e}); starting from scratch")
        return fresh

    if not isinstance(cp, dict):
        print("‚ö†Ô∏è Checkpoint has an unexpected format; starting from scratch")
        return fresh

    # Backfill anything an older or hand-edited checkpoint lacks.
    for key, default in fresh.items():
        cp.setdefault(key, default)
    for label, games in fresh["shard_counts"].items():
        per_label = cp["shard_counts"].setdefault(label, {})
        for game in games:
            per_label.setdefault(game, 0)

    return cp

def save_checkpoint(cp):
    """Write the checkpoint dict to ``CHECKPOINT_FILE``.

    The data goes to a sibling ``.tmp`` file first and is then moved over
    the real checkpoint, so a failed or interrupted write leaves the
    previous checkpoint intact. Does nothing while ``CHECKPOINT_FILE`` is
    unset.

    Args:
        cp: Checkpoint dict; a ``last_save`` timestamp is added to it.

    Raises:
        OSError, TypeError, ValueError: If writing or serialising fails.
    """
    if not CHECKPOINT_FILE:
        return

    cp["last_save"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    tmp_path = CHECKPOINT_FILE + ".tmp"
    try:
        with open(tmp_path, 'w') as f:
            json.dump(cp, f, indent=2)
        os.replace(tmp_path, CHECKPOINT_FILE)
    except BaseException:
        # Do not leave a half-written temp file behind, then let the
        # caller see the original error.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise

# ================= STACKING =================
def _pad_time_axis(spec, target_len, pad_value):
    """Extend *spec* along its last axis to *target_len* using *pad_value*."""
    deficit = target_len - spec.shape[-1]
    if deficit <= 0:
        return spec
    # new_full keeps the dtype and device of the spectrogram being padded.
    filler = spec.new_full((*spec.shape[:-1], deficit), pad_value)
    return torch.cat([spec, filler], dim=-1)


def stack_tensor(data, pad_value=0.0):
    """Stack vocals and no_vocals into a single tensor (2, n_mels, time).

    When the two spectrograms differ in length, the shorter one is padded
    at the end of its time axis to match the longer one.

    Args:
        data: Mapping with "vocals" and "no_vocals" tensors, each shaped
            (1, n_mels, time).
        pad_value: Fill value for the padded frames. Defaults to 0.0, the
            value this function has always used.
            NOTE(review): the inputs are log-scaled (log(mel + 1e-9)), so
            0.0 is not silence in that domain; the log floor is about
            -20.7. Confirm which value the model should see.

    Returns:
        Tensor of shape (2, n_mels, time): channel 0 is vocals, channel 1
        is no_vocals.
    """
    vocals = data["vocals"]        # (1, n_mels, time)
    no_vocals = data["no_vocals"]  # (1, n_mels, time)

    target_len = max(vocals.shape[-1], no_vocals.shape[-1])
    vocals = _pad_time_axis(vocals, target_len, pad_value)
    no_vocals = _pad_time_axis(no_vocals, target_len, pad_value)

    # Stack along the channel axis: (2, n_mels, time)
    return torch.cat([vocals, no_vocals], dim=0)

def flush_buffer(buffer_path, output_folder, shard_id):
    """Zip all .pt files in the buffer into one stacked shard.

    The archive is written under a ``.part`` name and renamed to
    ``stacked_shard_<id>_<n>clips.zip`` only once complete, so a failed
    write never leaves a truncated ``*.zip`` in the output folder.
    Buffered files are deleted only after the shard is in place.

    Args:
        buffer_path: Folder holding the buffered .pt files.
        output_folder: Folder that receives the shard zip.
        shard_id: Number used in the shard's file name.

    Returns:
        True when a shard was written, False when the buffer was empty.

    Raises:
        OSError (and other errors from zipfile): If writing the shard
        fails; the buffer is left untouched in that case.
    """
    # Sorted so the archive layout does not depend on directory order.
    files = sorted(glob.glob(os.path.join(buffer_path, "*.pt")))
    if not files:
        return False

    zip_name = f"stacked_shard_{shard_id}_{len(files)}clips.zip"
    zip_path = os.path.join(output_folder, zip_name)
    partial_path = zip_path + ".part"

    try:
        with zipfile.ZipFile(partial_path, 'w', zipfile.ZIP_DEFLATED) as zf:
            for f in files:
                zf.write(f, os.path.basename(f))
        os.replace(partial_path, zip_path)
    except BaseException:
        # Drop the incomplete archive, then report the original error.
        try:
            if os.path.exists(partial_path):
                os.remove(partial_path)
        except OSError:
            pass
        raise

    for f in files:
        os.remove(f)

    size_mb = os.path.getsize(zip_path) / (1024**2)
    tqdm.write(f"   üì¶ Shard {shard_id}: {len(files)} clips ({size_mb:.1f} MB)")
    return True

# ================= MAIN PROCESSING =================
def _commit_stacked_buffer(game_buffer, output_folder, checkpoint, label, game,
                           pending_shards, pending_clips):
    """Write the buffered clips to a shard, then record their input shards.

    Input shards are added to ``checkpoint["processed_shards"]`` only after
    the output shard has been written, so an interruption can never mark an
    input as done while its clips exist only in the local buffer.

    Returns:
        True when progress was recorded, False when the shard could not be
        written (nothing is recorded and the buffer is left for a retry).
    """
    if pending_clips > 0:
        shard_id = checkpoint["shard_counts"][label][game]
        try:
            flushed = flush_buffer(game_buffer, output_folder, shard_id)
        except Exception as e:
            tqdm.write(f"   ‚ùå Shard error: {e}")
            flushed = False
        if not flushed:
            return False
        checkpoint["shard_counts"][label][game] = shard_id + 1

    checkpoint["processed_shards"].extend(pending_shards)
    checkpoint["total_processed"] += pending_clips
    try:
        save_checkpoint(checkpoint)
    except Exception as e:
        # The shard itself is written; the in-memory checkpoint is current
        # and will be persisted by the next successful save.
        tqdm.write(f"   ‚ùå Checkpoint save failed: {e}")
    return True


def process_all():
    """Stack every clip of every input shard and re-shard the results.

    For each label/game folder under ``INPUT_ROOT`` the shard zips are
    extracted one at a time, each .pt file is loaded, its two spectrograms
    are combined with ``stack_tensor`` and the result is saved to a local
    buffer that is zipped into shards under ``OUTPUT_ROOT``.

    Resume safety: output shards are written at input-shard boundaries,
    once at least ``SHARD_SIZE`` clips are buffered, and an input shard is
    marked processed only after the output shard holding its clips has been
    written. An output shard may therefore hold more than ``SHARD_SIZE``
    clips, but an interrupted run neither loses nor duplicates clips.

    Returns:
        Number of clips stacked and written during this call. The
        checkpoint's ``total_processed`` accumulates across runs.
    """
    checkpoint = load_checkpoint()
    total = 0
    # Set copy for fast "already done?" checks; the list stored in the
    # checkpoint remains what gets saved.
    done_shards = set(checkpoint["processed_shards"])

    for label in LABELS:
        print(f"\n{'=' * 60}")
        print(f"üìÅ {label}")
        print(f"{'=' * 60}")

        for game in GAMES:
            input_folder = os.path.join(INPUT_ROOT, label, game)
            output_folder = os.path.join(OUTPUT_ROOT, label, game)

            if not os.path.exists(input_folder):
                print(f"‚ö†Ô∏è {game}: not found, skipping")
                continue

            # Find input shards
            input_shards = sorted(glob.glob(os.path.join(input_folder, "*.zip")))

            if not input_shards:
                print(f"‚ö†Ô∏è {game}: no shards found")
                continue

            print(f"\nüéÆ {game}: {len(input_shards)} input shards")

            # Buffer for this game. Anything left in it belongs to shards
            # that were never marked processed, so starting empty is safe.
            game_buffer = os.path.join(TEMP_BUFFER, f"{label}_{game}")
            if os.path.exists(game_buffer):
                shutil.rmtree(game_buffer)
            os.makedirs(game_buffer)

            extract_path = os.path.join(TEMP_EXTRACT, f"{label}_{game}")

            buffer_count = 0       # clips in the buffer, not yet in a shard
            pending_shards = []    # fully read inputs whose clips are buffered
            game_total = 0         # clips that made it into an output shard

            for shard_zip in tqdm(input_shards, desc=f"[{game}]"):
                shard_key = f"{label}_{game}_{os.path.basename(shard_zip)}"

                if shard_key in done_shards:
                    continue

                # Extract shard into a clean folder
                if os.path.exists(extract_path):
                    shutil.rmtree(extract_path)
                os.makedirs(extract_path)

                written = []       # stacked files produced from this shard
                shard_ok = False

                try:
                    with zipfile.ZipFile(shard_zip, 'r') as zf:
                        zf.extractall(extract_path)

                    # Process each .pt file
                    for pt_file in glob.glob(os.path.join(extract_path, "*.pt")):
                        stem = os.path.splitext(os.path.basename(pt_file))[0]
                        out_path = os.path.join(game_buffer, f"{stem}_stacked.pt")

                        try:
                            # NOTE: weights_only=False unpickles arbitrary
                            # objects; only feed it shards produced by this
                            # pipeline.
                            data = torch.load(pt_file, weights_only=False)

                            # Stack tensors
                            stacked = stack_tensor(data)

                            # Save stacked tensor
                            torch.save({
                                "tensor": stacked,  # (2, 80, time)
                                "clip_name": data.get("clip_name", ""),
                                "game": game,
                                "label": label
                            }, out_path)
                        except Exception as e:
                            # Skip the clip, but say so, and make sure no
                            # half-written file reaches a shard.
                            tqdm.write(f"   ‚ùå Skipped {os.path.basename(pt_file)}: {e}")
                            if os.path.exists(out_path):
                                os.remove(out_path)
                            continue

                        written.append(out_path)

                    shard_ok = True

                except Exception as e:
                    tqdm.write(f"   ‚ùå Error: {e}")

                finally:
                    if os.path.exists(extract_path):
                        shutil.rmtree(extract_path, ignore_errors=True)

                if not shard_ok:
                    # The shard will be retried on a later run; remove its
                    # partial output so the retry does not create duplicates.
                    for stale in written:
                        if os.path.exists(stale):
                            os.remove(stale)
                    continue

                buffer_count += len(written)
                pending_shards.append(shard_key)

                # Commit when the buffer is full, or right away when this
                # shard contributed nothing that could be lost.
                if buffer_count == 0 or buffer_count >= SHARD_SIZE:
                    if _commit_stacked_buffer(game_buffer, output_folder, checkpoint,
                                              label, game, pending_shards, buffer_count):
                        done_shards.update(pending_shards)
                        game_total += buffer_count
                        pending_shards = []
                        buffer_count = 0

            # Flush remaining
            if pending_shards:
                if _commit_stacked_buffer(game_buffer, output_folder, checkpoint,
                                          label, game, pending_shards, buffer_count):
                    done_shards.update(pending_shards)
                    game_total += buffer_count
                else:
                    print(f"‚ö†Ô∏è {game}: final shard not written; "
                          f"{len(pending_shards)} input shards will be retried on the next run")

            # Cleanup
            if os.path.exists(game_buffer):
                shutil.rmtree(game_buffer, ignore_errors=True)

            shard_id = checkpoint["shard_counts"][label][game]
            print(f"   ‚úÖ {game}: {game_total} clips stacked, {shard_id} shards")
            total += game_total

    # total_processed is accumulated shard by shard in
    # _commit_stacked_buffer, so a resumed run no longer overwrites it.
    save_checkpoint(checkpoint)

    return total

# ================= RUN =================
def run_pipeline():
    """Run the tensor stacker from start to finish.

    Prints a banner, prepares folders via ``setup``, stacks everything
    with ``process_all``, removes the local temp folders and prints the
    number and total size of the shard zips found in each output folder.
    """
    rule = "=" * 60

    print(rule)
    print("üîó TENSOR STACKER")
    print(rule)
    print(f"‚è∞ Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"üìê Output shape: (2, 80, time)")
    print(f"   Channel 0: vocals")
    print(f"   Channel 1: no_vocals")
    print(rule)

    setup()
    total = process_all()

    # Remove local scratch folders.
    for scratch_dir in (TEMP_EXTRACT, TEMP_BUFFER):
        if os.path.exists(scratch_dir):
            shutil.rmtree(scratch_dir, ignore_errors=True)

    print("\n" + rule)
    print("üéâ COMPLETE!")
    print(rule)
    print(f"‚úÖ Total stacked: {total} clips")
    print(f"‚è∞ Finished: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    # Summary: only folders that exist and contain at least one zip are listed.
    print("\nüìä Output Summary:")
    for label in LABELS:
        print(f"\n   {label}:")
        for game in GAMES:
            game_dir = os.path.join(OUTPUT_ROOT, label, game)
            if not os.path.exists(game_dir):
                continue
            shard_paths = glob.glob(os.path.join(game_dir, "*.zip"))
            if not shard_paths:
                continue
            size_mb = sum(os.path.getsize(p) for p in shard_paths) / (1024**2)
            print(f"      {game}: {len(shard_paths)} shards ({size_mb:.1f} MB)")

# Start the stacker only when this code runs as the main module.
if __name__ == "__main__":
    run_pipeline()

In [None]:
# FAST VERSION - Process locally, not on Drive

import os
import zipfile
import glob
import torch
import io
import shutil
from tqdm import tqdm

# Shard zips are read from INPUT_ROOT/<label>/<game> on Drive and the stacked
# shards are written to OUTPUT_ROOT/<label>/<game>. All zip reading and
# writing happens in the LOCAL_* folders; only whole files are copied to and
# from Drive.
INPUT_ROOT = "/content/drive/MyDrive/Mel_Spectrograms"
OUTPUT_ROOT = "/content/drive/MyDrive/Stacked_Tensors"
LOCAL_INPUT = "/content/local_input"
LOCAL_OUTPUT = "/content/local_output"
SHARD_SIZE = 2048

LABELS = ["Positive", "Negative"]
GAMES = ["Valorant", "CS2", "Apex"]


def _pad_to_length(spec, target_len):
    """Zero-pad *spec* along its last axis up to *target_len* frames."""
    deficit = target_len - spec.shape[-1]
    if deficit <= 0:
        return spec
    # Sized from the tensor itself rather than a hard-coded 80 mel bins.
    filler = spec.new_zeros((*spec.shape[:-1], deficit))
    return torch.cat([spec, filler], dim=-1)


def _write_shard(items, shard_id, local_out, output_folder):
    """Zip *items* locally, copy the zip to Drive, then drop the local copy.

    Each item is serialised with torch.save and stored in the archive as
    ``clip_<shard_id>_<index>.pt``. Errors are not caught here: a shard that
    cannot be written should stop the run instead of being skipped silently.
    """
    local_shard = os.path.join(local_out, f"stacked_shard_{shard_id}.zip")
    with zipfile.ZipFile(local_shard, 'w', zipfile.ZIP_DEFLATED) as ozf:
        for i, item in enumerate(items):
            pt_bytes = io.BytesIO()
            torch.save(item, pt_bytes)
            ozf.writestr(f"clip_{shard_id}_{i}.pt", pt_bytes.getvalue())

    # Copy to Drive
    shutil.copy(local_shard, os.path.join(output_folder, f"stacked_shard_{shard_id}_{len(items)}clips.zip"))
    os.remove(local_shard)

    tqdm.write(f"      üì¶ Shard {shard_id}: {len(items)} clips")


print("üîó ULTRA-FAST TENSOR STACKER (Local Processing)")
print("=" * 50)

total = 0

for label in LABELS:
    print(f"\nüìÅ {label}")

    for game in GAMES:
        input_folder = os.path.join(INPUT_ROOT, label, game)
        output_folder = os.path.join(OUTPUT_ROOT, label, game)
        local_in = os.path.join(LOCAL_INPUT, label, game)
        local_out = os.path.join(LOCAL_OUTPUT, label, game)

        if not os.path.exists(input_folder):
            continue

        input_zips = sorted(glob.glob(os.path.join(input_folder, "*.zip")))
        if not input_zips:
            continue

        print(f"\n   üéÆ {game}: {len(input_zips)} shards")

        # Create local folders
        os.makedirs(local_in, exist_ok=True)
        os.makedirs(local_out, exist_ok=True)
        os.makedirs(output_folder, exist_ok=True)

        all_stacked = []
        shard_id = 0

        for zip_path in tqdm(input_zips, desc=f"   [{game}]"):
            # Copy to local
            local_zip = os.path.join(local_in, os.path.basename(zip_path))
            shutil.copy(zip_path, local_zip)

            try:
                with zipfile.ZipFile(local_zip, 'r') as zf:
                    pt_files = [n for n in zf.namelist() if n.endswith('.pt')]

                    for pt_name in pt_files:
                        # Only loading and stacking are treated as
                        # skippable per-clip failures.
                        try:
                            with zf.open(pt_name) as f:
                                # NOTE: weights_only=False unpickles
                                # arbitrary objects; only feed it shards
                                # produced by this pipeline.
                                data = torch.load(io.BytesIO(f.read()), weights_only=False)

                            vocals = data["vocals"]
                            no_vocals = data["no_vocals"]

                            # Pad the shorter track so both share one length
                            max_t = max(vocals.shape[-1], no_vocals.shape[-1])
                            stacked = torch.cat(
                                [_pad_to_length(vocals, max_t), _pad_to_length(no_vocals, max_t)],
                                dim=0,
                            )
                            clip_name = data.get("clip_name", "")
                        except Exception as e:
                            tqdm.write(f"      ‚ùå Skipped {pt_name}: {e}")
                            continue

                        all_stacked.append({
                            "tensor": stacked,
                            "clip_name": clip_name,
                            "game": game,
                            "label": label
                        })

                        # Flush shard locally, then copy to Drive. Kept
                        # outside the per-clip try so a failed write is not
                        # swallowed and retried on every following clip.
                        if len(all_stacked) >= SHARD_SIZE:
                            _write_shard(all_stacked, shard_id, local_out, output_folder)
                            total += len(all_stacked)
                            all_stacked = []
                            shard_id += 1
            except zipfile.BadZipFile:
                tqdm.write(f"      ‚ùå Corrupted zip: {os.path.basename(zip_path)}")
            finally:
                # Cleanup local zip, also when something above failed
                if os.path.exists(local_zip):
                    os.remove(local_zip)

        # Flush remaining
        if all_stacked:
            _write_shard(all_stacked, shard_id, local_out, output_folder)
            total += len(all_stacked)

# Cleanup
shutil.rmtree(LOCAL_INPUT, ignore_errors=True)
shutil.rmtree(LOCAL_OUTPUT, ignore_errors=True)

print(f"\n‚úÖ Done! Total: {total} clips stacked")

In [None]:
import os
import gc
from google.colab import drive

import shutil  # needed for the cleanup below; this cell's imports lack it

MOUNT_POINT = '/content/drive'

gc.collect() # Clean up memory and potentially release file handles

# Check if Drive is already mounted and unmount if necessary
if os.path.ismount(MOUNT_POINT):
    print("Google Drive is currently mounted. Attempting to unmount...")
    try:
        drive.flush_and_unmount()
        print("Google Drive unmounted successfully.")
    except Exception as e:
        print(f"Warning: Error during unmount: {e}.")

if os.path.ismount(MOUNT_POINT):
    # SAFETY: the unmount did not take effect. Deleting anything below the
    # mount point now would delete the user's real Drive files, so the
    # cleanup is skipped and only a forced remount is attempted.
    print("'/content/drive' is still mounted. Skipping directory cleanup to protect Drive contents.")
else:
    # Ensure the mount point directory is absolutely empty
    if os.path.exists(MOUNT_POINT):
        print("'/content/drive' directory exists. Clearing its contents...")
        # Removes the directory with everything in it, hidden entries
        # included.
        shutil.rmtree(MOUNT_POINT, ignore_errors=True)

    # Recreate a fresh, empty mount point directory
    os.makedirs(MOUNT_POINT, exist_ok=True)
    print("'/content/drive' directory prepared.")

    # Verify it's empty before attempting to mount
    if os.listdir(MOUNT_POINT):
        print(f"Critical Error: '/content/drive' is NOT empty before mount: {os.listdir(MOUNT_POINT)}")
    else:
        print("'/content/drive' is verified empty before mounting.")

# Attempt to mount Google Drive
print("Attempting to mount Google Drive...")
drive.mount(MOUNT_POINT, force_remount=True)
print("Google Drive mounted successfully!")