# ABOC Cloud Data Preparation

This notebook processes the SemanticKITTI dataset on Google Colab and saves the output directly to Google Cloud Storage (GCS).
This avoids using disk space on your own machine (each sequence is only staged temporarily on the Colab VM) and uses Colab's cloud CPUs for processing.

**Prerequisites:**
1.  **Raw Data on GCS**: You must upload the raw SemanticKITTI `sequences` folder to your GCS bucket first.
    *   Expected Structure: `gs://YOUR_BUCKET/raw/sequences/00/velodyne/*.bin`
2.  **GCS Bucket**: A bucket to store the processed `.fb` files.

**Workflow:**
1.  Downloads a single sequence (e.g., `00`) from GCS to the Colab VM.
2.  Runs `dataPrepare` to generate FlatBuffers.
3.  Uploads the result to GCS (`gs://YOUR_BUCKET/data`).
4.  Deletes local files to free up space for the next sequence.

In [None]:
# 1. Setup & Auth
from google.colab import auth
auth.authenticate_user()
print("Authenticated.")

# Install dependencies
!pip install -q flatbuffers gcsfs

import os
import glob
import shutil
import time

# Clone Repo for dataPrepare code
if not os.path.exists('ABOC'):
    !git clone https://github.com/michaelnutt2/ABOC.git

# Add to path
import sys
sys.path.append('/content/ABOC')
from Preparedata.data import dataPrepare

In [None]:
# 2. Configuration
# ------------------------------------------
# Edit these three values to match your GCS layout.
BUCKET_NAME = "mtn_fb_file_bucket"  # Bucket used for both the raw input and the processed output
RAW_PREFIX = "raw/sequences"        # Folder in the bucket that holds the raw sequence folders
OUTPUT_PREFIX = "data"              # Folder in the bucket that receives the processed files
# ------------------------------------------

# Every SemanticKITTI sequence id as a zero-padded string: "00" through "21".
SEQUENCES = [f"{index:02d}" for index in range(22)]

# To try a single sequence first, uncomment:
# SEQUENCES = ["00"]

In [None]:
# 3. Processing Loop

def process_sequence(seq_id):
    print(f"\n=== Processing Sequence {seq_id} ===")
    
    local_raw_dir = f"/content/raw/{seq_id}"
    local_out_dir = f"/content/output/"
    
    # A. Clean Start
    if os.path.exists(local_raw_dir):
        shutil.rmtree(local_raw_dir)
    if os.path.exists(local_out_dir):
        shutil.rmtree(local_out_dir)
    os.makedirs(local_raw_dir)
    os.makedirs(local_out_dir)
    
    # B. Download Raw Sequence
    gcs_raw_path = f"gs://{BUCKET_NAME}/{RAW_PREFIX}/{seq_id}/velodyne/"
    print(f"Downloading {gcs_raw_path}...")
    
    # Use gsutil -m cp for parallel download
    start_t = time.time()
    !gsutil -m cp -r "{gcs_raw_path}*" "{local_raw_dir}/"
    
    # Verify download
    bin_files = glob.glob(f"{local_raw_dir}/*.bin")
    print(f"Downloaded {len(bin_files)} files in {time.time()-start_t:.1f}s")
    
    if not bin_files:
        print(f"Warning: No files found for sequence {seq_id}. Skipping.")
        return
        
    # C. Run DataPrepare
    print("Running Conversion...")
    # dataPrepare expects path to .bin and outputs to saveMatDir
    # It usually creates 'Folder/Name', so we might need to tweak paths
    # We iterate and process
    
    # We process in python loop (can parallelize if Colab has cores)
    # Simple loop for stability
    count = 0
    t0 = time.time()
    
    for f in bin_files:
        try:
            # Setup output naming relative to sequence
            # dataPrepare arguments:
            # file: absolute path to .bin
            # saveMatDir: base output directory
            # ptNamePrefix: prefix for filename (e.g. 'Kitti_00')
            
            # Construct prefix to include sequence folder roughly
            # dataPrepare typically does: saveMatDir + ptNamePrefix + filename
            # We want: output/Kitti_00_XXXX.fb
            
            # Note: We need to pass the folder param or handle naming carefuly
            # Looking at your dataPrepare.py, it uses 'folder' arg if passed to batch, or assumes structure?
            # Let's use the explicit call:
            
            dataPrepare(
                f, 
                saveMatDir=local_out_dir, 
                ptNamePrefix=f"Kitti_{seq_id}_", # e.g. Kitti_00_
                offset='min', 
                qs=2/(2**12-1), 
                normalize=True
            )
            count += 1
            if count % 100 == 0:
                print(f"Processed {count}/{len(bin_files)}...")
                
        except Exception as e:
            print(f"Error on {f}: {e}")
            
    print(f"Converted {count} files in {time.time()-t0:.1f}s")
    
    # D. Upload Results
    gcs_dest = f"gs://{BUCKET_NAME}/{OUTPUT_PREFIX}/"
    print(f"Uploading to {gcs_dest}...")
    !gsutil -m cp -r "{local_out_dir}*" "{gcs_dest}"
    
    # E. Cleanup
    print("Cleaning up local disk...")
    shutil.rmtree('/content/raw')
    shutil.rmtree('/content/output')
    print(f"Sequence {seq_id} Complete!\n")

# Run All: process every configured sequence in order, one at a time.
for seq in SEQUENCES:
    process_sequence(seq)