In [None]:
!pip install ultralytics

In [None]:
# ==========================================
# 1. SETUP & IMPORTS
# ==========================================
import glob
import json
import os
import re
import subprocess
import sys
import time
import warnings
from collections import defaultdict

import cv2
import pandas as pd
import torch
from tqdm import tqdm

# Suppress every FutureWarning for the rest of the run.
warnings.simplefilter(action='ignore', category=FutureWarning)

# ==========================================
# 2. AUTHENTICATION (SERVICE ACCOUNT)
# ==========================================
# Path to the uploaded service-account key file.
# NOTE: Ensure this file is uploaded to your current folder in Workbench!
KEY_FILE = 'colab-upload-bot-key.json'

if os.path.exists(KEY_FILE):
    print("üîê Authenticating with Service Account Key...")
    # os.system returns the command's exit status; report a failure here
    # instead of letting it show up later as an unexplained download error.
    _auth_status = os.system(f'gcloud auth activate-service-account --key-file="{KEY_FILE}"')
    if _auth_status != 0:
        print(f"‚ö†Ô∏è WARNING: gcloud authentication failed (exit status {_auth_status}).")
        print("   Downloads from GCS may fail unless this VM is already logged in to gcloud.")
else:
    print(f"‚ö†Ô∏è WARNING: Key file '{KEY_FILE}' not found.")
    print("   If you are already logged in to gcloud on this VM, you can ignore this.")

# ==========================================
# 3. DOWNLOAD DATA (RELATIVE PATHS)
# ==========================================
# GCS configuration: the bucket, the remote image folder and the remote
# annotation file.
BUCKET_NAME = 'vis-data-2025'
GCS_TRAIN_DIR = f'gs://{BUCKET_NAME}/trainsm'
GCS_JSON_URL = f'gs://{BUCKET_NAME}/train.json'

# Local paths, all relative to the directory the notebook is run from.
LOCAL_BASE_DIR = './data_local'
LOCAL_TRAIN_DIR = os.path.join(LOCAL_BASE_DIR, 'trainsm')
LOCAL_JSON_PATH = './train.json'

def download_with_progress(gcs_src, local_dest_folder):
    """Copy a GCS folder to disk with ``gsutil`` while showing a progress bar.

    The source folder is copied into the *parent* of ``local_dest_folder`` so
    that gsutil itself creates the final directory. This only yields
    ``local_dest_folder`` when its last path component has the same name as
    the last component of ``gcs_src`` (e.g. ``.../trainsm`` for both).

    Args:
        gcs_src: ``gs://`` URL of the folder to copy.
        local_dest_folder: Local path where the copied folder should end up.

    Returns:
        True when ``gsutil cp`` exits with status 0; False when it exits with
        any other status or when an exception occurs along the way.
    """
    fallback_total = 1000  # Progress-bar size used when the listing is unusable
    tail_size = 5          # Trailing stderr lines kept for the failure report

    try:
        # Only the parent directory is created up front. Creating the leaf
        # folder here would leave an empty directory behind after a failed
        # download, which an "already exists" check would then mistake for
        # downloaded data. `or '.'` covers a destination without a parent part.
        parent_dir = os.path.dirname(os.path.normpath(local_dest_folder)) or '.'
        os.makedirs(parent_dir, exist_ok=True)

        # 1. Count files. Blank lines and lines ending in ':' (directory
        #    headers in the recursive listing) are not objects, so skip them.
        print(f"üîç Counting files in {gcs_src}...")
        listing = subprocess.run(
            ['gsutil', 'ls', '-r', gcs_src],
            capture_output=True,
            text=True,
        )
        total_files = sum(
            1
            for entry in listing.stdout.splitlines()
            if entry.strip() and not entry.rstrip().endswith(':')
        )
        if listing.returncode != 0 or total_files == 0:
            total_files = fallback_total  # Fallback if count fails

        print(f"üì¶ Found ~{total_files} files. Starting download to {local_dest_folder}...")

        # 2. Start download. An argument list (no shell) keeps paths with
        #    spaces or shell metacharacters intact.
        process = subprocess.Popen(
            ['gsutil', '-m', 'cp', '-r', gcs_src, parent_dir],
            stderr=subprocess.PIPE,
            text=True,
        )

        stderr_tail = []
        # The context manager closes the bar even if reading stderr fails.
        with tqdm(total=total_files, unit='file', desc='Downloading') as pbar:
            for line in process.stderr:
                stderr_tail.append(line)
                if len(stderr_tail) > tail_size:
                    stderr_tail.pop(0)
                # Each stderr line mentioning "Copying" is counted as one file.
                if "Copying" in line:
                    pbar.update(1)

        process.wait()

        if process.returncode == 0:
            print("‚úÖ Download Complete.")
            return True

        print("‚ùå Download command failed.")
        # Print the last few lines of error output to help debug. The tail is
        # simply empty when gsutil wrote nothing to stderr.
        print("Last error output:", "".join(stderr_tail).strip())
        return False

    except Exception as e:
        # Best-effort boundary: any failure (e.g. gsutil not installed) is
        # reported and signalled through the return value instead of raising.
        print(f"‚ùå Error during download: {e}")
        return False

print("\nüöÄ STARTING DATA SETUP...")

# A. Annotation JSON: fetched only when there is no local copy yet.
if os.path.exists(LOCAL_JSON_PATH):
    print("‚úÖ Annotations already present.")
else:
    print("‚¨áÔ∏è Downloading annotations...")
    os.system(f'gsutil cp {GCS_JSON_URL} {LOCAL_JSON_PATH}')

# B. Training folder: fetched only when the local folder is missing.
if os.path.exists(LOCAL_TRAIN_DIR):
    print("‚úÖ Training data already exists locally.")
else:
    # The base directory must exist before the download starts.
    os.makedirs(LOCAL_BASE_DIR, exist_ok=True)
    download_with_progress(GCS_TRAIN_DIR, LOCAL_TRAIN_DIR)

# ==========================================
# 4. CONFIGURATION
# ==========================================
# Inputs and output of the evaluation run.
TRAIN_DIR = LOCAL_TRAIN_DIR
ANNOTATION_FILE = LOCAL_JSON_PATH
OUTPUT_CSV_PATH = './metrics/baseline_gpu.csv'  # Relative to the current folder

# Detector settings.
MODEL_NAME = 'yolov5l6'
IMG_SIZE = 1280      # Passed as `size` on every inference call
CONF_THRESH = 0.05   # Assigned to model.conf below
IOU_THRESH = 0.45    # Minimum IoU for a prediction to match a ground-truth box

print(f"‚è≥ Loading High-Res Model: {MODEL_NAME}...")
try:
    model = torch.hub.load(
        'ultralytics/yolov5',
        MODEL_NAME,
        pretrained=True,
        force_reload=False,
    )
    model.conf = CONF_THRESH
    model.classes = [14]  # Class 14 = Bird

    # Use the GPU when one is available, otherwise fall back to the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if device.type == 'cuda':
        print("‚úÖ Model Loaded on GPU.")
    else:
        print("‚ö†Ô∏è Model Loaded on CPU.")

    model.to(device)
except Exception as e:
    print(f"‚ùå Model Load Error: {e}")

# ==========================================
# 5. HELPER FUNCTIONS
# ==========================================
def load_json_ground_truth(json_path):
    """Load an annotation JSON file and index its boxes by image file name.

    The file must be a JSON object with an ``images`` list (entries carrying
    ``id`` and ``file_name``) and may have an ``annotations`` list (entries
    carrying ``image_id`` and ``bbox``). The layout resembles COCO; the bbox
    values are stored untouched (presumably ``[x, y, w, h]`` — confirm against
    the dataset).

    Args:
        json_path: Path to the annotation file.

    Returns:
        A dict mapping each image's file name, with one leading ``train/``
        removed, to the list of its bboxes (an empty list for images without
        annotations). An empty dict is returned, after printing an error, when
        the file is missing, cannot be parsed, or has no ``images`` entry.
    """
    if not os.path.exists(json_path):
        print(f"‚ùå Error: Annotation file not found at {json_path}")
        return {}

    print(f"üìÇ Parsing annotations from: {json_path}...")
    try:
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except Exception as e:
        print(f"‚ùå JSON Parse Error: {e}")
        return {}

    # Treat a structurally wrong file like the other failures above instead of
    # letting a KeyError/TypeError escape to the caller.
    if not isinstance(data, dict) or 'images' not in data:
        print(f"‚ùå Error: No 'images' list found in {json_path}")
        return {}

    id_to_filename = {img['id']: img['file_name'] for img in data['images']}

    img_id_to_boxes = defaultdict(list)
    for ann in data.get('annotations', []):
        img_id_to_boxes[ann['image_id']].append(ann['bbox'])

    prefix = 'train/'
    filename_to_gt = {}
    for img_id, filename in id_to_filename.items():
        # Drop a single leading "train/" from the stored file name.
        key = filename[len(prefix):] if filename.startswith(prefix) else filename
        # .get() avoids inserting empty entries into the defaultdict.
        filename_to_gt[key] = img_id_to_boxes.get(img_id, [])

    print(f"‚úÖ Loaded GT for {len(filename_to_gt)} images.")
    return filename_to_gt

def calculate_iou(box1, box2):
    """Return the intersection-over-union of two ``(x, y, w, h)`` boxes.

    Each box is unpacked as top-left corner plus width and height. The result
    is ``0`` when the union area is not positive.
    """
    left_a, top_a, width_a, height_a = box1
    left_b, top_b, width_b, height_b = box2

    # Extent of the overlap along each axis; negative means no overlap.
    overlap_w = min(left_a + width_a, left_b + width_b) - max(left_a, left_b)
    overlap_h = min(top_a + height_a, top_b + height_b) - max(top_a, top_b)
    intersection = max(0, overlap_w) * max(0, overlap_h)

    union = (width_a * height_a) + (width_b * height_b) - intersection
    if union > 0:
        return intersection / union
    return 0
    
def get_next_version_path(path):
    """Return a path that does not overwrite an existing file.

    If ``path`` does not exist it is returned unchanged. Otherwise the result
    is ``<name>_<N><ext>`` in the same directory, where ``N`` is one more than
    the highest numbered sibling already present.
    Example: 'data.csv' -> 'data_1.csv' -> 'data_2.csv'

    The parent directory of ``path`` is created if it is missing, so the
    returned path can be written to immediately.

    Args:
        path: Desired output file path.

    Returns:
        ``path`` itself, or the next free versioned variant of it.
    """
    directory, filename = os.path.split(path)
    name, ext = os.path.splitext(filename)

    # Create the directory BEFORE the early return below: the case that needs
    # it is precisely the one where neither the file nor its folder exists yet.
    if directory:
        os.makedirs(directory, exist_ok=True)

    # If the file doesn't exist yet, simply return the original path.
    if not os.path.exists(path):
        return path

    # Matches: exact_name + underscore + digits + exact_extension,
    # e.g. "baseline_gpu_123.csv".
    pattern = re.compile(rf"^{re.escape(name)}_(\d+){re.escape(ext)}$")

    # Highest version number already present (0 when there is none).
    matches = (pattern.match(entry) for entry in os.listdir(directory or '.'))
    max_version = max((int(m.group(1)) for m in matches if m), default=0)

    return os.path.join(directory, f"{name}_{max_version + 1}{ext}")
    
# ==========================================
# 6. MAIN PIPELINE
# ==========================================
def _precision_recall_f1(tp, fp, fn):
    """Return (precision, recall, F1) for the counts, using 0 for undefined ratios."""
    prec = tp / (tp + fp) if (tp + fp) > 0 else 0
    rec = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * (prec * rec) / (prec + rec) if (prec + rec) > 0 else 0
    return prec, rec, f1


def _match_predictions(preds, gts):
    """Greedily match predicted boxes to ground-truth boxes; return (tp, fp, fn)."""
    tp = fp = 0
    matched_gt = set()

    for p_box in preds:
        best_iou = 0
        best_gt_idx = -1
        for idx, g_box in enumerate(gts):
            if idx in matched_gt:
                continue  # Each ground-truth box may be matched only once
            iou = calculate_iou(p_box, g_box)
            if iou > best_iou:
                best_iou = iou
                best_gt_idx = idx

        # The index check keeps -1 out of matched_gt if the threshold is ever
        # set to 0 or below, which would otherwise under-count false negatives.
        if best_gt_idx >= 0 and best_iou >= IOU_THRESH:
            tp += 1
            matched_gt.add(best_gt_idx)
        else:
            fp += 1

    return tp, fp, len(gts) - len(matched_gt)


def run_baseline_evaluation():
    """Run the detector over every video folder and report detection metrics.

    For each sub-folder of ``TRAIN_DIR`` the ``*.jpg`` frames are run through
    the global ``model``, predictions are matched against the ground truth
    loaded from ``ANNOTATION_FILE``, and per-video FPS / precision / recall /
    F1 are printed. Per-video rows plus an ``OVERALL`` row are written to a
    versioned CSV derived from ``OUTPUT_CSV_PATH``.

    Returns nothing. It returns early, after printing a message, when no
    ground truth could be loaded or no video folders are found.

    Note: frames that ``cv2.imread`` cannot read are skipped entirely (their
    ground-truth boxes are not counted) but are still included in the frame
    count used for FPS.
    """
    gt_data = load_json_ground_truth(ANNOTATION_FILE)
    if not gt_data:
        return

    start_time = time.time()

    # Every sub-directory of the training folder is treated as one video.
    video_folders = [
        entry
        for entry in sorted(glob.glob(os.path.join(TRAIN_DIR, '*')))
        if os.path.isdir(entry)
    ]

    if not video_folders:
        print(f"‚ùå No video folders found in {TRAIN_DIR}.")
        print(f"   (Current working dir: {os.getcwd()})")
        return

    print(f"üìÇ Found {len(video_folders)} videos locally.")

    total_tp = total_fp = total_fn = total_time_sec = total_frames = 0
    results_data = []

    print(f"\n{'Video':<10} | {'Frames':<6} | {'FPS':<6} | {'Prec':<6} | {'Recall':<6} | {'F1':<6}")
    print("-" * 65)

    for video_path in video_folders:
        video_name = os.path.basename(video_path)
        images = sorted(glob.glob(os.path.join(video_path, '*.jpg')))

        if not images:
            continue

        vid_tp = vid_fp = vid_fn = 0
        vid_start = time.time()
        n_frames = len(images)

        for i, img_path in enumerate(images):
            # In-place progress line, refreshed every 10th frame.
            if i % 10 == 0:
                percent = ((i + 1) / n_frames) * 100
                sys.stdout.write(f"\rüëâ Processing [{video_name}] Frame {i+1}/{n_frames} ({percent:.1f}%)")
                sys.stdout.flush()

            img = cv2.imread(img_path)
            if img is None:
                continue

            # cv2.imread yields BGR, while the YOLOv5 hub model expects RGB for
            # numpy inputs; without the conversion the colour channels are swapped.
            results = model(cv2.cvtColor(img, cv2.COLOR_BGR2RGB), size=IMG_SIZE)

            # Convert (x1, y1, x2, y2, conf, cls) rows to (x, y, w, h) boxes.
            # .cpu() is a no-op for tensors that already live on the CPU.
            preds = [
                [x1, y1, x2 - x1, y2 - y1]
                for x1, y1, x2, y2, _conf, _cls in results.xyxy[0].cpu().numpy()
            ]

            # Ground truth is keyed as "<video folder>/<frame file name>".
            lookup_key = f"{video_name}/{os.path.basename(img_path)}"
            frame_tp, frame_fp, frame_fn = _match_predictions(preds, gt_data.get(lookup_key, []))
            vid_tp += frame_tp
            vid_fp += frame_fp
            vid_fn += frame_fn

        vid_time = time.time() - vid_start
        vid_fps = n_frames / vid_time if vid_time > 0 else 0

        total_time_sec += vid_time
        total_frames += n_frames
        total_tp += vid_tp
        total_fp += vid_fp
        total_fn += vid_fn

        prec, rec, f1 = _precision_recall_f1(vid_tp, vid_fp, vid_fn)

        # Wipe the progress line before printing the table row.
        sys.stdout.write("\r" + " " * 80 + "\r")
        print(f"{video_name:<10} | {n_frames:<6} | {vid_fps:<6.1f} | {prec:<6.2f} | {rec:<6.2f} | {f1:<6.2f}")

        results_data.append({
            'Video': video_name,
            'Frames': n_frames,
            'FPS': round(vid_fps, 2),
            'Precision': round(prec, 4),
            'Recall': round(rec, 4),
            'F1': round(f1, 4),
            'TP': vid_tp, 'FP': vid_fp, 'FN': vid_fn
        })

    print("=" * 65)
    avg_fps = total_frames / total_time_sec if total_time_sec > 0 else 0
    overall_prec, overall_rec, overall_f1 = _precision_recall_f1(total_tp, total_fp, total_fn)

    print("FINAL RESULTS (Small Subset):")
    print(f"Total Frames:   {total_frames}")
    print(f"Average FPS:    {avg_fps:.2f}")
    print(f"Precision:      {overall_prec:.4f}")
    print(f"Recall:         {overall_rec:.4f}")
    print(f"F1-Score:       {overall_f1:.4f}")
    print("=" * 65)

    df = pd.DataFrame(results_data)
    overall_row = {
        'Video': 'OVERALL', 'Frames': total_frames, 'FPS': round(avg_fps, 2),
        'Precision': round(overall_prec, 4), 'Recall': round(overall_rec, 4),
        'F1': round(overall_f1, 4), 'TP': total_tp, 'FP': total_fp, 'FN': total_fn
    }
    df = pd.concat([df, pd.DataFrame([overall_row])], ignore_index=True)

    final_path = get_next_version_path(OUTPUT_CSV_PATH)
    # Make sure the output folder exists so a long run is not lost at the
    # very last step because of a missing directory.
    output_dir = os.path.dirname(final_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    df.to_csv(final_path, index=False)
    print(f"‚úÖ CSV Saved: {final_path}")

    elapsed_time = time.time() - start_time
    print(f"‚è±Ô∏è Process took: {elapsed_time:.2f} seconds ({elapsed_time/60:.2f} minutes)")
    

# Start the evaluation only when this file is executed directly, not when it
# is imported as a module.
if __name__ == "__main__":
    run_baseline_evaluation()