In [None]:
import os
import shutil
import zipfile
from tqdm import tqdm
from google.colab import files

# === STEP 1: Upload the ZIP files ===
# Colab upload dialog; the archives are opened below by the fixed names
# images.zip and labels.zip, relative to the current working directory.
uploaded = files.upload()

# === STEP 2 + 3: Create the extraction folders, then unzip into them ===
# Maps each expected archive to the folder it is unpacked into.
extraction_plan = {
    "images.zip": "dataset/images",
    "labels.zip": "dataset/labels",
}

# All target folders are created first, before any archive is opened.
for target_dir in extraction_plan.values():
    os.makedirs(target_dir, exist_ok=True)

for archive_name, target_dir in extraction_plan.items():
    with zipfile.ZipFile(archive_name, 'r') as zip_ref:
        zip_ref.extractall(target_dir)

print("✅ Files successfully extracted!")

# === STEP 4: Helper to find files ===
def find_files_by_extension(root_dir, extension):
    """Recursively collect files under ``root_dir`` whose name ends with ``extension``.

    Args:
        root_dir: Directory to walk; all subdirectories are visited.
        extension: Suffix handed to ``str.endswith`` (the comparison is
            case-sensitive, so ".jpg" does not match "x.JPG").

    Returns:
        A list of paths, each the containing directory joined with the file
        name, in the order ``os.walk`` yields them.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(root_dir)
        for name in names
        if name.endswith(extension)
    ]

# === STEP 5: Get and match image-label files ===
image_paths = find_files_by_extension("dataset/images", ".jpg")
label_paths = find_files_by_extension("dataset/labels", ".txt")

# Index both sets by base filename (directory and extension stripped).
# Files that share a base name in different subfolders collapse into one
# entry, with the path seen last winning.
image_dict = {}
for image_file in image_paths:
    image_stem, _image_ext = os.path.splitext(os.path.basename(image_file))
    image_dict[image_stem] = image_file

label_dict = {}
for label_file in label_paths:
    label_stem, _label_ext = os.path.splitext(os.path.basename(label_file))
    label_dict[label_stem] = label_file

# Base names present on both sides
valid_keys = sorted(image_dict.keys() & label_dict.keys())
print(f"{len(valid_keys)} matched image-label pairs found out of {len(image_dict)} images and {len(label_dict)} labels.")

# === STEP 6: Remove images with no label ===
# Deletes the image file from disk; image_dict itself is left untouched.
for image_key, image_file in tqdm(image_dict.items(), desc="Cleaning unmatched images"):
    if image_key in label_dict:
        continue
    os.remove(image_file)





Saving images.zip to images.zip
Saving labels.zip to labels.zip
✅ Files successfully extracted!
✅ 878 matched image-label pairs found out of 893 images and 878 labels.


Cleaning unmatched images: 100%|██████████| 893/893 [00:00<00:00, 812652.09it/s]


In [2]:
import shutil

# Some archives unpack into an extra nested folder (dataset/images/images,
# dataset/labels/labels). When such a folder exists, every entry inside it
# is moved one level up into the main folder.
nested_image_dir = "dataset/images/images"
nested_label_dir = "dataset/labels/labels"

for nested_dir, main_dir in (
    (nested_image_dir, "dataset/images"),
    (nested_label_dir, "dataset/labels"),
):
    if not os.path.exists(nested_dir):
        continue
    for entry in os.listdir(nested_dir):
        shutil.move(os.path.join(nested_dir, entry), main_dir)


In [3]:
# Delete the nested folders that the flattening step emptied.
# os.rmdir only removes an empty directory, so this raises if anything
# is still left inside one of them.
for leftover_dir in ("dataset/images/images", "dataset/labels/labels"):
    if os.path.exists(leftover_dir):
        os.rmdir(leftover_dir)


In [None]:
# Class balance check: each label file contributes at most one count, taken
# from the class id (first field) of its first non-blank line.
labels_path = "dataset/labels"

tumor_count = 0
no_tumor_count = 0

for label_file in os.listdir(labels_path):
    label_file_path = os.path.join(labels_path, label_file)
    if not os.path.isfile(label_file_path):
        continue

    with open(label_file_path, 'r') as f:
        # Fields of the first non-blank line, or None if the file has none
        first_fields = next((row.split() for row in f if row.strip()), None)

    if first_fields is None:
        continue

    label = int(first_fields[0])
    if label == 1:
        tumor_count += 1
    elif label == 0:
        no_tumor_count += 1
    # Any other class id is counted in neither bucket.

total = tumor_count + no_tumor_count
print(f"Tumor cases: {tumor_count}")
print(f"No Tumor cases: {no_tumor_count}")
print(f"Total labeled files: {total}")


🧠 Tumor cases: 459
🧠 No Tumor cases: 419
📦 Total labeled files: 878


In [None]:
import os
import shutil
import random
from sklearn.model_selection import train_test_split

# Seed for reproducibility
random.seed(42)

# Source directories
images_dir = "dataset/images"
labels_dir = "dataset/labels"

# Output base
output_base = "brain_tumor_yolo"
splits = ["train", "val", "test"]

# Create output directories
for split in splits:
    os.makedirs(f"{output_base}/images/{split}", exist_ok=True)
    os.makedirs(f"{output_base}/labels/{split}", exist_ok=True)

# Gather (image_path, label_path, class_id) triples.
# The listing is sorted because os.listdir returns entries in arbitrary
# order: train_test_split's random_state only fixes which *positions* land
# in each subset, so the input order must be stable for the split to be
# reproducible across machines and runs.
pairs = []
for label_file in sorted(os.listdir(labels_dir)):
    label_path = os.path.join(labels_dir, label_file)
    image_file = os.path.splitext(label_file)[0] + ".jpg"
    image_path = os.path.join(images_dir, image_file)
    if not os.path.exists(image_path):
        continue  # label without a matching .jpg
    with open(label_path, 'r') as f:
        # Class comes from the first NON-BLANK line, matching how the class
        # counts are computed elsewhere in this notebook; a leading blank
        # line no longer drops the file from the split.
        first_line = next((row.strip() for row in f if row.strip()), "")
    if first_line:
        cls = int(first_line.split()[0])
        pairs.append((image_path, label_path, cls))

# Stratified split: X holds the (image, label) paths, y the class ids used
# to keep the class ratio equal across subsets.
X = [(img, lbl) for img, lbl, _ in pairs]
y = [cls for _, _, cls in pairs]

# First carve out 15% for test, then take 0.176 of the remaining 85%
# (≈ 15% of the whole) for validation.
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.15, stratify=y, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.176, stratify=y_trainval, random_state=42)  # 0.176 of 85% ≈ 15%

# Function to copy files
def copy_pairs(pairs, split):
    """Copy each image/label pair into the folders of the given split.

    Args:
        pairs: Iterable of ``(image_path, label_path)`` tuples.
        split: Split folder name; files go to ``<output_base>/images/<split>``
            and ``<output_base>/labels/<split>`` under their original
            file names.
    """
    image_dest_dir = os.path.join(output_base, "images", split)
    label_dest_dir = os.path.join(output_base, "labels", split)
    for image_src, label_src in pairs:
        image_dest = os.path.join(image_dest_dir, os.path.basename(image_src))
        label_dest = os.path.join(label_dest_dir, os.path.basename(label_src))
        shutil.copy(image_src, image_dest)
        shutil.copy(label_src, label_dest)

# Copy every subset into its split folder (train, then val, then test)
for split_name, subset in (("train", X_train), ("val", X_val), ("test", X_test)):
    copy_pairs(subset, split_name)

# Report how many pairs ended up in each subset
train_size, val_size, test_size = len(X_train), len(X_val), len(X_test)
print(f" Train set: {train_size}")
print(f" Val set: {val_size}")
print(f" Test set: {test_size}")


✅ Train set: 614
✅ Val set: 132
✅ Test set: 132


In [None]:
# Rewrite the label files of every split so that class-0 rows carry a dummy
# bounding box, giving YOLO a box to classify for class 0.
from glob import glob

for split in ['train', 'val', 'test']:
    label_dir = f"brain_tumor_yolo/labels/{split}"
    for path in glob(f"{label_dir}/*.txt"):
        with open(path, 'r') as f:
            rows = [raw.strip().split() for raw in f.readlines()]

        new_lines = []
        for fields in rows:
            class_token = fields[0] if fields else None
            if class_token == '0':
                # Dummy box centred on the image and spanning all of it
                new_lines.append("0 0.5 0.5 1.0 1.0\n")
            elif class_token == '1' and len(fields) == 5:
                # Well-formed class-1 row: kept, whitespace normalised
                new_lines.append(" ".join(fields) + "\n")
            # Blank rows and every other row are dropped.

        # A file that yields no rows is left on disk unchanged.
        if not new_lines:
            continue
        with open(path, 'w') as f:
            f.writelines(new_lines)


In [7]:
# Install Albumentations, which the augmentation cell imports as `A`;
# -q keeps pip's output quiet.
!pip install -q albumentations


In [8]:
import albumentations as A
import cv2
import os
from glob import glob
import shutil

# Augmentation pipeline. bbox_params tells Albumentations that boxes arrive
# in Pascal VOC format (absolute-pixel [x_min, y_min, x_max, y_max]) and that
# their classes travel in the 'class_labels' field, so the boxes are
# transformed together with the image.
transform = A.Compose([
    A.HorizontalFlip(p=0.5),
    A.VerticalFlip(p=0.3),
    A.RandomBrightnessContrast(p=0.3),
    A.Rotate(limit=30, p=0.5),
    A.Blur(blur_limit=3, p=0.2),
], bbox_params=A.BboxParams(format='pascal_voc', label_fields=['class_labels']))

image_dir = "brain_tumor_yolo/images/train"
label_dir = "brain_tumor_yolo/labels/train"

# Augmented files are written next to the originals.
aug_image_dir = "brain_tumor_yolo/images/train"
aug_label_dir = "brain_tumor_yolo/labels/train"

# Sorted so the "_aug<N>" suffix given to each image is stable between runs
# (glob returns matches in arbitrary order).
img_paths = sorted(glob(f"{image_dir}/*.jpg"))

aug_counter = 0

for img_path in img_paths:
    base = os.path.splitext(os.path.basename(img_path))[0]

    # Input and output folders are the same, so on a re-run the glob also
    # returns files produced earlier ("<name>_aug<N>"). Skip them; otherwise
    # every re-run would augment the augmentations and grow the set again.
    _stem, aug_marker, aug_suffix = base.rpartition("_aug")
    if aug_marker and aug_suffix.isdigit():
        continue

    label_path = os.path.join(label_dir, base + ".txt")
    if not os.path.exists(label_path):
        print(f"Skipping {img_path}: no label file at {label_path}")
        continue

    # Read image and labels. cv2.imread returns None (it does not raise)
    # when the file cannot be read or decoded.
    image = cv2.imread(img_path)
    if image is None:
        print(f"Skipping {img_path}: image could not be read")
        continue
    height, width = image.shape[:2]

    with open(label_path, 'r') as f:
        lines = f.readlines()

    bboxes = []
    class_labels = []

    for line in lines:
        parts = line.strip().split()
        if len(parts) != 5:
            continue  # not a "class x_center y_center width height" row
        cls, x_center, y_center, w, h = parts
        class_labels.append(int(cls))
        # Normalised YOLO centre/size -> absolute-pixel corner coordinates
        x = float(x_center) * width
        y = float(y_center) * height
        w_box = float(w) * width
        h_box = float(h) * height
        bboxes.append([x - w_box / 2, y - h_box / 2, x + w_box / 2, y + h_box / 2])

    # Skip if no bboxes (to avoid errors)
    if not bboxes:
        continue

    # Apply transformation
    transformed = transform(image=image, bboxes=bboxes, class_labels=class_labels)

    aug_image = transformed["image"]
    aug_bboxes = transformed["bboxes"]
    aug_classes = transformed["class_labels"]

    # Save augmented image and its label file under a shared new name
    aug_name = f"{base}_aug{aug_counter}"
    aug_img_path = os.path.join(aug_image_dir, f"{aug_name}.jpg")
    aug_label_path = os.path.join(aug_label_dir, f"{aug_name}.txt")

    cv2.imwrite(aug_img_path, aug_image)

    with open(aug_label_path, 'w') as f:
        for (x_min, y_min, x_max, y_max), cls in zip(aug_bboxes, aug_classes):
            # Pixel corners -> normalised YOLO centre/size, using the
            # dimensions of the source image
            x = (x_min + x_max) / 2 / width
            y = (y_min + y_max) / 2 / height
            w = (x_max - x_min) / width
            h = (y_max - y_min) / height
            f.write(f"{cls} {x:.6f} {y:.6f} {w:.6f} {h:.6f}\n")

    aug_counter += 1

print(f"✅ Data augmentation completed: {aug_counter} new images added.")


✅ Data augmentation completed: 614 new images added.


In [9]:
# Clone YOLOv5 repo
!git clone https://github.com/ultralytics/yolov5.git
# %cd changes the notebook's working directory, so every later cell runs from
# inside yolov5/ — which is why later cells refer to ../brain_tumor_yolo.
%cd yolov5

# Install dependencies
!pip install -r requirements.txt


Cloning into 'yolov5'...
remote: Enumerating objects: 17372, done.[K
remote: Counting objects: 100% (64/64), done.[K
remote: Compressing objects: 100% (42/42), done.[K
remote: Total 17372 (delta 44), reused 24 (delta 22), pack-reused 17308 (from 2)[K
Receiving objects: 100% (17372/17372), 16.24 MiB | 26.91 MiB/s, done.
Resolving deltas: 100% (11906/11906), done.
/content/yolov5
Collecting thop>=0.1.1 (from -r requirements.txt (line 14))
  Downloading thop-0.1.1.post2209072238-py3-none-any.whl.metadata (2.7 kB)
Collecting ultralytics>=8.2.34 (from -r requirements.txt (line 18))
  Downloading ultralytics-8.3.105-py3-none-any.whl.metadata (37 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.8.0->-r requirements.txt (line 15))
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.8.0->-r requirements.txt (line 15))
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-no

In [None]:
# Dataset description handed to train.py / val.py through --data: the image
# folder of each split (relative paths, one level above the current working
# directory), the class count, and the class names in class-id order.
data_yaml = """
train: ../brain_tumor_yolo/images/train
val: ../brain_tumor_yolo/images/val
test: ../brain_tumor_yolo/images/test

nc: 2
names: ['No Tumor', 'Tumor']
"""

# Written into the current working directory
yaml_path = 'brain_tumor.yaml'
with open(yaml_path, 'w') as yaml_file:
    yaml_file.write(data_yaml)

print(" data.yaml created!")


✅ data.yaml created!


In [11]:
# Run YOLOv5's train.py on the dataset described by brain_tumor.yaml,
# starting from the yolov5s.pt weights: image size 640, batch size 16,
# 50 epochs. --project/--name place the run under
# runs/train/brain_tumor_model_v2, where the evaluation cell looks for
# weights/best.pt.
!python train.py \
  --img 640 \
  --batch 16 \
  --epochs 50 \
  --data brain_tumor.yaml \
  --weights yolov5s.pt \
  --name brain_tumor_model_v2 \
  --cache \
  --project runs/train


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with tor

In [14]:
# Evaluate the best checkpoint of the training run on the test split
# (--task test). --save-txt and --save-conf make val.py write per-image
# prediction files; with --project/--name they are read back from
# runs/val/brain_tumor_test_eval/labels by the accuracy cells that follow.
# NOTE(review): --conf-thres 0.001 keeps very low-confidence boxes in those
# files — keep that in mind when reading a class off them.
!python val.py \
  --data brain_tumor.yaml \
  --weights runs/train/brain_tumor_model_v2/weights/best.pt \
  --task test \
  --img 640 \
  --conf-thres 0.001 \
  --iou-thres 0.6 \
  --save-txt \
  --save-conf \
  --project runs/val \
  --name brain_tumor_test_eval \
  --exist-ok


[34m[1mval: [0mdata=brain_tumor.yaml, weights=['runs/train/brain_tumor_model_v2/weights/best.pt'], batch_size=32, imgsz=640, conf_thres=0.001, iou_thres=0.6, max_det=300, task=test, device=, workers=8, single_cls=False, augment=False, verbose=False, save_txt=True, save_hybrid=False, save_conf=True, save_json=False, project=runs/val, name=brain_tumor_test_eval, exist_ok=True, half=False, dnn=False
YOLOv5 🚀 v7.0-411-gf4d8a84c Python-3.11.11 torch-2.6.0+cu124 CUDA:0 (Tesla T4, 15095MiB)

Fusing layers... 
Model summary: 157 layers, 7015519 parameters, 0 gradients, 15.8 GFLOPs
[34m[1mtest: [0mScanning /content/brain_tumor_yolo/labels/test... 132 images, 0 backgrounds, 0 corrupt: 100% 132/132 [00:00<00:00, 498.61it/s]
[34m[1mtest: [0mNew cache created: /content/brain_tumor_yolo/labels/test.cache
                 Class     Images  Instances          P          R      mAP50   mAP50-95: 100% 5/5 [00:04<00:00,  1.12it/s]
                   all        132        140      0.643      0.90

In [None]:
import os

# Folders compared by the accuracy computation: predicted label files written
# by the evaluation run, and the ground-truth label files of the test split.
pred_path = 'runs/val/brain_tumor_test_eval/labels'
true_path = '../brain_tumor_yolo/labels/test'

# Sanity check: report how many files each folder holds
if os.path.exists(pred_path):
    pred_file_count = len(os.listdir(pred_path))
    print(f" Prediction files: {pred_file_count}")
    true_file_count = len(os.listdir(true_path))
    print(f"Ground truth files: {true_file_count}")
else:
    print("Predictions folder not found. YOLO did not save label predictions.")


📂 Prediction files: 132
📂 Ground truth files: 132


In [None]:
def _read_rows(path):
    """Return the whitespace-split fields of every non-blank line in ``path``."""
    with open(path, 'r') as handle:
        return [row.split() for row in handle if row.strip()]


def _top_predicted_class(rows):
    """Return the class token of the most confident prediction row, or None.

    Prediction rows saved with --save-conf carry the confidence as a sixth
    field. When every row has it, the highest-confidence row is chosen
    explicitly (ties keep the earliest row) instead of trusting line order.
    Rows without a confidence column fall back to the first row.
    """
    if not rows:
        return None
    if all(len(row) >= 6 for row in rows):
        return max(rows, key=lambda row: float(row[5]))[0]
    return rows[0][0]


# Image-level accuracy: the class of the first ground-truth row is compared
# with the class of the most confident predicted box for the same image.
correct = 0
total = 0
missing_predictions = 0

for file in os.listdir(true_path):
    true_file = os.path.join(true_path, file)
    pred_file = os.path.join(pred_path, file)

    true_rows = _read_rows(true_file)
    true_label = true_rows[0][0] if true_rows else None

    if os.path.exists(pred_file):
        pred_label = _top_predicted_class(_read_rows(pred_file))
    else:
        # No prediction file means nothing was detected for this image.
        # It stays in the denominator as a miss; leaving it out would
        # inflate the reported accuracy.
        pred_label = None
        missing_predictions += 1

    if true_label and pred_label and true_label == pred_label:
        correct += 1
    total += 1

if missing_predictions:
    print(f"{missing_predictions} test images had no prediction file and were counted as incorrect.")

print(f" Classification Accuracy on test set: {correct / total:.4f} ({correct}/{total})" if total > 0 else " Still no predictions to evaluate.")


🎯 Classification Accuracy on test set: 0.7652 (101/132)
