# 1. Load data

First, we download the dataset from Kaggle and set the target image size. All the images in the dataset should already have this size, but we will resize them anyway, just in case ("por si acaso").

In [1]:
import kagglehub
import os

# Download the dataset from Kaggle (kagglehub returns the local download path)
# and point directly at its "Data" sub-folder.
dataset_path = os.path.join(
    kagglehub.dataset_download("mohammadhossein77/brain-tumors-dataset"),
    "Data",
)

# Size every image is resized to before being fed to the model.
IMG_SIZE = (224, 224)



Now we wrap the downloaded images in a PyTorch `Dataset`, so that they can be loaded lazily and batched by a `DataLoader`.

In [2]:
import torch 
import numpy as np
import cv2
from torch.utils.data import DataLoader, Dataset

class BrainTumorDataset(Dataset):
    """Image-classification dataset read from a directory of class folders.

    Each entry of ``self.classes`` is a sub-path of ``root_dir``. Every regular
    file found directly inside it becomes one sample whose label is the index
    of that entry in ``self.classes``. Only file paths are collected up front;
    images are decoded lazily in ``__getitem__``.

    Args:
        root_dir: Directory that contains the class folders.
        transform: Optional callable applied to the image tensor produced by
            ``__getitem__`` (float32, channels-first, values scaled by 1/255)
            before it is returned.
    """

    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        self.classes = ["Normal", "Tumor/glioma_tumor", "Tumor/meningioma_tumor", "Tumor/pituitary_tumor"]
        self.images = []
        self.labels = []

        for label, category in enumerate(self.classes):
            category_path = os.path.join(root_dir, category)
            # os.listdir returns entries in arbitrary order; sorting makes the
            # sample indices (and therefore any seeded index-based split)
            # identical across machines and runs.
            for img_name in sorted(os.listdir(category_path)):
                img_path = os.path.join(category_path, img_name)
                # Skip sub-directories: they are not samples and would only
                # fail later when we try to decode them.
                if not os.path.isfile(img_path):
                    continue
                self.images.append(img_path)
                self.labels.append(label)

    def __len__(self):
        """Return the number of samples found under ``root_dir``."""
        return len(self.images)

    def __getitem__(self, idx):
        """Load, resize and normalise the sample at position ``idx``.

        Returns:
            ``(img, label)`` where ``img`` is a float32 tensor in
            channels-first layout (after ``transform``, if one was given) and
            ``label`` is the integer index into ``self.classes``.

        Raises:
            FileNotFoundError: If OpenCV cannot read the image file.
        """
        img_path = self.images[idx]
        # cv2.imread returns None (instead of raising) when the file cannot be
        # read. With default flags the result is a 3-channel array in BGR
        # order; the channel order is kept as-is here.
        img = cv2.imread(img_path)
        if img is None:
            raise FileNotFoundError(f"Could not load image: {img_path}")
        img = cv2.resize(img, IMG_SIZE)
        img = img / 255.0
        img = np.transpose(img, (2, 0, 1))  # HWC -> CHW, as PyTorch expects
        img = torch.tensor(img, dtype=torch.float32)
        # Previously the transform was stored but silently ignored.
        if self.transform is not None:
            img = self.transform(img)
        label = self.labels[idx]
        return img, label


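We instantiate the dataset. This only collects the image paths and labels; the images themselves are read when a sample is accessed.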

In [3]:
# Build the dataset over the downloaded "Data" folder (no transform).
brain_dataset = BrainTumorDataset(root_dir=dataset_path)

# 2. Split data into train and test

First, we set a random seed so that the results are reproducible.

In [None]:
import random 

def set_seed(seed):
    """Seed Python, NumPy and PyTorch RNGs and configure cuDNN for determinism.

    Args:
        seed: Integer seed passed to every random number generator.
    """
    # Same seed for every source of randomness; the last entry covers all GPUs.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Disable cuDNN auto-tuning and request deterministic algorithms.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


set_seed(42)

We split the data into 80% train and 20% test, stratified by class so that both subsets keep similar class proportions.

In [4]:
from sklearn.model_selection import train_test_split

# Fresh dataset instance; its per-sample class ids drive the stratification.
brain_dataset = BrainTumorDataset(dataset_path)
labels = brain_dataset.labels

# Split sample *indices* 80/20 rather than the images themselves, so nothing
# is decoded here. The fixed random_state keeps the split repeatable.
train_indices, test_indices = train_test_split(
    [*range(len(brain_dataset))],
    stratify=labels,
    test_size=0.2,
    random_state=42,
)

# Index-based views over the same underlying dataset.
test_dataset = torch.utils.data.Subset(brain_dataset, test_indices)
train_dataset = torch.utils.data.Subset(brain_dataset, train_indices)

# Only the training batches are reshuffled; evaluation order stays fixed.
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=32)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=32)

We will use CUDA (the GPU) if it is available; otherwise we fall back to the CPU.

In [None]:
def get_device():
    """Return the CUDA device when one is available, otherwise the CPU device."""
    if torch.cuda.is_available():
        return torch.device("cuda")
    return torch.device("cpu")

device = get_device()
