# RQ3: How does the CNN perform on different image tasks and datasets?

## Prepare the datasets

**Dataset 1: Brain MRI Images dataset:**<br>
The dataset contains two folders, `yes` and `no`, which together hold 253 brain MRI images. The `yes` folder contains 155 tumorous images and the `no` folder contains 98 non-tumorous images. You can find it [here](https://www.kaggle.com/navoneel/brain-mri-images-for-brain-tumor-detection). This dataset is then augmented according to the Data Augmentation notebook. After data augmentation, the dataset consists of 1085 positive and 980 negative examples, for a total of 2065 images.

**Dataset 2: Brain Tumor MRI dataset:**<br>
This dataset contains 7023 human brain MRI images classified into 4 classes: glioma, meningioma, pituitary, and no tumor. The no-tumor class images were taken from the Br35H dataset. You can find it [here](https://www.kaggle.com/datasets/masoudnickparvar/brain-tumor-mri-dataset).

**Dataset 3: Brain Tumor Classification (MRI) dataset:**<br>
The dataset holds 3260 T1-weighted contrast-enhanced MRI images that were cleaned and augmented. You can find it [here](https://www.kaggle.com/datasets/sartajbhuvaji/brain-tumor-classification-mri).

## Import Necessary Modules

In [1]:
import random
import numpy as np
import os

# Pin the global random number generators of Python and NumPy to a fixed seed
# so that every random choice made later in the notebook is repeatable.
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts,
# so assigning it here presumably only affects child processes - confirm.
os.environ['PYTHONHASHSEED'] = "42"
np.random.seed(42)
random.seed(42)

In [2]:
# Now proceed with other imports
import shutil
import cv2
import zipfile
import os

%matplotlib inline

### Downloading the Brain Tumor MRI (Dataset 2) and Brain Tumor Classification (Dataset 3)

In [3]:
# Create data directory and download
os.makedirs('data', exist_ok=True)
os.makedirs('data/brain-tumor-mri', exist_ok=True)
os.makedirs('data/brain-tumor-classification', exist_ok=True)
!curl -L -o data/brain-tumor-mri/brain-tumor-mri-dataset.zip -k https://www.kaggle.com/api/v1/datasets/download/masoudnickparvar/brain-tumor-mri-dataset
!curl -L -o data/brain-tumor-classification/brain-tumor-classification-dataset.zip -k https://www.kaggle.com/api/v1/datasets/download/sartajbhuvaji/brain-tumor-classification-mri

# Unzip the files
with zipfile.ZipFile('data/brain-tumor-mri/brain-tumor-mri-dataset.zip', 'r') as zip_ref:
    zip_ref.extractall('data/brain-tumor-mri/')

with zipfile.ZipFile('data/brain-tumor-classification/brain-tumor-classification-dataset.zip', 'r') as zip_ref:
    zip_ref.extractall('data/brain-tumor-classification/')

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0

  0  148M    0   525    0     0    524      0 82:35:11  0:00:01 82:35:10   524
  1  148M    1 1774k    0     0   911k      0  0:02:46  0:00:01  0:02:45 1877k
  3  148M    3 5456k    0     0  1853k      0  0:01:22  0:00:02  0:01:20 2807k
  6  148M    6 9696k    0     0  2460k      0  0:01:01  0:00:03  0:00:58 3297k
  9  148M    9 13.8M    0     0  2876k      0  0:00:52  0:00:04  0:00:48 3607k
 11  148M   11 17.4M    0     0  3006k      0  0:00:50  0:00:05  0:00:45 3615k
 14  148M   14 21.0M    0     0  3111k      0  0:00:48  0:00:06  0:00:42 3969k
 15  148M   15 23.4M    0     0  3024k      0  0:00:50  0:00:07  0:00:43 3714k
 17  148M   17 25.4M    0     0  2915k      0  0:0

### Restructure Dataset 2 and 3 to yes/no folders and balance the classes

In [4]:
# Define the base source and destination paths
base_data_path = "data"
new_structure_path = "data_restructured"

# List of datasets to process
datasets = ["brain-tumor-classification", "brain-tumor-mri"]

# Class folder names used by each dataset: every folder listed under
# "tumor_types" is merged into 'yes', the "no_tumor" folder becomes 'no'.
dataset_tumor_types = {
    "brain-tumor-classification": {
        "tumor_types": ["glioma_tumor", "meningioma_tumor", "pituitary_tumor"],
        "no_tumor": "no_tumor",
    },
    "brain-tumor-mri": {
        "tumor_types": ["glioma", "meningioma", "pituitary"],
        "no_tumor": "notumor",
    },
}


def list_files(folder):
    """Return the sorted names of the regular files directly inside ``folder``.

    The result is sorted because ``os.listdir`` returns entries in arbitrary
    order; sampling from an unsorted list would select different files on
    different machines even with a fixed random seed.
    """
    return sorted(
        name for name in os.listdir(folder)
        if os.path.isfile(os.path.join(folder, name))
    )


def copy_flattened(source_dir, destination_dir):
    """Copy every regular file of ``source_dir`` directly into ``destination_dir``.

    A file whose name is already taken in the destination is stored as
    ``<name>_<n><ext>`` with the first free ``n`` (1, 2, ...), so nothing is
    overwritten. A missing ``source_dir`` is silently skipped.

    Returns the number of files copied.
    """
    if not os.path.isdir(source_dir):
        return 0

    copied = 0
    for item in sorted(os.listdir(source_dir)):
        source_item = os.path.join(source_dir, item)
        if not os.path.isfile(source_item):
            continue
        base_name, extension = os.path.splitext(item)
        destination_item = os.path.join(destination_dir, item)
        counter = 1
        while os.path.exists(destination_item):
            destination_item = os.path.join(
                destination_dir, f"{base_name}_{counter}{extension}"
            )
            counter += 1
        shutil.copy2(source_item, destination_item)
        copied += 1
    return copied


def balance_folders(yes_path, no_path, rng=random):
    """Randomly delete files from the larger folder until both hold equally many.

    ``rng`` is any object with a ``sample(population, k)`` method; it defaults
    to the global ``random`` module so the seed set at the top of the notebook
    applies.

    Returns the number of files removed.
    """
    yes_files = list_files(yes_path)
    no_files = list_files(no_path)
    excess = len(yes_files) - len(no_files)
    if excess > 0:
        larger_path, larger_files = yes_path, yes_files
    else:
        larger_path, larger_files = no_path, no_files

    for name in rng.sample(larger_files, abs(excess)):
        os.remove(os.path.join(larger_path, name))
    return abs(excess)


for dataset in datasets:
    source_root = os.path.join(base_data_path, dataset)
    dataset_restructured_path = os.path.join(new_structure_path, dataset)
    yes_path = os.path.join(dataset_restructured_path, "yes")
    no_path = os.path.join(dataset_restructured_path, "no")

    # Rebuild the class folders from scratch when the source data is present;
    # otherwise re-running this cell would copy every image again under a
    # "_1", "_2", ... suffix and fill the dataset with duplicates. When the
    # source is absent, existing restructured data is left untouched.
    if os.path.isdir(source_root):
        for class_path in (yes_path, no_path):
            shutil.rmtree(class_path, ignore_errors=True)
    os.makedirs(yes_path, exist_ok=True)
    os.makedirs(no_path, exist_ok=True)

    # Get the tumor types and no_tumor name for the current dataset
    tumor_types = dataset_tumor_types[dataset]["tumor_types"]
    notumor_type = dataset_tumor_types[dataset]["no_tumor"]

    # Copy tumor data into 'yes' (flattened) for the current dataset
    for tumor_type in tumor_types:
        for dataset_type in ["Testing", "Training"]:
            copy_flattened(
                os.path.join(source_root, dataset_type, tumor_type), yes_path
            )

    # Copy notumor data into 'no' (flattened) for the current dataset
    for dataset_type in ["Testing", "Training"]:
        copy_flattened(
            os.path.join(source_root, dataset_type, notumor_type), no_path
        )

    yes_count = len(list_files(yes_path))
    no_count = len(list_files(no_path))
    print(f"Before balancing ({dataset}): Yes: {yes_count}, No: {no_count}")

    # Downsample whichever class is larger so both classes end up equal
    balance_folders(yes_path, no_path)

    # Recount after balancing
    yes_count = len(list_files(yes_path))
    no_count = len(list_files(no_path))
    print(f"After balancing ({dataset}): Yes: {yes_count}, No: {no_count}")

Before balancing (brain-tumor-classification): Yes: 2764, No: 500
After balancing (brain-tumor-classification): Yes: 500, No: 500
Before balancing (brain-tumor-mri): Yes: 5023, No: 2000
After balancing (brain-tumor-mri): Yes: 2000, No: 2000


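#### For dataset 4, download the data first, then run the following cell. It defines `process_mat_files(input_base, output_base)`, which converts the `.mat` files in the `brainTumorDataPublic_*` folders into JPEG images sorted into one folder per tumor class: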

In [None]:
import os
import h5py

def _scale_to_uint8(image):
    """Linearly rescale ``image`` to the 0-255 range and return it as ``uint8``.

    A constant image has no intensity range to stretch; it is returned as all
    zeros instead of dividing by zero.
    """
    low, high = image.min(), image.max()
    span = high - low
    if span == 0:
        return np.zeros(image.shape, dtype=np.uint8)
    return ((255 / span) * (image - low)).astype(np.uint8)


def process_mat_files(input_base, output_base):
    """Convert the ``.mat`` files of dataset 4 into JPEG images grouped by class.

    The files are looked up in four sub-folders of ``input_base`` named
    ``brainTumorDataPublic_1-766``, ``..._767-1532``, ``..._1533-2298`` and
    ``..._2299-3064``. Each file is opened with h5py (so it must be
    HDF5-based), its ``cjdata/image`` array is rescaled to 8-bit and written to
    ``<output_base>/<class>/<file name>.jpg``, where the class comes from
    ``cjdata/label`` (1 = meningioma, 2 = glioma, 3 = pituitary).

    Processing is best-effort: a missing part folder, an unreadable file, an
    unknown label or a failed write is reported with ``print`` and skipped, so
    one bad input does not abort the whole conversion.

    Args:
        input_base: Directory containing the ``brainTumorDataPublic_*`` folders.
        output_base: Directory that receives one sub-folder per class; created
            if it does not exist.
    """
    class_names = {1: "meningioma", 2: "glioma", 3: "pituitary"}
    part_size = 766
    total_files = 3064

    # Create output directories
    os.makedirs(output_base, exist_ok=True)
    for class_name in class_names.values():
        os.makedirs(os.path.join(output_base, class_name), exist_ok=True)

    # Process each part: 1-766, 767-1532, 1533-2298, 2299-3064
    for part_start in range(1, total_files + 1, part_size):
        part_end = min(part_start + part_size - 1, total_files)
        input_folder = os.path.join(
            input_base, f"brainTumorDataPublic_{part_start}-{part_end}"
        )
        if not os.path.isdir(input_folder):
            print(f"Skipping missing folder: {input_folder}")
            continue

        mat_files = sorted(f for f in os.listdir(input_folder) if f.endswith('.mat'))
        for mat_file in mat_files:
            file_path = os.path.join(input_folder, mat_file)
            # Broad catch on purpose: any failure on one file is reported and
            # the remaining files are still converted.
            try:
                with h5py.File(file_path, 'r') as mat_data:
                    cjdata = mat_data['cjdata']
                    # NOTE(review): the transpose presumably undoes the axis
                    # order the array has when read through h5py - confirm.
                    raw_image = np.array(cjdata['image'], dtype=np.float64).T
                    label = int(cjdata['label'][()][0][0])

                if label not in class_names:
                    print(f"Error processing {file_path}: unknown label {label}")
                    continue

                file_name_base, _ext = os.path.splitext(mat_file)
                output_file_path = os.path.join(
                    output_base, class_names[label], f"{file_name_base}.jpg"
                )

                # cv2.imwrite signals failure through its return value rather
                # than an exception, so check it explicitly.
                if not cv2.imwrite(output_file_path, _scale_to_uint8(raw_image)):
                    print(f"Error processing {file_path}: could not write {output_file_path}")
            except Exception as e:
                print(f"Error processing {file_path}: {e}")