# Short pipeline

Contains only the main code necessary to run this project. No additional visualisations or performance improvements.

## 1. Imports

In [32]:
import os
import pandas as pd
import numpy as np
import random
import shutil
from shutil import copyfile
import tensorflow as tf
from tensorflow.keras.preprocessing import image
from tensorflow.keras.preprocessing.image import ImageDataGenerator, img_to_array, load_img
from tensorflow.keras import Model
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from matplotlib.offsetbox import (TextArea, DrawingArea, OffsetImage,
                                  AnnotationBbox)
import matplotlib.patches as mpatches
from sklearn.utils import shuffle
from tqdm import tqdm

In [33]:
base_dir  = "/home/maver02/Projects/Infrastructure_suite_project/Development/find-the-dog-project/CLOUD/data/image-segmentation-brain-tumor/"
os.chdir(base_dir)

## 2. Data Processing

In [34]:
df = pd.read_csv(os.path.join(base_dir + "Brain Tumor.csv"))[['Image', 'Class']]
display(df.head())
print(df.shape)

Unnamed: 0,Image,Class
0,Image1,0
1,Image2,0
2,Image3,1
3,Image4,1
4,Image5,0


(3762, 2)


In [35]:
# Split Training + Validation with Testing Set
def split_size(df, size):
    """
    Calculate the split size based on the given fraction of the DataFrame length.
    """
    return int(size * len(df))


train_labels = df['Class'].values[:split_size(df, 0.8)]
train_file_names = df['Image'].values[:split_size(df, 0.8)]

val_labels = df['Class'].values[split_size(df, 0.8):split_size(df, 0.9)]
val_file_names = df['Image'].values[split_size(df, 0.8):split_size(df, 0.9)]

test_labels = df['Class'].values[split_size(df, 0.9):]
test_file_names = df['Image'].values[split_size(df, 0.9):]

# labels are arrays of 1 or 0, names are arrays of image file names

In [36]:
# Split Training + Validation with Testing Set
def split_array_labels(arr_image, arr_label):
    """
    Split the array of images and labels into two separate arrays based on the label.
    """
    arr_image_0 = arr_image[np.where(arr_label==0)]
    arr_image_1 = arr_image[np.where(arr_label==1)]
    return {'0':arr_image_0, '1':arr_image_1}

train_arr_dict = split_array_labels(train_file_names, train_labels)
val_arr_dict = split_array_labels(val_file_names, val_labels)
test_arr_dict = split_array_labels(test_file_names, test_labels)

# each is a dictionary with keys '0' and '1', values are arrays of file names

In [37]:
# create directories for training, validation and testing
def create_empty_directories(base_dir):
    # in case you want to run it several times, delete the directory and create new one
    check_exist_path = os.path.join(base_dir, '_MODELLING')
    if os.path.isdir(check_exist_path):
        shutil.rmtree(check_exist_path)
        print("Remove old directories")
    
    for label in ['0','1']:
        training_dir = os.path.join(base_dir, '_MODELLING', 'training', label)
        validation_dir = os.path.join(base_dir, '_MODELLING', 'validation', label)
        testing_dir = os.path.join(base_dir, '_MODELLING', 'testing', label)    
        os.makedirs(training_dir)
        os.makedirs(validation_dir)
        os.makedirs(testing_dir)

    print(f"Created empty  training, validation and testing directories")

create_empty_directories(base_dir)

# copy data into the directories
def split_data(SOURCE_DIR, train_arr_dict, val_arr_dict, test_arr_dict):
    for label in tqdm(['0','1']):
        for file_name in train_arr_dict[label]:
            file_name = f"{file_name}.jpg"
            source = os.path.join(SOURCE_DIR, 'Brain Tumor', 'Brain Tumor', file_name)
            destination = os.path.join(base_dir, '_MODELLING', 'training', label, file_name)
            copyfile(source, destination)

        for file_name in val_arr_dict[label]:
            file_name = f"{file_name}.jpg"
            source = os.path.join(SOURCE_DIR, 'Brain Tumor', 'Brain Tumor', file_name)
            destination = os.path.join(base_dir, '_MODELLING', 'validation', label, file_name)
            copyfile(source, destination)
        
        for file_name in test_arr_dict[label]:
            file_name = f"{file_name}.jpg"
            source = os.path.join(SOURCE_DIR, 'Brain Tumor', 'Brain Tumor', file_name)
            destination = os.path.join(base_dir, '_MODELLING', 'testing', label, file_name)
            copyfile(source, destination)
    print(f"Created training, validation and testing directories containing images")
    
split_data(base_dir, train_arr_dict, val_arr_dict, test_arr_dict)

Remove old directories
Created empty  training, validation and testing directories


100%|██████████| 2/2 [00:00<00:00,  5.08it/s]

Created training, validation and testing directories containing images





Now the folders contain the data. Data augmentation, and training can be performed.

## 3. Modelling

Create the generator that already implement data augmentation.

In [38]:
# Loads the data from the directories, preprocesses the images 
# and creates the generators

modelling_base_dir = os.path.join(base_dir, '_MODELLING')
os.chdir(modelling_base_dir)

def train_val_generators(TRAINING_DIR, VALIDATION_DIR, TEST_DIR):

    # Instantiate the ImageDataGenerator class (don't forget to set the arguments to augment the images)
    train_datagen = ImageDataGenerator(rescale=1./127.5,
                                       rotation_range=30,
                                       width_shift_range=0.2,
                                       height_shift_range=0.2,
                                       shear_range=0.2,
                                       zoom_range=0.2,
                                    horizontal_flip=True,
                                    fill_mode='nearest')

    # Pass in the appropriate arguments to the flow_from_directory method
    train_generator = train_datagen.flow_from_directory(directory=TRAINING_DIR,
                                                      batch_size=32,
                                                      class_mode='binary',
                                                      target_size=(150, 150))

    # Instantiate the ImageDataGenerator class (don't forget to set the rescale argument)
    valid_or_test_datagen = ImageDataGenerator(rescale=1./127.5)

    # Pass in the appropriate arguments to the flow_from_directory method
    validation_generator = valid_or_test_datagen.flow_from_directory(directory=VALIDATION_DIR,
                                                                batch_size=32,
                                                                class_mode='binary',
                                                                target_size=(150, 150))
    
    test_generator = valid_or_test_datagen.flow_from_directory(directory=TEST_DIR,
                                                                batch_size=32,
                                                                class_mode='binary',
                                                                target_size=(150, 150))
    return train_generator, validation_generator, test_generator

training_dir = os.path.join(modelling_base_dir, 'training')
validation_dir = os.path.join(modelling_base_dir, 'validation')
testing_dir = os.path.join(modelling_base_dir, 'testing')

train_generator, validation_generator, test_generator = train_val_generators(training_dir, validation_dir, testing_dir)

# train generator is a directory iterator, which yields batches of images that are preprocessed with the ImageDataGenerator

Found 3009 images belonging to 2 classes.
Found 376 images belonging to 2 classes.
Found 377 images belonging to 2 classes.


In [39]:
import mlflow
mlflow.set_tracking_uri("http://localhost:5000")

In [40]:
# Initiate base model
base_model = tf.keras.applications.MobileNetV2(input_shape=(150, 150, 3),
                                               include_top=False,
                                               weights='imagenet')
# Layers of the base model will not be updated during training.
base_model.trainable = False

# Output layer of the base model becomes inut to additinal custom layer
last_output = base_model.output

  base_model = tf.keras.applications.MobileNetV2(input_shape=(150, 150, 3),


In [41]:
# takes output of the base model and adds additional layers on top of it
def transfer_learning(last_output, pre_trained_model):
    """
    Applies transfer learning to a pre-trained model by adding custom layers.
    Args:
        last_output (tf.Tensor): The output tensor from the last layer of the pre-trained model.
        pre_trained_model (tf.keras.Model): The pre-trained model to which new layers will be added.
    Returns:
        tf.keras.Model: A new model with the added custom layers on top of the pre-trained model.
    """
    # Flatten the output layer to 1 dimension
    x = tf.keras.layers.Flatten()(last_output)
    # Add a fully connected layer with 1024 hidden units and ReLU activation
    x = tf.keras.layers.Dense(1024, activation='relu')(x)
    # Add a dropout rate of 0.6
    x = tf.keras.layers.Dropout(0.6)(x)  
    # Add a final sigmoid layer for classification
    x = tf.keras.layers.Dense(1, activation='sigmoid')(x)          
    # Create the complete model by using the Model class
    model = Model(inputs=pre_trained_model.input, outputs=x)
    
    return model

# Create the model
model = transfer_learning(last_output, base_model)

# stop training if the validation loss does not decrease for 3 consecutive epochs
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

# set model configuration
model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate = 0.0003),
            loss = 'binary_crossentropy',
            metrics=['accuracy'])

In [43]:
import mlflow
# Set our tracking server uri for logging
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")
# Create a new MLflow Experiment
mlflow.set_experiment("Training_Brain_Tumor")

mlflow.tensorflow.autolog(checkpoint=True, checkpoint_save_best_only=False)

with mlflow.start_run() as run:
    history = model.fit(train_generator,
                        validation_data=validation_generator,
                        epochs=5,
                        callbacks=[callback])   

2024/09/24 16:56:13 INFO mlflow.tracking.fluent: Experiment with name 'Training_Brain_Tumor' does not exist. Creating a new experiment.


Epoch 1/5


  self._warn_if_super_not_called()


[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 256ms/step - accuracy: 0.7627 - loss: 2.7188



[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 289ms/step - accuracy: 0.7633 - loss: 2.7052 - val_accuracy: 0.8803 - val_loss: 0.3466
Epoch 2/5
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 257ms/step - accuracy: 0.8607 - loss: 0.3781



[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 283ms/step - accuracy: 0.8608 - loss: 0.3777 - val_accuracy: 0.8564 - val_loss: 0.3414
Epoch 3/5
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 258ms/step - accuracy: 0.8720 - loss: 0.3235



[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 284ms/step - accuracy: 0.8720 - loss: 0.3235 - val_accuracy: 0.8617 - val_loss: 0.3297
Epoch 4/5
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 259ms/step - accuracy: 0.8837 - loss: 0.3125



[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 286ms/step - accuracy: 0.8836 - loss: 0.3124 - val_accuracy: 0.9016 - val_loss: 0.2399
Epoch 5/5
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 259ms/step - accuracy: 0.8914 - loss: 0.2803



[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 286ms/step - accuracy: 0.8913 - loss: 0.2804 - val_accuracy: 0.8910 - val_loss: 0.2948


2024/09/24 16:58:39 INFO mlflow.tracking._tracking_service.client: 🏃 View run clumsy-colt-467 at: http://127.0.0.1:5000/#/experiments/881152748453209801/runs/6f0a99f66b294038b13df24656fd43ad.
2024/09/24 16:58:39 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/881152748453209801.
