In [1]:
# ========================================================
# To train and test a classifier using Transfer Learning.
# =======================================================

#--- Import necessary modules from Python libraries.
import tensorflow as tf
from tensorflow.keras.datasets import cifar10
from tensorflow.keras.applications import vgg16, mobilenet
from tensorflow.keras.models import Model, load_model
from tensorflow.keras import layers
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.metrics import Accuracy, Precision, Recall, AUC
import matplotlib.pyplot as plt
import numpy as np
import cv2, os, pickle
import csv
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator

2024-11-24 14:32:25.697857: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-24 14:32:25.786677: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1732437145.818395    5525 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1732437145.828618    5525 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-24 14:32:25.909409: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [2]:
def test_classifier(storage_dir, testX, testY):
	#--- Load trained model
	# model = build_model()	
	# model_path = storage_dir + 'VGG16_Classifier.weights.h5'
	model_path = storage_dir + 'VGG16_Classifier.weights.keras'
	# model_weights = model.load_weights(model_path)
	model = load_model(model_path)
	
	#--- Compile model when we need metrics not mentioned while training
	model.compile(loss = 'categorical_crossentropy', metrics = ['accuracy', Precision()])
	
	#--- Predict model's output
	predictedY = np.argmax(model.predict(testX), axis = -1)
	int_testY = np.argmax(testY, axis = -1)
	# n = predictedY.shape[0]
	n = 10
	print('Original_Y 	Predicted_Y')
	print('========== 	===========')	
	for i in range(n):
		print('{}                 {}'.format(int_testY[i], predictedY[i]))
    
    #--- Evaluate model performance
	test_metrics = model.evaluate(testX, testY)
	
	return test_metrics

In [3]:
def train_classifier(storage_dir, trainX, trainY, batch_size):
    #--- Build model
    model = build_model()
    model.summary(show_trainable = True)
    
    #--- Freez backbone
    for layer in model.layers[:-5]:
        layer.trainable = False
    model.summary(show_trainable = True)

    # Split the training data into training and validation sets
    trainX, valX, trainY, valY = train_test_split(trainX, trainY, test_size=0.2)
    
    # Data generator for batching
    datagen = ImageDataGenerator()
    # batch_size = 8  # Start with a small batch size
    steps_per_epoch = len(trainX) // batch_size
    
    # Train the model with data generator
    model.fit(
        datagen.flow(trainX, trainY, batch_size=batch_size),
        steps_per_epoch=steps_per_epoch,
        validation_data=(valX, valY),
        epochs=WARMUP_EPOCHS
    )
    
    #--- Unfreez some Convolutional layers of backbone for fine-tuning
    for layer in model.layers[-7:-5]:
        layer.trainable = True
    model.summary(show_trainable = True)	
    
    #--- Callbacks
    # model_path = storage_dir + 'VGG16_Classifier.weights.h5'
    model_path = storage_dir + 'VGG16_Classifier.weights.keras'
    callbacks = [
        ModelCheckpoint(model_path, monitor = "val_loss", mode = 'min', save_best_only = True, save_weights_only = False),
        EarlyStopping(monitor = "val_loss", mode = 'min', patience = EARLY_STOP_PATIENCE),
        ReduceLROnPlateau(monitor = "val_loss", mode = 'min', factor = LR_REDUCE_FACTOR, patience = LR_REDUCE_PATIENCE)
    ]


    # Train the model with data generator
    hist = model.fit(
        datagen.flow(trainX, trainY, batch_size=batch_size),
        steps_per_epoch=steps_per_epoch,
        validation_data=(valX, valY),
        epochs=EPOCHS, 
        callbacks=callbacks
    )
    
    #--- Save history
    performance_path = storage_dir + 'TrainVal_'
    save_model_performance(performance_path, hist)
    
    return hist

In [4]:
def save_model_performance(performance_path, history):
	#--- Save history into a dictionary
	hist_dict = history.history
	with open(performance_path + 'PerformanceDict.pkl', 'wb') as f:
		pickle.dump(hist_dict, f)

	#--- Plot progress graphs
	# Plot loss
	x_axis = np.arange(len(hist_dict['loss']))
	plt.rcParams.update({'font.size': 22})
	plt.figure(figsize = (20, 20))
	plt.plot(x_axis, hist_dict['loss'], 'k.--', linewidth = 2, markersize = 12)
	plt.plot(x_axis, hist_dict['val_loss'], 'g*--', linewidth = 2, markersize = 12)
	plt.xlabel('Loss')
	plt.ylabel('Epoch')
	plt.title('Training and Validation Loss')
	plt.xticks(rotation = 90)
	plt.legend(['training_loss', 'validation_loss'])
	plt.savefig(performance_path + 'Loss.jpg')
	plt.close()

	# Plot accuracy
	metric = 'accuracy'
	plt.rcParams.update({'font.size': 22})
	plt.figure(figsize = (20, 20))
	plt.plot(x_axis, hist_dict[metric], 'k.--', linewidth = 2, markersize = 12)
	plt.plot(x_axis, hist_dict['val_' + metric], 'g*--', linewidth = 2, markersize = 12)
	plt.xlabel('Accuracy')
	plt.ylabel('Epoch')
	plt.title('Training and Validation Accuracy')
	plt.xticks(rotation = 90)
	plt.legend(['training_' + metric, 'validation_' + metric])
	plt.savefig(performance_path + metric + '.jpg')
	plt.close()


In [5]:
def process_data():
    #-- Load data
    # (trainX, trainY), (testX, testY) = fashion_mnist.load_data() 
    (trainX, trainY), (testX, testY) = cifar10.load_data() 
    
    #--- Turn 3D image dataset into 4D dataset for Conv2D layers
    print('trainX.shape: {}, trainX.dtype: {}'.format(trainX.shape, trainX.dtype))
    print('testX.shape: {}, testX.dtype: {}'.format(testX.shape, testX.dtype))
    
    # resize data
    trainX = resize_images(trainX)
    testX = resize_images(testX)
    print('trainX.shape: {}, trainX.dtype: {}'.format(trainX.shape, trainX.dtype))
    print('testX.shape: {}, testX.dtype: {}'.format(testX.shape, testX.dtype))

    #--- Preprocess imageset according to the preprocess procedure of pre-trained model
    trainX = vgg16.preprocess_input(trainX)
    testX = vgg16.preprocess_input(testX)
    print('trainX.shape: {}, trainX.dtype: {}'.format(trainX.shape, trainX.dtype))
    print('testX.shape: {}, testX.dtype: {}'.format(testX.shape, testX.dtype))
            
    #--- Turn y as one-hot-encoding
    print('trainY.shape: {}, trainY.dtype: {}'.format(trainY.shape, trainY.dtype))
    print('testY.shape: {}, testY.dtype: {}'.format(testY.shape, testY.dtype))
    trainY = to_categorical(trainY, NUM_CLASSES)
    testY = to_categorical(testY, NUM_CLASSES)
    print('trainY.shape: {}, trainY.dtype: {}'.format(trainY.shape, trainY.dtype))
    print('testY.shape: {}, testY.dtype: {}'.format(testY.shape, testY.dtype))
        
    #--- Cross check
    # plt.imshow(trainX[0])
    # plt.title(trainY[0])
    # plt.show()
    # plt.close()
    
    return (trainX, trainY), (testX, testY)

In [6]:
def convert_3D_to_4D(x):
	n, h, w = x.shape
	x4D = np.zeros((n, IMG_SIZE, IMG_SIZE, 3), dtype = np.uint8)
	for i in range(n):
		#--- Resize image
		resized_img = cv2.resize(x[i], (IMG_SIZE, IMG_SIZE))
		
		#--- Convert 2D image into 3D image
		x4D[i] = cv2.cvtColor(resized_img, cv2.COLOR_GRAY2RGB) 
	return x4D

In [7]:
def resize_images(images):
    # Resize images to (224, 224, 3) as required by the VGG16 model
    resized_images = np.zeros((images.shape[0], IMG_SIZE, IMG_SIZE, 3), dtype=np.uint8)
    for i in range(images.shape[0]):
        resized_images[i] = cv2.resize(images[i], (IMG_SIZE, IMG_SIZE))
    return resized_images

In [8]:
def build_model():
	#--- Load a pre-trained backbone
	base_model = vgg16.VGG16(include_top = False, weights = 'imagenet', input_shape = (IMG_SIZE, IMG_SIZE, 3))
	base_model.summary(show_trainable = True)
		
	#--- Build a new model based on loaded backbone
	inputs = base_model.input
	x = base_model.output
	x = layers.Flatten()(x)
	x = layers.Dense(128, activation = 'relu')(x)
	x = layers.Dense(64, activation = 'relu')(x)	
	outputs = layers.Dense(10, activation = 'softmax')(x)
	model = Model(inputs, outputs)
	
	#--- Compile model
	model.compile(loss = 'categorical_crossentropy', metrics = ['accuracy'])
	
	return model

In [9]:
#--- Fixed terms
WORKING_DIR = '/home/mursalin/m3c/computer-vision/task/'  
IMG_SIZE = 224
EARLY_STOP_PATIENCE = 50
LR_REDUCE_PATIENCE = 10
LR_REDUCE_FACTOR = 0.8 #--- new_lr = old_lr * LR_REDUCE_FACTOR
NUM_CLASSES = 10
WARMUP_EPOCHS = 5
EPOCHS = 10
BATCH_SIZE = 128

In [10]:
#--- Create a directory to store model and figures
storage_dir = WORKING_DIR + 'batch/' 
if (os.path.exists(storage_dir) == False):
    os.makedirs(storage_dir)
else:
    print(storage_dir + ' exists.')
    
#--- Prepare data
(trainX, trainY), (testX, testY) = process_data()

# Enable GPU memory growth to avoid memory issues
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)

#--- Train a classifier using Transfer learning
history = train_classifier(storage_dir, trainX, trainY, BATCH_SIZE)

#--- Test trained classifier
test_metrics = test_classifier(storage_dir, testX, testY)


/home/mursalin/m3c/computer-vision/task/batch/ exists.
trainX.shape: (50000, 32, 32, 3), trainX.dtype: uint8
testX.shape: (10000, 32, 32, 3), testX.dtype: uint8
trainX.shape: (50000, 224, 224, 3), trainX.dtype: uint8
testX.shape: (10000, 224, 224, 3), testX.dtype: uint8
trainX.shape: (50000, 224, 224, 3), trainX.dtype: float32
testX.shape: (10000, 224, 224, 3), testX.dtype: float32
trainY.shape: (50000, 1), trainY.dtype: uint8
testY.shape: (10000, 1), testY.dtype: uint8
trainY.shape: (50000, 10), trainY.dtype: float64
testY.shape: (10000, 10), testY.dtype: float64


I0000 00:00:1732437161.855019    5525 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 18965 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4090, pci bus id: 0000:01:00.0, compute capability: 8.9


Epoch 1/5


  self._warn_if_super_not_called()
I0000 00:00:1732437172.656862    5667 service.cc:148] XLA service 0x7ba50440db80 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1732437172.663198    5667 service.cc:156]   StreamExecutor device (0): NVIDIA GeForce RTX 4090, Compute Capability 8.9
2024-11-24 14:32:52.698239: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1732437172.787110    5667 cuda_dnn.cc:529] Loaded cuDNN version 90300

I0000 00:00:1732437181.053987    5667 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m312/312[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 206ms/step - accuracy: 0.6220 - loss: 4.5452 - val_accuracy: 0.7978 - val_loss: 0.7493
Epoch 2/5
[1m  1/312[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m28s[0m 92ms/step - accuracy: 0.8359 - loss: 0.7823

2024-11-24 14:34:05.314325: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]
2024-11-24 14:34:05.314423: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]
	 [[IteratorGetNext/_2]]
2024-11-24 14:34:05.314441: I tensorflow/core/framework/local_rendezvous.cc:424] Local rendezvous recv item cancelled. Key hash: 9005859322797773434
2024-11-24 14:34:05.314458: I tensorflow/core/framework/local_rendezvous.cc:424] Local rendezvous recv item cancelled. Key hash: 4225635991422258652
  self.gen.throw(value)


[1m312/312[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 23ms/step - accuracy: 0.8359 - loss: 0.7823 - val_accuracy: 0.8252 - val_loss: 0.5802
Epoch 3/5
[1m312/312[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 137ms/step - accuracy: 0.8856 - loss: 0.3757 - val_accuracy: 0.8441 - val_loss: 0.5709
Epoch 4/5
[1m  1/312[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m25s[0m 83ms/step - accuracy: 0.9141 - loss: 0.2627

2024-11-24 14:34:55.431995: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]
	 [[IteratorGetNext/_2]]
2024-11-24 14:34:55.432098: I tensorflow/core/framework/local_rendezvous.cc:424] Local rendezvous recv item cancelled. Key hash: 9005859322797773434
2024-11-24 14:34:55.432111: I tensorflow/core/framework/local_rendezvous.cc:424] Local rendezvous recv item cancelled. Key hash: 4225635991422258652


[1m312/312[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 23ms/step - accuracy: 0.9141 - loss: 0.2627 - val_accuracy: 0.8574 - val_loss: 0.4984
Epoch 5/5
[1m312/312[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 138ms/step - accuracy: 0.9391 - loss: 0.1884 - val_accuracy: 0.8575 - val_loss: 0.5937


Epoch 1/10
[1m312/312[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 151ms/step - accuracy: 0.9663 - loss: 0.1056 - val_accuracy: 0.8642 - val_loss: 0.6681 - learning_rate: 0.0010
Epoch 2/10
[1m  1/312[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m27s[0m 89ms/step - accuracy: 0.9688 - loss: 0.1053

2024-11-24 14:36:33.479295: I tensorflow/core/framework/local_rendezvous.cc:424] Local rendezvous recv item cancelled. Key hash: 9005859322797773434
2024-11-24 14:36:33.479580: I tensorflow/core/framework/local_rendezvous.cc:424] Local rendezvous recv item cancelled. Key hash: 4225635991422258652


[1m312/312[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 24ms/step - accuracy: 0.9688 - loss: 0.1053 - val_accuracy: 0.8706 - val_loss: 0.5913 - learning_rate: 0.0010
Epoch 3/10
[1m312/312[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 121ms/step - accuracy: 0.9791 - loss: 0.0631 - val_accuracy: 0.8676 - val_loss: 0.7792 - learning_rate: 0.0010
Epoch 4/10
[1m  1/312[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m24s[0m 79ms/step - accuracy: 0.9531 - loss: 0.1908

2024-11-24 14:37:18.906425: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]
	 [[IteratorGetNext/_2]]
2024-11-24 14:37:18.906505: I tensorflow/core/framework/local_rendezvous.cc:424] Local rendezvous recv item cancelled. Key hash: 9005859322797773434
2024-11-24 14:37:18.906527: I tensorflow/core/framework/local_rendezvous.cc:424] Local rendezvous recv item cancelled. Key hash: 4225635991422258652


[1m312/312[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 23ms/step - accuracy: 0.9531 - loss: 0.1908 - val_accuracy: 0.8731 - val_loss: 0.7025 - learning_rate: 0.0010
Epoch 5/10
[1m312/312[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 121ms/step - accuracy: 0.9859 - loss: 0.0440 - val_accuracy: 0.8738 - val_loss: 0.8097 - learning_rate: 0.0010
Epoch 6/10
[1m  1/312[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m27s[0m 89ms/step - accuracy: 0.9844 - loss: 0.0372

2024-11-24 14:38:04.003099: I tensorflow/core/framework/local_rendezvous.cc:424] Local rendezvous recv item cancelled. Key hash: 9005859322797773434
2024-11-24 14:38:04.003185: I tensorflow/core/framework/local_rendezvous.cc:424] Local rendezvous recv item cancelled. Key hash: 4225635991422258652


[1m312/312[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 23ms/step - accuracy: 0.9844 - loss: 0.0372 - val_accuracy: 0.8640 - val_loss: 0.8855 - learning_rate: 0.0010
Epoch 7/10
[1m312/312[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 121ms/step - accuracy: 0.9899 - loss: 0.0332 - val_accuracy: 0.8706 - val_loss: 0.9878 - learning_rate: 0.0010
Epoch 8/10
[1m  1/312[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m29s[0m 96ms/step - accuracy: 0.9844 - loss: 0.0498

2024-11-24 14:38:49.083674: I tensorflow/core/framework/local_rendezvous.cc:424] Local rendezvous recv item cancelled. Key hash: 9005859322797773434
2024-11-24 14:38:49.083749: I tensorflow/core/framework/local_rendezvous.cc:424] Local rendezvous recv item cancelled. Key hash: 4225635991422258652


[1m312/312[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 23ms/step - accuracy: 0.9844 - loss: 0.0498 - val_accuracy: 0.8570 - val_loss: 1.1463 - learning_rate: 0.0010
Epoch 9/10
[1m312/312[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 111ms/step - accuracy: 0.9924 - loss: 0.0254 - val_accuracy: 0.8734 - val_loss: 1.0379 - learning_rate: 0.0010
Epoch 10/10
[1m  1/312[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m21s[0m 70ms/step - accuracy: 1.0000 - loss: 0.0024

2024-11-24 14:39:30.878314: I tensorflow/core/framework/local_rendezvous.cc:424] Local rendezvous recv item cancelled. Key hash: 9005859322797773434
2024-11-24 14:39:30.878340: I tensorflow/core/framework/local_rendezvous.cc:424] Local rendezvous recv item cancelled. Key hash: 4225635991422258652


[1m312/312[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 18ms/step - accuracy: 1.0000 - loss: 0.0024 - val_accuracy: 0.8749 - val_loss: 1.0257 - learning_rate: 0.0010


  saveable.load_own_variables(weights_store.get(inner_path))


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 19ms/step
Original_Y 	Predicted_Y
3                 6
8                 8
8                 8
0                 0
6                 6
6                 6
1                 5
6                 6
3                 3
1                 1
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 20ms/step - accuracy: 0.8665 - loss: 0.6393 - precision: 0.8739
