# Hybrid Model (CNN and Transformer) - Convolutional Vision Transformer



In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, utils, Sequential
from tensorflow.keras.layers import MultiHeadAttention
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.callbacks import ReduceLROnPlateau
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report, precision_score, recall_score, accuracy_score, f1_score
import seaborn as sns

### Reading The Data

In [2]:
!git clone https://github.com/nschultze/CS577Project.git

Cloning into 'CS577Project'...
remote: Enumerating objects: 2949, done.[K
remote: Counting objects: 100% (1153/1153), done.[K
remote: Compressing objects: 100% (1098/1098), done.[K
remote: Total 2949 (delta 75), reused 1125 (delta 54), pack-reused 1796[K
Receiving objects: 100% (2949/2949), 10.07 GiB | 37.91 MiB/s, done.
Resolving deltas: 100% (120/120), done.
Updating files: 100% (2720/2720), done.


In [3]:
train_data_dir = 'CS577Project/train'
validation_data_dir = 'CS577Project/val'
test_data_dir = 'CS577Project/test'

#set the size of the image to be resized
target_size = (224, 224)

#using ImageDataGenerator for data augmentation and preprocessing (altering images to improve data variety)
datagen = ImageDataGenerator(
    rescale=1./255, #used to normalize the pixel values (dividing by max pixel value of 255)
    rotation_range=20, #randomly rotating the images by 20 degrees (almost like noise where we add variation)
    width_shift_range=0.2, #randomly shifting the images horizontally by 20 percent
    height_shift_range=0.2, #randomly shifting the images vertically by 20 percent
    shear_range=0.2, #setting the shear intensity to "stretch" the image
    zoom_range=0.2, #randomly zooming into the images by 20 percent
    horizontal_flip=True, #randomly flipping the images horizontally
    fill_mode='nearest') #because of the shifting, rotating, etc, there are missing pixels so we fill in those missing pixels with nearest valid values

#creating the generator for the training dataset
train_generator = datagen.flow_from_directory(
    train_data_dir,
    target_size=target_size, #resizing
    batch_size=32,
    class_mode='binary') #binary for 2 labels

val_test_gen = ImageDataGenerator(rescale=1.0 / 255.0)

#creating the generator for the validation set
validation_generator = val_test_gen.flow_from_directory(
    validation_data_dir,
    target_size=target_size,
    batch_size=32,
    class_mode='binary',
    shuffle=False)

#creating the generator for the testing dataset
test_generator = val_test_gen.flow_from_directory(
    test_data_dir,
    target_size=target_size,
    batch_size=32,
    class_mode='binary',
    shuffle=False)

Found 1887 images belonging to 2 classes.
Found 402 images belonging to 2 classes.
Found 410 images belonging to 2 classes.


In [None]:
#Replicating CvT (Convolutional Vision Transformer)
#building the EfficientNetB0 model section (CNN section)
efficient_net = EfficientNetB0(include_top=False, input_shape=(256, 256, 3), weights='imagenet') #do not freeze the weights (want to train the weights more like transfer learning)

#input value with the target/altered size
inp = tf.keras.Input(shape=(224, 224, 3))

#building the CvT model
x = efficient_net(inp)
x = layers.Reshape((-1, 1280))(x) #resize to accommodate the output of efficient net

patch_size = 32
num_patches = (224 // patch_size) * (224 // patch_size)
x = layers.Reshape((-1, num_patches, x.shape[-1]))(x)
x = MultiHeadAttention(num_heads=4, key_dim=x.shape[-1] // 4, dropout=0.1)(x, x)
#x = MultiHeadAttention(num_heads=4, key_dim=1280//4, dropout=0.1)(x, x) #adding the multi head attention to allow it to focus on different parts of the input
x = layers.GlobalAveragePooling2D()(x) #pooling to reduce input
x = layers.Dense(256, activation='relu')(x)
x = layers.Dropout(0.2)(x)
#adding a batch normalization for regularization
x = layers.BatchNormalization()(x)
#final output is passed into single neuron using sigmoid (good for binary classification)
output = layers.Dense(1, activation='sigmoid')(x)

model = tf.keras.Model(inputs=inp, outputs=output)

#compiling the model with an initial lr of 1e-3 (will change as training)
model.compile(optimizer=Adam(learning_rate=1e-3), loss='binary_crossentropy', metrics=['accuracy'])
#used to change the learning rate as it is training
# reduces learning rate based on validation loss
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=1e-6)

#printing the model
model.summary()

#training the model
history = model.fit(train_generator,
                    epochs=40,
                    validation_data=validation_generator,
                    callbacks=[lr_scheduler])  #adding the learning rate scheduler to change learning rate as it trains


#saving the model as a HDF5 file
model.save("cvt_model.h5")

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_10 (InputLayer)       [(None, 224, 224, 3)]        0         []                            
                                                                                                  
 efficientnetb0 (Functional  (None, 8, 8, 1280)           4049571   ['input_10[0][0]']            
 )                                                                                                
                                                                                                  
 reshape_5 (Reshape)         (None, 49, 1280)             0         ['efficientnetb0[0][0]']      
                                                                                                  
 reshape_6 (Reshape)         (None, 1, 49, 1280)          0         ['reshape_5[0][0]']       







Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25

In [None]:
#plotting the two figures of loss and accuracy
plt.figure(figsize=(12, 4))

#plotting the training & validation losses
plt.subplot(1, 2, 1)
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Training vs. Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(['Training', 'Validation'], loc='upper right') #placed at top right as loss should decrease as epochs increase

#plotting the training & validation accuracy
plt.subplot(1, 2, 2)
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Training vs. Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(['Training', 'Validation'], loc='lower right') #moved to lower right to not obstruct lines (acc should go upwards as epochs increase)

#saving the plots
plt.savefig("training_model_graphs.png")
plt.show()

In [None]:
#evaluating the model on the testing set
test_loss, test_acc = model.evaluate(test_generator)
print(f'\nTest accuracy: {test_acc}')

In [None]:
#loading the model
loaded_model = tf.keras.models.load_model("cvt_model.h5")

In [None]:
# Generate predictions from the test set
pred = loaded_model.predict(test_generator)

In [None]:
threshold = 0.5  # Set your chosen threshold here

# Manual computation of binary predictions
pred_binary = (pred >= .5).astype(int).flatten()

# True labels
true_y = test_generator.classes

# Compute confusion matrix
confusion_mat = confusion_matrix(true_y, pred_binary)

# Display the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(confusion_mat, annot=True, fmt='d', cmap='Blues', xticklabels=['Fire', 'No Fire'], yticklabels=['Fire', 'No Fire'])
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix of the Hybrid Model')
plt.show()


In [None]:
print(classification_report(true_y, pred_binary, target_names = ["Fire", "No Fire"]))

precision = precision_score(true_y, pred_binary, average='macro')
recall = recall_score(true_y, pred_binary, average='macro')
accuracy = accuracy_score(true_y, pred_binary)
f1 = f1_score(true_y, pred_binary, average='macro')

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'Accuracy: {accuracy}')
print(f'F1 Score: {f1}')