# DLP Project

## Group Members:

* Fouzan Asif (19K-1345) (BCS-8A)
* Aashir (19K-0314) (BCS-8A)
* Abdul Saboor (19K-1433) (BCS-8A)

### Step 1: Obtain images and labels for training and testing

In [2]:
import os
import cv2
import xml.etree.ElementTree as ET
import numpy as np

# Root folder of the dataset; the code below reads ImageSets/Main, JPEGImages
# and Annotations underneath it.
dataset_path = "VOC2008"

def get_class_label(filename):
    """Return the class name of the first <object> in an annotation XML file.

    Only the first <object> child of the document root is considered, so an
    image annotated with several objects is labelled by its first one.

    Args:
        filename: Path to the annotation XML file.

    Returns:
        The text of the first object's <name> element, or None when the
        document root has no <object> child.
    """
    first_object = ET.parse(filename).getroot().find('object')
    if first_object is None:
        return None
    return first_object.find('name').text

def _read_split_names(split):
    """Return the image ids listed in ImageSets/Main/<split>.txt (blank lines skipped)."""
    split_file = os.path.join(dataset_path, "ImageSets", "Main", split + ".txt")
    with open(split_file, 'r') as f:
        return [line.strip() for line in f if line.strip()]


def _load_split(image_names):
    """Load the images and first-object class labels for the given image ids.

    Args:
        image_names: Image ids (file names without extension).

    Returns:
        (images, labels): `images` is a 1-D object array holding one BGR image
        per id (the images may differ in size, so they cannot be stacked into
        a regular array); `labels` is an array of class names.

    Raises:
        FileNotFoundError: If an image cannot be read by OpenCV.
    """
    images = []
    labels = []
    for name in image_names:
        image_path = os.path.join(dataset_path, "JPEGImages", name + ".jpg")
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError("Could not read image: " + image_path)
        images.append(image)
        annotation_path = os.path.join(dataset_path, "Annotations", name + ".xml")
        labels.append(get_class_label(annotation_path))

    # np.array() on images of different sizes builds a ragged array, which is
    # deprecated and rejected by recent NumPy releases. Fill an object array
    # element by element instead.
    image_array = np.empty(len(images), dtype=object)
    for position, image in enumerate(images):
        image_array[position] = image
    return image_array, np.array(labels)


train_image_names = _read_split_names("train")
val_image_names = _read_split_names("val")

train_images, train_labels = _load_split(train_image_names)
val_images, val_labels = _load_split(val_image_names)


  train_images = np.array(train_images)
  val_images = np.array(val_images)


### Step 2: Mapping class labels to a range of integers for modelling

In [3]:
from tensorflow.keras.utils import to_categorical

dataset_path = "VOC2008"

# Sorted unique class names and the class-name -> integer-index lookup.
class_names = np.unique(train_labels)
num_classes = len(class_names)
class_map = dict(zip(class_names, range(num_classes)))

# NOTE: train_labels is overwritten with its one-hot encoding, so this cell is
# not idempotent -- re-running it requires re-running the loading cell first.
train_labels = to_categorical(
    np.array([class_map[label] for label in train_labels]),
    num_classes,
)

### Step 3 - Resizing training and validation (testing) images

In [4]:
import cv2
import numpy as np

# Target input size shared by all the CNNs below.
height = 224
width = 224

train_images_resized = []
for image_name in train_image_names:
    image = cv2.imread(os.path.join(dataset_path, "JPEGImages", image_name + ".jpg"))
    # cv2.resize takes dsize as (width, height); the result has shape
    # (height, width, channels).
    image = cv2.resize(image, (width, height))
    train_images_resized.append(image)

train_images_res = np.array(train_images_resized)
# Shape guard: raises if the stacked array is not (N, height, width, 3).
train_images_res = np.reshape(train_images_res, (len(train_images_res), height, width, 3))


In [5]:
# Resize the validation images to the same (width, height) as the training
# images, using the shared constants rather than repeating the literal 224.
# cv2.resize takes dsize as (width, height).
val_images_resized = []
for image_name in val_image_names:
    image = cv2.imread(os.path.join(dataset_path, "JPEGImages", image_name + ".jpg"))
    val_images_resized.append(cv2.resize(image, (width, height)))
val_images_res = np.array(val_images_resized)

### Step 4: Defining the necessary functions for Accuracy and mAP (Mean Average Precision)

In [6]:
from sklearn.metrics import average_precision_score

def mAP(val_labels, val_preds, inverted_class_map):
    """Compute the mean average precision over the classes present in val_labels.

    Each class is scored one-vs-rest with sklearn's average_precision_score,
    using column i of val_preds as the score for class inverted_class_map[i].
    Classes that never occur in val_labels are skipped.

    Args:
        val_labels: Array-like of ground-truth class names, one per sample.
        val_preds: 2-D array of per-class scores, indexed [sample, class index].
        inverted_class_map: Mapping from class index (0..n-1) to class name.

    Returns:
        (mean_ap, ap_dict): the mean of the per-class average precisions and a
        dict mapping class name to its average precision. When no class from
        the map occurs in val_labels the result is (0.0, {}) rather than a
        ZeroDivisionError.
    """
    val_labels = np.asarray(val_labels)
    ap_dict = {}
    for i in range(len(inverted_class_map)):
        class_name = inverted_class_map[i]
        y_true = (val_labels == class_name).astype(int)
        if not y_true.any():
            # Average precision is undefined for a class with no positives.
            continue
        ap_dict[class_name] = average_precision_score(y_true, val_preds[:, i])

    if not ap_dict:
        return 0.0, ap_dict
    mean_ap = sum(ap_dict.values()) / len(ap_dict)
    return mean_ap, ap_dict

In [50]:
# Reverse lookup: integer class index -> class name.
inverted_class_map = {index: label for label, index in class_map.items()}

def accuracy(model):
    """Evaluate `model` on the resized validation images.

    Reads the notebook globals val_images_res, val_labels and
    inverted_class_map.

    Args:
        model: Object whose predict() returns per-class scores per image.

    Returns:
        A two-line string with the top-1 accuracy (in percent) and the mean
        average precision.
    """
    val_preds = model.predict(val_images_res)
    total = len(val_images_res)
    hits = sum(
        1
        for i in range(total)
        if inverted_class_map[np.argmax(val_preds[i])] == val_labels[i]
    )
    mean_ap, _ = mAP(val_labels, val_preds, inverted_class_map)
    accuracy_line = "Accuracy: " + str((hits/total)*100)
    map_line = "%\nMean Average Precision: " + str(mean_ap)
    return accuracy_line + map_line

In [35]:
# Cache the per-model prediction arrays on disk (training outputs first, then
# validation outputs). The m*output / v*output arrays are produced by the
# model*.predict(...) calls in the Early Fusion section further down, so that
# cell has to be run before this one.
# NOTE(review): model3 is loaded from 'mobilenet.h5' and model4 from
# 'densenet.h5' in the Early Fusion cell, so the "DenseNet"/"MobileNet" file
# names below look swapped relative to their contents -- confirm.
for _path, _outputs in (
    ("VGG_training_output.npy", m1output),
    ("ResNet_training_output.npy", m2output),
    ("DenseNet_training_output.npy", m3output),
    ("MobileNet_training_output.npy", m4output),
    ("InceptionV3_training_output.npy", m5output),
    ("VGG_validation_output.npy", v1output),
    ("ResNet_validation_output.npy", v2output),
    ("DenseNet_validation_output.npy", v3output),
    ("MobileNet_validation_output.npy", v4output),
    ("InceptionV3_validation_output.npy", v5output),
):
    np.save(_path, _outputs)

In [40]:
# Reload the cached prediction arrays written by the np.save cell.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects,
# which is unsafe for untrusted files. It is presumably unnecessary if these
# files only hold plain numeric predict() outputs -- confirm before dropping.
m1output, m2output, m3output, m4output, m5output = (
    np.load(path, allow_pickle=True)
    for path in (
        "VGG_training_output.npy",
        "ResNet_training_output.npy",
        "DenseNet_training_output.npy",
        "MobileNet_training_output.npy",
        "InceptionV3_training_output.npy",
    )
)

v1output, v2output, v3output, v4output, v5output = (
    np.load(path, allow_pickle=True)
    for path in (
        "VGG_validation_output.npy",
        "ResNet_validation_output.npy",
        "DenseNet_validation_output.npy",
        "MobileNet_validation_output.npy",
        "InceptionV3_validation_output.npy",
    )
)


## CNN - 1: VGG-16 (with a 3 layer NN for classification)

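The fully connected NN has 25,088 input nodes (the flattened 7 × 7 × 512 VGG-16 feature map; or 4,096 if VGG-16's own fully connected layers are used as the feature extractor), 1024 hidden layer nodes, and 20 output nodes.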

### Step 4a: Defining VGG and the NN

In [8]:
from tensorflow import keras
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.layers import Input, Flatten, Dense
from tensorflow.keras.models import Model

# VGG-16 convolutional base with ImageNet weights and no classifier head.
base_model = VGG16(weights='imagenet', include_top=False, input_tensor=Input(shape=(224, 224, 3)))

# Freeze every VGG layer: the pre-trained CNN itself is not trained further.
for layer in base_model.layers:
    layer.trainable = False

# Classification head: Flatten -> Dense(1024, relu) -> Dense(num_classes, softmax).
x = Flatten()(base_model.output)
x = Dense(1024, activation='relu')(x)
predictions = Dense(num_classes, activation='softmax')(x)

vgg_nn = Model(inputs=base_model.input, outputs=predictions)
vgg_nn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


#### Defining checkpoints that save .h5 files after every epoch (one with the full model, one with the weights only)

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow as tf

# Output files for the VGG-16 run: full model and weights only.
arch_path = 'voc_new.h5'
weights_path = 'voc_weights_new.h5'

# save_best_only=False: no "best so far" filtering is applied to the saves.
arch_checkpoint = ModelCheckpoint(
    arch_path,
    save_best_only=False,
    mode='min',
    save_weights_only=False,
)

weights_checkpoint = ModelCheckpoint(
    weights_path,
    save_best_only=False,
    mode='min',
    save_weights_only=True,
)

### Step 4b: Training the model with VGG as CNN

In [27]:
# Train the VGG-based classifier for 10 epochs, saving through both checkpoints.
history = vgg_nn.fit(
    train_images_res,
    train_labels,
    batch_size=32,
    epochs=10,
    callbacks=[arch_checkpoint, weights_checkpoint],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### Step 4b Alternate: Loading our trained model

In [9]:
from tensorflow.keras.models import load_model

# Replace the in-memory VGG model with the full-model checkpoint file written
# by arch_checkpoint during training ('voc_new.h5').
vgg_nn = load_model("voc_new.h5")

### Step 4c: Finding Accuracy and Mean Average Precision

In [13]:
# Validation accuracy and mean average precision of the VGG-based model.
vgg_report = accuracy(vgg_nn)
print(vgg_report)

Accuracy:61.45880234128771%
Mean Average Precision: 0.48199936721385106


## CNN - 2: ResNet50 with a custom 3 layer NN

### Step a: Using Transfer Learning, importing a pre-trained CNN ResNet50

In [10]:
# Import from tensorflow.keras like every other cell in this notebook, so the
# layers, Model class and load_model all come from the same Keras implementation.
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D

# ResNet50 convolutional base with ImageNet weights and no classifier head.
base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Freeze the pre-trained base; only the new head is trained.
for layer in base_model.layers:
    layer.trainable = False

# Head: global average pooling -> Dense(1024, relu) -> Dense(num_classes, softmax).
x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dense(1024, activation='relu')(x)
predictions = Dense(num_classes, activation='softmax')(x)

resnet_nn = Model(inputs=base_model.input, outputs=predictions)

resnet_nn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


### Creating Checkpoints per epoch

In [7]:
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow as tf

# Output files for the ResNet50 run: full model and weights only.
arch_path = 'resnet.h5'
weights_path = 'resnet_weights_new.h5'

arch_checkpoint = ModelCheckpoint(
    arch_path,
    save_best_only=False,
    mode='min',
    save_weights_only=False,
)

weights_checkpoint = ModelCheckpoint(
    weights_path,
    save_best_only=False,
    mode='min',
    save_weights_only=True,
)

### Training the model

In [10]:
# Train the ResNet50-based classifier for 10 epochs, saving through both checkpoints.
resnet = resnet_nn.fit(
    train_images_res,
    train_labels,
    batch_size=32,
    epochs=10,
    callbacks=[arch_checkpoint, weights_checkpoint],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


#### Loading the saved and trained model

In [15]:
from tensorflow.keras.models import load_model

# Reload the ResNet50 full-model checkpoint and report its validation metrics.
resnet_nn = load_model("resnet.h5")

resnet_report = accuracy(resnet_nn)
print(resnet_report)



## CNN - 3: DenseNet with a custom 3 layer NN

In [11]:
from tensorflow.keras.applications.densenet import DenseNet121, preprocess_input
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# GlobalAveragePooling2D, Dense and Model are imported here explicitly so this
# cell does not depend on an earlier model cell having been run first.

# DenseNet121 convolutional base with ImageNet weights and no classifier head.
densenet_model = DenseNet121(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Freeze the pre-trained base; only the new head is trained.
for layer in densenet_model.layers:
    layer.trainable = False

# Head: global average pooling -> Dense(1024, relu) -> Dense(num_classes, softmax).
x = densenet_model.output
x = GlobalAveragePooling2D()(x)
x = Dense(1024, activation='relu')(x)
predictions = Dense(num_classes, activation='softmax')(x)

densenet_nn = Model(inputs=densenet_model.input, outputs=predictions)

densenet_nn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


In [15]:
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow as tf

# Output files for the DenseNet121 run: full model and weights only.
arch_path = 'densenet.h5'
weights_path = 'densenet_weights_new.h5'

arch_checkpoint = ModelCheckpoint(
    arch_path,
    save_best_only=False,
    mode='min',
    save_weights_only=False,
)

weights_checkpoint = ModelCheckpoint(
    weights_path,
    save_best_only=False,
    mode='min',
    save_weights_only=True,
)

In [17]:
# Train the DenseNet121-based classifier for 10 epochs, saving through both checkpoints.
densenet = densenet_nn.fit(
    train_images_res,
    train_labels,
    batch_size=32,
    epochs=10,
    callbacks=[arch_checkpoint, weights_checkpoint],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [12]:
from tensorflow.keras.models import load_model

# Reload the DenseNet121 full-model checkpoint and report its validation metrics.
densenet_nn = load_model("densenet.h5")

densenet_report = accuracy(densenet_nn)
print(densenet_report)

Accuracy: 37.10040522287258%
Mean Average Precision: 0.23809376655205425


## CNN - 4: MobileNetV2 with a custom 3 layer NN

In [13]:
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense
from tensorflow.keras.models import Model

# MobileNetV2 convolutional base with ImageNet weights and no classifier head.
mobilenet_model = MobileNetV2(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Head: global average pooling -> Dense(1024, relu) -> Dense(num_classes, softmax).
x = mobilenet_model.output
x = GlobalAveragePooling2D()(x)
x = Dense(1024, activation='relu')(x)
predictions = Dense(num_classes, activation='softmax')(x)

# Freeze the pre-trained base; only the new head is trained.
for layer in mobilenet_model.layers:
    layer.trainable = False

mobilenet_nn = Model(inputs=mobilenet_model.input, outputs=predictions)
mobilenet_nn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [21]:
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow as tf

# Output files for the MobileNetV2 run: full model and weights only.
arch_path = 'mobilenet.h5'
weights_path = 'mobilenet_weights.h5'

arch_checkpoint = ModelCheckpoint(
    arch_path,
    save_best_only=False,
    mode='min',
    save_weights_only=False,
)

weights_checkpoint = ModelCheckpoint(
    weights_path,
    save_best_only=False,
    mode='min',
    save_weights_only=True,
)

In [22]:
# Train the MobileNetV2-based classifier for 10 epochs, saving through both checkpoints.
mobilenet = mobilenet_nn.fit(
    train_images_res,
    train_labels,
    batch_size=32,
    epochs=10,
    callbacks=[arch_checkpoint, weights_checkpoint],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [14]:
from tensorflow.keras.models import load_model

# Reload the MobileNetV2 full-model checkpoint and report its validation metrics.
mobilenet_nn = load_model("mobilenet.h5")

mobilenet_report = accuracy(mobilenet_nn)
print(mobilenet_report)

Accuracy: 36.380009004952726%
Mean Average Precision: 0.2121310724688476


## CNN - 5: InceptionV3 with a custom 3 layer NN

In [16]:
from tensorflow.keras.applications import InceptionV3
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense
from tensorflow.keras.models import Model

# InceptionV3 convolutional base with ImageNet weights and no classifier head.
inceptionv3_model = InceptionV3(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Head: global average pooling -> Dense(1024, relu) -> Dense(num_classes, softmax).
x = inceptionv3_model.output
x = GlobalAveragePooling2D()(x)
x = Dense(1024, activation='relu')(x)
predictions = Dense(num_classes, activation='softmax')(x)

# Freeze the pre-trained base; only the new head is trained.
for layer in inceptionv3_model.layers:
    layer.trainable = False

icpv3_nn = Model(inputs=inceptionv3_model.input, outputs=predictions)
icpv3_nn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [58]:
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow as tf

# Output files for the InceptionV3 run: full model and weights only.
arch_path = 'icpv3.h5'
weights_path = 'icpv3_weights.h5'

arch_checkpoint = ModelCheckpoint(
    arch_path,
    save_best_only=False,
    mode='min',
    save_weights_only=False,
)

weights_checkpoint = ModelCheckpoint(
    weights_path,
    save_best_only=False,
    mode='min',
    save_weights_only=True,
)


In [28]:
# Train the InceptionV3-based classifier for 10 epochs, saving through both checkpoints.
icpv3 = icpv3_nn.fit(
    train_images_res,
    train_labels,
    batch_size=32,
    epochs=10,
    callbacks=[arch_checkpoint, weights_checkpoint],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [17]:
from tensorflow.keras.models import load_model

# Reload the InceptionV3 full-model checkpoint and report its validation metrics.
icpv3_nn = load_model("icpv3.h5")

icpv3_report = accuracy(icpv3_nn)
print(icpv3_report)

Accuracy: 28.95092300765421%
Mean Average Precision: 0.09713809770477483


## PROJECT (C): ENSEMBLE LEARNING

### Early Fusion & Late Fusion

#### Early Fusion

In [24]:
# Dense and Model are imported here explicitly so this cell does not depend on
# an earlier model-definition cell having been run first.
from tensorflow.keras.layers import Concatenate, Dense
from tensorflow.keras.models import Model, load_model

from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Reload the five trained networks from their full-model checkpoints.
# NOTE(review): model3 is MobileNet and model4 is DenseNet here, whereas the
# np.save / np.load cells store m3output/v3output under "DenseNet_*" and
# m4output/v4output under "MobileNet_*" file names -- confirm which is intended.
model1 = load_model('voc_new.h5')
model2 = load_model('resnet.h5')
model3 = load_model('mobilenet.h5')
model4 = load_model('densenet.h5')
model5 = load_model('icpv3.h5')

# model4 is loaded (its predictions are used later) but is left out of the
# fused network.
input1 = model1.input
input2 = model2.input
input3 = model3.input
input5 = model5.input

# Penultimate layer of each network, i.e. the Dense(1024, relu) features that
# feed its softmax output layer.
output1 = model1.layers[-2].output
output2 = model2.layers[-2].output
output3 = model3.layers[-2].output
output5 = model5.layers[-2].output

merged = Concatenate()([output1, output2, output3, output5])
x = Dense(64, activation='relu')(merged)
# Single-label classification trained with categorical cross-entropy: use a
# softmax over num_classes, matching the individual models, instead of a
# hard-coded Dense(20, sigmoid).
x = Dense(num_classes, activation='softmax')(x)
new_model = Model(inputs=[input1, input2, input3, input5], outputs=x)

new_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [59]:
from tensorflow.keras.callbacks import ModelCheckpoint

# Use dedicated checkpoints for the fused model. The shared arch_checkpoint /
# weights_checkpoint globals still point at the files of whichever single-model
# checkpoint cell ran last (the InceptionV3 one, in notebook order), so reusing
# them would overwrite that model's .h5 files. 'early_fusion.h5' is the file
# the following cell loads.
fusion_arch_checkpoint = ModelCheckpoint('early_fusion.h5', save_best_only=False, mode='min', save_weights_only=False)
fusion_weights_checkpoint = ModelCheckpoint('early_fusion_weights.h5', save_best_only=False, mode='min', save_weights_only=True)

# The fused model has four inputs (one per base network); each receives the
# same batch of images.
history = new_model.fit(
    [train_images_res, train_images_res, train_images_res, train_images_res],
    train_labels,
    epochs=10,
    batch_size=32,
    callbacks=[fusion_arch_checkpoint, fusion_weights_checkpoint],
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


KeyboardInterrupt: 

In [25]:
# Load the saved early-fusion network (not used again in this cell).
model = load_model("early_fusion.h5")

# Linear SVM on standardised features; it is only constructed here and is
# fitted in a later cell.
svm = make_pipeline(StandardScaler(), SVC(kernel='linear', C=1.0))

# predict() outputs of each base model on the training images ...
m1output, m2output, m3output, m4output, m5output = [
    net.predict(train_images_res)
    for net in (model1, model2, model3, model4, model5)
]

# ... and on the validation images.
v1output, v2output, v3output, v4output, v5output = [
    net.predict(val_images_res)
    for net in (model1, model2, model3, model4, model5)
]



In [44]:
from sklearn.metrics import accuracy_score

# Re-read the training class names from the annotations: train_labels was
# replaced by its one-hot encoding in Step 2, but the SVM is fitted on names.
labels = np.array([
    get_class_label(os.path.join(dataset_path, "Annotations", name + ".xml"))
    for name in train_image_names
])

# Feature vector per image = the outputs of models 1, 2, 3 and 5 side by side
# (model 4 is left out).
train_parts = (m1output, m2output, m3output, m5output)
val_parts = (v1output, v2output, v3output, v5output)
train_features = np.concatenate(train_parts, axis=1)
val_features = np.concatenate(val_parts, axis=1)

svm.fit(train_features, labels)

val_preds = svm.predict(val_features)
svm_accuracy = accuracy_score(val_labels, val_preds)*100
print("Accuracy on validation set: " + str(svm_accuracy) + "%")


Accuracy on validation set: 71.36425033768573%


### Late Fusion

In [47]:
# Late fusion: element-wise average of the five models' validation outputs.
abc = np.mean(
    [v1output, v2output, v3output, v4output, v5output],
    axis=0,
)

In [46]:
# Sanity check: number of scores per sample in the averaged output.
np.mean([v1output, v2output, v3output, v4output, v5output], axis=0).shape[1]

20

In [58]:
# Predicted class index per validation sample, from the averaged outputs.
abcd = np.argmax(abc, axis=1)

# Translate indices to class names. Comprehensions keep their loop variables
# local, so the builtin all() is no longer rebound in the notebook namespace
# (the original loops used `all` as their loop variable).
lst = [inverted_class_map[class_index] for class_index in abcd]

# Number of samples whose fused prediction matches the ground-truth label.
c = sum(1 for predicted, actual in zip(lst, val_labels) if predicted == actual)

print("Late Fusion Accuracy: " + str(100*(c/len(val_labels))) + "%")

Late Fusion Accuracy: 68.03241782980639%
