''' Reference : https://colab.research.google.com/drive/16-Al3cM1PnKjYK9fY7vV9H2QdM_dMPAG#scrollTo=0ad8hHvAlGdA '''

## Import the libraries

In [37]:
import torch
import torchvision.models as models
import torchvision.transforms as transforms
from torchvision.datasets import VOCDetection
from sklearn import svm
import numpy as np

## Download a pre-trained CNN model

In [None]:
# Step 1: Download a pre-trained CNN model
model = models.resnet50(pretrained=True)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
model.eval()

## Define data transformation
'''The specific values used in the transforms.Normalize transformation are derived from the ImageNet dataset statistics. These values are commonly used for pre-trained models trained on the ImageNet dataset.'''Extract features from the last fully-connected layer

In [49]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

In [45]:
# Extract features from the last fully-connected layer
feature_extractor = torch.nn.Sequential(*list(model.children())[:-1])

## Load the VOC train dataset

In [50]:
def get_pascal_voc2007_data(image_root, split='train'):
  from torchvision import datasets
  image_root='/content/drive/MyDrive/DL_Assignment/dataset/VOCtrainval_06-Nov-2007/'
  train_dataset = datasets.VOCDetection(image_root, year='2007', image_set=split, download=False)  
  return train_dataset

In [52]:
train_dataset = get_pascal_voc2007_data('/content', 'trainval')
val_dataset = get_pascal_voc2007_data('/content', 'val')

# an example on the raw annotation
import json
print(json.dumps(train_dataset[1][1]['annotation'], indent=2))
print(json.dumps(val_dataset[1][1]['annotation'], indent=2))

{
  "folder": "VOC2007",
  "filename": "000007.jpg",
  "source": {
    "database": "The VOC2007 Database",
    "annotation": "PASCAL VOC2007",
    "image": "flickr",
    "flickrid": "194179466"
  },
  "owner": {
    "flickrid": "monsieurrompu",
    "name": "Thom Zemanek"
  },
  "size": {
    "width": "500",
    "height": "333",
    "depth": "3"
  },
  "segmented": "0",
  "object": [
    {
      "name": "car",
      "pose": "Unspecified",
      "truncated": "1",
      "difficult": "0",
      "bndbox": {
        "xmin": "141",
        "ymin": "50",
        "xmax": "500",
        "ymax": "330"
      }
    }
  ]
}
{
  "folder": "VOC2007",
  "filename": "000007.jpg",
  "source": {
    "database": "The VOC2007 Database",
    "annotation": "PASCAL VOC2007",
    "image": "flickr",
    "flickrid": "194179466"
  },
  "owner": {
    "flickrid": "monsieurrompu",
    "name": "Thom Zemanek"
  },
  "size": {
    "width": "500",
    "height": "333",
    "depth": "3"
  },
  "segmented": "0",
  "object"

''' In order to use these annotations to train our model, we need to convert this nested dictionary data structure into a set of PyTorch tensors.

We also need to preprocess the image, converting it to a PyTorch tensor and resizing it to 224x224. Real object detection systems typically work with much higher-resolution images, but we will use a low resolution for computational efficiency in this assignment.

We also want to train our models using minibatches of data, so we need to group the annotations from several images into minibatches.

I perform both of these functions by using a customized PyTorch DataLoader'''

In [53]:
def pascal_voc2007_loader(dataset, batch_size, num_workers=0):
  """
  Data loader for Pascal VOC 2007.
  https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader
  """
  from torch.utils.data import DataLoader
  # turn off shuffle so we can index the original image
  train_loader = DataLoader(dataset,
                            batch_size=batch_size,
                            shuffle=False, pin_memory=True,
                            num_workers=num_workers,
                            collate_fn=voc_collate_fn)
  return train_loader


class_to_idx = {'aeroplane':0, 'bicycle':1, 'bird':2, 'boat':3, 'bottle':4,
                'bus':5, 'car':6, 'cat':7, 'chair':8, 'cow':9, 'diningtable':10,
                'dog':11, 'horse':12, 'motorbike':13, 'person':14, 'pottedplant':15,
                'sheep':16, 'sofa':17, 'train':18, 'tvmonitor':19
}
idx_to_class = {i:c for c, i in class_to_idx.items()}


from torchvision import transforms

def voc_collate_fn(batch_lst, reshape_size=224):
    preprocess = transforms.Compose([
      transforms.Resize((reshape_size, reshape_size)),
      transforms.ToTensor(),
      transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
      ])
    
    batch_size = len(batch_lst)
    
    img_batch = torch.zeros(batch_size, 3, reshape_size, reshape_size)
    
    max_num_box = max(len(batch_lst[i][1]['annotation']['object']) \
                      for i in range(batch_size))

    box_batch = torch.Tensor(batch_size, max_num_box, 5).fill_(-1.)
    w_list = []
    h_list = []
    img_id_list = []
    
    for i in range(batch_size):
      img, ann = batch_lst[i]
      w_list.append(img.size[0]) # image width
      h_list.append(img.size[1]) # image height
      img_id_list.append(ann['annotation']['filename'])
      img_batch[i] = preprocess(img)
      all_bbox = ann['annotation']['object']
      if type(all_bbox) == dict: # inconsistency in the annotation file
        all_bbox = [all_bbox]
      for bbox_idx, one_bbox in enumerate(all_bbox):
        bbox = one_bbox['bndbox']
        obj_cls = one_bbox['name']
        box_batch[i][bbox_idx] = torch.Tensor([float(bbox['xmin']), float(bbox['ymin']),
          float(bbox['xmax']), float(bbox['ymax']), class_to_idx[obj_cls]])
    
    h_batch = torch.tensor(h_list)
    w_batch = torch.tensor(w_list)

    return img_batch, box_batch, w_batch, h_batch, img_id_list

'''Training with the entire PASCAL VOC will be too computationally expensive for this homework assignment, so I have taken subsample the dataset by wrapping each Dataset object in a Subset object'''

In [54]:
train_dataset = torch.utils.data.Subset(train_dataset, torch.arange(0, 2500)) # use 2500 samples for training
train_loader = pascal_voc2007_loader(train_dataset, 50)
val_loader = pascal_voc2007_loader(val_dataset, 50)

## Extract the features for the training data

In [55]:
train_features = []
train_labels = []

#model.eval()

with torch.no_grad():
    for image, target in train_dataset:
        try:
            image = transform(image)  # Apply transform to convert Image to Tensor
            feature = model(torch.unsqueeze(image, 0))
            train_features.append(feature.squeeze().numpy())
            train_labels.append(target["annotation"]["object"][0]["name"])
        except FileNotFoundError as e:
            print(f"Error: {e}. Skipping image.")

train_features = np.array(train_features)
train_labels = np.array(train_labels)




## Download a pre-trained CNN model

In [None]:
''' model = models.resnet50(pretrained=True)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
model.eval() '''

## Extract the feature from last fully connected layer

In [18]:
# Extract features from the last fully-connected layer
'''feature_extractor = torch.nn.Sequential(*list(model.children())[:-1])'''


## Flatten the features if they have more than 2 dimensions

In [56]:
# Flatten the features if they have more than 2 dimensions
if train_features.ndim > 2:
    train_features = train_features.reshape(train_features.shape[0], -1)

## Train binary one-vs.-rest SVM classifiers

In [57]:
## Train binary one-vs.-rest SVM classifiers
svm_classifiers = []
classes = np.unique(train_labels)
print(classes)

for class_idx in classes:
    # Create binary labels for one class vs. rest
    binary_labels = np.where(train_labels == class_idx, 1, 0)
    
    # Train SVM classifier for the current class
    svm_classifier = svm.SVC(kernel='linear')
    svm_classifier.fit(train_features, binary_labels)
    svm_classifiers.append(svm_classifier)

['aeroplane' 'bicycle' 'bird' 'boat' 'bottle' 'bus' 'car' 'cat' 'chair'
 'cow' 'diningtable' 'dog' 'horse' 'motorbike' 'person' 'pottedplant'
 'sheep' 'sofa' 'train' 'tvmonitor']


## Evaluate classification accuracy on the validation set

In [58]:
# Step 4: Evaluate classification accuracy on the validation set
##val_dataset = VOCDetection(data_dir, year="2007", image_set="test", download=False, transform=transform)
subset_size = 500  # Number of images in the subset. Evaluate classification accuracy on a subset of the test dataset
val_dataset = get_pascal_voc2007_data('/content', 'val')

val_features = []
val_labels = []
subset_counter = 0  # Counter to keep track of the number of images added to the subset


with torch.no_grad():
    for image, target in val_dataset:
        try:
            image = transform(image)  # Apply transform to convert Image to Tensor
            feature = model(torch.unsqueeze(image, 0))
            val_features.append(feature.squeeze().numpy())
            val_labels.append(target["annotation"]["object"][0]["name"])
            subset_counter += 1

            if subset_counter >= subset_size:
              break
        except FileNotFoundError as e:
            print(f"Error: {e}. Skipping image.")

val_features = np.array(val_features)
val_labels = np.array(val_labels)
print("Validation Features:")
print(val_features)
print("Validation Labels:")
print(val_labels)


if val_features.ndim > 2:
    val_features = val_features.reshape(val_features.shape[0], -1)

val_predictions = []

for feature in val_features:
    predictions = []
    for classifier in svm_classifiers:
        prediction = classifier.predict(feature.reshape(1, -1))
        predictions.append(prediction)
    val_predictions.append(predictions)

val_predictions = np.array(val_predictions)
val_labels = np.array(val_labels)



Validation Features:
[[-2.457909   -0.7341722  -1.4729944  ... -0.47497892  0.49226993
  -1.7070079 ]
 [-0.20414515 -1.2274386  -2.2608266  ... -1.1279682   0.53636587
  -0.0572966 ]
 [-0.23880373 -2.3812976  -0.5844568  ... -2.397542    2.1173797
  -0.1096619 ]
 ...
 [ 0.5107952   1.5010961  -2.0242476  ...  0.6346948   1.0828748
   0.8873813 ]
 [-1.214551    0.5656842   2.0793295  ...  0.33333498  4.501563
  -1.1175675 ]
 [-1.5068824  -1.7804626  -2.9174366  ... -1.0629003   2.1781566
   2.6822975 ]]
Validation Labels:
['chair' 'car' 'horse' 'bicycle' 'cat' 'car' 'dog' 'train' 'bicycle'
 'tvmonitor' 'tvmonitor' 'bird' 'chair' 'motorbike' 'pottedplant' 'car'
 'dog' 'dog' 'motorbike' 'sofa' 'dog' 'train' 'cat' 'chair' 'sofa' 'sheep'
 'bird' 'person' 'person' 'aeroplane' 'cat' 'dog' 'tvmonitor' 'train'
 'bicycle' 'boat' 'car' 'bus' 'car' 'cat' 'person' 'horse' 'sofa' 'bird'
 'person' 'car' 'bottle' 'diningtable' 'car' 'boat' 'bus' 'chair'
 'diningtable' 'car' 'cow' 'horse' 'cat' 'dog' '

## Calculate accuracy and print confusion matrix

In [34]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix
import numpy as np
# Calculate accuracy
correct_predictions = 0
total_predictions = len(val_predictions)

for predictions, label in zip(val_predictions, val_labels):
    if label in predictions:
        correct_predictions += 1

accuracy = correct_predictions / total_predictions
print(f"Validation accuracy: {accuracy * 100:.2f}%")



Validation accuracy: 0.00%


  if label in predictions:


In [61]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Convert labels to a consistent data type
val_labels_flat = np.array(val_labels_flat)

# Flatten the predictions and convert them to the same data type as labels
val_predictions_flat = np.concatenate(val_predictions).astype(val_labels_flat.dtype)[:subset_size]

# Calculate accuracy
accuracy = accuracy_score(val_labels_flat, val_predictions_flat)
print(f"Validation accuracy: {accuracy * 100:.2f}%")

# Calculate confusion matrix
confusion_mat = confusion_matrix(val_labels_flat, val_predictions_flat)
print("Confusion Matrix:")
print(confusion_mat)


Validation accuracy: 0.00%
Confusion Matrix:
[[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 3  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 5  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 3  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [18  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 6  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 5  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 1  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 2  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 9  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0