<a href="https://colab.research.google.com/github/owenmk/ok_cars/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Cars Dataset
Exploring the importance of size for supervised learning
### reference
[1] 3D Object Representations for Fine-Grained Categorization
Jonathan Krause, Michael Stark, Jia Deng, Li Fei-Fei
4th IEEE Workshop on 3D Representation and Recognition, at ICCV 2013 (3dRR-13). Sydney, Australia. Dec. 8, 2013.

###  imports

In [45]:
from typing import List
import torchvision.datasets.stanford_cars as stanford_cars
import torchvision 
from torchvision import datasets, io, models, ops, transforms, utils
from torch.nn import Identity
from torch.utils.data import DataLoader
from torch import device
from torch import save as torch_save
from torch import load as torch_load

import pandas as pd
import scipy.io
import logging
import os

import numpy as np
from sklearn.model_selection import train_test_split

# create logger
logger = logging.getLogger(__name__)
# set log level for all handlers to debug
logger.setLevel(logging.DEBUG)


In [46]:
## The Cars Dataset can be found here: (https://ai.stanford.edu/~jkrause/cars/car_dataset.html). 
# Data from reference [1] has been copied to an accessible filepath.

NUM_IMAGE = 16185
"""The dataset contains 16185 images. Here numbered with idx = 0, 1, ..."""

IMG_PATH = '/mnt/c/Users/Owen/Documents/img_data/'
"""Data from reference [1] has been copied to an accessible filepath."""

# Make image data available in the notebook
#
# transform = transforms.Compose([transforms.Resize(255),
#                                 transforms.CenterCrop(224),
#                                 transforms.ToTensor()])
transform = transforms.ToTensor()
dataset = datasets.ImageFolder(IMG_PATH, transform=transform)

# imagenet_data = torchvision.datasets.ImageNet('path/to/imagenet_root/')
data_loader = DataLoader(dataset, batch_size=1, shuffle=True,num_workers=1)


# Obtain train/test split and other annotations from the original author
mat = scipy.io.loadmat(IMG_PATH +"cars_annos.mat")
annotations = pd.DataFrame(mat['annotations'][0])
annotations['class2']=[c[0,0] for c in annotations['class'].values] # manage indexing
annotations['test2']=[c[0,0] for c in annotations['test'].values] # manage indexing
annotations['relative_im_path2']=[c[0] for c in annotations['relative_im_path'].values] # manage indexing
assert annotations.shape[0]==NUM_IMAGE, "annotation problem"
annotations


Unnamed: 0,relative_im_path,bbox_x1,bbox_y1,bbox_x2,bbox_y2,class,test,class2,test2,relative_im_path2
0,[car_ims/000001.jpg],[[112]],[[7]],[[853]],[[717]],[[1]],[[0]],1,0,car_ims/000001.jpg
1,[car_ims/000002.jpg],[[48]],[[24]],[[441]],[[202]],[[1]],[[0]],1,0,car_ims/000002.jpg
2,[car_ims/000003.jpg],[[7]],[[4]],[[277]],[[180]],[[1]],[[0]],1,0,car_ims/000003.jpg
3,[car_ims/000004.jpg],[[33]],[[50]],[[197]],[[150]],[[1]],[[0]],1,0,car_ims/000004.jpg
4,[car_ims/000005.jpg],[[5]],[[8]],[[83]],[[58]],[[1]],[[0]],1,0,car_ims/000005.jpg
...,...,...,...,...,...,...,...,...,...,...
16180,[car_ims/016181.jpg],[[38]],[[36]],[[375]],[[234]],[[196]],[[1]],196,1,car_ims/016181.jpg
16181,[car_ims/016182.jpg],[[29]],[[34]],[[235]],[[164]],[[196]],[[1]],196,1,car_ims/016182.jpg
16182,[car_ims/016183.jpg],[[25]],[[32]],[[587]],[[359]],[[196]],[[1]],196,1,car_ims/016183.jpg
16183,[car_ims/016184.jpg],[[56]],[[60]],[[208]],[[186]],[[196]],[[1]],196,1,car_ims/016184.jpg


## 1. Masking labels

In [47]:
def mask_labels(dataset_labels: List, proportion: float)->pd.DataFrame:
    """
    Remove a subset of labels from samples. Ensure each class has at least one labeled sample.
    To remove 10% of labels, use proportion=0.1.
    dataset_labels is a list of labels in sample order such that dataset_labels[0] is the label of the first sample and there are no gaps.
    """
    # Cons
    df = pd.DataFrame(data={'orig_label':dataset_label_list,'label':None})
    df.index.name = 'idx'
    num_samples = df.shape[0]

    # Select samples to guarantee one from each class
    keep_df = df.groupby('orig_label').apply(pd.DataFrame.sample, n=1)
    keep_df = keep_df.reset_index('idx').reset_index(drop=True)
    num_classes = keep_df.shape[0]

    # How many samples to keep, in addition to 1 per class. Select them.
    max_drop_frac = 1 - (num_classes / num_samples)
    assert proportion <= max_drop_frac, f"proportion must be less than {max_drop_frac}"
    num_new_keep = num_samples - num_classes - int(proportion*num_samples)
    choose_from_idx = list(set(df.index) - set(keep_df['idx']))
    new_keep_df = df.loc[choose_from_idx,:].sample(n=num_new_keep)

    # For the ist of all sample indices to keep, transfer the known label
    keep_idx = list(keep_df['idx']) + list(new_keep_df.index)
    df.loc[keep_idx,'label'] = df.loc[keep_idx,'orig_label']
    
    return df
# Example:
dataset_label_list = annotations['class2'].tolist()
proportion = 0.60 # 60% unlabelled
label_df = mask_labels(dataset_label_list, proportion)

## 2. Data cleaning
Find and delete any images that do not have 3 channels

In [48]:
def is_3chan_image(tensor_size)->bool:
    """utility returns True for 3-channel images"""
    # TODO: confirm functionality using synthesize bw image: transform = transforms.Grayscale()
    num_chanels = tensor_size[0]
    result = num_chanels == 3 
    return result

def delete_image(imgpath: str):
    logger.info(f"Deleting image {imgpath}")
    try:
        # os.remove(imgpath)
        logger.info(f"Removing sample {imgpath}")
    except OSError as e:
        logger.error(f"Error {e}")
        raise

# Scan the dataset to confirm all 3 channels are present in each image
logger.info("Check all images have 3 channels...")
image_sizes = {}
num_deleted = 0
for k in range(NUM_IMAGE):
    sample_img, _ = dataset[k]
    # num_chan = sample_img.size()[0]
    if (k+1) % 1000 == 0: logger.info(f"images checked: {k+1}")
    sample_image_size = sample_img.size()
    if not is_3chan_image(sample_image_size):
        print(f"non standard image {k}: size {sample_image_size}, _ {_}")
        img_path = IMG_PATH + annotations.loc[k,'relative_im_path2']
        delete_image(img_path)
        num_deleted += 1
    image_sizes[k] = sample_image_size
logger.info(f"Number of images deleted / scanned: {num_deleted}/{NUM_IMAGE}")

    

KeyboardInterrupt: 

## 3. Dataset representation


In [8]:
model = torchvision.models.resnet18()
model.fc = Identity()
m = model.eval()

NUM_SAMPLE_TEST = 20 # Reduce problem dimension for demonstration purposes
dataset_rep = {k:{'embedding':None,'class_idx':annotations.loc[k,'class2'], 'labelled': True} for k in range(NUM_IMAGE)}
for k in range(NUM_SAMPLE_TEST):
    img, _ = dataset[k]
    img = img[None,:,:,:] # batch dimension is needed allow using batch-api in a non-batch manner
    embedding = m(img)
    # dataset_rep[k] = embedding  # Uses too much memory for my machine
    dataset_rep[k]['embedding'] = embedding[0,:10] # Reduce problem dimension for demonstration purposes
    if (k+1) % 10 == 0: 
        save_path = IMG_PATH + f"cars_emebeddings_{k+1}.pt"
        logger.info(f"images embedded: {k+1}")
        torch_save(dataset_rep, save_path)
save_path = IMG_PATH + f"cars_emebeddings_{k+1}.pt"
logger.info(f"Total number of images embedded: {k+1}")
torch_save(dataset_rep, save_path)

## 4. Partially labelled dataset

In [49]:
NUM_IMAGES = 20 # revised down due to processing limitation
load_path = IMG_PATH + f"cars_emebeddings_{NUM_IMAGES}.pt"
dataset_1 = torch_load(load_path)
new_dataset = pd.DataFrame(dataset_1).T

dataset_label_list = annotations.loc[:NUM_IMAGES,'class2'].tolist()
proportion = 0.60 # 60% unlabelled
label_df = mask_labels(dataset_label_list, proportion)

new_dataset = new_dataset[:NUM_IMAGES]
new_dataset.index.name='idx'
dataset_rep = new_dataset.copy()
new_dataset.loc[:,'class_idx'] = label_df['label']
new_dataset['labelled'] = new_dataset['class_idx'].notnull()



The amount of labeled data is shown 

In [50]:
# Count labeled and unlabeled samples in the new dataset
new_dataset.labelled.value_counts()


False    11
True      9
Name: labelled, dtype: int64

## 5. Train / validation split

In [51]:
def val_split(data_set_input :pd.DataFrame, data_set_labels:List, training_proportion:float):
    X_train, X_test, y_train, y_test = train_test_split(data_set_input, data_set_labels, test_size= 1-training_proportion  )
    return (X_train, X_test, y_train, y_test)


In [52]:
# demonstrates the train/validations split
data_set_input = dataset_rep['embedding']
data_set_labels = dataset_rep['class_idx'].values
training_proportion = 0.6
X_train, X_test, y_train, y_test = val_split(data_set_input, data_set_labels, training_proportion)
train_inputs, test_inputs, train_labels, test_labels = X_train, X_test, y_train, y_test
train_inputs, test_inputs, train_labels, test_labels

(idx
 11    [tensor(0.6834, grad_fn=<UnbindBackward0>), te...
 19    [tensor(0.8647, grad_fn=<UnbindBackward0>), te...
 14    [tensor(0.5153, grad_fn=<UnbindBackward0>), te...
 4     [tensor(0.3922, grad_fn=<UnbindBackward0>), te...
 15    [tensor(0.5952, grad_fn=<UnbindBackward0>), te...
 7     [tensor(0.5582, grad_fn=<UnbindBackward0>), te...
 12    [tensor(0.6606, grad_fn=<UnbindBackward0>), te...
 10    [tensor(0.4729, grad_fn=<UnbindBackward0>), te...
 8     [tensor(0.9126, grad_fn=<UnbindBackward0>), te...
 2     [tensor(1.0437, grad_fn=<UnbindBackward0>), te...
 3     [tensor(0.7866, grad_fn=<UnbindBackward0>), te...
 1     [tensor(0.8416, grad_fn=<UnbindBackward0>), te...
 Name: embedding, dtype: object,
 idx
 13    [tensor(0.6656, grad_fn=<UnbindBackward0>), te...
 18    [tensor(0.6566, grad_fn=<UnbindBackward0>), te...
 5     [tensor(0.5522, grad_fn=<UnbindBackward0>), te...
 6     [tensor(0.8365, grad_fn=<UnbindBackward0>), te...
 17    [tensor(1.0090, grad_fn=<UnbindBackwar

## 6. Experiment

## 7. Active learning

## 8. Training and Evaluation
