<a href="https://colab.research.google.com/github/owenmk/ok_cars/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Cars Dataset
Exploring the importance of size for supervised learning
### reference
[1] 3D Object Representations for Fine-Grained Categorization
Jonathan Krause, Michael Stark, Jia Deng, Li Fei-Fei
4th IEEE Workshop on 3D Representation and Recognition, at ICCV 2013 (3dRR-13). Sydney, Australia. Dec. 8, 2013.

###  imports

In [None]:
from typing import List
import torchvision.datasets.stanford_cars as stanford_cars
from torchvision import datasets, io, models, ops, transforms, utils
import pandas as pd
import scipy.io
import logging
import os

# Create a module-level logger named after this module.
logger = logging.getLogger(__name__)
# Set this logger's own threshold to DEBUG. This call does not configure any
# handler. NOTE(review): whether DEBUG/INFO records are actually displayed
# depends on the handlers installed by the runtime -- confirm in the notebook.
logger.setLevel(logging.DEBUG)


In [None]:
# Load the Cars dataset images and the accompanying annotation table.
# The Cars Dataset can be found here: https://ai.stanford.edu/~jkrause/cars/car_dataset.html
# Data from reference [1] has been copied to an accessible filepath.

# Expected number of images; the annotation table is checked against it below.
NUM_IMAGE = 16185
"""The dataset contains 16185 images. Here numbered with idx = 0, 1, ..."""

# Root directory holding the images and `cars_annos.mat`.
# NOTE(review): this is a machine-specific absolute path; the notebook will not
# run on another machine without changing it.
IMG_PATH = '/mnt/c/Users/Owen/Documents/img_data/'
"""Data from reference [1] has been copied to an accessible filepath."""

# Make image data available in the notebook.
# Alternative preprocessing (resize + centre crop), currently disabled:
#
# transform = transforms.Compose([transforms.Resize(255),
#                                 transforms.CenterCrop(224),
#                                 transforms.ToTensor()])
# Images are only converted to tensors; no resizing or cropping is applied.
transform = transforms.ToTensor()
# NOTE(review): ImageFolder is documented to expect one sub-directory per class
# under the root -- confirm the on-disk layout below IMG_PATH.
dataset = datasets.ImageFolder(IMG_PATH, transform=transform)

# Obtain train/test split and other annotations from the original author
mat = scipy.io.loadmat(IMG_PATH +"cars_annos.mat")
# Build a table from the first row of mat['annotations']; the assert below
# expects it to have one row per image.
annotations = pd.DataFrame(mat['annotations'][0])
# The raw cells are array-wrapped, so each value is unwrapped by indexing:
# [0,0] for 'class' and 'test', [0] for 'relative_im_path'. The unwrapped
# values are stored in new '*2' columns; the raw columns are left in place.
# (presumably loadmat wraps MATLAB scalars/strings in arrays -- TODO confirm)
annotations['class2']=[c[0,0] for c in annotations['class'].values] # manage indexing
annotations['test2']=[c[0,0] for c in annotations['test'].values] # manage indexing
annotations['relative_im_path2']=[c[0] for c in annotations['relative_im_path'].values] # manage indexing
# Sanity check: the number of annotation rows must equal NUM_IMAGE.
assert annotations.shape[0]==NUM_IMAGE, "annotation problem"
# Bare expression: shown as the cell output when run as the last statement of
# a notebook cell.
annotations


## 1. Masking labels

In [None]:
def mask_labels(dataset_labels: List, proportion: float)->pd.DataFrame:
    """
    Remove a subset of labels from samples. Ensure each class has at least one labeled sample.
    To remove 10% of labels, use proportion=0.1.

    Args:
        dataset_labels: labels in sample order, such that dataset_labels[0] is
            the label of the first sample and there are no gaps.
        proportion: fraction of all samples whose label is removed. The number
            of removed labels is int(proportion * number_of_samples).

    Returns:
        A DataFrame indexed by sample position (index name 'idx') with columns
        'orig_label' (the input labels) and 'label' (the input label for kept
        samples, None for masked samples). An empty input yields an empty frame.

    Raises:
        AssertionError: if proportion is so large that one labeled sample per
            class could not be kept.

    The selection is random and uses pandas' default random state.
    """
    # Construct the working table: one row per sample, all labels masked.
    labels = list(dataset_labels)
    df = pd.DataFrame(data={'orig_label': labels, 'label': [None] * len(labels)})
    df.index.name = 'idx'
    num_samples = df.shape[0]
    if num_samples == 0:
        # Nothing to mask; also avoids dividing by zero below.
        return df

    # Select samples to guarantee one from each class
    one_per_class_idx = df.groupby('orig_label').sample(n=1).index
    num_classes = len(one_per_class_idx)

    # How many samples to keep, in addition to 1 per class. Select them.
    max_drop_frac = 1 - (num_classes / num_samples)
    assert proportion <= max_drop_frac, f"proportion must be at most {max_drop_frac}"
    num_new_keep = num_samples - num_classes - int(proportion*num_samples)
    remaining_idx = df.index.difference(one_per_class_idx)
    extra_keep_idx = df.loc[remaining_idx, :].sample(n=num_new_keep).index

    # For the list of all sample indices to keep, transfer the known label
    keep_idx = one_per_class_idx.union(extra_keep_idx)
    df.loc[keep_idx,'label'] = df.loc[keep_idx,'orig_label']

    return df
# Example: mask 30% of the labels taken from the 'class2' annotation column.
# The result has one row per sample; masked samples have label None.
dataset_label_list = annotations['class2'].tolist()
proportion = 0.3
label_df = mask_labels(dataset_label_list, proportion)

## 2. Data cleaning
Find any images that do not have 3 channels and flag them for deletion (actual file removal is disabled by default).

In [None]:
def is_3chan_image(tensor_size)->bool:
    """Return True when the first entry of `tensor_size` (the channel count) is 3."""
    # TODO: confirm functionality using a synthesized b/w image: transform = transforms.Grayscale()
    return tensor_size[0] == 3

def delete_image(imgpath: str, *, dry_run: bool = True):
    """Log the removal of an image file and, unless `dry_run`, delete it.

    Args:
        imgpath: path of the image file to remove.
        dry_run: when True (the default) nothing is deleted and only the log
            messages are emitted. Pass dry_run=False to actually remove the
            file from disk.

    Raises:
        OSError: re-raised after being logged if the file cannot be removed.
    """
    logger.info(f"Deleting image {imgpath}")
    try:
        if not dry_run:
            os.remove(imgpath)
        logger.info(f"Removing sample {imgpath}")
    except OSError as e:
        logger.error(f"Error {e}")
        raise

# Scan the dataset to confirm all 3 channels are present in each image.
# Records the tensor size of every image in `image_sizes` (keyed by index) and
# passes any image without 3 channels to delete_image().
# NOTE(review): torchvision's default image loader is documented to convert
# images to RGB on load; if so, this check can never fire -- confirm.
logger.info("Check all images have 3 channels...")
image_sizes = {}
num_deleted = 0
for k in range(NUM_IMAGE):
    # Avoid binding `_`, which notebooks use for the last cell result.
    sample_img, class_idx = dataset[k]
    if k % 1000 == 0:
        logger.info(f"images checked: {k}")
    sample_image_size = sample_img.size()
    if not is_3chan_image(sample_image_size):
        print(f"non standard image {k}: size {sample_image_size}, _ {class_idx}")
        # Use the path the dataset itself loaded for index k, rather than
        # assuming the annotation table is in the same order as the dataset.
        img_path = dataset.samples[k][0]
        delete_image(img_path)
        num_deleted += 1
    image_sizes[k] = sample_image_size
logger.info(f"Number of images deleted / scanned: {num_deleted}/{NUM_IMAGE}")

    

## 3. Dataset representation


In [None]:
# One record per image index 0..NUM_IMAGE-1; every field starts as None.
dataset_rep = dict(
    (idx, {'embedding': None, 'class_idx': None, 'labelled': None})
    for idx in range(NUM_IMAGE)
)


# Development continues...

In [None]:
# Scratch cell: apply a Grayscale transform to `sample_img`, i.e. whatever
# image was last assigned to that name (the channel-scan loop assigns it).
# NOTE: this rebinds the module-level name `transform`, which was previously
# the ToTensor instance passed to ImageFolder.
transform = transforms.Grayscale()
img = transform(sample_img)

In [None]:
# Show the built-in help for the torchvision `stanford_cars` module imported at the top.
help(stanford_cars)