In [3]:
"""
Mark Campbell
Data Science Fellowship program exam
Stream 1 - Machine Learning
2022-12-03
"""
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.io
import torch
from matplotlib import image
from PIL import Image
from torchvision import transforms

# Project root directory. Defaults to the original author's local folder; set the
# GC_DSF_PROJDIR environment variable to run the notebook from another location
# without editing the code.
projdir = os.environ.get(
    'GC_DSF_PROJDIR',
    r'C:\Users\Mark\Documents\Resume\gc_data_science_fellowship')
# Folder holding the car images, directly under the project root.
IMGS_PATH = os.path.join(projdir, 'car_ims')

In [8]:
# TASK 1 - Build a function that converts a labelled dataset into labelled and unlabelled subsets.

def labelled_unlabelled_split(dataset_labels, proportion):
    """
    Description:
    ------------
    Create a logical mask that can be used to perform a stratified split on the dataset from which the input
    dataset_labels come. The logical mask is True for samples that should be labelled (training set) and False for
    samples that should be unlabelled (testing set).

    The split is deterministic: for every class the first ``round(count * proportion)`` occurrences (in input order,
    and never fewer than one) are marked as labelled.

    Parameters:
    -----------
    dataset_labels (sequence):
        list, tuple or array of labels. Labels must be integers.
    proportion (float):
        the proportion of instances for each unique class label in dataset_labels that should be included in the
        training set. Must lie in [0, 1].

    Returns:
    --------
    labelled_subset_mask (array):
        Array of booleans the same length as dataset_labels. An element is True if the label is to be included in the
        labelled dataset, False otherwise.
    kept_labels (array):
        the labels of the samples for which the subset mask is True
    unkept_labels (array):
        the labels of the samples for which the subset mask is False

    Raises:
    -------
    AssertionError:
        if proportion is outside [0, 1].

    Warns:
    ------
    UserWarning:
        once per class whose every instance ends up in the labelled subset.
    """
    assert 0 <= proportion <= 1, "argument 'proportion' must be between 0 and 1, inclusive"
    # np.asarray accepts any sequence (the previous `.copy()` call required a list/array);
    # the array is never mutated below, so no defensive copy is needed.
    labels_ = np.asarray(dataset_labels)
    classes, counts = np.unique(labels_, return_counts=True)
    labelled_subset_mask = np.full(labels_.size, False)

    for c, count in zip(classes, counts):
        # ensure each class has at least one instance labelled within the dataset
        keep = max(1, int(round(count * proportion)))
        if keep == count:
            # stacklevel=2 attributes the warning to the caller's line rather than this helper
            warnings.warn(
                f'Insufficient samples - class label {c} does not appear in the unlabelled dataset',
                stacklevel=2)

        # positions of this class in input order; the first `keep` of them are labelled
        this_class_idx = np.where(labels_ == c)[0]
        labelled_subset_mask[this_class_idx[:keep]] = True
    # end for
    kept_labels = labels_[labelled_subset_mask]
    unkept_labels = labels_[~labelled_subset_mask]

    return labelled_subset_mask, kept_labels, unkept_labels
    

In [28]:
# TASK 2 - Data cleaning


def find_non_rgb_img(path):
    """
    Description:
    ------------
    Identify images that are not in RGB format, i.e. whose decoded pixel array is not
    three-dimensional with exactly three channels on the last axis.

    Parameters:
    -----------
    path (str):
        Path containing images

    Returns:
    --------
        flagged_paths (list):
            full paths of images within path that were not in RGB format.
    """
    def _is_rgb(img_file):
        # an RGB image decodes to an array with ndim 3 and 3 channels on the last axis
        pixels = image.imread(img_file)
        return pixels.ndim == 3 and pixels.shape[2] == 3

    candidates = (os.path.join(path, entry) for entry in os.listdir(path))
    return [candidate for candidate in candidates if not _is_rgb(candidate)]
    
    
# Collect every file under IMGS_PATH that find_non_rgb_img flags as non-RGB and
# report how many were found.
delete_imgs = find_non_rgb_img(IMGS_PATH)
print(len(delete_imgs))

# WARNING: permanently deletes the flagged files from disk. Re-running this cell
# after a successful run finds nothing left to delete.
for path in delete_imgs:
    os.remove(path)
# end for

34


In [6]:
# TASK 3 - Dataset representation


def load_annotations(mat_file):
    """
    Description:
    ------------
    Load cars dataset annotations from .mat file to pandas DataFrame.

    Each field of the MATLAB struct array stored under the key 'annotations' becomes one
    column, holding the first element of that field's value. When a 'relative_im_path'
    column is present, an extra 'fnames' column is added with the text after the last '/'
    of each path.

    Parameters:
    -----------
    mat_file (str):
        full path to .mat file containing cars dataset annotations

    Returns:
    --------
    annotations (pd.DataFrame)
        loaded annotations, with a 'fnames' column when file names could be derived
    """
    loaded = scipy.io.loadmat(mat_file)
    annotations = loaded['annotations'][0]
    columns = list(annotations.dtype.names)
    # every field value is wrapped in an array by loadmat; keep only its first element
    unpacked_ann = [[elem.flat[0] for elem in ann] for ann in annotations]
    annotations = pd.DataFrame(data=unpacked_ann, columns=columns)
    try:
        annotations['fnames'] = [name.split('/')[-1] for name in annotations['relative_im_path']]
    except (KeyError, AttributeError):
        # Best effort: the column is missing (KeyError) or does not hold strings
        # (AttributeError), so no 'fnames' column is added. Catching only these two keeps
        # KeyboardInterrupt and genuine programming errors visible, unlike a bare except.
        pass
    # end try
    return annotations
    

def mk_data_representation_map(imgs_path, annotations_path, model, preprocess):
    """
    Description:
    ------------
    Build the dataset representation: one entry per file in imgs_path, keyed by the file's
    position in os.listdir(imgs_path), holding its embedding, class label and a flag that
    says whether a label is available.

    Parameters:
    -----------
    imgs_path (str):
        directory containing the image files
    annotations_path (str):
        full path to the .mat annotations file, read with load_annotations
    model:
        passed through to get_embedding to produce the embedding
    preprocess:
        passed through to get_embedding to prepare each image for the model

    Returns:
    --------
    representation (dict):
        {index: {'embedding': <output of get_embedding>, 'class_idx': <label or None>, 'labelled': <bool>}}
        'class_idx' is None and 'labelled' is False when the annotations have no 'class'
        column, when the file has no annotation row, or when its class value is missing.
    """
    annotations = load_annotations(annotations_path)
    has_class_column = 'class' in annotations.columns

    # Build the file-name -> class lookup once instead of scanning the DataFrame for every
    # image. setdefault keeps the first row for a duplicated file name, matching `.values[0]`.
    class_by_fname = {}
    if has_class_column:
        for annotated_name, annotated_class in zip(annotations.fnames.values, annotations['class'].values):
            class_by_fname.setdefault(annotated_name, annotated_class)
        # end for
    # end if

    representation = {}
    for i, fname in enumerate(os.listdir(imgs_path)):
        path = os.path.join(imgs_path, fname)
        embedding = get_embedding(model, preprocess, path)

        # .get returns None for files without an annotation row (previously an IndexError)
        class_idx = class_by_fname.get(fname)
        # pd.isna also catches NaN, which the former `is None` test let through as "labelled"
        has_label = class_idx is not None and not pd.isna(class_idx)
        if not has_label:
            class_idx = None
        # end if
        representation[i] = {
            'embedding': embedding,
            'class_idx': class_idx,
            'labelled': has_label}
    # end for
    return representation


def get_embedding(model, preprocess, img_path):
    """
    Description:
    ------------
    Compute the model output for a single image file.

    Parameters:
    -----------
    model (callable):
        called on the single-image batch; its return value is the embedding
    preprocess (callable):
        converts the opened PIL image into a tensor
    img_path (str):
        full path of the image file

    Returns:
    --------
    embedding:
        whatever model returns for the one-image batch (computed with gradients disabled)
    """
    # The context manager closes the file deterministically. convert('RGB') leaves RGB
    # images unchanged and turns greyscale / palette / RGBA files into 3 channels, so a
    # 3-channel preprocessing pipeline does not fail on them.
    with Image.open(img_path) as input_img:
        input_tensor = preprocess(input_img.convert('RGB'))
    # end with
    # add a leading batch dimension of size 1
    input_batch = input_tensor.unsqueeze(0)
    with torch.no_grad():
        embedding = model(input_batch)
    # end with
    return embedding

    
# {1: {'embedding': <np.ndarray>, 'class_idx': <int>, 'labelled': <boolean or int>}}
    
annotations_path = os.path.join(projdir, 'labels', 'cars_annos.mat')

# Pretrained ResNet-18 used as a fixed feature extractor: replacing the final fully
# connected layer with Identity makes the model return the features that fed that layer.
model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', pretrained=True)
model.fc = torch.nn.Identity()
# Modules are created in training mode. Switch to evaluation mode so that BatchNorm uses
# its stored running statistics instead of per-image batch statistics, and so that those
# running statistics are not modified while embeddings are extracted.
model.eval()

preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

all_data = mk_data_representation_map(IMGS_PATH, annotations_path, model, preprocess)

# Persist the representation so later cells can reload it without recomputing embeddings.
save_path = os.path.join(projdir, 'data.pt')

torch.save(all_data, save_path)

Using cache found in C:\Users\Mark/.cache\torch\hub\pytorch_vision_v0.10.0


In [9]:
# TASK 4 - Build a partially labelled dataset

# Reload the saved representation and pick, per class, the 40% of samples that keep their label.
data = torch.load(save_path)
dataset_labels = [entry['class_idx'] for entry in data.values()]
labelled_subset_mask, kept_labels, unkept_labels = labelled_unlabelled_split(dataset_labels, 0.4)

# Copy every entry; entries outside the labelled subset have their label blanked out.
partial_dataset = {}
for key, val in data.items():
    if labelled_subset_mask[key]:
        new_val = val.copy()
    else:
        new_val = {**val, 'class_idx': np.nan, 'labelled': False}
    # end if
    partial_dataset[key] = new_val
# end for


In [12]:
# TASK 5 - Create train/validation split


def train_test_split(dataset_inputs, dataset_labels, training_proportion):
    """
    Description:
    ------------
    Create a stratified train/validation split. The assignment of samples to each side is
    delegated to labelled_unlabelled_split.

    Parameters:
    -----------
    dataset_inputs (list or array):
        feature vectors or embeddings, one per label. Converted with np.asarray so that a
        plain list can be indexed with the boolean split mask.
    dataset_labels (list):
        The class label corresponding to each element in dataset_inputs.
    training_proportion (float)
        The proportion ([0, 1]) of instances for each unique class label in dataset_labels that should be included
        in the training set.

    Returns:
    --------
    training_inputs (array):
        Elements from dataset_inputs that comprise the training set.
    training_labels (array):
        The class labels for training_inputs.
    test_inputs (array):
        Elements from dataset_inputs that comprise the testing set.
    test_labels (array):
        The class labels for test_inputs.
    """
    labelled_subset_mask, training_labels, test_labels = labelled_unlabelled_split(dataset_labels, training_proportion)
    # boolean-mask indexing only works on arrays; a Python list would raise TypeError
    inputs_ = np.asarray(dataset_inputs)
    training_inputs = inputs_[labelled_subset_mask]
    test_inputs = inputs_[~labelled_subset_mask]
    return training_inputs, training_labels, test_inputs, test_labels
    

In [None]:
# TASK 6 - Create experiment(s) to convince clients that more labelled data will improve model performance


`To convince clients to label some more data, I will create a demonstration that shows how increasing the number of labels 
increases model prediction accuracy. To accomplish this, I will divide the labelled dataset into two subsets. The first subset 
will contain 75% of the labelled data (30% of the full dataset) used for training. The second subset will contain the remaining 
25% of the labelled data (10% of the full dataset) used for testing. Models will be trained using 10, 20, 30, 40, 50,… 100% of 
the labelled data set aside for training. Each model will then be evaluated on the same testing set, which is comprised of the 
same 25% of the labelled data. By plotting accuracy as a function of training set size, I will be able to estimate the impact 
of additional labelled data on model accuracy. Then, the clients will be able to judge if the increase in model accuracy 
expected from the additional data labels is worth the cost of labelling additional data.`

In [None]:
from sklearn.linear_model import SGDClassifier

# Keep only the entries that still carry a label in the partially labelled dataset.
labelled_entries = [entry for entry in partial_dataset.values() if entry['labelled']]
partial_labels = np.array([entry['class_idx'] for entry in labelled_entries])
partial_inputs = np.concatenate([np.array(entry['embedding']) for entry in labelled_entries], axis=0)

# Hold out one fixed, stratified 25% of the labelled data for testing. Every model below is
# scored on this same set, so the accuracies are directly comparable.
pool_x, pool_y, test_x, test_y = train_test_split(partial_inputs, partial_labels, 0.75)

# Fractions of the training pool to use: 10%, 20%, ..., 100%.
props = np.linspace(0.1, 1.0, 10)
accuracies = []
for prop in props:
    with warnings.catch_warnings():
        # At prop == 1.0 every class is used in full, which is intended here; silence the
        # per-class "does not appear in the unlabelled dataset" warnings of the split.
        warnings.simplefilter('ignore', UserWarning)
        train_x, train_y, _, _ = train_test_split(pool_x, pool_y, prop)
    # end with
    # `clf` rather than `model`, so the ResNet feature extractor is not overwritten;
    # the fixed seed makes the curve reproducible.
    clf = SGDClassifier(random_state=0)
    clf.fit(train_x, train_y)
    accuracies.append(clf.score(test_x, test_y))
# end for
fg, ax = plt.subplots(1, 1, figsize=(16, 9))
ax.plot(props, accuracies, marker='o')
ax.set_xlabel('Proportion of the labelled training pool used')
ax.set_ylabel('Accuracy on the fixed test set')
ax.set_title('Effect of the amount of labelled training data on accuracy')
plt.show()

In [None]:
# TASK 7 - Active learning to select new instances to be labelled

In [None]:
# TASK 8 - Final model training and evaluation