In [4]:
import os
from typing import Any, Tuple, Optional, Callable

import cv2
import numpy as np
import pandas as pd
from PIL import Image
from torch.utils.data import Dataset

def read_csv(path: str) -> pd.DataFrame:
    '''
    Load a csv file into a dataframe.

    Args:
        path (str): Location of the csv file on disk.

    Returns:
        pd.DataFrame: Contents of the csv file.

    Raises:
        AssertionError: If `path` does not exist or its extension is not '.csv'.
    '''

    assert os.path.exists(path), f'CSV file not found: {path}!'

    # Compute the extension once; it is used for both the check and the message.
    extension = os.path.splitext(path)[-1]
    assert extension == '.csv', f'Unsupported file type {extension}!'

    return pd.read_csv(path)

class ImageDataset(Dataset):
    '''
    In-memory image dataset built from a dataframe of filenames and optional labels.

    Every image is decoded once in `__init__` and kept in `self.data` as an RGB
    array; `__getitem__` wraps the array in a PIL image and applies the transforms.
    '''

    def __init__(self, dataframe: pd.DataFrame, images_folder: str = './images', transform: Optional[Callable] = None, target_transform: Optional[Callable] = None) -> None:
        '''
        Image dataset.

        Args:
            dataframe (pd.DataFrame): Dataframe with the image filenames ('Filename' column) and,
                optionally, the labels ('Label' column).
            images_folder (str): Directory with all the images.
            transform (callable, optional): Optional transform to be applied on a sample.
            target_transform (callable, optional): Optional transform to be applied on a target.

        Raises:
            AssertionError: If the 'Filename' column or the images folder is missing.
            OSError: If an image listed in the dataframe cannot be read.
        '''
        assert 'Filename' in dataframe.columns, 'Filename column not found!'
        assert os.path.exists(images_folder), f'Image folder not found: {images_folder}!'

        self.dataframe = dataframe
        self.images_folder = images_folder
        self.transform = transform
        self.target_transform = target_transform

        filenames = dataframe['Filename'].tolist()

        # Whether labels exist is a property of the whole frame, so decide it once
        # instead of re-checking on every row. Unlabelled samples get target -1.
        if 'Label' in dataframe.columns:
            targets = [int(label) for label in dataframe['Label']]
        else:
            targets = [-1] * len(filenames)

        data = []
        for filename in filenames:
            image_path = os.path.join(images_folder, filename)
            image = cv2.imread(image_path)

            # cv2.imread signals a missing/undecodable file by returning None rather
            # than raising; fail here with the offending path instead of later in
            # __getitem__ with an unrelated-looking error.
            if image is None:
                raise OSError(f'Could not read image: {image_path}!')

            # OpenCV decodes to BGR channel order while PIL's fromarray interprets a
            # 3-channel array as RGB, so convert once at load time.
            data.append(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

        self.data = data
        self.targets = targets

    def __len__(self) -> int:
        '''
        Returns:
            int: Length of the dataset.
        '''
        return len(self.data)

    def __getitem__(self, index: int) -> Tuple[Any, Any]:
        '''
        Args:
            index (int): Index

        Returns:
            tuple: (image, target) where target is class_index of the target class. For the public test set, target is a class from [0, 1, 2, 3, 4, 5, 6, 7, 8]. For the private test set (before releasing the test set labels), target is -1.
        '''
        img = self.data[index]
        target = self.targets[index]

        # Hand transforms a PIL image built from the stored RGB array.
        img = Image.fromarray(img)

        if self.transform is not None:
            img = self.transform(img)

        if self.target_transform is not None:
            target = self.target_transform(target)

        return img, target

In [5]:
# Load the public (labelled) split and run a quick sanity check on the first sample.
public_dataframe = read_csv('assignment_7_public.csv')
public_dataset = ImageDataset(public_dataframe)

# Fetch the first sample once and reuse it for both checks.
first_image, first_target = public_dataset[0]

print('Image', type(first_image), first_image.size) # Image <class 'PIL.Image.Image'> (28, 28)
print('Target', type(first_target)) # Target <class 'int'>
print('Length', len(public_dataset)) # Length 85744



'''
CODE HERE!
'''

Image <class 'PIL.Image.Image'> (28, 28)
Target <class 'int'>
Length 85744


'\nCODE HERE!\n'

In [9]:
# Display the public dataframe (filenames and labels) as the cell output.
public_dataframe

Unnamed: 0,Filename,Label
0,0.jpg,5
1,1.jpg,8
2,2.jpg,8
3,3.jpg,3
4,4.jpg,1
...,...,...
85739,85739.jpg,5
85740,85740.jpg,5
85741,85741.jpg,1
85742,85742.jpg,2


In [6]:
# Reset every pandas display option to its default, then show the number of
# samples per class, ordered by label.
pd.reset_option('^display.', silent=True)
(
    public_dataframe
    .value_counts(subset=['Label'])
    .sort_index()
    .to_frame()
)

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
0,9379
1,9059
2,9497
3,9812
4,7935
5,11354
6,7570
7,8711
8,12427


In [7]:
# Show the class distribution as percentages. The percent float format is applied
# only while this table is rendered, so it does not change how later cells
# display floats.
with pd.option_context('display.float_format', '{:.2%}'.format):
    display(
        public_dataframe
        .value_counts(['Label'], normalize=True)
        .sort_index(ascending=True)
        .to_frame()
    )

Unnamed: 0_level_0,proportion
Label,Unnamed: 1_level_1
0,10.94%
1,10.57%
2,11.08%
3,11.44%
4,9.25%
5,13.24%
6,8.83%
7,10.16%
8,14.49%


In [None]:
# Load the private (unlabelled) split and sanity-check the first sample.
private_dataframe = read_csv('assignment_7_private.csv')
private_dataset = ImageDataset(private_dataframe)

first_private_image = private_dataset[0][0]
print('Image', type(first_private_image), first_private_image.size) # Image <class 'PIL.Image.Image'> (28, 28)
print('Length', len(private_dataset)) # Length 21436

# Placeholder predictions (-1 for every sample): remove and make your own predictions.
preds = np.full(len(private_dataset), -1,
                dtype=int)
'''
CODE HERE!
e.g.,
preds = np.full(len(private_dataset), -1, dtype=int)
'''

# Guard against writing a submission whose row count differs from the private dataset.
assert len(preds) == len(private_dataset), 'Number of predictions does not match the private dataset size!'

# Write the predictions with the row index as the 'Id' column.
submission = pd.DataFrame({'Label': preds})
submission.to_csv('assignment_7.csv', index=True, index_label='Id')