In [2]:
import numpy as np
import pandas as pd

# Import Training Data and show metadata

In [7]:
train_df = pd.read_csv("data/train.csv")
print(train_df.shape)
print("Train Columns: ", train_df.columns)
print("Label: ", train_df.columns[0])

(42000, 785)
Train Columns:  Index(['label', 'pixel0', 'pixel1', 'pixel2', 'pixel3', 'pixel4', 'pixel5',
       'pixel6', 'pixel7', 'pixel8',
       ...
       'pixel774', 'pixel775', 'pixel776', 'pixel777', 'pixel778', 'pixel779',
       'pixel780', 'pixel781', 'pixel782', 'pixel783'],
      dtype='object', length=785)
Label:  label


# Import Testing Data and show metadata

In [8]:
test_df = pd.read_csv('data/test.csv')
print(test_df.shape)
print("Test Columns: ", test_df.columns)

(28000, 784)
Test Columns:  Index(['pixel0', 'pixel1', 'pixel2', 'pixel3', 'pixel4', 'pixel5', 'pixel6',
       'pixel7', 'pixel8', 'pixel9',
       ...
       'pixel774', 'pixel775', 'pixel776', 'pixel777', 'pixel778', 'pixel779',
       'pixel780', 'pixel781', 'pixel782', 'pixel783'],
      dtype='object', length=784)


# Observations so far
There are 42,000 different images in the training dataset and 28,000 different images in the testing dataset.

There are 735 columns. The first column appears to be the label (ground truth data) and the rest appear represent the image itself.

The testing dataset does not have any ground truth data

# Creating dataset class

## What are Datasets and DataLoaders?
Processing data samples can cause code to get messy. Datasets and DataLoaders provide a way to modularize data samples and make it easier for the model to access the samples and labels that go along with the dataset. Datasets and Dataloaders provide a way to decouple interacting with the data sample from the actual training and testing of a model. Datasets stores the samples and DataLoaders provide an easy way to acccess the samples. 

`
from torchvision import datasets
`

It is also possible to load current datasets with the code above

In [20]:
import torch
from torch.utils.data import Dataset
from torchvision import datasets
from torchvision.transforms import ToTensor, Lambda
import os

class CustomMNISTDataset(Dataset):
    def __init__(self, csv_name, img_dir, transform=None, target_transform=None, label_name = 'label'):
        self.img_filename = csv_name
        self.img_dir = img_dir
        self.transform = transform
        self.target_transform = target_transform
        self.label_name = label_name

        img_path = os.path.join(self.img_dir, self.img_filename)
        self.img_df = pd.read_csv(img_path)

    def __len__(self):
        return len(self.img_df)

    def __getitem__(self, index):
        img_cols = [i for i in self.img_df.columns if i not in self.label_name]
        image = self.img_df.iloc[[index]][img_cols].values

        image = image.reshape(28,28)
        image = image/255.0

        if self.transform:
            image = self.transform(image)

        image = image.float()

        if self.label_name in self.img_df.columns:
            if self.target_transform:
                label = self.target_transform(label)
            label = int(self.img_df.iloc[[index]][self.label_name].values)
            return image, label
        else:
            return image

            

# Checking the Distribution of target labels

In [11]:
train_df['label'].value_counts().sort_index()

label
0    4132
1    4684
2    4177
3    4351
4    4072
5    3795
6    4137
7    4401
8    4063
9    4188
Name: count, dtype: int64

# Splitting Training Dataset

In [14]:
from sklearn.model_selection import train_test_split
indices = list(range(len(train_df)))

train_indices, test_indices = train_test_split(indices, test_size=0.1, stratify=train_df['label'])

len(train_indices), len(test_indices) , len(train_df)

(37800, 4200, 42000)

In [15]:
train_subset = train_df.loc[train_indices]
val_subset = train_df.loc[test_indices]

print("Distribution of target values in training dataset ; ")
print( train_subset['label'].value_counts().sort_index() / train_subset['label'].value_counts().sort_index().sum() )

print("Distribution of target values in validation dataset ; ")
print( val_subset['label'].value_counts().sort_index() / val_subset['label'].value_counts().sort_index().sum() )

Distribution of target values in training dataset ; 
label
0    0.098386
1    0.111534
2    0.099444
3    0.103598
4    0.096958
5    0.090344
6    0.098492
7    0.104788
8    0.096746
9    0.099709
Name: count, dtype: float64
Distribution of target values in validation dataset ; 
label
0    0.098333
1    0.111429
2    0.099524
3    0.103571
4    0.096905
5    0.090476
6    0.098571
7    0.104762
8    0.096667
9    0.099762
Name: count, dtype: float64


In [21]:
from torchvision import transforms

train_csv_name = 'train.csv'
test_csv_name = 'test.csv'
img_dir = 'data/'

transform = transforms.Compose( [transforms.ToTensor() , transforms.Normalize((0.5,), (0.5,)) , ] )

train_dataset = CustomMNISTDataset(csv_name=train_csv_name, img_dir=img_dir, transform=transform, target_transform=None, label_name='label')
x0 , y0 = train_dataset[0]
print(x0.shape , y0)

AttributeError: 'numpy.ndarray' object has no attribute 'float'