# How to create a custom dataset

In [None]:
import numpy, torch
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data.dataloader as dataloader
import torch.optim as optim

from PIL import Image
from torch.utils.data.dataset import Dataset
from torch.utils.data import TensorDataset
from torch.autograd import Variable
from torchvision import transforms
from torchvision.datasets import MNIST
import matplotlib.pyplot as plt

# Single seed value shared by every generator seeded in this cell.
SEED = 1

# Remember whether PyTorch reports a usable CUDA device.
cuda = torch.cuda.is_available()

# Seed PyTorch's default generator so repeated runs give the same numbers.
torch.manual_seed(SEED)

# Seed the CUDA generator as well, but only when a GPU is present.
if cuda:
    torch.cuda.manual_seed(SEED)

In [None]:
class CustomDatasetFromCSV(Dataset):
    """
    Data set of grayscale images listed in a CSV file.

    The CSV file is read without a header row: its first column holds the
    image paths and its second column holds the labels. Images are opened
    lazily, one at a time, when an item is requested.

    Attributes
    ----------
    transforms : callable or None
        The transformations applied to each loaded image, if any.
    data_info : pd.DataFrame
        The raw content of the CSV file.
    image_arr : np.ndarray
        The image paths (first CSV column).
    label_arr : np.ndarray
        The labels (second CSV column).
    data_len : int
        The number of rows in the CSV file.
    width : int
        The width the pictures are resized to.
    height : int
        The height the pictures are resized to.

    Examples
    --------
    >>> dataset = CustomDatasetFromCSV('./images.csv', 28, 28)
    >>> image, label = dataset[0]
    """

    def __init__(self, csv_path, width, height, transforms=None):
        """
        Create the data set.

        Parameters
        ----------
        csv_path : str
            The path to the CSV file (no header row; image path in the
            first column, label in the second).
        width : int
            The width the pictures are resized to.
        height : int
            The height the pictures are resized to.
        transforms : callable, optional
            The transformations to apply to each resized grayscale image.
            When omitted, images are converted to a float tensor scaled
            to [0, 1].
        """
        # NOTE: inside this method the `transforms` argument shadows any
        # module of the same name; it is only stored here, never called.
        self.transforms = transforms
        self.data_info = pd.read_csv(csv_path, header=None)
        self.image_arr = np.asarray(self.data_info.iloc[:, 0])
        self.label_arr = np.asarray(self.data_info.iloc[:, 1])
        self.data_len = len(self.data_info.index)
        self.width = width
        self.height = height

    @staticmethod
    def _to_tensor(image):
        """Convert 8-bit grayscale pixels to a (1, H, W) float32 tensor in [0, 1]."""
        pixels = np.asarray(image, dtype=np.float32) / 255.0
        # Add the leading channel axis expected for single-channel images.
        return torch.from_numpy(pixels).unsqueeze(0)

    def __getitem__(self, index):
        """
        Get an item of the data set.

        The image is loaded from disk, resized to ``(width, height)`` with
        bicubic resampling and converted to 8-bit grayscale before the
        transformations are applied.

        Parameters
        ----------
        index : int
            The index of the item to get.

        Returns
        -------
        out : (tensor, object)
            The transformed image and its label. Without user supplied
            transformations the image is a float32 tensor of shape
            ``(1, height, width)`` with values in [0, 1].

        Raises
        ------
        IndexError
            If `index` is outside the data set.
        """
        image_path = self.image_arr[index]

        # The context manager releases the file handle as soon as the
        # pixels have been read; `resize` returns an independent image.
        with Image.open(image_path) as source_image:
            resized = source_image.resize(
                (self.width, self.height), Image.BICUBIC
            )
        grayscale = resized.convert('L')

        if self.transforms is not None:
            image = self.transforms(grayscale)
        else:
            # Previously this path left the result unbound and raised
            # UnboundLocalError; fall back to a plain tensor conversion.
            image = self._to_tensor(grayscale)

        return (image, self.label_arr[index])

    def __len__(self):
        """
        Length of the data set.

        Returns
        -------
        out : int
            The number of rows read from the CSV file.

        Examples
        --------
        >>> dataset = CustomDatasetFromCSV('./images.csv', 28, 28)
        >>> print(len(dataset))
        """
        return self.data_len

In [None]:
# Pipeline applied to every loaded picture: convert the PIL image to a tensor.
transformations = transforms.Compose([
    transforms.ToTensor(),
])

# Training set built from the CSV index, with pictures resized to 28x28.
train = CustomDatasetFromCSV(
    './img/images.csv',
    width=28,
    height=28,
    transforms=transformations,
)

In [None]:
# Fetch the first sample through the indexing protocol instead of calling
# the dunder method directly.
img, label = train[0]

# squeeze(0) drops the leading axis when it has size 1 (assumes the sample is
# a single-channel (1, H, W) tensor, as produced by the pipeline above).
# The gray colormap keeps the grayscale picture from being drawn in
# matplotlib's default false-colour palette.
plt.imshow(img.squeeze(0), cmap='gray')