<a href="https://colab.research.google.com/github/mamuncseru/deep_understanding_deep_learning/blob/main/DUDL_data_datasetLoader.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# import libraries
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset

## Datasets

In [3]:
# Synthetic data set built in numpy: one row per observation, one column per feature
nObservations, nFeatures = 100, 20

# draw every entry from a standard normal distribution
data = np.random.standard_normal(size=(nObservations, nFeatures))

In [4]:
# Build a pytorch tensor from the numpy array (torch.tensor copies the data)
dataT = torch.tensor(data)

# Summarise the numpy array: container type, shape, element dtype
print('Numpy data: ', type(data), data.shape, data.dtype, ' ', sep='\n')

# Same summary for the tensor; the dtype is carried over from numpy
print('Tensor data: ', type(dataT), dataT.size(), dataT.dtype, ' ', sep='\n')

Numpy data: 
<class 'numpy.ndarray'>
(100, 20)
float64
 
Tensor data: 
<class 'torch.Tensor'>
torch.Size([100, 20])
torch.float64
 


In [5]:
# Data types often have to be converted explicitly.
# float32 is the "float" type in pytorch
dataT2 = torch.tensor(data).to(torch.float32)
print(dataT2.dtype)

# int64 is the "long" integer type; casting floats to it drops the fractional part
dataT3 = torch.tensor(data).to(torch.int64)
print(dataT3.dtype)

torch.float32
torch.int64


In [7]:
# display the float32 version of the data (pytorch abbreviates large tensors with "...")
dataT2

tensor([[-0.2136,  1.6850, -0.1273,  ..., -1.5579,  1.1060,  0.1855],
        [-0.7886,  0.3451,  0.5140,  ..., -0.7181, -0.5244,  0.8912],
        [ 1.2190, -0.2542, -0.8772,  ..., -1.0539,  0.1826,  0.2899],
        ...,
        [-0.5079,  0.3029, -0.0847,  ...,  1.1270,  0.0905,  0.9156],
        [-0.5867,  1.2015, -0.1808,  ..., -1.5030, -0.6807,  0.2147],
        [ 0.2196,  0.3490, -0.8075,  ...,  0.0242,  0.9437,  0.1757]])

In [6]:
# display the int64 version: the cast truncated toward zero (e.g. 1.6850 -> 1, -1.5579 -> -1)
dataT3

tensor([[ 0,  1,  0,  ..., -1,  1,  0],
        [ 0,  0,  0,  ...,  0,  0,  0],
        [ 1,  0,  0,  ..., -1,  0,  0],
        ...,
        [ 0,  0,  0,  ...,  1,  0,  0],
        [ 0,  1,  0,  ..., -1,  0,  0],
        [ 0,  0,  0,  ...,  0,  0,  0]])

In [9]:
# convert the tensor into a pytorch dataset

# NOTE: TensorDataset(data) is not valid here -- `data` is a numpy array, not a tensor
dataset = TensorDataset(dataT)

# dataset.tensors is a tuple holding the tensors passed to TensorDataset;
# only the data was given (no labels yet), so it has a single element here
dataset.tensors

(tensor([[-0.2136,  1.6850, -0.1273,  ..., -1.5579,  1.1060,  0.1855],
         [-0.7886,  0.3451,  0.5140,  ..., -0.7181, -0.5244,  0.8912],
         [ 1.2190, -0.2542, -0.8772,  ..., -1.0539,  0.1826,  0.2899],
         ...,
         [-0.5079,  0.3029, -0.0847,  ...,  1.1270,  0.0905,  0.9156],
         [-0.5867,  1.2015, -0.1808,  ..., -1.5030, -0.6807,  0.2147],
         [ 0.2196,  0.3490, -0.8075,  ...,  0.0242,  0.9437,  0.1757]],
        dtype=torch.float64),)

In [14]:
# Second attempt, this time with labels: one value in {1, 2, 3, 4} per observation
labels = torch.linspace(.01, 4, nObservations).ceil()

# store the labels as a column vector, i.e. a matrix with a single column
labels = labels.reshape(-1, 1)

# build a new dataset that pairs the data with the labels
dataset = TensorDataset(dataT, labels)

# sizes of the stored tensors: data first, labels second
print(dataset.tensors[0].shape, dataset.tensors[1].shape, sep='\n')

# for comparison: a plain 1D numpy array of random ints has no column dimension
print(np.random.randint(5, size=nObservations).shape)

torch.Size([100, 20])
torch.Size([100, 1])
(100,)


## DataLoaders

In [15]:
# Wrap the dataset in a DataLoader that serves it in mini-batches.
# No shuffling is requested, so batches follow the order of the dataset.
batchsize = 25
dataloader = DataLoader(dataset=dataset, batch_size=batchsize)

# the loader keeps a reference to the complete dataset; check the size of its data tensor
dataloader.dataset.tensors[0].shape

torch.Size([100, 20])

In [16]:
# iterate over the loader once and report the size of every (data, labels) batch
for dat, labs in dataloader:
    print('BATCH INFO:', dat.size(), labs.size(), ' ', sep='\n')

BATCH INFO:
torch.Size([25, 20])
torch.Size([25, 1])
 
BATCH INFO:
torch.Size([25, 20])
torch.Size([25, 1])
 
BATCH INFO:
torch.Size([25, 20])
torch.Size([25, 1])
 
BATCH INFO:
torch.Size([25, 20])
torch.Size([25, 1])
 


In [18]:
# look at the labels in each batch (transposed to a row so they print compactly)
for dat, labs in dataloader:
    print(labs.T, ' ', sep='\n')

tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1.]])
 
tensor([[2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
         2., 2., 2., 2., 2., 2., 2.]])
 
tensor([[3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3.,
         3., 3., 3., 3., 3., 3., 3.]])
 
tensor([[4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4.,
         4., 4., 4., 4., 4., 4., 4.]])
 


In [23]:
# Try again with shuffling.
# The DataLoader has to be rebuilt with shuffle=True for this cell to shuffle on a
# fresh kernel; without this line the loop repeats the ordered labels of the
# unshuffled loader. Shuffling happens each time the loader is iterated, so every
# pass over the data gets a new random order.
# drop_last=True discards a trailing incomplete batch; with 100 observations and
# batches of 25 nothing is actually dropped.
dataloader = DataLoader(dataset, batch_size=batchsize, shuffle=True, drop_last=True)

# inspect the labels: the label values are now mixed within each batch
for dat, labs in dataloader:
    print(labs.T)
    print(' ')

tensor([[1., 3., 3., 2., 4., 3., 2., 4., 1., 3., 2., 3., 1., 3., 3., 4., 3., 4.,
         2., 1., 1., 2., 4., 2., 4.]])
 
tensor([[1., 4., 3., 3., 2., 4., 1., 4., 4., 1., 2., 4., 1., 3., 3., 1., 4., 4.,
         2., 1., 3., 4., 1., 3., 2.]])
 
tensor([[1., 3., 1., 3., 2., 1., 3., 2., 2., 1., 3., 1., 1., 1., 4., 2., 1., 3.,
         2., 2., 3., 1., 4., 4., 4.]])
 
tensor([[3., 4., 2., 2., 2., 4., 1., 1., 2., 2., 1., 1., 4., 2., 2., 3., 3., 3.,
         3., 4., 4., 4., 2., 4., 2.]])
 


In [21]:
# To grab a single batch (e.g. for testing): start a fresh iterator over the
# loader and take its first item
first_batch = next(iter(dataloader))
dat, labs = first_batch
labs.T

tensor([[4., 3., 4., 1., 4., 4., 1., 3., 3., 1., 2., 4., 2., 3., 1., 3., 1., 3.,
         4., 2., 2., 4., 2., 2., 1.]])