## Loading Data from files

In [1]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Location of the iris CSV, relative to this notebook
IRIS_CSV = '../PYTORCH_NOTEBOOKS/Data/iris.csv'

# Load the file into a DataFrame and preview the first rows
df = pd.read_csv(IRIS_CSV)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0


## Classical method of creating splits

In [3]:
from sklearn.model_selection import train_test_split

# Separate the feature columns from the class labels before splitting
feature_matrix = df.drop(columns='target').values
target_vector = df['target'].values

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
train_X, test_X, train_y, test_y = train_test_split(
    feature_matrix, target_vector, test_size=0.2, random_state=33)

# Convert the numpy feature arrays to float tensors
X_train = torch.FloatTensor(train_X)
X_test = torch.FloatTensor(test_X)

# 'y' holds class labels, so it is stored as an integer (long) tensor rather than a float one.
# reshape(-1, 1) turns the 1-D label vector into a single column; the -1 means
# "unknown, let PyTorch work this dimension out from the number of elements".
# https://stackoverflow.com/questions/18691084/what-does-1-mean-in-numpy-reshape
y_train = torch.LongTensor(train_y).reshape(-1, 1)
y_test = torch.LongTensor(test_y).reshape(-1, 1)

In [5]:
# Count how many training samples belong to each distinct label
labels, counts = torch.unique(y_train, return_counts=True)

print(f'Training size: {len(y_train)}')
print(f'Labels: {labels}\nCounts: {counts}')

Training size: 120
Labels: tensor([0, 1, 2])
Counts: tensor([42, 42, 36])


In [6]:
# Check the dimensions of the training features and training labels
print(X_train.shape)
print(y_train.shape)

torch.Size([120, 4])
torch.Size([120, 1])


## Using PyTorch's Dataset and DataLoader classes

In [7]:
from torch.utils.data import TensorDataset, DataLoader

# Pull the label column and the remaining feature columns out as numpy arrays
labels = df['target'].values
data = df.drop(columns='target').values

# Pair every feature row with its label: float features, integer (long) labels
feature_tensor = torch.FloatTensor(data)
label_tensor = torch.LongTensor(labels)
iris = TensorDataset(feature_tensor, label_tensor)

In [8]:
# Show the number of samples in the dataset and the dataset's class, one per line
print(len(iris), type(iris), sep='\n')

150
<class 'torch.utils.data.dataset.TensorDataset'>


In [9]:
# Iterating the dataset yields one (features, label) tuple per sample
for sample_features, sample_label in iris:
    print((sample_features, sample_label))

(tensor([5.1000, 3.5000, 1.4000, 0.2000]), tensor(0))
(tensor([4.9000, 3.0000, 1.4000, 0.2000]), tensor(0))
(tensor([4.7000, 3.2000, 1.3000, 0.2000]), tensor(0))
(tensor([4.6000, 3.1000, 1.5000, 0.2000]), tensor(0))
(tensor([5.0000, 3.6000, 1.4000, 0.2000]), tensor(0))
(tensor([5.4000, 3.9000, 1.7000, 0.4000]), tensor(0))
(tensor([4.6000, 3.4000, 1.4000, 0.3000]), tensor(0))
(tensor([5.0000, 3.4000, 1.5000, 0.2000]), tensor(0))
(tensor([4.4000, 2.9000, 1.4000, 0.2000]), tensor(0))
(tensor([4.9000, 3.1000, 1.5000, 0.1000]), tensor(0))
(tensor([5.4000, 3.7000, 1.5000, 0.2000]), tensor(0))
(tensor([4.8000, 3.4000, 1.6000, 0.2000]), tensor(0))
(tensor([4.8000, 3.0000, 1.4000, 0.1000]), tensor(0))
(tensor([4.3000, 3.0000, 1.1000, 0.1000]), tensor(0))
(tensor([5.8000, 4.0000, 1.2000, 0.2000]), tensor(0))
(tensor([5.7000, 4.4000, 1.5000, 0.4000]), tensor(0))
(tensor([5.4000, 3.9000, 1.3000, 0.4000]), tensor(0))
(tensor([5.1000, 3.5000, 1.4000, 0.3000]), tensor(0))
(tensor([5.7000, 3.8000, 1.7

**Once we have a dataset we can wrap it with a DataLoader. This gives us a powerful sampler that provides single- or multi-process iterators over the dataset.**

In [10]:
# Seed PyTorch's global RNG so that shuffle=True produces the same batches on
# every Restart & Run All (33 matches the random_state used for the
# train/test split earlier in this notebook).
torch.manual_seed(33)

# batch_size=105 means the 150-sample dataset is served as one batch of 105
# observations followed by a second batch with the remaining 45.
iris_loader = DataLoader(iris, batch_size=105, shuffle=True)

In [15]:
# Create mini-batches by iterating over the loader.
# The first batch has 105 observations;
# the second batch has the remaining observations.
for i_batch, sample_batched in enumerate(iris_loader):
    # Index 1 of each batch is the label tensor; its length is the batch size
    batch_labels = sample_batched[1]
    print("Number of observations in batch {}: {}".format(i_batch, len(batch_labels)))
    print(i_batch, sample_batched)

Number of observations in batch 0: 105
0 [tensor([[5.0000, 2.3000, 3.3000, 1.0000],
        [4.5000, 2.3000, 1.3000, 0.3000],
        [6.5000, 3.0000, 5.8000, 2.2000],
        [5.1000, 3.5000, 1.4000, 0.2000],
        [6.7000, 3.3000, 5.7000, 2.1000],
        [6.7000, 2.5000, 5.8000, 1.8000],
        [5.1000, 3.8000, 1.5000, 0.3000],
        [5.1000, 3.5000, 1.4000, 0.3000],
        [5.7000, 2.8000, 4.1000, 1.3000],
        [5.6000, 3.0000, 4.1000, 1.3000],
        [6.4000, 2.8000, 5.6000, 2.2000],
        [6.2000, 3.4000, 5.4000, 2.3000],
        [4.8000, 3.4000, 1.6000, 0.2000],
        [5.5000, 2.6000, 4.4000, 1.2000],
        [6.3000, 2.3000, 4.4000, 1.3000],
        [6.1000, 2.6000, 5.6000, 1.4000],
        [5.3000, 3.7000, 1.5000, 0.2000],
        [6.2000, 2.8000, 4.8000, 1.8000],
        [6.3000, 3.3000, 4.7000, 1.6000],
        [5.1000, 3.4000, 1.5000, 0.2000],
        [6.5000, 3.0000, 5.5000, 1.8000],
        [4.6000, 3.4000, 1.4000, 0.3000],
        [7.6000, 3.0000, 6.6000, 2

In [23]:
# Materialise one pass over the loader and take mini-batch number 0.
# Each batch is a [features, labels] pair, so unpack it and count how many
# samples of each class the labels contain.
first_features, first_labels = list(iris_loader)[0]
first_labels.bincount()

tensor([34, 35, 36])

In [25]:
# Start a fresh iterator over the loader and fetch just its first mini-batch
batch_iterator = iter(iris_loader)
next(batch_iterator)

[tensor([[6.3000, 3.3000, 6.0000, 2.5000],
         [6.2000, 3.4000, 5.4000, 2.3000],
         [6.9000, 3.2000, 5.7000, 2.3000],
         [5.2000, 3.5000, 1.5000, 0.2000],
         [5.1000, 3.8000, 1.6000, 0.2000],
         [4.9000, 3.0000, 1.4000, 0.2000],
         [5.4000, 3.4000, 1.5000, 0.4000],
         [6.0000, 2.2000, 4.0000, 1.0000],
         [4.9000, 2.4000, 3.3000, 1.0000],
         [4.7000, 3.2000, 1.3000, 0.2000],
         [6.6000, 2.9000, 4.6000, 1.3000],
         [4.9000, 3.1000, 1.5000, 0.1000],
         [4.6000, 3.2000, 1.4000, 0.2000],
         [6.5000, 3.0000, 5.5000, 1.8000],
         [6.3000, 2.8000, 5.1000, 1.5000],
         [6.4000, 3.2000, 5.3000, 2.3000],
         [6.9000, 3.1000, 5.4000, 2.1000],
         [6.2000, 2.8000, 4.8000, 1.8000],
         [5.5000, 3.5000, 1.3000, 0.2000],
         [5.0000, 2.3000, 3.3000, 1.0000],
         [5.0000, 3.0000, 1.6000, 0.2000],
         [6.4000, 2.8000, 5.6000, 2.2000],
         [5.3000, 3.7000, 1.5000, 0.2000],
         [6