<a href="https://colab.research.google.com/github/mamuncseru/deep_understanding_deep_learning/blob/main/DUDL_CrossValidation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# imports
import torch
import torch.nn as nn
import numpy as np
import seaborn as sns

In [2]:
# load the iris dataset (a pandas dataframe) through seaborn
iris = sns.load_dataset('iris')

# the first four columns are used as features: dataframe -> float tensor
featureColumns = iris.columns[0:4]
data = torch.tensor(iris[featureColumns].values).float()

# encode the species as integer class labels
# every sample starts at 0, so 'setosa' needs no explicit assignment
labels = torch.zeros(len(data), dtype=torch.long)
for speciesName, speciesCode in (('versicolor', 1), ('virginica', 2)):
    labels[iris.species == speciesName] = speciesCode

# Separate data into train and test

In [10]:
# Split the samples into a training set and a test set (no devset here).
# The result is a boolean mask: True -> training sample, False -> test sample.

# how many training examples
propTraining = .8 # in proportion, not percent
nTraining = int(len(labels)*propTraining)

# initialize a boolean vector to select data and labels
traintestBool = np.zeros(len(labels), dtype = bool)

# is this the correct way to select samples?
# traintestBool[range(nTraining)] = True
# (no: the labels are sorted by class, so taking the first nTraining samples
#  would leave only the last class in the test set)

## this is better: pick the training samples at random, without replacement.
# A seeded generator keeps the split -- and every number computed from it --
# reproducible across kernel restarts.
splitSeed = 42
splitRng = np.random.default_rng(splitSeed)
items2use4train = splitRng.choice(len(labels), size=nTraining, replace=False)
traintestBool[items2use4train] = True

traintestBool


array([False,  True, False,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True, False,  True,  True,  True,  True,  True, False,  True,
        True,  True, False,  True,  True,  True, False,  True,  True,
        True, False, False, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True, False,
        True,  True,  True, False,  True,  True,  True,  True,  True,
        True,  True, False,  True,  True,  True, False,  True,  True,
        True,  True, False,  True,  True,  True,  True,  True, False,
       False,  True,  True,  True,  True, False,  True,  True,  True,
        True, False,  True,  True,  True,  True,  True,  True, False,
        True,  True, False,  True,  True,  True,  True,  True, False,
        True,  True,  True, False,  True,  True,  True, False,  True,
        True, False,  True, False,  True,  True,  True,  True,  True,
        True,  True,

In [9]:
# inspect the label vector: the displayed output shows the samples are ordered
# by class (all 0s, then all 1s, then all 2s)
labels

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2])

In [11]:
# test whether it's balanced
# the labels are 0/1/2, and the full-data mean is 1; the training and test
# means should also be close to 1 if the split is roughly balanced
balanceReports = (
    ('Average of full data:', labels),
    ('Average of training data: ', labels[traintestBool]),
    ('Average of test data: ', labels[~traintestBool]),
)

for reporti, (header, labelSubset) in enumerate(balanceReports):
    # blank separator line between consecutive reports (none after the last)
    if reporti > 0:
        print('')
    print(header)
    print(labelSubset.float().mean())

Average of full data:
tensor(1.)

Average of training data: 
tensor(1.0167)

Average of test data: 
tensor(0.9333)


In [14]:
# create the ANN model

# layer sizes: 4 input features -> two hidden layers of 64 units -> 3 classes
nFeatures, nHidden, nClasses = 4, 64, 3

# model architecture, listed in forward order
annLayers = [
    nn.Linear(nFeatures, nHidden),  # input layer
    nn.ReLU(),                      # activation unit
    nn.Linear(nHidden, nHidden),    # hidden layer
    nn.ReLU(),                      # activation unit
    nn.Linear(nHidden, nClasses),   # output layer (raw class scores)
]
ANNiris = nn.Sequential(*annLayers)

# loss function (applied to the raw outputs of the last linear layer)
lossfun = nn.CrossEntropyLoss()

# optimizer: plain SGD over all model parameters
optimizer = torch.optim.SGD(ANNiris.parameters(), lr=0.01)


In [15]:
# print the shape of, in order: the entire dataset, the training set, the test set
for dataSubset in (data, data[traintestBool, :], data[~traintestBool, :]):
    print(dataSubset.shape)

torch.Size([150, 4])
torch.Size([120, 4])
torch.Size([30, 4])


# Train and Test the model

In [16]:
# train the model (full-batch gradient descent on the training samples only)
numepochs = 1000

# per-epoch bookkeeping: loss values and training accuracy (in percent)
losses = torch.zeros(numepochs)
ongoingAcc = []

# the training subset is the same in every epoch, so select it once
# instead of re-applying the boolean mask inside the loop
trainData = data[traintestBool, :]
trainLabels = labels[traintestBool]

# loop over epochs
for epochi in range(numepochs):

    # forward pass
    yHat = ANNiris(trainData)

    # compute accuracy: percentage of samples whose highest-scoring class
    # matches the label
    ongoingAcc.append( 100*torch.mean(
        (torch.argmax(yHat, dim=1) == trainLabels).float()
    ))

    # compute loss
    loss = lossfun(yHat, trainLabels)

    # store only the numeric value; assigning the loss tensor itself would
    # attach autograd history to `losses` and accumulate it across epochs
    losses[epochi] = loss.item()

    # backprop
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


In [18]:
# compute train and test accuracies (in percent)

# evaluation only: no gradients are needed, so skip building the autograd graph
with torch.no_grad():

    # final forward pass using training data
    predictions = ANNiris(data[traintestBool, :])
    trainacc = 100*torch.mean((torch.argmax(predictions, dim=1) == labels[traintestBool]).float())

    # final forward pass using test data
    predictions = ANNiris(data[~traintestBool, :])
    testacc = 100*torch.mean((torch.argmax(predictions, dim=1) == labels[~traintestBool]).float())

In [19]:
# report accuracies

# build both messages first (accuracies are scalar tensors -> plain floats)
trainReport = 'Final Train accuracy: %g%%' % trainacc.item()
testReport = 'Final Test accuracy: %g%%' % testacc.item()

# one message per line
print(trainReport, testReport, sep='\n')

Final Train accuracy: 97.5%
Final Test accuracy: 100%


In [None]:
# normally also inspect losses and accuracy by epoch, etc, etc, etc.
