In [21]:
import numpy as np
from torchvision import transforms
import torch
import torch.nn.functional as F
from torch.utils.data.sampler import SubsetRandomSampler
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold

In [22]:
#############################################################################
def train_ann(ann=None, dataloader=None, criterion=None, epochs=None,
              optimizer=None, log_every=20):
    """Train ``ann`` in place, one optimizer step per batch of ``dataloader``.

    Parameters
    ----------
    ann : callable
        The model; invoked as ``ann(inputs)``.
    dataloader : iterable
        Yields ``(data, target, index)`` triples, re-iterated once per epoch.
    criterion : callable
        Invoked as ``criterion(outputs, labels)``; the result must support
        ``.backward()`` and ``.item()``.
    epochs : int
        Number of full passes over ``dataloader``.
    optimizer : optional
        Object exposing ``zero_grad()`` and ``step()``. When omitted, the
        module-level variable named ``optimizer`` is used, which is what this
        function relied on before the parameter existed.
    log_every : int, optional
        Print the mean loss of the last ``log_every`` batches every
        ``log_every`` batches. A falsy value disables progress printing.

    Raises
    ------
    ValueError
        If no optimizer is passed and no module-level ``optimizer`` exists.
    """
    if optimizer is None:
        # Legacy path: older callers define `optimizer` at module scope
        # instead of passing it in.
        optimizer = globals().get('optimizer')
        if optimizer is None:
            raise ValueError(
                "train_ann needs an optimizer: pass optimizer=... or define "
                "a module-level variable named 'optimizer'")

    for epoch in range(epochs):  # loop over the dataset multiple times
        running_loss = 0.0
        for step, (inputs, labels, index) in enumerate(dataloader):
            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = ann(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            if log_every and step % log_every == log_every - 1:
                # `index` may hold several pattern indices when the batch has
                # more than one element; report the last one so the %d format
                # always receives a single number.
                last_index = index[-1] if getattr(index, 'ndim', 0) > 0 else index
                print('[epoch %d, pattern number %d] loss: %.3f' %
                      (epoch + 1, int(last_index), running_loss / log_every))
                running_loss = 0.0

    print('Finished Training')


In [23]:
class MyDataset(torch.utils.data.Dataset):
    """Dataset over paired inputs ``X`` and targets ``y``.

    Each item is returned together with the position it was requested at,
    so consumers can tell which pattern a sample corresponds to.
    """

    def __init__(self, X, y):
        """Store ``X`` and ``y``; both must have the same length."""
        self.X, self.y = X, y

        assert len(X) == len(y)

    def __len__(self):
        """Number of patterns, taken from ``X``."""
        return len(self.X)

    def __getitem__(self, i):
        """Return ``(X[i], y[i], i)``."""
        # The requested position is handed back as a third element so the
        # pattern/example index is still available after batching.
        return self.X[i], self.y[i], i


In [24]:
# this is one way to define a network
class Net(torch.nn.Module):
    """Fully connected network with a single ReLU hidden layer.

    ``n_feature`` inputs -> ``n_hidden`` ReLU units -> ``n_output`` linear
    outputs.
    """

    def __init__(self, n_feature, n_hidden, n_output):
        super().__init__()
        # Input -> hidden projection.
        self.hidden = torch.nn.Linear(n_feature, n_hidden)
        # Hidden -> output projection (no activation applied afterwards).
        self.predict = torch.nn.Linear(n_hidden, n_output)

    def forward(self, x):
        """Apply the hidden layer with ReLU, then the linear output layer."""
        activated = F.relu(self.hidden(x))
        return self.predict(activated)

In [30]:
if __name__ == '__main__':
    # Seed torch's RNG so the noise drawn below is repeatable.
    torch.manual_seed(1)

    # Inputs: 4000 evenly spaced points in [-1, 1] as a column, shape (4000, 1).
    X = torch.linspace(-1, 1, 4000).unsqueeze(1)
    # Targets: x^2 plus uniform noise scaled by 0.2, same shape as X.
    y = X ** 2 + 0.2 * torch.rand(X.shape)

    # Wrap the tensors so every sample also carries its pattern index.
    dataset = MyDataset(X, y)

    # Number of folds used for cross-validation.
    k = 3

Split the indices into $k$ mutually exclusive subsets $\mathcal{D}_i$.

In [31]:
    # One position per pattern in the dataset; KFold splits these positions.
    indices = range(len(dataset))
    # shuffle=True permutes the positions before splitting. Seeding torch does
    # not affect that permutation, so a fixed random_state is needed for the
    # folds to be the same on every run.
    partitions = kf = KFold(n_splits=k, random_state=1, shuffle=True)

The error vector contains the error $e_i$ for every pattern $z^{(i)}$.
The size of this vector in a single-task scenario with continuous output
(univariate regression) for a dataset with $N$ patterns is $(1 \times N)$.

In [32]:
    # One slot per pattern, filled with that pattern's held-out loss.
    # The array must be floating point: an integer array (as produced by
    # np.arange) would truncate every stored loss such as 0.033 down to 0.
    error_vector = np.zeros(len(dataset), dtype=float)

In [33]:
    # `display` and `HTML` live in IPython.display; importing them from
    # IPython.core.display is deprecated.
    from IPython.display import display, HTML
    # Make scrollable output areas taller so the training log is readable.
    display(HTML("<style>div.output_scroll { height: 44em; }</style>"))

    # Loss used to score each held-out pattern (mean squared error).
    loss_function = torch.nn.MSELoss()

    for fold, (train_index, test_index) in enumerate(kf.split(indices), start=1):
        print("Training in fold number:", fold)

        # Define the network for this fold. It is a kind of weight reset.
        # In more complex scenarios we could use a different ANN for every
        # fold. For example, assuming there is a function taking an integer
        # and returning a network, we could write
        # net = get_network_for_fold(fold).
        net = Net(n_feature=1, n_hidden=10, n_output=1)

        # Hyperparameters are defined at module level here, but they could be
        # parameters of the training algorithm.
        # NOTE: train_ann is called below without an optimizer argument and
        # picks up the module-level name `optimizer`, so this variable must
        # keep its name.
        epochs = 2
        optimizer = torch.optim.SGD(net.parameters(), lr=0.2)
        criterion = torch.nn.MSELoss()

        # Training data for this fold: every pattern except those in D_i,
        # visited one at a time in random order.
        current_training_d_without_d_i = SubsetRandomSampler(
            indices=train_index)
        current_training_d_loader = torch.utils.data.DataLoader(
            dataset, batch_size=1, sampler=current_training_d_without_d_i)

        # Held-out partition D_i for this fold.
        current_d_i = SubsetRandomSampler(indices=test_index)
        current_d_i_loader = torch.utils.data.DataLoader(
            dataset, batch_size=1, sampler=current_d_i)

        # Train the ANN.
        # $f_i$ is the learning algorithm. In this case, it is the ANN with
        # the "best parameters" according to the loss function used inside
        # the training loop. Note that network architecture, loss function
        # (criterion) and number of iterations (epochs) remain constant.
        # However, these parameters could be changed to perform a model
        # selection/evaluation.
        train_ann(ann=net, dataloader=current_training_d_loader,
                  criterion=criterion, epochs=epochs)

        f_i = net

        # Calculate the loss between the trained model's output and each data
        # element of the current partition. A different loss function than the
        # one used for training could be used here; this script uses the same
        # one (MSE).
        current_loss = 0.0
        print("Validating in fold number:", fold)
        # Evaluation only needs forward passes, so skip gradient tracking.
        with torch.no_grad():
            for data, target, index in current_d_i_loader:
                outputs = net(data)
                loss = loss_function(outputs, target)
                current_loss = loss.item()

                # With batch_size=1 `index` holds exactly one pattern index.
                # Store the loss at that pattern's own slot: a per-fold
                # counter would make every fold overwrite the same leading
                # entries of error_vector.
                # NOTE(review): error_vector needs a float dtype for the loss
                # to be stored without truncation.
                pattern_index = int(index)
                error_vector[pattern_index] = current_loss
                if pattern_index % 20 == 19:    # print every 20th pattern index
                    print('[fold number %d, pattern number %d] current (single pattern) loss: %.3f' %
                          (fold, pattern_index, current_loss))
    print("Finished fold iterations")

Training in fold number: 1
[epoch 1, pattern number 3362] loss: 0.150
[epoch 1, pattern number 1147] loss: 0.090
[epoch 1, pattern number 3590] loss: 0.094
[epoch 1, pattern number 176] loss: 0.073
[epoch 1, pattern number 3199] loss: 0.072
[epoch 1, pattern number 1679] loss: 0.048
[epoch 1, pattern number 1672] loss: 0.009
[epoch 1, pattern number 1910] loss: 0.029
[epoch 1, pattern number 3858] loss: 0.021
[epoch 1, pattern number 2795] loss: 0.012
[epoch 1, pattern number 1503] loss: 0.020
[epoch 1, pattern number 907] loss: 0.009
[epoch 1, pattern number 2722] loss: 0.016
[epoch 1, pattern number 3065] loss: 0.017
[epoch 1, pattern number 3533] loss: 0.022
[epoch 1, pattern number 3664] loss: 0.007
[epoch 1, pattern number 2235] loss: 0.004
[epoch 1, pattern number 1345] loss: 0.010
[epoch 1, pattern number 848] loss: 0.010
[epoch 1, pattern number 531] loss: 0.011
[epoch 1, pattern number 1605] loss: 0.012
[epoch 1, pattern number 1730] loss: 0.009
[epoch 1, pattern number 3980] 

[epoch 2, pattern number 3986] loss: 0.006
[epoch 2, pattern number 2507] loss: 0.011
[epoch 2, pattern number 1001] loss: 0.005
[epoch 2, pattern number 1987] loss: 0.005
[epoch 2, pattern number 916] loss: 0.011
[epoch 2, pattern number 3571] loss: 0.010
[epoch 2, pattern number 3707] loss: 0.005
[epoch 2, pattern number 227] loss: 0.006
[epoch 2, pattern number 2406] loss: 0.004
[epoch 2, pattern number 927] loss: 0.007
[epoch 2, pattern number 3656] loss: 0.005
[epoch 2, pattern number 1487] loss: 0.005
[epoch 2, pattern number 2162] loss: 0.007
[epoch 2, pattern number 3661] loss: 0.008
[epoch 2, pattern number 2178] loss: 0.006
[epoch 2, pattern number 3137] loss: 0.005
[epoch 2, pattern number 1404] loss: 0.012
[epoch 2, pattern number 2945] loss: 0.007
[epoch 2, pattern number 1023] loss: 0.006
[epoch 2, pattern number 711] loss: 0.007
[epoch 2, pattern number 2109] loss: 0.011
[epoch 2, pattern number 1556] loss: 0.007
[epoch 2, pattern number 1824] loss: 0.007
[epoch 2, patte

[epoch 1, pattern number 3038] loss: 0.005
[epoch 1, pattern number 2674] loss: 0.006
[epoch 1, pattern number 2749] loss: 0.006
[epoch 1, pattern number 900] loss: 0.003
[epoch 1, pattern number 2808] loss: 0.010
[epoch 1, pattern number 2739] loss: 0.004
[epoch 1, pattern number 566] loss: 0.006
[epoch 1, pattern number 3837] loss: 0.006
[epoch 1, pattern number 3162] loss: 0.006
[epoch 1, pattern number 545] loss: 0.006
[epoch 1, pattern number 3228] loss: 0.006
[epoch 1, pattern number 1316] loss: 0.004
[epoch 1, pattern number 207] loss: 0.007
[epoch 1, pattern number 3744] loss: 0.006
[epoch 1, pattern number 2886] loss: 0.006
[epoch 1, pattern number 1237] loss: 0.005
[epoch 1, pattern number 934] loss: 0.007
[epoch 1, pattern number 970] loss: 0.007
[epoch 1, pattern number 1858] loss: 0.007
[epoch 1, pattern number 3426] loss: 0.007
[epoch 1, pattern number 3762] loss: 0.006
[epoch 1, pattern number 1736] loss: 0.006
[epoch 1, pattern number 3406] loss: 0.004
[epoch 1, pattern

[epoch 2, pattern number 2470] loss: 0.003
[epoch 2, pattern number 3223] loss: 0.008
[epoch 2, pattern number 1736] loss: 0.006
[epoch 2, pattern number 384] loss: 0.007
[epoch 2, pattern number 1770] loss: 0.007
[epoch 2, pattern number 3308] loss: 0.007
[epoch 2, pattern number 3727] loss: 0.007
Finished Training
Validating in fold number: 2
[fold number 2, pattern number 1459] current (single pattern) loss: 0.033
[fold number 2, pattern number 819] current (single pattern) loss: 0.003
[fold number 2, pattern number 3139] current (single pattern) loss: 0.005
[fold number 2, pattern number 539] current (single pattern) loss: 0.000
[fold number 2, pattern number 1559] current (single pattern) loss: 0.017
[fold number 2, pattern number 2099] current (single pattern) loss: 0.009
[fold number 2, pattern number 579] current (single pattern) loss: 0.010
[fold number 2, pattern number 879] current (single pattern) loss: 0.017
[fold number 2, pattern number 3239] current (single pattern) los

[epoch 1, pattern number 3605] loss: 0.006
[epoch 1, pattern number 1137] loss: 0.005
[epoch 1, pattern number 3077] loss: 0.007
[epoch 1, pattern number 2384] loss: 0.013
[epoch 1, pattern number 395] loss: 0.006
[epoch 1, pattern number 244] loss: 0.009
[epoch 1, pattern number 3236] loss: 0.007
[epoch 1, pattern number 27] loss: 0.009
[epoch 1, pattern number 537] loss: 0.006
[epoch 1, pattern number 1055] loss: 0.005
[epoch 1, pattern number 1689] loss: 0.006
[epoch 1, pattern number 3912] loss: 0.007
[epoch 1, pattern number 1069] loss: 0.007
[epoch 1, pattern number 433] loss: 0.011
[epoch 1, pattern number 2675] loss: 0.006
[epoch 1, pattern number 3931] loss: 0.006
[epoch 1, pattern number 2088] loss: 0.005
[epoch 1, pattern number 424] loss: 0.008
[epoch 1, pattern number 44] loss: 0.005
[epoch 1, pattern number 2889] loss: 0.006
[epoch 1, pattern number 2301] loss: 0.004
[epoch 1, pattern number 2338] loss: 0.004
[epoch 1, pattern number 2420] loss: 0.006
[epoch 1, pattern nu

[fold number 3, pattern number 2999] current (single pattern) loss: 0.003
Finished fold iterations
