In [1]:
## This file contains training code to learn the weights
## Author : Avadesh Meduri
## Date : 9/05/2022

%matplotlib inline
%load_ext autoreload
%autoreload 2

import os
import time
import pathlib

python_path = pathlib.Path('.').absolute().parent/'python'
os.sys.path.insert(1, str(python_path))

In [2]:
import numpy as np
import torch
from torch.autograd import Function
from torch.nn import functional as F
from torch.utils import data
from torch.utils.data import Dataset
from torch.utils.data import DataLoader, RandomSampler, Sampler

from IPython.display import display, clear_output

In [18]:
class WeightDataSet(Dataset):
    
    def __init__(self, indices, mean = None, std = None):
        

        for i in range(len(indices)):
            if i == 0:
                self.x_train = torch.load("../data/x_train" + str(indices[i]) + ".pt")
                self.y_train = torch.load("../data/y_train" + str(indices[i]) + ".pt")
        
            else:
                x_train = torch.load("../data/x_train" + str(indices[i]) + ".pt")
                y_train = torch.load("../data/y_train" + str(indices[i]) + ".pt")
                
                self.x_train = torch.vstack((self.x_train, x_train))
                self.y_train = torch.vstack((self.y_train, y_train))
        
        
        if isinstance(mean, np.ndarray) and isinstance(std, np.ndarray):
            print("using given mean")
            self.mean = mean
            self.std = std
            self.y_train = (self.y_train - self.mean)/self.std
        else:
            
            self.mean = torch.mean(self.y_train, axis = 0)
            self.std = torch.std(self.y_train, axis = 0)
            z = torch.where(self.std == 0) ## checking where std and mean are zero
            self.std[z] = 1.0
            
            self.y_train = (self.y_train - self.mean)/self.std
    
    def get_mean_std(self):
        return self.mean, self.std
    
    def get_nn_size(self):
        return self.x_train.shape[1], self.y_train.shape[1]
    
    def __len__(self):
        return int(len(self.y_train))
    
    def __getitem__(self, gidx):
        return self.x_train[gidx], self.y_train[gidx]

In [19]:
class Net(torch.nn.Module):

    def __init__(self, inp_size, out_size):
        super(Net, self).__init__()
        self.fc1 = torch.nn.Linear(inp_size, 512)
        self.fc2 = torch.nn.Linear(512, 512)
        self.out = torch.nn.Linear(512, out_size)

    def forward(self, x):
       
        x = torch.tanh(self.fc1(x))
        x = torch.tanh(self.fc2(x))
        x = self.out(x)
        return x

In [20]:
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    print("using cpu")
    device = torch.device("cpu")

lr = 1.0e-3
eps = 1000
indices = [5]
test_indices = [4]

train_dataset = WeightDataSet(indices)
mean, std = train_dataset.get_mean_std()
inp, out = train_dataset.get_nn_size()

net = Net(inp, out)
net_name = "model1"

# net.load_state_dict(torch.load("./models/" + net_name))
net = net.to(device)
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
loss = torch.nn.L1Loss() #torch.nn.MSELoss() #torch.nn.HuberLoss()

data = DataLoader(train_dataset, \
                  batch_size = 64, num_workers = 12, shuffle=True)
test = DataLoader(WeightDataSet(test_indices, mean = mean, std = std), \
                  batch_size = 64, num_workers = 12, shuffle=True)

using cpu


In [17]:
best_loss = 1e3
last_save = 0
loss_arr = []
test_arr = []
for i in range(eps):
    m_e = []
    for x_train_batch, y_train_batch in data:
        x_train_gpu = x_train_batch.to(device)
        y_train_gpu = y_train_batch.to(device)
        y_pred = net(x_train_gpu)
        error = loss(y_pred, y_train_gpu) 
        m_e.append(error.cpu().detach().numpy())
        optimizer.zero_grad()
        error.backward()
        optimizer.step()
    
    m_error = np.mean(m_e)
    loss_arr.append(m_error)

    t_e = []
    for x_test_batch, y_test_batch in test:
        x_test_gpu = x_test_batch.to(device)
        y_test_gpu = y_test_batch.to(device)
        y_pred = net(x_test_gpu)
        error = loss(y_pred, y_test_gpu) 
        t_e.append(error.cpu().detach().numpy())
    
    t_error = np.mean(t_e)
    test_arr.append(t_error)

    #plotting
    plt.plot(loss_arr, label = "training loss")
    plt.plot(test_arr, label = "test loss")

    plt.yscale("log")
    plt.grid()
    plt.legend()
    clear_output(wait=True)
    plt.show()
  
    print("The iteration number : " + str(i) + " The loss is :" + str(m_error) + \
                          " Last save :"  + str(last_save), end='\r', flush  = True)    
    
    
    if best_loss > m_error:
        last_save = i
        torch.save(net.state_dict(), "./models/" + net_name)
        best_loss = m_error
    if i == 2:
        optimizer = torch.optim.Adam(net.parameters(), lr=lr/10)
    if i == 70:
        optimizer = torch.optim.Adam(net.parameters(), lr=lr/20)

        
torch.save(net.state_dict(), "./models/" + net_name)   

Exception ignored in: Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fc4b4e35700><function _MultiProcessingDataLoaderIter.__del__ at 0x7fc4b4e35700>

Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/ameduri/.local/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1328, in __del__
  File "/home/ameduri/.local/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1328, in __del__
        self._shutdown_workers()self._shutdown_workers()
Exception ignored in: 
  File "/home/ameduri/.local/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1320, in _shutdown_workers
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fc4b4e35700>  File "/home/ameduri/.local/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1320, in _shutdown_workers
    <function _MultiProcessingDataLoaderIter.__del__ at 0x7fc4b4e35700>Exception ignored in: if w.is_alive():    

RuntimeError: Caught RuntimeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/ameduri/.local/lib/python3.8/site-packages/torch/utils/data/_utils/worker.py", line 287, in _worker_loop
    data = fetcher.fetch(index)
  File "/home/ameduri/.local/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 47, in fetch
    return self.collate_fn(data)
  File "/home/ameduri/.local/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 84, in default_collate
    return [default_collate(samples) for samples in transposed]
  File "/home/ameduri/.local/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 84, in <listcomp>
    return [default_collate(samples) for samples in transposed]
  File "/home/ameduri/.local/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 56, in default_collate
    return torch.stack(batch, 0, out=out)
RuntimeError: stack(): functions with out=... arguments don't support automatic differentiation, but one of the arguments requires grad.



Exception ignored in: Traceback (most recent call last):
<function _MultiProcessingDataLoaderIter.__del__ at 0x7fc4b4e35700>Traceback (most recent call last):
can only test a child process<function _MultiProcessingDataLoaderIter.__del__ at 0x7fc4b4e35700><function _MultiProcessingDataLoaderIter.__del__ at 0x7fc4b4e35700>

Traceback (most recent call last):


  File "/home/ameduri/.local/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1328, in __del__
    self._shutdown_workers()      File "/home/ameduri/.local/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1328, in __del__
self._shutdown_workers()    
self._shutdown_workers()  File "/home/ameduri/.local/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1320, in _shutdown_workers
    if w.is_alive():
self._shutdown_workers()
  File "/home/ameduri/.local/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1320, in _shutdown_workers
  File "/usr/lib/python3.8/multiprocess