# HW1-2: Optimization

In [None]:
import os
# Tolerate more than one OpenMP runtime being loaded into the process
# (must be set before torch / numpy are imported to have any effect).
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
# The original line only bound a Python variable that nothing reads; the
# setting is consumed from the process environment, so export it as well.
# The module-level name is kept for backward compatibility.
TORCH_CUDA_ARCH_LIST = "8.6"
os.environ["TORCH_CUDA_ARCH_LIST"] = TORCH_CUDA_ARCH_LIST

import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.backends.cudnn as cudnn
import torchvision.transforms as transformtransforms

from torchvision import models
from torchsummary import summary
from torchvision import transforms

from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import ToPILImage
from tqdm import tqdm

import cv2
import copy
import math
import random
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from glob import glob

# '__file__' is deliberately a string literal here: abspath() resolves it
# against the current working directory, so Project_PATH ends up being the
# directory the notebook is run from.
_notebook_marker = os.path.abspath('__file__')
Project_PATH = os.path.dirname(_notebook_marker)
# Both derived locations keep a trailing '/' so file names can be appended.
outputs_dir = f"{Project_PATH}/"
model_path = f"{Project_PATH}/save_models/"


In [None]:
# Report the CUDA environment and choose the device used by the cells below.
# The device-specific queries are guarded because torch.cuda.current_device()
# raises on a machine without a usable GPU; in that case fall back to the CPU,
# matching the fallback already used inside train().
cuda_available = torch.cuda.is_available()
print(cuda_available)
print(torch.cuda.device_count())
if cuda_available:
    device_default = torch.cuda.current_device()
    print(device_default)
    # The former bare `torch.cuda.device(device_default)` call only built a
    # context manager and discarded it, so it has been dropped.
    print(torch.cuda.get_device_name(device_default))
    device = torch.device("cuda")
else:
    device_default = None
    device = torch.device("cpu")
print(torch.version.cuda)
print(torch.__version__)
print(torch.cuda.get_arch_list())

## HW1-2 Visualize the Optimization Process

In [None]:
class DNN_MNIST(nn.Module):
    """Three-layer fully connected classifier: 784 -> 32 -> 16 -> 10.

    The input is expected to be already flattened to 28*28 features per
    sample; the output is one unnormalised score per class (10 classes).
    """

    def __init__(self):
        super().__init__()
        # Attribute names (layer1..layer3) are part of the parameter names,
        # e.g. 'layer2.0.weight', which other cells look up by string.
        self.layer1 = nn.Sequential(
            nn.Linear(28 * 28, 32),
            nn.ReLU(inplace=True),
        )
        self.layer2 = nn.Sequential(
            nn.Linear(32, 16),
            nn.ReLU(inplace=True),
        )
        self.layer3 = nn.Sequential(
            nn.Linear(16, 10),
        )

    def forward(self, x):
        """Apply the three stages in order and return the class scores."""
        hidden = self.layer1(x)
        hidden = self.layer2(hidden)
        return self.layer3(hidden)
    
# Instantiate the MNIST network and print its layer summary.
# Use the GPU when one is available and otherwise the CPU (the same fallback
# train() applies) instead of unconditionally requesting "cuda".
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
Model_DNN_MNIST = DNN_MNIST().to(device)
summary(Model_DNN_MNIST, input_size=(1,28*28))


class CNN_CIFAR_3(nn.Module):
    """Small CNN: three conv stages followed by three fully connected stages.

    The first linear layer takes 32*4*4 features, which is what the three
    3x3 convolutions and two 2x2 poolings leave from a 3x32x32 input.
    The network returns 10 unnormalised class scores per sample.
    """

    def __init__(self):
        super().__init__()
        # Convolutional stages; the first two end with 2x2 max pooling.
        self.layer1 = nn.Sequential(
            nn.Conv2d(3, 10, 3),
            nn.BatchNorm2d(10),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=(2, 2), stride=2),
        )
        self.layer2 = nn.Sequential(
            nn.Conv2d(10, 16, 3),
            nn.BatchNorm2d(16),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=(2, 2), stride=2),
        )
        self.layer3 = nn.Sequential(
            nn.Conv2d(16, 32, 3),
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True),
        )
        # Fully connected stages operating on the flattened feature map.
        self.layer4 = nn.Sequential(
            nn.Linear(32 * 4 * 4, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(inplace=True),
        )
        self.layer5 = nn.Sequential(
            nn.Linear(64, 16),
            nn.BatchNorm1d(16),
            nn.ReLU(inplace=True),
        )
        self.layer6 = nn.Sequential(
            nn.Linear(16, 10),
        )

    def forward(self, x):
        """Run the conv stages, flatten per sample, then run the linear stages."""
        for conv_stage in (self.layer1, self.layer2, self.layer3):
            x = conv_stage(x)
        batch_size = x.size()[0]
        x = x.view(batch_size, -1)
        for dense_stage in (self.layer4, self.layer5, self.layer6):
            x = dense_stage(x)
        return x
    
# Instantiate the CIFAR network and print its layer summary.
# Use the GPU when one is available and otherwise the CPU (the same fallback
# train() applies) instead of unconditionally requesting "cuda".
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
Model = CNN_CIFAR_3().to(device)
summary(Model, input_size=(3,32,32))


In [None]:
import os
# Tolerate more than one OpenMP runtime being loaded into the process.
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
# The original line only bound a Python variable that nothing reads; the
# setting is consumed from the process environment, so export it as well.
# The module-level name is kept for backward compatibility.
TORCH_CUDA_ARCH_LIST = "8.6"
os.environ["TORCH_CUDA_ARCH_LIST"] = TORCH_CUDA_ARCH_LIST

import torch
import torchvision
import torchvision.transforms.functional as TF
from torch import nn
from torchvision import transforms
from torchvision.transforms import ToPILImage
from torch.utils.data import Dataset, DataLoader

import cv2
import random
import numpy as np
from PIL import Image
from glob import glob



def _flatten_weights(model, keep):
    """Concatenate every parameter of ``model`` whose name satisfies ``keep`` into one 1-D array."""
    # The empty float64 seed fixes the result dtype and yields an empty array
    # when no parameter name matches.
    pieces = [np.zeros(0)]
    pieces.extend(
        param.detach().cpu().numpy().reshape(-1)
        for name, param in model.named_parameters()
        if keep(name)
    )
    return np.concatenate(pieces)


def _gradient_norm(model):
    """Return the L2 norm over all gradients currently stored on ``model``'s parameters."""
    squared_sum = 0.0
    for param in model.parameters():
        if param.grad is not None:
            squared_sum += (param.grad.detach().cpu().numpy() ** 2).sum()
    return squared_sum ** 0.5


def train(model_name, Epochs=20, Batch=2000, Data_workers=0, LR=0.1):
    """Train ``model_name()`` on MNIST and record the optimisation trajectory.

    Every image is flattened to 28*28 features before it is fed to the model,
    so the model must accept ``(batch, 784)`` input and return class scores
    suitable for ``nn.CrossEntropyLoss``.

    Args:
        model_name: Zero-argument callable returning the ``nn.Module`` to train.
        Epochs: Number of passes over the training set.
        Batch: Mini-batch size used for both the train and the test loader.
        Data_workers: ``num_workers`` forwarded to both ``DataLoader`` objects.
        LR: Learning rate of the SGD optimiser (momentum is fixed at 0.9).

    Returns:
        A list of eight lists with one entry per epoch:
        ``[trainloss_list, testloss_list, accuracy_list, lr_list, w, w_1,
        w_loss, grad_list]`` where

        * ``trainloss_list`` / ``testloss_list`` hold the *sum* of the
          per-batch losses,
        * ``accuracy_list`` holds the test accuracy in percent,
        * ``lr_list`` holds the learning rate of the first parameter group,
        * ``w`` holds all parameters whose name ends in ``'weight'``,
          flattened and concatenated,
        * ``w_1`` holds the flattened ``'layer2.0.weight'`` parameter,
        * ``w_loss`` holds the same values as ``trainloss_list``,
        * ``grad_list`` holds the L2 norm of the gradients left over from
          the last training batch of the epoch.
    """
    # ---- Load datasets (downloaded to ./data/ on first use) ----
    to_tensor = transforms.ToTensor()
    trainset = torchvision.datasets.MNIST(root='./data/', train=True, download=True, transform=to_tensor)
    testset = torchvision.datasets.MNIST(root='./data/', train=False, download=True, transform=to_tensor)
    trainloader = DataLoader(trainset, batch_size=Batch, shuffle=True, num_workers=Data_workers)
    testloader = DataLoader(testset, batch_size=Batch, shuffle=True, num_workers=Data_workers)
    print(trainset.classes)
    print(trainset.data.shape)
    print(testset.data.shape)

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    Model = model_name().to(device)
    num_param = sum(param.numel() for param in Model.parameters())
    print('# of total parameters: ', num_param)

    # ---- Loss & optimizer ----
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(Model.parameters(), lr=LR, momentum=0.9)
    # NOTE(review): the scheduler is constructed but never stepped, so the
    # learning rate recorded in lr_list stays at LR for the whole run.
    # Confirm whether a per-epoch scheduler.step() was intended before adding
    # one, since it would change the training trajectory.
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.8)

    trainloss_list = []
    testloss_list = []
    accuracy_list = []
    lr_list = []
    w = []
    w_1 = []
    w_loss = []
    grad_list = []

    for epoch in range(Epochs):
        # ---- Training ----
        Model.train()
        train_loss = 0.0
        for images, labels in trainloader:
            images = images.view(-1, 28*28).to(device)
            labels = labels.to(device)
            outputs = Model(images)
            loss = criterion(outputs, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        # ---- Evaluating ----
        Model.eval()
        test_loss = 0.0
        correct = 0
        with torch.no_grad():
            for images, labels in testloader:
                images = images.view(-1, 28*28).to(device)
                labels = labels.to(device)
                outputs = Model(images)
                test_loss += criterion(outputs, labels).item()
                _, pred = torch.max(outputs, 1)
                # .item() keeps `correct` (and therefore `accuracy`) a plain
                # Python number instead of a 0-dim tensor.
                correct += (pred == labels).sum().item()
        total = len(testloader.dataset)
        accuracy = 100.0 * correct / total

        # ---- Save loss ----
        lr_list.append(optimizer.param_groups[0]['lr'])
        trainloss_list.append(train_loss)
        testloss_list.append(test_loss)
        accuracy_list.append(accuracy)
        # test_loss is a sum over batches; divide by the batch count so the
        # printed value really is the average the message announces.
        average_test_loss = test_loss / max(len(testloader), 1)
        print('{}/{} Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)'.format(
                epoch, Epochs, average_test_loss, correct, total, accuracy))

        # ---- Weight collect (once per epoch) ----
        weights_layer = _flatten_weights(Model, lambda name: name == 'layer2.0.weight')
        print(weights_layer.shape)
        w_1.append(weights_layer)

        weights = _flatten_weights(Model, lambda name: name[-6:] == 'weight')
        print(weights.shape)
        w.append(weights)
        w_loss.append(train_loss)

        # ---- Grad collect ----
        grad_list.append(_gradient_norm(Model))

    return [trainloss_list,
            testloss_list,
            accuracy_list,
            lr_list,
            w,
            w_1,
            w_loss,
            grad_list]
    
# [trainloss_list,testloss_list,accuracy_list,lr_list,w,w_loss,grad_list] = train(CNN_CIFAR_3)


### Collect weights of the model

In [None]:
# PCA is used at the end of this cell, but the notebook otherwise imports it
# only in a later cell; import it here so the cell runs top to bottom.
from sklearn.decomposition import PCA

# Number of independent training runs ("events") to record.
events = 8
W = []       # per run: all weights, one flattened vector per epoch
W_loss = []  # per run: training loss per epoch
W_1 = []     # per run: 'layer2.0.weight' only, one flattened vector per epoch
G = []       # per run: gradient norm per epoch

for i in range(events):
    print('Event: '+str(i+1))
    # train() flattens every image to 28*28 features, so it is run with the
    # fully connected MNIST model here.
    [_,_,_,_,w,w_1,w_loss,grad_list] = train(DNN_MNIST)
    W.append(w)
    W_1.append(w_1)
    W_loss.append(w_loss)
    G.append(grad_list)

print(np.array(W).shape)
print(np.array(W_1).shape)

# Quick look at the last run only: project its per-epoch weight vectors onto
# two principal components and scatter them.
w = np.array(w)
print(w.shape)

pca = PCA(n_components=2)
pca.fit(w)
w_new = pca.transform(w)

print(w_new)
plt.scatter(w_new[:,0],w_new[:,1])

### Reduce the weight dimension to 2 with PCA

In [None]:
from sklearn.decomposition import PCA

def W_2(W_i):
    """Project the rows of ``W_i`` onto their first two principal components.

    ``W_i`` is converted with ``np.array``; a PCA with two components is
    fitted on it and the same data is then transformed and returned.
    """
    samples = np.array(W_i)
    projector = PCA(n_components=2).fit(samples)
    return projector.transform(samples)
# 2-D PCA projection of the full weight vectors, one array per training run.
W_pca = np.array([W_2(W[i]) for i in range(events)])

# Same projection restricted to the single recorded layer.
W_1_pca = np.array([W_2(W_1[i]) for i in range(events)])


In [None]:
### Plot weight

In [None]:
def _plot_weight_trajectories(points_per_event, title):
    """Scatter the 2-D points of every training run and annotate each point with its training loss."""
    plt.figure(figsize=(20,10))
    plt.xlabel('w1',fontsize=20)
    plt.ylabel('w2',fontsize=20)
    plt.title(title,fontsize=20)
    for i in range(events):
        W_i = points_per_event[i]
        # One labelled scatter per run so the legend can tell the runs apart.
        plt.scatter(W_i[:,0], W_i[:,1], label='event '+str(i+1))
        for j in range(len(W_i)):
            plt.annotate(round(W_loss[i][j],1), (W_i[j,0],W_i[j,1]))
    # The legend has to be created after the labelled artists exist;
    # calling it before plotting produced an empty legend.
    plt.legend(fontsize=20)
    plt.show()


_plot_weight_trajectories(W_pca, 'Model Weights')
_plot_weight_trajectories(W_1_pca, 'Layer Weights')

## Visualize error surface

In [None]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
    
fig = plt.figure(figsize=(20,10))
# Create the 3-D axes through the figure so they are actually attached to it;
# constructing Axes3D(fig) directly no longer adds the axes in recent
# Matplotlib releases and leaves the figure empty.
ax = fig.add_subplot(111, projection='3d')

# Stack the 2-D PCA points of all runs and the matching training losses.
W_all = W_pca.reshape((events*len(W_pca[0]),2))
W_loss_all = np.array(W_loss).reshape(-1)

# 10x10 grid on which the losses are rasterised; cells never hit stay at 0.
x = np.linspace(0,10,10)
y = np.linspace(0,10,10)
X, Y = np.meshgrid(x, y)
z = np.zeros_like(X)

# Scale all coordinates to [0, 1] with one global min/max, then keep the
# maximum strictly below 1 so that it maps to grid index 9 rather than 10.
error = (W_all-np.min(W_all))/(np.max(W_all)-np.min(W_all))
error[error==1] = 0.99

# `np.int` was removed from NumPy; the builtin int gives the same truncation.
cells = (error*10).astype(int)
# NOTE(review): z is indexed [first component, second component] while
# meshgrid varies X along columns, so the two axes appear swapped in the
# surface plot; kept as in the original, confirm if that is intended.
for (zx, zy), loss_value in zip(cells, W_loss_all):
    z[zx, zy] = loss_value

ax.plot_surface(X, Y, z, cmap='rainbow')
ax.view_init(elev=30, azim=-15)
plt.show()
    

## HW1-2 Observe Gradient Norm During Training

In [None]:
class DNN_MNIST_3(nn.Module):
    """Fully connected network mapping one input feature to one output: 1 -> 32 -> 16 -> 1."""

    def __init__(self):
        super().__init__()
        self.layer1 = nn.Sequential(
            nn.Linear(1, 32),
            nn.ReLU(inplace=True),
        )
        self.layer2 = nn.Sequential(
            nn.Linear(32, 16),
            nn.ReLU(inplace=True),
        )
        self.layer3 = nn.Sequential(
            nn.Linear(16, 1),
        )

    def forward(self, x):
        """Apply the three stages in order and return the single output value per sample."""
        hidden = self.layer1(x)
        hidden = self.layer2(hidden)
        return self.layer3(hidden)
    
# device = torch.device("cuda")
# Model_DNN_MNIST = DNN_MNIST().to(device)
# summary(Model_DNN_MNIST, input_size=(1000,28*28))

# Instantiate the 1 -> 1 regression network and print its layer summary.
# Use the GPU when one is available and otherwise the CPU (the same fallback
# train() applies) instead of unconditionally requesting "cuda".
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
Model = DNN_MNIST_3().to(device)
summary(Model, (1,1))


In [None]:
# Initiate data: 1000 evenly spaced sample points in [0, 1] as a column.
x = torch.linspace(0, 1, 1000).reshape(-1, 1)

# Target curve sin(5*pi*x) / (5*pi*x).
phase = 5*np.pi*x
y = torch.sin(phase)/phase
# The first sample is 0/0 (NaN); replace it with its right-hand neighbour.
y[0] = y[1]
func_1 = y

# Define train function
def train(function,
          model_name,
          Epochs = 20000,
          Batch  = 1000,
          Data_workers = 0,
          LR = 0.0005):
    """Fit ``model_name()`` to ``function`` on 1000 evenly spaced points of [0, 1].

    The whole data set is used as a single batch in every epoch.

    NOTE(review): this definition replaces the MNIST ``train`` defined in an
    earlier cell, which has a different signature; re-run that cell before
    calling the MNIST version again.

    Args:
        function: Target tensor compared against the model output with
            ``nn.MSELoss``; presumably of shape ``(1000, 1)`` to match the
            sample points — confirm against the caller.
        model_name: Zero-argument callable returning the ``nn.Module`` to fit.
        Epochs: Number of optimisation steps.
        Batch: Unused; kept so existing calls keep working.
        Data_workers: Unused; kept so existing calls keep working.
        LR: Learning rate of the Adam optimiser.

    Returns:
        ``[Model, trainloss_list, lr_list, grad_list]`` — the trained model
        followed by the per-epoch loss, learning rate and gradient L2 norm.
    """
    # ---- Initiate model and data ----
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    Model = model_name().to(device)
    x = torch.linspace(0,1,1000).unsqueeze(1).to(device)
    y = function.to(device)

    # ---- Loss & optimizer ----
    criterion = nn.MSELoss()
    optimizer = optim.Adam(Model.parameters(), lr=LR)
    # NOTE(review): the scheduler is constructed but not stepped (the step
    # call was commented out in the original), so the learning rate stays
    # at LR for the whole run.
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size = 100, gamma = 0.8)

    # Print roughly ten progress lines per run. max(1, ...) avoids the
    # modulo-by-zero the original hit whenever Epochs was smaller than 10.
    log_every = max(1, Epochs // 10)

    # ---- Training ----
    trainloss_list = []
    lr_list = []
    grad_list = []
    for epoch in range(Epochs):
        Model.train()
        y_pred = Model(x)
        loss = criterion(y_pred, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss = loss.item()
        trainloss_list.append(train_loss)
        if epoch % log_every == 0:
            print('{}/{}, loss: {}'.format(epoch,Epochs,train_loss))
        lr_list.append(optimizer.param_groups[0]['lr'])

        # ---- Grad collect: L2 norm over the gradients of all parameters ----
        grad_all = 0.0
        for p in Model.parameters():
            if p.grad is not None:
                grad_all += (p.grad.detach().cpu().numpy()**2).sum()
        grad_list.append(grad_all**0.5)

    return [Model,trainloss_list,lr_list,grad_list]

In [None]:
# Fit the regression network to func_1 and keep the per-epoch histories.
Model, trainloss, lr, grad_list = train(
    func_1,
    DNN_MNIST_3,
    Epochs=20000,
    Batch=1000,
    Data_workers=0,
    LR=0.005,
)

In [None]:
# Training loss per epoch; the values are raised to the power 0.2 before
# plotting, which compresses the range of the curve.
plt.figure(figsize=(20,10))
plt.plot(np.array(trainloss)**0.2, label='train_loss')
plt.xlabel('epoch',fontsize=20)
plt.ylabel('loss',fontsize=20)
plt.title('sin(5*np.pi*x)/(5*np.pi*x)',fontsize=20)
plt.legend(fontsize=20)
plt.show()

# Gradient norm per epoch (axis label typo 'gard' corrected to 'grad').
plt.figure(figsize=(20,10))
plt.plot(np.array(grad_list), label='grad')
plt.xlabel('epoch',fontsize=20)
plt.ylabel('grad',fontsize=20)
plt.title('sin(5*np.pi*x)/(5*np.pi*x)',fontsize=20)
plt.legend(fontsize=20)
plt.show()