# HW1-3: Generalization

In [None]:
import os
# Set before `import torch` so the setting is already in the process environment
# when the libraries load.
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
# A bare Python variable is never read by PyTorch; the architecture list is only
# honoured when it is exported through the environment. The module-level name
# is kept so existing references to it keep working.
TORCH_CUDA_ARCH_LIST="8.6"
os.environ["TORCH_CUDA_ARCH_LIST"] = TORCH_CUDA_ARCH_LIST

import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.backends.cudnn as cudnn
import torchvision.transforms as transformtransforms

from torchvision import models
from torchsummary import summary
from torchvision import transforms

from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import ToPILImage
from tqdm import tqdm

import cv2
import copy
import math
import random
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from glob import glob

# NOTE: '__file__' is a string literal here (not the module attribute), so it is
# resolved against the current working directory; the parent of that path is
# therefore the directory the notebook is run from.
_project_root, _unused_tail = os.path.split(os.path.abspath('__file__'))
Project_PATH = _project_root
# Output locations, both expressed as strings ending in '/'.
outputs_dir = f'{Project_PATH}/'
model_path = f'{Project_PATH}/save_models/'


In [None]:
# Report the CUDA environment and choose the notebook-wide default device.
# Device-specific queries are guarded so the cell also runs on a CPU-only
# machine, where torch.cuda.current_device() would otherwise raise.
cuda_available = torch.cuda.is_available()
print(cuda_available)
print(torch.cuda.device_count())
if cuda_available:
    device_default = torch.cuda.current_device()
    print(device_default)
    print(torch.cuda.get_device_name(device_default))
    device = torch.device("cuda")
else:
    # No GPU: fall back to the CPU instead of crashing.
    device_default = None
    device = torch.device("cpu")
print(torch.version.cuda)
print(torch.__version__)
print(torch.cuda.get_arch_list())

In [None]:
class DNN_MNIST_N(nn.Module):
    """Fully connected classifier with three configurable hidden layers.

    Each hidden stage is Linear -> BatchNorm1d -> in-place ReLU; the last
    stage is a single Linear layer mapping `hidden_3` to `out_dim`.
    """

    def __init__(self, in_dim, hidden_1, hidden_2, hidden_3, out_dim):
        super().__init__()
        # Stages are created in order so parameter initialisation consumes the
        # random number generator in the same sequence as attribute order.
        self.layer1 = self._hidden_stage(in_dim, hidden_1)
        self.layer2 = self._hidden_stage(hidden_1, hidden_2)
        self.layer3 = self._hidden_stage(hidden_2, hidden_3)
        self.layer4 = nn.Sequential(nn.Linear(hidden_3, out_dim))

    @staticmethod
    def _hidden_stage(n_in, n_out):
        """Build one Linear -> BatchNorm1d -> ReLU stage."""
        return nn.Sequential(
            nn.Linear(n_in, n_out),
            nn.BatchNorm1d(n_out),
            nn.ReLU(True),
        )

    def forward(self, x):
        """Apply layer1..layer4 in sequence and return the raw outputs."""
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = stage(x)
        return x
    
# device = torch.device("cuda")
# Model = DNN_MNIST_N(28*28,10,20,10,10).to(device)
# Model.eval()
# print('# of total parameters: ', sum(param.numel() for param in Model.parameters()))
# summary(Model, input_size=(1,28*28))

class DNN_MNIST_3(nn.Module):
    """Fixed-width fully connected classifier: 784 -> 256 -> 128 -> 64 -> 32 -> 10.

    Every stage except the last is Linear -> BatchNorm1d -> in-place ReLU; the
    last stage is a single Linear layer with 10 outputs.
    """

    def __init__(self):
        super().__init__()
        self.layer1 = self._hidden_stage(28*28, 256)
        self.layer2 = self._hidden_stage(256, 128)
        self.layer3 = self._hidden_stage(128, 64)
        self.layer4 = self._hidden_stage(64, 32)
        self.layer5 = nn.Sequential(nn.Linear(32, 10))

    @staticmethod
    def _hidden_stage(n_in, n_out):
        """Build one Linear -> BatchNorm1d -> ReLU stage."""
        return nn.Sequential(
            nn.Linear(n_in, n_out),
            nn.BatchNorm1d(n_out),
            nn.ReLU(True),
        )

    def forward(self, x):
        """Apply layer1..layer5 in sequence and return the raw outputs."""
        stages = (self.layer1, self.layer2, self.layer3, self.layer4, self.layer5)
        for stage in stages:
            x = stage(x)
        return x
    
# device = torch.device("cuda")
# Model_DNN_MNIST_3 = DNN_MNIST_3().to(device)
# summary(Model_DNN_MNIST_3, input_size=(1000,28*28))

def standardization(x):
    """Return `x` rescaled to zero mean and unit standard deviation.

    NaN entries are replaced by 0 before the statistics are computed. The
    input is copied, so the caller's data is never modified.

    Args:
        x: array-like of numbers.

    Returns:
        A NumPy array with the same shape as `x`. Empty input is returned
        unchanged (as an array). Input with zero spread has no meaningful
        z-score, so it yields an all-zero array instead of NaN/inf.
    """
    values = np.array(x)
    if not np.issubdtype(values.dtype, np.floating):
        # Integer/bool input cannot hold the result; floating input keeps its dtype.
        values = values.astype(float)
    values[np.isnan(values)] = 0
    if values.size == 0:
        return values
    spread = np.std(values)
    if spread == 0:
        # Avoid dividing by zero for constant input.
        return np.zeros_like(values)
    return (values - np.mean(values)) / spread


## Can a network fit random labels?

In [None]:

'''
Define train function
'''
def train_shuffle(model_name,
                Epochs = 100,
                Batch  = 2000,
                Data_workers = 0,
                LR = 0.01):
    '''
    Train a model on MNIST after randomly permuting the training labels.

    Only the training labels are permuted; the test set keeps its original
    labels. Images are flattened to 28*28 vectors before the forward pass.

    Args:
        model_name: an instantiated nn.Module (despite the parameter name).
        Epochs: number of passes over the training set.
        Batch: batch size used by both data loaders.
        Data_workers: `num_workers` passed to both data loaders.
        LR: initial SGD learning rate (StepLR: x0.8 every 5 epochs).

    Returns:
        [trainloss_list, testloss_list, accuracy_list, lr_list], one entry per
        epoch. Each loss is the SUM of the per-batch losses of that epoch (it
        is not divided by the number of batches). Accuracy is the test
        accuracy in percent as a plain float. lr_list holds the learning rate
        read after the scheduler step of that epoch.
    '''
    # Data
    to_tensor = transforms.ToTensor()
    trainset = torchvision.datasets.MNIST(root='./data/',train=True,download=True,transform=to_tensor)
    testset = torchvision.datasets.MNIST(root='./data/',train=False,download=True,transform=to_tensor)

    # Shuffle labels.
    # random.shuffle() swaps with `x[i], x[j] = x[j], x[i]`; on a tensor both
    # right-hand values are views, so the "swap" copies one label over the
    # other and the label multiset is corrupted. Indexing with a random
    # permutation keeps every label exactly once.
    permutation = torch.randperm(len(trainset.targets))
    trainset.targets = trainset.targets[permutation]

    trainloader = DataLoader(trainset, batch_size=Batch, shuffle=True, num_workers=Data_workers)
    testloader  = DataLoader(testset,  batch_size=Batch, shuffle=True, num_workers=Data_workers)
    print(trainset.classes)
    print(trainset.data.shape)
    print(testset.data.shape)

    # Model
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    Model = model_name.to(device)

    # Loss & optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(Model.parameters(), lr=LR, momentum=0.9)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size = 5, gamma = 0.8)

    trainloss_list = []
    testloss_list  = []
    accuracy_list  = []
    lr_list = []

    for epoch in range(Epochs):
        # Training
        Model.train()
        train_loss = 0.0
        for images, labels in trainloader:
            images = images.view(-1, 28*28).to(device)
            labels = labels.to(device)
            loss = criterion(Model(images), labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        # Evaluating
        Model.eval()
        test_loss = 0.0
        correct = 0
        with torch.no_grad():
            for images, labels in testloader:
                images = images.view(-1, 28*28).to(device)
                labels = labels.to(device)
                outputs = Model(images)
                test_loss += criterion(outputs, labels).item()
                # .item() keeps the counter a Python int rather than a tensor.
                correct += (outputs.argmax(dim=1) == labels).sum().item()
        total = len(testloader.dataset)
        accuracy = 100.0*correct/total

        # Save loss
        scheduler.step()
        lr_list.append(optimizer.param_groups[0]['lr'])
        trainloss_list.append(train_loss)
        testloss_list.append(test_loss)
        accuracy_list.append(accuracy)
        print('{}/{} Test set: Average loss: {:.4f}/{:.4f}, Accuracy: {}/{} ({:.2f}%) lr={}'.format(
                epoch, Epochs, train_loss,test_loss, correct, total, accuracy, lr_list[-1]))

    return [trainloss_list,
            testloss_list,
            accuracy_list,
            lr_list]


# Run the random-label experiment with the fixed-width MNIST network and the
# function's default hyper-parameters.
(
    trainloss_list,
    testloss_list,
    accuracy_list,
    lr_list,
) = train_shuffle(model_name=DNN_MNIST_3())

In [None]:
# Loss curves of the random-label run. The test loss is multiplied by 6 before
# plotting (presumably to put the summed per-batch losses of the smaller test
# set on the same scale as the training loss -- TODO confirm).
_, loss_axes = plt.subplots(figsize=(20,10))
loss_axes.plot(np.array(trainloss_list), label='train_loss')
loss_axes.plot(np.array(testloss_list)*6, label='test_loss')
loss_axes.set_xlabel('epoch',fontsize=20)
loss_axes.set_ylabel('loss',fontsize=20)
loss_axes.set_title('Loss',fontsize=20)
loss_axes.legend(fontsize=20)
plt.show()

# Test accuracy per epoch.
_, acc_axes = plt.subplots(figsize=(20,10))
acc_axes.plot(accuracy_list, label='accuracy')
acc_axes.set_xlabel('epoch',fontsize=20)
acc_axes.set_ylabel('accuracy',fontsize=20)
acc_axes.set_title('accuracy',fontsize=20)
acc_axes.legend(fontsize=20)
plt.show()

## Number of parameters vs. generalization

In [None]:

def train_CIFAR10(model_name, Epochs=20, Batch=2000, Data_workers=0, LR=0.1):
    '''
    Train a model on CIFAR-10 and record per-epoch train/test metrics.

    Images are normalised with mean 0.5 / std 0.5 per channel and flattened to
    3*32*32 vectors before the forward pass.

    Args:
        model_name: an instantiated nn.Module (despite the parameter name).
        Epochs: number of passes over the training set.
        Batch: batch size used by both data loaders.
        Data_workers: `num_workers` passed to both data loaders.
        LR: SGD learning rate; it stays constant for the whole run.

    Returns:
        [trainloss_list, testloss_list, train_acc_list, test_acc_list,
        lr_list, num_param]. The lists have one entry per epoch. Each loss is
        the SUM of the per-batch losses of that epoch (not divided by the
        number of batches). Accuracies are percentages as plain floats; the
        training accuracy is measured on the fly, in train mode, before each
        optimizer step. num_param is the total parameter count of the model.
    '''
    # Load datasets
    transform = transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.5,0.5,0.5),(0.5,0.5,0.5))])
    trainset = torchvision.datasets.CIFAR10(root='./data/',train=True,download=True,transform=transform)
    testset = torchvision.datasets.CIFAR10(root='./data/',train=False,download=True,transform=transform)
    trainloader = DataLoader(trainset, batch_size=Batch, shuffle=True, num_workers=Data_workers)
    testloader  = DataLoader(testset,  batch_size=Batch, shuffle=True, num_workers=Data_workers)
    print(trainset.classes)
    print(trainset.data.shape)
    print(testset.data.shape)

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    Model = model_name.to(device)
    num_param = sum(param.numel() for param in Model.parameters())
    print(model_name)
    print('# of total parameters: ', num_param)

    criterion = nn.CrossEntropyLoss()
    # No learning-rate scheduler is attached: the schedule step was disabled in
    # this experiment, so a scheduler object would only be dead weight.
    optimizer = optim.SGD(Model.parameters(), lr=LR, momentum=0.9)

    trainloss_list = []
    testloss_list  = []
    train_acc_list  = []
    test_acc_list = []
    lr_list = []
    train_total = len(trainloader.dataset)
    test_total = len(testloader.dataset)

    for epoch in range(Epochs):
        # Training
        Model.train()
        train_loss = 0.0
        train_correct = 0
        for images, labels in trainloader:
            images = images.view(-1, 3*32*32).to(device)
            labels = labels.to(device)
            outputs = Model(images)
            loss = criterion(outputs, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            # .item() keeps the counter a Python int rather than a tensor.
            train_correct += (outputs.argmax(dim=1) == labels).sum().item()
        train_acc = 100.0*train_correct/train_total

        # Evaluating
        Model.eval()
        test_loss = 0.0
        test_correct = 0
        with torch.no_grad():
            for images, labels in testloader:
                images = images.view(-1, 3*32*32).to(device)
                labels = labels.to(device)
                outputs = Model(images)
                test_loss += criterion(outputs, labels).item()
                test_correct += (outputs.argmax(dim=1) == labels).sum().item()
        test_acc = 100.0*test_correct/test_total

        # Save loss
        lr_list.append(optimizer.param_groups[0]['lr'])
        trainloss_list.append(train_loss)
        testloss_list.append(test_loss)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print('{}/{} Test set: Average loss: {:.4f}/{:.4f}, Accuracy: {}/{} ({:.2f}%)/({:.2f}%) lr={}'.format(
                epoch, Epochs, train_loss,test_loss, train_correct,test_correct, train_acc,test_acc, lr_list[-1]))

    return [trainloss_list, testloss_list,
            train_acc_list, test_acc_list,
            lr_list, num_param]
    


In [None]:
# Train the same architecture at ten increasing hidden widths on CIFAR-10.
# Each model is built and trained before the next one is created.
_hidden_widths = [
    (1, 2, 1),
    (2, 4, 2),
    (3, 6, 3),
    (5, 10, 5),
    (10, 20, 10),
    (20, 40, 20),
    (50, 100, 50),
    (100, 200, 100),
    (200, 400, 200),
    (500, 1000, 500),
]
_width_sweep = [
    train_CIFAR10(model_name=DNN_MNIST_N(3*32*32, h1, h2, h3, 10), Epochs=100)
    for h1, h2, h3 in _hidden_widths
]

# Expose every run under the numbered names the later cells expect; the fifth
# returned item (the learning-rate history) is discarded.
trainloss_1, testloss_1, train_acc_1, test_acc_1, _, num_param_1 = _width_sweep[0]
trainloss_2, testloss_2, train_acc_2, test_acc_2, _, num_param_2 = _width_sweep[1]
trainloss_3, testloss_3, train_acc_3, test_acc_3, _, num_param_3 = _width_sweep[2]
trainloss_4, testloss_4, train_acc_4, test_acc_4, _, num_param_4 = _width_sweep[3]
trainloss_5, testloss_5, train_acc_5, test_acc_5, _, num_param_5 = _width_sweep[4]
trainloss_6, testloss_6, train_acc_6, test_acc_6, _, num_param_6 = _width_sweep[5]
trainloss_7, testloss_7, train_acc_7, test_acc_7, _, num_param_7 = _width_sweep[6]
trainloss_8, testloss_8, train_acc_8, test_acc_8, _, num_param_8 = _width_sweep[7]
trainloss_9, testloss_9, train_acc_9, test_acc_9, _, num_param_9 = _width_sweep[8]
trainloss_10, testloss_10, train_acc_10, test_acc_10, _, num_param_10 = _width_sweep[9]

In [None]:
def _final_epoch(per_model_history):
    """Return the last-epoch value of every model's history as an array."""
    return np.array(per_model_history)[:, -1]


# Parameter count of each model in the sweep, in training order.
num_param_list = [
    num_param_1, num_param_2, num_param_3, num_param_4, num_param_5,
    num_param_6, num_param_7, num_param_8, num_param_9, num_param_10,
]

# Per-model loss histories and their final-epoch values.
trainloss_list = [
    trainloss_1, trainloss_2, trainloss_3, trainloss_4, trainloss_5,
    trainloss_6, trainloss_7, trainloss_8, trainloss_9, trainloss_10,
]
trainloss = _final_epoch(trainloss_list)
testloss_list = [
    testloss_1, testloss_2, testloss_3, testloss_4, testloss_5,
    testloss_6, testloss_7, testloss_8, testloss_9, testloss_10,
]
testloss = _final_epoch(testloss_list)

# Per-model accuracy histories and their final-epoch values.
train_acc_list = [
    train_acc_1, train_acc_2, train_acc_3, train_acc_4, train_acc_5,
    train_acc_6, train_acc_7, train_acc_8, train_acc_9, train_acc_10,
]
train_acc = _final_epoch(train_acc_list)
test_acc_list = [
    test_acc_1, test_acc_2, test_acc_3, test_acc_4, test_acc_5,
    test_acc_6, test_acc_7, test_acc_8, test_acc_9, test_acc_10,
]
test_acc = _final_epoch(test_acc_list)

In [None]:
# Plot final loss and accuracy against the number of parameters.
# The test loss is multiplied by 5 before plotting (presumably to put the
# summed per-batch losses of the smaller test set on the same scale as the
# training loss -- TODO confirm).
_, loss_axes = plt.subplots(figsize=(20,10))
loss_axes.plot(num_param_list, trainloss, label='train_loss')
loss_axes.plot(num_param_list, testloss*5, label='test_loss')
loss_axes.set_xlabel('num_param',fontsize=20)
loss_axes.set_ylabel('loss',fontsize=20)
loss_axes.legend(fontsize=20)
plt.show()

_, acc_axes = plt.subplots(figsize=(20,10))
acc_axes.plot(num_param_list, train_acc, label='train_acc')
acc_axes.plot(num_param_list, test_acc, label='test_acc')
acc_axes.set_xlabel('num_param',fontsize=20)
acc_axes.set_ylabel('accuracy',fontsize=20)
acc_axes.legend(fontsize=20)
plt.show()

## Flatness vs. generalization - part 1

In [None]:
'''
Define train function
'''
def train_MNIST(model_name,
                Epochs = 20,
                Batch  = 2000,
                Data_workers = 0,
                LR = 0.1):
    '''
    Train a model on MNIST and return it together with per-epoch metrics.

    Images are flattened to 28*28 vectors before the forward pass.

    Args:
        model_name: an instantiated nn.Module (despite the parameter name).
        Epochs: number of passes over the training set.
        Batch: batch size used by both data loaders.
        Data_workers: `num_workers` passed to both data loaders.
        LR: SGD learning rate; it stays constant for the whole run.

    Returns:
        [Model, trainloss_list, testloss_list, accuracy_list, lr_list, F].
        Model is the trained module on the selected device. The lists have
        one entry per epoch. Each loss is the SUM of the per-batch losses of
        that epoch (not divided by the number of batches). Accuracy is the
        test accuracy in percent as a plain float. The last item is a list
        reserved for per-batch gradient records; gradient recording is
        currently disabled, so it is always empty.
    '''
    # Data
    to_tensor = transforms.ToTensor()
    trainset = torchvision.datasets.MNIST(root='./data/',train=True,download=True,transform=to_tensor)
    testset = torchvision.datasets.MNIST(root='./data/',train=False,download=True,transform=to_tensor)
    trainloader = DataLoader(trainset, batch_size=Batch, shuffle=True, num_workers=Data_workers)
    testloader  = DataLoader(testset,  batch_size=Batch, shuffle=True, num_workers=Data_workers)
    print(trainset.classes)
    print(trainset.data.shape)
    print(testset.data.shape)

    # Model
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    Model = model_name.to(device)

    # Loss & optimizer
    criterion = nn.CrossEntropyLoss()
    # No learning-rate scheduler is attached: the schedule step was disabled in
    # this experiment, so a scheduler object would only be dead weight.
    optimizer = optim.SGD(Model.parameters(), lr=LR, momentum=0.9)

    trainloss_list = []
    testloss_list  = []
    accuracy_list  = []
    lr_list = []
    # Named so that it does not shadow the `torch.nn.functional as F` import.
    grad_records = []

    total = len(testloader.dataset)
    for epoch in range(Epochs):
        # Training
        Model.train()
        train_loss = 0.0
        for images, labels in trainloader:
            images = images.view(-1, 28*28).to(device)
            labels = labels.to(device)
            loss = criterion(Model(images), labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        # Evaluating
        Model.eval()
        test_loss = 0.0
        correct = 0
        with torch.no_grad():
            for images, labels in testloader:
                images = images.view(-1, 28*28).to(device)
                labels = labels.to(device)
                outputs = Model(images)
                test_loss += criterion(outputs, labels).item()
                # .item() keeps the counter a Python int rather than a tensor.
                correct += (outputs.argmax(dim=1) == labels).sum().item()
        accuracy = 100.0*correct/total

        # Save loss
        lr_list.append(optimizer.param_groups[0]['lr'])
        trainloss_list.append(train_loss)
        testloss_list.append(test_loss)
        accuracy_list.append(accuracy)
        print('{}/{} Test set: Average loss: {:.4f}/{:.4f}, Accuracy: {}/{} ({:.2f}%) lr={}'.format(
                epoch, Epochs, train_loss,test_loss, correct, total, accuracy, lr_list[-1]))

    return [Model,
            trainloss_list,
            testloss_list,
            accuracy_list,
            lr_list,
            grad_records]

# [_,trainloss_1,testloss_1,accuracy_1,_,F_1] = train_MNIST(model_name=DNN_MNIST_N(28*28,10,20,10,10),Batch=10)


### Batch 64 vs 2048

In [None]:
# Two identically shaped networks trained with a small and a large batch size;
# the last returned item (gradient records) is discarded.
(
    Model_64, trainloss_64, testloss_64, accuracy_64, lr_64, _,
) = train_MNIST(model_name=DNN_MNIST_N(28*28,100,200,100,10), Batch=64)
(
    Model_2048, trainloss_2048, testloss_2048, accuracy_2048, lr_2048, _,
) = train_MNIST(model_name=DNN_MNIST_N(28*28,100,200,100,10), Batch=2048)

### lr 0.01 vs 0.001

In [None]:
# Two identically shaped networks trained with learning rates 0.001 and 0.01;
# the last returned item (gradient records) is discarded.
(
    Model_1e3, trainloss_1e3, testloss_1e3, accuracy_1e3, lr_1e3, _,
) = train_MNIST(model_name=DNN_MNIST_N(28*28,100,200,100,10), LR=0.001)
(
    Model_1e2, trainloss_1e2, testloss_1e2, accuracy_1e2, lr_1e2, _,
) = train_MNIST(model_name=DNN_MNIST_N(28*28,100,200,100,10), LR=0.01)

In [None]:
# Plot the per-epoch train and test loss of the batch-64 and batch-2048 runs.
_, train_axes = plt.subplots()
train_axes.plot(trainloss_64, label='Model_64')
train_axes.plot(trainloss_2048, label='Model_2048')
train_axes.set_xlabel('epoch',fontsize=20)
train_axes.set_ylabel('loss',fontsize=20)
train_axes.set_title('Train loss',fontsize=20)
train_axes.legend(fontsize=20)
plt.show()

_, test_axes = plt.subplots()
test_axes.plot(testloss_64, label='Model_64')
test_axes.plot(testloss_2048, label='Model_2048')
test_axes.set_xlabel('epoch',fontsize=20)
test_axes.set_ylabel('loss',fontsize=20)
test_axes.set_title('Test loss',fontsize=20)
test_axes.legend(fontsize=20)
plt.show()

In [None]:
# Evaluate models whose weights are linear interpolations (and extrapolations)
# between the batch-64 and the batch-2048 solutions:
#     theta(alpha) = (1 - alpha) * theta_64 + alpha * theta_2048
trainset = torchvision.datasets.MNIST(root='./data/',train=True,download=True,transform=transforms.ToTensor())
testset = torchvision.datasets.MNIST(root='./data/',train=False,download=True,transform=transforms.ToTensor())
trainloader = DataLoader(trainset, batch_size=2000, shuffle=True, num_workers=0)
testloader  = DataLoader(testset,  batch_size=2000, shuffle=True, num_workers=0)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
Model = DNN_MNIST_N(28*28,100,200,100,10).to(device)
criterion = nn.CrossEntropyLoss()
param_1 = Model_64.state_dict()
param_2 = Model_2048.state_dict()

batch_train_loss = []
batch_test_loss = []
batch_train_acc = []
batch_test_acc = []


def _mean_loss_and_accuracy(model, loader, criterion, device):
    """Return (mean per-sample loss, accuracy in percent) of `model` over `loader`."""
    loss_sum = 0.0
    correct = 0
    with torch.no_grad():
        for images, labels in loader:
            images = images.view(-1, 28*28).to(device)
            labels = labels.to(device)
            outputs = model(images)
            # The criterion averages over the batch; weight by the batch size so
            # the result is the average over the whole dataset.
            loss_sum += criterion(outputs, labels).item() * labels.size(0)
            correct += (outputs.argmax(dim=1) == labels).sum().item()
    n_samples = len(loader.dataset)
    return loss_sum / n_samples, 100.0 * correct / n_samples


alpha_list = np.linspace(-2,2,50)
for alpha in alpha_list:
    weight = float(alpha)
    # NOTE: the state dict also holds the BatchNorm buffers, so the running
    # statistics are interpolated together with the weights.
    param_new = {key: (1-weight)*param_1[key] + weight*param_2[key] for key in param_1}
    Model.load_state_dict(param_new)
    Model.eval()

    # Record the loss over the whole dataset. Previously the loss of the last
    # (randomly drawn) batch was stored and the accumulated total was ignored.
    train_loss, accuracy = _mean_loss_and_accuracy(Model, trainloader, criterion, device)
    batch_train_loss.append(train_loss)
    batch_train_acc.append(accuracy)

    test_loss, accuracy = _mean_loss_and_accuracy(Model, testloader, criterion, device)
    batch_test_loss.append(test_loss)
    batch_test_acc.append(accuracy)

    print(alpha)

In [None]:
# Evaluate models whose weights are linear interpolations between the
# lr=0.001 and the lr=0.01 solutions:
#     theta(alpha) = (1 - alpha) * theta_1e3 + alpha * theta_1e2
trainset = torchvision.datasets.MNIST(root='./data/',train=True,download=True,transform=transforms.ToTensor())
testset = torchvision.datasets.MNIST(root='./data/',train=False,download=True,transform=transforms.ToTensor())
trainloader = DataLoader(trainset, batch_size=2000, shuffle=True, num_workers=0)
testloader  = DataLoader(testset,  batch_size=2000, shuffle=True, num_workers=0)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
Model = DNN_MNIST_N(28*28,100,200,100,10).to(device)
criterion = nn.CrossEntropyLoss()
param_1 = Model_1e3.state_dict()
param_2 = Model_1e2.state_dict()

lr_train_loss = []
lr_train_acc = []
lr_test_loss = []
lr_test_acc = []


def _mean_loss_and_accuracy(model, loader, criterion, device):
    """Return (mean per-sample loss, accuracy in percent) of `model` over `loader`."""
    loss_sum = 0.0
    correct = 0
    with torch.no_grad():
        for images, labels in loader:
            images = images.view(-1, 28*28).to(device)
            labels = labels.to(device)
            outputs = model(images)
            # The criterion averages over the batch; weight by the batch size so
            # the result is the average over the whole dataset.
            loss_sum += criterion(outputs, labels).item() * labels.size(0)
            correct += (outputs.argmax(dim=1) == labels).sum().item()
    n_samples = len(loader.dataset)
    return loss_sum / n_samples, 100.0 * correct / n_samples


alpha_list = np.linspace(0,1,50)
for alpha in alpha_list:
    weight = float(alpha)
    # NOTE: the state dict also holds the BatchNorm buffers, so the running
    # statistics are interpolated together with the weights.
    param_new = {key: (1-weight)*param_1[key] + weight*param_2[key] for key in param_1}
    Model.load_state_dict(param_new)
    Model.eval()

    # Record the loss over the whole dataset. Previously the loss of the last
    # (randomly drawn) batch was stored and the accumulated total was ignored.
    train_loss, accuracy = _mean_loss_and_accuracy(Model, trainloader, criterion, device)
    lr_train_loss.append(train_loss)
    lr_train_acc.append(accuracy)

    test_loss, accuracy = _mean_loss_and_accuracy(Model, testloader, criterion, device)
    lr_test_loss.append(test_loss)
    lr_test_acc.append(accuracy)

    print(alpha)

In [None]:
# Plot the interpolation results of both experiments.
# The two sweeps cover different alpha ranges (batch size: -2..2, learning
# rate: 0..1), but `alpha_list` only holds the range of whichever sweep ran
# last. Rebuild each x-axis from its own range so the curves are not drawn
# against the other experiment's alphas. Keep these ranges in sync with the
# sweep cells.
batch_alphas = np.linspace(-2, 2, len(batch_train_loss))
lr_alphas = np.linspace(0, 1, len(lr_train_loss))

# Batch-size experiment: every curve is standardized so that losses and
# accuracies share one axis.
plt.figure(figsize=(20,10))
plt.plot(batch_alphas,standardization(batch_train_loss), label='train_loss')
plt.plot(batch_alphas,standardization(batch_test_loss),label='test_loss')
plt.plot(batch_alphas,standardization(batch_train_acc),label='train_accuracy')
plt.plot(batch_alphas,standardization(batch_test_acc),label='test_accuracy')
plt.xlabel('alpha',fontsize=20)
plt.ylabel('loss',fontsize=20)
plt.title('Batch Size',fontsize=20)
plt.legend(fontsize=20)
plt.show()

# Learning-rate experiment: losses are plotted raw, accuracies standardized.
plt.figure(figsize=(20,10))
plt.plot(lr_alphas,(lr_train_loss), label='train_loss')
plt.plot(lr_alphas,(lr_test_loss), label='test_loss')
plt.plot(lr_alphas,standardization(lr_train_acc), label='train_accuracy')
plt.plot(lr_alphas,standardization(lr_test_acc), label='test_accuracy')
plt.xlabel('alpha',fontsize=20)
plt.ylabel('loss',fontsize=20)
plt.title('Learning Rate',fontsize=20)
plt.legend(fontsize=20)
plt.show()

## Flatness vs. generalization - part 2

In [None]:
# Train the same small architecture at five batch sizes (10, 50, 100, 200,
# 1000). The returned model and learning-rate history are discarded; the last
# item of each result is kept as F_<n>.
(
    _, trainloss_1, testloss_1, accuracy_1, _, F_1,
) = train_MNIST(model_name=DNN_MNIST_N(28*28,10,20,10,10), Batch=10)
(
    _, trainloss_2, testloss_2, accuracy_2, _, F_2,
) = train_MNIST(model_name=DNN_MNIST_N(28*28,10,20,10,10), Batch=50)
(
    _, trainloss_3, testloss_3, accuracy_3, _, F_3,
) = train_MNIST(model_name=DNN_MNIST_N(28*28,10,20,10,10), Batch=100)
(
    _, trainloss_4, testloss_4, accuracy_4, _, F_4,
) = train_MNIST(model_name=DNN_MNIST_N(28*28,10,20,10,10), Batch=200)
(
    _, trainloss_5, testloss_5, accuracy_5, _, F_5,
) = train_MNIST(model_name=DNN_MNIST_N(28*28,10,20,10,10), Batch=1000)
