In [1]:
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import grad

In [2]:
def load_data():
    transform = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

    trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                            download=True, transform=transform)
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=4,
                                              shuffle=False, num_workers=2)

    testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                           download=True, transform=transform)
    testloader = torch.utils.data.DataLoader(testset, batch_size=4,
                                             shuffle=False, num_workers=2)

    return trainloader, testloader

In [3]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [4]:
def train(trainloader, testloader, net):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

    for epoch in range(10):  # loop over the dataset multiple times
        running_loss = 0.0
        for i, data in enumerate(trainloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            #inputs, labels = data
            inputs, labels = data[0].cuda(), data[1].cuda()

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            if i % 2000 == 1999:    # print every 2000 mini-batches
                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1, running_loss / 2000))
                running_loss = 0.0

    print('Finished Training')

In [5]:
def save_model(net):
    PATH = './cifar_net.pth'
    torch.save(net.state_dict(), PATH)


def load_model():
    PATH = './cifar_net.pth'
    net = Net()
    net.load_state_dict(torch.load(PATH))
    net.cuda()
    return net


def test(testloader, net):
    correct = 0
    total = 0
    class_correct = list(0. for i in range(10))
    class_total = list(0. for i in range(10))
    with torch.no_grad():
        for data in testloader:
            #images, labels = data
            images, labels = data[0].cuda(), data[1].cuda()
            outputs = net(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            _, pred = torch.max(outputs, 1)
            c = (pred == labels).squeeze()
            for i in range(4):
                label = labels[i]
                class_correct[label] += c[i].item()
                class_total[label] += 1

    print('Accuracy of the network on the 10000 test images: %d %%' % (
        100 * correct / total))
    classes = ('plane', 'car', 'bird', 'cat',
               'deer', 'dog', 'frog', 'horse', 'ship', 'truck')
    for i in range(10):
        print('Accuracy of %5s : %2d %%' % (
            classes[i], 100 * class_correct[i] / class_total[i]))

In [None]:
if __name__ == "__main__":
    trainloader, testloader = load_data()
    model = Net()
    model.cuda()
    train(trainloader, testloader, model)
    test(testloader, model)
    save_model(model)


In [8]:
trainloader, testloader = load_data()
model = Net()
params = [p for p in model.parameters() ]
model.cuda()
train_dataset_size = len(trainloader.dataset)
train_dataset_size

Files already downloaded and verified
Files already downloaded and verified


50000

### 

In [24]:
z, t = trainloader.dataset[0]
z = trainloader.collate_fn([z])
t = trainloader.collate_fn([t])
t

tensor([6])

In [69]:
criterion = nn.CrossEntropyLoss()
for i, data in enumerate(trainloader, ):
    # get the inputs; data is a list of [inputs, labels]
    #inputs, labels = data
    inputs, labels = data[0].cuda(), data[1].cuda() 
    print(labels)
    outputs = model(inputs)
    
    loss = criterion(outputs, labels)#feature和label都放到cuda上成为cuda上的tensor
    loss.backward()

for p in params:
    print(p.grad)

tensor([6, 9, 9, 4], device='cuda:0')
tensor([[[[-1.1271e-02, -1.8612e-02, -1.3273e-02, -4.4192e-03,  1.8658e-03],
          [-3.5417e-03, -2.8069e-03,  1.9984e-03, -1.3986e-02, -1.7427e-02],
          [ 1.9739e-02,  2.1949e-02,  4.9301e-04, -2.4480e-02, -2.2341e-02],
          [ 2.5356e-02,  2.7856e-02,  3.1576e-03, -2.8058e-02, -2.5873e-02],
          [ 1.0477e-02,  7.4383e-04, -1.5314e-02, -3.9951e-02, -2.2294e-02]],

         [[ 5.7551e-03, -1.8481e-03,  3.9416e-03,  1.2813e-02,  2.2032e-02],
          [ 1.2090e-02,  1.4062e-02,  2.0992e-02,  1.0284e-02,  5.9894e-03],
          [ 3.2578e-02,  4.0090e-02,  2.2420e-02,  4.2713e-04,  1.3591e-04],
          [ 3.7518e-02,  4.7492e-02,  2.5585e-02, -5.0053e-03, -3.2246e-03],
          [ 2.5207e-02,  1.9820e-02,  5.4607e-03, -1.8361e-02, -6.7790e-04]],

         [[ 3.1349e-02,  2.0964e-02,  3.0125e-02,  4.0157e-02,  4.1517e-02],
          [ 3.6450e-02,  3.2829e-02,  4.6219e-02,  4.6067e-02,  3.6426e-02],
          [ 5.4380e-02,  5.2281e-0

In [56]:
params

[Parameter containing:
 tensor([[[[-0.0389,  0.0450, -0.0816, -0.0866,  0.0204],
           [-0.0894,  0.0841, -0.0357,  0.1138, -0.1118],
           [ 0.0092,  0.0829, -0.0875, -0.0626,  0.0621],
           [-0.0799,  0.0206,  0.0365,  0.0595,  0.0668],
           [-0.0521, -0.0111, -0.0899,  0.0304, -0.0339]],
 
          [[-0.0477, -0.0977, -0.0968, -0.0845,  0.0312],
           [ 0.0377,  0.0846, -0.0977,  0.0114,  0.0252],
           [ 0.0067, -0.0958,  0.0685, -0.0229,  0.1024],
           [-0.1002,  0.0299, -0.0786, -0.0575, -0.1138],
           [-0.0222, -0.0770,  0.0424,  0.1100,  0.0556]],
 
          [[ 0.0431, -0.0651, -0.0491,  0.0048, -0.0016],
           [ 0.0273,  0.0179, -0.0996, -0.1045, -0.0530],
           [ 0.0724, -0.0777, -0.0018,  0.0535,  0.0490],
           [ 0.0361, -0.0652,  0.0937,  0.0384,  0.0941],
           [-0.0667, -0.0110,  0.0304,  0.0401,  0.0317]]],
 
 
         [[[-0.1082, -0.0140,  0.0470,  0.0209, -0.0360],
           [ 0.0702,  0.0809, -0.1051

In [28]:
i

12499

In [25]:
params = [p for p in model.parameters() ]
criterion = nn.CrossEntropyLoss()
for i, data in enumerate(trainloader, ):
    # get the inputs; data is a list of [inputs, labels]
    #inputs, labels = data
    inputs, labels = data[0].cuda(), data[1].cuda() 
    outputs = model(inputs)
    
    loss = criterion(outputs, labels)#feature和label都放到cuda上成为cuda上的tensor
    grad = torch.autograd.grad(loss,params,create_graph=True,retain_graph=True)

In [26]:
grad

(tensor([[[[ 4.9335e-03,  4.7652e-03,  4.2048e-03,  4.1882e-03,  3.2305e-03],
           [ 6.6262e-04,  1.1085e-03,  2.0202e-03,  2.4665e-03,  2.9339e-03],
           [ 1.4999e-03,  2.2689e-03,  2.9201e-03,  2.4428e-03,  3.1428e-03],
           [ 3.6482e-03,  3.3799e-03,  2.9497e-03,  2.2546e-03,  2.4573e-03],
           [ 3.7687e-03,  2.1986e-03,  1.2257e-03,  9.2843e-04,  1.7228e-03]],
 
          [[ 2.6557e-03,  2.5320e-03,  1.8791e-03,  2.0478e-03,  1.3403e-03],
           [-1.1641e-03, -8.9423e-04, -2.7802e-04,  2.6421e-04,  1.1931e-03],
           [-2.2275e-04,  3.2465e-04,  6.6130e-04,  2.8748e-04,  1.4253e-03],
           [ 2.1160e-03,  1.5682e-03,  8.9929e-04,  3.9552e-04,  1.1168e-03],
           [ 2.0795e-03,  4.4958e-04, -5.6384e-04, -5.0314e-04,  9.1516e-04]],
 
          [[ 6.4866e-03,  6.2365e-03,  5.4511e-03,  4.8579e-03,  3.4656e-03],
           [ 2.5258e-03,  2.8948e-03,  3.4443e-03,  3.1663e-03,  3.0951e-03],
           [ 3.0301e-03,  3.4078e-03,  3.7601e-03,  2.8271

In [52]:
outputs = model(inputs)
outputs.shape
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
loss = criterion(outputs, labels)
_, pred = torch.max(outputs, 1)
c = (pred == labels).squeeze()
loss

tensor(2.2610, device='cuda:0', grad_fn=<NllLossBackward0>)

In [60]:
y = torch.nn.functional.log_softmax(outputs,dim=0)
loss = torch.nn.functional.nll_loss(
        y, labels, weight=None, reduction='mean')
params = [ p for p in model.parameters() if p.requires_grad ]
for p in model.parameters():
    print(p.shape)
temp = grad(loss, params, create_graph=True)
temp

torch.Size([6, 3, 5, 5])
torch.Size([6])
torch.Size([16, 6, 5, 5])
torch.Size([16])
torch.Size([120, 400])
torch.Size([120])
torch.Size([84, 120])
torch.Size([84])
torch.Size([10, 84])
torch.Size([10])


(tensor([[[[ 3.6602e-03,  2.8086e-03,  2.8197e-03,  4.9069e-03,  7.0855e-03],
           [ 1.0981e-03,  5.7782e-04,  2.6089e-03,  3.7768e-03,  5.9070e-03],
           [-7.8089e-05, -9.7814e-04,  1.6724e-03,  2.2803e-03,  3.8248e-03],
           [ 5.6744e-04, -1.2920e-04,  6.9816e-04, -2.1027e-05,  2.5541e-04],
           [-1.2696e-03, -1.7630e-04, -1.6682e-04, -1.2248e-03, -1.3582e-03]],
 
          [[ 1.6310e-03,  9.9482e-04,  5.9469e-04,  2.2000e-03,  4.1847e-03],
           [-4.2370e-04, -9.5064e-04,  7.7680e-04,  1.5744e-03,  3.3254e-03],
           [-2.1668e-03, -2.9403e-03, -1.7322e-04,  3.1358e-04,  1.3890e-03],
           [-1.7651e-03, -2.6222e-03, -1.7324e-03, -2.4100e-03, -2.4371e-03],
           [-2.9945e-03, -2.4132e-03, -2.4956e-03, -3.5263e-03, -3.8659e-03]],
 
          [[-3.7591e-03, -5.2286e-03, -5.9860e-03, -5.3457e-03, -3.9111e-03],
           [-3.9495e-03, -5.2171e-03, -4.1994e-03, -3.9894e-03, -2.7842e-03],
           [-5.0719e-03, -7.0946e-03, -5.6354e-03, -4.8978

In [72]:
x = torch.tensor([0., 0, 0], requires_grad=True)
b = torch.tensor([1., 3, 5])
A = torch.tensor([[-5, -3, -0.5], [-3, -2, 0], [-0.5, 0, -0.5]])
y = b@x + 0.5*x@A@x
grad = torch.autograd.grad(y, x, retain_graph=True, create_graph=True)
grad

(tensor([1., 3., 5.], grad_fn=<AddBackward0>),)

In [76]:
Print = torch.tensor([])
for anygrad in grad[0]:  # torch.autograd.grad返回的是元组
    
    Print = torch.cat((Print, torch.autograd.grad(anygrad, x, retain_graph=True)[0]))
# print(Print.view(x.size()[0], -1))

In [2]:
class ANN(nn.Module):
    def __init__(self):
        super(ANN, self).__init__()
        self.sigmoid = nn.Sigmoid()
        self.fc1 = nn.Linear(3, 4)
        self.fc2 = nn.Linear(4, 5)

    def forward(self, data):
        x = self.sigmoid(self.fc1(data))
        x = self.fc2(x)

        return x
    
model = ANN()
for param in model.parameters():
    print(param.size())
    
data = torch.tensor([1, 2, 3], dtype=torch.float)
label = torch.tensor([1, 1, 5, 7, 8], dtype=torch.float)
pred = model(data)
loss_fn = nn.MSELoss()
loss = loss_fn(pred, label)
# grads = torch.autograd.grad(loss, model.parameters(), retain_graph=True, create_graph=True)

# hessian_params = []
#     for k in range(len(grads)):
#         hess_params = torch.zeros_like(grads[k])
#         for i in range(grads[k].size(0)):
#             # 判断是w还是b
#             if len(grads[k].size()) == 2:
#                 # w
#                 for j in range(grads[k].size(1)):
#                     hess_params[i, j] = torch.autograd.grad(grads[k][i][j], model.parameters(), retain_graph=True)[k][i, j]
#             else:
#                 # b
#                 hess_params[i] = torch.autograd.grad(grads[k][i], model.parameters(), retain_graph=True)[k][i]
#         hessian_params.append(hess_params)


torch.Size([4, 3])
torch.Size([4])
torch.Size([5, 4])
torch.Size([5])


In [3]:
# First backprop
loss = loss_fn(pred, label)
params = [p for p in model.parameters() if p.requires_grad]
first_grads = grad(loss, params, retain_graph=True, create_graph=True)
# v_elem = v.copy()
# # Elementwise products
# elemwise_products = 0
# for grad_elem, v_elem in zip(first_grads, v_elem):
#     elemwise_products += torch.sum(grad_elem * v_elem)

# # Second backprop
# return_grads = grad(elemwise_products, params, create_graph=True)


In [4]:
first_grads

(tensor([[ 0.2062,  0.4124,  0.6187],
         [ 0.6653,  1.3306,  1.9958],
         [-0.3403, -0.6805, -1.0208],
         [ 0.3144,  0.6288,  0.9431]], grad_fn=<TBackward0>),
 tensor([ 0.2062,  0.6653, -0.3403,  0.3144],
        grad_fn=<SigmoidBackwardBackward0>),
 tensor([[-0.1060, -0.1240, -0.1995, -0.0702],
         [-0.1656, -0.1936, -0.3115, -0.1097],
         [-0.5269, -0.6159, -0.9912, -0.3490],
         [-0.8486, -0.9920, -1.5963, -0.5621],
         [-0.8755, -1.0234, -1.6469, -0.5799]], grad_fn=<TBackward0>),
 tensor([-0.3773, -0.5891, -1.8743, -3.0187, -3.1144],
        grad_fn=<MseLossBackwardBackward0>))

In [None]:
class Model(nn.Module):
    def __init__(self):
        super(Model,self).__init__()
        self.linear = nn.Linear(2,1)
    def forward(self,x):
        return self.linear(x).pow(2)

def Hessian_vec_prod(model, x, vecs):
    y = model(x)
    grads = torch.autograd.grad(y,model.parameters(),create_graph=True)
    prod = sum([(g*v).sum() for g,v in zip(grads,vecs) ])
    prod.backward()
    return [p.grad.detach() for p in model.parameters()]

model = Model()
x = torch.Tensor([1.,2.])
vec = [torch.Tensor([[1.,5.]]),torch.Tensor([2.])]
print(Hessian_vec_prod(model,x,vec))


In [12]:
class Model(nn.Module):
    def __init__(self):
        super(Model,self).__init__()
        self.linear = nn.Linear(2,1)
    def forward(self,x):
        return self.linear(x).pow(2)
model = Model()
x = torch.Tensor([1.,2.])
y = model(x)
vec = [torch.Tensor([[1.,5.]]),torch.Tensor([2.])]
grads = torch.autograd.grad(y,model.parameters(),create_graph=True)
for g,v in zip(grads,vec):
    print(g)
    print(v)

tensor([[-2.3394, -4.6788]], grad_fn=<TBackward0>)
tensor([[1., 5.]])
tensor([-2.3394], grad_fn=<MulBackward0>)
tensor([2.])


In [8]:
loss = loss_fn(pred, label)
params = [p for p in model.parameters() if p.requires_grad]
first_grads = grad(loss, params, retain_graph=True, create_graph=True)
element_product = 0
for grad_elem, v_elem in zip(first_grads, first_grads):
    element_product += torch.sum(grad_elem * v_elem)
grad(element_product,params,create_graph=True)

(tensor([[ 6.0277, 12.0553, 18.0830],
         [ 3.7676,  7.5352, 11.3028],
         [ 6.5629, 13.1258, 19.6887],
         [ 4.4919,  8.9838, 13.4757]], grad_fn=<TBackward0>),
 tensor([6.0277, 3.7676, 6.5629, 4.4919], grad_fn=<SigmoidBackwardBackward0>),
 tensor([[ -1.9504,   0.4339,  -1.1859,   0.5630],
         [ -2.7895,   0.4569,  -1.7540,   0.6098],
         [ -9.6170,   2.1110,  -5.8574,   2.7420],
         [-14.1109,   2.8822,  -8.6707,   3.7664],
         [-15.9230,   3.2181,  -9.7963,   4.2092]], grad_fn=<TBackward0>),
 tensor([-0.6623, -1.2184, -3.3130, -5.2177, -5.9444], grad_fn=<DivBackward1>))