In [5]:
import torch
import torchvision
from torchvision import transforms, datasets
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
import numpy as np
import time

In [6]:
train = datasets.MNIST("", train = True, download= True, transform = transforms.Compose([transforms.ToTensor()]))
test = datasets.MNIST("", train = False, download= True, transform = transforms.Compose([transforms.ToTensor()]))

In [7]:
trainset = torch.utils.data.DataLoader(train, batch_size = 32)
testset = torch.utils.data.DataLoader(test, batch_size = 32)

In [8]:
n_epochs = 180
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(784, 1024)
        self.fc2 = nn.Linear(1024, 1024)
        self.fc3 = nn.Linear(1024, 1024)
        self.fc4 = nn.Linear(1024, 1024)
        self.fc5 = nn.Linear(1024, 1024)
        self.fc6 = nn.Linear(1024, 10)


    def forward(self, x):
        output_h1 = F.relu(self.fc1(x))
        output_h2 = F.relu(self.fc2(output_h1))
        output_h3 = F.relu(self.fc3(output_h2))
        output_h4 = F.relu(self.fc4(output_h3))
        output_h5 = F.relu(self.fc5(output_h4))
        output = self.fc5(output_h5)
        return F.log_softmax(output_h1, dim=1), F.log_softmax(output_h2, dim=1), F.log_softmax(output_h3, dim=1), F.log_softmax(output_h4, dim=1), F.log_softmax(output_h5, dim=1), F.log_softmax(output, dim=1)

# Train a model on CPU and time it

In [9]:
start_time = time.time()

net = Net()
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.01)

for epoch in range(n_epochs): 
    total_loss = 0
    for data in trainset:  
        X, y = data  
        net.zero_grad()  
        _, _, _, _, _, output = net(X.view(-1,28*28)) 
        loss = F.nll_loss(output, y)   
        total_loss += loss
        loss.backward()  
        optimizer.step() 
    print(total_loss)
    
end_time = time.time()

tensor(5464.6211, grad_fn=<AddBackward0>)
tensor(1304.3837, grad_fn=<AddBackward0>)
tensor(667.2972, grad_fn=<AddBackward0>)
tensor(407.7508, grad_fn=<AddBackward0>)
tensor(295.3828, grad_fn=<AddBackward0>)
tensor(231.5015, grad_fn=<AddBackward0>)
tensor(186.1765, grad_fn=<AddBackward0>)
tensor(151.0732, grad_fn=<AddBackward0>)
tensor(122.6007, grad_fn=<AddBackward0>)
tensor(98.7547, grad_fn=<AddBackward0>)
tensor(78.8229, grad_fn=<AddBackward0>)
tensor(62.1846, grad_fn=<AddBackward0>)
tensor(48.1994, grad_fn=<AddBackward0>)
tensor(36.4844, grad_fn=<AddBackward0>)
tensor(28.5337, grad_fn=<AddBackward0>)
tensor(21.2141, grad_fn=<AddBackward0>)
tensor(18.8585, grad_fn=<AddBackward0>)
tensor(13.8411, grad_fn=<AddBackward0>)
tensor(13.4607, grad_fn=<AddBackward0>)
tensor(9.6390, grad_fn=<AddBackward0>)
tensor(15.5284, grad_fn=<AddBackward0>)
tensor(6.9785, grad_fn=<AddBackward0>)
tensor(4.0967, grad_fn=<AddBackward0>)
tensor(2.8819, grad_fn=<AddBackward0>)
tensor(2.1522, grad_fn=<AddBackwa

In [25]:
time_elapsed_cpu = end_time - start_time
time_elapsed_cpu, time_elapsed_cpu/(60*60)

(7308.369572877884, 2.0301026591327456)

# Train a model on GPU and time it

In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print (device)

cuda


In [12]:
start_time_gpu = time.time()

net = Net().to(device)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.01)

for epoch in range(n_epochs): 
    total_loss = 0
    for data,target in trainset:  
        X, y = data.to(device), target.to(device)  
        net.zero_grad()  
        _, _, _, _, _, output = net(X.view(-1,28*28)) 
        loss = F.nll_loss(output, y)   
        total_loss += loss
        loss.backward()  
        optimizer.step() 
    print(total_loss)
    
end_time_gpu = time.time()

tensor(5391.7764, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1253.5834, device='cuda:0', grad_fn=<AddBackward0>)
tensor(648.2276, device='cuda:0', grad_fn=<AddBackward0>)
tensor(413.0512, device='cuda:0', grad_fn=<AddBackward0>)
tensor(292.0511, device='cuda:0', grad_fn=<AddBackward0>)
tensor(224.8125, device='cuda:0', grad_fn=<AddBackward0>)
tensor(178.9684, device='cuda:0', grad_fn=<AddBackward0>)
tensor(143.9149, device='cuda:0', grad_fn=<AddBackward0>)
tensor(115.6431, device='cuda:0', grad_fn=<AddBackward0>)
tensor(92.0300, device='cuda:0', grad_fn=<AddBackward0>)
tensor(72.1514, device='cuda:0', grad_fn=<AddBackward0>)
tensor(55.8465, device='cuda:0', grad_fn=<AddBackward0>)
tensor(43.2286, device='cuda:0', grad_fn=<AddBackward0>)
tensor(32.9972, device='cuda:0', grad_fn=<AddBackward0>)
tensor(23.7752, device='cuda:0', grad_fn=<AddBackward0>)
tensor(21.1928, device='cuda:0', grad_fn=<AddBackward0>)
tensor(16.5937, device='cuda:0', grad_fn=<AddBackward0>)
tensor(21.4875, devi

tensor(0.0544, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.0539, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.0534, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.0529, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.0524, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.0520, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.0515, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.0511, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.0506, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.0502, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.0498, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.0493, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.0489, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.0485, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.0481, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.0477, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.0473, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.0469, device='cuda:0', grad_fn=<AddBack

In [26]:
time_elapsed_gpu = end_time_gpu - start_time_gpu
time_elapsed_gpu, time_elapsed_gpu/(60*60)

(2313.498715400696, 0.6426385320557488)

In [None]:
correct = 0
total = 0
class_dict = {x:[] for x in range(10)}
rand_dim = [c for c in range(10)]
class_dict_h4 = {x:[] for x in range(10)}

with torch.no_grad():
    for data in testset:
        X, y = data
        X = X.view(-1,784)
        _, _, _, _, output_h4, output = net(X)
        h4 = output_h4[:, rand_dim]
        for (idx1, i), (idx2, x) in zip(enumerate(h4), enumerate(X)):
            class_dict_h4[torch.argmax(i).item()].append(x)
        for (idx1, i), (idx2, x) in zip(enumerate(output), enumerate(X)):
            class_dict[torch.argmax(i).item()].append(x)
            if torch.argmax(i) == y[idx1]:
                correct += 1
            total += 1
    print("Accuracy: ", round(correct/total, 3)*100)