In [56]:
import json
import time

import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from IPython.core.display_functions import clear_output
from torch.utils import data
from collections import OrderedDict
from collections import namedtuple
from itertools import product
from PIL import Image
from torch.utils.data import DataLoader

torch.set_printoptions(linewidth=120)  # Display options for output
torch.set_grad_enabled(True)

# tensorboard --version
from tensorboardX import SummaryWriter
import sys

print(sys.executable)


E:\python_tools\anaconda3\envs\test\python.exe


In [57]:
class Network(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=12, kernel_size=5)

        self.fc1 = nn.Linear(in_features=12 * 4 * 4, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=60)
        self.out = nn.Linear(in_features=60, out_features=10)
        # self.layer = None

    def forward(self, t):
        # t = self.layer(t)
        # implement the forward pass

        # (1) input layer
        t = t

        # (2) hidden conv layer
        t = self.conv1(t)
        t = F.relu(t)
        t = F.max_pool2d(t, kernel_size=2, stride=2)

        # (3) hidden conv layer
        t = self.conv2(t)
        t = F.relu(t)
        t = F.max_pool2d(t, kernel_size=2, stride=2)

        # (4) hidden liner layer
        t = t.reshape(-1, 12 * 4 * 4)
        t = self.fc1(t)
        t = F.relu(t)

        # (5) hidden liner layer
        t = self.fc2(t)
        t = F.relu(t)

        # (6) output layer
        t = self.out(t)
        # t = F.softmax(t, dim=1)

        return t

In [58]:
network = Network()

In [59]:
for name, param in network.named_parameters():
    print(name, '\t\t\t', param.shape)

conv1.weight 			 torch.Size([6, 1, 5, 5])
conv1.bias 			 torch.Size([6])
conv2.weight 			 torch.Size([12, 6, 5, 5])
conv2.bias 			 torch.Size([12])
fc1.weight 			 torch.Size([120, 192])
fc1.bias 			 torch.Size([120])
fc2.weight 			 torch.Size([60, 120])
fc2.bias 			 torch.Size([60])
out.weight 			 torch.Size([10, 60])
out.bias 			 torch.Size([10])


In [60]:
for n, p in network.named_parameters():
    print(p.device, ' ', n)

cpu   conv1.weight
cpu   conv1.bias
cpu   conv2.weight
cpu   conv2.bias
cpu   fc1.weight
cpu   fc1.bias
cpu   fc2.weight
cpu   fc2.bias
cpu   out.weight
cpu   out.bias


In [61]:
network.to('cuda')

Network(
  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(6, 12, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=192, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=60, bias=True)
  (out): Linear(in_features=60, out_features=10, bias=True)
)

In [62]:
for n, p in network.named_parameters():
    print(p.device, ' ', n)

cuda:0   conv1.weight
cuda:0   conv1.bias
cuda:0   conv2.weight
cuda:0   conv2.bias
cuda:0   fc1.weight
cuda:0   fc1.bias
cuda:0   fc2.weight
cuda:0   fc2.bias
cuda:0   out.weight
cuda:0   out.bias


In [63]:
sample = torch.ones(1, 1, 28, 28)
sample.shape

torch.Size([1, 1, 28, 28])

In [64]:
try:
    network(sample)
except Exception as e:
    print(e)

Input type (torch.FloatTensor) and weight type (torch.cuda.FloatTensor) should be the same or input should be a MKLDNN tensor and weight is a dense tensor


In [65]:
try:
    pred = network(sample.to('cuda'))
    print(pred)
except Exception as e:
    print(e)

tensor([[-0.0422,  0.1080,  0.0624,  0.0086, -0.0168, -0.1517, -0.1006, -0.0757, -0.0408,  0.1311]], device='cuda:0',
       grad_fn=<AddmmBackward0>)


# Checking for GPU

In [66]:
torch.cuda.is_available()

True

# Using the GPU：TEST 

In [67]:
class RunBuilder:
    @staticmethod
    def get_runs(params):
        Run = namedtuple('Run', params.keys())

        runs = []
        for v in product(*params.values()):
            runs.append(Run(*v))

        return runs

In [68]:
class RunManager:
    def __init__(self):
        self.epoch_count = 0
        self.epoch_loss = 0
        self.epoch_num_correct = 0
        self.epoch_start_time = 0

        self.run_params = 0
        self.run_count = 0
        self.run_data = []
        self.run_start_time = None

        self.network = None
        self.loader = None
        # self.tb = None

    def begin_run(self, run, network, loader):
        self.run_start_time = time.time()
        self.run_params = run
        self.run_count += 1

        self.network = network
        self.loader = loader
        # self.tb = SummaryWriter(comment=f'{run}')

        images, labels = next(iter(self.loader))
        grid = torchvision.utils.make_grid(images)

        # self.tb.add_image('images', grid)
        # self.tb.add_graph(self.network, images)

    def end_run(self):
        # self.tb.close()
        self.epoch_count = 0

    def begin_epoch(self):
        self.epoch_start_time = time.time()

        self.epoch_count += 1
        self.epoch_loss = 0
        self.epoch_num_correct = 0

    def end_epoch(self):
        epoch_duration = time.time() - self.epoch_start_time
        run_duration = time.time() - self.run_start_time

        loss = self.epoch_loss / len(self.loader.dataset)
        accuracy = self.epoch_num_correct / len(self.loader.dataset)

        # self.tb.add_scalar('Loss', loss, self.epoch_count)
        # self.tb.add_scalar('Accuracy', accuracy, self.epoch_count)

        # for name, param in self.network.named_parameters():
        #     self.tb.add_histogram(name, param, self.epoch_count)
        #     self.tb.add_histogram(f'{name}.grad', param.grad, self.epoch_count)

        results = OrderedDict()
        results["run"] = self.run_count
        results["epoch"] = self.epoch_count
        results["loss"] = loss
        results["accuracy"] = accuracy
        results["epoch duration"] = epoch_duration
        results["run duration"] = run_duration

        for k, v, in self.run_params._asdict().items(): results[k] = v
        self.run_data.append(results)
        df = pd.DataFrame.from_dict(self.run_data, orient='columns')

        clear_output(wait=True)
        display(df)

    def track_loss(self, loss):
        self.epoch_loss += loss.item() * self.loader.batch_size

    def track_num_correct(self, preds, labels):
        self.epoch_num_correct += self._get_num_correct(preds, labels)

    @torch.no_grad()
    def _get_num_correct(self, preds, labels):
        return preds.argmax(dim=1).eq(labels).sum().item()

    def save(self, filename):
        pd.DataFrame.from_dict(
            self.run_data
            , orient='columns'
        ).to_csv(f'{filename}.csv')

        with open(f'{filename}.json', 'w', encoding='utf-8') as f:
            json.dump(self.run_data, f, ensure_ascii=False, indent=4)

In [71]:
train_set = torchvision.datasets.FashionMNIST(root='./data/FashionMNIST', train=True, download=True, transform=transforms.Compose([transforms.ToTensor()]))

params = OrderedDict(
    lr=[.01]
    , batch_size=[1000, 10000, 20000]
    , num_workers=[0, 1]
    , device=['cuda', 'cpu']
)

m = RunManager()

for run in RunBuilder.get_runs(params):
    device = torch.device(run.device)
    network = Network().to(device)
    loader = DataLoader(train_set, batch_size=run.batch_size, num_workers=run.num_workers)
    optimizer = optim.Adam(network.parameters(), lr=run.lr)

    m.begin_run(run, network, loader)
    for epoch in range(1):
        m.begin_epoch()
        for batch in loader:
            images = batch[0].to(device)
            labels = batch[1].to(device)
            preds = network(images)
            loss = F.cross_entropy(preds, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            m.track_loss(loss)
            m.track_num_correct(preds, labels)
        m.end_epoch()
    m.end_run()
m.save('result2')

Unnamed: 0,run,epoch,loss,accuracy,epoch duration,run duration,lr,batch_size,num_workers,device
0,1,1,1.017178,0.6108,6.82919,6.92669,0.01,1000,0,cuda
1,2,1,0.983591,0.612,6.931687,7.027502,0.01,1000,0,cpu
2,3,1,1.030171,0.606017,7.096156,9.850043,0.01,1000,1,cuda
3,4,1,1.040903,0.604583,8.680178,11.462076,0.01,1000,1,cpu
4,5,1,2.059769,0.226667,5.408729,6.432456,0.01,10000,0,cuda
5,6,1,2.145892,0.189467,7.177294,8.144437,0.01,10000,0,cpu
6,7,1,2.148961,0.249283,6.948133,11.357591,0.01,10000,1,cuda
7,8,1,2.111276,0.2672,8.446633,12.631428,0.01,10000,1,cpu
8,9,1,2.279327,0.15795,5.2672,7.174823,0.01,20000,0,cuda
9,10,1,2.273614,0.135917,7.297822,9.26778,0.01,20000,0,cpu


In [72]:
pd.DataFrame.from_dict(m.run_data,orient='columns').sort_values('epoch duration')

Unnamed: 0,run,epoch,loss,accuracy,epoch duration,run duration,lr,batch_size,num_workers,device
8,9,1,2.279327,0.15795,5.2672,7.174823,0.01,20000,0,cuda
4,5,1,2.059769,0.226667,5.408729,6.432456,0.01,10000,0,cuda
0,1,1,1.017178,0.6108,6.82919,6.92669,0.01,1000,0,cuda
1,2,1,0.983591,0.612,6.931687,7.027502,0.01,1000,0,cpu
6,7,1,2.148961,0.249283,6.948133,11.357591,0.01,10000,1,cuda
10,11,1,2.292907,0.128067,7.072798,13.165866,0.01,20000,1,cuda
2,3,1,1.030171,0.606017,7.096156,9.850043,0.01,1000,1,cuda
5,6,1,2.145892,0.189467,7.177294,8.144437,0.01,10000,0,cpu
9,10,1,2.273614,0.135917,7.297822,9.26778,0.01,20000,0,cpu
11,12,1,2.29413,0.159383,8.375601,14.274128,0.01,20000,1,cpu
