In [7]:
import numpy as np
import pandas as pd

import os
import time

from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader, TensorDataset

from torch.autograd import Variable

print(time.strftime("%Y-%m-%d %H:%M:%S"))

2018-12-13 00:51:06


In [8]:
# We will use ``torch.device`` objects to move tensors in and out of GPU
if torch.cuda.is_available():
    device = torch.device("cuda")       
else:
    device = torch.device("cpu")       

def get_variable(x):
    return Variable(x).to(device)    
    
print(device, type(device))

cuda <class 'torch.device'>


In [9]:
input_path = "/home/marcus/Heisenberg/digit-recognizer/"
files = [os.path.join(input_path, para) for para in sorted(os.listdir(input_path))]
files

['/home/marcus/Heisenberg/digit-recognizer/sample_submission.csv',
 '/home/marcus/Heisenberg/digit-recognizer/test.csv',
 '/home/marcus/Heisenberg/digit-recognizer/train.csv']

In [10]:
df_train = pd.read_csv(files[2])
df_test  = pd.read_csv(files[1])
df_subms = pd.read_csv(files[0])

df_train.shape, df_test.shape, df_subms.shape

((42000, 785), (28000, 784), (28000, 2))

In [63]:
para = df_test.values.reshape((-1,28,28)).astype(np.uint8)[:,:,:,None]
para.shape, type(para)

((28000, 28, 28, 1), numpy.ndarray)

In [11]:
y_train = df_train.label
x_train = df_train.iloc[:,1:].values.reshape(len(df_train),28,28,1)
x_test  = df_test.values.reshape(len(df_test),28,28,1)

x_train = x_train.astype(np.float32)
x_test  = x_test.astype(np.float32)

x_train.shape, x_test.shape

((42000, 28, 28, 1), (28000, 28, 28, 1))

In [12]:
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.1)
x_train.shape, x_val.shape, y_train.shape, y_val.shape

((37800, 28, 28, 1), (4200, 28, 28, 1), (37800,), (4200,))

In [19]:
batch_size = 32

train_transform = transforms.Compose([
    transforms.Resize(32),
    transforms.ToTensor()
])

train_data = Dataset(torch.tensor(x_train),
                           torch.LongTensor(y_train.values))

train_loader = DataLoader(dataset = train_data,
                         batch_size = batch_size,
                         shuffle = True)

val_data = TensorDataset(torch.tensor(x_val),
                           torch.LongTensor(y_val.values))
val_loader = DataLoader(dataset = val_data,
                         batch_size = batch_size,
                         shuffle = False)

TypeError: object() takes no parameters

In [None]:
class ResidualBlock(nn.Module):
    def __init__(self, inchannel, outchannel, stride=1):
        super(ResidualBlock, self).__init__()
        self.left = nn.Sequential(
            nn.Conv2d(inchannel, outchannel, kernel_size=3, stride=stride, padding=1, bias=False),
            nn.BatchNorm2d(outchannel),
            nn.ReLU(inplace=True),
            nn.Conv2d(outchannel, outchannel, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(outchannel)
        )
        self.shortcut = nn.Sequential()
        if stride != 1 or inchannel != outchannel:
            self.shortcut = nn.Sequential(
                nn.Conv2d(inchannel, outchannel, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(outchannel)
            )

    def forward(self, x):
        out = self.left(x)
        out += self.shortcut(x)
        out = F.relu(out)
        return out

class ResNet(nn.Module):
    def __init__(self, ResidualBlock, num_classes=10):
        super(ResNet, self).__init__()
        self.inchannel = 64
        self.conv1 = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU(),
        )
        self.layer1 = self.make_layer(ResidualBlock, 64,  2, stride=1)
        self.layer2 = self.make_layer(ResidualBlock, 128, 2, stride=2)
        self.layer3 = self.make_layer(ResidualBlock, 256, 2, stride=2)
        self.layer4 = self.make_layer(ResidualBlock, 512, 2, stride=2)
        self.fc = nn.Linear(512, num_classes)

    def make_layer(self, block, channels, num_blocks, stride):
        strides = [stride] + [1] * (num_blocks - 1)   #strides=[1,1]
        layers = []
        for stride in strides:
            layers.append(block(self.inchannel, channels, stride))
            self.inchannel = channels
        return nn.Sequential(*layers)

    def forward(self, x):
        out = self.conv1(x)
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = F.avg_pool2d(out, 4)
        out = out.view(out.size(0), -1)
        out = self.fc(out)
        return out


def ResNet18():
    return ResNet(ResidualBlock)

In [6]:
# 模型定义-ResNet
net = ResNet18().to(device)

# 定义损失函数和优化方式
#损失函数为交叉熵，多用于多分类问题
criterion = nn.CrossEntropyLoss() 

#优化方式为mini-batch momentum-SGD，并采用L2正则化
optimizer = optim.SGD(
    net.parameters(), 
    lr=LR,              # learning rate
    momentum=0.9,       # momentum factor
    weight_decay=5e-4   # weight decay (L2 penalty)
    ) 

In [7]:
def adjust_learning_rate(optimizer, epoch):
    """Sets the learning rate to the initial LR decayed by 2 every epochs"""
    lr = LR * (0.1 ** (epoch // 10))
    
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

In [8]:
# 获取在测试集上的准确率
def test_net(net, test_loader):  
    net.eval()

    total = 0
    acc_loss = 0.0
    acc_correct = 0.0

    for inputs, labels in test_loader:      
        inputs, labels = get_variable(inputs), get_variable(labels)
        
        outputs = net(inputs)
        
        total += labels.size(0)
        loss = criterion(outputs, labels)        
        acc_loss += loss.item()

        _, predicted = torch.max(outputs.data, 1)
        acc_correct += predicted.eq(labels.data).cpu().sum()
        
    return float(acc_loss)/total, float(acc_correct)/total

In [9]:
# 获取在测试集上的准确率
def pred_net(net, test_loader):  
    net.eval()
    res = np.array([])
    for inputs, labels in test_loader:      
        inputs, labels = get_variable(inputs), get_variable(labels)
        
        outputs = net(inputs)
        _, predicted = torch.max(outputs.data, 1)        
        predicted = predicted.to("cpu") 
        res = np.hstack([res, predicted])
    return res        

In [10]:
hist = {"train_loss":[], "train_acc":[],
        "val_loss":[], "val_acc":[], 
        "epoch":[], "lr":[] }

def hist_update(epoch, train_loss, train_acc, val_loss, val_acc, curr_lr):
    hist["epoch"].append(epoch)
    hist["train_loss"].append(train_loss)
    hist["train_acc"].append(train_acc)
    hist["val_loss"].append(val_loss)
    hist["val_acc"].append(val_acc)
    hist["lr"].append(curr_lr)

In [None]:
soa_acc = 0.6
soa_epoch = 0

for epoch in range(pre_epoch, EPOCH):
    print('\nEpoch: %d' % (epoch + 1))
    net.train()
    sum_loss = 0.0
    correct = 0.0
    total = 0.0
    for i, data in enumerate(train_loader, 0):
        # 准备数据
        length = len(train_loader)
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()

        # forward + backward
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        if i%60==59: # 每训练60个batch打印一次loss和准确率
            sum_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += predicted.eq(labels.data).cpu().sum()
            
            train_loss = sum_loss / (i + 1)
            train_acc = float(correct) / total
            
            print(time.strftime("%Y-%m-%d %H:%M:%S")+" - "+'[epoch:%2d, iter:%4d] - Loss: %.03f | Acc: %.2f%% '
                  % (epoch + 1, (i + 1 + epoch * length), train_loss, 100. * train_acc))
        
    # 每训练完一个epoch测试一下准确率
    train_loss, train_acc = test_net(net, train_loader)
    print('Train loss: %.3f; Train Accuracy：%.2f%%' % (train_loss, 100*train_acc))
    val_loss, val_acc = test_net(net, test_loader)
    print('Validation loss: %.3f; Validation Accuracy：%.2f%%' % (val_loss, 100*val_acc))

    if val_acc > soa_acc:
        print("Validation accuracy imporoved from %.2f%% to %.2f%%, Model saved." % (100.*soa_acc, 100.*val_acc))
        torch.save(net.state_dict(), 'model/net_resnet34_best.pth')
        soa_acc = val_acc

    # 返回当前的Learning Rate & 并为下一个epoch调整Learning Rate
    para = optimizer.param_groups[0]
    curr_lr = para["lr"]
    adjust_learning_rate(optimizer, epoch)
        
    hist_update(epoch, train_loss, train_acc, val_loss, val_acc, curr_lr)

print("\nTraining Finished, Total_Epoch: %d" % EPOCH)


Epoch: 1
2018-12-12 16:31:26 - [epoch: 1, iter:  60] - Loss: 0.030 | Acc: 29.69% 
2018-12-12 16:31:32 - [epoch: 1, iter: 120] - Loss: 0.028 | Acc: 34.38% 
2018-12-12 16:31:38 - [epoch: 1, iter: 180] - Loss: 0.027 | Acc: 38.02% 
2018-12-12 16:31:44 - [epoch: 1, iter: 240] - Loss: 0.026 | Acc: 41.80% 
2018-12-12 16:31:50 - [epoch: 1, iter: 300] - Loss: 0.025 | Acc: 44.22% 
2018-12-12 16:31:56 - [epoch: 1, iter: 360] - Loss: 0.024 | Acc: 47.27% 
Train loss: 0.012; Train Accuracy：49.63%
Validation loss: 0.012; Validation Accuracy：49.17%

Epoch: 2
2018-12-12 16:32:18 - [epoch: 2, iter: 451] - Loss: 0.017 | Acc: 64.84% 
2018-12-12 16:32:24 - [epoch: 2, iter: 511] - Loss: 0.017 | Acc: 63.67% 
2018-12-12 16:32:29 - [epoch: 2, iter: 571] - Loss: 0.016 | Acc: 66.93% 
2018-12-12 16:32:35 - [epoch: 2, iter: 631] - Loss: 0.016 | Acc: 67.77% 
2018-12-12 16:32:41 - [epoch: 2, iter: 691] - Loss: 0.016 | Acc: 67.66% 
2018-12-12 16:32:47 - [epoch: 2, iter: 751] - Loss: 0.015 | Acc: 68.88% 
Train loss: 

In [None]:
hist = pd.DataFrame(hist)
hist.plot(x="epoch", y=["train_loss","val_loss"])
hist.plot(x="epoch", y=["train_acc","val_acc"])
hist.plot(x="epoch", y=["lr"])

In [15]:
!nvidia-smi

Thu Dec 13 00:52:06 2018       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 390.77                 Driver Version: 390.77                    |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce GTX 107...  Off  | 00000000:01:00.0 Off |                  N/A |
|  0%   32C    P8    15W / 180W |    437MiB /  8119MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage    

In [18]:
?TensorDataset