# Распознавание рукописных цифр
В данной работе я напишу нейронную сеть (CNN) на базе PyTorch для распознавания собственных рукописных цифр. Сложность в том, что сеть, обученная на MNIST, показывает на моих цифрах низкие результаты.

In [1]:
import os
# Show the directory the notebook runs from; the relative paths used in
# later cells ("dataset/...", "images/...") are resolved against it.
print(f"Current working directory: {os.getcwd()}")
# Script-only alternative (a notebook has no __file__):
# directory = os.path.join(os.path.dirname(__file__), "my_folder")

Current working directory: c:\Python3\PyIntel\img_transform\Digits


In [2]:
import os
import json
from PIL import Image

import torch
import torch.utils.data as data
import torchvision.transforms.v2 as tfs
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from torchvision.datasets import ImageFolder
# from torchvision import models

In [3]:
# Choose the compute backend in order of preference: CUDA, then Intel XPU,
# then CPU. The hasattr guard covers PyTorch builds without a torch.xpu module;
# the conditional expression is lazy, so XPU is only probed when CUDA is absent.
device = torch.device(
    "cuda" if torch.cuda.is_available()
    else "xpu" if hasattr(torch, "xpu") and torch.xpu.is_available()
    else "cpu"
)

print("Using device:", device)

Using device: xpu


## Model
Здесь приведена модель, которая по результатам тестов оказалась наиболее эффективной для моих рукописных цифр.

In [4]:
def _conv_bn_relu(in_channels, out_channels):
    """Return [3x3 same-padding conv, batch norm, ReLU] as a flat layer list.

    The convolution has no bias because the following BatchNorm2d has its
    own learnable shift.
    """
    return [
        nn.Conv2d(in_channels, out_channels=out_channels, kernel_size=3,
                  stride=1, padding=1, bias=False),
        nn.BatchNorm2d(out_channels),
        nn.ReLU(inplace=True),
    ]


def _linear_bn_relu(in_features, out_features):
    """Return [bias-free linear layer, batch norm, ReLU] as a flat layer list."""
    return [
        nn.Linear(in_features, out_features, bias=False),
        nn.BatchNorm1d(out_features),
        nn.ReLU(inplace=True),
    ]


# The helper lists are unpacked into ONE flat Sequential so that layer indices
# (and therefore state_dict keys such as "0.weight" ... "21.bias") stay stable.
conv_model2 = nn.Sequential(
    # Feature extractor, stage 1: 3 -> 16 -> 16 channels, then halve H and W.
    *_conv_bn_relu(3, 16),
    *_conv_bn_relu(16, 16),
    nn.MaxPool2d(2),

    # Feature extractor, stage 2: 16 -> 32 -> 20 channels, then halve again.
    *_conv_bn_relu(16, 32),
    *_conv_bn_relu(32, 20),
    nn.MaxPool2d(2),  # (batch, 20, 7, 7) for 28x28 inputs

    # Classifier head: 980 -> 16 -> 16 -> 10 class scores (raw logits).
    nn.Flatten(1),  # (batch, 980)
    *_linear_bn_relu(20 * 7 * 7, 16),
    *_linear_bn_relu(16, 16),
    nn.Linear(16, 10),
)

In [5]:
# Move the network to the selected device; Module.to() returns the module
# itself, so `model` and `conv_model2` refer to the same object.
model = conv_model2.to(device)
model  # last expression: the notebook prints the layer summary

Sequential(
  (0): Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): ReLU(inplace=True)
  (3): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (4): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (5): ReLU(inplace=True)
  (6): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (7): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (8): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (9): ReLU(inplace=True)
  (10): Conv2d(32, 20, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (11): BatchNorm2d(20, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (12): ReLU(inplace=True)
  (13): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (14): Flatten(start_dim=1, 

### формирование обучающих данных

In [6]:
# Preprocessing shared by the train and test sets: PIL image -> tensor image,
# then float32 with value scaling enabled (scale=True).
transforms = tfs.Compose(
    [
        tfs.ToImage(),
        tfs.ToDtype(torch.float32, scale=True),
    ]
)

# ImageFolder derives the class label from the sub-directory an image lives in.
d_train = ImageFolder(root="dataset/train", transform=transforms)
train_data = data.DataLoader(dataset=d_train, batch_size=32, shuffle=True)

# Sanity check: pull one batch and look at the shape of a single sample.
x_, y_ = next(iter(train_data))
x_[0].shape

torch.Size([3, 28, 28])

### train 
обучение и сохранение весов

In [7]:
# Training setup: Adam optimizer and cross-entropy loss applied to the
# network's raw outputs (the model ends in a plain Linear layer).
optimizer = optim.Adam(params=model.parameters(), lr=0.001)
loss_function = nn.CrossEntropyLoss()
epochs = 30
model.train()  # training mode for the BatchNorm layers

for _e in range(epochs):
    loss_mean = 0

    train_tqdm = tqdm(train_data, leave=True)
    # lm_count is the 1-based number of batches seen so far in this epoch.
    for lm_count, (x_train, y_train) in enumerate(train_tqdm, start=1):
        x_train, y_train = x_train.to(device), y_train.to(device)

        # Standard step: clear old gradients, forward, backward, update.
        optimizer.zero_grad()
        predict = model(x_train)
        loss = loss_function(predict, y_train)
        loss.backward()
        optimizer.step()

        # Incremental mean of the per-batch loss over the current epoch.
        loss_mean = 1/lm_count * loss.item() + (1 - 1/lm_count) * loss_mean
        train_tqdm.set_description(f"Epoch [{_e+1}/{epochs}], loss_mean={loss_mean:.3f}")

Epoch [1/30], loss_mean=0.251: 100%|██████████| 1875/1875 [00:43<00:00, 43.32it/s]
Epoch [2/30], loss_mean=0.057: 100%|██████████| 1875/1875 [00:39<00:00, 47.16it/s]
Epoch [3/30], loss_mean=0.044: 100%|██████████| 1875/1875 [00:40<00:00, 46.36it/s]
Epoch [4/30], loss_mean=0.037: 100%|██████████| 1875/1875 [00:39<00:00, 47.06it/s]
Epoch [5/30], loss_mean=0.032: 100%|██████████| 1875/1875 [00:39<00:00, 47.08it/s]
Epoch [6/30], loss_mean=0.028: 100%|██████████| 1875/1875 [00:40<00:00, 46.79it/s]
Epoch [7/30], loss_mean=0.025: 100%|██████████| 1875/1875 [00:39<00:00, 46.97it/s]
Epoch [8/30], loss_mean=0.023: 100%|██████████| 1875/1875 [00:39<00:00, 47.25it/s]
Epoch [9/30], loss_mean=0.021: 100%|██████████| 1875/1875 [00:39<00:00, 46.98it/s]
Epoch [10/30], loss_mean=0.019: 100%|██████████| 1875/1875 [00:39<00:00, 47.03it/s]
Epoch [11/30], loss_mean=0.018: 100%|██████████| 1875/1875 [00:39<00:00, 47.15it/s]
Epoch [12/30], loss_mean=0.017: 100%|██████████| 1875/1875 [00:39<00:00, 47.21it/s]
E

## Restore!

In [8]:
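# Save the trained weights (uncomment to use).
# NOTE: Q is the test accuracy computed in the test cell below, so run that cell first.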
# torch.save(model.state_dict(), f'model_1_weight_{int(Q*10000)}.pth')

In [9]:
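# Restore previously saved weights (uncomment to use); map_location=device keeps this portable across cuda/xpu/cpu.
# model.load_state_dict(torch.load('model_weight_9930.pth', map_location=device))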

### test
результаты на тестовой выборке (dataset/test)

In [10]:
# Test split, preprocessed exactly like the training data.
d_test = ImageFolder("dataset/test", transform=transforms)
test_data = data.DataLoader(d_test, batch_size=500, shuffle=False)

# Evaluate the trained network: eval mode for BatchNorm, no gradient tracking.
model.eval()

correct = 0
with torch.no_grad():
    for x_test, y_test in test_data:
        x_test = x_test.to(device)
        y_test = y_test.to(device)
        # Predicted class = index of the largest output score per sample.
        predicted = model(x_test).argmax(dim=1)
        correct += (predicted == y_test).sum().item()

# Q is the accuracy: the fraction of test images classified correctly.
Q = correct / len(d_test)
print(Q)

0.993


In [11]:
# Preprocessing for my own digit images so that they resemble the training data.
_own_digit_steps = [
    tfs.ToImage(),
    tfs.Resize((28, 28)),                    # match the 28x28 training resolution
    tfs.RandomInvert(p=1.0),                 # p=1.0: always invert the colors
    tfs.ToDtype(torch.float32, scale=True),  # same dtype/scaling as the training transform
]
transform = tfs.Compose(_own_digit_steps)
# transform(img).shape

In [12]:
# img = Image.open("images/im_2.png")
# img = img.convert("RGB")
# img = img.resize((28, 28))
# # tr = tfs.Compose([tfs.RandomInvert(p=1.0), tfs.Grayscale()])
# tr = tfs.Compose([tfs.RandomInvert(p=1.0)])
# img = tr(img)
# img = transform(img)
# img = img.to(device)
# torch.argmax(model(img.unsqueeze(0)).squeeze())

### test
тест на моих цифрах

In [13]:
# Classify my own digit images images/im_0.png ... images/im_9.png.
# res_test maps the file index i to the predicted class
# (presumably im_<i>.png shows the digit i -- confirm against the image files).

# Make this cell independent of cell execution order: with a single-image batch
# BatchNorm1d raises an error in training mode, and BatchNorm2d would overwrite
# its running statistics during inference.
model.eval()

res_test = {}
with torch.no_grad():
    for i in range(10):
        # The context manager closes the file handle; convert() returns a
        # fully loaded copy that stays valid after the file is closed.
        with Image.open(f"images/im_{i}.png") as img:
            rgb_img = img.convert("RGB")
        tensor_img = transform(rgb_img).to(device)
        # Add a batch dimension for the model, then drop it from the output.
        res = model(tensor_img.unsqueeze(0)).squeeze(0)
        res_test[i] = torch.argmax(res).item()

res_test

{0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9}

Просмотр уверенности модели в предсказании

In [14]:
# Inspect how confident the model is in each prediction.
# The printed tensors are raw logits (the model ends in a Linear layer);
# apply .softmax(dim=0) to a printed vector to read it as probabilities.

# Make this cell independent of cell execution order: with a single-image batch
# BatchNorm1d raises an error in training mode, and BatchNorm2d would overwrite
# its running statistics during inference.
model.eval()

res_test = {}
with torch.no_grad():
    for i in range(10):
        # The context manager closes the file handle; convert() returns a
        # fully loaded copy that stays valid after the file is closed.
        with Image.open(f"images/im_{i}.png") as img:
            rgb_img = img.convert("RGB")
        tensor_img = transform(rgb_img).to(device)
        # Add a batch dimension for the model, then drop it from the output.
        res = model(tensor_img.unsqueeze(0)).squeeze(0)
        print(i, res)
        res_test[i] = torch.argmax(res).item()

0 tensor([  0.7973,  -6.5548,  -7.0180, -18.9206,  -8.3460,  -6.1676,  -2.9512,
         -2.2850, -13.6281, -11.0323], device='xpu:0')
1 tensor([-13.1169,   4.6537, -13.5996, -12.4463, -14.4070,  -8.9870, -13.2534,
          0.3256, -14.2725, -11.6112], device='xpu:0')
2 tensor([-10.5356,  -7.0368,   9.8484,  -8.2798,  -9.3443, -17.9443, -11.4218,
          0.4944, -13.3453,  -9.4240], device='xpu:0')
3 tensor([-12.5134,  -2.8164, -10.5482,  11.2100, -13.8626, -10.3312, -14.5341,
         -3.5596, -14.6161, -13.2917], device='xpu:0')
4 tensor([-10.2602,  -8.4368,  -4.6739, -13.8474,   5.7549,  -6.1382,  -7.3269,
         -3.9511,  -9.6605,  -3.0228], device='xpu:0')
5 tensor([-10.7049,  -5.1367,  -8.9492,  -6.9515, -14.6891,   7.1869,  -5.7139,
        -10.5207,  -6.8137,  -7.7989], device='xpu:0')
6 tensor([ -7.3206, -10.6756,  -1.5794, -11.5748,  -9.0165,   0.5516,   8.2982,
         -8.7794, -10.8787, -11.5371], device='xpu:0')
7 tensor([-18.8902,  -5.5793,  -8.1804,  -7.5899, -12.1

В результате ряда экспериментов подобрана модель и приведен пример распознавания цифр не из обучающей выборки