# Лекция 3: Библиотеки для глубинного обучения. Примитивы фреймворка PyTorch.

#        Пример обучения нейронной сети в numpy

In [1]:
from time import time

In [2]:
time()

1488116441.429096

In [3]:
# -*- coding: utf-8 -*-
import numpy as np

# Two-layer fully connected network (linear -> ReLU -> linear) trained with
# hand-written gradient descent on random data. t1 and t2 bracket the loop so
# the elapsed wall-clock time can be reported afterwards.

# N - batch size; D_in - input dimension;
# H - hidden dimension; D_out - output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Inputs and targets are drawn from a standard normal distribution.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Weights are drawn from a standard normal distribution as well.
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
t1 = time()
for t in range(500):
    # Forward pass: hidden pre-activation, ReLU activation, output layer.
    h = np.dot(x, w1)
    h_relu = np.maximum(h, 0)
    y_pred = np.dot(h_relu, w2)

    # Sum-of-squares loss over the whole batch.
    residual = y_pred - y
    loss = np.square(residual).sum()
    print(t, loss)

    # Backward pass, applying the chain rule layer by layer.
    grad_y_pred = 2.0 * residual
    grad_w2 = np.dot(h_relu.T, grad_y_pred)
    grad_h_relu = np.dot(grad_y_pred, w2.T)

    # ReLU derivative: no gradient flows where the pre-activation was negative.
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = np.dot(x.T, grad_h)

    # Gradient-descent step on both weight matrices.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2
t2 = time()

(0, 31118960.634845525)
(1, 28986309.007656906)
(2, 32625859.296431243)
(3, 36534370.557906404)
(4, 35436174.930721447)
(5, 26980874.653306477)
(6, 16038472.455092344)
(7, 7921745.7438624324)
(8, 3809572.4481635652)
(9, 2033960.1744381748)
(10, 1283732.6907986884)
(11, 930708.8634522853)
(12, 734286.18606640119)
(13, 605394.9199803808)
(14, 510348.44950401766)
(15, 435637.54003521544)
(16, 374827.26173134742)
(17, 324387.58175882266)
(18, 282051.92592800688)
(19, 246272.17142690078)
(20, 215794.79786549602)
(21, 189684.88588497846)
(22, 167218.34695468962)
(23, 147832.68255369243)
(24, 131018.00410826772)
(25, 116395.72645295241)
(26, 103633.27030852422)
(27, 92461.740984773118)
(28, 82661.936509389867)
(29, 74049.133226521313)
(30, 66457.769516926463)
(31, 59744.935377604648)
(32, 53793.332395702848)
(33, 48520.30119766909)
(34, 43834.296961918793)
(35, 39655.916475319129)
(36, 35921.658141018401)
(37, 32578.407048062916)
(38, 29580.676624200816)
(39, 26889.779220617831)
(40, 24471.89

In [4]:
# Report the wall-clock seconds between t1 and t2; the call form of print
# works on both Python 2 and Python 3.
print(t2 - t1)

1.64606904984


# Первая и основная составляющая типичного современного фреймворка для машинного обучения - Tensor

В интерфейсе базовых операций тензор ничем не отличается от np.array, но при этом тензоры можно эффективно использовать при обучении на GPU.

In [1]:
import torch

In [6]:
# Create an uninitialized 5x3 tensor (the printed values are arbitrary memory contents)
x = torch.Tensor(5, 3)

In [7]:
x


 8.3316e-38  0.0000e+00 -5.5271e-38
 4.5609e-41  2.8817e+24  4.5609e-41
 2.9572e-38  0.0000e+00  2.9572e-38
 0.0000e+00  0.0000e+00  0.0000e+00
 0.0000e+00  0.0000e+00  0.0000e+00
[torch.FloatTensor of size 5x3]

In [8]:
# Initialize a 5x3 tensor with samples from a normal distribution
x = torch.randn(5, 3)

In [9]:
x


-0.2670  0.0724  0.3699
-0.3464 -1.5371 -0.5977
 0.6510  0.0325  0.2199
 0.5227 -0.6069  1.1274
-0.3321 -1.5260  1.7369
[torch.FloatTensor of size 5x3]

In [10]:
x.size()

torch.Size([5, 3])

In [11]:
y = torch.rand(5, 3)

In [12]:
y


 0.9982  0.1485  0.4511
 0.2239  0.6301  0.4746
 0.4650  0.5401  0.4925
 0.2356  0.9363  0.8141
 0.9526  0.8683  0.1060
[torch.FloatTensor of size 5x3]

In [13]:
# First way to add two tensors: the + operator
x + y


 0.7312  0.2209  0.8210
-0.1224 -0.9071 -0.1231
 1.1160  0.5726  0.7123
 0.7582  0.3293  1.9415
 0.6205 -0.6577  1.8429
[torch.FloatTensor of size 5x3]

In [14]:
# Second way to add two tensors: the add method
x.add(y)


 0.7312  0.2209  0.8210
-0.1224 -0.9071 -0.1231
 1.1160  0.5726  0.7123
 0.7582  0.3293  1.9415
 0.6205 -0.6577  1.8429
[torch.FloatTensor of size 5x3]

In [15]:
# Third way: the module-level function
torch.add(x, y)


 0.7312  0.2209  0.8210
-0.1224 -0.9071 -0.1231
 1.1160  0.5726  0.7123
 0.7582  0.3293  1.9415
 0.6205 -0.6577  1.8429
[torch.FloatTensor of size 5x3]

In [16]:
# Write the sum into the preallocated tensor result via the out argument
result = torch.Tensor(5, 3)
torch.add(x, y, out=result)


 0.7312  0.2209  0.8210
-0.1224 -0.9071 -0.1231
 1.1160  0.5726  0.7123
 0.7582  0.3293  1.9415
 0.6205 -0.6577  1.8429
[torch.FloatTensor of size 5x3]

In [17]:
# Converting a numpy array to a torch tensor.
# a is modified in place after b was created, yet the printed b shows the new
# values too: the tensor shares its memory with the array instead of copying it.
a = np.ones(5)
b = torch.from_numpy(a)
np.add(a, 1, out=a)
print(a)
print(b) 

[ 2.  2.  2.  2.  2.]

 2
 2
 2
 2
 2
[torch.DoubleTensor of size 5]



In [18]:
a = torch.randn(5, 3) 
b = torch.randn(3, 4)

In [19]:
# Matrix multiplication: the function form and the method form compute the same product

torch.mm(a,b)
a.mm(b)


 0.0097  0.2683  0.6655  3.5459
-1.3037  1.1624 -2.3810 -5.4057
-3.2868  1.5457 -1.8646 -5.2660
-2.8430  1.6398 -1.0075 -1.0834
 2.1454 -1.7578  0.5441 -2.2567
[torch.FloatTensor of size 5x4]

In [20]:
# для python 3

#a @ b

# предостережение!

В Pytorch пока нет встроенной реализации broadcasting

In [21]:
W = torch.randn(100, 10)
x = torch.randn(1, 100)
b = torch.ones(10)

In [22]:
x.mm(W) + b



Columns 0 to 7 
  6.9189  16.2675   5.6802   7.3530  -9.7121  -5.4097  10.4115  12.2918

Columns 8 to 9 
  8.9905  14.2689
[torch.FloatTensor of size 1x10]

In [23]:
# With 5 input rows x.mm(W) is 5x10 while b has only 10 elements; as the
# output below shows, this PyTorch version raises RuntimeError instead of
# broadcasting b across the rows.
x = torch.randn(5, 100)
x.mm(W) + b

RuntimeError: inconsistent tensor size at /data/users/soumith/miniconda2/conda-bld/pytorch-0.1.9_1487343590888/work/torch/lib/TH/generic/THTensorMath.c:601

In [24]:
# Workaround: repeat the bias once per row of x so both operands of + have the same size

x.mm(W) + b.repeat(x.size(0), 1)



Columns 0 to 7 
 25.1511  18.2276  -7.3328  13.9893  12.1010   6.6206  -6.2555   4.5219
 -3.6613  13.8094  -4.1052  -8.7536   6.3013  -7.3092   4.8149  -9.2200
  2.2263  12.1097  -8.7588  10.8792  -8.4138  13.3245  10.8631   3.5090
 -1.7323   8.0811  19.0527 -13.2555 -19.1833   5.6016  22.9160   1.2069
  6.9110  -5.8484   1.0999  -7.9457 -12.6434  13.1023   3.5895  -4.5340

Columns 8 to 9 
  7.4969  -0.6554
 -2.2016   9.1470
 -4.7548   2.9820
  8.1227 -12.8620
  1.7377   6.7146
[torch.FloatTensor of size 5x10]

Поменяем пару строчек в коде обучения на numpy — и его уже можно запускать и на GPU.

In [3]:
from time import time

In [5]:
# The same two-layer network as the numpy version, written with torch tensors.
# Every tensor is converted to `dtype`, so switching that one line to the CUDA
# tensor type moves the whole computation to the GPU.
dtype = torch.FloatTensor
# dtype = torch.cuda.FloatTensor  # GPU

# N - batch size; D_in - input dimension;
# H - hidden dimension; D_out - output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and targets.
x = torch.randn(N, D_in).type(dtype)
y = torch.randn(N, D_out).type(dtype)

# Randomly initialised weights.
w1 = torch.randn(D_in, H).type(dtype)
w2 = torch.randn(H, D_out).type(dtype)

t1 = time()
learning_rate = 1e-6
for t in range(500):
    # Forward pass; clamp(min=0) is the ReLU activation.
    h = x.mm(w1)
    h_relu = h.clamp(min=0)
    y_pred = h_relu.mm(w2)

    # Sum-of-squares loss over the whole batch.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss)

    # Backward pass, written out by hand.
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    # clone() so that zeroing entries does not modify grad_h_relu itself.
    grad_h = grad_h_relu.clone()
    # ReLU derivative: no gradient where the pre-activation was negative.
    grad_h[h < 0] = 0
    grad_w1 = x.t().mm(grad_h)

    # Gradient-descent step.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2
t2 = time()
# Call form of print: valid on both Python 2 and Python 3.
print(t2 - t1)

(0, 35013889.247129336)
(1, 32601090.006104976)
(2, 31925385.040134504)
(3, 28109765.607548773)
(4, 20709786.573592227)
(5, 12775438.678287297)
(6, 7036610.602534682)
(7, 3832782.9243932953)
(8, 2243875.6232604478)
(9, 1466885.1898228435)
(10, 1061280.0613490958)
(11, 825144.3961604855)
(12, 670673.4630668685)
(13, 559482.1940121669)
(14, 474083.8116201181)
(15, 405958.92519417964)
(16, 350203.270184157)
(17, 303891.2070024442)
(18, 265036.91149549827)
(19, 232226.7443207888)
(20, 204334.70912032752)
(21, 180489.78446389647)
(22, 159986.99036683375)
(23, 142266.05045526847)
(24, 126887.30538930678)
(25, 113479.2810834439)
(26, 101748.07048603022)
(27, 91455.90389786338)
(28, 82389.19987829853)
(29, 74391.40360661584)
(30, 67304.68757790682)
(31, 61008.35605036013)
(32, 55398.09865702223)
(33, 50387.44459377951)
(34, 45899.51647750952)
(35, 41874.610549579345)
(36, 38261.57634348629)
(37, 35012.28608932835)
(38, 32081.654923820606)
(39, 29430.546795348433)
(40, 27030.93484589894)
(41, 2

In [6]:
# The same training loop on the GPU tensor type.
# When CUDA is not available, fall back to the CPU tensor type so the cell
# still runs; the printed timing is then a CPU timing, not a GPU one.
if torch.cuda.is_available():
    dtype = torch.cuda.FloatTensor  # GPU
else:
    dtype = torch.FloatTensor

# N - batch size; D_in - input dimension;
# H - hidden dimension; D_out - output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and targets.
x = torch.randn(N, D_in).type(dtype)
y = torch.randn(N, D_out).type(dtype)

# Randomly initialised weights.
w1 = torch.randn(D_in, H).type(dtype)
w2 = torch.randn(H, D_out).type(dtype)

t1 = time()
learning_rate = 1e-6
for t in range(500):
    # Forward pass; clamp(min=0) is the ReLU activation.
    h = x.mm(w1)
    h_relu = h.clamp(min=0)
    y_pred = h_relu.mm(w2)

    # Sum-of-squares loss over the whole batch.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss)

    # Backward pass, written out by hand.
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    # clone() so that zeroing entries does not modify grad_h_relu itself.
    grad_h = grad_h_relu.clone()
    # ReLU derivative: no gradient where the pre-activation was negative.
    grad_h[h < 0] = 0
    grad_w1 = x.t().mm(grad_h)

    # Gradient-descent step.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2
t2 = time()
# Call form of print: valid on both Python 2 and Python 3.
print(t2 - t1)

RuntimeError: cuda runtime error (8) : invalid device function at /data/users/soumith/miniconda2/conda-bld/pytorch-0.1.9_1487343590888/work/torch/lib/THC/generic/THCTensorMath.cu:35

In [26]:
# Report the wall-clock seconds between t1 and t2; the call form of print
# works on both Python 2 and Python 3.
print(t2 - t1)

0.918998003006


# Но самое важное в фреймворках — графы вычислений и автоматическое дифференцирование

In [27]:
# Variable is a wrapper around a tensor that also holds the gradient values and some other useful information
from torch.autograd import Variable
x = Variable(torch.ones(2, 2), requires_grad = True)
x  

Variable containing:
 1  1
 1  1
[torch.FloatTensor of size 2x2]

In [28]:
x.data


 1  1
 1  1
[torch.FloatTensor of size 2x2]

In [29]:
x.grad

Variable containing:
 0  0
 0  0
[torch.FloatTensor of size 2x2]

In [30]:
# creator is the operation that produced the variable; x was created directly, so it has none
x.creator is None

True

In [31]:
y = x + 2
y

Variable containing:
 3  3
 3  3
[torch.FloatTensor of size 2x2]

In [32]:

y.creator

<torch.autograd._functions.basic_ops.AddConstant at 0x7f245e043d98>

In [33]:
help(torch.autograd.Function)

Help on class Function in module torch.autograd.function:

class Function(torch._C._FunctionBase)
 |  Records operation history and defines formulas for differentiating ops.
 |  
 |  Every operation performed on :class:`Variable` s creates a new function
 |  object, that performs the computation, and records that it happened.
 |  The history is retained in the form of a DAG of functions, with edges
 |  denoting data dependencies (``input <- output``). Then, when backward is
 |  called, the graph is processed in the topological ordering, by calling
 |  :func:`backward` methods of each :class:`Function` object, and passing
 |  returned gradients on to next :class:`Function` s.
 |  
 |  Normally, the only way users interact with functions is by creating
 |  subclasses and defining new operations. This is a recommended way of
 |  extending torch.autograd.
 |  
 |  Since Function logic is a hotspot in most scripts, almost all of it
 |  was moved to our C backend, to ensure that the framewor

In [34]:
z = y * y * 2
z

Variable containing:
 18  18
 18  18
[torch.FloatTensor of size 2x2]

In [35]:
out = z.mean()
out

Variable containing:
 18
[torch.FloatTensor of size 1]

In [36]:
# Run backpropagation starting from out
out.backward()

In [37]:
x.grad

Variable containing:
 3  3
 3  3
[torch.FloatTensor of size 2x2]

# Что произошло?

autograd строит ациклический граф вычислений из переменных и операций (функций).
out.backward проходит по всему графу, начиная от вершины out, и считает градиенты вершин.

In [38]:
class MyReLU(torch.autograd.Function):
    """Custom autograd function implementing the ReLU activation.

    forward clamps negative entries to zero. backward passes the incoming
    gradient through unchanged, except at positions where the saved input
    was negative, where the gradient is set to zero.
    """

    def forward(self, input):
        """Return input with negative entries replaced by zero."""
        # Keep the input around: backward needs it to find the negative entries.
        self.save_for_backward(input)
        activated = input.clamp(min=0)
        return activated

    def backward(self, grad_output):
        """Return the gradient with respect to the input of forward."""
        (saved_input,) = self.saved_tensors
        negative = saved_input < 0
        # Work on a copy so the caller's grad_output is left untouched.
        grad_input = grad_output.clone()
        grad_input[negative] = 0
        return grad_input


Если мы хотим сохранить значения переменных в графе, то используем retain_variables = True. 
Это может быть нужно, если мы хотим несколько раз подряд сделать backprop

In [39]:
x = Variable(torch.ones(2, 2), requires_grad = True)
y = x + 2
y.backward(torch.ones(2, 2))
x.grad


Variable containing:
 1  1
 1  1
[torch.FloatTensor of size 2x2]

In [40]:
gradient = torch.randn(2, 2)

y.backward(gradient)

x.grad

Variable containing:
 2.1863  1.3897
 1.9748  0.3825
[torch.FloatTensor of size 2x2]

In [41]:
# Rebuild the graph and call backward with retain_variables=True, which keeps
# the values stored in the graph so that backward can be called on y again.
x = Variable(torch.ones(2, 2), requires_grad = True)
y = x + 2
# y is not a scalar, so the gradient with respect to y is passed in explicitly.
y.backward(torch.ones(2, 2), retain_variables=True)
x.grad


Variable containing:
 1  1
 1  1
[torch.FloatTensor of size 2x2]

In [42]:
gradient = torch.randn(2, 2)

y.backward(gradient)

x.grad

Variable containing:
 0.2347  0.3023
-0.7550  1.8709
[torch.FloatTensor of size 2x2]

# Снова вернемся к исходной двухслойной сети

In [44]:
import torch
from torch.autograd import Variable

# The same two-layer network, now with gradients computed by autograd instead
# of the hand-written backward pass.
dtype = torch.FloatTensor
#dtype = torch.cuda.FloatTensor # GPU

# N - batch size; D_in - input dimension;
# H - hidden dimension; D_out - output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10


# Inputs and targets: no gradients are needed with respect to the data.
x = Variable(torch.randn(N, D_in).type(dtype), requires_grad=False)
y = Variable(torch.randn(N, D_out).type(dtype), requires_grad=False)


# Weights: requires_grad=True so that backward() fills in their .grad.
w1 = Variable(torch.randn(D_in, H).type(dtype), requires_grad=True)
w2 = Variable(torch.randn(H, D_out).type(dtype), requires_grad=True)

t1 = time()
learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU (clamp at 0) -> linear.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Sum-of-squares loss; loss.data[0] extracts the number for printing.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.data[0])
    
    # Zero the gradients before the backward pass.
    # NOTE(review): this assumes .grad already exists before the first
    # backward() call, as the x.grad output earlier in this notebook
    # suggests for this PyTorch version - confirm on other versions.
    w1.grad.data.zero_()
    w2.grad.data.zero_()

    loss.backward()

    # Gradient-descent step applied directly to the underlying tensors.
    w1.data -= learning_rate * w1.grad.data
    w2.data -= learning_rate * w2.grad.data
t2 = time()

(0, 34765560.0)
(1, 32018410.0)
(2, 32139958.0)
(3, 29572438.0)
(4, 22981848.0)
(5, 14632168.0)
(6, 8165254.0)
(7, 4402506.5)
(8, 2550213.5)
(9, 1660576.75)
(10, 1207646.0)
(11, 947982.375)
(12, 778737.125)
(13, 656311.0)
(14, 561535.6875)
(15, 485094.90625)
(16, 421942.25)
(17, 369043.09375)
(18, 324302.90625)
(19, 286191.125)
(20, 253461.84375)
(21, 225251.78125)
(22, 200827.734375)
(23, 179603.28125)
(24, 161029.703125)
(25, 144752.984375)
(26, 130422.921875)
(27, 117763.390625)
(28, 106607.0)
(29, 96689.2578125)
(30, 87852.7421875)
(31, 79961.9609375)
(32, 72906.7265625)
(33, 66570.96875)
(34, 60881.53125)
(35, 55750.91015625)
(36, 51116.375)
(37, 46924.109375)
(38, 43124.81640625)
(39, 39677.63671875)
(40, 36542.8515625)
(41, 33686.5234375)
(42, 31083.400390625)
(43, 28707.677734375)
(44, 26536.044921875)
(45, 24549.94921875)
(46, 22731.83984375)
(47, 21065.1875)
(48, 19534.98046875)
(49, 18129.05859375)
(50, 16836.48046875)
(51, 15646.4453125)
(52, 14550.3330078125)
(53, 13540.36

In [45]:
# Report the wall-clock seconds between t1 and t2; the call form of print
# works on both Python 2 and Python 3.
print(t2 - t1)

0.96022105217


# Наконец, во многих фреймворках базовые слои нейронных сетей уже реализованы. Прямо как в первом домашнем задании!

In [47]:
from torch.autograd import Variable

# Fit a single linear layer built from torch.nn components, still updating
# the parameters by hand.
# N - batch size; D_in - input dimension; D_out - output dimension.
N, D_in, D_out = 64, 1000, 10

# Random inputs and targets.
x = Variable(torch.randn(N, D_in))
y = Variable(torch.randn(N, D_out), requires_grad=False)

# The model is one Linear layer wrapped in a Sequential container.
model = torch.nn.Sequential(
          torch.nn.Linear(D_in, D_out)
        )

# NOTE(review): size_average=False presumably makes the loss a sum over all
# elements rather than a mean - confirm against the MSELoss documentation.
loss_fn = torch.nn.MSELoss(size_average=False)

learning_rate = 1e-4
for t in range(500):
    # Forward pass through the model.
    y_pred = model(x)

    loss = loss_fn(y_pred, y)
    print(t, loss.data[0])

    # Clear the gradients of all model parameters before the backward pass.
    model.zero_grad()

    loss.backward()

    # Manual gradient-descent step over every parameter of the model.
    for param in model.parameters():
        param.data -= learning_rate * param.grad.data

(0, 951.4027709960938)
(1, 606.1792602539062)
(2, 391.9593200683594)
(3, 257.03448486328125)
(4, 170.8016357421875)
(5, 114.90416717529297)
(6, 78.17976379394531)
(7, 53.7443733215332)
(8, 37.29325866699219)
(9, 26.096704483032227)
(10, 18.400386810302734)
(11, 13.062150955200195)
(12, 9.329204559326172)
(13, 6.699503421783447)
(14, 4.834671497344971)
(15, 3.5043206214904785)
(16, 2.550144910812378)
(17, 1.8624448776245117)
(18, 1.3646272420883179)
(19, 1.0028300285339355)
(20, 0.7389411926269531)
(21, 0.5458341240882874)
(22, 0.40410116314888)
(23, 0.29979175329208374)
(24, 0.22283178567886353)
(25, 0.16592030227184296)
(26, 0.1237458884716034)
(27, 0.0924309492111206)
(28, 0.06913767755031586)
(29, 0.05178238824009895)
(30, 0.03883099555969238)
(31, 0.029152261093258858)
(32, 0.02190937101840973)
(33, 0.016482485458254814)
(34, 0.012411660514771938)
(35, 0.009354541078209877)
(36, 0.007056237664073706)
(37, 0.0053268191404640675)
(38, 0.0040243836119771)
(39, 0.0030424322467297316)
(

In [48]:
loss_fn(model(x), y)

Variable containing:
1.00000e-12 *
  2.4930
[torch.FloatTensor of size 1]

# А еще у нас уже есть готовые оптимизаторы, такие как GD, SGD, Adam и т. д.

In [50]:
# Same single-layer model, with the parameter update delegated to an
# optimizer from torch.optim instead of being written by hand.
# N - batch size; D_in - input dimension; D_out - output dimension.
N, D_in, D_out = 64, 1000, 10

# Random inputs and targets.
x = Variable(torch.randn(N, D_in))
y = Variable(torch.randn(N, D_out), requires_grad=False)

model = torch.nn.Sequential(
          torch.nn.Linear(D_in, D_out),

        )
loss_fn = torch.nn.MSELoss(size_average=False)

learning_rate = 1e-4
# The optimizer receives the parameters it is responsible for updating.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for t in range(500):
    y_pred = model(x)

    loss = loss_fn(y_pred, y)
    print(t, loss.data[0])

    # Clear the gradients of the optimized parameters before the backward pass.
    optimizer.zero_grad()

    loss.backward()

    # Let the optimizer apply its update rule to the parameters.
    optimizer.step()

(0, 929.259521484375)
(1, 913.812744140625)
(2, 898.5112915039062)
(3, 883.359130859375)
(4, 868.3568725585938)
(5, 853.5076904296875)
(6, 838.812744140625)
(7, 824.2747802734375)
(8, 809.8948364257812)
(9, 795.6734619140625)
(10, 781.6139526367188)
(11, 767.7159423828125)
(12, 753.981689453125)
(13, 740.4117431640625)
(14, 727.0078125)
(15, 713.7693481445312)
(16, 700.6981201171875)
(17, 687.7933959960938)
(18, 675.0564575195312)
(19, 662.4876708984375)
(20, 650.0873413085938)
(21, 637.8545532226562)
(22, 625.7894897460938)
(23, 613.8923950195312)
(24, 602.162841796875)
(25, 590.6002197265625)
(26, 579.2041015625)
(27, 567.9739379882812)
(28, 556.9091796875)
(29, 546.0087280273438)
(30, 535.2716064453125)
(31, 524.697265625)
(32, 514.2847900390625)
(33, 504.0326843261719)
(34, 493.9402160644531)
(35, 484.0062561035156)
(36, 474.22906494140625)
(37, 464.6074523925781)
(38, 455.140380859375)
(39, 445.8262634277344)
(40, 436.6641845703125)
(41, 427.6524353027344)
(42, 418.7889709472656)


In [51]:
# Other optimizer classes from torch.optim, bound to short names (the classes themselves, not instances)
sgd = torch.optim.SGD
adadelta = torch.optim.Adadelta
adagrad = torch.optim.Adagrad
rmsprop = torch.optim.RMSprop

In [53]:
x[0, :]

Variable containing:
-1.2541
 0.0638
-0.1823
   ⋮   
 2.9278
 0.8388
 0.6463
[torch.FloatTensor of size 1000]