###1.tensors使用
Tensor类似NumPy的ndarray，主要区别是Tensor可以在GPU上加速运算。
文档地址：https://pytorch.org/docs/torch

In [1]:
from __future__ import print_function
import torch

# Create an uninitialized 5x3 tensor; torch.empty does not zero the memory,
# so the printed values are arbitrary.
x = torch.empty(5,3)
print(x)



tensor([[1.6273e+15, 4.5850e-41, 2.8768e+13],
        [4.5850e-41, 2.8768e+13, 4.5850e-41],
        [1.6357e+15, 4.5850e-41, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 2.5610e-26]])


In [2]:
# Random 4x4 matrix (torch.rand samples uniformly from [0, 1))
x = torch.rand(4,4)
print(x)

tensor([[0.2891, 0.0370, 0.1284, 0.4624],
        [0.4156, 0.2477, 0.5602, 0.2877],
        [0.8501, 0.1543, 0.1121, 0.4336],
        [0.1453, 0.0318, 0.5344, 0.9588]])


In [3]:
# 4x4 all-zero matrix with dtype long
x = torch.zeros(4,4,dtype=torch.long)
print(x)

tensor([[0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0]])


In [4]:
# Build a tensor directly from a Python list
x = torch.tensor([1.0,2,3,4])
print(x)

tensor([1., 2., 3., 4.])


从一个已有的tensor构建一个tensor。这些方法会重用原来tensor的特征，例如，数据类型，除非提供新的数据。

In [5]:
# new_ones builds a 4x3 tensor of ones from x; the dtype is overridden to double here
x = x.new_ones(4,3,dtype=torch.double)
print(x)

x = torch.randn_like(x,dtype=torch.float) # same size as x, dtype overridden to float
print(x) # the size is unchanged (still 4x3)

tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]], dtype=torch.float64)
tensor([[-0.4087, -0.0469, -1.3417],
        [ 0.9152, -0.2320, -0.9257],
        [ 0.1312,  0.5948, -1.4433],
        [ 0.0054, -0.0315,  0.3638]])


得到tensor形状

In [6]:
# x.size() reports the shape as a torch.Size
print(x.size())

torch.Size([4, 3])


运算

In [7]:
# Addition, written two equivalent ways
y = torch.rand(4,3)

print(x+y)

print(torch.add(x,y))

# Addition that writes its result into a preallocated output tensor

result = torch.empty(4,3)
torch.add(x,y,out= result)
print(result)


tensor([[-0.0480,  0.9206, -0.4493],
        [ 1.1662,  0.2660, -0.3649],
        [ 0.1442,  0.9102, -0.8869],
        [ 0.4269,  0.4418,  0.7345]])
tensor([[-0.0480,  0.9206, -0.4493],
        [ 1.1662,  0.2660, -0.3649],
        [ 0.1442,  0.9102, -0.8869],
        [ 0.4269,  0.4418,  0.7345]])
tensor([[-0.0480,  0.9206, -0.4493],
        [ 1.1662,  0.2660, -0.3649],
        [ 0.1442,  0.9102, -0.8869],
        [ 0.4269,  0.4418,  0.7345]])


in-place加法:y=x+y,结果返回到y上 

In [8]:
# In-place add: y becomes y + x (the trailing underscore marks an in-place op)
y.add_(x)
print(y)

tensor([[-0.0480,  0.9206, -0.4493],
        [ 1.1662,  0.2660, -0.3649],
        [ 0.1442,  0.9102, -0.8869],
        [ 0.4269,  0.4418,  0.7345]])


注意：
任何in-place的运算都会以``_``结尾。 举例来说：``x.copy_(y)``, ``x.t_()``, 会改变 ``x``。

各种类似NumPy的indexing都可以在PyTorch tensor上面使用。

In [9]:
# First two rows, columns 1 onward
print(x[:2,1:])

tensor([[-0.0469, -1.3417],
        [-0.2320, -0.9257]])


Resizing: 如果你希望resize/reshape一个tensor，可以使用Tensor.view方法（例如x.view(16)）：

In [10]:
# Reinterpret the same 16 elements under different shapes
x = torch.randn(4, 4)
y = x.view(16)
z = x.view(-1, 8)  # -1 asks view to infer that dimension (2 here, see the output)
print(x.size(), y.size(),z.size())

torch.Size([4, 4]) torch.Size([16]) torch.Size([2, 8])


如果你有一个只有一个元素的tensor，使用.item()方法可以把里面的value变成Python数值。

In [11]:
# .item() pulls the single element out as a plain Python number
x = torch.randn(1)
print(x)
print(x.item())

tensor([1.3520])
1.3519645929336548


###numpy和tensor之间转换

Torch Tensor和NumPy array会共享内存，所以改变其中一项也会改变另一项。

把Torch Tensor转变成NumPy Array

In [12]:
# Start from a tensor of eight ones
a = torch.ones(8)
print(a)

tensor([1., 1., 1., 1., 1., 1., 1., 1.])


In [13]:
b = a.numpy() # convert the tensor to a NumPy array
print(b) 

[1. 1. 1. 1. 1. 1. 1. 1.]


改变Tensor里面的值，与它共享内存的NumPy array也会随之改变。

In [14]:
# In-place add on the tensor a; the NumPy array b shows the same change
# (see the printed output), because the two share memory.
a.add_(2)
print(a)
print(b)

tensor([3., 3., 3., 3., 3., 3., 3., 3.])
[3. 3. 3. 3. 3. 3. 3. 3.]


把NumPy ndarray转成Torch Tensor

In [15]:
import numpy as np
a = np.ones(5)
b = torch.from_numpy(a)  # tensor created from the NumPy array
np.add(a,2,out = a)  # in-place add on the NumPy array; b reflects it in the output
print (a)
print(b)

[3. 3. 3. 3. 3.]
tensor([3., 3., 3., 3., 3.], dtype=torch.float64)


所有CPU上的Tensor都支持转成numpy或者从numpy转成Tensor。

###CUDA Tensors
使用.to方法，Tensor可以被移动到别的device上。

In [16]:
# Only exercise the GPU path when CUDA is actually available
print(torch.cuda.is_available())

if torch.cuda.is_available():
    device = torch.device("cuda")
    y = torch.ones_like(x,device=device) # create a tensor directly on the GPU
    
    x = x.to(device)  # move x to the GPU; or just use strings ``.to("cuda")``
    z = x + y
    print(z)
    print(z.to("cpu",torch.double)) # move back to the CPU and change the dtype in one call
    


False


### 用numpy实现两层神经网络
一个全连接ReLU神经网络，一个隐藏层，没有bias。用来从x预测y，使用L2 Loss。

这一实现完全使用numpy来计算前向神经网络，loss，和反向传播。

numpy ndarray是一个普通的n维array。它不知道任何关于深度学习或者梯度(gradient)的知识，也不知道计算图(computation graph)，只是一种用来计算数学运算的数据结构。

In [17]:
import numpy as np
import torch

# Problem sizes: N = batch size, D_in = input width,
# H = hidden width, D_out = output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and targets
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Randomly initialised weights (no bias terms)
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6

for t in range(500):
    # 1. Forward pass: linear -> ReLU -> linear
    h = x @ w1
    h_relu = np.maximum(h, 0)
    y_hat = h_relu @ w2

    # 2. Loss: sum of squared errors over the whole batch
    loss = np.square(y_hat - y).sum()
    if t % 10 == 0:
        print(t, loss)

    # 3. Backward pass: hand-derived gradients of the loss w.r.t. w1 and w2
    grad_y_hat = 2.0 * (y_hat - y)
    grad_w2 = h_relu.T @ grad_y_hat
    grad_h_relu = grad_y_hat @ w2.T
    # ReLU passes no gradient where its input was negative
    grad_h = np.where(h < 0, 0, grad_h_relu)
    grad_w1 = x.T @ grad_h

    # Plain gradient-descent step
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2
    


0 38734087.95115929
10 1028952.7474450135
20 233250.72862875566
30 81501.23296350492
40 34329.87579143994
50 16102.555826770851
60 8077.598086757977
70 4236.492468074377
80 2290.8108589729395
90 1266.7427263615245
100 712.5830936990685
110 406.6159627523841
120 234.73764331276638
130 136.78470039612458
140 80.3338901554973
150 47.49719957273632
160 28.247692714320845
170 16.88528283173057
180 10.139165346063715
190 6.113354587369562
200 3.7002303974501483
210 2.247326213442889
220 1.3693091213809163
230 0.8367703081607656
240 0.5127679725172645
250 0.31502559137289415
260 0.1940281529014829
270 0.11979548243856335
280 0.07412833819246836
290 0.04596685855250905
300 0.02856358101070828
310 0.017784367764394883
320 0.011094608368661708
330 0.0069336602301884485
340 0.004341153503151124
350 0.0027226705583642116
360 0.00171046415065889
370 0.0010763392937160182
380 0.0006784098382189036
390 0.00042825060989919997
400 0.0002707328674206863
410 0.0001714089814507499
420 0.000108672014816915

###pytorch:tensors 实现两层神经网络

我们使用PyTorch tensors来创建前向神经网络，计算损失，以及反向传播。

一个PyTorch Tensor很像一个numpy的ndarray。但是它和numpy ndarray最大的区别是，PyTorch Tensor可以在CPU或者GPU上运算。如果想要在GPU上运算，就需要把Tensor换成cuda类型。



In [18]:
import torch

dtype = torch.float
device = torch.device("cpu")
#device = torch.device("cuda:0") # Uncomment this to run on GPU

# Problem sizes: N = batch size, D_in = input width,
# H = hidden width, D_out = output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Every tensor below is created with the same device and dtype
tensor_opts = dict(device=device, dtype=dtype)

# Random inputs and targets
x = torch.randn(N, D_in, **tensor_opts)
y = torch.randn(N, D_out, **tensor_opts)

# Randomly initialised weights (no bias terms)
w1 = torch.randn(D_in, H, **tensor_opts)
w2 = torch.randn(H, D_out, **tensor_opts)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear
    h = x @ w1
    h_relu = torch.clamp(h, min=0)  # clamping at 0 from below is ReLU
    y_pred = h_relu @ w2

    # Loss: sum of squared errors, extracted as a Python float
    loss = (y_pred - y).pow(2).sum().item()
    if t % 10 == 0:
        print(t, loss)

    # Backward pass: hand-derived gradients of the loss w.r.t. w1 and w2
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t() @ grad_y_pred
    grad_h_relu = grad_y_pred @ w2.t()
    # ReLU passes no gradient where its input was negative
    grad_h = grad_h_relu.masked_fill(h < 0, 0)
    grad_w1 = x.t() @ grad_h

    # Update the weights using gradient descent
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2





0 27288992.0
10 1751569.625
20 250924.796875
30 83638.125
40 32961.265625
50 14367.310546875
60 6760.4638671875
70 3382.64208984375
80 1775.517822265625
90 971.0481567382812
100 550.3554077148438
110 321.5736083984375
120 192.9253692626953
130 118.420654296875
140 74.13906860351562
150 47.23534393310547
160 30.550188064575195
170 20.008148193359375
180 13.24726676940918
190 8.853684425354004
200 5.96598482131958
210 4.048614025115967
220 2.7644591331481934
230 1.8979507684707642
240 1.3089643716812134
250 0.9065794944763184
260 0.6299848556518555
270 0.4391378164291382
280 0.3068615198135376
290 0.2149803340435028
300 0.15094204246997833
310 0.1061391681432724
320 0.07475600391626358
330 0.05275623872876167
340 0.0373055674135685
350 0.026431145146489143
360 0.01879294030368328
370 0.013401303440332413
380 0.009597176685929298
390 0.006913910154253244
400 0.005027065984904766
410 0.0036877659149467945
420 0.0027300056535750628
430 0.002042273757979274
440 0.0015446520410478115
450 0.00

简单的autograd

In [19]:
# Leaf tensors whose gradients autograd should track
x = torch.tensor(1., requires_grad=True)
w = torch.tensor(2., requires_grad=True)
b = torch.tensor(3., requires_grad=True)

# Forward computation; this records the graph for y = w * x + b
y = w * x + b

# Backward pass fills in .grad on each leaf
y.backward()

# Print the gradients in order: dy/dx = w, dy/dw = x, dy/db = 1
for leaf in (x, w, b):
    print(leaf.grad)



tensor(2.)
tensor(1.)
tensor(1.)


In [20]:

###pytorch：tensor和autograd
PyTorch的一个重要功能就是autograd，也就是说只要定义了forward pass(前向神经网络)，计算了loss之后，PyTorch可以自动求导计算模型所有参数的梯度。

一个PyTorch的Tensor表示计算图中的一个节点。如果x是一个Tensor并且x.requires_grad=True，那么x.grad是另一个Tensor，储存着x当前的梯度(相对于一个scalar，常常是loss)。



SyntaxError: invalid character in identifier (<ipython-input-20-abdf51769544>, line 2)

In [None]:
import torch

dtype = torch.float
device = torch.device("cpu")

# Problem sizes: batch, input width, hidden width, output width
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and targets
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Weights are created with requires_grad=True so autograd tracks them
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass. The arithmetic matches the hand-written version, but no
    # intermediate needs to be kept for a manual backward pass.
    hidden = (x @ w1).clamp(min=0)
    y_pred = hidden @ w2

    # The loss is a scalar tensor; loss.item() returns it as a Python number.
    loss = (y_pred - y).pow(2).sum()
    if t % 10 == 0:
        print(t, loss.item())

    # Autograd computes d(loss)/d(tensor) for every tensor created with
    # requires_grad=True; afterwards w1.grad and w2.grad hold those gradients.
    loss.backward()

    # Manual gradient-descent step. The update runs under torch.no_grad()
    # because the weights require grad but the update itself must not be
    # recorded in the graph. Each gradient is zeroed right after use, since
    # backward() accumulates into .grad rather than overwriting it.
    with torch.no_grad():
        for weight in (w1, w2):
            weight -= learning_rate * weight.grad
            weight.grad.zero_()
    






###pytorch:nn neural network
这次我们使用PyTorch中nn这个库来构建网络。 用PyTorch autograd来构建计算图和计算gradients， 然后PyTorch会帮我们自动计算gradient。

In [None]:
import torch

# Problem sizes: N = batch size, D_in = input width,
# H = hidden width, D_out = output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and targets
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# nn.Sequential chains modules and applies them in order. Each Linear module
# holds its own weight and bias tensors.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)

# Squared-error loss from the nn package; reduction='sum' adds the errors up
# instead of averaging them.
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
for t in range(500):
    # Forward pass: modules are callable, so the model is applied like a function.
    y_pred = model(x)

    # The loss function takes predictions and targets and returns a tensor.
    loss = loss_fn(y_pred, y)
    if t % 10 == 0:
        print(t, loss.item())

    # Clear old gradients before the backward pass.
    model.zero_grad()

    # Backward pass: computes gradients for the model's learnable parameters.
    loss.backward()

    # Manual gradient-descent step over every parameter, kept out of the graph.
    with torch.no_grad():
        for param in model.parameters():
            param.sub_(learning_rate * param.grad)
            




###pytorch:optim
提供了不同优化算法
这一次我们不再手动更新模型的weights,而是使用optim这个包来帮助我们更新参数。 optim这个package提供了各种不同的模型优化方法，包括SGD+momentum, RMSProp, Adam等等。

In [22]:
import torch

# Problem sizes: N = batch size, D_in = input width,
# H = hidden width, D_out = output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and targets
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Model and loss function from the nn package
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)
loss_fn = torch.nn.MSELoss(reduction='sum')

# The optimizer takes over the weight update. Its first argument lists the
# tensors it is responsible for updating; Adam is used here.
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for t in range(500):
    # Forward pass and loss
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    if t % 10 == 0:
        print(f"{t} {loss.item()}")

    # Clear the gradients (not the parameters) of everything the optimizer
    # updates; backward() accumulates into .grad instead of overwriting it.
    optimizer.zero_grad()

    # Backward pass: gradient of the loss w.r.t. the model parameters
    loss.backward()

    # Apply one optimizer update to the parameters
    optimizer.step()






0 626.4588012695312
10 482.22283935546875
20 376.8345642089844
30 295.9488525390625
40 232.41639709472656
50 181.44467163085938
60 139.82139587402344
70 106.07933807373047
80 78.97282409667969
90 57.4207763671875
100 40.77534866333008
110 28.255468368530273
120 18.949909210205078
130 12.295174598693848
140 7.732590198516846
150 4.720807075500488
160 2.807846784591675
170 1.6349819898605347
180 0.9357205629348755
190 0.5270295739173889
200 0.2925480604171753
210 0.16020400822162628
220 0.08636952191591263
230 0.045787934213876724
240 0.023870572447776794
250 0.012318517081439495
260 0.006234304513782263
270 0.003093733685091138
280 0.0015040909638628364
290 0.0007152080652303994
300 0.00033212205744348466
310 0.0001504787360318005
320 6.648754788329825e-05
330 2.8635076887439936e-05
340 1.2018940651614685e-05
350 4.9112395572592504e-06
360 1.9538356355042197e-06
370 7.556902232863649e-07
380 2.8430241627575015e-07
390 1.0387239512965607e-07
400 3.672523263276162e-08
410 1.25940085027309

###pytorch:自定义nn Modules

我们可以定义一个模型，这个模型继承自nn.Module类。如果需要定义一个比Sequential模型更加复杂的模型，就需要定义nn.Module模型。


In [28]:
import torch
# A custom model is a subclass of nn.Module
class TwoLayerNet(torch.nn.Module):
    """Fully connected network: Linear -> ReLU -> Linear."""

    def __init__(self, D_in, H, D_out):
        """Create the two linear layers.

        D_in, H and D_out are the input, hidden and output widths.
        """
        super().__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """Map an input tensor to the network's prediction tensor."""
        # Clamping at zero from below is the ReLU non-linearity
        hidden = torch.clamp(self.linear1(x), min=0)
        return self.linear2(hidden)


        
        
# Problem sizes: N = batch size, D_in = input width,
# H = hidden width, D_out = output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and targets
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Instantiate the custom module
model = TwoLayerNet(D_in, H, D_out)

# Summed squared-error loss and plain SGD over the model's parameters
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)

for t in range(500):
    # Forward pass and loss
    y_pred = model(x)
    loss = criterion(y_pred, y)
    if t % 10 == 0:
        print(f"{t} {loss.item()}")

    # Clear old gradients, backpropagate, then update the weights
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    

0 638.4453735351562
10 331.0203857421875
20 193.6580047607422
30 110.20072174072266
40 61.1053352355957
50 33.92122268676758
60 19.116411209106445
70 10.992533683776855
80 6.4693827629089355
90 3.8941595554351807
100 2.3940045833587646
110 1.5003877878189087
120 0.9564663767814636
130 0.6193969249725342
140 0.4075735807418823
150 0.2725450098514557
160 0.18477416038513184
170 0.12694892287254333
180 0.088416188955307
190 0.06234872713685036
200 0.04453711956739426
210 0.032178498804569244
220 0.023498348891735077
230 0.01732627861201763
240 0.012887961231172085
250 0.009663806296885014
260 0.0072968280874192715
270 0.005544430110603571
280 0.004235605709254742
290 0.003251543967053294
300 0.002506745047867298
310 0.0019400939345359802
320 0.0015064654871821404
330 0.0011733082355931401
340 0.000916208082344383
350 0.0007171262404881418
360 0.0005624534678645432
370 0.00044195461669005454
380 0.0003478627768345177
390 0.0002742106153164059
400 0.00021644846128765494
410 0.00017106081941

###FizzBuzz
FizzBuzz是一个简单的小游戏。游戏规则如下：从1开始往上数数，当遇到3的倍数的时候，说fizz，当遇到5的倍数，说buzz，当遇到15的倍数，就说fizzbuzz，其他情况下则正常数数。

(1).写一个简单的小程序来决定要返回正常数值还是fizz, buzz 或者 fizzbuzz。

In [30]:
def fizz_buzz_encode(i):
    """Return the FizzBuzz class index for ``i``.

    0 = the number itself, 1 = "fizz", 2 = "buzz", 3 = "fizzbuzz".
    The result is a class index, not a one-hot vector.
    """
    divisible_by_3 = i % 3 == 0
    divisible_by_5 = i % 5 == 0
    if divisible_by_3 and divisible_by_5:
        return 3
    if divisible_by_5:
        return 2
    if divisible_by_3:
        return 1
    return 0

def fizz_buzz_decode(i,prediction):
    """Return the FizzBuzz output for number ``i`` given class index ``prediction``."""
    # Index 0 means "say the number"; the other indices map to the fixed words.
    labels = [str(i), 'fizz', 'buzz', 'fizzbuzz']
    return labels[prediction]

# Round-trip a few numbers through encode/decode: a plain number, a multiple
# of 3, a multiple of 5 and a multiple of 15.
for n in (1, 2, 3, 5, 15):
    print(fizz_buzz_decode(n, fizz_buzz_encode(n)))


1
2
fizz
buzz
fizzbuzz


1.首先定义模型的输入与输出(训练数据)

In [42]:


import numpy as np
import torch

NUM_DIGITS = 10


def binary_encode(i, num_digits):
    """Return the lowest ``num_digits`` binary digits of ``i`` as a NumPy array.

    The least significant bit comes first. The digits are used as the model
    input in place of the raw number.
    """
    bits = []
    for shift in range(num_digits):
        bits.append((i >> shift) & 1)
    return np.array(bits)

# 1. Training data: the integers 101 .. 2**NUM_DIGITS - 1, encoded as binary
#    digits (inputs) and FizzBuzz class indices (targets). The list of
#    ndarrays is stacked into one array first, because building a tensor
#    from a list of ndarrays takes a slow path.
train_numbers = range(101, 2 ** NUM_DIGITS)
trX = torch.tensor(
    np.array([binary_encode(i, NUM_DIGITS) for i in train_numbers]),
    dtype=torch.float32,
)
trY = torch.tensor([fizz_buzz_encode(i) for i in train_numbers], dtype=torch.long)


# 2. Model: one hidden layer, four output classes
NUM_HIDDEN = 100
model = torch.nn.Sequential(
    torch.nn.Linear(NUM_DIGITS,NUM_HIDDEN),
    torch.nn.ReLU(),
    torch.nn.Linear(NUM_HIDDEN,4)
)

# Use CUDA when available. The model and the training tensors must be on the
# same device, otherwise the forward pass raises a device-mismatch error.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
trX = trX.to(device)
trY = trY.to(device)


# FizzBuzz is treated as a 4-way classification problem, so the loss is
# cross entropy, minimised with stochastic gradient descent.
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(),lr = 0.1)

# 3. Training loop: mini-batch SGD over the training set
BATCH_SIZE = 128
for epoch in range(10000):
    for start in range(0,len(trX),BATCH_SIZE):
        end = start + BATCH_SIZE
        batchX = trX[start:end]
        batchY = trY[start:end]
        y_pred = model(batchX)

        loss = loss_fn(y_pred,batchY)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Report the loss on the full training set every 99 epochs. It is only
    # evaluated when it is printed, and without building an autograd graph.
    if epoch % 99 ==0:
        with torch.no_grad():
            loss = loss_fn(model(trX),trY).item()
        print('Epoch:',epoch,'Loss:',loss)
        
        
        
        







Epoch: 0 Loss: 1.1594605445861816
Epoch: 99 Loss: 1.1266257762908936
Epoch: 198 Loss: 1.0895293951034546
Epoch: 297 Loss: 0.9905086159706116
Epoch: 396 Loss: 0.8403555154800415
Epoch: 495 Loss: 0.709020733833313
Epoch: 594 Loss: 0.8986888527870178
Epoch: 693 Loss: 0.7472803592681885
Epoch: 792 Loss: 0.2779087424278259
Epoch: 891 Loss: 0.21065081655979156
Epoch: 990 Loss: 0.15177930891513824
Epoch: 1089 Loss: 0.12456075102090836
Epoch: 1188 Loss: 0.1050400510430336
Epoch: 1287 Loss: 0.0901724323630333
Epoch: 1386 Loss: 0.07855375856161118
Epoch: 1485 Loss: 0.06922419369220734
Epoch: 1584 Loss: 0.06179199367761612
Epoch: 1683 Loss: 0.05551811680197716
Epoch: 1782 Loss: 0.050394997000694275
Epoch: 1881 Loss: 0.04600830376148224
Epoch: 1980 Loss: 0.042291060090065
Epoch: 2079 Loss: 0.03907600790262222
Epoch: 2178 Loss: 0.03617529198527336
Epoch: 2277 Loss: 0.03338903188705444
Epoch: 2376 Loss: 0.031031055375933647
Epoch: 2475 Loss: 0.02892322465777397
Epoch: 2574 Loss: 0.026990056037902832

4.我们用训练好的模型尝试在1到100这些数字上玩FizzBuzz游戏

In [43]:
# Encode the numbers 1..100 and put them on the same device as the model,
# so inference also works when the model has been moved to CUDA.
model_device = next(model.parameters()).device
testX = torch.tensor(
    np.array([binary_encode(i, NUM_DIGITS) for i in range(1, 101)]),
    dtype=torch.float32,
).to(model_device)
with torch.no_grad():  # no gradients are needed for prediction
    # Bring the scores back to the CPU so they can be converted to NumPy later
    testY = model(testX).cpu()

print(testY)
# For each number, pick the class with the highest score
predictions = zip(range(1, 101), testY.argmax(dim=1).tolist())

print([fizz_buzz_decode(i, x) for (i, x) in predictions])






tensor([[ 1.2038e+01, -8.5395e+00, -4.8167e+00,  5.9974e-01],
        [ 6.1152e+00, -2.0043e+00, -4.0197e+00,  1.9991e-01],
        [-2.0113e+00,  6.7697e+00, -3.5793e+00, -1.8476e+00],
        [ 1.0731e+01, -3.0169e-01,  7.3719e-01, -1.1266e+01],
        [-1.1435e+00, -5.6088e+00,  1.0572e+01, -3.7779e+00],
        [-5.4797e+00,  7.7131e+00,  1.4950e+00, -4.1646e+00],
        [ 6.8943e+00, -7.9624e+00,  1.8340e+00, -1.2174e+00],
        [ 7.5730e+00, -2.6080e+00, -1.8484e+00, -3.2641e+00],
        [ 3.5401e+00,  6.1937e+00, -5.8304e+00, -5.1532e+00],
        [ 4.6590e-01, -5.8496e+00,  8.2583e+00, -2.1719e+00],
        [ 8.5198e+00, -5.0074e+00,  6.9479e-01, -4.3416e+00],
        [ 4.1876e-01,  1.0214e+01, -5.7481e+00, -5.3277e+00],
        [ 1.1851e+01, -2.2556e+00, -5.8587e+00, -4.3487e+00],
        [ 1.1219e+01, -5.4680e+00, -5.0573e+00, -2.3178e-01],
        [-3.3820e+00, -1.3165e+00, -4.3443e+00,  9.1054e+00],
        [ 5.5784e+00, -8.0226e+00, -2.7190e-01,  2.5457e+00],
        

In [47]:

# Count how many of the 100 predictions match the true FizzBuzz class
expected = np.array([fizz_buzz_encode(i) for i in range(1, 101)])
correct = testY.max(1)[1].numpy() == expected
print(np.sum(correct))

# Per-number hit/miss mask, shown as the cell's output
correct

97


array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True, False,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True, False,  True,  True,  True,  True,  True,  True,
        True])