In [1]:
from mxnet import ndarray as nd
from mxnet import autograd

In [2]:
# Synthetic linear-regression data set: targets are a known linear function of
# the features (true_w, true_b) plus a small random perturbation, so the
# learned parameters can later be compared against the ground truth.
# NOTE(review): no RNG seed is set in the visible cells, so the sampled values
# (and every recorded output below) will presumably differ between runs.
num_inputs=2 
num_examples=1000
true_w=[2,-3.4]
true_b=4.2
# Feature matrix of shape (num_examples, num_inputs), sampled with nd.random_normal.
X=nd.random_normal(shape=(num_examples,num_inputs)) 
# Noise-free targets: one true weight per feature column, plus the true bias.
y=true_w[0]*X[:,0]+true_w[1]*X[:,1]+true_b
# Add random noise scaled by 0.01; y keeps its 1-D shape (num_examples,).
y+=0.01*nd.random_normal(shape=y.shape)

In [3]:
y.shape

(1000,)

In [4]:
print(X[0],y[0])


[1.1630787 0.4838046]
<NDArray 2 @cpu(0)> 
[4.879625]
<NDArray 1 @cpu(0)>


In [5]:
import random 
batch_size=10 
def data_iter():
    """Yield mini-batches ``(features, labels)`` covering the data set once.

    The example indices ``0 .. num_examples - 1`` are shuffled on every call,
    then consumed in consecutive chunks of ``batch_size``. Each chunk is used
    to gather the matching rows of the module-level ``X`` and entries of ``y``
    with ``nd.take``. The final chunk is shorter when ``num_examples`` is not
    a multiple of ``batch_size``.
    """
    # Build a random visiting order over all examples.
    order = list(range(num_examples))
    random.shuffle(order)
    for start in range(0, num_examples, batch_size):
        # Slicing already stops at the end of the list, so no explicit clamp
        # of the upper bound is required.
        batch_idx = nd.array(order[start:start + batch_size])
        yield nd.take(X, batch_idx), nd.take(y, batch_idx)

下面的代码完整遍历一次数据迭代器，并统计随机数据块的个数；循环结束后，`data` 和 `label` 中保存的是最后一个数据块。

In [6]:
# Walk through one full shuffled pass and count the mini-batches it yields.
# `data` and `label` intentionally stay bound to the last batch once the loop
# finishes: later cells reuse `data` as a sample input.
n = 0
for n, (data, label) in enumerate(data_iter(), start=1):
    pass
print(n)

100


In [7]:
# Weight parameter: a (num_inputs, 1) column, randomly initialised with nd.random_normal.
w=nd.random_normal(shape=(num_inputs,1))

In [8]:
w


[[ 1.4642214]
 [-1.3058136]]
<NDArray 2x1 @cpu(0)>

In [9]:
# Bias parameter: a length-1 array initialised to zero.
b=nd.zeros((1,))

In [10]:
b


[0.]
<NDArray 1 @cpu(0)>

In [11]:
# Collect the trainable parameters in one list so they can be iterated over together.
params=[w,b]

In [12]:
params

[
 [[ 1.4642214]
  [-1.3058136]]
 <NDArray 2x1 @cpu(0)>,
 
 [0.]
 <NDArray 1 @cpu(0)>]

# 定义模型

线性模型，就是将输入和模型的权重做乘法再加上偏移：$\hat{y} = Xw + b$

In [13]:
# Attach a gradient buffer to every parameter so that a later backward() call
# can store its gradient in `param.grad` (read by the optimiser step).
for param in params:
    param.attach_grad()

In [14]:
def net(X):
    """Linear model: return ``nd.dot(X, w) + b``.

    ``w`` and ``b`` are the module-level parameters. The parameter ``X``
    shadows the module-level feature matrix of the same name inside this
    function. With ``w`` of shape ``(num_inputs, 1)``, a batch of shape
    ``(n, num_inputs)`` produces an ``(n, 1)`` result.
    """
    weighted = nd.dot(X, w)
    return weighted + b

# 损失函数 
我们使用常见的平方误差来衡量预测目标和真实目标之间的差距。

In [15]:
def square_loss(yhat, y):
    """Return the element-wise squared error between ``yhat`` and ``y``.

    ``y`` is first reshaped to the shape of ``yhat`` so the subtraction is
    strictly element-wise and never triggers implicit broadcasting. The
    result has the same shape as ``yhat``; no sum or mean is applied.
    """
    target = y.reshape(yhat.shape)
    residual = yhat - target
    return residual ** 2

In [16]:
data 


[[-0.61562586  1.1182038 ]
 [ 1.8987794  -1.1283919 ]
 [ 0.72018015  0.9385755 ]
 [-0.6079571   0.05669585]
 [-0.25059152 -0.06146848]
 [-1.8885834   0.6752996 ]
 [-0.04796145 -0.912849  ]
 [-0.4337191   1.3150975 ]
 [ 0.14341475  0.42924684]
 [ 1.8307649  -1.1468065 ]]
<NDArray 10x2 @cpu(0)>

In [17]:
net(data)


[[-2.3615782 ]
 [ 4.2537026 ]
 [-0.17110145]
 [-0.964218  ]
 [-0.28665507]
 [-3.6471195 ]
 [ 1.1217844 ]
 [-2.3523328 ]
 [-0.3505254 ]
 [ 4.1781607 ]]
<NDArray 10x1 @cpu(0)>

# 优化 
虽然线性回归有显式解，但绝大部分模型并没有，所以我们这里通过随机梯度下降来求解。
每一步，我们将模型参数沿着梯度的反方向走特定距离，这个距离一般叫做学习率。
（我们之后会一直使用这个函数，因此将其保存在 `utils.py` 中。）

In [18]:
def SGD(params, lr):
    """Apply one gradient-descent step to every parameter, in place.

    Each parameter is overwritten with ``param - lr * param.grad``. The new
    value is written through slice assignment into the existing array rather
    than rebinding the name, so callers holding a reference to the parameter
    see the update. Returns ``None``.
    """
    for p in params:
        step = lr * p.grad
        p[:] = p - step

# 训练 
现在我们可以开始训练了。训练通常需要迭代数据数次，在一次迭代里，我们每次随机读取固定数个
数据点，计算梯度并更新模型参数。


In [19]:
# Mini-batch SGD training: `epochs` full passes over the shuffled data set.
# NOTE(review): the recorded run below ends at an average loss of about 0.5,
# with w and b still short of true_w / true_b; a larger learning rate or more
# epochs looks necessary for convergence — confirm before relying on the result.
epochs=10
learning_rate=0.0001 
for e in range(epochs):
    # Running sum of the per-example losses seen during this epoch.
    total_loss=0 
    for data,label in data_iter():
        # Record the forward pass so autograd can differentiate through it.
        with autograd.record():
            output=net(data) 
            loss=square_loss(output,label) 
        # `loss` holds one value per example rather than a scalar; presumably
        # backward() without an explicit head gradient differentiates the sum
        # of its elements — TODO confirm against the MXNet autograd docs.
        loss.backward()
        # One gradient step on every entry of `params`, using the gradients
        # that were just computed.
        SGD(params,learning_rate) 
        # Reduce the batch loss to a Python number and accumulate it.
        total_loss+=nd.sum(loss).asscalar()
    # Report the mean loss per example over the whole epoch.
    print("Epoch %d , average loss : %f " % (e,total_loss/num_examples)) 

Epoch 0 , average loss : 18.376279 
Epoch 1 , average loss : 12.330371 
Epoch 2 , average loss : 8.272645 
Epoch 3 , average loss : 5.551243 
Epoch 4 , average loss : 3.725228 
Epoch 5 , average loss : 2.500241 
Epoch 6 , average loss : 1.677955 
Epoch 7 , average loss : 1.126274 
Epoch 8 , average loss : 0.755946 
Epoch 9 , average loss : 0.507494 


In [20]:
true_w,w 

([2, -3.4],
 
 [[ 1.9509478]
  [-3.1094334]]
 <NDArray 2x1 @cpu(0)>)

In [21]:
true_b , b 

(4.2,
 
 [3.6283202]
 <NDArray 1 @cpu(0)>)