# 线性回归的从零开始实现

### 1. 生成数据集

In [1]:
from mxnet import nd
from mxnet import autograd as ag

  from ._conv import register_converters as _register_converters


In [127]:
# Problem size and the ground-truth parameters of the synthetic linear model.
num_inputs = 2
num_examples = 1001
batch_size = 10
true_w = [2, -3.4]
true_b = 4.2

# Features: standard-normal samples of shape (num_examples, num_inputs).
x = nd.random_normal(0, 1, shape=(num_examples, num_inputs))

# Labels: exact linear response first, then small Gaussian noise (scale 0.01).
y = x[:, 0] * true_w[0] + x[:, 1] * true_w[1] + true_b
y += 0.01 * nd.random_normal(shape=y.shape)

### 2. 读取数据

In [128]:
import random
def data_loader(data, label, batch_size):
    """Yield shuffled mini-batches of ``(data, label)``.

    Parameters
    ----------
    data : NDArray
        Feature array; examples are indexed along the first axis.
    label : NDArray
        Label array with one entry per example in ``data``.
    batch_size : int
        Maximum number of examples per batch.

    Yields
    ------
    tuple
        ``(data.take(j), label.take(j))`` for each consecutive chunk ``j`` of
        the shuffled indices. The final batch is shorter when ``len(data)`` is
        not a multiple of ``batch_size``.
    """
    n = len(data)
    # Shuffle a plain Python list rather than an NDArray: random.shuffle swaps
    # elements via x[i], x[j] = x[j], x[i], and NDArray element indexing
    # returns views, so shuffling an NDArray in place can duplicate indices
    # instead of producing a permutation.
    index = list(range(n))
    random.shuffle(index)
    for i in range(0, n, batch_size):
        # Bound the slice by the size of `data` itself, not a notebook global.
        j = nd.array(index[i: min(i + batch_size, n)])
        yield data.take(j), label.take(j)

### 3. 初始化模型参数

In [129]:
# Weights start as small random values; the bias starts at zero.
w = nd.random_normal(shape=(num_inputs, 1)) * 0.01
b = nd.zeros(1)
params = [w, b]

# Allocate gradient buffers so autograd can write into `param.grad`.
for param in params:
    param.attach_grad()

### 4.定义模型

In [130]:
def linreg(x):
    """Linear model: return ``x . w + b`` using the notebook-level ``w`` and ``b``."""
    weighted = nd.dot(x, w)
    return weighted + b

### 5.损失函数

In [131]:
def L2_loss(y_hat, y):
    """Element-wise half squared error between predictions and targets.

    ``y`` is reshaped to ``y_hat.shape`` before subtracting so that the
    difference is taken element by element rather than being broadcast.
    """
    residual = y_hat - y.reshape(y_hat.shape)
    return 0.5 * residual ** 2

### 6.定义优化算法

In [132]:
def sgd(params, lr, batch_size):
    """Apply one mini-batch SGD step to every parameter, in place.

    Each parameter is overwritten with ``param - lr * param.grad / batch_size``.
    The slice assignment writes into the existing array instead of rebinding
    the name, so callers holding a reference see the updated values.
    """
    for param in params:
        step = lr * param.grad / batch_size
        param[:] = param - step

###  7. 训练模型

In [133]:
# Training hyper-parameters for the from-scratch model.
num_epochs = 5
lr = 0.03
batch_size = 10
for epoch in range(num_epochs):
    total_loss = 0.
    for data, label in data_loader(x, y, batch_size):
        # The import cell binds mxnet's autograd module to the alias `ag`;
        # the bare name `autograd` is never defined and would raise NameError
        # on a fresh kernel.
        with ag.record():
            output = linreg(data)
            loss = L2_loss(output, label)
        # Populate `param.grad` for every parameter, then take one SGD step.
        loss.backward()
        sgd(params, lr, batch_size)
        # Accumulate the summed per-example loss to report an epoch average.
        total_loss += loss.sum().asscalar()
    print("Epoch %d, Loss: %f" %(epoch, total_loss/num_examples))

Epoch 0, Loss: 2.802664
Epoch 1, Loss: 0.016831
Epoch 2, Loss: 0.000115
Epoch 3, Loss: 0.000046
Epoch 4, Loss: 0.000049


In [134]:
# Display the learned parameters [w, b] for comparison with true_w / true_b.
params

[
 [[ 1.9995065]
  [-3.3994174]]
 <NDArray 2x1 @cpu(0)>, 
 [4.1994467]
 <NDArray 1 @cpu(0)>]

In [135]:
# Ground-truth weights used to generate the dataset.
true_w

[2, -3.4]

In [136]:
# Ground-truth bias used to generate the dataset.
true_b

4.2

## 小结

* 可以看出，仅使用 NDArray 和`autograd`就可以很容易地实现一个模型。在接下来的章节中，我们会在此基础上描述更多深度学习模型，并介绍怎样使用更简洁的代码（例如下一节）来实现它们。


## 练习

* 为什么`squared_loss`函数（即本文中的`L2_loss`）中需要使用`reshape`函数?
  - 在 NDArray 中，形状（10,1）和（10,）的含义不同：前者是二维的列向量，后者是一维数组。如果不用 reshape，两者相减时会触发广播，得到形状为（10,10）的矩阵，这不是我们想要的。
* 尝试使用不同的学习率，观察损失函数值的下降快慢。
* 如果样本个数不能被批量大小整除，`data_iter`函数（即本文中的`data_loader`）的行为会有什么变化？
  - `data_loader` 中取数据时用 `min(i + batch_size, ...)` 限制了切片的右端点，所以当样本个数不能被批量大小整除时，最后一个批量只包含剩下的样本，而不是完整的 batch_size 个。


## 扫码直达[讨论区](https://discuss.gluon.ai/t/topic/743)

![](../img/qr_linear-regression-scratch.svg)

# Gluon 版本

In [137]:
from mxnet import gluon

使用和之前一样的数据集

### 读取数据

In [145]:
# Pair features with labels, then let Gluon handle shuffling and batching.
dataset = gluon.data.ArrayDataset(x, y)
data_iter = gluon.data.DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=True,
)

### 定义模型

In [146]:
# A Sequential container holding a single Dense layer with one output unit.
net = gluon.nn.Sequential()
net.add(gluon.nn.Dense(1))
# Initialize the parameters with the default initializer.
net.initialize()

### 损失函数

In [147]:
# Gluon's built-in L2 loss, used in place of the hand-written L2_loss.
L2Loss = gluon.loss.L2Loss()

### 定义优化方法

In [148]:
# SGD trainer over all parameters of `net`. The learning rate is hard-coded
# to 0.05 here; the `lr` variable assigned in the training cell below is not
# passed to this trainer.
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate':0.05})

### 训练模型

In [150]:
num_epochs = 5
# NOTE: `lr` is not consumed in this cell; the trainer was constructed with
# its own learning rate.
lr = 0.03
batch_size = 10
for epoch in range(num_epochs):
    total_loss = 0.
    for data, label in data_iter:
        # The import cell binds mxnet's autograd module to the alias `ag`;
        # the bare name `autograd` is never defined and would raise NameError
        # on a fresh kernel.
        with ag.record():
            output = net(data)
            loss = L2Loss(output, label)
        loss.backward()
        # step() normalizes the gradient by 1/batch_size before updating.
        trainer.step(batch_size)
        total_loss += loss.sum().asscalar()
    print("Epoch %d, Loss: %f" %(epoch, total_loss/num_examples))

Epoch 0, Loss: 1.719365
Epoch 1, Loss: 0.000113
Epoch 2, Loss: 0.000046
Epoch 3, Loss: 0.000046
Epoch 4, Loss: 0.000046


In [156]:
# Re-run the whole Gluon pipeline with a larger learning rate and more epochs.
lr = 0.5
num_epochs = 10
batch_size = 10

dataset = gluon.data.ArrayDataset(x, y)
data_iter = gluon.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Fresh network, so training starts from newly initialized parameters.
net = gluon.nn.Sequential()
net.add(gluon.nn.Dense(1))
net.initialize()

L2Loss = gluon.loss.L2Loss()
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate':lr})

for epoch in range(num_epochs):
    total_loss = 0.
    for data, label in data_iter:
        # The import cell binds mxnet's autograd module to the alias `ag`;
        # the bare name `autograd` is never defined and would raise NameError
        # on a fresh kernel.
        with ag.record():
            output = net(data)
            loss = L2Loss(output, label)
        loss.backward()
        # step() normalizes the gradient by 1/batch_size before updating.
        trainer.step(batch_size)
        total_loss += loss.sum().asscalar()
    print("Epoch %d, Loss: %f" %(epoch, total_loss/num_examples))

Epoch 0, Loss: 0.274439
Epoch 1, Loss: 0.000053
Epoch 2, Loss: 0.000052
Epoch 3, Loss: 0.000051
Epoch 4, Loss: 0.000053
Epoch 5, Loss: 0.000052
Epoch 6, Loss: 0.000051
Epoch 7, Loss: 0.000051
Epoch 8, Loss: 0.000051
Epoch 9, Loss: 0.000051


In [157]:
# Print the docstring of Trainer.step to see how `batch_size` is used.
help(trainer.step)

Help on method step in module mxnet.gluon.trainer:

step(batch_size, ignore_stale_grad=False) method of mxnet.gluon.trainer.Trainer instance
    Makes one step of parameter update. Should be called after
    `autograd.backward()` and outside of `record()` scope.
    
    For normal parameter updates, `step()` should be used, which internally calls
    `allreduce_grads()` and then `update()`. However, if you need to get the reduced
    gradients to perform certain transformation, such as in gradient clipping, then
    you may want to manually call `allreduce_grads()` and `update()` separately.
    
    Parameters
    ----------
    batch_size : int
        Batch size of data processed. Gradient will be normalized by `1/batch_size`.
        Set this to 1 if you normalized loss manually with `loss = mean(loss)`.
    ignore_stale_grad : bool, optional, default=False
        If true, ignores Parameters with stale gradient (gradient that has not
        been updated by `backward` after last 