# 初始化模型参数 
我们仍使用MLP 这个例子来详细解释如何初始化模型参数 。 

In [1]:
from mxnet.gluon import nn 
from mxnet import nd

def get_net():
    """Build a fresh, uninitialized two-layer MLP.

    The network is a ``nn.Sequential`` holding a 4-unit Dense layer with
    ReLU activation followed by a 2-unit Dense layer. No input size is
    given, and ``initialize`` is not called here.

    Returns:
        The newly constructed ``nn.Sequential`` block.
    """
    model = nn.Sequential()
    # Layers are created inside the name scope so that their parameter
    # names carry the container's prefix.
    with model.name_scope():
        hidden = nn.Dense(4, activation='relu')
        output = nn.Dense(2)
        for layer in (hidden, output):
            model.add(layer)
    return model

x=nd.random.uniform(shape=(3,5))

In [2]:
x 


[[0.5488135  0.5928446  0.71518934 0.84426576 0.60276335]
 [0.8579456  0.5448832  0.8472517  0.4236548  0.6235637 ]
 [0.6458941  0.3843817  0.4375872  0.2975346  0.891773  ]]
<NDArray 3x5 @cpu(0)>

In [3]:
import sys 
# Run a forward pass on a network whose parameters were never initialized.
# The RuntimeError raised by the call is caught and its message is written
# to stderr rather than aborting the notebook cell.
try:
    net=get_net()
    net(x)
except RuntimeError as err:
    sys.stderr.write(str(err))

Parameter 'sequential0_dense0_weight' has not been initialized. Note that you should initialize parameters and create Trainer with Block.collect_params() instead of Block.params because the later does not include Parameters of nested child Blocks

In [4]:
net

Sequential(
  (0): Dense(None -> 4, Activation(relu))
  (1): Dense(None -> 2, linear)
)

In [5]:
net.initialize()
net(x)


[[0.00212593 0.00365805]
 [0.00161272 0.00441845]
 [0.00204872 0.00352518]]
<NDArray 3x2 @cpu(0)>

# 访问模型参数
之前我们提到可以通过 weight 和 bias 访问 Dense 层的参数，它们都是 Parameter 类的实例。

In [6]:
w=net[0].weight
b=net[0].bias 
print('name:',net[0].name,'\n weight:',w,'\n bias:',b)

name: sequential0_dense0 
 weight: Parameter sequential0_dense0_weight (shape=(4, 5), dtype=float32) 
 bias: Parameter sequential0_dense0_bias (shape=(4,), dtype=float32)


In [7]:
print('weight:',w.data())

weight: 
[[-0.06206018  0.06491279 -0.03182812 -0.01631819 -0.00312688]
 [ 0.0408415   0.04370362  0.00404529 -0.0028032   0.00952624]
 [-0.01501013  0.05958354  0.04705103 -0.06005495 -0.02276454]
 [-0.0578019   0.02074406 -0.06716943 -0.01844618  0.04656678]]
<NDArray 4x5 @cpu(0)>


In [8]:
print('weight gradient:',w.grad())

weight gradient: 
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
<NDArray 4x5 @cpu(0)>


In [9]:
print('bias:',b.data())


bias: 
[0. 0. 0. 0.]
<NDArray 4 @cpu(0)>


In [10]:
print('bias gradient',b.grad())

bias gradient 
[0. 0. 0. 0.]
<NDArray 4 @cpu(0)>


我们也可以通过 collect_params 来访问 Block 里面所有的参数（这个会包括所有的子 Block）。
它会返回一个从名字到对应 Parameter 的 dict。既可以用正常的 [] 来访问参数，也可以用 get()，
后者不需要填写名字的前缀。

In [11]:
# Gather every parameter of the network into one name -> Parameter mapping.
params=net.collect_params()
print(params)
# Indexing with [] needs the full parameter name, including the
# 'sequential0_' prefix.
print(params['sequential0_dense0_bias'].data())
# get() is called with the name without that prefix.
print(params.get('dense0_weight').data())


sequential0_ (
  Parameter sequential0_dense0_weight (shape=(4, 5), dtype=float32)
  Parameter sequential0_dense0_bias (shape=(4,), dtype=float32)
  Parameter sequential0_dense1_weight (shape=(2, 4), dtype=float32)
  Parameter sequential0_dense1_bias (shape=(2,), dtype=float32)
)

[0. 0. 0. 0.]
<NDArray 4 @cpu(0)>

[[-0.06206018  0.06491279 -0.03182812 -0.01631819 -0.00312688]
 [ 0.0408415   0.04370362  0.00404529 -0.0028032   0.00952624]
 [-0.01501013  0.05958354  0.04705103 -0.06005495 -0.02276454]
 [-0.0578019   0.02074406 -0.06716943 -0.01844618  0.04656678]]
<NDArray 4x5 @cpu(0)>


# 使用不同的初始函数来初始化 
我们一直在使用默认的 initialize 来初始化权重（除了指定 GPU ctx 外）。它会把所有的权重初始化成 [-0.07, 0.07] 之间均匀分布的随机数。我们也可以使用别的初始化方法，例如使用均值为 0、标准差为 0.02 的正态分布。

In [12]:
from mxnet import init 
# Re-initialize the already-initialized parameters with a zero-mean normal
# distribution; sigma is the standard deviation (0.02), not the variance.
# force_reinit=True is passed because these parameters were initialized
# earlier in the notebook.
params.initialize(init=init.Normal(sigma=0.02),force_reinit=True)
# The recorded output shows new weight values while the bias is still zeros.
print(net[0].weight.data(),net[0].bias.data())


[[-0.00359026  0.0302582  -0.01496244  0.01725933 -0.02177767]
 [ 0.01344385  0.00272668 -0.00392631 -0.03435376  0.01124353]
 [-0.00622001  0.00689362  0.02062465  0.00675439  0.01104854]
 [ 0.01147354  0.00579418 -0.04144352 -0.02262641  0.00582818]]
<NDArray 4x5 @cpu(0)> 
[0. 0. 0. 0.]
<NDArray 4 @cpu(0)>


再次打印网络，可以看到每一层的输入大小已经被推断出来了：

In [13]:
net

Sequential(
  (0): Dense(5 -> 4, Activation(relu))
  (1): Dense(4 -> 2, linear)
)

看得更加清楚点： 

In [14]:
params.initialize(init=init.One(),force_reinit=True)
print(net[0].weight.data(),net[0].bias.data())


[[1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1.]]
<NDArray 4x5 @cpu(0)> 
[0. 0. 0. 0.]
<NDArray 4 @cpu(0)>


更多的方法参见init的 API。下面我们自定义一个初始化方法。

In [15]:
class MyInit(init.Initializer):
    """Custom initializer that announces each call and fills arrays itself.

    Weight arrays are filled with uniform random values between 5 and 10;
    the bias hook sets every entry to 2.

    NOTE(review): in the outputs recorded in this notebook only
    'init weight' lines are printed and the Dense biases stay at 0, so
    ``_init_bias`` does not appear to be invoked for these layers.
    Presumably the layer's own bias initializer takes precedence; confirm
    against the MXNet documentation.
    """

    def __init__(self):
        super().__init__()
        # Private flag on the base Initializer; presumably enables verbose
        # reporting -- TODO confirm against the MXNet source.
        self._verbose = True

    def _init_weight(self, _, arr):
        """Log the shape of ``arr`` and fill it in place from U(5, 10).

        The first positional argument is unused.
        """
        print('init weight', arr.shape)
        nd.random.uniform(low=5, high=10, out=arr)

    def _init_bias(self, _, arr):
        """Log the shape of ``arr`` and set every element to 2 in place.

        The first positional argument is unused.
        """
        print('init bias', arr.shape)
        arr[:] = 2

params.initialize(init=MyInit(),force_reinit=True) 
print('*'*10)

init weight (4, 5)
init weight (2, 4)
**********


In [16]:
print(net[0].weight.data(),net[0].bias.data())


[[8.1713705 9.94187   9.794746  5.510224  8.263951 ]
 [6.044384  8.175295  5.8065476 9.976498  8.265541 ]
 [7.9092517 6.266458  7.071843  7.331554  7.3734875]
 [6.222128  8.117551  5.794848  6.690038  5.5518756]]
<NDArray 4x5 @cpu(0)> 
[0. 0. 0. 0.]
<NDArray 4 @cpu(0)>


In [17]:
net=get_net()
print(net.collect_params())

sequential1_ (
  Parameter sequential1_dense0_weight (shape=(4, 0), dtype=float32)
  Parameter sequential1_dense0_bias (shape=(4,), dtype=float32)
  Parameter sequential1_dense1_weight (shape=(2, 0), dtype=float32)
  Parameter sequential1_dense1_bias (shape=(2,), dtype=float32)
)


可以看到，新建网络的权重形状里都含有 0：在还没有见到输入数据之前，无法确定输入的大小。然后我们初始化：

In [18]:
net.initialize(init=MyInit())

你会发现这里没有任何 MyInit 的打印输出，这是因为此时参数的形状仍然未知。
真正的初始化发生在网络第一次看到数据的时候。


In [19]:
net(x)

init weight (4, 5)
init weight (2, 4)



[[734.3467 660.6511]
 [736.1606 663.3772]
 [590.0534 533.7984]]
<NDArray 3x2 @cpu(0)>

这个时候我们看到shape 里面的0被填上正确的值了。 

In [20]:
print(net.collect_params())

sequential1_ (
  Parameter sequential1_dense0_weight (shape=(4, 5), dtype=float32)
  Parameter sequential1_dense0_bias (shape=(4,), dtype=float32)
  Parameter sequential1_dense1_weight (shape=(2, 4), dtype=float32)
  Parameter sequential1_dense1_bias (shape=(2,), dtype=float32)
)


# 避免延后初始化 
有时候我们不想要延后初始化，这时候可以在创建网络的时候指定输入大小

In [21]:
# Build the same MLP but pass in_units for every layer, so each weight
# shape is fully specified at construction time.
net=nn.Sequential()
with net.name_scope():
    net.add(nn.Dense(4,in_units=5,activation='relu'))
    net.add(nn.Dense(2,in_units=4))
# The recorded output shows MyInit's 'init weight' messages right after
# this call, with no forward pass needed.
net.initialize(MyInit())

init weight (4, 5)
init weight (2, 4)


# 共享模型参数
有时候我们想在层之间共享同一份参数，这时可以把某个 Block 的 params 属性传给另一个层的 params 参数来手动指定参数，而不是让系统自动生成。

In [24]:
# Build a network in which two Dense layers use the same parameters.
net=nn.Sequential()
with net.name_scope():
    net.add(nn.Dense(4,in_units=4,activation='relu'))
    net.add(nn.Dense(4,in_units=4,activation='relu'))
    # net[-1] is the layer added on the previous line (index 1), so this
    # third layer (index 2) is constructed with that layer's params.
    net.add(nn.Dense(4,in_units=4,activation='relu',params=net[-1].params))
    # No in_units is given for the final layer.
    net.add(nn.Dense(2))

# 初始化然后打印 

In [23]:
# The recorded output shows only two 'init weight (4, 4)' messages for the
# three 4x4 Dense layers.
net.initialize(MyInit())
# NOTE(review): the layers built with shared params are net[1] and net[2].
# The two weights printed here belong to net[0] and net[1], which are
# independent, so the recorded values differ. Printing net[1] and net[2]
# would presumably show identical weights -- confirm by re-running.
print(net[0].weight.data())
print(net[1].weight.data())

init weight (4, 4)
init weight (4, 4)

[[9.20693   6.4470305 6.323651  5.915957 ]
 [6.989104  7.9325647 7.764107  5.100538 ]
 [5.8247023 9.1447    6.8490405 5.0234776]
 [5.7322087 8.389082  7.848092  6.35004  ]]
<NDArray 4x4 @cpu(0)>

[[8.518686  8.67597   6.4423823 9.810943 ]
 [7.16644   6.243766  8.780533  7.880787 ]
 [6.9804916 7.96021   9.480192  7.8612595]
 [8.194605  6.115408  9.457772  9.763745 ]]
<NDArray 4x4 @cpu(0)>
