In [3]:
import tensorflow as tf
from tensorflow import keras
from keras import layers, optimizers, Sequential, metrics, datasets

### 模型的度量

ground-truth: 人们已经大概知道某数据应该符合什么分布

## 1.model capacity
模型（多项式）的次数越高，表达能力越强

模型的层数越多，表达能力越强

## underfit
训练和验证的精确度都很低，不需要特殊检测

## overfit
使用交叉验证检测。交叉验证的原理是不断地切换从训练集中划分验证集的位置，切换的时机可以是每个 epoch 等。

In [25]:
# 交叉验证
def preprocess(x, y):
    """Convert one raw MNIST sample into model-ready tensors.

    Operates on a single image/label pair, not on a batch.

    Args:
        x: one image; the reshape below requires exactly 28*28 elements.
        y: the integer class label of that image.

    Returns:
        A tuple ``(x, y)``: the image cast to float32, divided by 255 and
        flattened to shape [784], and the label one-hot encoded with depth 10.
    """
    pixels = tf.reshape(tf.cast(x, dtype=tf.float32) / 255., [28*28])
    label = tf.one_hot(tf.cast(y, dtype=tf.int32), depth=10)
    return pixels, label


# Mini-batch size used by the dataset pipelines below.
batchsz = 128
# load_data() returns two (images, labels) pairs; the first is bound to x/y,
# the second to x_test/y_test.
mnist_train, mnist_test = datasets.mnist.load_data()
x, y = mnist_train
x_test, y_test = mnist_test
print('datasets:', x.shape, y.shape, x.min(), x.max())


datasets: (60000, 28, 28) (60000,) 0 255


In [7]:
# Test pipeline: per-sample preprocessing followed by batching (no shuffling).
db_test = (
    tf.data.Dataset
    .from_tensor_slices((x_test, y_test))
    .map(preprocess)
    .batch(batchsz)
)

In [8]:
# Fully connected classifier: 784 -> 256 -> 128 -> 64 -> 32 -> 10.
hidden_units = (256, 128, 64, 32)
network = Sequential(
    [layers.Dense(units, activation='relu') for units in hidden_units]
    + [layers.Dense(10)]  # last layer has no activation, so it outputs logits
)
network.build(input_shape=(None, 28*28))
network.summary()

network.compile(
    optimizer=optimizers.Adam(learning_rate=0.01),
    # from_logits=True because the final Dense layer applies no softmax.
    loss=tf.losses.CategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_10 (Dense)            (None, 256)               200960    
                                                                 
 dense_11 (Dense)            (None, 128)               32896     
                                                                 
 dense_12 (Dense)            (None, 64)                8256      
                                                                 
 dense_13 (Dense)            (None, 32)                2080      
                                                                 
 dense_14 (Dense)            (None, 10)                330       
                                                                 
Total params: 244,522
Trainable params: 244,522
Non-trainable params: 0
_________________________________________________________________


In [10]:
# Re-draw the train/validation split at the start of every epoch.
# NOTE(review): because the split changes each epoch, samples validated on in one
# epoch may be trained on in another; the held-out test set below is the only
# data the network never sees during training.
num_samples = x.shape[0]  # derived from the data instead of hard-coding 60000
val_size = 10000          # samples held out for validation in each epoch

for epoch in range(5):
    # Shuffle the sample indices so each epoch gets a different split.
    idx = tf.random.shuffle(tf.range(num_samples))
    train_idx, val_idx = idx[:-val_size], idx[-val_size:]

    x_train, y_train = tf.gather(x, train_idx), tf.gather(y, train_idx)
    x_val, y_val = tf.gather(x, val_idx), tf.gather(y, val_idx)
    print(x_train.shape, y_train.shape, x_val.shape, y_val.shape)

    db_train = tf.data.Dataset.from_tensor_slices((x_train, y_train))
    db_train = db_train.map(preprocess).shuffle(x_train.shape[0]).batch(batchsz)

    # The validation pipeline is not shuffled: it is only used for evaluation.
    db_val = tf.data.Dataset.from_tensor_slices((x_val, y_val))
    db_val = db_val.map(preprocess).batch(batchsz)

    # One fit() call trains for a single epoch and validates once at its end.
    network.fit(db_train, validation_data=db_val, validation_freq=1)

print('Test performance:') 
network.evaluate(db_test)

(50000, 28, 28) (50000,) (10000, 28, 28) (10000,)
(50000, 28, 28) (50000,) (10000, 28, 28) (10000,)
(50000, 28, 28) (50000,) (10000, 28, 28) (10000,)
(50000, 28, 28) (50000,) (10000, 28, 28) (10000,)
(50000, 28, 28) (50000,) (10000, 28, 28) (10000,)
Test performance:


[0.11246237903833389, 0.9764999747276306]

In [30]:
# Let Keras hold out the validation set through `validation_split`.
# NOTE: per the Keras fit() documentation the validation samples are taken from
# the END of the arrays passed to fit(), before shuffling -- the split itself is
# not random.
x_train_val = tf.convert_to_tensor(x)
x_train_val = tf.cast(x_train_val, dtype=tf.float32) / 255
x_train_val = tf.reshape(x_train_val, (60000, 28 * 28))
y_train_val = tf.cast(y, dtype=tf.int32)
y_train_val = tf.one_hot(y_train_val, depth=10)
print("x shape:", x_train_val.shape, "y shape:", y_train_val.shape)
# db_train_val = tf.data.Dataset.from_tensor_slices((x,y))
# db_train_val = db_train_val.map(preprocess).shuffle(60000).batch(batchsz)
# validation_split does not support tf.data.Dataset input; only tensors or NumPy
# arrays can be used, which is why the dataset pipeline above is commented out.
# validation_freq=2 runs validation every second epoch.
network.fit(x_train_val, y_train_val, epochs=6, batch_size=batchsz, validation_split=0.15, validation_freq=2)

x shape: (60000, 784) y shape: (60000, 10)
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x233b6233130>

### 解决overfit
1. 使用更多的数据
2. 约束模型复杂度
   1. 根据数据集，选择合适的网络结构
   2. 可以先从大（深）的网络开始，不停的降低复杂度（regularization)
3. 动量与学习率：动量是上一次更新方向与本次梯度的加权矢量和
   
   ![Momentum](./images/Momentum.png)
   
4. 删除一些参数(Dropout), 不再全连接，层与层之间稀疏连接
5. Early Stopping, 提早结束训练

#### Regularization
核心原理是在损失函数中加入参数范数的惩罚项，使参数（尤其是高次项的系数）趋近于 0，降低模型复杂度，从而缓解 overfit

![regularization](./images/regularization.png)

缺点：限制模型的表达能力，下图上面三个图是使用了Regularization，下面三个没用。

![regularization_short](./images/regularization_short.png)

In [None]:
# Regularization (Weight decay) 手动实现
# 在计算 loss 时，为模型变量增加 L2 范数惩罚项

def preprocess(x, y):
    """Cast one raw sample to the dtypes used by the manual training loop.

    This redefinition replaces the earlier ``preprocess`` in this notebook: here
    the image is NOT flattened and the label is NOT one-hot encoded; both steps
    are done inside the training loop instead.

    Args:
        x: raw image values.
        y: raw label values.

    Returns:
        A tuple ``(x, y)``: ``x`` cast to float32 and divided by 255, ``y`` cast
        to int32.
    """
    return tf.cast(x, dtype=tf.float32) / 255., tf.cast(y, dtype=tf.int32)


batchsz = 128
# The second pair returned by load_data() serves as the validation set in this cell.
mnist_train, mnist_val = datasets.mnist.load_data()
x, y = mnist_train
x_val, y_val = mnist_val
print('datasets:', x.shape, y.shape, x.min(), x.max())

# Training stream: preprocess, shuffle with a 60000-element buffer, batch, and
# repeat the whole thing 10 times so a single iteration covers 10 passes.
db = (
    tf.data.Dataset
    .from_tensor_slices((x, y))
    .map(preprocess)
    .shuffle(60000)
    .batch(batchsz)
    .repeat(10)
)

# Validation stream: preprocess and batch only.
ds_val = (
    tf.data.Dataset
    .from_tensor_slices((x_val, y_val))
    .map(preprocess)
    .batch(batchsz)
)


# Fresh fully connected classifier for the manual training loop:
# 784 -> 256 -> 128 -> 64 -> 32 -> 10 (the last layer outputs logits).
network = Sequential([layers.Dense(256, activation='relu'),
                     layers.Dense(128, activation='relu'),
                     layers.Dense(64, activation='relu'),
                     layers.Dense(32, activation='relu'),
                     layers.Dense(10)])
network.build(input_shape=(None, 28*28))
network.summary()

# `learning_rate` is the supported keyword; the old `lr` alias is deprecated and
# rejected by current Keras optimizers. This also matches the other cells.
optimizer = optimizers.Adam(learning_rate=0.01)

# Manual training loop with a hand-written L2 penalty added to the loss.
for step, (x, y) in enumerate(db):

    with tf.GradientTape() as tape:
        # [b, 28, 28] => [b, 784]
        x = tf.reshape(x, (-1, 28*28))
        # [b, 784] => [b, 10]
        out = network(x)
        # [b] => [b, 10]
        y_onehot = tf.one_hot(y, depth=10)
        # per-sample loss [b], averaged to a scalar
        loss = tf.reduce_mean(tf.losses.categorical_crossentropy(y_onehot, out, from_logits=True))

        # L2 penalty over every trainable variable of the network.
        # The per-variable terms are stacked into one tensor so they can be summed.
        loss_regularization = tf.reduce_sum(
            tf.stack([tf.nn.l2_loss(p) for p in network.trainable_variables])
        )

        loss = loss + 0.0001 * loss_regularization

    grads = tape.gradient(loss, network.trainable_variables)
    optimizer.apply_gradients(zip(grads, network.trainable_variables))

    if step % 100 == 0:
        print(step, 'loss:', float(loss), 'loss_regularization:', float(loss_regularization))

    # evaluate
    if step % 500 == 0:
        total, total_correct = 0, 0

        # Dedicated loop variables: reusing `step` here would overwrite the
        # training step, and the report below would print the wrong number.
        for x_batch, y_batch in ds_val:
            # [b, 28, 28] => [b, 784]
            x_batch = tf.reshape(x_batch, (-1, 28*28))
            # [b, 784] => [b, 10]
            out = network(x_batch)
            # [b, 10] => [b]
            pred = tf.cast(tf.argmax(out, axis=1), dtype=tf.int32)
            # bool tensor marking the correct predictions
            correct = tf.equal(pred, y_batch)
            # bool tensor => int tensor => numpy
            total_correct += tf.reduce_sum(tf.cast(correct, dtype=tf.int32)).numpy()
            total += x_batch.shape[0]

        print(step, 'Evaluate Acc:', total_correct/total)

In [None]:
# 使用框架
# Framework version: each hidden layer gets its own L2 kernel regularizer
# (factor 0.001); the output layer is left unregularized.
network = Sequential(
    [
        layers.Dense(units, activation='relu',
                     kernel_regularizer=keras.regularizers.l2(0.001))
        for units in (256, 128, 64, 32)
    ]
    + [layers.Dense(10)]
)

In [None]:
# Momentum
# 一般是设在optimizer中
# 动量比率一般设为0.9
# Both optimizers accept the same learning-rate / momentum settings.
# The second assignment replaces the first; the two lines are alternatives.
momentum_settings = dict(learning_rate=0.02, momentum=0.9)
optimizer = tf.optimizers.SGD(**momentum_settings)
optimizer = tf.optimizers.RMSprop(**momentum_settings)

In [None]:
# dynamic learning rate
# 这只是一个示例  可以任意调整
initial_lr = 0.2
total_epochs = 100
optimizer = tf.optimizers.SGD(learning_rate=initial_lr)
for epoch in range(total_epochs):
    # Linear decay: the rate shrinks from initial_lr by initial_lr/total_epochs per epoch.
    optimizer.learning_rate = initial_lr * (total_epochs - epoch) / total_epochs

In [None]:
# Early Stopping

In [None]:
# Dropout  可以和regularization同用
# Classifier with Dropout after the first two hidden layers.
# Dropout behaves differently in training and inference, so calls to the network
# must pass the `training` flag explicitly (see the loop below).
dropout_rate = 0.5  # fraction of units dropped

network = Sequential()
network.add(layers.Dense(256, activation='relu'))
network.add(layers.Dropout(dropout_rate))
network.add(layers.Dense(128, activation='relu'))
network.add(layers.Dropout(dropout_rate))
network.add(layers.Dense(64, activation='relu'))
network.add(layers.Dense(32, activation='relu'))
network.add(layers.Dense(10))


optimizer = optimizers.Adam(learning_rate=0.01)



def _evaluate_accuracy(model, dataset, training):
    """Return the classification accuracy of `model` over `dataset`.

    Args:
        model: callable network that accepts ``training=`` and returns [b, 10] logits.
        dataset: iterable of ``(images, labels)`` batches with integer labels.
        training: forwarded to the model; True keeps Dropout active, False disables it.

    Returns:
        Number of correct predictions divided by the number of samples seen.
    """
    total, total_correct = 0, 0
    for x_batch, y_batch in dataset:
        # [b, 28, 28] => [b, 784]
        x_batch = tf.reshape(x_batch, (-1, 28*28))
        # [b, 784] => [b, 10]
        out = model(x_batch, training=training)
        # [b, 10] => [b]
        pred = tf.cast(tf.argmax(out, axis=1), dtype=tf.int32)
        # bool tensor marking the correct predictions
        correct = tf.equal(pred, y_batch)
        # bool tensor => int tensor => numpy
        total_correct += tf.reduce_sum(tf.cast(correct, dtype=tf.int32)).numpy()
        total += x_batch.shape[0]
    return total_correct / total


# Manual training loop for the Dropout network.
# Relies on `db` and `ds_val` created in the manual-regularization cell above.
for step, (x, y) in enumerate(db):

    with tf.GradientTape() as tape:
        # [b, 28, 28] => [b, 784]
        x = tf.reshape(x, (-1, 28*28))
        # [b, 784] => [b, 10]
        # training=True: this is the training pass, Dropout is active
        out = network(x, training=True)
        # [b] => [b, 10]
        y_onehot = tf.one_hot(y, depth=10)
        # per-sample loss [b], averaged to a scalar
        loss = tf.reduce_mean(tf.losses.categorical_crossentropy(y_onehot, out, from_logits=True))

        # L2 penalty over every trainable variable of the network.
        loss_regularization = tf.reduce_sum(
            tf.stack([tf.nn.l2_loss(p) for p in network.trainable_variables])
        )

        loss = loss + 0.0001 * loss_regularization

    grads = tape.gradient(loss, network.trainable_variables)
    optimizer.apply_gradients(zip(grads, network.trainable_variables))

    if step % 100 == 0:
        print(step, 'loss:', float(loss), 'loss_regularization:', float(loss_regularization))

    # evaluate
    if step % 500 == 0:
        # Dropout kept active: this must use training=True, otherwise it is the
        # same measurement as the "without drop" evaluation below.
        acc_with_drop = _evaluate_accuracy(network, ds_val, training=True)
        print(step, 'Evaluate Acc with drop:', acc_with_drop)

        # Dropout disabled: the normal inference-time behaviour.
        acc_without_drop = _evaluate_accuracy(network, ds_val, training=False)
        print(step, 'Evaluate Acc without drop:', acc_without_drop)