<a href="https://colab.research.google.com/github/minghsu0107/ML/blob/master/my-keras/functional_api.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Intro

To handle tasks of multiple inputs or outputs, we need to build models that are not sequentially stacked.

In [None]:
from keras.models import Sequential, Model
from keras import layers, Input

seq_model = Sequential()
seq_model.add(layers.Dense(32, activation='relu', input_shape=(64,)))
seq_model.add(layers.Dense(32, activation='relu'))					   #1...
seq_model.add(layers.Dense(10, activation='softmax'))

input_tensor = Input(shape=(64,))   #← 建立一個初始張量

# 將初始張量傳入 Dense 層得到輸出張量 x
x = layers.Dense(32, activation='relu')(input_tensor)
 
# 再將第一層的結果 x 傳入第 2 個 Dense 層得到輸出張量 y                2...
y = layers.Dense(32, activation='relu')(x) 

# 再將第二層的結果 y 傳入最後一個 Dense 層得到最後的輸出張量 output_tensor
output_tensor = layers.Dense(10, activation='softmax')(y) 

# Model 類別 "用" 初始的輸入張量和最後的輸出張量來得到模型物件
model = Model(input_tensor, output_tensor)
model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 64)]              0         
_________________________________________________________________
dense_3 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_4 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense_5 (Dense)              (None, 10)                330       
Total params: 3,466
Trainable params: 3,466
Non-trainable params: 0
_________________________________________________________________


# 以函數式 API 實作雙輸入問答模型

In [None]:
from keras import Model
from keras import layers
from keras import Input

text_vocabulary_size = 10000
question_vocabulary_size = 10000
answer_vocabulary_size = 500 # 對於某些事件先定義好詞彙
						 #↓1...                   #↓2...
text_input = Input(shape=(None, ), dtype='int32', name='text') 
embedded_text = layers.Embedding(text_vocabulary_size, 64)(text_input) #← 3...
print(embedded_text.shape)  	#→ (?, ?, 64)
encoded_text = layers.LSTM(32)(embedded_text) #← 4...
print(encoded_text.shape)  #	→ (?, 32)

question_input = Input(shape=(None, ), dtype='int32', name='question')
embedded_question = layers.Embedding(question_vocabulary_size, 32)(question_input) #5..
print(embedded_question.shape)  	#→ (?, ?, 32)
encoded_question = layers.LSTM(16)(embedded_question)
print(encoded_question.shape)  	#→ (?, 16)
													#↓6...
concatenated = layers.concatenate([encoded_question, encoded_text], axis=-1) 
print(concatenated.shape)  #→ (?, 48)

answer = layers.Dense(answer_vocabulary_size, activation='softmax')(concatenated) #← 7...
print(answer.shape)  #→ (?, 500) 

model = Model([text_input, question_input], answer) #← 8...
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])
model.summary()

#1. shape = (None, ) 代表不限定張量的 shape 大小, 所以文字輸入可以是可變長度的整數序列。
#2. 請注意, 可以選擇是否為輸入命名
#3. 將輸入送進嵌入層, 編碼成大小 64 的文字嵌入向量 (處理 「參考文字」輸入)。
#4. 再透過 LSTM 層將向量序列編碼成單一個向量
#5. 處理「問題」輸入的流程 (與處理「參考文字」輸入的流程相同)
#6. 串接編碼後的「問題」和「參考文字」資料 (向量), 將兩份資料合而為一。axis 參數為 -1 代表以輸入的最後一個軸進行串接。
#7. 最後增加一個 Dense層 (softmax分類器), 將串接向量送入, 輸出模型的結果張量 answer。
#8. 在模型實例化時, 因為有兩個輸入, 所以將它們組成一個 list 一起做為輸入, 而輸出為 answer。

(None, None, 64)
(None, 32)
(None, None, 32)
(None, 16)
(None, 48)
(None, 500)
Model: "functional_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
question (InputLayer)           [(None, None)]       0                                            
__________________________________________________________________________________________________
text (InputLayer)               [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 32)     320000      question[0][0]                   
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 64)     640000      text[0][0]                       
________

In [None]:
import numpy as np

num_samples = 1000
max_length = 100

# 產生 text 資料：1000 筆, 每筆 100 個字 (數字)
text = np.random.randint(1, text_vocabulary_size, 
                         size=(num_samples, max_length))
#  [  [2, 15, 8000,..... 共 100 個], [],....共 1000 筆  ]  
#      ↑   ↑    ↑         
#     產生 1 ~ 10000 (text_vocabulary_size) 區間的數字 
print(text.shape)       # (1000, 100)

# 產生 question 資料, 與上面 text 產生方式相同
question = np.random.randint(1, question_vocabulary_size, 
                             size=(num_samples, max_length))
print(question.shape)   # (1000, 100)

# 產生 answers 資料, 需為 One-hot 編碼, 共 1000 個正確答案
answers = np.random.randint(0, 1, size=(num_samples, 
                                        answer_vocabulary_size))
#  [  [0, 1, 1,..... 共 100 個], [],.... 共 1000 筆  ]
#      ↑  ↑  ↑         
#     產生 0 ~ 1 的數字 
# 此為分類器要用的 One-encoding 編碼答案    
print(answers.shape)    # (1000, 500)

# 訓練方法 1：使用 list 方式送入資料進行擬合 
#model.fit([text, question], answers, epochs=10, batch_size=128)
# 訓練方法 2：使用 dict 方式送入資料進行擬合, 鍵為 Input 層的名稱, 值為 Numpy 資料
model.fit({'text': text, 'question': question}, answers, epochs=10,  batch_size=128)

(1000, 100)
(1000, 100)
(1000, 500)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f34d334fc88>

# 多輸出模型

In [None]:
from keras import layers, Input
from keras.models import Model

vocabulary_size = 50000 	#← 文章大小
num_income_groups = 10 	#← 將收入分成 10 群
                            
                          # ↓不限定輸入向量的 shape 大小
posts_input = Input(shape=(None,), dtype='int32', name='posts') 

# 用函數式 API 將輸入向量傳入 Embedding 層, 得到維度 256 的嵌入向量
embedding_posts = layers.Embedding(vocabulary_size, 256)(posts_input)
print(embedding_posts.shape)   # ← (?, ?, 256)

# 以下以函數式 API 將嵌入向量傳入一層層之中進行處理
x = layers.Conv1D(128, 5, activation='relu')(embedding_posts)
x = layers.MaxPooling1D(5)(x)
x = layers.Conv1D(256, 5, activation='relu')(x)
x = layers.Conv1D(256, 5, activation='relu')(x)
x = layers.MaxPooling1D(5)(x)
x = layers.Conv1D(256, 5, activation='relu')(x)
x = layers.Conv1D(256, 5, activation='relu')(x)
x = layers.GlobalMaxPooling1D()(x)  
x = layers.Dense(128, activation='relu')(x)
print(x.shape)  #← 走過一連串層之後, x.shape 為 (?, 128)

# 接下來將 x 向量分別送入 3 個輸出層。請注意, 
# 需為輸出層指定名稱(原因請見程式 7.5 中的編譯方法 2)

# 預測年紀的輸出層：純量迴歸任務
age_prediction = layers.Dense(1, name='age')(x)

# 預測收入族群的輸出層多分類任務 (10 類)
income_prediction = layers.Dense(num_income_groups, 
                                 activation='softmax', 
                                 name='income')(x)
# 預測性別的輸出層：二元分類任務
gender_prediction = layers.Dense(1, 
                                 activation='sigmoid', 
                                 name='gender')(x)

# 用輸入向量與輸出向量實例化 Model 物件
model = Model(posts_input, 
              [age_prediction, income_prediction, gender_prediction])
                 #↑ 因為輸出向量有 3 個, 所以用 list 來組成

model.summary()

(None, None, 256)
(None, 128)
Model: "functional_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
posts (InputLayer)              [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 256)    12800000    posts[0][0]                      
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, None, 128)    163968      embedding_2[0][0]                
__________________________________________________________________________________________________
max_pooling1d (MaxPooling1D)    (None, None, 128)    0           conv1d[0][0]                     
_________________________________________________________

若有不平衡的損失會導致模型優先針對最大損失的任務優化，而犧牲其他任務。此時可以使用 loss_weights 為損失值分配不同程度的重要性。尤其當損失值使用不同單位時特別有用。比如 MSE 通常取值在 3 ~ 5，而 cross-entropy 則可低至 0.1，所以我們可以為 cross-entropy 配置 10 的權重，並為 MSE 配置 0.25 權重。

In [None]:
# 編譯方式 1 
model.compile(optimizer='rmsprop', 
              loss=['mse',		#← (需照建立層的順序)
                    'categorical_crossentropy', 
                    'binary_crossentropy'],
              loss_weights=[0.25, 1., 10.])
# 編譯方式 2 
model.compile(optimizer='rmsprop', 
              loss={'age': 'mse',	#← (需為輸出層指定名稱)
                    'income': 'categorical_crossentropy', 
                    'gender': 'binary_crossentropy'},
              loss_weights={'age': 0.25,
                            'income': 1.,
                            'gender': 10.})


In [None]:
# first way
model.fit(posts,
          [age_targets, income_targets, gender_targets],
          epochs=10,
          batch_size=64)
# second way
model.fit(posts,
          {'age', age_targets, 
           'income', income_targets, 
           'gender', gender_targets},
          epochs=10,
          batch_size=64)

NameError: ignored

# Inception

The extracted feature channels do not contain space information if we use 1 x 1 Conv layer in the beginning. In Inception, this helps separate channel features from space features (assuming that every channel is highly related to the space information and channels are not mutually related).

On the other hand, 1 x 1 Conv layer also plays a role in performing dimension reduction. For example, a (28, 28, 192) tensor processed by 5 x 5 conv kernel with a (28, 28, 32) output takes (28 x 28 x 192) x (5 x 5 x 32) = 120422400 operations. However, if we reduce the dimension to (28, 28, 16) by 1 x 1 conv layer first, it only takes (28 x 28 x 192) x (1 x 1 x 16) + (28 x 28 x 16) x (5 x 5 x 32) = 12443648 operations. Now we have reduced the computation time down to 1/10. Therefore, 1 x 1 conv layer is also called the bottleneck layer.

In [None]:
from keras import layers, Input

x = Input(batch_shape=(1000, 28, 28, 256))

branch_a = layers.Conv2D(64, 1, activation='relu', strides=2)(x) # stides=2: height and width become half
print(branch_a.shape) # (1000, 14, 14, 64)

branch_b = layers.Conv2D(128, 1, activation='relu')(x) # by default, strides=1 and no padding
# padding='same': 用zero-padding的手法，讓輸入的圖不會受到kernel map的大小影響
branch_b = layers.Conv2D(128, 3, activation='relu', strides=2, padding='same')(branch_b)
print(branch_b.shape) # (1000, 14, 14, 128)

branch_c = layers.AveragePooling2D(3, strides=2, padding='same')(x)
print(branch_c.shape) # (1000, 14, 14, 256)
branch_c = layers.Conv2D(128, 3, activation='relu', padding='same')(branch_c)
print(branch_c.shape) # (1000, 14, 14, 128)

branch_d = layers.Conv2D(128, 1, activation='relu')(x)
branch_d = layers.Conv2D(128, 3, activation='relu', padding='same')(branch_d)
print(branch_d.shape) # (1000, 28, 28, 128)
branch_d = layers.Conv2D(128, 3, activation='relu', strides=2, padding='same')(branch_d)
print(branch_d.shape) # (1000, 14, 14, 128)

output = layers.concatenate([branch_a, branch_b, branch_c, branch_d], axis=-1) # concatenate by the last dimension
print(output.shape) # (1000, 14, 14, 448)

(1000, 14, 14, 64)
(1000, 14, 14, 128)
(1000, 14, 14, 256)
(1000, 14, 14, 128)
(1000, 28, 28, 128)
(1000, 14, 14, 128)
(1000, 14, 14, 448)


In [None]:
from keras.applications.inception_v3 import InceptionV3

conv_base = InceptionV3(weights='imagenet',
                        include_top=False, # do not include the top dense layer (1000 categories)
                        input_shape=(224, 224, 3))

conv_base.summary()

Model: "inception_v3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_8 (InputLayer)            [(None, 224, 224, 3) 0                                            
__________________________________________________________________________________________________
conv2d_484 (Conv2D)             (None, 111, 111, 32) 864         input_8[0][0]                    
__________________________________________________________________________________________________
batch_normalization_470 (BatchN (None, 111, 111, 32) 96          conv2d_484[0][0]                 
__________________________________________________________________________________________________
activation_470 (Activation)     (None, 111, 111, 32) 0           batch_normalization_470[0][0]    
_______________________________________________________________________________________

# Residual Learning

In [None]:
from keras import layers, Input

x = Input(batch_shape=(1000, 32, 32, 256))

y = layers.Conv2D(128, 3, activation='relu', padding='same')(x) # by default, strides=1 and no padding
z = layers.Conv2D(128, 3, activation='relu', padding='same')(y)
print(z.shape) # (1000, 32, 32, 128)
t = layers.MaxPool2D(pool_size=2, strides=2)(z) # by default, strides=pool_size
print(t.shape) # (1000, 16, 16, 128)

# linear transformation (use strides=2 for sampling)
residual = layers.Conv2D(128, 1, strides=2, padding='same')(x)
print(residual.shape) # (1000, 16, 16, 128)

op = layers.add([t, residual])
print(op.shape) # (1000, 16, 16, 128)

(1000, 32, 32, 128)
(1000, 16, 16, 128)
(1000, 16, 16, 128)
(1000, 16, 16, 128)


# Siamese CNN

Double camera:

In [None]:
from keras import layers
from keras import applications
from keras import Input

# 我們使用 Xception 神經網路的卷積基底 (不包含最上層的分類器) 進行影像的特徵萃取
xception_base = applications.Xception(weights=None, include_top=False)

# 建立左、右輸入張量 (左、右鏡頭影像), 其 shape 為 (250, 250, 3), 即為 250x250 的彩色影像。
left_input = Input(shape=(250, 250, 3))
right_input = Input(shape=(250, 250, 3))

# 呼叫相同的視覺模型兩次, 也就是將影像張量傳入 Xception 神經網路物件。
left_features = xception_base(left_input)
right_features = xception_base(right_input)

# 萃取出的左、右影像特徵張量 shape = (?, 8, 8, 2048)
print(left_features.shape)
print(right_features.shape)

# 串接左右影像特徵張量, shape = (?, 8, 8, 4096)
merged_features = layers.concatenate([left_features, right_features], axis=-1)
print(merged_features.shape)

(None, 8, 8, 2048)
(None, 8, 8, 2048)
(None, 8, 8, 4096)


# Siamese LSTM 

Example: A model for evaluating the similarity between two sentences.

Since the relation between sentence A and sentence B is the same as B and A, we don't need to train two independent LSTM for each sentence.

In [None]:
from keras import layers, Input
from keras.models import Model

lstm = layers.LSTM(32)

left_input = Input(shape=(None, 128))
right_input = Input(shape=(None, 128))

left_output = lstm(left_input) # (?, 32)
right_output = lstm(right_input) # (?, 32)

merged = layers.concatenate([left_output, right_output], axis=-1)
print(merged.shape) # (None, 64)

predictions = layers.Dense(1, activation='sigmoid')(merged)
model = Model([left_input, right_input], predictions)

(None, 64)


# Callbacks

In [None]:
import tensorflow as tf
import keras

def scheduler(epoch, lr):
    if epoch < 10:
        return lr
    else:
        return lr * tf.math.exp(-0.1) # e^(-0.1)

callbacks_list = [
    keras.callbacks.EarlyStopping(monitor='val_acc', patience=15), # when val_acc does not improve for more than 15 epoch, stop training
    keras.callbacks.ModelCheckpoint(filepath='my_model.h5', monitor='val_loss', save_best_only=True),
    keras.callbacks.ReduceLROnPlateau(monitor='val_loss', 
                                      factor=0.1, # when the callback is triggered, lr *= 0.1
                                      patience=10), # trigger callback if val_loss doesn't decrease more than 10 epochs                  
    keras.callbacks.LearningRateScheduler(scheduler),
]

# use model.fit(..., callbacks=callbacks_list)

In [None]:
import numpy as np

# save activations from each layer in each epoch
class ActivationLogger(keras.callbacks.Callback):

    def set_model(self, model):
        self.model = model # tell which model should use this callback
        layer_outputs = [layer.output for layer in model.layers]
        # custom model that gives outputs(activations) from each layer 
        self.activations_model = keras.models.Model(model.input, layer_outputs)
    
    def on_epoch_end(self, epoch, logs=None):
        if self.validation is None:
            raise RuntimeError('Require validation_data')
        
        validation_sample = self.validation_data[0][0:1]
        activations = self.activations_model.predict(validation_sample)
        with open(f'activation_at_epoch_{str(epoch)}.npz', 'w') as f:
            np.savez(f, activations)
    
    def on_epoch_begin(self, epoch, logs=None):
        pass
    
    def on_batch_begin(self, epoch, logs=None):
        pass

    def on_batch_end(self, epoch, logs=None):
        pass
    
    def on_train_begin(self, epoch, logs=None):
        pass

    def on_train_end(self, epoch, logs=None):
        pass


# TensorBoard

In [None]:
import keras 
from keras import layers
from keras.datasets import imdb
from keras.preprocessing import sequence

max_features = 2000
max_len = 500

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
x_train = sequence.pad_sequences(x_train, maxlen=max_len)
x_test = sequence.pad_sequences(x_test, maxlen=max_len)

model = keras.models.Sequential()
model.add(layers.Embedding(max_features, 128, input_length=max_len, name='embed'))
model.add(layers.Conv1D(32, 7, activation='relu'))
model.add(layers.MaxPool1D(5))
model.add(layers.Conv1D(32, 7, activation='relu'))
model.add(layers.GlobalMaxPool1D())
model.add(layers.Dense(1))
model.summary()
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embed (Embedding)            (None, 500, 128)          256000    
_________________________________________________________________
conv1d (Conv1D)              (None, 494, 32)           28704     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 98, 32)            0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 92, 32)            7200      
_________________________________________________________________
global_max_pooling1d (Global (None, 32)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 33        
Total params: 291,937
Trainable

In [None]:
!mkdir ./drive/"My Drive"/my_log_dir

In [None]:
callbacks = [
    keras.callbacks.TensorBoard(
        log_dir='./drive/My Drive/my_log_dir', 
        histogram_freq=1, # draw the activation histogram every 1 epoch
        embeddings_freq=1 # draw the 3D embeddings every 1 epoch
    )
]

history = model.fit(x_train, y_train, 
                    epochs=20, batch_size=128, 
                    validation_split=0.2, 
                    callbacks=callbacks)

Epoch 1/20
Instructions for updating:
use `tf.profiler.experimental.stop` instead.
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
# on linux
!tensorboard --logdir=./drive/"My Drive"/my_log_dir

# 深度可分離卷積神經網路

The depthwise separable convolution compute space convolution (3 x 3 kernel) for each input channel and concatecate them eventually. This separates the channel features from space features. Therefore, the computation cost is reduced significantly since there are less parameters.

In [45]:
from keras.models import Sequential, Model
from keras import layers

height = 64
width = 64
channels = 3
num_classes = 10

model = Sequential()
model.add(layers.SeparableConv2D(32, 3, 
                                 activation='relu', 
                                 input_shape=(height, width, channels)))
model.add(layers.SeparableConv2D(64, 3, activation='relu'))
model.add(layers.MaxPool2D(2))

model.add(layers.SeparableConv2D(64, 3, activation='relu'))
model.add(layers.SeparableConv2D(128, 3, activation='relu'))
model.add(layers.MaxPooling2D(2))

#model.add(layers.SeparableConv2D(64, 3, activation='relu'))  # 怪怪的
model.add(layers.SeparableConv2D(128, 3, activation='relu'))
model.add(layers.GlobalAveragePooling2D()) # 對張量一、二軸取最大值

model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dense(num_classes, activation='softmax'))

model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
model.summary()

Model: "sequential_14"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
separable_conv2d_67 (Separab (None, 62, 62, 32)        155       
_________________________________________________________________
separable_conv2d_68 (Separab (None, 60, 60, 64)        2400      
_________________________________________________________________
max_pooling2d_26 (MaxPooling (None, 30, 30, 64)        0         
_________________________________________________________________
separable_conv2d_69 (Separab (None, 28, 28, 64)        4736      
_________________________________________________________________
separable_conv2d_70 (Separab (None, 26, 26, 128)       8896      
_________________________________________________________________
max_pooling2d_27 (MaxPooling (None, 13, 13, 128)       0         
_________________________________________________________________
separable_conv2d_71 (Separab (None, 11, 11, 128)     