# 资料增补 (Data Augmentation)

In [1]:
# 载入套件
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

## 从网路取得压缩档，并解压缩

In [None]:
# 从网路取得压缩档，并解压缩
import os
import zipfile

# URL of the zip archive
zip_file_path = (
    'https://download.microsoft.com/download/3/E/1/'
    '3E1C3F21-ECDB-4869-8368-6DEBA77B919F/kagglecatsanddogs_3367a.zip'
)

# Local path where the downloaded archive is stored
zip_file = os.path.join(os.getcwd(), 'CatAndDog.zip')

# Download the archive only when it is not already on disk
archive_missing = not os.path.exists(zip_file)
if archive_missing:
    tf.keras.utils.get_file(
        fname=zip_file,
        origin=zip_file_path,
        archive_format='auto',
    )

# Extract the archive into unzip_path only when that directory does not exist yet
unzip_path = os.path.join(os.getcwd(), 'CatAndDog')
if not os.path.exists(unzip_path):
    with zipfile.ZipFile(zip_file, 'r') as archive:
        archive.extractall(path=unzip_path)

Downloading data from https://download.microsoft.com/download/3/E/1/3E1C3F21-ECDB-4869-8368-6DEBA77B919F/kagglecatsanddogs_3367a.zip
191922176/824894548 [=====>........................] - ETA: 5:20

## 过滤不合格的档案

#### 扫描每一个档案，若表头不含"JFIF"，即为不合格的档案，不纳入训练资料内。


In [None]:
# Scan every file under PetImages/Cat and PetImages/Dog. A file whose leading
# bytes do not contain "JFIF" is treated as invalid and deleted, so it is not
# included in the training data.
num_skipped = 0   # number of files deleted
for folder_name in ("Cat", "Dog"):
    folder_path = os.path.join(unzip_path, "PetImages", folder_name)
    for fname in os.listdir(folder_path):
        fpath = os.path.join(folder_path, fname)
        # Use a context manager: if open() itself fails, the error propagates
        # directly instead of a `finally` clause touching an unbound (or a
        # previous iteration's) file handle.
        with open(fpath, "rb") as fobj:
            is_jfif = b"JFIF" in fobj.peek(10)

        if not is_jfif:
            num_skipped += 1
            # Delete the invalid file
            os.remove(fpath)

print(f"删除 {num_skipped} 个档案")

## 以档案目录为基础，建立训练(Training)及验证(Validation)资料集(Dataset)

In [None]:
# image_dataset_from_directory reads the image files under a directory into a dataset.
# Original author's note: supported starting with tf v2.3.0.

image_size = (180, 180)  # image size
batch_size = 32          # batch size

# Both subsets read the same directory with the same split ratio and seed;
# only the `subset` argument differs between the two calls.
pet_images_dir = os.path.join(unzip_path, "PetImages")
shared_split_options = dict(
    validation_split=0.2,
    seed=1337,
    image_size=image_size,
    batch_size=batch_size,
)

# Training dataset
train_ds = tf.keras.preprocessing.image_dataset_from_directory(
    pet_images_dir, subset="training", **shared_split_options
)
# Validation dataset
val_ds = tf.keras.preprocessing.image_dataset_from_directory(
    pet_images_dir, subset="validation", **shared_split_options
)

## 显示训练资料前9笔影像
### 标注为1是狗(dog)，0是猫(cat)


In [None]:
import matplotlib.pyplot as plt

# Plot the first 9 images of one training batch in a 3x3 grid,
# using each image's integer label as the subplot title.
plt.figure(figsize=(10, 10))
for batch_images, batch_labels in train_ds.take(1):
    for cell in range(1, 10):
        ax = plt.subplot(3, 3, cell)
        picture = batch_images[cell - 1].numpy().astype("uint8")
        plt.imshow(picture)
        plt.title(int(batch_labels[cell - 1]))
        plt.axis("off")

## 定义资料增补(Data Augmentation)

In [None]:
# Augmentation pipeline, applied in this order:
#   RandomFlip("horizontal") - horizontal flip
#   RandomRotation(0.1)      - rotation with factor 0.1
preprocessing_layers = layers.experimental.preprocessing
data_augmentation = keras.Sequential(
    [
        preprocessing_layers.RandomFlip("horizontal"),
        preprocessing_layers.RandomRotation(0.1),
    ]
)

## 显示资料增补后的影像

In [None]:
# Plot augmented images in a 3x3 grid. The augmentation pipeline is invoked
# once per grid cell on the whole batch, and only the first image of each
# augmented batch is drawn.
plt.figure(figsize=(10, 10))
for batch_images, _ in train_ds.take(1):
    for cell in range(1, 10):
        augmented_images = data_augmentation(batch_images)
        ax = plt.subplot(3, 3, cell)
        first_augmented = augmented_images[0].numpy().astype("uint8")
        plt.imshow(first_augmented)
        plt.axis("off")

## prefetch：预先读取训练资料，以提升效能

In [None]:
# Enable prefetching (buffer of 32 dataset elements) on both datasets.
train_ds, val_ds = [ds.prefetch(buffer_size=32) for ds in (train_ds, val_ds)]

## 建立模型

In [None]:
# Define the model
def make_model(input_shape, num_classes):
    """Build a convolutional image classifier with residual separable-conv blocks.

    The network applies the module-level ``data_augmentation`` pipeline and a
    1/255 rescaling to its input, followed by two plain convolutions, four
    residual blocks of separable convolutions, a final separable convolution,
    global average pooling, dropout and a dense output layer.

    Args:
        input_shape: Shape handed to ``keras.Input`` (batch dimension excluded).
        num_classes: Number of target classes. When it equals 2 the output is
            a single sigmoid unit; otherwise it is a softmax over
            ``num_classes`` units.

    Returns:
        An uncompiled ``keras.Model`` mapping the input to the output layer.
    """
    inputs = keras.Input(shape=input_shape)

    # Image augmentation, then feature scaling.
    net = data_augmentation(inputs)
    net = layers.experimental.preprocessing.Rescaling(1.0 / 255)(net)

    # Two plain convolutions; the first one halves the spatial resolution.
    net = layers.Conv2D(32, 3, strides=2, padding="same")(net)
    net = layers.BatchNormalization()(net)
    net = layers.Activation("relu")(net)

    net = layers.Conv2D(64, 3, padding="same")(net)
    net = layers.BatchNormalization()(net)
    net = layers.Activation("relu")(net)

    # Tensor that feeds the shortcut branch of the next residual block.
    shortcut_source = net

    for filters in (128, 256, 512, 728):
        # Main branch: two (relu -> separable conv -> batch norm) stages.
        for _ in range(2):
            net = layers.Activation("relu")(net)
            net = layers.SeparableConv2D(filters, 3, padding="same")(net)
            net = layers.BatchNormalization()(net)

        net = layers.MaxPooling2D(3, strides=2, padding="same")(net)

        # Shortcut branch: a strided 1x1 convolution, so that it can be added
        # to the pooled main branch.
        projected = layers.Conv2D(filters, 1, strides=2, padding="same")(
            shortcut_source
        )
        net = layers.add([net, projected])
        shortcut_source = net

    net = layers.SeparableConv2D(1024, 3, padding="same")(net)
    net = layers.BatchNormalization()(net)
    net = layers.Activation("relu")(net)

    net = layers.GlobalAveragePooling2D()(net)

    # Two classes -> one sigmoid unit; otherwise one softmax unit per class.
    is_binary = num_classes == 2
    units = 1 if is_binary else num_classes
    activation = "sigmoid" if is_binary else "softmax"

    net = layers.Dropout(0.5)(net)
    outputs = layers.Dense(units, activation=activation)(net)
    return keras.Model(inputs, outputs)

# Build the model: the input shape is image_size with a trailing dimension of 3,
# and there are two classes.
model_input_shape = (*image_size, 3)
model = make_model(input_shape=model_input_shape, num_classes=2)

# Draw the model structure, including layer shapes
keras.utils.plot_model(model, show_shapes=True)

## 训练模型

In [None]:
epochs = 5

# Configure the optimizer, the loss function and the evaluation metrics
adam_optimizer = keras.optimizers.Adam(learning_rate=1e-3)
model.compile(optimizer=adam_optimizer, loss="binary_crossentropy", metrics=["accuracy"])

# Train the model, evaluating on the validation dataset
model.fit(train_ds, validation_data=val_ds, epochs=epochs)

### 上述程式仅训练 5 epochs；若训练 50 epochs，验证准确率可达 96%。

## 从目录中任选一个档案测试

In [None]:
# Save the model to an HDF5 file in the current directory
model.save(filepath='./pet_model.h5')

In [None]:
# Load the model back from the saved HDF5 file
model = tf.keras.models.load_model(filepath='./pet_model.h5')

In [None]:
# Pick one image file and run a prediction on it
img = keras.preprocessing.image.load_img(
    os.path.join(unzip_path, "PetImages/Cat/18.jpg"), target_size=image_size
)
img_array = keras.preprocessing.image.img_to_array(img)  # convert the image to an array
img_array = tf.expand_dims(img_array, 0)  # add a leading dimension: a batch of one sample

predictions = model.predict(img_array)
# The model ends in a single sigmoid unit trained with binary cross-entropy,
# so its output is the probability of label 1. In this dataset label 1 is dog
# and label 0 is cat, hence the cat probability is the complement.
score = predictions[0][0]
cat_probability = 1 - score
print(f"是猫的机率= {(100 * cat_probability):.2f}%")