<a href="https://colab.research.google.com/github/mogi240/AI_begin/blob/master/%ED%95%9C%EA%B8%80%EB%9D%84%EC%96%B4%EC%93%B0%EA%B8%B0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive inside the Colab runtime; later cells read and write files under /content/drive.
from google.colab import drive
drive.mount('/content/drive') # the mount-point path is passed to mount()

Mounted at /content/drive


In [2]:
import json
from argparse import ArgumentParser
from typing import List, Optional, Tuple

import tensorflow as tf

from keras.losses import LossFunctionWrapper

# Command-line interface of the original training script: four required string options.
parser = ArgumentParser()
for _required_flag in ("--train-file", "--dev-file", "--training-config", "--char-file"):
    parser.add_argument(_required_flag, type=str, required=True)
del _required_flag  # keep the module namespace free of the loop variable


class SpacingModel(tf.keras.Model):
    """Character-level CNN that emits one vector of class scores per input position.

    Pipeline: embedding -> parallel Conv1D branches -> max pooling -> dropout
    -> dense -> dropout -> dense(num_classes). The last dense layer has no
    activation, so the model returns raw logits.

    Args:
        vocab_size: Number of rows in the embedding table.
        hidden_size: Embedding width and width of the intermediate dense layer.
        num_classes: Width of the final output layer.
        conv_activation: Activation used by every Conv1D branch.
        dense_activation: Activation used by the intermediate dense layer.
        conv_kernel_and_filter_sizes: One ``(kernel_size, filter_count)`` pair
            per convolution branch. ``None`` selects the default
            ``[(2, 8), (3, 8), (4, 8), (5, 8)]``.
        dropout_rate: Drop rate shared by both dropout layers.
    """

    def __init__(
        self,
        vocab_size: int,
        hidden_size: int,
        num_classes: int = 3,
        conv_activation: str = "relu",
        dense_activation: str = "relu",
        conv_kernel_and_filter_sizes: Optional[List[Tuple[int, int]]] = None,
        dropout_rate: float = 0.3,
    ):
        super().__init__()

        # A None sentinel replaces the former list default so that no mutable
        # object is shared between calls.
        if conv_kernel_and_filter_sizes is None:
            conv_kernel_and_filter_sizes = [(2, 8), (3, 8), (4, 8), (5, 8)]

        # NOTE: attribute names and creation order are kept as-is; checkpoints
        # written by save_weights() are keyed on them.
        self.embeddings = tf.keras.layers.Embedding(vocab_size, hidden_size)
        self.convs = [
            tf.keras.layers.Conv1D(
                filter_size,
                kernel_size,
                padding="same",  # keeps the sequence length unchanged
                activation=conv_activation,
            )
            for kernel_size, filter_size in conv_kernel_and_filter_sizes
        ]
        # With data_format="channels_first" the pooling window slides over the
        # LAST axis of the conv output, i.e. over the filters. A window as wide
        # as the filter count therefore keeps one maximum per position.
        self.pools = [
            tf.keras.layers.MaxPooling1D(pool_size=filter_size, data_format="channels_first")
            for _, filter_size in conv_kernel_and_filter_sizes
        ]
        self.dropout1 = tf.keras.layers.Dropout(rate=dropout_rate)
        self.output_dense1 = tf.keras.layers.Dense(hidden_size, activation=dense_activation)
        self.dropout2 = tf.keras.layers.Dropout(rate=dropout_rate)
        self.output_dense2 = tf.keras.layers.Dense(num_classes)

    def call(self, input_tensor):
        """
        input_tensor: Tokenized Sequences, Shape: (Batch Size, Sequence Length)

        Returns logits of shape (Batch Size, Sequence Length, num_classes).
        """

        # embeddings: (Batch Size, Sequence Length, Hidden Size)
        embeddings = self.embeddings(input_tensor)
        # Each pooled branch is (Batch Size, Sequence Length, 1), so
        # features: (Batch Size, Sequence Length, number of conv branches)
        features = self.dropout1(
            tf.concat([pool(conv(embeddings)) for conv, pool in zip(self.convs, self.pools)], axis=-1)
        )
        # projected: (Batch Size, Sequence Length, Hidden Size)
        projected = self.dropout2(self.output_dense1(features))
        # (Batch Size, Sequence Length, num_classes)
        return self.output_dense2(projected)


def string_to_example(
    vocab_table: tf.lookup.StaticHashTable,
    encoding: str = "UTF-8",
    max_length: int = 256,
    delete_prob: float = 0.5,
    add_prob: float = 0.15,
):
    """Build a function that maps one text line to a noisy (token ids, labels) training pair.

    The returned function randomly corrupts the spacing of the line and records,
    per emitted character, what has to be done to undo the corruption:

    * label 0: leave as is
    * label 1: a space was removed after this character (a space must be added)
    * label 2: this character is an inserted space (it must be deleted)

    Args:
        vocab_table: Lookup table mapping single-character strings (and the
            "<s>" / "</s>" markers) to integer ids.
        encoding: Encoding passed to `tf.strings.unicode_split`.
        max_length: Both outputs are truncated and then padded to this length.
        delete_prob: Probability of removing a space that follows a character.
        add_prob: Probability of inserting a space after a non-space character
            that is not followed by a space.

    Returns:
        A `tf.function` taking a scalar string tensor and returning
        `(ids, labels)`, both of length `max_length`. Ids are padded with 0 and
        labels with -1.
    """
    @tf.function
    def _inner(tensors: tf.Tensor):
        # Collapse runs of spaces into one space, then split into one string per character.
        bytes_array = tf.strings.unicode_split(tf.strings.regex_replace(tensors, " +", " "), encoding)
        space_positions = bytes_array == " "
        sequence_length = tf.shape(space_positions)[0]

        while_condition = lambda i, *_: i < sequence_length

        def while_body(i, strings, labels):
            # True only when a following character exists and it is a space.
            is_next_char_space = tf.cond(i < sequence_length - 1, lambda: bytes_array[i + 1] == " ", lambda: False)

            # If a space follows: drop it with probability delete_prob (state 2).
            # Otherwise, if the current character is not a space: insert a space
            # after it with probability add_prob (state 1). Everything else is state 0.
            # NOTE(review): the Python `and` on tensors presumably relies on AutoGraph
            # converting the enclosing @tf.function — confirm before reusing this
            # body outside a traced context.
            state = tf.cond(
                is_next_char_space,
                lambda: tf.cond(tf.random.uniform([]) < delete_prob, lambda: 2, lambda: 0),
                lambda: tf.cond(bytes_array[i] != " " and tf.random.uniform([]) < add_prob, lambda: 1, lambda: 0),
            )
            # 0: proceed unchanged
            # 1: add a space at the next index
            # 2: delete the following space
            strings = tf.cond(
                state != 1,
                lambda: tf.concat([strings, [bytes_array[i]]], axis=0),
                lambda: tf.concat([strings, [bytes_array[i], " "]], axis=0),
            )
            # label 0: no change
            # label 1: a space must be added after this position
            # label 2: the space at this position must be deleted
            labels = tf.cond(
                state == 0,
                lambda: tf.concat([labels, [0]], axis=0),
                lambda: tf.cond(
                    state == 1,
                    # character keeps label 0, the inserted space gets label 2
                    lambda: tf.concat([labels, [0, 2]], axis=0),
                    lambda: tf.concat([labels, [1]], axis=0),
                ),
            )
            # When a space was dropped, skip over it in the source as well.
            i += tf.cond(state == 2, lambda: 2, lambda: 1)

            return (i, strings, labels)

        # Both accumulators grow every iteration, hence the [None] shape invariants.
        i, strings, labels = tf.while_loop(
            while_condition,
            while_body,
            (
                tf.constant(0),
                tf.constant([], dtype=tf.string),
                tf.constant([], dtype=tf.int32),
            ),
            shape_invariants=(tf.TensorShape([]), tf.TensorShape([None]), tf.TensorShape([None])),
        )

        # Wrap the sequence in boundary markers (label 0 for both) and map characters to ids.
        strings = vocab_table.lookup(tf.concat([["<s>"], strings, ["</s>"]], axis=0))
        labels = tf.concat([[0], labels, [0]], axis=0)

        # Truncate to max_length; for over-long lines this cuts off the trailing "</s>".
        strings = tf.cond(tf.shape(strings)[0] > max_length, lambda: strings[:max_length], lambda: strings)
        labels = tf.cond(tf.shape(labels)[0] > max_length, lambda: labels[:max_length], lambda: labels)

        # Right-pad to exactly max_length: ids with 0 (tf.pad default), labels with -1.
        length_to_pad = max_length - tf.shape(strings)[0]
        strings = tf.pad(strings, [[0, length_to_pad]])
        labels = tf.pad(labels, [[0, length_to_pad]], constant_values=-1)

        return (strings, labels)

    return _inner


def sparse_categorical_crossentropy_with_ignore(y_true, y_pred, from_logits=False, axis=-1, ignore_id=-1):
    """Sparse categorical cross-entropy over the positions whose label is not `ignore_id`.

    Positions where `y_true == ignore_id` are removed from both `y_true` and
    `y_pred` before the Keras loss function is applied. `from_logits` and
    `axis` are forwarded to it unchanged.
    """
    # Coordinates of every label that takes part in the loss.
    kept_indices = tf.where(y_true != ignore_id)

    kept_labels = tf.gather_nd(y_true, kept_indices)
    kept_predictions = tf.gather_nd(y_pred, kept_indices)

    return tf.keras.losses.sparse_categorical_crossentropy(
        kept_labels,
        kept_predictions,
        from_logits=from_logits,
        axis=axis,
    )


def sparse_categorical_accuracy_with_ignore(y_true, y_pred, ignore_id=-1):
    """Sparse categorical accuracy over the positions whose label is not `ignore_id`.

    Positions where `y_true == ignore_id` are removed from both tensors before
    the Keras metric function is applied.
    """
    # Coordinates of every label that takes part in the metric.
    kept_indices = tf.where(y_true != ignore_id)

    kept_labels = tf.gather_nd(y_true, kept_indices)
    kept_predictions = tf.gather_nd(y_pred, kept_indices)

    return tf.keras.metrics.sparse_categorical_accuracy(kept_labels, kept_predictions)


#class SparseCategoricalCrossentropyWithIgnore(tf.python.keras.losses.LossFunctionWrapper):
class SparseCategoricalCrossentropyWithIgnore(LossFunctionWrapper):
    """Keras loss object around `sparse_categorical_crossentropy_with_ignore`.

    Args:
        from_logits: Forwarded to the wrapped loss function.
        reduction: Reduction mode handed to `LossFunctionWrapper`.
        ignore_id: Label value whose positions are excluded from the loss.
        name: Name of the loss instance.
    """

    def __init__(
        self,
        from_logits=False,
        reduction=tf.keras.losses.Reduction.AUTO,
        ignore_id=-1,
        name="sparse_categorical_crossentropy_with_ignore",
    ):
        # Extra keyword arguments given to the wrapper are passed on to the wrapped function.
        wrapped_fn_kwargs = {"ignore_id": ignore_id, "from_logits": from_logits}
        super().__init__(
            sparse_categorical_crossentropy_with_ignore,
            name=name,
            reduction=reduction,
            **wrapped_fn_kwargs,
        )



In [None]:
 '''
 모델링
 '''

 def main():
    """Train the spacing model and export it to Google Drive.

    Steps: build the character vocabulary table, create the train/dev input
    pipelines, fit the model, write a SavedModel plus a weights checkpoint, and
    finally reload those weights into a fresh model that is exported a second
    time with an explicit `serve` signature (int32 tensor of shape [None, None]).
    """
    print('main start....')

    config = {
        "train_batch_size": 64,
        "val_batch_size": 1024,
        "epochs": 5,
        "learning_rate": 0.01,
        "vocab_size": 5000,
        "hidden_size": 48,
        "conv_activation": "relu",
        "dense_activation": "relu",
        "conv_kernel_and_filter_sizes": [[2, 8], [3, 8], [4, 8], [5, 16], [6, 16], [7, 16], [8, 16], [9, 16], [10, 16]],
        "dropout_rate": 0.1,
    }

    # The character file holds Hangul text, so decode it explicitly as UTF-8
    # instead of relying on the platform's default encoding.
    with open('/content/drive/MyDrive/input/hangeul/chars-4996', encoding="utf-8") as f:
        content = f.read()

    # Ids 0-3 are reserved for the special tokens; every character of the file follows.
    keys = ["<pad>", "<s>", "</s>", "<unk>"] + list(content)
    values = list(range(len(keys)))

    vocab_initializer = tf.lookup.KeyValueTensorInitializer(keys, values, key_dtype=tf.string, value_dtype=tf.int32)
    # Unknown characters map to id 3 ("<unk>").
    vocab_table = tf.lookup.StaticHashTable(vocab_initializer, default_value=3)

    # NOTE(review): TextLineDataset is created without a compression_type and only
    # supports "", "ZLIB" and "GZIP" — confirm these ".zip" paths really contain
    # plain text lines.
    train_dataset = (
        tf.data.TextLineDataset(tf.constant('/content/drive/MyDrive/input/hangeul/namuwikitext_20200302.train.zip'))
        .shuffle(10000)
        .map(
            string_to_example(vocab_table),
            num_parallel_calls=tf.data.experimental.AUTOTUNE,
        )
        .batch(config["train_batch_size"])
    )
    # Validation uses only the first 4 batches of the dev pipeline.
    dev_dataset = (
        tf.data.TextLineDataset(tf.constant('/content/drive/MyDrive/input/hangeul/namuwikitext_20200302.dev.zip'))
        .shuffle(10000)
        .map(
            string_to_example(vocab_table),
            num_parallel_calls=tf.data.experimental.AUTOTUNE,
        )
        .batch(config["val_batch_size"])
        .take(4)
    )

    model = SpacingModel(
        config["vocab_size"],
        config["hidden_size"],
        conv_activation=config["conv_activation"],
        dense_activation=config["dense_activation"],
        conv_kernel_and_filter_sizes=config["conv_kernel_and_filter_sizes"],
        dropout_rate=config["dropout_rate"],
    )
    # The model outputs logits, hence from_logits=True; label -1 marks padding.
    model.compile(
        optimizer=tf.optimizers.Adam(learning_rate=config["learning_rate"]),
        loss=SparseCategoricalCrossentropyWithIgnore(from_logits=True, ignore_id=-1),
        metrics=[sparse_categorical_accuracy_with_ignore],
    )
    model.fit(
        train_dataset,
        epochs=config["epochs"],
        validation_data=dev_dataset,
        steps_per_epoch=400,
        callbacks=[
            tf.keras.callbacks.ModelCheckpoint(filepath="./models/checkpoint-{epoch}.ckpt"),
            tf.keras.callbacks.TensorBoard(log_dir="./logs"),
            tf.keras.callbacks.ReduceLROnPlateau(patience=2, verbose=1),
        ],
    )

    # First export: the trained model as-is, with default signatures.
    tf.saved_model.save(
        model,
        '/content/drive/MyDrive/input/hangeul/my_custom_model_1',
        signatures=None,
        options=None,
    )
    model.save_weights('/content/drive/MyDrive/input/hangeul/weight/my_custom_model_weight')

    # Second export: a fresh model restored from the weights just written.
    model = SpacingModel(
        config["vocab_size"],
        config["hidden_size"],
        conv_activation=config["conv_activation"],
        dense_activation=config["dense_activation"],
        conv_kernel_and_filter_sizes=config["conv_kernel_and_filter_sizes"],
        dropout_rate=config["dropout_rate"],
    )
    model.load_weights('/content/drive/MyDrive/input/hangeul/weight/my_custom_model_weight')

    @tf.function()
    def serve(input_tensor):
        return model(input_tensor)

    tf.saved_model.save(
        model,
        '/content/drive/MyDrive/input/hangeul/my_custom_model_2',
        serve.get_concrete_function(tf.TensorSpec(shape=[None, None], dtype=tf.int32, name="input_tensor")),
    )

    print('main end....')

# Start training as soon as this cell (or script) is executed directly.
if __name__ == "__main__":
    main()


main start....
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30

Epoch 6: ReduceLROnPlateau reducing learning rate to 0.0009999999776482583.
Epoch 7/30
Epoch 8/30

Epoch 8: ReduceLROnPlateau reducing learning rate to 9.999999310821295e-05.
Epoch 9/30
Epoch 10/30

Epoch 10: ReduceLROnPlateau reducing learning rate to 9.999999019782991e-06.
Epoch 11/30
Epoch 12/30

Epoch 12: ReduceLROnPlateau reducing learning rate to 9.99999883788405e-07.
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30

In [None]:
'''
모델평가
'''

import json
from argparse import ArgumentParser

import tensorflow as tf

# from train import (
#     SpacingModel,
#     string_to_example,
#     sparse_categorical_accuracy_with_ignore,
#     SparseCategoricalCrossentropyWithIgnore,
# )

# parser = ArgumentParser()
# parser.add_argument("--char-file", type=str, required=True)
# parser.add_argument("--model-file", type=str, required=True)
# parser.add_argument("--training-config", type=str, required=True)
# parser.add_argument("--test-file", type=str, required=True)
# parser.add_argument("--add-prob", type=float, required=True)
# parser.add_argument("--delete-prob", type=float, required=True)


def main():
    """Evaluate the saved weights on the test split.

    Rebuilds the vocabulary table and the model with the training
    hyper-parameters, restores the weights checkpoint, prints the model summary
    and runs `model.evaluate` on the test pipeline.
    """
    config = {
        "train_batch_size": 64,
        "val_batch_size": 1024,
        "epochs": 30,
        "learning_rate": 0.01,
        "vocab_size": 5000,
        "hidden_size": 48,
        "conv_activation": "relu",
        "dense_activation": "relu",
        "conv_kernel_and_filter_sizes": [[2, 8], [3, 8], [4, 8], [5, 16], [6, 16], [7, 16], [8, 16], [9, 16], [10, 16]],
        "dropout_rate": 0.1,
    }

    # The character file holds Hangul text, so decode it explicitly as UTF-8
    # instead of relying on the platform's default encoding.
    with open('/content/drive/MyDrive/input/hangeul/chars-4996', encoding="utf-8") as f:
        content = f.read()

    # Ids 0-3 are reserved for the special tokens; every character of the file follows.
    keys = ["<pad>", "<s>", "</s>", "<unk>"] + list(content)
    values = list(range(len(keys)))

    vocab_initializer = tf.lookup.KeyValueTensorInitializer(keys, values, key_dtype=tf.string, value_dtype=tf.int32)
    # Unknown characters map to id 3 ("<unk>").
    vocab_table = tf.lookup.StaticHashTable(vocab_initializer, default_value=3)

    # NOTE(review): TextLineDataset is created without a compression_type and only
    # supports "", "ZLIB" and "GZIP" — confirm this ".zip" path really contains
    # plain text lines.
    test_dataset = (
        tf.data.TextLineDataset('/content/drive/MyDrive/input/hangeul/namuwikitext_20200302.test.zip')
        .shuffle(10000)
        .map(
            string_to_example(vocab_table),
            num_parallel_calls=tf.data.experimental.AUTOTUNE,
        )
        .batch(config["val_batch_size"])
    )

    model = SpacingModel(
        config["vocab_size"],
        config["hidden_size"],
        conv_activation=config["conv_activation"],
        dense_activation=config["dense_activation"],
        conv_kernel_and_filter_sizes=config["conv_kernel_and_filter_sizes"],
        dropout_rate=config["dropout_rate"],
    )

    # Compiling attaches the loss and metric that evaluate() reports.
    model.compile(
        optimizer=tf.optimizers.Adam(learning_rate=config["learning_rate"]),
        loss=SparseCategoricalCrossentropyWithIgnore(from_logits=True, ignore_id=-1),
        metrics=[sparse_categorical_accuracy_with_ignore],
    )

    model.load_weights('/content/drive/MyDrive/input/hangeul/weight/my_custom_model_weight')
    # Call the model once on a symbolic input so its layers are built before summary().
    model(tf.keras.Input([None], dtype=tf.int32))
    model.summary()
    model.evaluate(test_dataset)


# Run the evaluation as soon as this cell (or script) is executed directly.
if __name__ == "__main__":
    main()


In [None]:
'''
시뮬레이션
'''
import time
from datetime import timedelta

def main():
    """Interactive demo: read a sentence, print the model's re-spaced version.

    Rebuilds the vocabulary table and the model, restores the weights, then
    loops over `input()` lines. For each line it prints the wall-clock
    inference time in seconds and the decoded result. The loop ends on
    end-of-input or a keyboard interrupt.
    """
    config = {
        "train_batch_size": 64,
        "val_batch_size": 1024,
        "epochs": 30,
        "learning_rate": 0.01,
        "vocab_size": 5000,
        "hidden_size": 48,
        "conv_activation": "relu",
        "dense_activation": "relu",
        "conv_kernel_and_filter_sizes": [[2, 8], [3, 8], [4, 8], [5, 16], [6, 16], [7, 16], [8, 16], [9, 16], [10, 16]],
        "dropout_rate": 0.1,
    }

    # The character file holds Hangul text, so decode it explicitly as UTF-8
    # instead of relying on the platform's default encoding.
    with open('/content/drive/MyDrive/input/hangeul/chars-4996', encoding="utf-8") as f:
        content = f.read()

    # Ids 0-3 are reserved for the special tokens; every character of the file follows.
    keys = ["<pad>", "<s>", "</s>", "<unk>"] + list(content)
    values = list(range(len(keys)))

    vocab_initializer = tf.lookup.KeyValueTensorInitializer(keys, values, key_dtype=tf.string, value_dtype=tf.int32)
    # Unknown characters map to id 3 ("<unk>").
    vocab_table = tf.lookup.StaticHashTable(vocab_initializer, default_value=3)

    model = SpacingModel(
        config["vocab_size"],
        config["hidden_size"],
        conv_activation=config["conv_activation"],
        dense_activation=config["dense_activation"],
        conv_kernel_and_filter_sizes=config["conv_kernel_and_filter_sizes"],
        dropout_rate=config["dropout_rate"],
    )

    model.load_weights('/content/drive/MyDrive/input/hangeul/orginal/variables/')
    # Call the model once on a symbolic input so its layers are built before summary().
    model(tf.keras.Input([None], dtype=tf.int32))
    model.summary()

    inference = get_inference_fn(model, vocab_table)

    while True:
        try:
            input_str = input("Str: ")
        except (EOFError, KeyboardInterrupt):
            # No more input, or the user interrupted the prompt: stop cleanly.
            break

        # perf_counter() measures elapsed wall-clock time. process_time() would
        # report CPU time summed over all threads and ignore time spent waiting.
        start = time.perf_counter()
        input_str = tf.constant(input_str)
        result = inference(input_str).numpy()
        end = time.perf_counter()
        print("Time elapsed: ", end - start)  # seconds
        # `result` is an array of byte strings, one per emitted piece.
        print(b"".join(result).decode("utf8"))
        
        


def get_inference_fn(model, vocab_table):
    """Return a `tf.function` that re-spaces a single raw string with `model`.

    The returned function collapses repeated spaces, splits the text into
    characters, wraps them in "<s>" / "</s>", looks the pieces up in
    `vocab_table`, takes the arg-max class per position from `model` and hands
    the result to `convert_output_to_string`.
    """
    @tf.function
    def inference(tensors):
        single_spaced = tf.strings.regex_replace(tensors, " +", " ")
        characters = tf.strings.unicode_split(single_spaced, "UTF-8")
        byte_array = tf.concat([["<s>"], characters, ["</s>"]], axis=0)

        # Add a leading batch axis of size 1 for the model call.
        token_ids = vocab_table.lookup(byte_array)[tf.newaxis, :]
        scores = model(token_ids)

        # Most likely class per position, for the only element of the batch.
        predicted_labels = tf.argmax(scores, axis=-1)[0]
        return convert_output_to_string(byte_array, predicted_labels)

    return inference


def convert_output_to_string(byte_array, model_output):
    """Apply per-position spacing labels to a sequence of characters.

    For every position `i` in `model_output`:

    * label 1: emit `byte_array[i]` followed by a space
    * label 2 and `byte_array[i]` is a space: emit nothing
    * anything else: emit `byte_array[i]` unchanged

    Returns a 1-D string tensor holding the emitted pieces in order.
    """
    total_positions = tf.size(model_output)

    def has_more(index, *_):
        return index < total_positions

    def emit_position(index, pieces):
        char = byte_array[index]
        label = model_output[index]

        def char_then_space():
            return tf.concat([pieces, [char, " "]], axis=0)

        def char_or_nothing():
            # Only a space labelled 2 is dropped; other characters are always kept.
            drop_this_space = tf.logical_and(label == 2, char == " ")
            return tf.cond(
                drop_this_space,
                lambda: pieces,
                lambda: tf.concat([pieces, [char]], axis=0),
            )

        return index + 1, tf.cond(label == 1, char_then_space, char_or_nothing)

    # The accumulator grows every iteration, hence the [None] shape invariant.
    _, strings_result = tf.while_loop(
        has_more,
        emit_position,
        (tf.constant(0), tf.constant([], dtype=tf.string)),
        shape_invariants=(tf.TensorShape([]), tf.TensorShape([None])),
    )
    return strings_result


# Start the interactive loop once every helper in this cell has been defined.
if __name__ == "__main__":
    main()
