# Embedding Flatten Classifier

## 0. Prepare Environment

### Install and import dependent libraries
This section installs the required packages. Versions are pinned for reproducibility.

In [None]:
! pip install janome==0.3.10 attrdict==2.0.1

In [None]:
# Import libraries
import attrdict
from janome.tokenizer import Tokenizer as JanomeTokenizer
import numpy as np
import random
import os
import tensorflow.keras as keras
from tensorflow.keras.preprocessing.sequence import pad_sequences

### Define parameters
Declare the parameters that are set by `papermill`.

In [None]:
# Default run parameters (the surrounding notebook text says papermill sets these).
_params = dict(
    data_dir="data_sample",  # directory holding train.tsv / valid.tsv / test.tsv
    output_dir="output",  # receives the tokenizer JSON files, model.h5 and TensorBoard logs
    num_words=500,  # passed to the text Tokenizer and used as the Embedding input dimension
    embedding_size=100,  # output dimension of the Embedding layer
    batch_size=32,  # batch size passed to model.fit
    input_length=50,  # maxlen for pad_sequences and input_length of the Embedding layer
    num_epochs=10,  # upper bound on training epochs (early stopping may end sooner)
    seed=1234,  # value handed to set_seed
)

In [None]:
# Convert provided parameter dictionary to attribute object
# so later cells can read values as `_params.seed`, `_params.data_dir`, etc.
_params = attrdict.AttrDict(_params)

### Set Seed for Reproducibility

In [None]:
def set_seed(seed):
    """Seed the Python, NumPy and TensorFlow random number generators.

    Args:
        seed: Value passed to `np.random.seed`, `random.seed` and
            `tf.random.set_seed`, in that order.

    Note:
        PYTHONHASHSEED is always written as '0', independent of `seed`.
        NOTE(review): Python reads that variable at interpreter start-up, so
        this assignment presumably only influences child processes - confirm.
    """
    # TensorFlow itself is imported here because the notebook's import cell
    # only brings in `tensorflow.keras`.
    import tensorflow as tf

    os.environ["PYTHONHASHSEED"] = "0"
    for seed_fn in (np.random.seed, random.seed, tf.random.set_seed):
        seed_fn(seed)

In [None]:
# Seed the RNGs before any tokenizer or model is built further down.
set_seed(_params.seed)

## 1. Define Problem

See README.md

## 2. Create Dataset

See README.md

## 3. Select Evaluation Metrics

See README.md

## 4. Determine Evaluation Protocol

See README.md

## 5. Prepare Data

Load Data from File

In [None]:
def load_dataset(path, encoding="utf-8"):
    """Read a tab-separated file and tokenize its text column with Janome.

    Every non-blank line is expected to hold the label in the first column and
    the raw text in the second; any further columns are ignored.

    Args:
        path: Path of the TSV file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform locale.

    Returns:
        A `(texts, labels)` tuple. `texts` is a list with one list of tokens
        per row; `labels` is the list of first-column strings, in file order.

    Raises:
        ValueError: If a non-blank line contains no tab separator.
    """
    wakati_tokenizer = JanomeTokenizer(wakati=True)
    texts, labels = [], []
    # The context manager guarantees the file handle is closed after reading.
    with open(path, encoding=encoding) as tsv_file:
        for line_no, line in enumerate(tsv_file, start=1):
            # Drop the line terminator so it is not tokenized as part of the text.
            row = line.rstrip("\r\n")
            if not row:
                # Tolerate blank lines, e.g. an extra newline at the end of the file.
                continue
            fields = row.split("\t")
            if len(fields) < 2:
                raise ValueError(
                    "{}:{}: expected '<label>\\t<text>' but found no tab".format(path, line_no)
                )
            labels.append(fields[0])
            # list() keeps each entry a concrete token list even if tokenize()
            # hands back a lazy iterable.
            texts.append(list(wakati_tokenizer.tokenize(fields[1])))
    return texts, labels

# Load the three splits from the data directory; each call yields (token_lists, labels).
train_texts, train_labels = load_dataset(f"{_params.data_dir}/train.tsv")
valid_texts, valid_labels = load_dataset(f"{_params.data_dir}/valid.tsv")
test_texts, test_labels = load_dataset(f"{_params.data_dir}/test.tsv")

Define preprocessor and tokenizer

In [None]:
# Build tokenizer
# The vocabulary is fit on the training split only; `train_texts` holds the
# token lists returned by load_dataset, not raw strings.
_text_tokenizer = keras.preprocessing.text.Tokenizer(num_words=_params.num_words)
_text_tokenizer.fit_on_texts(train_texts)
# Show the learned token -> index mapping as the cell output.
_text_tokenizer.word_index

# [TODO] process unknown words
# NOTE(review): no `oov_token` is passed, so tokens outside the vocabulary are
# presumably dropped when texts are converted to sequences - confirm.

In [None]:
# Build label mapper
# num_words is set to large number to cover all the labels
# filters="" and lower=False are passed so label strings are not filtered or lower-cased.
# NOTE(review): the tokenizer presumably still splits on spaces, so a label
# containing a space would not map to a single id - confirm.
_label_tokenizer = keras.preprocessing.text.Tokenizer(num_words=10000, filters="", lower=False)
_label_tokenizer.fit_on_texts(train_labels)
# Show the label -> index mapping as the cell output.
_label_tokenizer.word_index

Save tokenizers

In [None]:
import json


def save(tokenizer, name, output_dir):
    """Write a tokenizer's JSON configuration to `<output_dir>/<name>`.

    Args:
        tokenizer: Object exposing `to_json()` (here, a Keras tokenizer).
        name: File name to create inside `output_dir`.
        output_dir: Destination directory; created if it does not exist.
    """
    # Create the destination on demand so a fresh checkout without an output
    # directory does not fail on open(). An empty string means "current
    # directory" and needs no creation.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    config = tokenizer.to_json()
    with open(os.path.join(output_dir, name), "w") as f:
        print(config, file=f)


# Persist both fitted tokenizers next to the other run artifacts.
save(tokenizer=_text_tokenizer, name="text_tokenizer.json", output_dir=_params.output_dir)
save(tokenizer=_label_tokenizer, name="label_tokenizer.json", output_dir=_params.output_dir)

Preprocess the data to convert it into model inputs

In [None]:
def build_data(texts, labels, text_tokenizer, label_tokenizer, max_len):
    """Turn tokenized texts and label strings into model input arrays.

    Args:
        texts: Texts to encode with `text_tokenizer.texts_to_sequences`.
        labels: Label strings; each must be a key of `label_tokenizer.word_index`.
        text_tokenizer: Fitted tokenizer used to encode `texts`.
        label_tokenizer: Fitted tokenizer whose `word_index` maps labels to ids.
        max_len: `maxlen` handed to `pad_sequences`.

    Returns:
        `(x_data, y_data)`: the padded sequences and a NumPy array of label ids.

    Raises:
        KeyError: If a label is missing from `label_tokenizer.word_index`.
    """
    encoded_texts = text_tokenizer.texts_to_sequences(texts)
    x_data = pad_sequences(encoded_texts, maxlen=max_len)

    label_to_id = label_tokenizer.word_index
    y_data = np.array([label_to_id[label] for label in labels])
    return x_data, y_data

# Prepare train/validation/test sets, each from its own split.
# The tokenizers are fit on the training split only, so a validation or test
# label that never occurs in the training data raises KeyError in build_data.
_x_train, _y_train = build_data(train_texts, train_labels, _text_tokenizer, _label_tokenizer, _params.input_length)
_x_valid, _y_valid = build_data(valid_texts, valid_labels, _text_tokenizer, _label_tokenizer, _params.input_length)
_x_test, _y_test = build_data(test_texts, test_labels, _text_tokenizer, _label_tokenizer, _params.input_length)

## 6-8. Develop and Tune Models

6. Develop a model that outperforms the baseline model
7. Develop an overfitting model
8. Regularize the model and tune hyperparameters

In [None]:
def build_model(num_words, embedding_size, label_size, input_length):
    """Assemble the Embedding -> Flatten -> Dense(32) -> softmax classifier.

    Args:
        num_words: Input dimension of the Embedding layer (vocabulary size).
        embedding_size: Output dimension of the Embedding layer.
        label_size: Number of units in the final softmax layer.
        input_length: Length of each input sequence.

    Returns:
        An uncompiled `keras.models.Sequential` model.
    """
    model = keras.models.Sequential()
    # Size the embedding from the arguments, not from the notebook-level
    # `_params`, so the function honours whatever the caller passes in.
    model.add(keras.layers.Embedding(num_words, embedding_size, input_length=input_length))
    model.add(keras.layers.Flatten())
    model.add(keras.layers.Dense(32, activation="relu"))
    model.add(keras.layers.Dense(label_size, activation="softmax"))
    return model

In [None]:
def train(params, label_size, x_train, y_train, x_valid, y_valid):
    """Build, compile and fit the classifier, then return the fitted model.

    Args:
        params: Attribute-style settings; this function reads `num_words`,
            `embedding_size`, `input_length`, `num_epochs`, `batch_size`
            and `output_dir`.
        label_size: Number of units in the softmax output layer.
        x_train: Training inputs.
        y_train: Training targets as integer class ids (the loss is sparse).
        x_valid: Validation inputs.
        y_valid: Validation targets as integer class ids.

    Returns:
        The model attached to the History object returned by `fit`. The best
        checkpoint by validation accuracy is written separately to
        `<output_dir>/model.h5`.
    """
    model = build_model(params.num_words, params.embedding_size, label_size, params.input_length)
    model.summary()

    # Train
    model.compile(
        optimizer="rmsprop",
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy", "sparse_top_k_categorical_accuracy"],
    )
    callbacks_list = [
        keras.callbacks.EarlyStopping(
            monitor="val_accuracy",
            patience=1,  # stop after one epoch without improvement in val_accuracy
        ),
        keras.callbacks.ModelCheckpoint(
            filepath=os.path.join(params.output_dir, "model.h5"),
            # Select the checkpoint on the validation metric, matching early
            # stopping; training accuracy would favour the most overfit epoch.
            monitor="val_accuracy",
            save_best_only=True,
        ),
        keras.callbacks.TensorBoard(
            log_dir=params.output_dir,
            histogram_freq=1,
            embeddings_freq=1,
        ),
    ]

    # Read epochs and batch size from the `params` argument rather than the
    # notebook-level `_params`, so the function works with any settings object.
    history = model.fit(
        x_train,
        y_train,
        epochs=params.num_epochs,
        batch_size=params.batch_size,
        callbacks=callbacks_list,
        validation_data=(x_valid, y_valid),
    )
    return history.model

# The label ids come from the tokenizer's word_index. NOTE(review): Keras
# tokenizers appear to number entries from 1 (index 0 is never assigned), which
# is why the output layer gets len(word_index) + 1 units - confirm.
_val_best_model = train(_params, len(_label_tokenizer.word_index)+1, _x_train, _y_train, _x_valid, _y_valid)

## 9. Evaluate Test Data

In [None]:
def evaluate(params, model, x_test, y_test):
    """Print the model summary, then print its evaluation result.

    Args:
        params: Not used by this function.
        model: Object exposing `summary()` and `evaluate(x, y)`.
        x_test: Inputs passed to `model.evaluate`.
        y_test: Targets passed to `model.evaluate`.
    """
    model.summary()
    scores = model.evaluate(x_test, y_test)
    print(scores)
    

# Evaluate the in-memory model returned by train() on the test arrays.
evaluate(_params, _val_best_model, _x_test, _y_test)

In [None]:
def load_and_evaluate(params, x_test, y_test):
    """Load `<params.output_dir>/model.h5` and print its evaluation result.

    Args:
        params: Settings object; only `output_dir` is read.
        x_test: Inputs passed to the loaded model's `evaluate`.
        y_test: Targets passed to the loaded model's `evaluate`.
    """
    checkpoint_path = os.path.join(params.output_dir, "model.h5")
    restored = keras.models.load_model(checkpoint_path)
    scores = restored.evaluate(x_test, y_test)
    print(scores)
    
# Evaluate the checkpoint saved to model.h5 during training on the same test arrays.
load_and_evaluate(_params, _x_test, _y_test)