# Embedding Flatten Classifier

## Install and import dependent libraries
This section installs the required packages. Versions are pinned for reproducibility.

In [None]:
! pip install janome==0.3.10 attrdict==2.0.1

In [None]:
# Import libraries
import attrdict
from janome.tokenizer import Tokenizer as JanomeTokenizer
import numpy as np
import random
import os
import tensorflow.keras as keras
from tensorflow.keras.preprocessing.sequence import pad_sequences

## Define parameters
Declare the parameters set by `papermill`.

In [None]:
# Default values for the notebook parameters.
name = "test"              # identifier of this run (only stored in `_params`)
data_dir = "data_sample"   # directory holding train.tsv / valid.tsv / test.tsv
output_dir = "output"      # destination of the model checkpoint and TensorBoard logs
num_words = 500            # vocabulary size of the text tokenizer and the embedding
embedding_size = 100       # dimension of each embedding vector
batch_size = 32            # mini-batch size used while fitting
input_length = 50          # length every token sequence is padded/truncated to
num_epochs = 10            # maximum number of training epochs

Create an attribute object `_params` from the parameters, then delete the parameter variables to keep the namespace clean.

In [None]:
# Bundle all parameters into a single attribute-accessible object so that the
# rest of the notebook reads them as `_params.<key>`.
_params = attrdict.AttrDict({
    "name": name,
    "data_dir": data_dir,
    "output_dir": output_dir,
    "num_words": num_words,
    "embedding_size": embedding_size,
    "batch_size": batch_size,
    "input_length": input_length,
    "num_epochs": num_epochs,
})

# Remove every loose parameter variable (including `name`) so later cells
# cannot read them by accident instead of going through `_params`.
# NOTE: because of these deletions the parameter cell must be re-run before
# this cell can be executed a second time.
del name, data_dir, output_dir, num_words, embedding_size, batch_size, input_length, num_epochs

## Load Dataset

In [None]:
def load_dataset(path, encoding="utf-8"):
    """Read a ``label<TAB>text`` file and tokenize every text with Janome.

    Args:
        path: Path of a tab-separated file whose first column is the label and
            whose second column is the raw text. Further columns are ignored.
        encoding: Text encoding used to read the file.

    Returns:
        A ``(texts, labels)`` pair of equally long lists. ``texts[i]`` is the
        list of tokens of the i-th text, ``labels[i]`` its label string.

    Raises:
        ValueError: If a non-blank line has fewer than two columns.
    """
    wakati_tokenizer = JanomeTokenizer(wakati=True)
    texts, labels = [], []
    # The context manager closes the file even when tokenization fails.
    with open(path, encoding=encoding) as dataset_file:
        for line_number, line in enumerate(dataset_file, start=1):
            # Drop the line terminator so it is not passed on as part of the text.
            line = line.rstrip("\r\n")
            if not line.strip():
                continue  # tolerate blank lines, e.g. at the end of the file
            columns = line.split("\t")
            if len(columns) < 2:
                raise ValueError(
                    "{}:{}: expected 'label<TAB>text', got {!r}".format(path, line_number, line)
                )
            labels.append(columns[0])
            # list() materializes the tokens in case tokenize() returns a lazy iterable.
            texts.append(list(wakati_tokenizer.tokenize(columns[1])))
    return texts, labels

# Load the three splits from the configured data directory.
train_texts, train_labels = load_dataset(os.path.join(_params.data_dir, "train.tsv"))
valid_texts, valid_labels = load_dataset(os.path.join(_params.data_dir, "valid.tsv"))
test_texts, test_labels = load_dataset(os.path.join(_params.data_dir, "test.tsv"))

## Define preprocessor and tokenizer

In [None]:
# Build the word-level tokenizer from the training texts only.
# `num_words` limits the indices produced by `texts_to_sequences`;
# `word_index` itself still contains every word seen while fitting.
tokenizer = keras.preprocessing.text.Tokenizer(num_words=_params.num_words)
tokenizer.fit_on_texts(train_texts)
# Preview the first entries instead of dumping the whole vocabulary.
list(tokenizer.word_index.items())[:20]

# [TODO] process unknown words
# (no `oov_token` is configured, so words outside the vocabulary are dropped
# when texts are converted to sequences)

In [None]:
# Build the label vocabulary from the training labels.
# Each label is wrapped in a one-element list so it is counted as a single
# token even if it contains spaces; `lower=False` keeps its casing intact.
# No `num_words` limit is given, so every label is covered.
label_tokenizer = keras.preprocessing.text.Tokenizer(filters="", lower=False)
label_tokenizer.fit_on_texts([[label] for label in train_labels])
label_tokenizer.word_index

## Define model

In [None]:
def build_model(num_words, embedding_size, label_size, input_length):
    """Build the embedding -> flatten -> dense classifier.

    Args:
        num_words: Size of the embedding vocabulary (input dimension of the
            embedding layer).
        embedding_size: Dimension of each embedding vector.
        label_size: Number of units of the softmax output layer.
        input_length: Length of every input sequence.

    Returns:
        An uncompiled `keras.models.Sequential` model.
    """
    model = keras.models.Sequential()
    # Use the function arguments rather than the global `_params`, so the model
    # is fully determined by what the caller passes in.
    model.add(keras.layers.Embedding(num_words, embedding_size, input_length=input_length))
    model.add(keras.layers.Flatten())
    model.add(keras.layers.Dense(32, activation="relu"))
    model.add(keras.layers.Dense(label_size, activation="softmax"))
    return model

## Tune hyper parameters fitting on validation data

In [None]:
def set_seed(seed):
    """Seed the random number generators this notebook relies on.

    Args:
        seed: Integer seed applied to Python's `random` module, NumPy's global
            generator and TensorFlow's global generator.
    """
    # Local import: the notebook's import cell only binds `tensorflow.keras`.
    import tensorflow as tf

    random.seed(seed)
    np.random.seed(seed)
    # Keras draws initial weights from TensorFlow's generator, so seeding
    # `random` and NumPy alone does not make training repeatable.
    tf.random.set_seed(seed)

In [None]:
def train(params):
    """Train the classifier and return it with its best validation weights.

    Args:
        params: Attribute-style parameter object providing `num_words`,
            `embedding_size`, `input_length`, `num_epochs`, `batch_size`
            and `output_dir`.

    Returns:
        The trained Keras model. When at least one epoch ran, its weights are
        those of the epoch with the best validation accuracy, reloaded from
        the checkpoint written to `params.output_dir`.

    Raises:
        KeyError: If a training or validation label is missing from
            `label_tokenizer.word_index`.
    """
    # The label tokenizer numbers labels from 1 (index 0 is never assigned),
    # so the output layer needs one more unit than there are labels.
    label_size = len(label_tokenizer.word_index) + 1
    model = build_model(params.num_words, params.embedding_size, label_size, params.input_length)
    model.summary()

    # Prepare train/validation set
    x_train = pad_sequences(tokenizer.texts_to_sequences(train_texts), maxlen=params.input_length)
    y_train = np.array([label_tokenizer.word_index[l] for l in train_labels])
    x_valid = pad_sequences(tokenizer.texts_to_sequences(valid_texts), maxlen=params.input_length)
    y_valid = np.array([label_tokenizer.word_index[l] for l in valid_labels])

    # The checkpoint and the TensorBoard logs are written below this directory,
    # so make sure it exists before training starts.
    os.makedirs(params.output_dir, exist_ok=True)
    checkpoint_path = os.path.join(params.output_dir, "model.h5")

    # Train
    model.compile(
        optimizer="rmsprop",
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy", "sparse_top_k_categorical_accuracy"],
    )
    callbacks_list = [
        keras.callbacks.EarlyStopping(
            monitor="val_accuracy",
            patience=1,  # Stop once the monitored metric fails to improve for one epoch
        ),
        keras.callbacks.ModelCheckpoint(
            filepath=checkpoint_path,
            # Track the same validation metric as early stopping so that the
            # saved file really is the best model on the validation split.
            monitor="val_accuracy",
            save_best_only=True,
        ),
        keras.callbacks.TensorBoard(
            log_dir=params.output_dir,
            histogram_freq=1,
            embeddings_freq=1,
        )
    ]

    history = model.fit(
        x_train,
        y_train,
        epochs=params.num_epochs,
        batch_size=params.batch_size,
        callbacks=callbacks_list,
        validation_data=(x_valid, y_valid),
    )

    # After fitting, the in-memory model holds the weights of the LAST epoch,
    # which is the non-improving one whenever early stopping fired. Restore
    # the best weights from the checkpoint (skipped if no epoch was run).
    if history.epoch:
        model.load_weights(checkpoint_path)
    return model

_val_best_model = train(_params)

## Evaluate the best model

In [None]:
def evaluate(params, model):
    """Print the model summary and its evaluation result on the test split.

    Args:
        params: Attribute-style parameter object; only `input_length` is read.
        model: Compiled Keras model to evaluate.
    """
    # Encode the test split the same way the training data was encoded:
    # token ids padded to a fixed length, labels mapped to their indices.
    test_sequences = tokenizer.texts_to_sequences(test_texts)
    test_inputs = pad_sequences(test_sequences, maxlen=params.input_length)
    test_targets = np.array([label_tokenizer.word_index[label] for label in test_labels])

    model.summary()
    test_scores = model.evaluate(test_inputs, test_targets)
    print(test_scores)


evaluate(_params, _val_best_model)