<a href="https://colab.research.google.com/github/m3yrin/aligned-cross-entropy/blob/master/demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Demo code of m3yrin/aligned-cross-entropy
author: @m3yrin



In [None]:
# Show the GPU(s) visible to this runtime.
!nvidia-smi

In [None]:
# Fetch the repository and make it the working directory;
# the paths used in later cells are relative to it.
!git clone https://github.com/m3yrin/aligned-cross-entropy.git
%cd aligned-cross-entropy
!ls

## AXE

In [None]:
# Print the source of axe/axe.py.
!cat axe/axe.py

## Demo
This notebook is example code for testing AXE on the
latent alignment model. (The CMLM model is still under construction.)  
https://arxiv.org/abs/2004.07437
* The base implementation is https://github.com/m3yrin/nar-latent-alignment, built on AllenNLP 0.9.0.

In [None]:
# Install AllenNLP pinned to 0.9.0, the version the base implementation is built on.
!pip install allennlp==0.9.0

The dataset is an En-Ja parallel corpus provided by @odashi.  
See details at https://github.com/odashi/small_parallel_enja

In [None]:
# Clone the dataset into demos/datasets, list its files,
# then return to the repository root.
%cd demos/datasets
!git clone https://github.com/odashi/small_parallel_enja.git
!ls small_parallel_enja
%cd ../../

### Config

In [None]:
# check common config.
!cat demos/configs/common.jsonnet

In [None]:
%%writefile demos/configs/common.jsonnet
# Settings shared by the per-model configs (axe.jsonnet imports this file
# and reads every field below).

# GPU id handed to the trainer. If you don't use cuda, set cuda_device = -1.
local cuda_device = 0;

# Reference setup quoted from https://arxiv.org/pdf/2004.07437.pdf:
#   "Our models consist of 12 self-attention
#    layers, with 512 hidden size, 2048 filter size, and
#    8 attention heads per layer. We use 0.1 dropout
#    for regularization."
# NOTE(review): embedding_dim below is 128, not the 512 hidden size quoted
# above -- presumably reduced to keep the demo small; confirm.

{
    # Translation direction, forwarded to the dataset reader.
    "direction" : "ja-en",
    "train_data_path": "demos/datasets/small_parallel_enja/train",
    "validation_data_path": "demos/datasets/small_parallel_enja/dev",

    # Used in axe.jsonnet both as the source embedding size and as the
    # network's input_dim.
    "embedding_dim" : 128,
    # Used in axe.jsonnet as the network's hidden_dim.
    "feedforward_hidden_dim" : 2048,
    "num_layers" : 12,

    # Batch size of the data iterator.
    "batch_size" : 256,
    # Vocabulary min_count, applied to both source and target tokens.
    "min_count" : 2,

    # Trainer settings.
    "num_epochs" : 150,
    "patience": 10,
    "cuda_device" : cuda_device,
}

In [None]:
# check model config.
!cat demos/configs/axe.jsonnet

In [None]:
%%writefile demos/configs/axe.jsonnet
# Training configuration for the latent alignment model with the AXE loss.
# Settings shared between experiments are read from common.jsonnet.
local shared = import 'common.jsonnet';

# Learning rate handed to the Adam optimizer.
local lr = 0.001;
# Extra token added to the target vocabulary.
local blank_token = "@@BLANK@@";

{
    "dataset_reader": {
        "type": "small_parallel_enja_reader",
        "direction": shared.direction,
        # The model doesn't use BOS/EOS for now.
        "add_start_end_tokens": false,
    },
    "train_data_path": shared.train_data_path,
    "validation_data_path": shared.validation_data_path,
    "model": {
        "type": "latent_alignment_ctc",
        "source_embedder": {
            "type": "basic",
            "token_embedders": {
                "tokens": {
                    "type": "embedding",
                    "embedding_dim": shared.embedding_dim,
                },
            },
        },
        "net": {
            "type": "bidirectional_language_model_transformer",
            "input_dim": shared.embedding_dim,
            "hidden_dim": shared.feedforward_hidden_dim,
            "num_layers": shared.num_layers,
        },
        "loss_type": "axe",
        "label_smoothing": 0.1,
    },
    "iterator": {
        "type": "bucket",
        "sorting_keys": [["source_tokens", "num_tokens"]],
        "batch_size": shared.batch_size,
    },
    "trainer": {
        "optimizer": {
            "type": "adam",
            "lr": lr,
        },
        "patience": shared.patience,
        "validation_metric": "+BLEU",
        "num_epochs": shared.num_epochs,
        "cuda_device": shared.cuda_device,
        "learning_rate_scheduler": {
            "type": "exponential",
            "gamma": 0.98,
        },
    },
    "vocabulary": {
        "min_count": {
            "source_tokens": shared.min_count,
            "target_tokens": shared.min_count,
        },
        "tokens_to_add": {
            "target_tokens": [blank_token],
        },
    },
}

### Training

In [None]:
# Train with the AXE config; the run is serialized to demos/tmp.
# -f overwrites demos/tmp if it already exists. --include-package demos imports
# the demos package -- presumably so its custom components get registered.
!allennlp train -f --include-package demos -s demos/tmp demos/configs/axe.jsonnet

### Evaluation

In [None]:
# Evaluate the trained archive (demos/tmp/model.tar.gz) on the test data;
# metrics are written to demos/tmp/output_test.json.
!allennlp evaluate --output-file demos/tmp/output_test.json --include-package demos demos/tmp/model.tar.gz demos/datasets/small_parallel_enja/test

### Prediction

In [None]:
# Convert test.ja into demos/datasets/test.ja.json, the input file passed to
# `allennlp predict` in the following cell.
!python demos/datasets/make_json.py -I demos/datasets/small_parallel_enja/test.ja -O demos/datasets/test.ja.json

In [None]:
# Run the predictor over test.ja.json and write the predictions to
# demos/tmp/output_pred.json (--silent suppresses printing them to stdout).
!allennlp predict --output-file demos/tmp/output_pred.json --include-package demos --predictor small_parallel_enja_predictor  demos/tmp/model.tar.gz demos/datasets/test.ja.json  --silent

In [None]:
# Show the predictions written to demos/tmp/output_pred.json.
!cat demos/tmp/output_pred.json

### For comparison
Use CTC as the loss function, as in https://arxiv.org/abs/2004.07437

In [None]:
# Show the CTC config.
!cat demos/configs/ctc.jsonnet

In [None]:
# %%writefile demos/configs/ctc.jsonnet

In [None]:
# Train with the CTC config; the run is serialized to demos/tmp_ctc
# (-f overwrites that directory if it already exists).
!allennlp train -f --include-package demos -s demos/tmp_ctc demos/configs/ctc.jsonnet

In [None]:
# Evaluate the CTC archive on the test data; metrics are written to
# demos/tmp_ctc/output_test.json.
# NOTE: --cuda-device 0 requires a GPU; drop the flag (or pass -1) on a CPU-only runtime.
!allennlp evaluate --output-file demos/tmp_ctc/output_test.json --cuda-device 0 --include-package demos demos/tmp_ctc/model.tar.gz demos/datasets/small_parallel_enja/test

In [None]:
# demos/datasets/test.ja.json was already generated in the "Prediction" section;
# uncomment the line below only if it needs to be regenerated.
#!python demos/datasets/make_json.py -I demos/datasets/small_parallel_enja/test.ja -O demos/datasets/test.ja.json

In [None]:
# Run the predictor with the CTC archive over test.ja.json and write the
# predictions to demos/tmp_ctc/output_pred.json.
# NOTE: --cuda-device 0 requires a GPU; drop the flag (or pass -1) on a CPU-only runtime.
!allennlp predict --output-file demos/tmp_ctc/output_pred.json --cuda-device 0 --include-package demos --predictor small_parallel_enja_predictor  demos/tmp_ctc/model.tar.gz demos/datasets/test.ja.json --silent

In [None]:
# Show the predictions written to demos/tmp_ctc/output_pred.json.
!cat demos/tmp_ctc/output_pred.json