# **ACL22 MLC Paper - Pyramid Baseline**

In [None]:
# Set up the working environment on Google Drive by mounting it at /content/gdrive.
# This cell is only needed on Google Colab; when working locally, skip it, but make sure a GPU is in use.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# We will clone the repositories in the "MyDrive" folder.
%cd gdrive/MyDrive/

In [None]:
# We create a folder where we will clone each repository. If the folder is already created, then skip this step.
!mkdir pyramid-baseline

In [None]:
# Change into the folder where the baselines are stored.
%cd pyramid-baseline/

In [None]:
# Clone the project from the official repository. If you have already cloned it, skip this step.
!git clone https://github.com/LorrinWWW/Pyramid.git

In [None]:
# Enter the cloned repository; the cells below use paths relative to it
# (runs/gen_bert_flair_emb.py, train_ner.py).
%cd Pyramid/

In [None]:
# We install the repository dependencies.
%%capture
!pip install gpustat
!pip install transformers
!pip install sentencepiece
!pip install allennlp
!pip install flair

In [None]:
# Generate the contextualized embeddings file.
# Edit runs/gen_bert_flair_emb.py as needed to produce each combination of contextualized embeddings
# (flair, bert, bert+flair), and make sure the required JSON files are placed in the folder the script expects.
# See the README for the model that was used to generate the contextualized embeddings.
!python runs/gen_bert_flair_emb.py 

In [None]:
# Optional: run this cell only if a storage-related error appears in the previous steps.
!pip install --upgrade google-cloud-storage

In [None]:
# Train the model. The default settings below are for the wl dataset with flair embeddings.
# Troubleshooting:
# - "IndexError: index 20000 is out of bounds for dimension 0 with size 20000":
#   after line 121 (of the file reported in the traceback; the file is not named here) add: if idx+1==20000: break
# - "RuntimeError: 'lengths' argument should be a 1D CPU int64 tensor, but got 1D cuda:0 Long tensor":
#   replace line 58 of seqs.py with: packed_words = pack_padded_sequence(sorted_seq_tensor, sorted_seq_len.cpu(), True)
!python train_ner.py \
        --batch_size 32 \
        --evaluate_interval 500 \
        --dataset wl \
        --pretrained_wv cwlce.txt \
        --max_epoches 500 \
        --model_class PyramidNestNER  \
        --model_write_ckpt output_model \
        --optimizer sgd \
        --lr 0.01 \
        --tag_form iob2  \
        --cased 0 \
        --token_emb_dim 300 \
        --char_emb_dim 30 \
        --char_encoder lstm \
        --lm_emb_dim 4096 \
        --lm_emb_path wl_flair.emb.pkl \
        --tag_vocab_size 100 \
        --vocab_size 20000 \
        --dropout 0.4 \
        --max_depth 16

In [None]:
# Run train_ner.py with `--evaluate 1`, reading a checkpoint (`--model_read_ckpt`) and
# naming an output file (`--output_filename`) — presumably evaluation-only mode that writes predictions; confirm in train_ner.py.
# NOTE(review): `output_model_path` and `wl_predictions_file` look like placeholders; the training cell
# writes its checkpoint with `--model_write_ckpt output_model` — set the path to the checkpoint actually produced.
# NOTE(review): several flags differ from the training cell (batch_size 64 vs 32, char_emb_dim 60 vs 30,
# lm_emb_dim 0 vs 4096 with no --lm_emb_path, no --cased). Presumably the architecture flags must match
# the ones used to train the checkpoint being loaded — verify before running.
!python train_ner.py \
        --batch_size 64 \
        --evaluate_interval 500 \
        --dataset wl \
        --pretrained_wv cwlce.txt  \
        --max_epoches 500 \
        --model_class PyramidNestNER  \
        --model_read_ckpt output_model_path \
        --optimizer sgd \
        --lr 0.01 \
        --tag_form iob2  \
        --token_emb_dim 300 \
        --char_emb_dim 60 \
        --char_encoder lstm \
        --lm_emb_dim 0 \
        --tag_vocab_size 100 \
        --vocab_size 20000 \
        --dropout 0.40 \
        --max_depth 16 \
        --output_filename wl_predictions_file \
        --evaluate 1