# Test Script for Trained COMET Metrics

## Preparation

In [5]:
#Import packages
from comet import load_from_checkpoint
import time
from scipy.stats import pearsonr
import numpy as np
import pandas as pd

In [18]:
#Extract WMT data
language_pairs = ['de-en', 'cs-en', 'fi-en', 'ru-en']

def _read_daseg(kind, language_pair):
    """Return the whitespace-stripped lines of one DAseg newstest2015 file.

    kind is the file-name component ('source', 'mt-system', 'reference'
    or 'human'); language_pair is the file-name suffix such as 'de-en'.
    """
    with open(f'data/DAseg.newstest2015.{kind}.{language_pair}', encoding='utf-8') as f:
        return [line.strip() for line in f]

srcs = []
hyps = []
refs = []
gold = []
langs = []
for language_pair in language_pairs:
    pair_srcs = _read_daseg('source', language_pair)
    pair_hyps = _read_daseg('mt-system', language_pair)
    pair_refs = _read_daseg('reference', language_pair)
    #The human file holds one numeric score per line
    pair_gold = [float(score) for score in _read_daseg('human', language_pair)]
    #The four files are consumed in parallel (zip / pearsonr) further down,
    #so a differing line count would silently pair up the wrong segments
    n_segments = len(pair_gold)
    if not (len(pair_srcs) == len(pair_hyps) == len(pair_refs) == n_segments):
        raise ValueError(
            f'Misaligned DAseg files for {language_pair}: '
            f'{len(pair_srcs)} sources, {len(pair_hyps)} hypotheses, '
            f'{len(pair_refs)} references, {n_segments} human scores'
        )
    srcs += pair_srcs
    hyps += pair_hyps
    refs += pair_refs
    gold += pair_gold
    #One language-pair label per segment
    langs += [language_pair]*n_segments

wmt_data = {'srcs':srcs, 'hyps':hyps, 'refs':refs, 'gold':gold, 'langs':langs}

In [19]:
#Create test dataset
#NOTE(review): the source sentences in wmt_data['srcs'] are iterated but not
#passed on - every sample gets an empty "src". Confirm that this is intended.
test_data = [
    dict(src='', mt=mt_segment, ref=ref_segment)
    for _src_segment, mt_segment, ref_segment
    in zip(wmt_data['srcs'], wmt_data['hyps'], wmt_data['refs'])
]

In [21]:
#Paths to the last.ckpt file of each of the four training runs
minilm_checkpoint_path = '../comet_train/lightning_logs/minilm/checkpoints/last.ckpt'
minilm_adapter_checkpoint_path = '../comet_train/lightning_logs/minilm_adapter/checkpoints/last.ckpt'
xlmr_checkpoint_path = '../comet_train/lightning_logs/xlmr/checkpoints/last.ckpt'
xlmr_adapter_checkpoint_path = '../comet_train/lightning_logs/xlmr_adapter/checkpoints/last.ckpt'
checkpoint_paths = {
    'minilm': minilm_checkpoint_path,
    'minilm_adapter': minilm_adapter_checkpoint_path,
    'xlmr': xlmr_checkpoint_path,
    'xlmr_adapter': xlmr_adapter_checkpoint_path,
}
#Load every checkpoint once and keep the models in a dict keyed by model name
checkpoint_dict = {
    model_name: load_from_checkpoint(checkpoint_path)
    for model_name, checkpoint_path in checkpoint_paths.items()
}
#Individual handles, used by the configuration cells further down
minilm = checkpoint_dict['minilm']
minilm_adapter = checkpoint_dict['minilm_adapter']
xlmr = checkpoint_dict['xlmr']
xlmr_adapter = checkpoint_dict['xlmr_adapter']

Encoder model frozen.
Encoder model frozen.
Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Encoder model frozen.
Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['roberta.pooler.dense.bias', 'lm_head.layer_norm.we

## Evaluation

In [32]:
#Create lists to store the evaluation values
#(re-created on every run of this cell, so re-running does not duplicate entries)
checkpoint_name_list = []
duration_list = []
corr_list = []

#Evaluation of each of the four models under consideration
for checkpoint_name, checkpoint in checkpoint_dict.items():
    #Determine start time; perf_counter is a monotonic clock, so the measured
    #duration is not distorted if the system clock is adjusted during the run
    start_time = time.perf_counter()
    #Compute predictions
    seg_scores, sys_score = checkpoint.predict(test_data, gpus=0)
    #Compute test runtime/duration in seconds
    duration = time.perf_counter() - start_time
    #Compute Pearson correlation between predicted segment scores and human scores
    corr = pearsonr(seg_scores, wmt_data['gold']).statistic
    #Append evaluation values to lists
    checkpoint_name_list.append(checkpoint_name)
    duration_list.append(duration)
    corr_list.append(corr)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Predicting DataLoader 0: 100%|███████████████████████████████████████████████████████| 250/250 [03:11<00:00,  1.31it/s]
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Predicting DataLoader 0: 100%|███████████████████████████████████████████████████████| 250/250 [03:20<00:00,  1.24it/s]
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Predicting DataLoader 0: 100%|███████████████████████████████████████████████████████| 250/250 [10:32<00:00,  2.53s/it]
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Predicting DataLoader 0: 100%|██████████████████████

In [43]:
#Save the training runtime
#The runtimes were copied by hand, because the training was started via bash
#and did not offer a way to store the corresponding runtimes.
def _to_seconds(minutes, seconds):
    """Return the total in seconds: summed minutes times 60 plus summed seconds."""
    return 60*sum(minutes) + sum(seconds)

#Minute and second parts of the runtimes noted over the five epochs
dur_train_xlmr = _to_seconds(minutes=[13, 34, 35, 34, 34], seconds=[22, 46, 13, 49, 31])
dur_train_xlmr_adapter = _to_seconds(minutes=[13, 35, 35, 36, 37], seconds=[34, 30, 38, 15, 32])
dur_train_minilm = _to_seconds(minutes=[6, 14, 11, 10, 10], seconds=[24, 30, 47, 59, 57])
dur_train_minilm_adapter = _to_seconds(minutes=[5, 11, 11, 11, 11], seconds=[1, 21, 21, 18, 15])
#Same order as the keys of checkpoint_dict, with which this list is zipped later
dur_train_list = [
    dur_train_minilm,
    dur_train_minilm_adapter,
    dur_train_xlmr,
    dur_train_xlmr_adapter,
]

In [45]:
#Collect the evaluation values in a dataframe (one row per checkpoint)
result_rows = list(zip(dur_train_list, duration_list, corr_list))
result_columns = ['TrainDur','TestDur', 'Correlation']
df = pd.DataFrame(result_rows, index=checkpoint_name_list, columns=result_columns)
df

Unnamed: 0,TrainDur,TestDur,Correlation
minilm,3277,191.033967,0.69233
minilm_adapter,3016,200.881294,0.69233
xlmr,9161,632.64712,0.64985
xlmr_adapter,9509,717.769104,0.687421


## Models contained in checkpoint files

### minilm

In [53]:
#Show the configuration of the encoder model inside the minilm checkpoint
minilm.encoder.model.config

BertConfig {
  "_name_or_path": "microsoft/Multilingual-MiniLM-L12-H384",
  "adapters": {
    "adapters": {},
    "config_map": {},
    "fusion_config_map": {},
    "fusions": {}
  },
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "tokenizer_class": "XLMRobertaTokenizer",
  "transformers_version": "4.12.5",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 250037
}

In [54]:
#Show the configuration of the encoder model inside the minilm_adapter checkpoint
minilm_adapter.encoder.model.config

BertConfig {
  "_name_or_path": "microsoft/Multilingual-MiniLM-L12-H384",
  "adapters": {
    "adapters": {},
    "config_map": {},
    "fusion_config_map": {},
    "fusions": {}
  },
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "tokenizer_class": "XLMRobertaTokenizer",
  "transformers_version": "4.12.5",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 250037
}

### xlmr

In [55]:
#Show the configuration of the encoder model inside the xlmr checkpoint
xlmr.encoder.model.config

XLMRobertaConfig {
  "_name_or_path": "xlm-roberta-base",
  "adapters": {
    "adapters": {},
    "config_map": {},
    "fusion_config_map": {},
    "fusions": {}
  },
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.12.5",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250002
}

In [56]:
#Show the configuration of the encoder model inside the xlmr_adapter checkpoint
xlmr_adapter.encoder.model.config

XLMRobertaConfig {
  "_name_or_path": "xlm-roberta-base",
  "adapters": {
    "adapters": {
      "en": "16eaa0b5fae9ed68"
    },
    "config_map": {
      "16eaa0b5fae9ed68": {
        "adapter_residual_before_ln": false,
        "cross_adapter": false,
        "inv_adapter": "nice",
        "inv_adapter_reduction_factor": 2,
        "leave_out": [],
        "ln_after": false,
        "ln_before": false,
        "mh_adapter": false,
        "non_linearity": "relu",
        "original_ln_after": true,
        "original_ln_before": true,
        "output_adapter": true,
        "reduction_factor": 2,
        "residual_before_ln": true
      }
    },
    "fusion_config_map": {},
    "fusions": {}
  },
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 

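The encoder configuration of the MiniLM model trained with adapters lists no adapter (its `adapters` entry is empty), whereas the configuration of the XLM-R model trained with adapters does list one (`en`). Consequently, in the case of MiniLM, the adapter was not included in the COMET metric. This is consistent with the evaluation table above, where `minilm` and `minilm_adapter` reach exactly the same correlation (0.69233).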