### Load the ALL data set and split it into 5 cross-validation folds using the corresponding ids

In [None]:
import json
# Sentence records, one JSON object per line. A record's position in `sents`
# is the id that the split file refers to.
# The encoding is pinned to UTF-8 so the result does not depend on the
# machine's locale, and blank lines are skipped so a trailing empty line
# does not abort the load with a JSONDecodeError.
with open('data/data.jsonl', encoding='utf-8') as f:
    sents = [json.loads(line) for line in f if line.strip()]

# Each non-blank line of the split file describes one cross-validation fold as
# {"train_ids": [...], "test_ids": [...]}; the ids index into `sents`.
with open('data/split.jsonl', encoding='utf-8') as f:
    cv_data = []
    for line in f:
        if not line.strip():
            continue  # tolerate blank / trailing empty lines
        ids = json.loads(line)
        train_ids, test_ids = ids['train_ids'], ids['test_ids']
        train_data = [sents[idx] for idx in train_ids]
        test_data = [sents[idx] for idx in test_ids]
        # One (train, test) pair per fold, in file order.
        cv_data.append((train_data, test_data))

### Load the model checkpoint for each fold and test it on the corresponding validation set
- `model.test_report` stores the test result

In [1]:
import logging
from copy import deepcopy

import numpy as np
import pandas as pd
import pytorch_lightning as pl

from model import *
from utils import *

# INFO level is needed so the strict/soft match reports logged during testing
# show up in the cell output.
logging.basicConfig(level=logging.INFO)

# Evaluation-only trainer: no experiment logger, no checkpoint writing, one GPU.
trainer = pl.Trainer(logger=False, enable_checkpointing=False, gpus=1)

# The pretrained encoder directory is the same for every fold; only the
# fine-tuned checkpoint changes.
bert_name = 'pretrained/scibert_domain_adaption'

report = []
for i in range(5):
    ckpt_name = f'checkpoint/CV_{i}.ckpt'
    train_data, val_data = cv_data[i]
    # Only the held-out split is passed in (as the test set); the train and
    # validation datasets are left empty because nothing is trained here.
    model = BERTSpan.load_from_checkpoint(
        ckpt_name,
        model_name=bert_name,
        train_dataset=[],
        val_dataset=[],
        test_dataset=val_data,
    )
    trainer.test(model)
    # Keep an independent copy of this fold's results before `model` is
    # replaced on the next iteration.
    report.append(deepcopy(model.test_report))

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
Global seed set to 12345
Some weights of the model checkpoint at pretrained/scibert_domain_adaption were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were n

HBox(children=(HTML(value='Testing'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max=…

INFO:root:#### Strict Match Report ####
INFO:root:
                  precision    recall  f1-score   support

        Catalyst     0.8589    0.8804    0.8695      1279
Characterization     0.7662    0.8324    0.7979       185
         Product     0.8949    0.9183    0.9065      1224
        Reactant     0.9248    0.9132    0.9189      1198
        Reaction     0.9338    0.9417    0.9377       943
       Treatment     0.7995    0.8505    0.8242       408

       micro avg     0.8870    0.9038    0.8953      5237
       macro avg     0.8630    0.8894    0.8758      5237
    weighted avg     0.8880    0.9038    0.8957      5237


INFO:root:#### Soft Match Report ####
INFO:root:
                    precision   recall f1-score  support
Catalyst               0.9103   0.9249   0.9176     1279
Characterization       0.8275   0.8891   0.8572      185
Product                0.9224   0.9418   0.9320     1224
Reactant               0.9525   0.9398   0.9461     1198
Reaction               0.9507  

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'micro_f1': 0.8952993750572205}
--------------------------------------------------------------------------------



Global seed set to 12345
Some weights of the model checkpoint at pretrained/scibert_domain_adaption were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at pretrained/scibert_domain_adaption and are newly initialized: ['b

HBox(children=(HTML(value='Testing'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max=…

INFO:root:#### Strict Match Report ####
INFO:root:
                  precision    recall  f1-score   support

        Catalyst     0.8537    0.8774    0.8654      1264
Characterization     0.8140    0.8750    0.8434       200
         Product     0.8729    0.9094    0.8908      1148
        Reactant     0.8915    0.8998    0.8956      1068
        Reaction     0.9112    0.9247    0.9179       877
       Treatment     0.8017    0.8942    0.8455       416

       micro avg     0.8697    0.8993    0.8842      4973
       macro avg     0.8575    0.8968    0.8764      4973
    weighted avg     0.8705    0.8993    0.8845      4973


INFO:root:#### Soft Match Report ####
INFO:root:
                    precision   recall f1-score  support
Catalyst               0.9064   0.9212   0.9137     1264
Characterization       0.8438   0.9107   0.8760      200
Product                0.9137   0.9500   0.9315     1148
Reactant               0.9317   0.9367   0.9342     1068
Reaction               0.9276  

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'micro_f1': 0.8842313289642334}
--------------------------------------------------------------------------------



Global seed set to 12345
Some weights of the model checkpoint at pretrained/scibert_domain_adaption were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at pretrained/scibert_domain_adaption and are newly initialized: ['b

HBox(children=(HTML(value='Testing'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max=…

INFO:root:#### Strict Match Report ####
INFO:root:
                  precision    recall  f1-score   support

        Catalyst     0.8569    0.8645    0.8607      1240
Characterization     0.8112    0.8670    0.8381       218
         Product     0.8942    0.9237    0.9087      1180
        Reactant     0.9093    0.9055    0.9074      1185
        Reaction     0.9200    0.9329    0.9264       850
       Treatment     0.8195    0.8860    0.8514       456

       micro avg     0.8822    0.9010    0.8915      5129
       macro avg     0.8685    0.8966    0.8821      5129
    weighted avg     0.8828    0.9010    0.8916      5129


INFO:root:#### Soft Match Report ####
INFO:root:
                    precision   recall f1-score  support
Catalyst               0.9133   0.9047   0.9090     1240
Characterization       0.8511   0.9128   0.8809      218
Product                0.9227   0.9441   0.9333     1180
Reactant               0.9395   0.9390   0.9393     1185
Reaction               0.9358  

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'micro_f1': 0.8914825916290283}
--------------------------------------------------------------------------------



Global seed set to 12345
Some weights of the model checkpoint at pretrained/scibert_domain_adaption were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at pretrained/scibert_domain_adaption and are newly initialized: ['b

HBox(children=(HTML(value='Testing'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max=…

INFO:root:#### Strict Match Report ####
INFO:root:
                  precision    recall  f1-score   support

        Catalyst     0.8602    0.8478    0.8539      1393
Characterization     0.7796    0.8056    0.7923       180
         Product     0.8929    0.9023    0.8975      1136
        Reactant     0.8915    0.9019    0.8967      1121
        Reaction     0.9101    0.9189    0.9145       925
       Treatment     0.8076    0.8662    0.8359       441

       micro avg     0.8754    0.8841    0.8797      5196
       macro avg     0.8570    0.8738    0.8651      5196
    weighted avg     0.8757    0.8841    0.8798      5196


INFO:root:#### Soft Match Report ####
INFO:root:
                    precision   recall f1-score  support
Catalyst               0.9194   0.9019   0.9106     1393
Characterization       0.8116   0.8579   0.8341      180
Product                0.9309   0.9320   0.9315     1136
Reactant               0.9285   0.9442   0.9362     1121
Reaction               0.9281  

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'micro_f1': 0.8797395825386047}
--------------------------------------------------------------------------------



Global seed set to 12345
Some weights of the model checkpoint at pretrained/scibert_domain_adaption were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at pretrained/scibert_domain_adaption and are newly initialized: ['b

HBox(children=(HTML(value='Testing'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max=…

INFO:root:#### Strict Match Report ####
INFO:root:
                  precision    recall  f1-score   support

        Catalyst     0.8618    0.8677    0.8648      1179
Characterization     0.8678    0.8728    0.8703       173
         Product     0.8763    0.9041    0.8900      1168
        Reactant     0.9091    0.9059    0.9075      1126
        Reaction     0.8991    0.9238    0.9113       945
       Treatment     0.7945    0.8810    0.8356       496

       micro avg     0.8754    0.8964    0.8858      5087
       macro avg     0.8681    0.8926    0.8799      5087
    weighted avg     0.8762    0.8964    0.8860      5087


INFO:root:#### Soft Match Report ####
INFO:root:
                    precision   recall f1-score  support
Catalyst               0.9150   0.9194   0.9172     1179
Characterization       0.9090   0.8996   0.9043      173
Product                0.9072   0.9324   0.9196     1168
Reactant               0.9396   0.9338   0.9367     1126
Reaction               0.9174  

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'micro_f1': 0.8857808709144592}
--------------------------------------------------------------------------------



### Group the 5-fold results together
- The `mean` column is what we report in the paper

In [12]:
def report_summary(report_list):
    """Tabulate per-fold classification metrics together with their mean.

    Parameters
    ----------
    report_list : list of dict
        One report per fold. Each report maps a label name (or
        ``'micro avg'`` / ``'macro avg'``) to a dict with the keys
        ``'precision'``, ``'recall'``, ``'f1-score'`` and ``'support'``.
        A label that is absent from a fold's report contributes 0 for
        every metric of that fold (and therefore lowers the mean).

    Returns
    -------
    pandas.DataFrame
        One row per ``(label, metric)`` pair, one ``fold k`` column per
        report plus a final ``mean`` column. The same table is printed
        without line wrapping, so existing callers that only relied on
        the printed output keep working.
    """
    metric_name = ['precision', 'recall', 'f1-score', 'support']
    labels = ['Catalyst', 'Characterization', 'Product', 'Reactant', 'Reaction', 'Treatment'] + ['micro avg', 'macro avg']
    outer_index = []
    inner_index = []
    val_mat = []
    for label in labels:
        for name in metric_name:
            # Per-fold values for this (label, metric); 0 when the fold's
            # report has no entry for the label.
            val_list = [
                val_res[label][name] if label in val_res else 0
                for val_res in report_list
            ]
            # The mean over folds becomes the last column of the row.
            val_list.append(np.mean(val_list))
            val_mat.append(val_list)
            outer_index.append(label)
            inner_index.append(name)
    df_index = pd.MultiIndex.from_arrays([outer_index, inner_index], names=['label', 'metric'])
    fold_columns = [f'fold {i + 1}' for i in range(len(report_list))]
    df = pd.DataFrame(val_mat, columns=fold_columns + ['mean'], index=df_index)
    # Print the whole table on one line per row instead of wrapping columns.
    with pd.option_context('expand_frame_repr', False):
        print(df)
    # Hand the table back so it can be inspected, exported or post-processed.
    return df

# Summarize the 5-fold results. For each fold, the last entry of its report is
# indexed by position: slot 0 feeds the strict-match table, slot 1 the
# soft-match table.
banner = '#'*20
for slot, title in enumerate((' Strict Match Report ', ' Soft Match Report ')):
    print(banner + title + banner)
    report_summary([r[-1][slot] for r in report])
    print('\n\n')

#################### Strict Match Report ####################
                                 fold 1       fold 2       fold 3       fold 4       fold 5         mean
label            metric                                                                                 
Catalyst         precision     0.858886     0.853734     0.856914     0.860160     0.861837     0.858306
                 recall        0.880375     0.877373     0.864516     0.847810     0.867684     0.867552
                 f1-score      0.869498     0.865392     0.860699     0.853941     0.864751     0.862856
                 support    1279.000000  1264.000000  1240.000000  1393.000000  1179.000000  1271.000000
Characterization precision     0.766169     0.813953     0.811159     0.779570     0.867816     0.807733
                 recall        0.832432     0.875000     0.866972     0.805556     0.872832     0.850559
                 f1-score      0.797927     0.843373     0.838137     0.792350     0.870317     0.