### Clean and tokenize your text

In [1]:
import stanza
import json
import logging
import pandas as pd
import pytorch_lightning as pl
from copy import deepcopy
from model import *
from utils import *


# Sample abstract (catalysis domain) used as the raw input text for this demo;
# it is cleaned and tokenized further down in this cell.
test_txt = '''
In order to reveal the influences of metal-incorporation and regeneration of ZSM-5 zeolites on naphtha catalytic cracking, the fresh and regenerated Sr, Zr and La-loaded ZSM-5 zeolites have been prepared and evaluated using n-pentane catalytic cracking as a model reaction.
It was found that the metal-incorporated ZSM-5 zeolites promoted hydride transfer reactions, and the Zr-incorporation helped to promote and maintain the catalytic activity while reduced alkenes selectivity;
the regenerated ZSM-5 zeolites promoted C–H bond breaking that increased alkenes selectivity and n-pentane conversion but accelerated catalyst deactivation.
The regenerated metal-incorporated ZSM-5 zeolites combined the feature roles of metal-incorporation and regeneration in modulating reaction pathways, and seemed a promising way to balance the activity, stability and alkenes selectivity, facilitating the optimal production for light olefins.
Within the research scope, the regenerated Zr-loaded ZSM-5 zeolites reached an optimal production (0.97 g) for light olefins in n-pentane catalytic cracking at 550 °C with a weight hourly space velocity of 3.7 h−1 in 3 h, which was 24% higher than that of the parent HZSM-5 (0.78 g).
'''
# Build a tokenize-only stanza pipeline (English, 'craft' package) on CPU.
nlp = stanza.Pipeline('en', package='craft', processors='tokenize', use_gpu=False)

test_txt = cleanup_text(test_txt)

test_sents = []
idx = 0  # running token id, continues across sentence boundaries
for sent in nlp(test_txt).sentences:
    # This is inference only, so every token gets the placeholder label 'O'.
    sent_token = [
        {
            'text': token.text,
            'label': 'O',
            "id": idx + position,
            "start": token.start_char,
            "end": token.end_char,
        }
        for position, token in enumerate(sent.tokens)
    ]
    idx += len(sent_token)
    test_sents.append((sent.text, sent_token))

# Post-process the (sentence text, token list) pairs with the project helper.
test_sents = stanza_fix(test_sents)


2022-03-25 16:49:30 INFO: Loading these models for language: en (English):
| Processor | Package |
-----------------------
| tokenize  | craft   |

2022-03-25 16:49:30 INFO: Use device: cpu
2022-03-25 16:49:30 INFO: Loading: tokenize
2022-03-25 16:49:30 INFO: Done loading processors!


### Predict using a model checkpoint

In [2]:
import json
import logging
import pandas as pd
import pytorch_lightning as pl
from copy import deepcopy
from model import *
from utils import *

# Load the checkpoint trained on the first fold; the dataset arguments are
# passed as empty lists when restoring the model here.
ckpt_name = 'checkpoint/CV_0.ckpt'
bert_name = 'pretrained/scibert_domain_adaption'
model = BERTSpan.load_from_checkpoint(
    ckpt_name,
    model_name=bert_name,
    train_dataset=[],
    val_dataset=[],
    test_dataset=[],
)

def pred_model_dataset(model, sent):
    """Run ``model`` over ``sent`` and return the prediction dataset's output.

    A prediction dataset and dataloader are built with
    ``model.gen_pred_dataloader``. The model is then set up for the ``'test'``
    stage, moved with ``.cuda()``, switched to eval mode, and every batch is
    handed to ``model.pred_dataset_step`` with gradients disabled.

    Args:
        model: object exposing ``gen_pred_dataloader``, ``setup``, ``cuda``,
            ``eval``, ``batch_cuda`` and ``pred_dataset_step``.
        sent: input passed unchanged to ``model.gen_pred_dataloader``.

    Returns:
        Whatever ``pred_dataset.output_pred()`` returns.

    Note:
        ``.cuda()`` is called unconditionally, so this presumably requires a
        CUDA-capable device -- confirm before running on a CPU-only machine.
    """
    pred_dataset, pred_dataloader = model.gen_pred_dataloader(sent)

    model.setup('test')
    model = model.cuda()
    model.eval()
    with torch.no_grad():
        # Running count of items consumed so far; passed to each step so it
        # knows where the current batch starts within the dataset.
        offset = 0
        for batch in tqdm(pred_dataloader):
            # presumably moves the batch onto the GPU -- helper is defined on the model
            batch = model.batch_cuda(batch)
            model.pred_dataset_step(offset, batch, pred_dataset)
            offset += len(batch[0])
    return pred_dataset.output_pred()


# Predict on the tokenized sentences, then print each sentence followed by
# every labelled span recovered from its predicted tags.
output_sents = pred_model_dataset(model, test_sents)
for sent in output_sents:
    print(assemble_token_text(sent))
    sent_tag = [tok['pred'] for tok in sent]
    for first, last, label in get_bio_spans(sent_tag):
        # `last` is inclusive, hence the +1 when slicing the token list.
        stop = last + 1
        print(assemble_token_text(sent[first:stop]), label)
    print('\n\n')

Global seed set to 12345
Some weights of the model checkpoint at pretrained/scibert_domain_adaption were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at pretrained/scibert_domain_adaption and are newly initialized: ['b

In order to reveal the influences of metal-incorporation and regeneration of ZSM-5 zeolites on naphtha catalytic cracking, the fresh and regenerated Sr, Zr and La-loaded ZSM-5 zeolites have been prepared and evaluated using n-pentane catalytic cracking as a model reaction.
ZSM-5 zeolites Catalyst
naphtha Reactant
catalytic cracking Reaction
Sr, Zr and La-loaded ZSM-5 zeolites Catalyst
n-pentane Reactant
catalytic cracking Reaction



It was found that the metal-incorporated ZSM-5 zeolites promoted hydride transfer reactions, and the Zr-incorporation helped to promote and maintain the catalytic activity while reduced alkenes selectivity; the regenerated ZSM-5 zeolites promoted C-H bond breaking that increased alkenes selectivity and n-pentane conversion but accelerated catalyst deactivation.
metal-incorporated ZSM-5 zeolites Catalyst
hydride transfer reactions Reaction
alkenes Product
ZSM-5 zeolites Catalyst
alkenes Product
n-pentane Reactant



The regenerated metal-incorporated ZSM-5 


