In [1]:
# For Google Cloud
from google.cloud import storage
import google.auth

client = storage.Client()
bucket = client.get_bucket('gd-gcp-healthcare-search')
dataset_blob = bucket.blob('qa_ncbi_dataset.tsv')

dataset_file_name = 'data/qa_ncbi_dataset.tsv'
dataset_blob.download_to_filename(dataset_file_name)

In [2]:
import sys
import numpy as np

import math
import scipy
from scipy.sparse import hstack
import pandas as pd
from tqdm import tqdm
import re

from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import GroupShuffleSplit, train_test_split
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
import logging

In [3]:
import pecos
from pecos.utils.featurization.text.preprocess import Preprocessor
from pecos.xmc.xtransformer.model import XTransformer, XLinearModel
from pecos.xmc.xtransformer.module import MLProblemWithText
from pecos.utils import smat_util
from pecos.xmc import Indexer, LabelEmbeddingFactory
from pecos.xmc.xtransformer.matcher import TransformerMatcher

from pecos.utils import logging_util
logging_util.setup_logging_config(level=2) # LOGGING CONFIG

In [4]:
from sentence_transformers import models, losses, SentenceTransformer

### Bert vectorizer class

In [5]:
class BertVectorizer:

    def __init__(self, model_name, device=None, batch_size=4):
        # Load pre-trained model tokenizer (vocabulary-multilingual)
        self.model = SentenceTransformer(model_name, 
                                         device=device)
        
        self.device = device
        self.batch_size = batch_size

    @property
    def max_seq_length(self):
        return self.model.max_seq_length

    def fit(self):
        pass

    def transform(self, sentences, return_sparse=False):
        sentences_vectors = self.model.encode(sentences,
                                              batch_size=self.batch_size, 
                                              normalize_embeddings=True,
                                              show_progress_bar=True,
                                              device=self.device)
        
        if return_sparse:
            sentences_vectors = scipy.sparse.csr_matrix(sentences_vectors, 
                                                        dtype=np.float32)

        return sentences_vectors

### Load dataset

In [6]:
dataset_filename = None
try:
    credentials, project = google.auth.default()
    dataset_filename = dataset_file_name # GCP
except:
    if 'google.colab' in sys.modules:
        dataset_filename = '/content/drive/MyDrive/qa_ncbi_dataset.tsv' # google colab
    else:
        dataset_filename = 'data/qa_ncbi_dataset.tsv' # local run
        
df = pd.read_csv(dataset_filename, sep='\t')
df.shape

(29319, 15)

In [7]:
df.question_id.nunique()

4040

### Split into dataframes

In [8]:
# Paper DF # Z
papers_df = df[['snippet_url', 'title', 'abstract']] \
    .drop_duplicates() \
    .reset_index(drop=True).reset_index() \
    .rename(columns={'index': 'paper_id'})
papers_df['title_abstract'] = papers_df['title'] + '. ' + papers_df['abstract']

df = df.merge(papers_df[['paper_id', 'snippet_url']])
papers_df = papers_df.set_index('paper_id')

# # Create paper id from url
# df['paper_id'] = df.snippet_id.str.split('_')
# df['paper_id'] = df['paper_id'].apply(lambda a: a[0]).astype(int)
papers_df.sample()

Unnamed: 0_level_0,snippet_url,title,abstract,title_abstract
paper_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
25006,http://www.ncbi.nlm.nih.gov/pubmed/22553235,"Detection of NDM-1, VIM-1, KPC, OXA-48, and OX...",Matrix-assisted laser desorption ionization-ti...,"Detection of NDM-1, VIM-1, KPC, OXA-48, and OX..."


In [9]:
# question-paper matrix # Y
question_paper_df = pd.crosstab(df['question_id'], df['paper_id']).astype(int)
question_paper_df = question_paper_df[sorted(question_paper_df.columns)]
question_paper_df.sample()

paper_id,0,1,2,3,4,5,6,7,8,9,...,27794,27795,27796,27797,27798,27799,27800,27801,27802,27803
question_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5e42d1a748dab47f26000010,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Question df # X
questions_df = df[['question_id', 'question_text']] \
    .drop_duplicates('question_id') \
    .set_index('question_id').loc[question_paper_df.index]
questions_df.sample()

Unnamed: 0_level_0,question_text
question_id,Unnamed: 1_level_1
54e0c3e71388e8454a000013,Which enzyme is inhibited by Orteronel?


### Train-test splitting

In [11]:
train_question_ids, valid_question_ids = train_test_split(questions_df.index, 
                                                          test_size=0.1,
                                                          random_state=5)
print(f'Train: {len(train_question_ids)}')
print(f'Valid: {len(valid_question_ids)}')

Train: 3636
Valid: 404


In [12]:
# Questions splitting
train_questions_df = questions_df.loc[train_question_ids]
valid_questions_df = questions_df.loc[valid_question_ids]

# Question-Paper matrix splitting
train_question_paper_df = question_paper_df.loc[train_question_ids]
train_question_paper_csr = scipy.sparse.csr_matrix(train_question_paper_df.values, dtype=np.float32)
valid_question_paper_df = question_paper_df.loc[valid_question_ids]
valid_question_paper_csr = scipy.sparse.csr_matrix(valid_question_paper_df.values, dtype=np.float32)

### Bert vectorization

In [13]:
paper_column = 'title' # title_abstract / title / abstract

# bert_model_name = 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext'
# bert_model_name = 'dmis-lab/biobert-v1.1'
# bert_model_name = 'sentence-transformers/roberta-large-nli-stsb-mean-tokens' # Don't fit into gpu memory
# bert_model_name = 'sentence-transformers/roberta-base-nli-mean-tokens' # Bad quality

bert_model_name = 'sentence-transformers/bert-base-nli-cls-token'
# bert_model_name = 'sentence-transformers/bert-base-nli-mean-tokens'

vectorizer = BertVectorizer(model_name=bert_model_name, 
                            device='cuda',
                            batch_size=512)
print(f'Max sentence size: {vectorizer.max_seq_length}')

02/21/2022 14:51:05 - INFO - sentence_transformers.SentenceTransformer - Load pretrained SentenceTransformer: sentence-transformers/bert-base-nli-cls-token


Max sentence size: 128


In [14]:
train_questions_csr = vectorizer.transform(train_questions_df.question_text.values,
                                               return_sparse=True)
valid_questions_csr = vectorizer.transform(valid_questions_df.question_text.values,
                                               return_sparse=True)
papers_csr = vectorizer.transform(papers_df[paper_column].values,
                                      return_sparse=True)

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/55 [00:00<?, ?it/s]

### Hyperparameter tuning with Bayesian Optimization

In [15]:
# Creating cluster chain during training
cluster_chain = None

In [16]:
# Create train and valid ML problems
train_prob = MLProblemWithText(
    X_text=train_questions_df.question_text.values.tolist(), 
    Y=train_question_paper_csr, 
    X_feat=train_questions_csr
)

valid_prob = MLProblemWithText(
    X_text=valid_questions_df.question_text.values.tolist(), 
    Y=valid_question_paper_csr, 
    X_feat=valid_questions_csr
)

In [17]:
from bayes_opt import BayesianOptimization

In [18]:
def black_box_function(learning_rate, 
                       hidden_dropout_prob, 
                       weight_decay, 
                       adam_epsilon):
    
    train_params_dict = {
        'do_fine_tune': True,
        'fix_clustering': False,
        'matcher_params_chain': {
            'model_shortcut': bert_model_name,
            'loss_function': 'squared-hinge',
            'batch_size': 16,
            'max_no_improve_cnt': 50,
            'num_train_epochs': TRAIN_EPOCHS,
            'use_gpu': True,
            'logging_steps': 200,
            'save_steps': 200,
            'bootstrap_method': 'linear',
            'gradient_accumulation_steps': 1,
            'loss_function': 'squared-hinge',

            'learning_rate': learning_rate, 
            'hidden_dropout_prob': hidden_dropout_prob,
            'weight_decay': weight_decay,
            'adam_epsilon': adam_epsilon,
        },
    }
    
    predict_params_dict = {
        'only_topk': 10,
        'truncate_length': 128
    }
    
    train_params = XTransformer.TrainParams.from_dict(train_params_dict, recursive=True)
    pred_params = XTransformer.PredParams.from_dict(predict_params_dict, recursive=True)
    
    xtf = XTransformer.train(
        prob=train_prob,
        clustering=cluster_chain,
        val_prob=valid_prob,
        train_params=train_params,
        pred_params=pred_params,
        y_feat=papers_csr
    )
    
    predicted_question_paper_csr = xtf.predict(X_text=valid_questions_df.question_text.values.tolist(), 
                                               X_feat=valid_questions_csr,
                                               beam_size=10,
                                               only_topk=10)
    
    metric = smat_util.Metrics.generate(valid_question_paper_csr, 
                                    predicted_question_paper_csr, 
                                    topk=10)
    print(metric)
    
    prec = metric.prec[0] # Take first precision
    
    return prec

In [19]:
TRAIN_EPOCHS = 3

pbounds = {
    'learning_rate': (1e-8, 1e-2),
    'hidden_dropout_prob': (0, 0.65),
    'weight_decay': (1e-3, 0.2),
    'adam_epsilon': (1e-8, 1e-3)
}

optimizer = BayesianOptimization(f=black_box_function,
                                 pbounds=pbounds, 
                                 verbose=2,
                                 random_state=5)

In [20]:
optimizer.maximize(init_points = 5, n_iter = 20)
print("Best result: {}; f(x) = {}.".format(optimizer.max["params"], optimizer.max["target"]))

|   iter    |  target   | adam_e... | hidden... | learni... | weight... |
-------------------------------------------------------------------------


02/21/2022 14:51:40 - INFO - pecos.xmc.xtransformer.model - Hierarchical label tree: [2, 32, 512, 27804]
02/21/2022 14:51:40 - INFO - pecos.xmc.xtransformer.model - Fine-tune Transformers with nr_labels=[2, 32, 512, 27804]
02/21/2022 14:51:40 - INFO - pecos.xmc.xtransformer.model - Fine-tuning XR-Transformer with tfn at level 0, nr_labels=2, avr_M_nnz=2
02/21/2022 14:51:44 - INFO - pecos.xmc.xtransformer.matcher - Downloaded sentence-transformers/bert-base-nli-cls-token model from s3.
02/21/2022 14:51:44 - INFO - pecos.xmc.xtransformer.matcher - ***** Encoding data len=3636 truncation=128*****
02/21/2022 14:51:44 - INFO - pecos.xmc.xtransformer.matcher - ***** Finished with time cost=0.3824124336242676 *****
02/21/2022 14:51:44 - INFO - pecos.xmc.xtransformer.matcher - trn tensors saved to /tmp/tmpife9xawj/X_trn.pt
02/21/2022 14:51:44 - INFO - pecos.xmc.xtransformer.matcher - ***** Encoding data len=404 truncation=128*****
02/21/2022 14:51:44 - INFO - pecos.xmc.xtransformer.matcher - *

prec   = 1.73 1.73 1.73 1.61 1.58 1.49 1.31 1.21 1.10 1.01
recall = 0.73 1.00 1.40 1.53 1.71 1.82 1.85 1.88 1.90 1.91
| [0m 1       [0m | [0m 0.01733 [0m | [0m 0.000222[0m | [0m 0.566   [0m | [0m 0.002067[0m | [0m 0.1838  [0m |


02/21/2022 15:00:20 - INFO - pecos.xmc.xtransformer.model - Hierarchical label tree: [2, 32, 512, 27804]
02/21/2022 15:00:20 - INFO - pecos.xmc.xtransformer.model - Fine-tune Transformers with nr_labels=[2, 32, 512, 27804]
02/21/2022 15:00:20 - INFO - pecos.xmc.xtransformer.model - Fine-tuning XR-Transformer with tfn at level 0, nr_labels=2, avr_M_nnz=2
02/21/2022 15:00:24 - INFO - pecos.xmc.xtransformer.matcher - Downloaded sentence-transformers/bert-base-nli-cls-token model from s3.
02/21/2022 15:00:24 - INFO - pecos.xmc.xtransformer.matcher - ***** Encoding data len=3636 truncation=128*****
02/21/2022 15:00:24 - INFO - pecos.xmc.xtransformer.matcher - ***** Finished with time cost=0.4100492000579834 *****
02/21/2022 15:00:24 - INFO - pecos.xmc.xtransformer.matcher - trn tensors saved to /tmp/tmpiqdm2k78/X_trn.pt
02/21/2022 15:00:24 - INFO - pecos.xmc.xtransformer.matcher - ***** Encoding data len=404 truncation=128*****
02/21/2022 15:00:24 - INFO - pecos.xmc.xtransformer.matcher - *

prec   = 1.73 1.73 1.73 1.61 1.58 1.49 1.31 1.21 1.10 1.01
recall = 0.73 1.00 1.40 1.53 1.71 1.82 1.85 1.88 1.90 1.91
| [0m 2       [0m | [0m 0.01733 [0m | [0m 0.000488[0m | [0m 0.3976  [0m | [0m 0.007659[0m | [0m 0.1042  [0m |


02/21/2022 15:09:12 - INFO - pecos.xmc.xtransformer.model - Hierarchical label tree: [2, 32, 512, 27804]
02/21/2022 15:09:12 - INFO - pecos.xmc.xtransformer.model - Fine-tune Transformers with nr_labels=[2, 32, 512, 27804]
02/21/2022 15:09:12 - INFO - pecos.xmc.xtransformer.model - Fine-tuning XR-Transformer with tfn at level 0, nr_labels=2, avr_M_nnz=2
02/21/2022 15:09:15 - INFO - pecos.xmc.xtransformer.matcher - Downloaded sentence-transformers/bert-base-nli-cls-token model from s3.
02/21/2022 15:09:15 - INFO - pecos.xmc.xtransformer.matcher - ***** Encoding data len=3636 truncation=128*****
02/21/2022 15:09:15 - INFO - pecos.xmc.xtransformer.matcher - ***** Finished with time cost=0.2177426815032959 *****
02/21/2022 15:09:15 - INFO - pecos.xmc.xtransformer.matcher - trn tensors saved to /tmp/tmp9vboit34/X_trn.pt
02/21/2022 15:09:15 - INFO - pecos.xmc.xtransformer.matcher - ***** Encoding data len=404 truncation=128*****
02/21/2022 15:09:15 - INFO - pecos.xmc.xtransformer.matcher - *

prec   = 1.98 1.86 1.57 1.30 1.14 1.03 0.92 0.84 0.80 0.72
recall = 0.56 1.12 1.29 1.38 1.43 1.48 1.49 1.51 1.55 1.55
| [95m 3       [0m | [95m 0.0198  [0m | [95m 0.000296[0m | [95m 0.122   [0m | [95m 0.000807[0m | [95m 0.1479  [0m |


02/21/2022 15:18:10 - INFO - pecos.xmc.xtransformer.model - Hierarchical label tree: [2, 32, 512, 27804]
02/21/2022 15:18:10 - INFO - pecos.xmc.xtransformer.model - Fine-tune Transformers with nr_labels=[2, 32, 512, 27804]
02/21/2022 15:18:10 - INFO - pecos.xmc.xtransformer.model - Fine-tuning XR-Transformer with tfn at level 0, nr_labels=2, avr_M_nnz=2
02/21/2022 15:18:14 - INFO - pecos.xmc.xtransformer.matcher - Downloaded sentence-transformers/bert-base-nli-cls-token model from s3.
02/21/2022 15:18:14 - INFO - pecos.xmc.xtransformer.matcher - ***** Encoding data len=3636 truncation=128*****
02/21/2022 15:18:14 - INFO - pecos.xmc.xtransformer.matcher - ***** Finished with time cost=0.21616268157958984 *****
02/21/2022 15:18:14 - INFO - pecos.xmc.xtransformer.matcher - trn tensors saved to /tmp/tmpwdydplfz/X_trn.pt
02/21/2022 15:18:14 - INFO - pecos.xmc.xtransformer.matcher - ***** Encoding data len=404 truncation=128*****
02/21/2022 15:18:14 - INFO - pecos.xmc.xtransformer.matcher - 

prec   = 1.73 1.73 1.73 1.61 1.58 1.49 1.31 1.21 1.10 1.01
recall = 0.73 1.00 1.40 1.53 1.71 1.82 1.85 1.88 1.90 1.91
| [0m 4       [0m | [0m 0.01733 [0m | [0m 0.000441[0m | [0m 0.1029  [0m | [0m 0.008799[0m | [0m 0.05554 [0m |


02/21/2022 15:26:52 - INFO - pecos.xmc.xtransformer.model - Hierarchical label tree: [2, 32, 512, 27804]
02/21/2022 15:26:52 - INFO - pecos.xmc.xtransformer.model - Fine-tune Transformers with nr_labels=[2, 32, 512, 27804]
02/21/2022 15:26:52 - INFO - pecos.xmc.xtransformer.model - Fine-tuning XR-Transformer with tfn at level 0, nr_labels=2, avr_M_nnz=2
02/21/2022 15:26:56 - INFO - pecos.xmc.xtransformer.matcher - Downloaded sentence-transformers/bert-base-nli-cls-token model from s3.
02/21/2022 15:26:56 - INFO - pecos.xmc.xtransformer.matcher - ***** Encoding data len=3636 truncation=128*****
02/21/2022 15:26:56 - INFO - pecos.xmc.xtransformer.matcher - ***** Finished with time cost=0.21396183967590332 *****
02/21/2022 15:26:56 - INFO - pecos.xmc.xtransformer.matcher - trn tensors saved to /tmp/tmp0e_vjgwz/X_trn.pt
02/21/2022 15:26:56 - INFO - pecos.xmc.xtransformer.matcher - ***** Encoding data len=404 truncation=128*****
02/21/2022 15:26:56 - INFO - pecos.xmc.xtransformer.matcher - 

prec   = 1.98 1.36 1.40 1.24 1.29 1.28 1.17 1.02 0.91 0.82
recall = 0.51 0.61 0.99 1.07 1.26 1.41 1.45 1.45 1.45 1.45
| [0m 5       [0m | [0m 0.0198  [0m | [0m 0.000414[0m | [0m 0.1925  [0m | [0m 0.006288[0m | [0m 0.1164  [0m |


02/21/2022 15:35:32 - INFO - pecos.xmc.xtransformer.model - Hierarchical label tree: [2, 32, 512, 27804]
02/21/2022 15:35:32 - INFO - pecos.xmc.xtransformer.model - Fine-tune Transformers with nr_labels=[2, 32, 512, 27804]
02/21/2022 15:35:32 - INFO - pecos.xmc.xtransformer.model - Fine-tuning XR-Transformer with tfn at level 0, nr_labels=2, avr_M_nnz=2
02/21/2022 15:35:35 - INFO - pecos.xmc.xtransformer.matcher - Downloaded sentence-transformers/bert-base-nli-cls-token model from s3.
02/21/2022 15:35:35 - INFO - pecos.xmc.xtransformer.matcher - ***** Encoding data len=3636 truncation=128*****
02/21/2022 15:35:35 - INFO - pecos.xmc.xtransformer.matcher - ***** Finished with time cost=0.22510409355163574 *****
02/21/2022 15:35:36 - INFO - pecos.xmc.xtransformer.matcher - trn tensors saved to /tmp/tmpgc37ewjb/X_trn.pt
02/21/2022 15:35:36 - INFO - pecos.xmc.xtransformer.matcher - ***** Encoding data len=404 truncation=128*****
02/21/2022 15:35:36 - INFO - pecos.xmc.xtransformer.matcher - 

prec   = 1.98 1.73 1.73 1.61 1.63 1.53 1.34 1.24 1.13 1.04
recall = 0.52 0.77 1.18 1.31 1.74 1.85 1.87 1.91 1.93 1.94
| [0m 6       [0m | [0m 0.0198  [0m | [0m 0.000755[0m | [0m 0.5497  [0m | [0m 0.002198[0m | [0m 0.02263 [0m |


02/21/2022 15:44:13 - INFO - pecos.xmc.xtransformer.model - Hierarchical label tree: [2, 32, 512, 27804]
02/21/2022 15:44:13 - INFO - pecos.xmc.xtransformer.model - Fine-tune Transformers with nr_labels=[2, 32, 512, 27804]
02/21/2022 15:44:13 - INFO - pecos.xmc.xtransformer.model - Fine-tuning XR-Transformer with tfn at level 0, nr_labels=2, avr_M_nnz=2
02/21/2022 15:44:16 - INFO - pecos.xmc.xtransformer.matcher - Downloaded sentence-transformers/bert-base-nli-cls-token model from s3.
02/21/2022 15:44:16 - INFO - pecos.xmc.xtransformer.matcher - ***** Encoding data len=3636 truncation=128*****
02/21/2022 15:44:16 - INFO - pecos.xmc.xtransformer.matcher - ***** Finished with time cost=0.21133804321289062 *****
02/21/2022 15:44:17 - INFO - pecos.xmc.xtransformer.matcher - trn tensors saved to /tmp/tmpj77pqqbp/X_trn.pt
02/21/2022 15:44:17 - INFO - pecos.xmc.xtransformer.matcher - ***** Encoding data len=404 truncation=128*****
02/21/2022 15:44:17 - INFO - pecos.xmc.xtransformer.matcher - 

prec   = 1.24 1.11 1.24 1.11 1.04 0.99 0.99 0.96 0.88 0.79
recall = 0.35 0.46 0.83 0.92 1.00 1.09 1.18 1.25 1.26 1.26
| [0m 7       [0m | [0m 0.01238 [0m | [0m 0.000676[0m | [0m 0.02533 [0m | [0m 0.003135[0m | [0m 0.1984  [0m |


02/21/2022 15:52:55 - INFO - pecos.xmc.xtransformer.model - Hierarchical label tree: [2, 32, 512, 27804]
02/21/2022 15:52:55 - INFO - pecos.xmc.xtransformer.model - Fine-tune Transformers with nr_labels=[2, 32, 512, 27804]
02/21/2022 15:52:55 - INFO - pecos.xmc.xtransformer.model - Fine-tuning XR-Transformer with tfn at level 0, nr_labels=2, avr_M_nnz=2
02/21/2022 15:52:59 - INFO - pecos.xmc.xtransformer.matcher - Downloaded sentence-transformers/bert-base-nli-cls-token model from s3.
02/21/2022 15:52:59 - INFO - pecos.xmc.xtransformer.matcher - ***** Encoding data len=3636 truncation=128*****
02/21/2022 15:52:59 - INFO - pecos.xmc.xtransformer.matcher - ***** Finished with time cost=0.2237865924835205 *****
02/21/2022 15:52:59 - INFO - pecos.xmc.xtransformer.matcher - trn tensors saved to /tmp/tmp20zn2y8y/X_trn.pt
02/21/2022 15:52:59 - INFO - pecos.xmc.xtransformer.matcher - ***** Encoding data len=404 truncation=128*****
02/21/2022 15:52:59 - INFO - pecos.xmc.xtransformer.matcher - *

prec   = 1.49 1.61 1.65 1.55 1.58 1.49 1.31 1.21 1.10 1.01
recall = 0.48 0.76 1.15 1.29 1.71 1.82 1.85 1.88 1.90 1.91
| [0m 8       [0m | [0m 0.01485 [0m | [0m 0.000310[0m | [0m 0.5612  [0m | [0m 0.001516[0m | [0m 0.05049 [0m |


02/21/2022 16:01:40 - INFO - pecos.xmc.xtransformer.model - Hierarchical label tree: [2, 32, 512, 27804]
02/21/2022 16:01:40 - INFO - pecos.xmc.xtransformer.model - Fine-tune Transformers with nr_labels=[2, 32, 512, 27804]
02/21/2022 16:01:40 - INFO - pecos.xmc.xtransformer.model - Fine-tuning XR-Transformer with tfn at level 0, nr_labels=2, avr_M_nnz=2
02/21/2022 16:01:44 - INFO - pecos.xmc.xtransformer.matcher - Downloaded sentence-transformers/bert-base-nli-cls-token model from s3.
02/21/2022 16:01:44 - INFO - pecos.xmc.xtransformer.matcher - ***** Encoding data len=3636 truncation=128*****
02/21/2022 16:01:44 - INFO - pecos.xmc.xtransformer.matcher - ***** Finished with time cost=0.21218347549438477 *****
02/21/2022 16:01:44 - INFO - pecos.xmc.xtransformer.matcher - trn tensors saved to /tmp/tmp2mj3jc6u/X_trn.pt
02/21/2022 16:01:44 - INFO - pecos.xmc.xtransformer.matcher - ***** Encoding data len=404 truncation=128*****
02/21/2022 16:01:44 - INFO - pecos.xmc.xtransformer.matcher - 

prec   = 1.73 1.73 1.73 1.61 1.63 1.49 1.38 1.27 1.16 1.04
recall = 0.50 0.77 1.18 1.32 1.70 1.78 1.85 1.91 1.94 1.94
| [0m 9       [0m | [0m 0.01733 [0m | [0m 0.000652[0m | [0m 0.1705  [0m | [0m 0.003688[0m | [0m 0.1964  [0m |


02/21/2022 16:10:11 - INFO - pecos.xmc.xtransformer.model - Hierarchical label tree: [2, 32, 512, 27804]
02/21/2022 16:10:11 - INFO - pecos.xmc.xtransformer.model - Fine-tune Transformers with nr_labels=[2, 32, 512, 27804]
02/21/2022 16:10:11 - INFO - pecos.xmc.xtransformer.model - Fine-tuning XR-Transformer with tfn at level 0, nr_labels=2, avr_M_nnz=2
02/21/2022 16:10:14 - INFO - pecos.xmc.xtransformer.matcher - Downloaded sentence-transformers/bert-base-nli-cls-token model from s3.
02/21/2022 16:10:14 - INFO - pecos.xmc.xtransformer.matcher - ***** Encoding data len=3636 truncation=128*****
02/21/2022 16:10:15 - INFO - pecos.xmc.xtransformer.matcher - ***** Finished with time cost=0.3787214756011963 *****
02/21/2022 16:10:15 - INFO - pecos.xmc.xtransformer.matcher - trn tensors saved to /tmp/tmp7fa1ik9d/X_trn.pt
02/21/2022 16:10:15 - INFO - pecos.xmc.xtransformer.matcher - ***** Encoding data len=404 truncation=128*****
02/21/2022 16:10:15 - INFO - pecos.xmc.xtransformer.matcher - *

prec   = 1.73 1.73 1.65 1.55 1.58 1.49 1.31 1.21 1.10 1.01
recall = 0.50 0.77 1.15 1.29 1.71 1.82 1.85 1.88 1.90 1.91
| [0m 10      [0m | [0m 0.01733 [0m | [0m 0.000692[0m | [0m 0.524   [0m | [0m 0.006583[0m | [0m 0.06072 [0m |


02/21/2022 16:18:58 - INFO - pecos.xmc.xtransformer.model - Hierarchical label tree: [2, 32, 512, 27804]
02/21/2022 16:18:58 - INFO - pecos.xmc.xtransformer.model - Fine-tune Transformers with nr_labels=[2, 32, 512, 27804]
02/21/2022 16:18:58 - INFO - pecos.xmc.xtransformer.model - Fine-tuning XR-Transformer with tfn at level 0, nr_labels=2, avr_M_nnz=2
02/21/2022 16:19:02 - INFO - pecos.xmc.xtransformer.matcher - Downloaded sentence-transformers/bert-base-nli-cls-token model from s3.
02/21/2022 16:19:02 - INFO - pecos.xmc.xtransformer.matcher - ***** Encoding data len=3636 truncation=128*****
02/21/2022 16:19:02 - INFO - pecos.xmc.xtransformer.matcher - ***** Finished with time cost=0.3706974983215332 *****
02/21/2022 16:19:02 - INFO - pecos.xmc.xtransformer.matcher - trn tensors saved to /tmp/tmphr9_gcew/X_trn.pt
02/21/2022 16:19:02 - INFO - pecos.xmc.xtransformer.matcher - ***** Encoding data len=404 truncation=128*****
02/21/2022 16:19:02 - INFO - pecos.xmc.xtransformer.matcher - *

prec   = 1.73 1.73 1.65 1.55 1.58 1.49 1.31 1.21 1.10 1.01
recall = 0.50 0.77 1.15 1.29 1.71 1.82 1.85 1.88 1.90 1.91
| [0m 11      [0m | [0m 0.01733 [0m | [0m 0.000920[0m | [0m 0.6495  [0m | [0m 0.008222[0m | [0m 0.043   [0m |


02/21/2022 16:27:44 - INFO - pecos.xmc.xtransformer.model - Hierarchical label tree: [2, 32, 512, 27804]
02/21/2022 16:27:44 - INFO - pecos.xmc.xtransformer.model - Fine-tune Transformers with nr_labels=[2, 32, 512, 27804]
02/21/2022 16:27:44 - INFO - pecos.xmc.xtransformer.model - Fine-tuning XR-Transformer with tfn at level 0, nr_labels=2, avr_M_nnz=2
02/21/2022 16:27:47 - INFO - pecos.xmc.xtransformer.matcher - Downloaded sentence-transformers/bert-base-nli-cls-token model from s3.
02/21/2022 16:27:47 - INFO - pecos.xmc.xtransformer.matcher - ***** Encoding data len=3636 truncation=128*****
02/21/2022 16:27:47 - INFO - pecos.xmc.xtransformer.matcher - ***** Finished with time cost=0.2170729637145996 *****
02/21/2022 16:27:47 - INFO - pecos.xmc.xtransformer.matcher - trn tensors saved to /tmp/tmp9xvoitq3/X_trn.pt
02/21/2022 16:27:47 - INFO - pecos.xmc.xtransformer.matcher - ***** Encoding data len=404 truncation=128*****
02/21/2022 16:27:47 - INFO - pecos.xmc.xtransformer.matcher - *

prec   = 1.98 1.86 1.73 1.61 1.58 1.49 1.31 1.21 1.13 1.04
recall = 0.74 1.02 1.40 1.53 1.71 1.82 1.85 1.88 1.93 1.94
| [0m 12      [0m | [0m 0.0198  [0m | [0m 0.000197[0m | [0m 0.05102 [0m | [0m 0.004264[0m | [0m 0.1103  [0m |


02/21/2022 16:36:33 - INFO - pecos.xmc.xtransformer.model - Hierarchical label tree: [2, 32, 512, 27804]
02/21/2022 16:36:33 - INFO - pecos.xmc.xtransformer.model - Fine-tune Transformers with nr_labels=[2, 32, 512, 27804]
02/21/2022 16:36:33 - INFO - pecos.xmc.xtransformer.model - Fine-tuning XR-Transformer with tfn at level 0, nr_labels=2, avr_M_nnz=2
02/21/2022 16:36:36 - INFO - pecos.xmc.xtransformer.matcher - Downloaded sentence-transformers/bert-base-nli-cls-token model from s3.
02/21/2022 16:36:36 - INFO - pecos.xmc.xtransformer.matcher - ***** Encoding data len=3636 truncation=128*****
02/21/2022 16:36:37 - INFO - pecos.xmc.xtransformer.matcher - ***** Finished with time cost=0.21663808822631836 *****
02/21/2022 16:36:37 - INFO - pecos.xmc.xtransformer.matcher - trn tensors saved to /tmp/tmp2s2g8zdv/X_trn.pt
02/21/2022 16:36:37 - INFO - pecos.xmc.xtransformer.matcher - ***** Encoding data len=404 truncation=128*****
02/21/2022 16:36:37 - INFO - pecos.xmc.xtransformer.matcher - 

prec   = 1.73 1.73 1.73 1.61 1.58 1.49 1.31 1.21 1.10 1.01
recall = 0.73 1.00 1.40 1.53 1.71 1.82 1.85 1.88 1.90 1.91
| [0m 13      [0m | [0m 0.01733 [0m | [0m 0.000618[0m | [0m 0.2921  [0m | [0m 0.007229[0m | [0m 0.06431 [0m |


02/21/2022 16:45:32 - INFO - pecos.xmc.xtransformer.model - Hierarchical label tree: [2, 32, 512, 27804]
02/21/2022 16:45:32 - INFO - pecos.xmc.xtransformer.model - Fine-tune Transformers with nr_labels=[2, 32, 512, 27804]
02/21/2022 16:45:32 - INFO - pecos.xmc.xtransformer.model - Fine-tuning XR-Transformer with tfn at level 0, nr_labels=2, avr_M_nnz=2
02/21/2022 16:45:35 - INFO - pecos.xmc.xtransformer.matcher - Downloaded sentence-transformers/bert-base-nli-cls-token model from s3.
02/21/2022 16:45:35 - INFO - pecos.xmc.xtransformer.matcher - ***** Encoding data len=3636 truncation=128*****
02/21/2022 16:45:35 - INFO - pecos.xmc.xtransformer.matcher - ***** Finished with time cost=0.23044037818908691 *****
02/21/2022 16:45:36 - INFO - pecos.xmc.xtransformer.matcher - trn tensors saved to /tmp/tmpnmun3tka/X_trn.pt
02/21/2022 16:45:36 - INFO - pecos.xmc.xtransformer.matcher - ***** Encoding data len=404 truncation=128*****
02/21/2022 16:45:36 - INFO - pecos.xmc.xtransformer.matcher - 

prec   = 1.98 1.86 1.82 1.67 1.49 1.36 1.20 1.05 0.94 0.84
recall = 0.64 0.81 1.23 1.37 1.45 1.54 1.56 1.56 1.56 1.56
| [0m 14      [0m | [0m 0.0198  [0m | [0m 0.000745[0m | [0m 0.09051 [0m | [0m 0.007801[0m | [0m 0.1893  [0m |


02/21/2022 16:54:12 - INFO - pecos.xmc.xtransformer.model - Hierarchical label tree: [2, 32, 512, 27804]
02/21/2022 16:54:12 - INFO - pecos.xmc.xtransformer.model - Fine-tune Transformers with nr_labels=[2, 32, 512, 27804]
02/21/2022 16:54:12 - INFO - pecos.xmc.xtransformer.model - Fine-tuning XR-Transformer with tfn at level 0, nr_labels=2, avr_M_nnz=2
02/21/2022 16:54:15 - INFO - pecos.xmc.xtransformer.matcher - Downloaded sentence-transformers/bert-base-nli-cls-token model from s3.
02/21/2022 16:54:15 - INFO - pecos.xmc.xtransformer.matcher - ***** Encoding data len=3636 truncation=128*****
02/21/2022 16:54:16 - INFO - pecos.xmc.xtransformer.matcher - ***** Finished with time cost=0.21903419494628906 *****
02/21/2022 16:54:16 - INFO - pecos.xmc.xtransformer.matcher - trn tensors saved to /tmp/tmp3txt3ret/X_trn.pt
02/21/2022 16:54:16 - INFO - pecos.xmc.xtransformer.matcher - ***** Encoding data len=404 truncation=128*****
02/21/2022 16:54:16 - INFO - pecos.xmc.xtransformer.matcher - 

prec   = 1.98 1.73 1.65 1.55 1.58 1.49 1.31 1.21 1.10 1.01
recall = 0.52 0.77 1.15 1.29 1.71 1.82 1.85 1.88 1.90 1.91
| [0m 15      [0m | [0m 0.0198  [0m | [0m 0.000559[0m | [0m 0.1628  [0m | [0m 0.004185[0m | [0m 0.04963 [0m |


02/21/2022 17:02:55 - INFO - pecos.xmc.xtransformer.model - Hierarchical label tree: [2, 32, 512, 27804]
02/21/2022 17:02:55 - INFO - pecos.xmc.xtransformer.model - Fine-tune Transformers with nr_labels=[2, 32, 512, 27804]
02/21/2022 17:02:55 - INFO - pecos.xmc.xtransformer.model - Fine-tuning XR-Transformer with tfn at level 0, nr_labels=2, avr_M_nnz=2
02/21/2022 17:02:59 - INFO - pecos.xmc.xtransformer.matcher - Downloaded sentence-transformers/bert-base-nli-cls-token model from s3.
02/21/2022 17:02:59 - INFO - pecos.xmc.xtransformer.matcher - ***** Encoding data len=3636 truncation=128*****
02/21/2022 17:02:59 - INFO - pecos.xmc.xtransformer.matcher - ***** Finished with time cost=0.21631813049316406 *****
02/21/2022 17:02:59 - INFO - pecos.xmc.xtransformer.matcher - trn tensors saved to /tmp/tmpsiu61dir/X_trn.pt
02/21/2022 17:02:59 - INFO - pecos.xmc.xtransformer.matcher - ***** Encoding data len=404 truncation=128*****
02/21/2022 17:02:59 - INFO - pecos.xmc.xtransformer.matcher - 

prec   = 1.98 1.86 1.57 1.36 1.19 1.03 0.92 0.84 0.77 0.69
recall = 0.74 1.06 1.24 1.38 1.67 1.71 1.72 1.74 1.75 1.75
| [0m 16      [0m | [0m 0.0198  [0m | [0m 0.000994[0m | [0m 0.598   [0m | [0m 0.005193[0m | [0m 0.1576  [0m |


02/21/2022 17:11:44 - INFO - pecos.xmc.xtransformer.model - Hierarchical label tree: [2, 32, 512, 27804]
02/21/2022 17:11:44 - INFO - pecos.xmc.xtransformer.model - Fine-tune Transformers with nr_labels=[2, 32, 512, 27804]
02/21/2022 17:11:44 - INFO - pecos.xmc.xtransformer.model - Fine-tuning XR-Transformer with tfn at level 0, nr_labels=2, avr_M_nnz=2
02/21/2022 17:11:48 - INFO - pecos.xmc.xtransformer.matcher - Downloaded sentence-transformers/bert-base-nli-cls-token model from s3.
02/21/2022 17:11:48 - INFO - pecos.xmc.xtransformer.matcher - ***** Encoding data len=3636 truncation=128*****
02/21/2022 17:11:48 - INFO - pecos.xmc.xtransformer.matcher - ***** Finished with time cost=0.23035120964050293 *****
02/21/2022 17:11:48 - INFO - pecos.xmc.xtransformer.matcher - trn tensors saved to /tmp/tmp8a7uao96/X_trn.pt
02/21/2022 17:11:48 - INFO - pecos.xmc.xtransformer.matcher - ***** Encoding data len=404 truncation=128*****
02/21/2022 17:11:48 - INFO - pecos.xmc.xtransformer.matcher - 

prec   = 1.73 1.73 1.73 1.61 1.58 1.49 1.31 1.21 1.10 1.01
recall = 0.73 1.00 1.40 1.53 1.71 1.82 1.85 1.88 1.90 1.91
| [0m 17      [0m | [0m 0.01733 [0m | [0m 0.000504[0m | [0m 0.0508  [0m | [0m 0.009002[0m | [0m 0.06762 [0m |


02/21/2022 17:20:24 - INFO - pecos.xmc.xtransformer.model - Hierarchical label tree: [2, 32, 512, 27804]
02/21/2022 17:20:24 - INFO - pecos.xmc.xtransformer.model - Fine-tune Transformers with nr_labels=[2, 32, 512, 27804]
02/21/2022 17:20:24 - INFO - pecos.xmc.xtransformer.model - Fine-tuning XR-Transformer with tfn at level 0, nr_labels=2, avr_M_nnz=2
02/21/2022 17:20:27 - INFO - pecos.xmc.xtransformer.matcher - Downloaded sentence-transformers/bert-base-nli-cls-token model from s3.
02/21/2022 17:20:27 - INFO - pecos.xmc.xtransformer.matcher - ***** Encoding data len=3636 truncation=128*****
02/21/2022 17:20:28 - INFO - pecos.xmc.xtransformer.matcher - ***** Finished with time cost=0.3675193786621094 *****
02/21/2022 17:20:28 - INFO - pecos.xmc.xtransformer.matcher - trn tensors saved to /tmp/tmpf2o_p954/X_trn.pt
02/21/2022 17:20:28 - INFO - pecos.xmc.xtransformer.matcher - ***** Encoding data len=404 truncation=128*****
02/21/2022 17:20:28 - INFO - pecos.xmc.xtransformer.matcher - *

prec   = 1.73 1.73 1.73 1.61 1.58 1.49 1.31 1.21 1.10 1.01
recall = 0.73 1.00 1.40 1.53 1.71 1.82 1.85 1.88 1.90 1.91
| [0m 18      [0m | [0m 0.01733 [0m | [0m 0.000403[0m | [0m 0.1951  [0m | [0m 0.003064[0m | [0m 0.09667 [0m |


02/21/2022 17:29:08 - INFO - pecos.xmc.xtransformer.model - Hierarchical label tree: [2, 32, 512, 27804]
02/21/2022 17:29:08 - INFO - pecos.xmc.xtransformer.model - Fine-tune Transformers with nr_labels=[2, 32, 512, 27804]
02/21/2022 17:29:08 - INFO - pecos.xmc.xtransformer.model - Fine-tuning XR-Transformer with tfn at level 0, nr_labels=2, avr_M_nnz=2
02/21/2022 17:29:12 - INFO - pecos.xmc.xtransformer.matcher - Downloaded sentence-transformers/bert-base-nli-cls-token model from s3.
02/21/2022 17:29:12 - INFO - pecos.xmc.xtransformer.matcher - ***** Encoding data len=3636 truncation=128*****
02/21/2022 17:29:12 - INFO - pecos.xmc.xtransformer.matcher - ***** Finished with time cost=0.36335182189941406 *****
02/21/2022 17:29:12 - INFO - pecos.xmc.xtransformer.matcher - trn tensors saved to /tmp/tmp88lydbd0/X_trn.pt
02/21/2022 17:29:12 - INFO - pecos.xmc.xtransformer.matcher - ***** Encoding data len=404 truncation=128*****
02/21/2022 17:29:12 - INFO - pecos.xmc.xtransformer.matcher - 

prec   = 2.23 1.86 1.65 1.42 1.29 1.16 1.06 0.96 0.91 0.84
recall = 0.74 0.98 1.12 1.22 1.31 1.38 1.42 1.45 1.49 1.51
| [95m 19      [0m | [95m 0.02228 [0m | [95m 0.000602[0m | [95m 0.07526 [0m | [95m 0.002661[0m | [95m 0.06857 [0m |


02/21/2022 17:38:00 - INFO - pecos.xmc.xtransformer.model - Hierarchical label tree: [2, 32, 512, 27804]
02/21/2022 17:38:00 - INFO - pecos.xmc.xtransformer.model - Fine-tune Transformers with nr_labels=[2, 32, 512, 27804]
02/21/2022 17:38:00 - INFO - pecos.xmc.xtransformer.model - Fine-tuning XR-Transformer with tfn at level 0, nr_labels=2, avr_M_nnz=2
02/21/2022 17:38:03 - INFO - pecos.xmc.xtransformer.matcher - Downloaded sentence-transformers/bert-base-nli-cls-token model from s3.
02/21/2022 17:38:03 - INFO - pecos.xmc.xtransformer.matcher - ***** Encoding data len=3636 truncation=128*****
02/21/2022 17:38:04 - INFO - pecos.xmc.xtransformer.matcher - ***** Finished with time cost=0.3664591312408447 *****
02/21/2022 17:38:04 - INFO - pecos.xmc.xtransformer.matcher - trn tensors saved to /tmp/tmpgo6fjeov/X_trn.pt
02/21/2022 17:38:04 - INFO - pecos.xmc.xtransformer.matcher - ***** Encoding data len=404 truncation=128*****
02/21/2022 17:38:04 - INFO - pecos.xmc.xtransformer.matcher - *

prec   = 1.98 1.86 1.73 1.61 1.58 1.49 1.31 1.21 1.13 1.04
recall = 0.74 1.02 1.40 1.53 1.71 1.82 1.85 1.88 2.15 2.16
| [0m 20      [0m | [0m 0.0198  [0m | [0m 0.000394[0m | [0m 0.2329  [0m | [0m 0.007471[0m | [0m 0.1376  [0m |


02/21/2022 17:46:47 - INFO - pecos.xmc.xtransformer.model - Hierarchical label tree: [2, 32, 512, 27804]
02/21/2022 17:46:47 - INFO - pecos.xmc.xtransformer.model - Fine-tune Transformers with nr_labels=[2, 32, 512, 27804]
02/21/2022 17:46:47 - INFO - pecos.xmc.xtransformer.model - Fine-tuning XR-Transformer with tfn at level 0, nr_labels=2, avr_M_nnz=2
02/21/2022 17:46:50 - INFO - pecos.xmc.xtransformer.matcher - Downloaded sentence-transformers/bert-base-nli-cls-token model from s3.
02/21/2022 17:46:50 - INFO - pecos.xmc.xtransformer.matcher - ***** Encoding data len=3636 truncation=128*****
02/21/2022 17:46:50 - INFO - pecos.xmc.xtransformer.matcher - ***** Finished with time cost=0.21802043914794922 *****
02/21/2022 17:46:50 - INFO - pecos.xmc.xtransformer.matcher - trn tensors saved to /tmp/tmpz7610z1y/X_trn.pt
02/21/2022 17:46:50 - INFO - pecos.xmc.xtransformer.matcher - ***** Encoding data len=404 truncation=128*****
02/21/2022 17:46:50 - INFO - pecos.xmc.xtransformer.matcher - 

prec   = 1.73 1.73 1.73 1.61 1.58 1.49 1.31 1.21 1.10 1.01
recall = 0.73 1.00 1.40 1.53 1.71 1.82 1.85 1.88 1.90 1.91
| [0m 21      [0m | [0m 0.01733 [0m | [0m 0.000592[0m | [0m 0.2384  [0m | [0m 0.003228[0m | [0m 0.08255 [0m |


02/21/2022 17:55:37 - INFO - pecos.xmc.xtransformer.model - Hierarchical label tree: [2, 32, 512, 27804]
02/21/2022 17:55:37 - INFO - pecos.xmc.xtransformer.model - Fine-tune Transformers with nr_labels=[2, 32, 512, 27804]
02/21/2022 17:55:37 - INFO - pecos.xmc.xtransformer.model - Fine-tuning XR-Transformer with tfn at level 0, nr_labels=2, avr_M_nnz=2
02/21/2022 17:55:41 - INFO - pecos.xmc.xtransformer.matcher - Downloaded sentence-transformers/bert-base-nli-cls-token model from s3.
02/21/2022 17:55:41 - INFO - pecos.xmc.xtransformer.matcher - ***** Encoding data len=3636 truncation=128*****
02/21/2022 17:55:41 - INFO - pecos.xmc.xtransformer.matcher - ***** Finished with time cost=0.21714258193969727 *****
02/21/2022 17:55:41 - INFO - pecos.xmc.xtransformer.matcher - trn tensors saved to /tmp/tmpbxo4ooj9/X_trn.pt
02/21/2022 17:55:41 - INFO - pecos.xmc.xtransformer.matcher - ***** Encoding data len=404 truncation=128*****
02/21/2022 17:55:41 - INFO - pecos.xmc.xtransformer.matcher - 

prec   = 1.98 1.86 1.73 1.61 1.58 1.49 1.31 1.21 1.13 1.06
recall = 0.74 1.02 1.40 1.53 1.71 1.82 1.85 1.88 2.15 2.19
| [0m 22      [0m | [0m 0.0198  [0m | [0m 0.000435[0m | [0m 0.005611[0m | [0m 0.006144[0m | [0m 0.01127 [0m |


02/21/2022 18:04:19 - INFO - pecos.xmc.xtransformer.model - Hierarchical label tree: [2, 32, 512, 27804]
02/21/2022 18:04:19 - INFO - pecos.xmc.xtransformer.model - Fine-tune Transformers with nr_labels=[2, 32, 512, 27804]
02/21/2022 18:04:19 - INFO - pecos.xmc.xtransformer.model - Fine-tuning XR-Transformer with tfn at level 0, nr_labels=2, avr_M_nnz=2
02/21/2022 18:04:23 - INFO - pecos.xmc.xtransformer.matcher - Downloaded sentence-transformers/bert-base-nli-cls-token model from s3.
02/21/2022 18:04:23 - INFO - pecos.xmc.xtransformer.matcher - ***** Encoding data len=3636 truncation=128*****
02/21/2022 18:04:23 - INFO - pecos.xmc.xtransformer.matcher - ***** Finished with time cost=0.21836638450622559 *****
02/21/2022 18:04:23 - INFO - pecos.xmc.xtransformer.matcher - trn tensors saved to /tmp/tmpbijb27e6/X_trn.pt
02/21/2022 18:04:23 - INFO - pecos.xmc.xtransformer.matcher - ***** Encoding data len=404 truncation=128*****
02/21/2022 18:04:23 - INFO - pecos.xmc.xtransformer.matcher - 

prec   = 1.73 1.73 1.73 1.61 1.58 1.49 1.31 1.21 1.10 1.01
recall = 0.73 1.00 1.40 1.53 1.71 1.82 1.85 1.88 1.90 1.91
| [0m 23      [0m | [0m 0.01733 [0m | [0m 0.000155[0m | [0m 0.1906  [0m | [0m 0.004854[0m | [0m 0.1612  [0m |


02/21/2022 18:13:06 - INFO - pecos.xmc.xtransformer.model - Hierarchical label tree: [2, 32, 512, 27804]
02/21/2022 18:13:06 - INFO - pecos.xmc.xtransformer.model - Fine-tune Transformers with nr_labels=[2, 32, 512, 27804]
02/21/2022 18:13:06 - INFO - pecos.xmc.xtransformer.model - Fine-tuning XR-Transformer with tfn at level 0, nr_labels=2, avr_M_nnz=2
02/21/2022 18:13:09 - INFO - pecos.xmc.xtransformer.matcher - Downloaded sentence-transformers/bert-base-nli-cls-token model from s3.
02/21/2022 18:13:09 - INFO - pecos.xmc.xtransformer.matcher - ***** Encoding data len=3636 truncation=128*****
02/21/2022 18:13:09 - INFO - pecos.xmc.xtransformer.matcher - ***** Finished with time cost=0.21700453758239746 *****
02/21/2022 18:13:10 - INFO - pecos.xmc.xtransformer.matcher - trn tensors saved to /tmp/tmpapupxink/X_trn.pt
02/21/2022 18:13:10 - INFO - pecos.xmc.xtransformer.matcher - ***** Encoding data len=404 truncation=128*****
02/21/2022 18:13:10 - INFO - pecos.xmc.xtransformer.matcher - 

prec   = 1.73 1.61 1.82 1.79 1.68 1.53 1.34 1.18 1.05 0.94
recall = 0.39 0.53 1.10 1.39 1.52 1.60 1.62 1.62 1.62 1.62
| [0m 24      [0m | [0m 0.01733 [0m | [0m 0.000639[0m | [0m 0.5404  [0m | [0m 0.001087[0m | [0m 0.1207  [0m |


02/21/2022 18:21:58 - INFO - pecos.xmc.xtransformer.model - Hierarchical label tree: [2, 32, 512, 27804]
02/21/2022 18:21:58 - INFO - pecos.xmc.xtransformer.model - Fine-tune Transformers with nr_labels=[2, 32, 512, 27804]
02/21/2022 18:21:58 - INFO - pecos.xmc.xtransformer.model - Fine-tuning XR-Transformer with tfn at level 0, nr_labels=2, avr_M_nnz=2
02/21/2022 18:22:02 - INFO - pecos.xmc.xtransformer.matcher - Downloaded sentence-transformers/bert-base-nli-cls-token model from s3.
02/21/2022 18:22:02 - INFO - pecos.xmc.xtransformer.matcher - ***** Encoding data len=3636 truncation=128*****
02/21/2022 18:22:02 - INFO - pecos.xmc.xtransformer.matcher - ***** Finished with time cost=0.3708474636077881 *****
02/21/2022 18:22:02 - INFO - pecos.xmc.xtransformer.matcher - trn tensors saved to /tmp/tmp0d5x386w/X_trn.pt
02/21/2022 18:22:02 - INFO - pecos.xmc.xtransformer.matcher - ***** Encoding data len=404 truncation=128*****
02/21/2022 18:22:02 - INFO - pecos.xmc.xtransformer.matcher - *

prec   = 1.98 1.86 1.73 1.61 1.58 1.49 1.31 1.21 1.10 1.01
recall = 0.74 1.02 1.40 1.53 1.71 1.82 1.85 1.88 1.90 1.91
| [0m 25      [0m | [0m 0.0198  [0m | [0m 0.000551[0m | [0m 0.1092  [0m | [0m 0.003251[0m | [0m 0.1751  [0m |
Best result: {'adam_epsilon': 0.0006027209623799, 'hidden_dropout_prob': 0.07526068743757297, 'learning_rate': 0.002661347396322538, 'weight_decay': 0.06856982448680521}; f(x) = 0.022277227722772276.


### Train

In [21]:
# Creating cluster chain during training
cluster_chain = None

In [22]:
# Create train and valid ML problems
train_prob = MLProblemWithText(
    X_text=train_questions_df.question_text.values.tolist(), 
    Y=train_question_paper_csr, 
    X_feat=train_questions_csr
)

valid_prob = MLProblemWithText(
    X_text=valid_questions_df.question_text.values.tolist(), 
    Y=valid_question_paper_csr, 
    X_feat=valid_questions_csr
)

In [23]:
train_params_dict = {
    'do_fine_tune': True,
    'fix_clustering': False,
    'matcher_params_chain': {
        'model_shortcut': bert_model_name,
        'loss_function': 'squared-hinge',
        'batch_size': 16,
        'max_no_improve_cnt': 50,
        'num_train_epochs': 3,
        'use_gpu': True,
        'logging_steps': 200,
        'save_steps': 200,
        'learning_rate': 1e-3,
        'bootstrap_method': 'linear',
        'hidden_dropout_prob': 0.45
    },
}

predict_params_dict = {
    'only_topk': 10,
    'truncate_length': 300
}

train_params = XTransformer.TrainParams.from_dict(train_params_dict, recursive=True)
pred_params = XTransformer.PredParams.from_dict(predict_params_dict, recursive=True)

In [24]:
xtf = XTransformer.train(
    prob=train_prob,
    clustering=cluster_chain,
    val_prob=valid_prob,
    train_params=train_params,
    pred_params=pred_params,
    y_feat=papers_csr
)

02/21/2022 18:30:44 - INFO - pecos.xmc.xtransformer.model - Hierarchical label tree: [2, 32, 512, 27804]
02/21/2022 18:30:44 - INFO - pecos.xmc.xtransformer.model - Fine-tune Transformers with nr_labels=[2, 32, 512, 27804]
02/21/2022 18:30:44 - INFO - pecos.xmc.xtransformer.model - Fine-tuning XR-Transformer with tfn at level 0, nr_labels=2, avr_M_nnz=2
02/21/2022 18:30:48 - INFO - pecos.xmc.xtransformer.matcher - Downloaded sentence-transformers/bert-base-nli-cls-token model from s3.
02/21/2022 18:30:48 - INFO - pecos.xmc.xtransformer.matcher - ***** Encoding data len=3636 truncation=300*****
02/21/2022 18:30:48 - INFO - pecos.xmc.xtransformer.matcher - ***** Finished with time cost=0.5879473686218262 *****
02/21/2022 18:30:48 - INFO - pecos.xmc.xtransformer.matcher - trn tensors saved to /tmp/tmplbg79d6d/X_trn.pt
02/21/2022 18:30:48 - INFO - pecos.xmc.xtransformer.matcher - ***** Encoding data len=404 truncation=300*****
02/21/2022 18:30:49 - INFO - pecos.xmc.xtransformer.matcher - *

In [25]:
xtf.save("models/xmc_transformer_4e")

02/21/2022 18:47:24 - INFO - pecos.xmc.xtransformer.model - Parameters saved to models/xmc_transformer_4e/param.json
02/21/2022 18:47:25 - INFO - pecos.xmc.xtransformer.model - Model saved to models/xmc_transformer_4e


### Validation

In [26]:
predicted_question_paper_csr = xtf.predict(X_text=valid_questions_df.question_text.values.tolist(), 
                                           X_feat=valid_questions_csr,
                                           beam_size=10,
                                           only_topk=10)

02/21/2022 18:47:25 - INFO - pecos.utils.torch_util - Setting device to cuda, number of active GPUs: 1
02/21/2022 18:47:25 - INFO - pecos.xmc.xtransformer.matcher - ***** Encoding data len=404 truncation=300*****
02/21/2022 18:47:25 - INFO - pecos.xmc.xtransformer.matcher - ***** Finished with time cost=0.05874300003051758 *****
02/21/2022 18:47:25 - INFO - pecos.xmc.xtransformer.matcher - Predict on input text tensors(torch.Size([404, 300]))


In [27]:
metric = smat_util.Metrics.generate(valid_question_paper_csr, 
                                    predicted_question_paper_csr, 
                                    topk=10)
print(metric)

prec   = 1.73 1.73 1.73 1.61 1.58 1.49 1.31 1.21 1.10 1.01
recall = 0.73 1.00 1.40 1.53 1.71 1.82 1.85 1.88 1.90 1.91


### Checking

In [28]:
def get_papers_by_text(text, 
                       vectorizer,
                       number_of_papers=10, 
                       check_correctness=False):
  
    vector_text_csr = vectorizer.transform([text])

    papers_csr = xlm.predict(vector_text_csr,
                             beam_size=4,
                             only_topk=number_of_papers)

    prediction = papers_csr.toarray()
    paper_labels = np.nonzero(prediction)[1].tolist()

    # print()
    # print('Found papers: ', len(paper_labels))
    # print(paper_labels)

    paper_logists = np.take(prediction, paper_labels).tolist()

    result = papers_df.loc[paper_labels, ['title', 'abstract', 'snippet_url']]
    result['score'] = paper_logists
    result = result.sort_values('score', ascending=False)

    if check_correctness:
        correct_papers = df.loc[df.question_text == text, 'paper_id'].unique()
        print(f'\nTrue number of papers: {len(correct_papers)}')
        result['correctness'] = result.index.isin(correct_papers)

    return result

In [29]:
correct_examples = [
    'Which genome browser database for DNA shape annotations is available?',
    'Has strimvelis been approved by the European Medicines Agency?',
    'What is the target of Volanesorsen?',
    'Is eptinezumab a small molecule?'
]

interesting_examples = [
    'For which type of diabetes can empagliflozin be used?',
    "Is Semagacestat effective for Alzheimer's Disease?",
    'Which gene is associated with Muenke syndrome?'     
]

In [30]:
get_papers_by_text('Which gene is associated with Muenke syndrome?',
                   vectorizer=vectorizer, 
                   number_of_papers=30,
                   check_correctness=True)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

NameError: name 'xlm' is not defined

In [None]:
# Random example
temp = valid_questions_df.question_text.sample().values[0]
print(temp)

get_papers_by_text(temp,
                   vectorizer=vectorizer,
                   number_of_papers=30,
                   check_correctness=True)

### Highlight answers

In [None]:
question_text = 'Which genome browser database for DNA shape annotations is available?'

In [None]:
result = get_papers_by_text(
    question_text,
    vectorizer=vectorizer, 
    number_of_papers=30,
    check_correctness=True
)

In [None]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
tokenizer = T5Tokenizer.from_pretrained("ozcangundes/T5-base-for-BioQA")
model = T5ForConditionalGeneration.from_pretrained("ozcangundes/T5-base-for-BioQA")

def get_answer(question, context):
    source_encoding=tokenizer(
        question,
        context,
        max_length=512,
        padding="max_length",
        truncation="only_second",
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors="pt")

    generated_ids=model.generate(
        input_ids=source_encoding["input_ids"],
        attention_mask=source_encoding["attention_mask"])

    preds=[tokenizer.decode(gen_id, skip_special_tokens=True, clean_up_tokenization_spaces=True) for gen_id in generated_ids]
    
    return "".join(preds)

# To highlight the answers in the results
def highlight_selected_text(row):
    abstract = row["abstract"]
    title = row["title"]
    ext = []
    ext.append(row["abstract_answer"])
    ext.append(row['title_answer'])
    # for k, v in color.items():
    for k in ext:
      if len(k) > 2:
        abstract = abstract.replace(k, f'<span style="color: red; background-color: yellow; font-weight: bold">{k}</span>')
        title = title.replace(k, f'<span style="color: red; background-color: yellow; font-weight: bold">{k}</span>')
    return abstract, title

In [None]:
result['abstract_answer'] = result['abstract'].apply(lambda x: get_answer(question_text, x))
result['title_answer'] = result['title'].apply(lambda x: get_answer(question_text, x))

In [None]:
from IPython.core.display import HTML
result['abstract'], result['title'] = zip(*result.apply(highlight_selected_text, axis=1))
display(HTML(result.sample(10).to_html(escape=False)))