In [7]:
BASE_DATASETS_PATH = '../DATASETS/'

In [8]:
datasets = [
    {
        "dataset_desc": "CSAbstruct",
        "dataset_path": f"{BASE_DATASETS_PATH}CSAbstruct/",
        "file_prefix": "csabstruct",
        "cls_models": [("gubartz/cls_minilm_abstruct", "nreimers/MiniLM-L6-H384-uncased"),
                       ("gubartz/cls_scibert_abstruct", "allenai/scibert_scivocab_uncased")]
    },
    {
        "dataset_desc": "PubMed-RCT",
        "dataset_path": f"{BASE_DATASETS_PATH}PubMed-RCT/",
        "file_prefix": "pubmed_rct",
        "cls_models": [("gubartz/cls_minilm_pubmed_rct", "nreimers/MiniLM-L6-H384-uncased"),
                       ("gubartz/cls_scibert_pubmed_rct", "allenai/scibert_scivocab_uncased")]
    },
    {
        "dataset_desc": "PMC-Sents-FULL",
        "dataset_path": f"{BASE_DATASETS_PATH}PMC-Sents-FULL/",
        "file_prefix": "pmc_sents_full",
        "cls_models": [("gubartz/cls_minilm_pmc_sents_full", "nreimers/MiniLM-L6-H384-uncased"),
                       ("gubartz/cls_scibert_pmc_sents_full", "allenai/scibert_scivocab_uncased")]
    }
]

In [9]:
import json
import os

already_processed = set()

if os.path.exists('cls_lm_results.jsonl'):
    with open('cls_lm_results.jsonl', 'r', encoding="utf-8") as f:
        for l in f:
            item = json.loads(l)
            already_processed.add((item['dataset'], item['lm_model'], item['cls_model']))

In [10]:
already_processed

{('CSAbstruct', 'gubartz/cls_minilm_abstruct', '-'),
 ('CSAbstruct', 'gubartz/cls_scibert_abstruct', '-'),
 ('PMC-Sents-FULL', 'gubartz/cls_minilm_pmc_sents_full', '-'),
 ('PMC-Sents-FULL', 'gubartz/cls_scibert_pmc_sents_full', '-'),
 ('PubMed-RCT', 'gubartz/cls_minilm_pubmed_rct', '-'),
 ('PubMed-RCT', 'gubartz/cls_scibert_pubmed_rct', '-')}

In [11]:
from torch.utils.data import Dataset

class MyDataset(Dataset):
    def __init__(self, data):
        self.data = data
    
    def __len__(self):
        return len(self.data.index)

    def __getitem__(self, i):
        item = self.data.iloc[i]
        
        return item['sentence']

In [16]:
import pandas as pd
from tqdm.auto import tqdm
import json
from transformers import pipeline
from transformers import AutoTokenizer, AutoConfig
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import f1_score
import sys

pbar_datasets = tqdm(datasets)
batch_size = 256

for dataset in pbar_datasets:
    pbar_datasets.set_description(dataset['dataset_desc'])
   
    test_dataset = f"{dataset['dataset_path']}{dataset['file_prefix']}_test.parquet"
    df_test = pd.read_parquet(test_dataset)
    
    # sort for less padding
    df_test = df_test.sort_values(by="sentence", key=lambda x: x.str.len())
    mydataset = MyDataset(df_test)
    
    print(f"Test Size: {len(df_test.index)}")
    
    pbar_cls_models = tqdm(dataset['cls_models'], leave=False)
    
    for cls_model in pbar_cls_models:
        pbar_cls_models.set_description(cls_model[0])

        item = (dataset['dataset_desc'], cls_model[0], '-')
        if item in already_processed:
            continue               
        
        tokenizer = AutoTokenizer.from_pretrained(cls_model[1])
        classifier = pipeline(model=cls_model[0],
                              tokenizer=tokenizer,
                              padding=True,
                              max_length=512,
                              truncation=True)
        config = AutoConfig.from_pretrained(cls_model[0])
        
        # df_test['maped_label'] = df_test['subject_label'].map(config.label2id)
 
        # predictions = classifier(list(df_test['sentence']))
        
        predictions = []
        for pred in tqdm(classifier(mydataset, batch_size=batch_size),
                   total=len(mydataset), desc='Classification', leave=False):
            predictions.append(pred)
            
        print(f"predictions Size: {len(predictions)}")            

        y_pred = []
        for pred in predictions:
            y_pred.append(config.label2id[pred['label']])
        y_test_true = list(df_test['label_id'])
        
        output_dict = {
          'dataset': dataset['dataset_desc'],
          'lm_model': cls_model[0],
          'cls_model': "-",
        }        

        output_dict['f1_micro'] = f1_score(y_test_true, y_pred, average='micro')

        with open('cls_lm_results.jsonl', 'a', encoding="utf-8") as f:
            f.write(json.dumps(output_dict))
            f.write("\n")        

  0%|          | 0/3 [00:00<?, ?it/s]

Test Size: 1349


  0%|          | 0/2 [00:00<?, ?it/s]

Test Size: 30122


  0%|          | 0/2 [00:00<?, ?it/s]

Test Size: 17310


  0%|          | 0/2 [00:00<?, ?it/s]

In [17]:
results = []
with open('cls_lm_results.jsonl', 'r', encoding="utf-8") as f:
    for l in f:
        results.append(json.loads(l))

In [19]:
import pandas as pd

df_result = pd.DataFrame(results)[['dataset', 'lm_model', 'cls_model', 'f1_micro']]
df_result['f1_micro'] = round(df_result['f1_micro'] * 100, 2)
df_result.sort_values(by=["dataset", "lm_model", "cls_model"])

Unnamed: 0,dataset,lm_model,cls_model,f1_micro
0,CSAbstruct,gubartz/cls_minilm_abstruct,-,68.05
1,CSAbstruct,gubartz/cls_scibert_abstruct,-,66.42
4,PMC-Sents-FULL,gubartz/cls_minilm_pmc_sents_full,-,70.02
5,PMC-Sents-FULL,gubartz/cls_scibert_pmc_sents_full,-,69.97
2,PubMed-RCT,gubartz/cls_minilm_pubmed_rct,-,82.4
3,PubMed-RCT,gubartz/cls_scibert_pubmed_rct,-,82.12
