In [1]:
import os
from itertools import chain

import numpy as np
from datasets import load_from_disk, load_dataset
from transformers import AutoTokenizer

# Storage roots come from the environment; each is None when the variable is unset.
dsdir = os.environ.get("DSDIR")
scratch = os.environ.get("SCRATCH")

# Directory of locally stored Hugging Face models, and the BERT checkpoint inside it.
idr_models = f"{dsdir}/HuggingFace_Models"
bert_path = f"{idr_models}/bert-base-uncased"

# Raw BioASQ 7b data under the scratch space.
bioasq_raw_path = f"{scratch}/pretrain-med-data-qual/data/bioasq_7b_raw"

# Pretraining set

In [11]:
# Load the preprocessed PubMed dataset saved in the working directory.
ds = load_from_disk(dataset_path="pubmed_preproc")

Loading dataset from disk:   0%|          | 0/215 [00:00<?, ?it/s]

In [14]:
# Keep only the model-input columns; everything else (the filtering metrics)
# is dropped from both splits.
columns_to_keep = [
    "input_ids",
    "token_type_ids",
    "attention_mask",
    "special_tokens_mask",
]

for split_name in ("train", "validation"):
    extra_columns = [
        col for col in ds[split_name].column_names if col not in columns_to_keep
    ]
    ds[split_name] = ds[split_name].remove_columns(extra_columns)

In [30]:
# Take the first `subset_size` rows of each split (presumably a small sample
# for quick experimentation). The size is capped at the split length because
# `select` raises on indices past the end of the dataset.
subset_size = 10000
train_ds = ds["train"].select(range(min(subset_size, len(ds["train"]))))
val_ds = ds["validation"].select(range(min(subset_size, len(ds["validation"]))))

In [31]:
# Length, in tokens, of each chunk produced by the grouping step below.
max_seq_length = 512
# Tokenizer loaded from the local BERT checkpoint; its pad token id fills the last chunk.
tokenizer = AutoTokenizer.from_pretrained(bert_path)

In [32]:
def group_tokens_and_pad(examples, seq_length=None, pad_token_id=None):
    """Concatenate a batch of tokenized examples and re-split it into fixed-size chunks.

    Every column of the batch is flattened into one long list, the tail is
    padded up to the next multiple of the chunk length, and the result is cut
    into consecutive chunks of exactly that length.

    Args:
        examples: Mapping from column name to a list of per-example lists
            (the batch passed by ``Dataset.map(..., batched=True)``).
        seq_length: Chunk length. Defaults to the notebook-level
            ``max_seq_length`` when omitted.
        pad_token_id: Value appended to ``input_ids`` as padding. Defaults to
            ``tokenizer.pad_token_id`` of the notebook-level tokenizer.

    Returns:
        A dict with the same keys as ``examples`` whose values are lists of
        chunks. An empty batch (no columns) yields an empty dict.

    Notes:
        Only the four known model-input columns are padded, and only those
        that are actually present. Any other column is chunked without
        padding, so its final chunk may be shorter.
    """
    if seq_length is None:
        seq_length = max_seq_length
    if pad_token_id is None:
        pad_token_id = tokenizer.pad_token_id

    # Flatten each column: [[a, b], [c]] -> [a, b, c].
    concatenated = {
        key: list(chain.from_iterable(examples[key])) for key in examples.keys()
    }
    if not concatenated:
        return {}

    # Padding value per column: masked out for attention, flagged as special.
    pad_values = {
        "input_ids": pad_token_id,
        "token_type_ids": 0,
        "attention_mask": 0,
        "special_tokens_mask": 1,
    }

    # All columns are assumed to share the length of the first one.
    total_length = len(next(iter(concatenated.values())))
    remainder = total_length % seq_length
    if remainder != 0:
        pad_len = seq_length - remainder
        for key, pad_value in pad_values.items():
            if key in concatenated:
                concatenated[key].extend([pad_value] * pad_len)
        # Round the total up to the next multiple of the chunk length.
        total_length += pad_len

    # Split every column into consecutive chunks of `seq_length`.
    return {
        key: [
            tokens[start : start + seq_length]
            for start in range(0, total_length, seq_length)
        ]
        for key, tokens in concatenated.items()
    }

# Both splits are regrouped with exactly the same settings, so the shared
# `map` arguments are written once.
_grouping_kwargs = dict(
    batched=True,
    batch_size=1024,
    num_proc=8,
    load_from_cache_file=True,
    desc=f"Grouping texts in chunks of {max_seq_length}",
)

train_ds = train_ds.map(group_tokens_and_pad, **_grouping_kwargs)
val_ds = val_ds.map(group_tokens_and_pad, **_grouping_kwargs)

Grouping texts in chunks of 512 (num_proc=8):   0%|          | 0/10000 [00:00<?, ? examples/s]

In [7]:
# "random" is not in `columns_to_keep`, so the column-pruning cell above has
# already removed it from `ds`. Read it from the on-disk dataset instead so
# this cell also works when the notebook is run top to bottom.
# NOTE(review): assumes "pubmed_preproc" on disk still carries the "random" column.
load_from_disk("pubmed_preproc")["train"][0]["random"]

2.3444218515250483

# Fine-tuning task sets

In [2]:
# (repository id, configuration name) for every fine-tuning dataset.
# A configuration of None means the repository default is used.
ds_name_config = [
    ("bigbio/blurb", "bc5chem"),
    ("bigbio/blurb", "bc5disease"),
    ("bigbio/blurb", "bc2gm"),
    ("bigbio/blurb", "jnlpba"),
    ("bigbio/blurb", "ncbi_disease"),
    ("bigbio/ebm_pico", None),
    ("bigbio/chemprot", "chemprot_bigbio_kb"),
    ("bigbio/ddi_corpus", "ddi_corpus_bigbio_kb"),
    ("bigbio/gad", "gad_blurb_bigbio_text"),
    ("bigbio/biosses", "biosses_bigbio_pairs"),
    ("bigbio/hallmarks_of_cancer", None),
    ("bigbio/bioasq_task_b", "bioasq_blurb_bigbio_qa"),
    ("bigbio/pubmed_qa", "pubmed_qa_labeled_fold0_bigbio_qa"),
]

# The BLURB entries all share one repository id, so they are labelled by their
# configuration name; every other entry is labelled by its repository id.
# Deriving this from the repo id (not from the list position) keeps the
# listing correct if entries are added or reordered.
ds_labels = [
    config if repo == "bigbio/blurb" else repo
    for repo, config in ds_name_config
]

# Numbered listing; the number is the index to use when picking a dataset.
for idx, label in enumerate(ds_labels):
    print(idx, label)

0 bc5chem
1 bc5disease
2 bc2gm
3 jnlpba
4 ncbi_disease
5 bigbio/ebm_pico
6 bigbio/chemprot
7 bigbio/ddi_corpus
8 bigbio/gad
9 bigbio/biosses
10 bigbio/hallmarks_of_cancer
11 bigbio/bioasq_task_b
12 bigbio/pubmed_qa


In [3]:
# Index into `ds_name_config` (see the numbered listing printed above).
choice = 10
ds_repo, ds_config = ds_name_config[choice]

ds = load_dataset(
    ds_repo,
    name=ds_config,
    cache_dir=".blurb_cache",
    # NOTE(review): this lets the dataset repository's loading script run locally.
    trust_remote_code=True,
    # BioASQ gets a local data directory, as in the bulk-loading loop below;
    # every other dataset keeps the default (None).
    # NOTE(review): presumably `bioasq_raw_path` is where the raw BioASQ files live — confirm.
    data_dir=bioasq_raw_path if ds_repo.startswith("bigbio/bioasq") else None,
)

In [9]:
# Load every fine-tuning dataset once (this also fills the local cache).
# Results are stored in a dict keyed by (repository id, configuration) rather
# than rebinding `ds` on each iteration, so the dataset selected in the
# previous cell is still the one inspected in the cells that follow.
finetune_datasets = {}
for repo, config in ds_name_config:
    finetune_datasets[(repo, config)] = load_dataset(
        repo,
        name=config,
        cache_dir=".blurb_cache",
        # NOTE(review): this lets the dataset repository's loading script run locally.
        trust_remote_code=True,
        # Only BioASQ is given a local data directory. The original referenced
        # an undefined `bioasq_data_dir`; `bioasq_raw_path` from the config
        # cell is used instead — NOTE(review): confirm this is the intended path.
        data_dir=bioasq_raw_path if repo.startswith("bigbio/bioasq") else None,
    )

In [4]:
# Show the schema (column names and feature types) of the training split.
ds["train"].features

{'document_id': Value(dtype='string', id=None),
 'text': Value(dtype='string', id=None),
 'label': [ClassLabel(names=['evading growth suppressors', 'tumor promoting inflammation', 'enabling replicative immortality', 'cellular energetics', 'resisting cell death', 'activating invasion and metastasis', 'genomic instability and mutation', 'none', 'inducing angiogenesis', 'sustaining proliferative signaling', 'avoiding immune destruction'], id=None)]}

In [5]:
# Show the DatasetDict summary: the available splits with their columns and row counts.
ds

DatasetDict({
    train: Dataset({
        features: ['document_id', 'text', 'label'],
        num_rows: 12119
    })
    test: Dataset({
        features: ['document_id', 'text', 'label'],
        num_rows: 3547
    })
    validation: Dataset({
        features: ['document_id', 'text', 'label'],
        num_rows: 1798
    })
})