In [2]:
import os
import numpy as np
import datasets
from transformers import AutoTokenizer

# project root path
dsdir = os.getenv("DSDIR")
scratch = os.getenv("SCRATCH")
root = f"{scratch}/pretrain-med-data-qual"
idr_models = f"{dsdir}/HuggingFace_Models"
bioasq_raw_path = f"{root}/data/bioasq_7b_raw"
bert_path = f"{idr_models}/bert-base-uncased"

# pretrain set

In [11]:
ds = load_from_disk("pubmed_preproc")

Loading dataset from disk:   0%|          | 0/215 [00:00<?, ?it/s]

In [14]:
# Remove filtering metrics columns
columns_to_keep = [
    "input_ids",
    "token_type_ids",
    "attention_mask",
    "special_tokens_mask",
]
ds["train"] = ds["train"].remove_columns(
    [c for c in ds["train"].column_names
     if c not in columns_to_keep]
    )

ds["validation"] = ds["validation"].remove_columns(
    [c for c in ds["validation"].column_names 
     if c not in columns_to_keep]
)

In [30]:
train_ds = ds["train"].select(range(10000))
val_ds = ds["validation"].select(range(10000))

In [31]:
from itertools import chain
max_seq_length = 512
tokenizer = AutoTokenizer.from_pretrained(bert_path)

In [32]:
def group_tokens_and_pad(examples):
    # Concatenate all texts in the batch
    concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # padding variable
    remainder = total_length % max_seq_length 
    # update total length
    total_length = (total_length // max_seq_length) * max_seq_length 
    # Add padding tokens and masks
    if remainder != 0:
        total_length += max_seq_length 
        concatenated_examples["input_ids"] += [tokenizer.pad_token_id] * (max_seq_length - remainder)
        concatenated_examples["token_type_ids"] += [0] * (max_seq_length - remainder)
        concatenated_examples["attention_mask"] += [0] * (max_seq_length - remainder)
        concatenated_examples["special_tokens_mask"] += [1] * (max_seq_length - remainder)
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
        for k, t in concatenated_examples.items()
    }
    return result

train_ds = train_ds.map(
    group_tokens_and_pad,
    batched=True,
    batch_size=1024,
    num_proc=8,
    load_from_cache_file=True,
    desc=f"Grouping texts in chunks of {max_seq_length}",
) 
val_ds = val_ds.map(
    group_tokens_and_pad,
    batched=True,
    batch_size=1024,
    num_proc=8,
    load_from_cache_file=True,
    desc=f"Grouping texts in chunks of {max_seq_length}",
) 

Grouping texts in chunks of 512 (num_proc=8):   0%|          | 0/10000 [00:00<?, ? examples/s]

In [7]:
ds["train"][0]["random"] 

2.3444218515250483

# finetuning tasks sets

In [58]:
ds_name_config =  [
        ("bigbio/blurb","bc5chem"),
        ("bigbio/blurb","bc5disease"),
        ("bigbio/blurb","bc2gm"),
        ("bigbio/blurb","jnlpba"),
        ("bigbio/blurb","ncbi_disease"),
        ("bigbio/ebm_pico", None), 
        ("bigbio/chemprot", "chemprot_bigbio_kb"),
        ("bigbio/ddi_corpus", "ddi_corpus_bigbio_kb"),
        ("bigbio/gad", "gad_blurb_bigbio_text"),
        ("bigbio/biosses", "biosses_bigbio_pairs"),
        ("bigbio/hallmarks_of_cancer", None),
        ("bigbio/bioasq_task_b","bioasq_blurb_bigbio_qa"),
        ("bigbio/pubmed_qa","pubmed_qa_labeled_fold0_bigbio_qa"),
]
for i,dname in enumerate(ds_name_config):
    if i < 5 : print(i,dname[1])
    else : print(i,dname[0])

0 bc5chem
1 bc5disease
2 bc2gm
3 jnlpba
4 ncbi_disease
5 bigbio/ebm_pico
6 bigbio/chemprot
7 bigbio/ddi_corpus
8 bigbio/gad
9 bigbio/biosses
10 bigbio/hallmarks_of_cancer
11 bigbio/bioasq_task_b
12 bigbio/pubmed_qa


In [59]:
choice = 8
ds = datasets.load_dataset(
    ds_name_config[choice][0],
    name=ds_name_config[choice][1],
    cache_dir='.blurb_cache',
)

In [28]:
ds["train"].features

{'id': Value(dtype='string', id=None),
 'text': Value(dtype='string', id=None),
 'label': Value(dtype='string', id=None)}

In [12]:
def get_label_list(raw_dataset, split="train"):
    """Get the list of labels from a multi-label dataset"""

    if isinstance(raw_dataset[split]["label"][0], list):
        label_list = [label for sample in raw_dataset[split]["label"] for label in sample]
        label_list = list(set(label_list))
    else:
        label_list = raw_dataset[split].unique("label")
    # we will treat the label list as a list of string instead of int, consistent with model.config.label2id
    label_list = [str(label) for label in label_list]
    return label_list

In [30]:
set(get_label_list(ds,split="train")).difference(set(get_label_list(ds,split="test")))

{'Undefined'}

# gad preprocess

In [62]:
ds = datasets.load_from_disk("gad")

In [67]:
ds["train"][4]

{'id': '4',
 'document_id': '4',
 'text': 'In conclusion, @GENE$ 8092C > A polymorphism may modify the associations between cumulative cigarette smoking and @DISEASE$ risk.',
 'label': 1}

In [60]:
ds = ds.map(lambda x: {"labels":np.array(x).flatten().astype(int)}, input_columns="labels", batched=True)
ds = ds.rename_column("labels","label")
ds = ds.cast_column("label",datasets.ClassLabel(num_classes=2, names=['no_relation', 'gene-disease_relation']))

Map:   0%|          | 0/4261 [00:00<?, ? examples/s]

Map:   0%|          | 0/535 [00:00<?, ? examples/s]

Map:   0%|          | 0/534 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/4261 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/535 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/534 [00:00<?, ? examples/s]

In [57]:
!rm -rf gad

In [61]:
ds.save_to_disk('gad')

Saving the dataset (0/1 shards):   0%|          | 0/4261 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/535 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/534 [00:00<?, ? examples/s]

# biosses grid search

In [105]:
from transformers import BertModel
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import GridSearchCV
import evaluate

In [99]:
ds = load_from_disk("biosses")
models_paths = [
    f"{idr_models}/bert-base-uncased",
    f"{root}/pretraining/random_25%",
    f"{root}/pretraining/h-index_mid25%",
    f"{root}/pretraining/sjr_top25%",
    f"{root}/pretraining/sjr_mid25%",
    f"{root}/pretraining/h-index_mid50%",
    f"{root}/pretraining/h-index_top25%",
]

pearsonr = evaluate.load('../evaluation/metrics/evaluate_pearsonr.py',cache_dir='.test_cache')

base_regressor = SGDRegressor(max_iter=100, random_state=42)
hp_grid = {
    'alpha':[float(f'1e{p}') for p in range(-5,2,1)], # regularization term constant multiplier
    'eta0':[float(f'1e{p}') for p in range(-5,0,1)] # learning rate
}
gs_sgdreg = GridSearchCV(
    base_regressor,
    param_grid=hp_grid,
    scoring='neg_mean_squared_error',
)
def encode_dataset(dataset):
    tokenized_set = tokenizer(
        text=dataset["text_1"],
        text_pair=dataset["text_2"],
        padding=True,
        return_tensors='pt'
    ).to('cuda')
    X = model(**tokenized_set).last_hidden_state[:,0,:]
    X = X_train.to('cpu').detach().numpy()
    y = [float(y) for y in dataset["label"]]
    return X,y

In [103]:
results = {}
for m in models_paths:
    model = BertModel.from_pretrained(m).to('cuda')
    tokenizer = AutoTokenizer.from_pretrained(m) 
    # ENCODE TRAIN DATA
    X_train, y_train = encode_dataset(ds["train"])
    # ENCODE TEST DATA
    X_test, y_test = encode_dataset(ds["test"])
    # do hp_search
    gs_best_params = gs_sgdreg.fit(X_train,y_train).best_params_
    # train regressor
    regressor = SGDRegressor(**gs_best_params).fit(X_train,y_train)
    # predict and eval
    y_pred = regressor.predict(X_test)
    pearsonr_result = pearsonr.compute(predictions=y_pred,references=y_test)["pearsonr"]
    results[m.split('/')[-1]] = pearsonr_result

Some weights of BertModel were not initialized from the model checkpoint at /gpfsscratch/rech/aro/urz45id/pretrain-med-data-qual/pretraining/random_25% and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertModel were not initialized from the model checkpoint at /gpfsscratch/rech/aro/urz45id/pretrain-med-data-qual/pretraining/h-index_mid25% and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertModel were not initialized from the model checkpoint at /gpfsscratch/rech/aro/urz45id/pretrain-med-data-qual/pretraining/sjr_top25% and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to

In [104]:
results

{'bert-base-uncased': 0.8641115928790642,
 'random_25%': -0.18442714588209208,
 'h-index_mid25%': -0.38572075126211725,
 'sjr_top25%': -0.2220021055334163,
 'sjr_mid25%': -0.338702970692504,
 'h-index_mid50%': -0.17426184999385272,
 'h-index_top25%': -0.41648110179198267}

# ebm pico preproc

In [1]:
ebm_pico = datasets.load_dataset(
    "bigbio/ebm_pico",
    name="ebm_pico_source",
    split="test",
    cache_dir=".blurb_cache",
)

NameError: name 'datasets' is not defined