In [17]:
import os
import numpy as np
from datasets import load_from_disk, load_dataset
from transformers import AutoTokenizer

# Project root path: DSDIR must name the directory that contains "HuggingFace_Models".
dsdir = os.getenv("DSDIR")
if dsdir is None:
    # os.getenv returns None for an unset variable; without this check the failure
    # would surface as an opaque TypeError inside os.path.join below.
    raise RuntimeError(
        "The DSDIR environment variable is not set; it must point to the directory "
        "that contains 'HuggingFace_Models'."
    )
# Directory holding the locally stored Hugging Face model checkpoints.
idr_models_dir = os.path.join(dsdir, "HuggingFace_Models")

# PubMed pretraining set

In [3]:
# Load the dataset saved under the relative path "pubmed_preproc" (the PubMed pretraining set).
ds = load_from_disk("pubmed_preproc")

Loading dataset from disk:   0%|          | 0/215 [00:00<?, ?it/s]

In [8]:
# Show the dataset summary (splits, column names, row counts).
ds

DatasetDict({
    train: Dataset({
        features: ['h-index', 'sjr', 'input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask', 'random'],
        num_rows: 57940603
    })
    validation: Dataset({
        features: ['h-index', 'sjr', 'input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 3049506
    })
})

In [7]:
# Inspect the "random" value of the first training example; in the dataset summary
# only the train split lists this column.
ds["train"][0]["random"] 

2.3444218515250483

# BIOSSES sentence similarity

In [45]:
# Fetch the BIOSSES dataset, caching its files under the relative ".blurb_cache" directory.
# NOTE: trust_remote_code=True allows the dataset repository's loading script to run
# on this machine; keep it only for sources you trust.
biosses = load_dataset(
    "bigbio/biosses",
    cache_dir=".blurb_cache",
    trust_remote_code=True,
)

In [40]:
def average_annotators(examples):
    """Add an ``annotator_avg`` entry holding the mean of all annotator columns.

    Every key whose name contains "annotator" is collected, and the values are
    averaged position by position (``axis=0`` averages across the collected
    columns). Written for ``Dataset.map(..., batched=True)``, where each value is
    presumably a list with one score per example.

    Args:
        examples: Mapping from column name to a sequence of values.

    Returns:
        The same mapping object, with the new ``annotator_avg`` entry (a NumPy
        array) added in place.
    """
    annotator_scores = []
    for column in examples:
        if "annotator" in column:
            annotator_scores.append(examples[column])
    examples["annotator_avg"] = np.mean(annotator_scores, axis=0)
    return examples
# Replace the five per-annotator score columns (annotator_a to annotator_e) with the
# single averaged column produced by average_annotators.
# NOTE: `biosses` is rebound to the mapped result, so the removed columns are no
# longer present if this cell is executed a second time.
biosses = biosses.map(
    average_annotators,
    batched=True,
    remove_columns=["annotator_" + suffix for suffix in ("a", "b", "c", "d", "e")],
)

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [44]:
# Look at one training example after the annotator columns were averaged.
biosses["train"][2]

{'id': 26,
 'document_id': 27,
 'text_1': 'We then sought to reassess the regulation of miR-223 in the exactly same experimental system adopted in the previous work ',
 'text_2': 'Importantly, our reassessment revealed that this conserved promoter is probably active in the induction of miR-223 during All-trans retinoic acid (ATRA)-induced differentiation of the APL cell line, NB4 cells, which is the main experimental system adopted in the previous study ',
 'annotator_avg': 2.4}

In [5]:
# Load the tokenizer stored in the "bert-base-uncased" subdirectory of idr_models_dir.
tokenizer = AutoTokenizer.from_pretrained(f"{idr_models_dir}/bert-base-uncased")

In [6]:
# Comma-separated names of the text columns.
# NOTE(review): preprocess_function hard-codes its own column list, so this string
# is not read there.
text_column_names = "text_1,text_2"
# Separator inserted between the text columns when they are joined into one sentence.
text_column_delimiter = " [SEP] "
# Optional label -> id mapping; while it is None, preprocess_function skips its
# label-mapping branch.
label_to_id = None

In [7]:
def preprocess_function(examples):
    """Join the text columns of a batch into one sentence per example and tokenize it.

    Relies on the notebook-level ``tokenizer``, ``text_column_delimiter`` and
    ``label_to_id``.

    Args:
        examples: Batch mapping column names to equal-length lists, as passed by
            ``Dataset.map(..., batched=True)``. Must contain "text_1" and "text_2".
            A "sentence" entry with the joined strings is added to it; the
            original text columns are left unmodified.

    Returns:
        The tokenizer output for the joined sentences (truncated to at most 512
        tokens, no padding). A "label" entry is added when ``label_to_id`` is set
        and the batch has a "label" column.
    """
    # Hard-coded column list; named differently from the notebook-level
    # ``text_column_names`` string so the two cannot be confused.
    columns = ["text_1", "text_2"]
    # Build fresh strings rather than appending to a column's list in place, so the
    # input text columns are never modified as a side effect.
    sentences = [
        text_column_delimiter.join(parts)
        for parts in zip(*(examples[column] for column in columns))
    ]
    examples["sentence"] = sentences
    # Tokenize the texts
    result = tokenizer(sentences, padding=False, max_length=512, truncation=True)
    if label_to_id is not None and "label" in examples:
        # NOTE(review): is_multi_label and multi_labels_to_ids are not defined in the
        # cells visible here; they must exist before label_to_id is set.
        if is_multi_label:
            result["label"] = [multi_labels_to_ids(l) for l in examples["label"]]
        else:
            # -1 is passed through unchanged; every other label is looked up by its string form.
            result["label"] = [(label_to_id[str(l)] if l != -1 else -1) for l in examples["label"]]
    return result

In [8]:
# Tokenize the dataset batch-wise; `desc` is the label shown on the progress bars.
biosses = biosses.map(preprocess_function, batched=True, desc="Running tokenizer on dataset")

Running tokenizer on dataset:   0%|          | 0/64 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/16 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/20 [00:00<?, ? examples/s]

In [9]:
# Inspect the joined sentence of the first training example.
# NOTE(review): preprocess_function returns only the tokenizer output, and the saved
# output shows "[SEP]" without the surrounding spaces of text_column_delimiter, so
# this output presumably comes from an earlier version of the code; re-run from a
# fresh kernel to confirm that a "sentence" column still exists.
biosses["train"][0]["sentence"]

'It has recently been shown that Craf is essential for Kras G12D-induced NSCLC.[SEP]It has recently become evident that Craf is essential for the onset of Kras-driven non-small cell lung cancer. '

In [66]:
# Decode the first training example's token ids back to text.
# NOTE(review): the saved output contains [PAD] tokens although preprocess_function
# tokenizes with padding=False, so the output appears to come from an earlier run.
tokenizer.decode(biosses["train"][0]["input_ids"])

'[CLS] it has recently been shown that craf is essential for kras g12d - induced nsclc. [SEP] it has recently become evident that craf is essential for the onset of kras - driven non - small cell lung cancer. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [67]:
# Print each token id of the first training example next to its decoded form.
for token in biosses["train"][0]["input_ids"]:
    # Decoding one id at a time exposes the individual word pieces (e.g. "##af").
    decoded = tokenizer.decode(token)
    print(token,decoded)

101 [CLS]
2009 it
2038 has
3728 recently
2042 been
3491 shown
2008 that
13675 cr
10354 ##af
2003 is
6827 essential
2005 for
1047 k
8180 ##ras
1043 g
12521 ##12
2094 ##d
1011 -
10572 induced
24978 ns
20464 ##cl
2278 ##c
1012 .
102 [SEP]
2009 it
2038 has
3728 recently
2468 become
10358 evident
2008 that
13675 cr
10354 ##af
2003 is
6827 essential
2005 for
1996 the
14447 onset
1997 of
1047 k
8180 ##ras
1011 -
5533 driven
2512 non
1011 -
2235 small
3526 cell
11192 lung
4456 cancer
1012 .
102 [SEP]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD]
0 [PAD