In [3]:
# Dependencies (uncomment on a fresh environment; %pip installs into the running kernel):
#%pip install transformers tqdm gcsfs

In [2]:
import torch
# Environment check: report the PyTorch build and the CUDA device PyTorch can see, if any.
print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print(
    "GPU detected: " + torch.cuda.get_device_name(0)
    if torch.cuda.is_available()
    else "No GPU detected."
)


PyTorch version: 1.13.1+cu117
CUDA available: True
GPU detected: Tesla T4


In [4]:
import os
import pandas as pd
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm
import gcsfs

In [5]:
import torch
# Same environment report as the earlier check cell: PyTorch build, CUDA visibility,
# and the name of device 0 when a GPU is present.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if not torch.cuda.is_available():
    print("No GPU detected.")
else:
    print(f"GPU detected: {torch.cuda.get_device_name(0)}")

PyTorch version: 1.13.1+cu117
CUDA available: True
GPU detected: Tesla T4


In [6]:
def predict_long_text_biomed(
    text: str,
    pipe,
    max_length=512,
    stride=256
):
    """Classify ``text`` with ``pipe``, voting over overlapping windows when it is too long.

    Text that fits in ``max_length`` tokens (special tokens included) is passed to
    ``pipe`` unchanged and the pipeline's own first result is returned. Longer text is
    cut into token windows; each window is decoded back to a string, classified on its
    own, and the most frequent label across windows wins.

    Args:
        text: The document to classify.
        pipe: A callable exposing ``.tokenizer`` whose call returns a list with a dict
            containing a ``"label"`` key as first item (e.g. a Hugging Face
            text-classification pipeline).
        max_length: Maximum number of tokens per model input, special tokens included.
        stride: Number of tokens shared by two consecutive windows (the overlap).

    Returns:
        For short text, ``pipe(text)[0]``. For long text, a dict with ``"label"`` (the
        majority label; ties go to the label seen first) and ``"score"`` (the fraction
        of windows that voted for it -- a vote share, not a model probability).

    Raises:
        ValueError: If ``max_length`` leaves no room for content tokens, or if
            ``stride`` is negative or not smaller than ``max_length``. The latter
            previously made the window loop spin forever on long text.
    """
    tokenizer = pipe.tokenizer

    # The pipeline re-adds special tokens ([CLS]/[SEP] for BERT) to every window it is
    # given, so each window may only hold `max_length - n_special` content tokens.
    # Otherwise every window overshoots the limit and its tail is truncated away.
    n_special = tokenizer.num_special_tokens_to_add(pair=False)
    window = max_length - n_special
    if window <= 0:
        raise ValueError(
            f"max_length={max_length} leaves no room for content after "
            f"{n_special} special tokens"
        )
    if not 0 <= stride < max_length:
        raise ValueError(
            f"stride must satisfy 0 <= stride < max_length, got stride={stride}, "
            f"max_length={max_length}"
        )

    content_ids = tokenizer(text, add_special_tokens=False)["input_ids"]
    total_tokens = len(content_ids)

    # Same test as "tokens including specials <= max_length".
    if total_tokens <= window:
        return pipe(text)[0]

    # Cap the overlap so that the window start always moves forward by at least one token.
    overlap = min(stride, window - 1)
    step = window - overlap

    subchunk_sentiments = []
    for start in range(0, total_tokens, step):
        sub_ids = content_ids[start:start + window]
        # Decoding and re-tokenizing is not guaranteed to round-trip to the exact same
        # ids, so the pipeline's own truncation remains a useful safety net.
        sub_text = tokenizer.decode(sub_ids, skip_special_tokens=True)
        subchunk_sentiments.append(pipe(sub_text)[0])
        if start + window >= total_tokens:
            # This window already reached the end of the text; a further one would
            # only repeat the tail.
            break

    # Majority vote over the per-window labels.
    label_counts = {}
    for chunk_res in subchunk_sentiments:
        lbl = chunk_res["label"]
        label_counts[lbl] = label_counts.get(lbl, 0) + 1

    overall_label = max(label_counts, key=label_counts.get)
    overall_score = label_counts[overall_label] / len(subchunk_sentiments)
    return {"label": overall_label, "score": overall_score}


In [7]:
def process_chunk(
    chunk_file: str,
    output_folder: str,
    biomedbert_model: str,
    max_length: int = 512,
    stride: int = 256,
    batch_size: int = 500,  # Process in batches
    text_column: str = "abstract",
    uid_column: str = "uid"
):
    """Label every row of one parquet chunk on GCS and write the result back to GCS.

    The output object is ``{output_folder}/labeled_{basename(chunk_file)}``. If it
    already exists the chunk is skipped, which makes an interrupted run resumable.

    Args:
        chunk_file: GCS path of the input parquet file.
        output_folder: GCS prefix that receives the labeled parquet file.
        biomedbert_model: Model name or path passed to ``from_pretrained``.
        max_length: Maximum tokens per model input.
        stride: Token overlap between windows for long texts.
        batch_size: Number of rows per progress-bar tick. Rows are still sent to the
            pipeline one at a time; this does not batch GPU inference.
        text_column: Column holding the text to classify.
        uid_column: Column holding the row identifier, written out as ``UID``.

    Returns:
        None. The result is a parquet file with columns ``UID``, ``label``, ``score``.
        Rows whose text is not a string (e.g. a missing abstract) get a null label and
        a NaN score instead of aborting the whole chunk.
    """
    output_file = f"{output_folder}/labeled_{os.path.basename(chunk_file)}"
    fs = gcsfs.GCSFileSystem()
    if fs.exists(output_file):
        print(f"Chunk {chunk_file} already processed. Skipping.")
        return

    with fs.open(chunk_file, 'rb') as f:
        df_chunk = pd.read_parquet(f)
    print(f"Processing {len(df_chunk)} rows from {chunk_file}...")

    pipe = _get_pipeline(biomedbert_model, max_length)

    results = []
    n_missing = 0
    for start in tqdm(range(0, len(df_chunk), batch_size), desc="Batch Processing"):
        sub_batch = df_chunk.iloc[start:start + batch_size]
        for uid, text in zip(sub_batch[uid_column], sub_batch[text_column]):
            if isinstance(text, str):
                prediction = predict_long_text_biomed(
                    text=text,
                    pipe=pipe,
                    max_length=max_length,
                    stride=stride
                )
            else:
                # The tokenizer rejects non-string input; one missing abstract should
                # not throw away the predictions already computed for this chunk.
                n_missing += 1
                prediction = {"label": None, "score": float("nan")}
            results.append({
                "UID": uid,
                "label": prediction["label"],
                "score": prediction["score"]
            })

    if n_missing:
        print(f"{n_missing} rows in {chunk_file} had no text and were left unlabeled.")

    # Explicit columns keep the schema stable even when the chunk has no rows.
    output_df = pd.DataFrame(results, columns=["UID", "label", "score"])
    with fs.open(output_file, 'wb') as f:
        output_df.to_parquet(f, index=False)
    print(f"Saved labeled chunk to {output_file}")


# Pipelines already built in this kernel, keyed by (model, max_length, device), so that
# consecutive chunks reuse the loaded weights instead of reloading them from disk.
_PIPELINE_CACHE = {}


def _get_pipeline(biomedbert_model: str, max_length: int):
    """Build (once) and return the sentiment-analysis pipeline for ``biomedbert_model``."""
    # Use GPU 0 when CUDA is available, otherwise fall back to CPU (-1) instead of failing.
    device = 0 if torch.cuda.is_available() else -1
    key = (biomedbert_model, max_length, device)
    if key not in _PIPELINE_CACHE:
        tokenizer = AutoTokenizer.from_pretrained(biomedbert_model)
        model = AutoModelForSequenceClassification.from_pretrained(biomedbert_model)
        _PIPELINE_CACHE[key] = pipeline(
            "sentiment-analysis",
            model=model,
            tokenizer=tokenizer,
            truncation=True,
            max_length=max_length,
            device=device
        )
    return _PIPELINE_CACHE[key]

In [8]:
def biomedbert_sequential_processing(
    input_folder: str,
    output_folder: str,
    biomedbert_model: str,
    max_length: int = 512,
    stride: int = 256,
    text_column: str = "abstract",
    uid_column: str = "uid",
    specific_chunk: str = None
):
    """Label parquet chunks one after another with ``process_chunk``.

    Args:
        input_folder: GCS prefix searched for ``*.parquet`` chunk files.
        output_folder: GCS prefix that receives the labeled files.
        biomedbert_model: Model name or path forwarded to ``process_chunk``.
        max_length: Maximum tokens per model input.
        stride: Token overlap between windows for long texts.
        text_column: Column holding the text to classify.
        uid_column: Column holding the row identifier.
        specific_chunk: If given, only this chunk file is processed and
            ``input_folder`` is not listed.

    Returns:
        None. All results are written by ``process_chunk``.
    """
    # No os.makedirs(output_folder): every read and write goes through gcsfs, and object
    # storage needs no pre-created directories. Calling os.makedirs on a "gs://..."
    # string only created a stray local "gs:/..." directory tree.
    shared_kwargs = dict(
        output_folder=output_folder,
        biomedbert_model=biomedbert_model,
        max_length=max_length,
        stride=stride,
        text_column=text_column,
        uid_column=uid_column
    )

    if specific_chunk:
        process_chunk(chunk_file=specific_chunk, **shared_kwargs)
        return

    # Only list the bucket when the listing is actually needed.
    fs = gcsfs.GCSFileSystem()
    chunk_files = list(fs.glob(f"{input_folder}/*.parquet"))
    if not chunk_files:
        print(f"No .parquet chunks found under {input_folder}.")
        return

    for chunk_file in tqdm(chunk_files, desc="Processing Chunks"):
        process_chunk(chunk_file=chunk_file, **shared_kwargs)

In [None]:
if __name__ == "__main__":
    # Driver cell: label every parquet chunk under input_dir and write the labeled
    # files under output_dir. Both locations are GCS paths.
    input_dir = "gs://pubmed_123/sent_chunks"
    output_dir = "gs://pubmed_123/labeled_chunks"
    # NOTE(review): the recorded output of this cell reports that 'classifier.bias' and
    # 'classifier.weight' were newly initialized for this checkpoint and that the model
    # should be trained on a downstream task first. The labels produced with it are
    # therefore presumably not meaningful -- confirm, or point this at a fine-tuned
    # sequence-classification checkpoint.
    biomedbert_model = "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract"

    # Leave as None to process every chunk found under input_dir; set to one chunk's
    # path to process only that file.
    specific_chunk = None  # Example: "gs://pubmed_123/sent_chunks/sent_chunks_chunk_1.parquet"

    biomedbert_sequential_processing(
        input_folder=input_dir,
        output_folder=output_dir,
        biomedbert_model=biomedbert_model,
        max_length=512,
        stride=256,
        text_column="abstract",
        uid_column="uid",
        specific_chunk=specific_chunk
    )

Processing Chunks:   0%|          | 0/11 [00:00<?, ?it/s]

Chunk pubmed_123/sent_chunks/chunk_1.parquet already processed. Skipping.
Processing 100000 rows from pubmed_123/sent_chunks/chunk_10.parquet...


pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda:0


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]


Batch Processing:   0%|          | 0/200 [00:00<?, ?it/s][AYou seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset

Batch Processing:   0%|          | 1/200 [00:12<42:06, 12.70s/it][A
Batch Processing:   1%|          | 2/200 [00:24<39:50, 12.07s/it][A
Batch Processing:   2%|▏         | 3/200 [00:35<38:25, 11.70s/it][A
Batch Processing:   2%|▏         | 4/200 [00:48<39:08, 11.98s/it][A
Batch Processing:   2%|▎         | 5/200 [01:00<39:17, 12.09s/it][A
Batch Processing:   3%|▎         | 6/200 [01:11<38:15, 11.83s/it][A
Batch Processing:   4%|▎         | 7/200 [01:23<37:51, 11.77s/it][A
Batch Processing:   4%|▍         | 8/200 [01:34<37:02, 11.58s/it][A
Batch Processing:   4%|▍         | 9/200 [01:46<37:02, 11.63s/it][A
Batch Processing:   5%|▌         | 10/200 [01:58<37:23, 11.81s/it][A
Batch Processing:   6%|▌         | 11/200 [02:10<37:37, 11.94s/it][A
Batch Processing:   6%|▌         | 12/200 [02:21<36:47, 11.74s/it][