In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [2]:
from transformers import BertTokenizer, BertModel

# Load pre-trained model tokenizer and model
tokenizer = BertTokenizer.from_pretrained('michiyasunaga/BioLinkBERT-base')
model = BertModel.from_pretrained('michiyasunaga/BioLinkBERT-base', output_attentions=True)

tokenizer_config.json:   0%|          | 0.00/379 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/225k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/447k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/559 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

In [3]:
import torch

# Prepare data
def plots(text,filename):
    texts=[text]
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", return_attention_mask=True)

    with torch.no_grad():
        outputs = model(**inputs)

    attention = outputs.attentions[-1]
    attention_weights = attention.mean(dim=1)  
    diagonal_attention = attention_weights[:, range(attention_weights.size(1)), range(attention_weights.size(1))]
    # Example of averaging token attention weights for each sentence
    sentence_attention_weights = []
    for i, text in enumerate(texts):
        token_attention_weights = diagonal_attention[i].tolist()
        average_attention = sum(token_attention_weights) / len(token_attention_weights)
        sentence_attention_weights.append(average_attention)

    attention_data = []
    for i, text in enumerate(texts):
        tokens = tokenizer.tokenize(text)
        weights = diagonal_attention[i].tolist()[:len(tokens)]  # Truncate to the length of the number of tokens
        sentence_att_weight = sentence_attention_weights[i]  # Sentence-level attention weight
        for token, weight in zip(tokens, weights):
            attention_data.append({
                'sent_id': text,
                'raw_word': token,
                'normalized_word_weight': weight,
                'sent_att_weight': sentence_att_weight
            })

    attention_df = pd.DataFrame(attention_data)
    predict = 0.5
    context_series = pd.Series({'label': 'Positive', 'name': 'Patient History'})
    normalized_attention_df = attention_df.copy()
    normalized_attention_df['normalized_word_weight'] = normalized_attention_df['normalized_word_weight'] / normalized_attention_df['normalized_word_weight'].max()
    filtered_df = normalized_attention_df[normalized_attention_df['normalized_word_weight'] > 0.45]
    print(filtered_df.head())
    return filtered_df

In [4]:
patient_histories = pd.read_csv("/kaggle/input/complete-dataset/Patient_Histories_Cleaned.csv")
attention_patient_history=[]
for index, row in patient_histories.iterrows():
        results = []
        patient_id = index+1
        patient_history = row['Patient History']
        diagnosis = row['Diagnosis']
        attention_patient_history.append(plots(patient_history,f'patient-{patient_id}-BioBert-Attention.png'))
        

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


                                              sent_id raw_word  \
2   A 30-year-old female presents with bloody disc...        -   
4   A 30-year-old female presents with bloody disc...        -   
6   A 30-year-old female presents with bloody disc...   female   
10  A 30-year-old female presents with bloody disc...      ##y   
12  A 30-year-old female presents with bloody disc...     from   

    normalized_word_weight  sent_att_weight  
2                 1.000000         0.032205  
4                 0.587949         0.032205  
6                 0.671260         0.032205  
10                0.730280         0.032205  
12                0.578835         0.032205  
                                              sent_id         raw_word  \
2   A 65-year-old woman arrives to the ED complain...                -   
6   A 65-year-old woman arrives to the ED complain...            woman   
20  A 65-year-old woman arrives to the ED complain...          medical   
23  A 65-year-old woman arrives

In [5]:
patient_history_list = []
for df in attention_patient_history:
    patient_history_list.append(' '.join(df['raw_word'].astype(str).tolist()))

patient_history_df = pd.DataFrame(patient_history_list, columns=['CombinedRawWords'])


In [6]:
!pip install sacremoses

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import json

Collecting sacremoses
  Obtaining dependency information for sacremoses from https://files.pythonhosted.org/packages/0b/f0/89ee2bc9da434bd78464f288fdb346bc2932f2ee80a90b2a4bbbac262c74/sacremoses-0.1.1-py3-none-any.whl.metadata
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: sacremoses
Successfully installed sacremoses-0.1.1


In [7]:
summarized_abstracts_path = '/kaggle/input/biobert-level-2/BootStrapped-Level-2-BioBert/summarized_abstracts_bert'

In [8]:
from transformers import AutoTokenizer
bootstrapped_model_name = "michiyasunaga/BioLinkBERT-base"
tokenizer = AutoTokenizer.from_pretrained(bootstrapped_model_name)

from transformers import pipeline

classifier = pipeline("sentiment-analysis", model=bootstrapped_model_name, tokenizer=tokenizer)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at michiyasunaga/BioLinkBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
def get_decision(question):
    inputs = tokenizer(question, return_tensors="pt", max_length=512, truncation=True)

    with torch.no_grad():
        outputs = model(**inputs)
    
    predicted_class_idx = outputs.logits.argmax(dim=-1).item()
    
    return predicted_class_idx

MAX_TOKENS = 512

def truncate_text_to_fit(text, max_tokens=MAX_TOKENS):
    token_count = len(tokenizer.tokenize(text))
    if token_count > max_tokens - 2:  
        truncated_text = " ".join(text.split()[:-1])
        while len(tokenizer.tokenize(truncated_text)) > max_tokens - 2:
            truncated_text = " ".join(truncated_text.split()[:-1])
    else:
        truncated_text = text
        
    return truncated_text

def create_input_text(question, context):
    input_text =  "Is this patient history- "+ context + "- related to abstract- " + question
    return input_text

def create_df(patient_id,indexes, pubids, results):
    df = pd.DataFrame({
        'Index': indexes,
        'PubID': pubids,
        'Result': results
    })
    return df
    
label_mapping = {
    "LABEL_0": "yes",
    "LABEL_1": "no",
    "LABEL_2": "maybe"
}

def map_label(result_str):
    try:
        result_str = result_str.replace("'", '"')
        result = json.loads(result_str)
        label = result[0]['label']
        return label_mapping.get(label, "Unknown")
    except:
        return "Error in processing"

In [10]:
def extract_patient_number(filename):
    # Extracting the patient number from filenames like 'summarized_patient-numberX-articles.csv'
    # or 'Patient-X-results-with-links.csv'
    parts = filename.split('-')
    for part in parts:
        if part.isdigit():
            return part
    return none

In [11]:
summarized_abstracts_path = '/kaggle/input/complete-dataset/summarized_abstracts_bert'

In [12]:
# Loop to process and handle each patient's data individually
for file in os.listdir('/kaggle/input/complete-dataset/BootStrapped-Level-2-BioLinkBert'):
    if file.endswith('.csv'):
        patient_number = extract_patient_number(file)
        if patient_number is none:
            continue

        # Load and filter the result file
        result_df = pd.read_csv(os.path.join('/kaggle/input/complete-dataset/BootStrapped-Level-2-BioLinkBert', file))
        filtered_results = result_df[result_df['Result'].isin(['yes', 'maybe'])]

        # Load and filter the abstracts file for this patient
        abstracts_filename = f"summarized_patient-number{patient_number}-articles.csv"
        abstracts_path = os.path.join(summarized_abstracts_path, abstracts_filename)
        if os.path.exists(abstracts_path):
            abstracts_df = pd.read_csv(abstracts_path)
            filtered_abstracts = abstracts_df[abstracts_df['ID'].isin(filtered_results['PubID'])]
        else:
            filtered_abstracts = pd.DataFrame()
        # Get the diagnosis for this patient from the patient history file
        history = patient_history_list[int(patient_number)-1]
#         patient_history= 
#         print(patient_history)
        # Combine the results and abstracts into one DataFrame and add the diagnosis as a new column
        combined_df = pd.merge(filtered_results, filtered_abstracts, left_on='PubID', right_on='ID', how='left')
        combined_df['History'] = history
        combined_df['input_text'] = combined_df.apply(lambda row: create_input_text(row['Summary'], row['History']), axis=1)
        results = []
        pubids = []
        indexes = []
        for idx, row in combined_df.iterrows():
            indexes.append(idx + 1)
            pubids.append(row['PubID'])
            results.append(classifier(truncate_text_to_fit(row['input_text'])))

        if results:
            result=create_df(patient_number,indexes, pubids, results)
            result['Result'] = result['Result'].apply(map_label)
            new_file_name = f"/kaggle/working/Patient-{patient_number}-final-results-BioLinkBert.csv"
            combined_df.to_csv(new_file_name, index=False)
            print(f"Processed, and saved: {new_file_name}")

Processed, and saved: /kaggle/working/Patient-12-final-results-BioLinkBert.csv
Processed, and saved: /kaggle/working/Patient-15-final-results-BioLinkBert.csv
Processed, and saved: /kaggle/working/Patient-1-final-results-BioLinkBert.csv
Processed, and saved: /kaggle/working/Patient-8-final-results-BioLinkBert.csv
Processed, and saved: /kaggle/working/Patient-14-final-results-BioLinkBert.csv
Processed, and saved: /kaggle/working/Patient-10-final-results-BioLinkBert.csv
Processed, and saved: /kaggle/working/Patient-7-final-results-BioLinkBert.csv
Processed, and saved: /kaggle/working/Patient-16-final-results-BioLinkBert.csv
Processed, and saved: /kaggle/working/Patient-9-final-results-BioLinkBert.csv
Processed, and saved: /kaggle/working/Patient-3-final-results-BioLinkBert.csv
Processed, and saved: /kaggle/working/Patient-13-final-results-BioLinkBert.csv
Processed, and saved: /kaggle/working/Patient-2-final-results-BioLinkBert.csv
Processed, and saved: /kaggle/working/Patient-5-final-resu