In [1]:
!pip install sacremoses

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import json

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: sacremoses
Successfully installed sacremoses-0.1.1


In [2]:
# Directory paths (all located under /kaggle/input/biobert-level-2)
# Folder of per-patient level-1 result CSVs; the processing loop iterates over
# every '.csv' file found here.
bootstrapped_level_1_path = '/kaggle/input/biobert-level-2/BootStrapped-Level-2-BioBert/BootStrapped-Level-1-filter-Bert'
# Folder of per-patient abstract CSVs, looked up by the processing loop as
# "summarized_patient-number<N>-articles.csv".
summarized_abstracts_path = '/kaggle/input/biobert-level-2/BootStrapped-Level-2-BioBert/summarized_abstracts_bert'
# CSV that is read into `patient_histories_df`; the processing loop queries its
# 'Patient ID' and 'Diagnosis' columns.
patient_histories_path = '/kaggle/input/biobert-level-2/BootStrapped-Level-2-BioBert/Patient_Histories_Cleaned.csv'

In [3]:
def extract_patient_number(filename):
    """Extract the patient number embedded in a result/abstract file name.

    Supported shapes include 'Patient-X-results-with-links.csv' (the number is
    its own dash-separated part) and 'summarized_patient-numberX-articles.csv'
    (the number is glued to the end of a part).

    Args:
        filename: File name (not a full path), e.g. an entry of os.listdir().

    Returns:
        The patient number as a string of digits, or None when the name
        contains no recognisable number.
    """
    import re  # local import: `re` is not imported at the top of the notebook

    # Drop the extension so a trailing number ('Patient-7.csv') is not hidden
    # behind '.csv'.
    stem = os.path.splitext(filename)[0]
    parts = stem.split('-')

    # First choice: a dash-separated part that is purely numeric.
    for part in parts:
        if part.isdigit():
            return part

    # Fallback: digits glued to the end of a part, e.g. 'number12' -> '12'.
    for part in parts:
        match = re.search(r'(\d+)$', part)
        if match:
            return match.group(1)

    return None

# Load the patient histories data.
# The processing loop later selects rows whose 'Patient ID' equals
# "Patient-<N>" and reads the 'Diagnosis' value from them.
patient_histories_df = pd.read_csv(patient_histories_path)

In [4]:
from transformers import AutoTokenizer
# Hugging Face checkpoint used for both the tokenizer and the classifier.
# NOTE(review): despite the variable name, this is the plain
# "dmis-lab/biobert-v1.1" checkpoint. The load log printed below this cell
# reports that 'classifier.bias' and 'classifier.weight' are newly initialized,
# i.e. the classification head is untrained, so the labels produced by
# `classifier` are not meaningful until a fine-tuned checkpoint is used here.
bootstrapped_model_name = "dmis-lab/biobert-v1.1"
tokenizer = AutoTokenizer.from_pretrained(bootstrapped_model_name)

from transformers import pipeline

# Sequence-classification pipeline (the load log shows it wraps
# BertForSequenceClassification); called once per input text by the
# processing loop.
classifier = pipeline("sentiment-analysis", model=bootstrapped_model_name, tokenizer=tokenizer)

Downloading (…)okenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/462 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
def get_decision(question):
    """Return the index of the highest-scoring class for ``question``.

    The text is tokenized with the notebook-level ``tokenizer`` (truncated to
    512 tokens) and scored by the model held inside the notebook-level
    ``classifier`` pipeline.

    Args:
        question: A single input string to classify.

    Returns:
        int: Index of the class with the largest logit.
    """
    import torch  # local import: torch is not imported at the top of the notebook

    # The notebook never defines a standalone `model`; reuse the one the
    # pipeline already loaded instead of referencing an undefined name.
    model = classifier.model

    inputs = tokenizer(question, return_tensors="pt", max_length=512, truncation=True)
    # Keep the inputs on the same device as the model weights.
    inputs = inputs.to(model.device)

    with torch.no_grad():
        outputs = model(**inputs)

    predicted_class_idx = outputs.logits.argmax(dim=-1).item()

    return predicted_class_idx

# Default token budget for one classifier input.
MAX_TOKENS = 512

def truncate_text_to_fit(text, max_tokens=MAX_TOKENS):
    """Drop trailing words from ``text`` until it fits the token budget.

    The budget is ``max_tokens - 2`` tokens as counted by the notebook-level
    ``tokenizer`` (the two spare slots are presumably for the special tokens
    the model adds around the input -- TODO confirm).

    Args:
        text: Input string.
        max_tokens: Total token limit; defaults to MAX_TOKENS.

    Returns:
        ``text`` unchanged when it already fits; otherwise the longest prefix
        of its whitespace-separated words (re-joined with single spaces, at
        least one word removed) that fits. May be the empty string.
    """
    budget = max_tokens - 2

    if len(tokenizer.tokenize(text)) <= budget:
        return text

    words = text.split()

    # Binary search for the largest word count whose joined text fits.
    # This relies on the token count not decreasing as words are appended,
    # and needs O(log n) tokenizer calls instead of one per dropped word.
    # It also always terminates, even for a budget that nothing can satisfy
    # (max_tokens < 2), in which case the result is the empty string.
    lo, hi = 0, max(len(words) - 1, 0)
    while lo < hi:
        mid = (lo + hi + 1) // 2
        if len(tokenizer.tokenize(" ".join(words[:mid]))) <= budget:
            lo = mid
        else:
            hi = mid - 1

    return " ".join(words[:lo])

def create_input_text(context, question):
    """Build the classifier prompt from a diagnosis and its context.

    Args:
        context: Text placed after " in this context: ".
        question: Text placed after "is it a relevant diagnosis: ".

    Returns:
        The concatenated prompt string.
    """
    question_part = "is it a relevant diagnosis: " + question
    return question_part + " in this context: " + context

def create_df(patient_id, indexes, pubids, results):
    """Assemble the per-patient results table.

    Args:
        patient_id: Accepted for call-site compatibility; not used here.
        indexes: Values for the 'Index' column.
        pubids: Values for the 'PubID' column.
        results: Values for the 'Result' column.

    Returns:
        A DataFrame with the columns 'Index', 'PubID' and 'Result', in that
        order.
    """
    columns = {'Index': indexes, 'PubID': pubids, 'Result': results}
    return pd.DataFrame(columns)
    
# Translation from the classifier's raw label names to answer strings.
label_mapping = dict(LABEL_0="yes", LABEL_1="no", LABEL_2="maybe")

def map_label(results):
    """Translate the first entry of a classifier output into an answer string.

    Args:
        results: Iterable of dict-like entries, each with a 'label' key.

    Returns:
        The mapped answer for the first entry ("Unknown" when its label is
        not in ``label_mapping``). If anything goes wrong while processing
        the entries (including an empty ``results``), the string
        "Error in processing: <message>" is returned instead of raising.
    """
    try:
        # Every entry is translated (so a malformed later entry still counts
        # as an error), but only the first translation is returned.
        translated = [label_mapping.get(entry['label'], "Unknown") for entry in results]
        return translated[0]
    except Exception as e:
        # Hand the message back as the result for debugging.
        return f"Error in processing: {e}"

In [6]:
# Loop to process and handle each patient's data individually.
# For every level-1 result CSV: keep the 'yes'/'maybe' rows, attach the matching
# summarized abstract and the patient's diagnosis, classify each
# (diagnosis, summary) pair and save the labels to a per-patient CSV.
for file in os.listdir(bootstrapped_level_1_path):
    if not file.endswith('.csv'):
        continue

    patient_number = extract_patient_number(file)
    if patient_number is None:
        continue

    # Load and filter the result file
    result_df = pd.read_csv(os.path.join(bootstrapped_level_1_path, file))
    filtered_results = result_df[result_df['Result'].isin(['yes', 'maybe'])]

    # Load and filter the abstracts file for this patient
    abstracts_filename = f"summarized_patient-number{patient_number}-articles.csv"
    abstracts_path = os.path.join(summarized_abstracts_path, abstracts_filename)
    if not os.path.exists(abstracts_path):
        # Without abstracts there is nothing to classify. Merging against an
        # empty, column-less DataFrame would raise KeyError on 'ID'.
        print(f"Skipping Patient-{patient_number}: abstracts file not found: {abstracts_path}")
        continue
    abstracts_df = pd.read_csv(abstracts_path)
    filtered_abstracts = abstracts_df[abstracts_df['ID'].isin(filtered_results['PubID'])]

    # Get the diagnosis for this patient from the patient history file
    diagnosis_rows = patient_histories_df.loc[
        patient_histories_df['Patient ID'] == f"Patient-{patient_number}", 'Diagnosis'
    ].dropna()
    if diagnosis_rows.empty:
        # .iloc[0] on an empty selection would raise IndexError.
        print(f"Skipping Patient-{patient_number}: no diagnosis found in patient histories")
        continue
    patient_diagnosis = diagnosis_rows.iloc[0]

    # Combine the results and abstracts into one DataFrame and add the diagnosis as a new column
    combined_df = pd.merge(filtered_results, filtered_abstracts, left_on='PubID', right_on='ID', how='left')
    # The left merge leaves NaN summaries for results without an abstract;
    # those rows cannot be turned into input text, so drop them and renumber.
    combined_df = combined_df.dropna(subset=['Summary']).reset_index(drop=True)
    if combined_df.empty:
        continue
    combined_df['Diagnosis'] = patient_diagnosis
    combined_df['input_text'] = combined_df.apply(lambda row: create_input_text(row['Summary'], row['Diagnosis']), axis=1)

    results = []
    pubids = []
    indexes = []
    for idx, row in combined_df.iterrows():
        indexes.append(idx + 1)  # 1-based position within this patient's rows
        pubids.append(row['PubID'])
        results.append(classifier(truncate_text_to_fit(row['input_text'])))

    if results:
        result = create_df(patient_number, indexes, pubids, results)
        # Each raw entry is the classifier's output for one text; reduce it to a label string.
        result['Result'] = result['Result'].apply(map_label)
        new_file_name = f"/kaggle/working/Patient-{patient_number}-final-results-BioBert.csv"
        result.to_csv(new_file_name, index=False)
        print(f"Processed, and saved: {new_file_name}")

Processed, and saved: /kaggle/working/Patient-3-final-results-BioBert.csv
Processed, and saved: /kaggle/working/Patient-7-final-results-BioBert.csv
Processed, and saved: /kaggle/working/Patient-4-final-results-BioBert.csv
Processed, and saved: /kaggle/working/Patient-6-final-results-BioBert.csv
Processed, and saved: /kaggle/working/Patient-5-final-results-BioBert.csv
Processed, and saved: /kaggle/working/Patient-12-final-results-BioBert.csv
Processed, and saved: /kaggle/working/Patient-11-final-results-BioBert.csv
Processed, and saved: /kaggle/working/Patient-9-final-results-BioBert.csv
Processed, and saved: /kaggle/working/Patient-13-final-results-BioBert.csv
Processed, and saved: /kaggle/working/Patient-10-final-results-BioBert.csv
Processed, and saved: /kaggle/working/Patient-14-final-results-BioBert.csv
Processed, and saved: /kaggle/working/Patient-15-final-results-BioBert.csv
Processed, and saved: /kaggle/working/Patient-8-final-results-BioBert.csv
Processed, and saved: /kaggle/wo