In [1]:
!pip install sacremoses
from transformers import pipeline

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: sacremoses
Successfully installed sacremoses-0.1.1


In [2]:
from transformers import AutoTokenizer, pipeline

# Checkpoint used for both the tokenizer and the classification pipeline.
bootstrapped_model_name = "dmis-lab/biobert-v1.1"
tokenizer = AutoTokenizer.from_pretrained(bootstrapped_model_name)

# NOTE(review): the load log for this cell reports that 'classifier.weight'
# and 'classifier.bias' are newly initialized for this checkpoint, i.e. the
# classification head is untrained -- confirm that is intended before relying
# on the predicted labels.
classifier = pipeline(
    task="sentiment-analysis",
    model=bootstrapped_model_name,
    tokenizer=tokenizer,
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/462 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-v1.1 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
import torch
import pandas as pd

In [4]:
def get_decision(question, model=None):
    """Return the index of the highest-scoring class for ``question``.

    Parameters
    ----------
    question : str
        Text to classify. It is tokenized with the notebook-level
        ``tokenizer`` and truncated to at most 512 tokens.
    model : callable, optional
        Model invoked as ``model(**inputs)`` whose result exposes ``.logits``.
        Defaults to the model wrapped by the notebook-level ``classifier``
        pipeline. (The original code read an undefined global ``model`` and
        raised ``NameError`` on a fresh kernel.)

    Returns
    -------
    int
        Position of the largest logit along the last dimension.
    """
    if model is None:
        # No cell defines a standalone `model`; reuse the one the pipeline
        # already loaded instead of depending on hidden kernel state.
        model = classifier.model

    inputs = tokenizer(question, return_tensors="pt", max_length=512, truncation=True)

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        outputs = model(**inputs)

    # .item() assumes a single input sequence (one row of logits).
    predicted_class_idx = outputs.logits.argmax(dim=-1).item()

    return predicted_class_idx

In [5]:
# Token limit used when truncating inputs; matches the max_length=512 used
# elsewhere in this notebook.
MAX_TOKENS = 512


def truncate_text_to_fit(text, max_tokens=MAX_TOKENS, tokenize=None):
    """Drop trailing words from ``text`` until it fits the token budget.

    The budget is ``max_tokens - 2`` (presumably leaving room for the two
    special tokens a BERT-style tokenizer adds -- TODO confirm).

    Parameters
    ----------
    text : str
        Text to shorten.
    max_tokens : int, optional
        Total token limit; defaults to ``MAX_TOKENS``.
    tokenize : callable, optional
        Function mapping a string to a sequence of tokens. Defaults to the
        notebook-level ``tokenizer.tokenize``.

    Returns
    -------
    str
        ``text`` unchanged when it already fits; otherwise the longest prefix
        of its whitespace-separated words (re-joined with single spaces) that
        fits the budget. Returns ``""`` when not even one word fits.
    """
    if tokenize is None:
        tokenize = tokenizer.tokenize

    budget = max_tokens - 2
    if len(tokenize(text)) <= budget:
        return text

    words = text.split()

    # Binary search for the largest word count (at most len(words) - 1) whose
    # joined prefix fits. The original dropped one word at a time and
    # re-tokenized the whole text on each step, which is quadratic for long
    # inputs. This relies on the token count not decreasing as words are
    # appended.
    lo, hi = 0, len(words) - 1
    while lo < hi:
        mid = (lo + hi + 1) // 2
        if len(tokenize(" ".join(words[:mid]))) <= budget:
            lo = mid
        else:
            hi = mid - 1

    return " ".join(words[:lo])

In [6]:
def create_input_text(question, context):
    """Build the relevance prompt given to the classifier.

    Parameters
    ----------
    question : str
        The question text (the patient history in this notebook).
    context : str
        The text the question is compared against (an abstract summary).

    Returns
    -------
    str
        ``"is the question: <question> relevant to this context: <context>"``.
    """
    # A space now separates the question from the following words; the
    # original fused the question's last word with "relevent". The spelling is
    # also corrected so the tokenizer sees the real word "relevant".
    input_text = "is the question: " + question + " relevant to this context: " + context
    return input_text

In [7]:
def save_to_csv(patient_id,indexes, pubids, results):
    """Write one patient's results to ``<patient_id>-results.csv``.

    The file is created in the current working directory with the columns
    ``Index``, ``PubID`` and ``Result`` (in that order) and no index column.
    The three sequences are expected to have the same length.
    """
    output_name = f"{patient_id}-results.csv"
    columns = {'Index': indexes, 'PubID': pubids, 'Result': results}
    pd.DataFrame(columns).to_csv(output_name, index=False)

In [8]:
# For every patient, classify each summarized abstract against the patient's
# history and write the raw classifier outputs to a per-patient CSV.
patient_histories_path = '/kaggle/input/bert-patient/Patient_Histories_Cleaned.csv'
patient_histories_df = pd.read_csv(patient_histories_path)

for patient_id, patient_history in zip(
    patient_histories_df['Patient ID'], patient_histories_df['Patient History']
):
    # The per-patient file name embeds the ID with "Patient-" replaced by "number".
    patient_number = patient_id.replace("Patient-", "number")
    summarized_abstracts_path = f'/kaggle/input/bert-patient/summarized_patient_cases_bert/summarized_patient_cases_bert/summarized_patient-{patient_number}-articles.csv'

    # Rows with any missing value are discarded before building the prompts.
    summarized_abstracts_df = pd.read_csv(summarized_abstracts_path).dropna()
    summarized_abstracts_df['input_text'] = summarized_abstracts_df['Summary'].apply(
        lambda context: create_input_text(patient_history, context)
    )

    results, pubids, indexes = [], [], []
    for idx, row in summarized_abstracts_df.iterrows():
        # idx is the row label from the CSV (pre-dropna), shifted to 1-based.
        indexes.append(idx + 1)
        pubids.append(row['ID'])
        prompt = truncate_text_to_fit(row['input_text'])
        results.append(classifier(prompt))

    # Patients with no usable abstracts produce no output file.
    if len(results) > 0:
        save_to_csv(patient_id, indexes, pubids, results)
        

In [9]:
import ast
import json
import os

import pandas as pd

# Maps the classifier's generic label names to the answers used downstream.
label_mapping = {
    "LABEL_0": "yes",
    "LABEL_1": "no",
    "LABEL_2": "maybe"
}

def map_label(result_str):
    """Translate a stringified classifier result into "yes"/"no"/"maybe".

    Parameters
    ----------
    result_str : str
        Text form of a list whose first element is a dict with a ``'label'``
        key, e.g. ``"[{'label': 'LABEL_0', 'score': 0.9}]"``. Both Python-repr
        and JSON spellings are accepted.

    Returns
    -------
    str
        The mapped answer, ``"Unknown"`` when the label is not in
        ``label_mapping``, or ``"Error in processing"`` when the input cannot
        be parsed or lacks the expected structure.
    """
    try:
        try:
            # The CSV stores Python reprs (single quotes, True/None), which
            # literal_eval parses directly and safely.
            result = ast.literal_eval(result_str)
        except (ValueError, TypeError, SyntaxError):
            # Fall back to the original approach for JSON-only spellings
            # such as true/false/null.
            result = json.loads(result_str.replace("'", '"'))
        label = result[0]['label']
        return label_mapping.get(label, "Unknown")
    except Exception:
        # Best-effort conversion: a bad cell yields a marker instead of
        # aborting the whole file. Unlike a bare `except:`, this does not
        # swallow KeyboardInterrupt or SystemExit.
        return "Error in processing"

# Convert each patient's raw results file (patients 1-16) into a file with
# human-readable labels, then remove the raw file.
for i in range(1, 17):
    file_name = f"/kaggle/working/Patient-{i}-results.csv"
    
    try:
        # Read the file
        decisions = pd.read_csv(file_name)
        
        # Apply the map_label function
        decisions['Result'] = decisions['Result'].apply(map_label)
        
        # Save the file with a new suffix
        new_file_name = f"/kaggle/working/Patient-{i}-results-converted.csv"
        decisions.to_csv(new_file_name, index=False)
        
        # Delete the original file (only reached after the converted copy is written)
        os.remove(file_name)

        print(f"Processed, saved, and deleted: {new_file_name}")

    # Was misspelled `FilenotFoundError`, so a missing file raised NameError
    # instead of being reported and skipped.
    except FileNotFoundError:
        print(f"File not found: {file_name}")

Processed, saved, and deleted: /kaggle/working/Patient-1-results-converted.csv
Processed, saved, and deleted: /kaggle/working/Patient-2-results-converted.csv
Processed, saved, and deleted: /kaggle/working/Patient-3-results-converted.csv
Processed, saved, and deleted: /kaggle/working/Patient-4-results-converted.csv
Processed, saved, and deleted: /kaggle/working/Patient-5-results-converted.csv
Processed, saved, and deleted: /kaggle/working/Patient-6-results-converted.csv
Processed, saved, and deleted: /kaggle/working/Patient-7-results-converted.csv
Processed, saved, and deleted: /kaggle/working/Patient-8-results-converted.csv
Processed, saved, and deleted: /kaggle/working/Patient-9-results-converted.csv
Processed, saved, and deleted: /kaggle/working/Patient-10-results-converted.csv
Processed, saved, and deleted: /kaggle/working/Patient-11-results-converted.csv
Processed, saved, and deleted: /kaggle/working/Patient-12-results-converted.csv
Processed, saved, and deleted: /kaggle/working/Pa