In [1]:
!pip install sacremoses
from transformers import pipeline

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: sacremoses
Successfully installed sacremoses-0.1.1


In [2]:
from transformers import BioGptForSequenceClassification, BioGptTokenizer

model_name = "stanford-crfm/BioMedLM"
model = BioGptForSequenceClassification.from_pretrained(model_name, num_labels=3, problem_type="multi_label_classification")
tokenizer = BioGptTokenizer.from_pretrained(model_name)

from transformers import pipeline

classifier = pipeline("sentiment-analysis", model="gpt2-medium", tokenizer="gpt2-medium")

Downloading (…)lve/main/config.json:   0%|          | 0.00/876 [00:00<?, ?B/s]

You are using a model of type gpt2 to instantiate a model of type biogpt. This is not supported for all configurations of models and can yield errors.


Downloading pytorch_model.bin:   0%|          | 0.00/10.7G [00:00<?, ?B/s]

Some weights of BioGptForSequenceClassification were not initialized from the model checkpoint at stanford-crfm/BioMedLM and are newly initialized: ['layers.5.self_attn.q_proj.weight', 'layers.7.self_attn.v_proj.weight', 'layers.5.self_attn_layer_norm.weight', 'layers.14.self_attn.v_proj.weight', 'layers.5.fc2.weight', 'layers.21.self_attn.v_proj.weight', 'layers.16.self_attn.k_proj.bias', 'layers.22.final_layer_norm.weight', 'layers.17.self_attn.q_proj.weight', 'layers.14.self_attn_layer_norm.weight', 'layers.22.self_attn.q_proj.weight', 'layers.2.fc1.weight', 'layers.15.self_attn_layer_norm.bias', 'layers.3.self_attn.v_proj.weight', 'layers.10.final_layer_norm.bias', 'layers.23.fc1.weight', 'layers.23.self_attn.q_proj.bias', 'layers.4.self_attn.out_proj.weight', 'layers.18.self_attn.k_proj.weight', 'layers.17.self_attn.out_proj.bias', 'layers.15.final_layer_norm.weight', 'layers.0.final_layer_norm.weight', 'layers.15.self_attn.q_proj.weight', 'layers.22.self_attn.v_proj.bias', 'layer

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/602k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/276k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/267 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'BioGptTokenizer'.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading (…)lve/main/config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2-medium and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [3]:
import torch
import pandas as pd

In [4]:
def get_decision(question):
    inputs = tokenizer(question, return_tensors="pt", max_length=512, truncation=True)

    with torch.no_grad():
        outputs = model(**inputs)
    
    predicted_class_idx = outputs.logits.argmax(dim=-1).item()
    
    return predicted_class_idx

In [5]:
MAX_TOKENS = 512

def truncate_text_to_fit(text, max_tokens=MAX_TOKENS):
    token_count = len(tokenizer.tokenize(text))
    if token_count > max_tokens - 2:  
        truncated_text = " ".join(text.split()[:-1])
        while len(tokenizer.tokenize(truncated_text)) > max_tokens - 2:
            truncated_text = " ".join(truncated_text.split()[:-1])
    else:
        truncated_text = text
        
    return truncated_text

In [6]:
def create_input_text(question, context):
    input_text = "question: "+ question + "context: " + context
    return input_text

In [7]:
def save_to_csv(patient_id,indexes, pubids, results):
    df = pd.DataFrame({
        'Index': indexes,
        'PubID': pubids,
        'Result': results
    })
    df.to_csv(str(patient_id)+'-results.csv', index=False)

In [8]:
patient_histories_path = '/kaggle/input/gpt-patient/Patient_Histories_Cleaned.csv'
patient_histories_df = pd.read_csv(patient_histories_path)

for patient_id, patient_history in patient_histories_df[['Patient ID', 'Patient History']].values:
    patient_number = patient_id.replace("Patient-", "number")
    summarized_abstracts_path = f'/kaggle/input/gpt-patient/summarized_patient_cases_gpt/summarized_patient_cases_gpt/summarized_patient-{patient_number}-articles.csv'
    summarized_abstracts_df = pd.read_csv(summarized_abstracts_path)
    summarized_abstracts_df = summarized_abstracts_df.dropna()
    summarized_abstracts_df['input_text']=  summarized_abstracts_df['Summary'].apply(lambda context: create_input_text(patient_history, context))
    results = []
    pubids = []
    indexes = []
    for idx, row in summarized_abstracts_df.iterrows():
            indexes.append(idx + 1)
            pubids.append(row['ID'])
            results.append(classifier(truncate_text_to_fit(row['input_text'])))

    if results:
            save_to_csv(patient_id,indexes, pubids, results)
        

In [10]:
import pandas as pd
import json
import os

label_mapping = {
    "LABEL_0": "yes",
    "LABEL_1": "no",
    "LABEL_2": "maybe"
}

def map_label(result_str):
    try:
        result_str = result_str.replace("'", '"')
        result = json.loads(result_str)
        label = result[0]['label']
        return label_mapping.get(label, "Unknown")
    except:
        return "Error in processing"

# Loop through each file
for i in range(1, 17):
    file_name = f"/kaggle/working/Patient-{i}-results.csv"
    
    try:
        # Read the file
        decisions = pd.read_csv(file_name)
        
        # Apply the map_label function
        decisions['Result'] = decisions['Result'].apply(map_label)
        
        # Save the file with a new suffix
        new_file_name = f"/kaggle/working/Patient-{i}-results-converted.csv"
        decisions.to_csv(new_file_name, index=False)
        
        # Delete the original file
        os.remove(file_name)

        print(f"Processed, saved, and deleted: {new_file_name}")

    except FileNotFoundError:
        print(f"File not found: {file_name}")

File not found: /kaggle/working/Patient-1-results.csv
File not found: /kaggle/working/Patient-2-results.csv
File not found: /kaggle/working/Patient-3-results.csv
File not found: /kaggle/working/Patient-4-results.csv
File not found: /kaggle/working/Patient-5-results.csv
File not found: /kaggle/working/Patient-6-results.csv
File not found: /kaggle/working/Patient-7-results.csv
File not found: /kaggle/working/Patient-8-results.csv
File not found: /kaggle/working/Patient-9-results.csv
File not found: /kaggle/working/Patient-10-results.csv
File not found: /kaggle/working/Patient-11-results.csv
File not found: /kaggle/working/Patient-12-results.csv
File not found: /kaggle/working/Patient-13-results.csv
File not found: /kaggle/working/Patient-14-results.csv
File not found: /kaggle/working/Patient-15-results.csv
File not found: /kaggle/working/Patient-16-results.csv
