In [13]:
import difflib
import json
import os

import openai
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.schema import HumanMessage

# Initialize LangChain with OpenAI LLM.
# SECURITY: the API key is read from the OPENAI_API_KEY environment variable
# rather than being embedded in source. Any key that was ever committed or
# shared in a notebook must be treated as leaked and revoked.
# A missing variable raises KeyError here, before any request is attempted.
llm = ChatOpenAI(model_name="gpt-4", openai_api_key=os.environ["OPENAI_API_KEY"])

# Prompt template for token labeling.
# Placeholders filled by `str.format`: {label_list} (comma-separated label
# names) and {instruction} (the raw instruction line).
# The worked example uses the same "<token> -> <label>" line format that the
# reply is required to follow, because the response parser splits on "->".
prompt_template = """
You are an expert in medical instruction parsing. Each instruction has words or tokens that belong to one of the following categories: {label_list}.

Below is an example of how to correctly label tokens:

Example Instruction: "1. Duloxetine 60 mg Capsule Sig: One (1) Capsule PO DAILY for depression."

Tokenized Tokens and Corresponding Labels:
1. -> O
Duloxetine -> B-DRUG
60 -> B-STRENGTH
mg -> I-STRENGTH
Capsule -> B-FORM
Sig: -> O
One -> B-DOSAGE
(1) -> B-DOSAGE
Capsule -> B-FORM
PO -> B-ROUTE
DAILY -> B-FREQUENCY
for -> O
depression. -> B-REASON

Now, based on the provided categories, please label the following instruction **without adding any extra explanation or commentary**. Only output the tokens and their corresponding labels in the format: "<token> -> <label>".

Instruction: "{instruction}"

For each word in the instruction, assign the correct label from the categories. Treat every whitespace-separated word as exactly one token, keep the tokens in their original order, and output one line per token. Use the format: "<token> -> <label>".

Please do not provide any explanation, context, or additional information—strictly follow the format requested.
"""

# Label inventory offered to the model: the outside tag "O" followed by a
# B-/I- pair for each entity type, in the order the entity types are listed.
label_list = ["O"] + [
    f"{prefix}-{entity}"
    for entity in ("DRUG", "STRENGTH", "FORM", "DOSAGE", "FREQUENCY", "ROUTE", "REASON")
    for prefix in ("B", "I")
]

# Whitespace tokenizer used to produce the reference tokens for each instruction
def simple_tokenizer(text):
    """Split *text* on runs of whitespace and return the pieces as a list.

    Leading and trailing whitespace produce no empty tokens; punctuation
    stays attached to the word it touches.
    """
    pieces = text.split()
    return pieces

# Build the full prompt text for one instruction
def create_prompt(instruction):
    """Return the labeling prompt with the label names and *instruction* filled in.

    The label names from `label_list` are joined with ", " and substituted
    for the template's {label_list} placeholder.
    """
    categories = ", ".join(label_list)
    return prompt_template.format(label_list=categories, instruction=instruction)

# Ask the LLM to label one instruction and parse its "<token> -> <label>" reply
def generate_labels_for_instruction(instruction):
    """Label *instruction* with the LLM and parse the reply.

    Args:
        instruction: One raw instruction line.

    Returns:
        A ``(tokens, labels)`` pair of parallel lists, one element per
        non-blank line of the model's reply. The tokens are whatever the
        model echoed back, which is not guaranteed to match the output of
        ``simple_tokenizer``.
    """
    prompt = create_prompt(instruction)

    # Chat models take a list of messages; the prompt is sent as one human turn.
    response = llm.invoke([HumanMessage(content=prompt)])

    tokens = []
    labels = []

    for raw_line in response.content.splitlines():
        line = raw_line.strip()
        if not line:
            # Blank lines are layout in the reply, not tokens; recording them
            # would shift every following label by one position.
            continue

        # Split on the LAST "->" so a token that itself contains "->" is kept
        # whole instead of the line being discarded.
        token, separator, label = line.rpartition('->')
        if not separator:
            # No delimiter at all: keep the line as a token labelled "O".
            tokens.append(line)
            labels.append("O")
            print(f"Warning: Unexpected format in line: {line} - Defaulting label to 'O'.")
            continue

        token = token.strip()
        label = label.strip()
        if not token or not label:
            # A delimiter with nothing on one side carries no usable pair.
            print(f"Warning: Unexpected format in line: {line}")
            continue

        tokens.append(token)
        labels.append(label)

    return tokens, labels

# Project the model's labels onto the reference tokens
def _align_labels(tokens, predicted_tokens, predicted_labels):
    """Return one label per entry of *tokens*, defaulting to "O".

    *predicted_tokens* and *predicted_labels* are parallel lists from the
    model. Runs of tokens that match between the two tokenizations take the
    model's labels; reference tokens with no counterpart keep "O".
    """
    aligned = ["O"] * len(tokens)
    matcher = difflib.SequenceMatcher(a=tokens, b=predicted_tokens, autojunk=False)
    for ref_start, pred_start, size in matcher.get_matching_blocks():
        aligned[ref_start:ref_start + size] = predicted_labels[pred_start:pred_start + size]
    return aligned


# Function to process a single record file
def process_record_file(record_path):
    """Label every non-empty line of the record file at *record_path*.

    Args:
        record_path: Path to a text file with one instruction per line.

    Returns:
        A list of dicts with keys ``"instruction"``, ``"tokens"`` (from
        ``simple_tokenizer``) and ``"labels"``. ``"labels"`` always has the
        same length as ``"tokens"``.
    """
    # Explicit encoding so the result does not depend on the platform locale.
    with open(record_path, 'r', encoding='utf-8') as file:
        instructions = file.readlines()

    labeled_entries = []

    for raw_line in instructions:
        instruction = raw_line.strip()
        if not instruction:
            continue  # Ignore empty lines

        tokens = simple_tokenizer(instruction)
        predicted_tokens, predicted_labels = generate_labels_for_instruction(instruction)

        if predicted_tokens == tokens:
            labels = predicted_labels
        else:
            # The model tokenized differently (merged, split, dropped or added
            # lines). Using its labels positionally would mislabel tokens, so
            # map them onto the reference tokens instead.
            print(
                f"Warning: model returned {len(predicted_tokens)} tokens for "
                f"{len(tokens)} expected in instruction: {instruction} - "
                "aligning labels, unmatched tokens default to 'O'."
            )
            labels = _align_labels(tokens, predicted_tokens, predicted_labels)

        labeled_entries.append({
            "instruction": instruction,
            "tokens": tokens,
            "labels": labels
        })

    return labeled_entries

# Write one record's labeled entries to "<output_dir>/<record_id>.json"
def save_to_json(record_id, labeled_entries, output_dir="/Users/remon.m/Desktop/Y4S2/Thesis/Coding/Synth/test_dataset/labeled_records"):
    """Serialize *labeled_entries* for *record_id* as indented JSON.

    The output directory is created if it does not exist. The file holds an
    object with the keys ``"record_id"`` and ``"entries"``. A confirmation
    line is printed once the file has been written.
    """
    os.makedirs(output_dir, exist_ok=True)

    payload = {"record_id": record_id, "entries": labeled_entries}
    destination = os.path.join(output_dir, f"{record_id}.json")

    with open(destination, 'w') as handle:
        json.dump(payload, handle, indent=4)

    print(f"Saved labeled data for {record_id} to {destination}")

# Label every .txt record in a folder and save one JSON file per record
def process_all_records(records_folder="records"):
    """Process each ``.txt`` file found directly inside *records_folder*.

    The file name without its extension is used as the record id. Entries
    whose names do not end in ``.txt`` are ignored.
    """
    for entry_name in os.listdir(records_folder):
        if not entry_name.endswith('.txt'):
            continue  # Process only .txt files

        record_id, _extension = os.path.splitext(entry_name)
        source_path = os.path.join(records_folder, entry_name)

        entries = process_record_file(source_path)
        save_to_json(record_id, entries)

# Run the labeling pipeline over the test dataset's input folder
records_input_dir = "/Users/remon.m/Desktop/Y4S2/Thesis/Coding/Synth/test_dataset/input"
process_all_records(records_input_dir)

Saved labeled data for record_016 to /Users/remon.m/Desktop/Y4S2/Thesis/Coding/Synth/test_dataset/labeled_records/record_016.json
Saved labeled data for record_002 to /Users/remon.m/Desktop/Y4S2/Thesis/Coding/Synth/test_dataset/labeled_records/record_002.json
Saved labeled data for record_003 to /Users/remon.m/Desktop/Y4S2/Thesis/Coding/Synth/test_dataset/labeled_records/record_003.json
Saved labeled data for record_017 to /Users/remon.m/Desktop/Y4S2/Thesis/Coding/Synth/test_dataset/labeled_records/record_017.json
Saved labeled data for record_029 to /Users/remon.m/Desktop/Y4S2/Thesis/Coding/Synth/test_dataset/labeled_records/record_029.json
Saved labeled data for record_001 to /Users/remon.m/Desktop/Y4S2/Thesis/Coding/Synth/test_dataset/labeled_records/record_001.json
Saved labeled data for record_015 to /Users/remon.m/Desktop/Y4S2/Thesis/Coding/Synth/test_dataset/labeled_records/record_015.json
Saved labeled data for record_014 to /Users/remon.m/Desktop/Y4S2/Thesis/Coding/Synth/test_