In [1]:
# This code loads the questions and their answer labels
import pandas as pd
import json
# Load the CSV file
def load_csv(file_path):
    """Read the CSV file at ``file_path`` and return it as a pandas DataFrame."""
    frame = pd.read_csv(file_path)
    return frame
    
def filter_answers(df):
    """Return the rows of ``df`` whose 'Answer' value is a single uppercase letter.

    Rows with a missing answer never match (``na=False``) and are dropped.
    """
    # The pattern '^[A-Z]$' matches one letter in the range A-Z.
    is_single_letter = df['Answer'].str.match(r'^[A-Z]$', na=False)
    return df[is_single_letter]

# Read the question bank and keep only the rows with a single-letter answer.
file_path = 'NEJM_All_Questions_And_Answers.csv'
df = filter_answers(load_csv(file_path))

# Show the distinct subjects and the columns that are available.
subject_types = df['Subject'].unique().tolist()
print(subject_types)
print(df.columns)


['Hypertension', 'Pregnancy and Kidney Disease', 'Primary & Secondary Glomerular Diseases', 'Disorders of Divalent Ions Renal Bone Disease & Nephrolithiasis', 'ESRD & Dialysis', 'Electrolyte Acid-Base Disorders', 'Acute Kidney Injury & Critical Care Nephrology', 'Interventional Nephrology & Dialysis Therapy', 'Disorders of Divalent Ions, Renal Bone Disease and Nephrolithiasis', 'Secondary Glomerular Disease', 'End Stage Renal Disease & Dialysis', 'Primary Glomerular Disease', 'Infection Control & Prevention in Outpatient Hemodialysis Facilities', 'Chronic Kidney Disease and Progression', 'Transplantation', 'Primary and Secondary Glomerular Diseases', 'End-Stage Kidney Disease', 'Electrolytes and Acid-Base Disorders', 'Chronic Kidney Disease', 'Disorder of Divalent Ions Renal Bone Disease and Nephrolithiasis']
Index(['ID', 'Context', 'Question', 'Choices', 'Answer', 'Solution',
       'Subject'],
      dtype='object')


# Embedding

In [None]:
import json

In [None]:
import os

import openai

# Read the API key from the environment so the secret is never stored in the notebook.
# NOTE: the key that used to be hardcoded in this cell is exposed and should be revoked.
openai.api_key = os.environ["OPENAI_API_KEY"]


def _embed(text):
    """Request a text-embedding-3-large embedding for ``text`` and return the API response."""
    return openai.embeddings.create(
        model="text-embedding-3-large",
        input=text,
        encoding_format="float"
    )


embedding_results = []
# API response for the " " placeholder; requested on first use and then reused,
# because it is identical for every row that has no context.
placeholder_embedding = None

for index, row in df.iterrows():
    # Extracting question, choices, and ID
    question = row['Question']
    choice = row['Choices']
    ID = row['ID']
    # pd.isna() catches a missing value (NaN/None); str() alone would turn it
    # into the literal text 'nan' and hide the fact that there is no context.
    context = "" if pd.isna(row['Context']) else str(row['Context'])

    # Generate embeddings for each text element
    question_embedding = _embed(question + choice)
    if context.strip():
        # A real context is present: embed the context text itself.
        context_embedding = _embed(context)
    else:
        # Use space as placeholder for no context, the context embedding in this case is meaningless
        # Embedding for no context will not be used in analysis
        if placeholder_embedding is None:
            placeholder_embedding = _embed(" ")
        context_embedding = placeholder_embedding

    # Store the embeddings with the corresponding id
    embedding_results.append({
        "ID": ID,
        "question_embedding": question_embedding.data[0].embedding,
        "context_embedding": context_embedding.data[0].embedding
    })

# Save the embeddings to a JSON file so later cells can reload them
with open('embedding_results.json', 'w') as outfile:
    json.dump(embedding_results, outfile)

print("Embeddings generated and stored.")

In [None]:
import numpy as np

def load_embeddings(file_path):
    """Read the JSON file at ``file_path`` and return its parsed content."""
    with open(file_path, 'r') as handle:
        return json.load(handle)

def save_data(data, file_path):
    """Write ``data`` to ``file_path`` as JSON indented by four spaces.

    The file is opened in write mode, so existing content is replaced.
    """
    with open(file_path, mode='w') as handle:
        json.dump(data, handle, indent=4)

def compute_cosine_similarity(vector_a, vector_b):
    """Compute the cosine similarity between two vectors with NumPy.

    Args:
        vector_a: Sequence or array of numbers.
        vector_b: Sequence or array of numbers with the same number of elements.

    Returns:
        The cosine similarity as a Python float. If either vector has zero
        magnitude the similarity is undefined (0/0); 0.0 is returned in that
        case instead of NaN, which would also not be valid JSON when saved.

    Raises:
        ValueError: If the two vectors do not have the same number of elements.
    """
    # Shape both inputs as (1, n) rows so the dot product below is a 1x1 matrix.
    row_a = np.array(vector_a).reshape(1, -1)
    row_b = np.array(vector_b).reshape(1, -1)
    norm_product = np.linalg.norm(row_a) * np.linalg.norm(row_b)
    if norm_product == 0:
        # At least one vector is all zeros: avoid dividing by zero.
        return 0.0
    similarity = np.dot(row_a, row_b.T) / norm_product
    return similarity.item()

def process_embeddings(data):
    """Build one similarity record per item in ``data``.

    Each item must provide 'ID', 'question_embedding' and 'context_embedding'.
    The returned list holds, for every item, a dict with the item's 'ID' and the
    cosine similarity between its two embeddings; the embeddings themselves are
    left out of the result.
    """
    def summarize(entry):
        # Similarity between the question embedding and the context embedding.
        similarity = compute_cosine_similarity(
            entry['question_embedding'],
            entry['context_embedding'],
        )
        return {
            "ID": entry['ID'],
            "question_context_similarity": similarity,
        }

    return [summarize(entry) for entry in data]

# Input file with the stored embeddings, and output file for the similarities
embeddings_file_path, updated_file_path = 'embedding_results.json', 'updated_results.json'

# Read the embeddings, reduce each item to its ID and similarity, then write the result
embeddings_data = load_embeddings(embeddings_file_path)
updated_data = process_embeddings(embeddings_data)
save_data(updated_data, updated_file_path)

print("Updated results saved without embeddings.")

# Inference: get generated answer

In [32]:
import os
import time

import openai

# Read the API key from the environment so the secret is never stored in the notebook.
# NOTE: the key that used to be hardcoded in this cell is exposed and should be revoked.
openai.api_key = os.environ["OPENAI_API_KEY"]

In [35]:

def _ask_model(system_prompt, question, choices, model, context=None):
    """Send one multiple-choice question to the chat model and return the reply text.

    When ``context`` is given it is passed as a "Context: ..." message placed
    between the system prompt and the user question.
    """
    messages = [{"role": "system", "content": system_prompt}]
    if context is not None:
        # NOTE(review): the context is sent with the "assistant" role, as in the
        # original notebook; confirm that this is intended rather than "user".
        messages.append({"role": "assistant", "content": "Context: " + context})
    messages.append({"role": "user", "content": f'The question is {question} and the choices are {choices}'})
    response = openai.chat.completions.create(
        model=model,
        temperature=1.0,
        max_tokens=1000,
        messages=messages,
    )
    return response.choices[0].message.content


def _write_responses(responses, path):
    """Overwrite ``path`` with ``responses`` serialized as JSON."""
    with open(path, 'w') as outfile:
        json.dump(responses, outfile)


def generate_gpt_answer(data, filname, model="gpt-4-1106-preview", sleep_seconds=30):
    """Ask the chat model every question twice: once with its context and once without.

    Args:
        data: DataFrame with the columns 'Question', 'Context', 'Choices' and 'ID'.
        filname: Path of the JSON file that receives the responses. It is
            rewritten after every processed row, so an interrupted run keeps the
            answers collected so far.
        model: Name of the chat model to query.
        sleep_seconds: Pause in seconds after every request.

    Returns:
        A list of dicts with the keys 'id', 'question', 'context' and 'response'.
        Every row contributes two consecutive entries: first the answer obtained
        with context, then the answer obtained without it ('context' is '').
    """
    system_prompt_with_context = """
                        You will be given a question and a context. Your task is to select the \
                        most appropriate answer from the choices provided. Make sure your response\
                        start with the letter of your choice. For example, if you choose option A, \
                        your response should start with 'A'.\
                        """
    system_prompt_without_context = """
                        You will be given a question. Your task is to select the \
                        most appropriate answer from the choices provided. Make sure your response\
                        start with the letter of your choice. For example, if you choose option A, \
                        your response should start with 'A'.\
                        """

    generated_responses = []
    for idx, row in data.iterrows():
        question = row['Question']
        # NOTE(review): a missing Context (NaN) becomes the literal text 'nan' here.
        context = str(row['Context'])
        choices = row['Choices']
        question_id = row['ID']

        # (system prompt, context sent to the model, context stored in the result)
        variants = (
            (system_prompt_with_context, context, context),
            (system_prompt_without_context, None, ''),
        )
        for system_prompt, sent_context, stored_context in variants:
            reply = _ask_model(system_prompt, question, choices, model, context=sent_context)
            generated_responses.append({
                'id': question_id,
                'question': question,
                'context': stored_context,
                'response': reply,
            })
            time.sleep(sleep_seconds)

        # Checkpoint after every row so that progress survives an interruption.
        _write_responses(generated_responses, filname)

    if not generated_responses:
        # No rows were processed: still write the file so that it always matches
        # the returned list instead of keeping stale content from an earlier run.
        _write_responses(generated_responses, filname)
    return generated_responses


In [36]:
# Query the model for every filtered question and store the replies as JSON.
generated_answer = generate_gpt_answer(df, 'gpt_responses_4_19.json')
