In [51]:
from openai import OpenAI

client = OpenAI(api_key="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")

def complete_masked_question(masked_question: str, mask_token: str = "()", num_completions: int = 5) -> list:
    """
    Complete a masked question using OpenAI's API to generate likely completions.
    
    Args:
        masked_question (str): Question with masked words (e.g., "What is the () of this ()?")
        api_key (str): OpenAI API key
        mask_token (str): Token used to indicate masked words
        num_completions (int): Number of different completions to generate
    Returns:
        list: List of completed questions
    """
    
    
    # Count number of masks to help with prompt engineering
    mask_count = masked_question.count(mask_token)
    
    # Create a prompt that encourages filling in the masks
    system_prompt = "You are a helpful assistant that completes masked words in questions. Provide natural and contextually appropriate question."
    user_prompt = f"""
    Complete the following question by replacing {mask_count} masked word(s) marked with {mask_token}.
    Question: {masked_question}
    """

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        temperature=0.2,
        n=num_completions
    )

    completions = [
        choice.message.content
        for choice in response.choices
    ]
    
    return completions
    
# masked_question = "() () () () () () () watermelon seeds ()?"

# completions = complete_masked_question(masked_question=masked_question)

# print(completions)


In [None]:
masked_question = "() () () () () () () () meal () () swimming ()?"

completions = complete_masked_question(masked_question=masked_question)


In [62]:
print(completions)

['Question: Did you enjoy your delicious meal before going swimming?', 'Question: Have you ever enjoyed a delicious homemade meal before going swimming?', 'Question: Did you enjoy your delicious meal before going swimming?', 'Question: Did you enjoy your delicious meal before going swimming?', 'Question: Did you enjoy your delicious meal before going swimming?']


In [46]:
import pandas as pd
from fuzzywuzzy import fuzz 


def calculate_accuracy(data_path: str, similarity_threshold: int = 80):
    # Load the data
    df = pd.read_csv(data_path)
    
    # Initialize counters and new DataFrame list
    total_questions = len(df)
    accurate_count = 0
    results = []
    
    # Process each record
    for index, row in df.iterrows():
        if index >= 100: break
        original_question = row['original_question']
        masked_question = row['masked_question']
        
        # Get completions for the masked question
        completions = complete_masked_question(masked_question)
        
        # Initialize variables for the best match
        best_match = None
        best_score = 0
        
        # Compare each completion with the original question
        for completion in completions:
            score = fuzz.ratio(original_question, completion)
            if score > best_score:
                best_score = score
                best_match = completion
        
        # Determine if the match meets the threshold
        matched = best_score >= similarity_threshold
        if matched:
            accurate_count += 1
        
        # Append the record to the results
        results.append({
            'original_question': original_question,
            'masked_question': masked_question,
            'generated_question': best_match,
            'matched': matched,
            'similarity_score': best_score
        })
    
    # Calculate accuracy
    accuracy_percentage = (accurate_count / 100) * 100
    print(f"Accuracy: {accuracy_percentage}%")
    
    # Create a new DataFrame from results
    results_df = pd.DataFrame(results)
    
    # Return the results DataFrame
    return results_df

In [52]:
data_path = "Data/masked_mmlu.csv"
processed_df = calculate_accuracy(data_path)

Accuracy: 32.0%


In [53]:
processed_df.to_csv("Data/MMLU/accuracy_gpt4o.csv", index=False)
print("\nData saved to 'accuracy.csv'")


Data saved to 'accuracy.csv'


## Claude 3 

In [18]:
import anthropic

# Create the Claude API client (you'll need to replace with your actual API key)
client = anthropic.Anthropic(api_key="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")

def complete_masked_question(masked_question: str, model_name: str, mask_token: str = "()", num_completions: int = 5) -> list:
    """
    Complete a masked question using Anthropic's Claude API to generate likely completions.
    
    Args:
        masked_question (str): Question with masked words (e.g., "What is the () of this ()?")
        mask_token (str): Token used to indicate masked words
        num_completions (int): Number of different completions to generate
    
    Returns:
        list: List of completed questions
    """
    # Validate inputs
    if not masked_question or mask_token not in masked_question:
        return []
    
    # Count number of masks to help with prompt engineering
    mask_count = masked_question.count(mask_token)
    
    # Create a system prompt that guides the completion
    system_prompt = (
        "You are a helpful assistant that completes masked words in questions. "
        "Provide natural and contextually appropriate completions. "
        f"Replace the {mask_count} masked word(s) marked with {mask_token} with suitable terms. Provide just the question"
    )
    
    # Prepare the user prompt
    user_prompt = f"Complete this question by replacing the masked sections:\n{masked_question}"
    
    try:   
        # Store completions
        completions = []
        
        # Generate multiple completions
        for _ in range(num_completions):
            response = client.messages.create(
                model= model_name,  
                max_tokens=400,
                system=system_prompt,
                messages=[
                    {"role": "user", "content": user_prompt}
                ],
                temperature=0.7 
            )
            
            # Extract and store the completion
            completions.append(response.content[0].text.strip())
        
        return completions
    
    except Exception as e:
        print(f"Error generating completions: {e}")
        return []


In [19]:
import pandas as pd
from fuzzywuzzy import fuzz 


def calculate_accuracy(data_path: str, model_name: str, similarity_threshold: int = 80):
    # Load the data
    df = pd.read_csv(data_path)
    
    # Initialize counters and new DataFrame list
    total_questions = len(df)
    accurate_count = 0
    results = []
    
    # Process each record
    for index, row in df.iterrows():
        if index >= 100: break
        original_question = row['original_question']
        masked_question = row['masked_question']
        
        # Get completions for the masked question
        completions = complete_masked_question(masked_question, model_name)
        
        # Initialize variables for the best match
        best_match = None
        best_score = 0
        
        # Compare each completion with the original question
        for completion in completions:
            score = fuzz.ratio(original_question, completion)
            if score > best_score:
                best_score = score
                best_match = completion
        
        # Determine if the match meets the threshold
        matched = best_score >= similarity_threshold
        if matched:
            accurate_count += 1
        
        # Append the record to the results
        results.append({
            'original_question': original_question,
            'masked_question': masked_question,
            'generated_question': best_match,
            'matched': matched,
            'similarity_score': best_score
        })
    
    # Calculate accuracy
    accuracy_percentage = (accurate_count / 100) * 100
    print(f"Accuracy: {accuracy_percentage}%")
    
    # Create a new DataFrame from results
    results_df = pd.DataFrame(results)
    
    # Return the results DataFrame
    return results_df

In [20]:
data_path = "Data/masked_TruthfulQA.csv"
processed_df = calculate_accuracy(data_path, "claude-3-5-sonnet-20241022")

processed_df.to_csv("Data/TruthfulQA/accuracy_claude3-5sonnet.csv", index=False)
print("\nData saved to 'accuracy_claude3-5sonnet.csv'")

Accuracy: 54.0%

Data saved to 'accuracy_claude3-5sonnet.csv'


GSM8K - 
Claude 3 Sonnet - Accuracy: 9.0% 
Claude 3.5 Sonnet - Accuracy: 27.0%
GPT 3.5 - Accuracy: 12.0%
GPT 4o - Accuracy: 24.0%
Llama 3.1 - Accuracy: 



TruthfulQA -
Claude 3 Sonnet - Accuracy: 21.0%
Claude 3.5 Sonnet - Accuracy: 54.0%
GPT 3.5 - Accuracy: 51.0%
GPT 4o - Accuracy: 64%
Llama 3.1 - Accuracy: 


Commonsense - 
Claude 3 Sonnet - Accuracy: 10%
Claude 3.5 Sonnet - Accuracy: 21.0%
GPT 3.5 - Accuracy: 18.0%
GPT 4o - Accuracy: 24.0%


MMLU - 
Claude 3 Sonnet - Accuracy: 
Claude 3.5 Sonnet - Accuracy: 
GPT 3.5 - Accuracy: 24.0%
GPT 4o - Accuracy: 32.0%



In [21]:
processed_df = calculate_accuracy(data_path, "claude-3-sonnet-20240229")

processed_df.to_csv("Data/TruthfulQA/accuracy_claude3sonnet.csv", index=False)
print("\nData saved to 'accuracy_claude3sonnet.csv'")

Accuracy: 21.0%

Data saved to 'accuracy_claude3sonnet.csv'


## Fine Tuned

In [36]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from transformers import pipeline
import torch

def complete_masked_question(masked_question: str, model_name: str = "ank028/Llama-3.2-1B-Instruct-gsm8k", 
                           mask_token: str = "()", num_completions: int = 5) -> list:
    """
    Complete a masked question using Llama model to generate likely completions.
    
    Args:
        masked_question (str): Question with masked words (e.g., "What is the () of this ()?")
        model_name (str): Name of the Llama model to use
        mask_token (str): Token used to indicate masked words
        num_completions (int): Number of different completions to generate
    
    Returns:
        list: List of completed questions
    """
    # Validate inputs
    if not masked_question or mask_token not in masked_question:
        return []
    
    try:
        # Initialize the pipeline
        generator = pipeline(
            "text-generation",
            model=model_name,
            torch_dtype=torch.float16,
            device_map="auto"
        )
        
        # Count number of masks
        mask_count = masked_question.count(mask_token)
        
        # Create prompt
        system_prompt = (
            "You are a helpful assistant that completes masked words in questions. "
            "Provide natural and contextually appropriate completions. "
            f"Replace the {mask_count} masked word(s) marked with {mask_token} with suitable terms. "
            "Provide just the completed question without any additional text."
        )
        
        # Combine system and user prompts in Llama chat format
        prompt = f"<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n"
        prompt += f"Complete this question by replacing the masked sections:\n{masked_question}[/INST]"
        
        # Generate completions
        outputs = generator(
            prompt,
            num_return_sequences=num_completions,
            max_new_tokens=100,
            temperature=0.7,
            do_sample=True,
            pad_token_id=generator.tokenizer.eos_token_id,
            return_full_text=False  # Only return the newly generated text
        )
        
        # Extract and clean completions
        completions = []
        for output in outputs:
            # Clean up the generated text
            completed_question = output['generated_text'].strip()
            completions.append(completed_question)
        
        return completions
        
    except Exception as e:
        print(f"Error generating completions: {e}")
        return []

In [37]:
masked_question = "() () () () () () () watermelon seeds ()?"

completions = complete_masked_question(masked_question=masked_question)

print(completions)

Error generating completions: Using `low_cpu_mem_usage=True` or a `device_map` requires Accelerate: `pip install accelerate`
[]
