In [1]:
# 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

In [15]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from transformers import pipeline
import torch
# meta-llama/Llama-3.1-8B-Instruct
def complete_masked_question(masked_question: str, model_name: str = "meta-llama/Llama-3.2-3B-Instruct", 
                           mask_token: str = "()", num_completions: int = 5) -> list:
    """
    Complete a masked question using Llama model to generate likely completions.
    
    Args:
        masked_question (str): Question with masked words (e.g., "What is the () of this ()?")
        model_name (str): Name of the Llama model to use
        mask_token (str): Token used to indicate masked words
        num_completions (int): Number of different completions to generate
    
    Returns:
        list: List of completed questions
    """
    # Validate inputs
    # if not masked_question or mask_token not in masked_question:
    #     return []
    
    try:
        # Initialize the pipeline
        generator = pipeline(
            "text-generation",
            model=model_name,
            torch_dtype=torch.float16,
            device_map="auto"
        )
        
        # Count number of masks
        mask_count = masked_question.count(mask_token)
        
        # Create prompt
        system_prompt = (
            # "You are a helpful assistant that completes masked words in questions. "
            # "Provide natural and contextually appropriate completions for question. "
            "Provide a natural and contextually appropriate question by completing the masked question by filling appropriate terms"
            f"Replace the {mask_count} masked word(s) marked with {mask_token} with suitable terms. "
            "Provide just the completed question. Do not provide anything else"
        )
        
        # Combine system and user prompts in Llama chat format
        # prompt = f"<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n"
        prompt = f"{system_prompt}\n"
        prompt += f"Complete this question by replacing the masked sections:\n{masked_question}[/INST]"
        
        # Generate completions
        outputs = generator(
            prompt,
            num_return_sequences=num_completions,
            max_new_tokens=100,
            temperature=0.7,
            do_sample=True,
            pad_token_id=generator.tokenizer.eos_token_id,
            return_full_text=False,
            top_k = 5
        )
        # print(outputs)
        
        # Extract and clean completions
        completions = []
        for output in outputs:
            # Clean up the generated text
            completed_question = output['generated_text'].strip()
            completions.append(completed_question)
        
        return completions
        
    except Exception as e:
        print(f"Error generating completions: {e}")
        return []

In [3]:
# masked_question = "What is 2+3?"

# completions = complete_masked_question(masked_question=masked_question)

# print(completions)

In [9]:
masked_question = "() police busted () () door () () realized () () () () () () pawn () () partner () grander () ()"

completions = complete_masked_question(masked_question=masked_question)

# print(completions)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

[{'generated_text': ' \nWhat is the meaning of this question. The question seems to be asking about a situation where a police officer breaks into a house or a building, finds some illegal items or evidence, realizes the significance of what they have found, and decides to take action. The question is likely asking about the circumstances surrounding a police raid or a search warrant. The context of the question suggests that the police officer is discovering something that is illegal or incriminating, and the officer is considering the consequences of their discovery'}, {'generated_text': " (1)\nThe police busted the man who had been running a large-scale pawn shop in a small town. He had been using a variety of methods to avoid detection, including selling items to people who were not aware that they were purchasing stolen goods. The police also found that the man had been using a network of associates to help him launder money. The police were able to recover many of the stolen item

In [10]:
print(completions)

['What is the meaning of this question. The question seems to be asking about a situation where a police officer breaks into a house or a building, finds some illegal items or evidence, realizes the significance of what they have found, and decides to take action. The question is likely asking about the circumstances surrounding a police raid or a search warrant. The context of the question suggests that the police officer is discovering something that is illegal or incriminating, and the officer is considering the consequences of their discovery', "(1)\nThe police busted the man who had been running a large-scale pawn shop in a small town. He had been using a variety of methods to avoid detection, including selling items to people who were not aware that they were purchasing stolen goods. The police also found that the man had been using a network of associates to help him launder money. The police were able to recover many of the stolen items and arrest several of the man's associate

In [16]:
import pandas as pd
from fuzzywuzzy import fuzz 


def calculate_accuracy(data_path: str, similarity_threshold: int = 80):
    # Load the data
    df = pd.read_csv(data_path)
    
    # Initialize counters and new DataFrame list
    total_questions = len(df)
    accurate_count = 0
    results = []
    
    # Process each record
    for index, row in df.iterrows():
        if index >= 100: break
        original_question = row['original_question']
        masked_question = row['masked_question']
        
        # Get completions for the masked question
        completions = complete_masked_question(masked_question)
        
        # Initialize variables for the best match
        best_match = None
        best_score = 0
        
        # Compare each completion with the original question
        for completion in completions:
            score = fuzz.ratio(original_question, completion)
            if score > best_score:
                best_score = score
                best_match = completion
        
        # Determine if the match meets the threshold
        matched = best_score >= similarity_threshold
        if matched:
            accurate_count += 1
        
        # Append the record to the results
        results.append({
            'original_question': original_question,
            'masked_question': masked_question,
            'generated_question': best_match,
            'matched': matched,
            'similarity_score': best_score
        })
    
    # Calculate accuracy
    accuracy_percentage = (accurate_count / 100) * 100
    print(f"Accuracy: {accuracy_percentage}%")
    
    # Create a new DataFrame from results
    results_df = pd.DataFrame(results)
    
    # Return the results DataFrame
    return results_df

In [21]:
data_path = "masked_TruthfulQA.csv"
processed_df = calculate_accuracy(data_path)
processed_df.to_csv("accuracy_llama3-2_truthfulQA.csv", index=False)
print("\nData saved to 'accuracy_llama3.csv'")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Accuracy: 6.0%

Data saved to 'accuracy_llama3.csv'


In [14]:
# MMLU 1.0%
# Commonsense Accuracy: 1.0%
# GSM8K Accuracy: 0.0%
# TruthfulQA: 2%