In [1]:
import gpt4all
import pandas as pd
import re
from gpt4all import GPT4All

In [None]:
def load_and_process_data(file_path):
    """ 
    Load and process data from a file into a pandas dataframe. 
    """
    df = pd.read_csv(file_path, delimiter="\t", header=None)
    
    df.columns = ["Question", "Option_A", "Option_B", "Option_C", "Option_D", "Correct_Answer"]

    return df

def generate_prompt_dataframe(df_testset, df_additional_data, prompt_strategy):
    """ 
    Generate a dataframe of prompts for the test set. 
    Can be either zero-shot or few-shot.
    """
    prompt_df = pd.DataFrame(columns=["Prompt", "Correct Answer"])
    prompts = []

    for _, row in df_testset.iterrows():
        question = row["Question"]
        options = [row["Option_A"], row["Option_B"], row["Option_C"], row["Option_D"]]
        correct_answer = row["Correct_Answer"]
        if prompt_strategy == "zero-shot":
            prompt = f"Answer the following multiple choice question about astronomy (only one answer is correct):\n\n"
            prompt = f"Question: {question}\n"
            prompt += f"A: {options[0]}\n"
            prompt += f"B: {options[1]}\n"
            prompt += f"C: {options[2]}\n"
            prompt += f"D: {options[3]}\n"
            prompt += "Answer: "

        elif prompt_strategy == "few-shot":
            example_indices = [0, 1]

            prompt = f"The following are {len(example_indices)} examples of multiple choice questions (with answers) about astronomy:\n\n"
            for i in example_indices:
                example_question = df_additional_data.iloc[i]["Question"]
                example_options = [df_additional_data.iloc[i]["Option_A"], df_additional_data.iloc[i]["Option_B"], df_additional_data.iloc[i]["Option_C"], df_additional_data.iloc[i]["Option_D"]]
                example_correct_answer = df_additional_data.iloc[i]["Correct_Answer"]
                prompt += f"Question: {example_question}\n"
                prompt += f"A: {example_options[0]}\n"
                prompt += f"B: {example_options[1]}\n"
                prompt += f"C: {example_options[2]}\n"
                prompt += f"D: {example_options[3]}\n"
                prompt += f"Answer: {example_correct_answer}\n\n"

            prompt += f"Now, answer the following question:\n\n"
            prompt += f"Question: {question}\n"
            prompt += f"A: {options[0]}\n"
            prompt += f"B: {options[1]}\n"
            prompt += f"C: {options[2]}\n"
            prompt += f"D: {options[3]}\n"
            prompt += "Answer: "
        else:
            raise ValueError("Invalid prompt strategy")
        
        prompts.append({"Prompt": prompt, "Correct Answer": correct_answer})

    return pd.DataFrame(prompts)

def prompt_model(prompt, model, prompt_strategy):
    """
    Generate a response from the model given a prompt.
    """
    print(f"\nRunning {prompt_strategy} prompt:") #debugging
    print(prompt) #debugging
    response = model.generate(prompt, max_tokens=20) 
    print(response) #debugging
    print("\n") #debugging

    match = re.search(r"\b([ABCD])\b", response)
    if match:
        answer = match.group(1)
    else:
        answer = None
        
    return response, answer
   
def run_prompt_strategy(df, model, prompt_strategy):
    """
    Run the prompt strategy on the test set.
    """
    results = []

    for _, row in df.iterrows():
        prompt = row["Prompt"]
        correct_answer = row["Correct Answer"]
        response, answer = prompt_model(prompt, model, prompt_strategy)
        
        results.append({
            "Prompt: ": prompt,
            "Correct Answer": correct_answer,
            "Predicted Answer": answer,
            "Full Response": response
        })

    return pd.DataFrame(results)
    
def compute_accuracy(df_results):
    """
    Compute the accuracy of the model.
    """
    correct = sum(df_results["Correct Answer"] == df_results["Predicted Answer"])
    total = len(df_results)
    return correct / total if total > 0 else 0

In [None]:
testset_path = "TestSet.txt"
additional_data_path = "AdditionalDataForFewShotPrompting.txt"

df_testset = load_and_process_data(testset_path)
df_additional_data = load_and_process_data(additional_data_path)

df_zero_shot_prompts = generate_prompt_dataframe(df_testset, df_additional_data, "zero-shot")
df_few_shot_prompts = generate_prompt_dataframe(df_testset, df_additional_data, "few-shot")

# Save the generated prompts to a txt file
with open("few_shot_prompts.txt", "w", encoding="utf-8") as f:
    for prompt, correct_answer in zip(df_few_shot_prompts["Prompt"], df_few_shot_prompts["Correct Answer"]):
        f.write(f"{prompt}\t{correct_answer}\n\n")
with open("zero_shot_prompts.txt", "w", encoding="utf-8") as f:
    for prompt, correct_answer in zip(df_zero_shot_prompts["Prompt"], df_zero_shot_prompts["Correct Answer"]):
        f.write(f"{prompt}\t{correct_answer}\n\n")

In [5]:
model_name = "Meta-Llama-3-8B-Instruct.Q4_0.gguf"
model = GPT4All(model_name)
Llama_zero_shot_results = run_prompt_strategy(df_zero_shot_prompts, model, "zero-shot")
Llama_zero_shot_results.to_csv("Llama_zero_shot_results.csv", index=False)
Llama_few_shot_results = run_prompt_strategy(df_few_shot_prompts, model, "few-shot")
Llama_few_shot_results.to_csv("Llama_few_shot_results.csv", index=False)

model_name = "Nous-Hermes-2-Mistral-7B-DPO.Q4_0.gguf"
model = GPT4All(model_name)
Hermes_zero_shot_results = run_prompt_strategy(df_zero_shot_prompts, model, "zero-shot")
Hermes_zero_shot_results.to_csv("Hermes_zero_shot_results.csv", index=False)
Hermes_few_shot_results = run_prompt_strategy(df_few_shot_prompts, model, "few-shot")
Hermes_few_shot_results.to_csv("Hermes_few_shot_results.csv", index=False)

model_name = "Phi-3-mini-4k-instruct.Q4_0.gguf"
model = GPT4All(model_name)
Phi_zero_shot_results = run_prompt_strategy(df_zero_shot_prompts, model, "zero-shot")
Phi_zero_shot_results.to_csv("Phi_zero_shot_results.csv", index=False)
Phi_few_shot_results = run_prompt_strategy(df_few_shot_prompts, model, "few-shot")
Phi_few_shot_results.to_csv("Phi_few_shot_results.csv", index=False)


Running zero-shot prompt:
Question: What is true for a type-Ia ("type one-a") supernova?
A: This type occurs in binary systems.
B: This type occurs in young galaxies.
C: This type produces gamma-ray bursts.
D: This type produces high amounts of X-rays.
Answer: 
 A
Explanation: Type Ia (also known as "type one-a") supernovae are



Running zero-shot prompt:
Question: If you know both the actual brightness of an object and its apparent brightness from your location then with no other information you can estimate:
A: Its speed relative to you
B: Its composition
C: Its size
D: Its distance from you
Answer: 
 D: Its distance from you
Explanation: Since we have two pieces of information, the actual brightness



Running zero-shot prompt:
Question: Why is the sky blue?
A: Because the molecules that compose the Earth's atmosphere have a blue-ish color.
B: Because the sky reflects the color of the Earth's oceans.
C: Because the atmosphere preferentially scatters short wavelengths.
D: Because t

In [6]:
df_results_llama_zero = pd.read_csv("Llama_zero_shot_results.csv")
df_results_llama_few = pd.read_csv("Llama_few_shot_results.csv")
df_results_hermes_zero = pd.read_csv("Hermes_zero_shot_results.csv")
df_results_hermes_few = pd.read_csv("Hermes_few_shot_results.csv")
df_results_phi_zero = pd.read_csv("Phi_zero_shot_results.csv")
df_results_phi_few = pd.read_csv("Phi_few_shot_results.csv")

accuracy_results = pd.DataFrame({
    "Model": ["Llama-3-8B", "Llama-3-8B", "Nous-Hermes-2-Mistral", "Nous-Hermes-2-Mistral", "Phi-3-mini-4k", "Phi-3-mini-4k"],
    "Prompting Style": ["Zero-Shot", "Few-Shot", "Zero-Shot", "Few-Shot", "Zero-Shot", "Few-Shot"],
    "Accuracy": [
        compute_accuracy(df_results_llama_zero),
        compute_accuracy(df_results_llama_few),
        compute_accuracy(df_results_hermes_zero),
        compute_accuracy(df_results_hermes_few),
        compute_accuracy(df_results_phi_zero),
        compute_accuracy(df_results_phi_few)
    ]
})

print(accuracy_results)

                   Model Prompting Style  Accuracy
0             Llama-3-8B       Zero-Shot      0.68
1             Llama-3-8B        Few-Shot      0.48
2  Nous-Hermes-2-Mistral       Zero-Shot      0.56
3  Nous-Hermes-2-Mistral        Few-Shot      0.70
4          Phi-3-mini-4k       Zero-Shot      0.72
5          Phi-3-mini-4k        Few-Shot      0.64
