# Hallucination Project

#### Imports

In [1]:
from load_data import *
from pathlib import Path
from IPython.display import clear_output
import json
import numpy as np

### Data extraction

We start by extracting the Abstraction and Reasoning Corpus (ARC) dataset, which consists of 400 `.json` files in the training folder and another 400 in the evaluation folder.

With the `data_extraction` function we convert every `.json` file into a dictionary.

Once the data have been extracted, we need to convert them into "arrays of strings" so that they can be used as prompts for the LLMs. To do this we use the `number_to_colour` function.


In [2]:
# Folder holding the raw ARC task files, one sub-folder per split.
data_path = Path(
    "/Users/mattiapiazza/Documents/University/cognitive_behavioral_and_social_data/project/cbsd_hallucination_project/ARC_master/data"
)

# Parse each split with the project's `data_extraction` helper.
training_dataset = data_extraction(data_path=data_path / "training")
evaluation_dataset = data_extraction(data_path=data_path / "evaluation")

# Quick sanity check: number of "input" entries per section, for both datasets.
for section in ("train", "test"):
    print(
        f"training_dataset {section}: {len(training_dataset[section]['input'])}\n"
        f"evaluation_dataset: {len(evaluation_dataset[section]['input'])}"
    )

# Convert the datasets with `number_to_colour` so they can be used as prompts.
training_dataset = number_to_colour(training_dataset)
evaluation_dataset = number_to_colour(evaluation_dataset)


# From here on `data_path` points at the folder where the converted datasets
# are stored.
data_path = Path(
    "/Users/mattiapiazza/Documents/University/cognitive_behavioral_and_social_data/project/cbsd_hallucination_project/data"
)

# Persist both converted datasets as JSON next to each other.
for out_name, converted in (
    ("training_dataset.json", training_dataset),
    ("evaluation_dataset.json", evaluation_dataset),
):
    with open(data_path / out_name, "w") as f:
        json.dump(converted, f)

training_dataset train: 1301
evaluation_dataset: 1363
training_dataset test: 416
evaluation_dataset: 419


### LLM's response

We give the LLM some hints about the task, as follows:

>I'm going to give you a visual puzzle (but you won't have to deal with an image yourself as I went ahead and converted the image to an easier textual format for you!).
>
>You are basically given a big square that consists of many smaller colored squares (like a digital mosaic if you will) and you need to figure out what are the actual colors of squares that are black (black serves as a placeholder color).
>
>The best way to do it is to spell out the colors you see in the following format (see below) for the original square and then try and fill in the blank (i.e. black squares).I'll go ahead and spell it out for you.
>
>(note: P stands for pink, C - cyan, B - blue, G - gray, R - red, Y - yellow, O - orange, V - Green, M -brown, X - black/placeholder that you need to solve for)

We then append the input prompt retrieved previously and save the LLM's response.

In [3]:
# Retrieve the datasets saved on disk, so this cell can be run on its own
# (e.g. after a kernel restart) to resume collecting answers.
data_path = Path(
    "/Users/mattiapiazza/Documents/University/cognitive_behavioral_and_social_data/project/cbsd_hallucination_project/data"
)

with open(data_path / "training_dataset.json", "r") as f:
    training_dataset = json.load(f)

# The evaluation set is reloaded as well: the summary print below reads it,
# and without this load the cell raises NameError on a fresh kernel.
with open(data_path / "evaluation_dataset.json", "r") as f:
    evaluation_dataset = json.load(f)

print(
    f"training shape: {training_dataset.keys()}\nevaluation shape: {evaluation_dataset.keys()}\n"
)


# Walk over every section of the training set and collect, by hand, the
# ChatGPT answer for each input that does not have one yet.
for (
    key,  # key = ["train", "test"]
    value,  # value = dict
) in training_dataset.items():
    # Create the answer list on first use; answers already stored are kept,
    # which is what makes the collection resumable.
    responses = value.setdefault("GPTsays", [])
    prompts = value["input"]
    total = len(prompts)

    # No need to print also the "output".
    # Start right after the last stored answer.
    for i in range(len(responses), total):
        # Values are pulled into locals so the f-string has no nested double
        # quotes (a SyntaxError on Python versions older than 3.12).
        print(f"Array #: {i+1}/{total}\n{prompts[i]}\n")

        llm_says = input("Insert here ChatGPT output: ")

        responses.append(llm_says)

        clear_output(wait=True)  # Clear output each time

        # Save after every answer so an interruption loses at most one entry.
        with open(data_path / "training_dataset.json", "w") as f:
            json.dump(training_dataset, f)

Array #: 11/1301
1 -> BXBXX
2 -> XXXXX
3 -> XXXXX
4 -> XBXBX
5 -> XXXXX




### Question our LLM
Now that we have our prompts, it is time to use them to query the chosen LLM.