In [1]:
import pandas as pd

from master_thesis.core.utils.reproducibility import seed_everything
from master_thesis.core.models.llama import load_model_and_tokenizer
from master_thesis.core.utils.prompts import load_prompt
from master_thesis.core.methods.gt_cace import GTCaCEEstimator
from master_thesis.core.utils.reproducibility import save_results


# --- Experiment configuration -------------------------------------------
DEVICE = "cuda"
PROMPT_TYPE = "few_shot"
# Answer words; their token ids are looked up after the tokenizer is loaded.
POSITIVE_TOKEN = "true"
NEGATIVE_TOKEN = "false"
MODEL = "LLAMA_2_7B_CHAT"

# --- Paths (relative to the working directory the notebook runs in) ------
DATA_DIR = "../../../../data"
DATASETS_DIR = f"{DATA_DIR}/datasets/base_experiments/car_vs_bike/test"
# Plain literal: the string has no placeholders, so it needs no f-prefix.
CACHE_DIR = ".cache/results"

# --- Names handed to save_results alongside the result dicts -------------
CLASS_NAMES = ["bike", "car"]
# NOTE: "ASCPECT" is a misspelling of "ASPECT"; the name is kept as-is because
# the save_results cells further down reference it.
LABEL_ASCPECT_NAMES = ["bike", "car"]
CONFOUNDING_ASPECT_NAMES = ["negative", "positive"]

In [2]:
# Called with no arguments, so the helper's own default seed applies.
# NOTE(review): presumably seeds the RNGs used downstream — confirm in
# master_thesis.core.utils.reproducibility.
seed_everything()

In [3]:
# Load the model/tokenizer pair selected by MODEL. The captured warnings below
# indicate the helper loads the model quantized via the deprecated
# `load_in_4bit`/`load_in_8bit` arguments.
model, tokenizer = load_model_and_tokenizer(MODEL)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
# Id of the last token produced when encoding each answer word, ordered
# [negative, positive].
# NOTE(review): taking [-1] presumably skips any leading special token and
# assumes each word ends in a single meaningful token — confirm against the
# tokenizer.
TOKENS = [
    tokenizer.encode(answer_word)[-1]
    for answer_word in (NEGATIVE_TOKEN, POSITIVE_TOKEN)
]

### Prepare datasets and prompt

In [5]:
# Read the three CSV files stored under DATASETS_DIR into DataFrames:
# test.csv, test_50_50.csv and test_95_5.csv (in that order).
test, test_50_50, test_95_5 = [
    pd.read_csv(f"{DATASETS_DIR}/{split_name}.csv")
    for split_name in ("test", "test_50_50", "test_95_5")
]

In [6]:
# Fetch the prompt for the car_vs_bike experiment: the PROMPT_TYPE variant
# ("few_shot") for the "label_aspect" prompt aspect, resolved from DATA_DIR.
prompt = load_prompt(
    DATA_DIR,
    dataset_path="base_experiments/car_vs_bike",
    prompt_type=PROMPT_TYPE,
    prompt_aspect="label_aspect",
)

### Estimate CaCE

In [7]:
# Build the estimator from the loaded model, tokenizer, prompt, the
# [negative, positive] token ids and the target device.
# NOTE: "etimator" is a misspelling of "estimator"; the name is kept because
# the evaluation cells below reference it.
gt_cace_etimator = GTCaCEEstimator(model, tokenizer, prompt, TOKENS, DEVICE)

#### Test 50 50

In [8]:
# Evaluate the "label" aspect on the test_50_50 frame, with `test` as the
# second frame. The progress bar below shows 200 iterations (~2m46s).
# NOTE(review): the roles of the two frames are defined by
# GTCaCEEstimator.evaluate — confirm there.
test_50_50_results_label = gt_cace_etimator.evaluate(test_50_50, test, "label")

100%|██████████| 200/200 [02:46<00:00,  1.20it/s]


In [9]:
# Hand the label-aspect results for test_50_50 to save_results together with
# the class/aspect names and the target path under CACHE_DIR.
save_results(
    test_50_50_results_label,
    class_names=CLASS_NAMES,
    aspect_names=LABEL_ASCPECT_NAMES,
    save_path=f"{CACHE_DIR}/gt_cace/test_50_50/label_aspect.json",
)

In [10]:
# Display the raw result: a nested dict with integer keys 0/1 at both levels.
# NOTE(review): presumably outer/inner keys index classes and aspect values —
# confirm against GTCaCEEstimator.evaluate.
test_50_50_results_label

{0: {0: 0.8000492818224529, 1: 0.8002573239756748},
 1: {0: 0.989401224603202, 1: 0.9897209726610163}}

In [11]:
# Same evaluation as for the label aspect, but for the "confounding" aspect
# on the test_50_50 frame.
test_50_50_results_confounding = gt_cace_etimator.evaluate(
    test_50_50, test, "confounding"
)

100%|██████████| 200/200 [02:44<00:00,  1.21it/s]


In [12]:
# Hand the confounding-aspect results for test_50_50 to save_results together
# with the class/aspect names and the target path under CACHE_DIR.
save_results(
    test_50_50_results_confounding,
    class_names=CLASS_NAMES,
    aspect_names=CONFOUNDING_ASPECT_NAMES,
    save_path=f"{CACHE_DIR}/gt_cace/test_50_50/confounding_aspect.json",
)

In [13]:
# Display the raw nested result dict for the confounding aspect (test_50_50).
test_50_50_results_confounding

{0: {0: 0.10139692313055519, 1: 0.10140001411251433},
 1: {0: 0.10139692313055519, 1: 0.10140001411251433}}

#### Test 95 5

In [14]:
# Evaluate the "label" aspect on the test_95_5 frame, with `test` as the
# second frame.
# NOTE(review): the roles of the two frames are defined by
# GTCaCEEstimator.evaluate — confirm there.
test_95_5_results_label = gt_cace_etimator.evaluate(test_95_5, test, "label")

100%|██████████| 200/200 [02:45<00:00,  1.21it/s]


In [15]:
# Hand the label-aspect results for test_95_5 to save_results together with
# the class/aspect names and the target path under CACHE_DIR.
save_results(
    test_95_5_results_label,
    class_names=CLASS_NAMES,
    aspect_names=LABEL_ASCPECT_NAMES,
    save_path=f"{CACHE_DIR}/gt_cace/test_95_5/label_aspect.json",
)

In [16]:
# Display the raw nested result dict for the label aspect (test_95_5).
test_95_5_results_label

{0: {0: 0.853265325649179, 1: 0.8534759099558505},
 1: {0: 0.9795571831291077, 1: 0.9798835686694293}}

In [17]:
# Same evaluation as for the label aspect, but for the "confounding" aspect
# on the test_95_5 frame.
test_95_5_results_confounding = gt_cace_etimator.evaluate(
    test_95_5, test, "confounding"
)

100%|██████████| 200/200 [02:45<00:00,  1.21it/s]


In [18]:
# Hand the confounding-aspect results for test_95_5 to save_results together
# with the class/aspect names and the target path under CACHE_DIR.
save_results(
    test_95_5_results_confounding,
    class_names=CLASS_NAMES,
    aspect_names=CONFOUNDING_ASPECT_NAMES,
    save_path=f"{CACHE_DIR}/gt_cace/test_95_5/confounding_aspect.json",
)

In [19]:
# Display the raw nested result dict for the confounding aspect (test_95_5).
test_95_5_results_confounding

{0: {0: 0.0020325138169368985, 1: 0.00202442713034543},
 1: {0: 0.1560984715453182, 1: 0.15612418080690987}}