In [None]:
import json

import matplotlib.pyplot as plt

In [None]:
# Accumulators for the loss curves; `steps` records the step of every evaluation entry.
instruct_eval_loss, instruct_train_loss, steps = [], [], []
path_to_save_metrics = "../../../results/llama3_results/instruct/metrics"

# Read the trainer state written next to the other metrics of this run.
with open(path_to_save_metrics + "/trainer_state.json") as f:
    instruct_metrics = json.load(f)
log_history = instruct_metrics["log_history"]

# An entry is treated as an evaluation record when it has "eval_loss",
# otherwise as a training record when it has "loss"; anything else is skipped.
for log in log_history:
    if "eval_loss" in log:
        instruct_eval_loss.append(log["eval_loss"])
        steps.append(log["step"])
        continue
    if "loss" in log:
        instruct_train_loss.append(log["loss"])

In [None]:
# Both loss curves are plotted against `steps`, so all three lists must have the same length.
assert (
    len(instruct_train_loss) == len(instruct_eval_loss) == len(steps)
), "Length must match"

In [None]:
# Instruct finetuning metrics: evaluation and training loss plotted over the logged steps.
fig, ax = plt.subplots()

ax.plot(steps, instruct_eval_loss, "r-", label="Evaluation loss")
ax.plot(steps, instruct_train_loss, "g-", label="Train loss")
ax.set_xlabel("Steps")
ax.set_ylabel("Loss", color="black")
ax.set_title("Llama 3 instruct")
ax.legend()

# Caption describing the run; y = -0.05 places it just below the figure canvas.
txt = "lr=2.0e-05, batch_size=4, epochs=1, gradient_clip=1.0 \n Training duration: 370 minutes"
fig.text(0.5, -0.05, txt, wrap=True, horizontalalignment="center", fontsize=12)

# bbox_inches="tight" expands the saved area to every artist, so the caption
# below the canvas is written to the PNG instead of being cut off.
fig.savefig(path_to_save_metrics + "/graph_1_epoch.png", bbox_inches="tight")
plt.show()

In [None]:
import torch
from Scripts.llama_model_wrapper import InstructModelWrapper
from Scripts.load_dataset import load_dataset

In [None]:
# Keyword arguments unpacked into InstructModelWrapper when the wrapper is built.
model_kwargs = dict(
    path="../../../results/llama3_results/instruct/model.nosync",
    tokenizer_path="meta-llama/Meta-Llama-3-8B-Instruct",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

In [None]:
# Build the wrapper from `model_kwargs`. Later cells use its `.model`, `.tokenizer`,
# `.terminators`, `.create_test_messages` and `.tokenize_messages` members.
instruct_model_wrapper = InstructModelWrapper(**model_kwargs)

In [None]:
# Pick the accelerator that is actually present instead of unconditionally
# overriding the detection with "mps", which breaks on machines without
# Apple's Metal backend. Preference order: CUDA, then MPS, then CPU.
# The value is kept as a plain string, like the previous effective value "mps".
_mps_backend = getattr(torch.backends, "mps", None)  # absent on old torch builds
if torch.cuda.is_available():
    device = "cuda"
elif _mps_backend is not None and _mps_backend.is_available():
    device = "mps"
else:
    device = "cpu"

In [None]:
# Load the 10kGNAD CSV files via the project's load_dataset helper. It returns
# two values; only the second one (presumably the test split — confirm against
# Scripts.load_dataset) is kept, the first is discarded.
_, test_ds = load_dataset(
    "../../../../German_newspaper_articles/10kGNAD/train.csv",
    "../../../../German_newspaper_articles/10kGNAD/test.csv",
)

In [None]:
# Run the wrapper's create_test_messages over the dataset and drop the "text" column.
# NOTE(review): test_ds is reassigned and "text" is removed, so re-running this cell
# on the already-mapped dataset will presumably fail — restart from the load cell.
test_ds = test_ds.map(
    instruct_model_wrapper.create_test_messages, remove_columns=["text"]
)

In [None]:
# Run the wrapper's tokenize_messages over the dataset; the generation loop below
# reads the resulting "input_ids" field (presumably added here — confirm in the wrapper).
test_ds = test_ds.map(instruct_model_wrapper.tokenize_messages)

In [None]:
# Switch the dataset's output format to torch with tensors created on `device`
# (in-place call; assumes test_ds is a Hugging Face Dataset — TODO confirm).
test_ds.set_format("torch", device=device)

In [None]:
# Generate one answer per test sample and tally the hits overall and per category.
correct = 0
# Number of samples per category whose label appears in the generated response.
correct_dict = {
    "Web": 0,
    "International": 0,
    "Etat": 0,
    "Wirtschaft": 0,
    "Panorama": 0,
    "Sport": 0,
    "Wissenschaft": 0,
    "Kultur": 0,
    "Inland": 0,
}
y_true = []  # gold labels, in dataset order
y_pred = []  # raw decoded responses, in dataset order
wrong = []  # NOTE(review): never filled in this cell; kept in case other cells reference it.
instruct_model_wrapper.model.eval()
for i, sample in enumerate(test_ds):
    prompt_ids = sample["input_ids"]
    # do_sample=False selects greedy decoding. `temperature` and `top_p` only
    # apply to sampling, so passing them alongside do_sample=False had no effect
    # on the output and merely contradicted the decoding mode; they are omitted.
    outputs = instruct_model_wrapper.model.generate(
        prompt_ids,
        max_new_tokens=128,
        eos_token_id=instruct_model_wrapper.terminators,
        do_sample=False,
    )
    # Keep only the tokens that follow the prompt, i.e. the generated answer.
    response = outputs[0][prompt_ids.shape[-1] :]
    response = instruct_model_wrapper.tokenizer.decode(
        response, skip_special_tokens=True
    )
    y_true.append(sample["label"])
    y_pred.append(response)
    # A sample counts as correct when its label occurs anywhere in the response
    # (substring match, not exact equality).
    if sample["label"] in response:
        correct += 1
        correct_dict[sample["label"]] += 1
        print(f"At {i}: {response}")

In [None]:
# The predictions must use exactly the gold label vocabulary. Comparing only the
# set sizes would let one unexpected prediction string mask a category that was
# never predicted, so the sets themselves are compared.
assert set(y_true) == set(y_pred), "Labels are not the same"

In [None]:
from collections import Counter

import matplotlib.pyplot as plt

In [None]:
# Count how many test samples carry each label; the bare name on the last line
# makes the notebook display the resulting Counter.
label_counts = Counter(test_ds["label"])
label_counts

In [None]:
# For every label seen in the test set: correctly classified samples divided by
# the number of samples carrying that label.
labels = list(label_counts.keys())
differences = {
    label: correct_dict[label] / label_counts[label] for label in labels
}

In [None]:
path_to_save_metrics = "../../../results/llama3_results/instruct/metrics"

# Per-category accuracy in percent; labels and values come from the same dict,
# so they share its iteration order.
difference_labels = list(differences.keys())
difference_values = [value * 100 for value in differences.values()]

fig, ax = plt.subplots()
ax.bar(difference_labels, difference_values, 0.6)
ax.set_title("correct per category")
ax.set_xlabel("category")
ax.set_ylabel("accuracy in %")
plt.setp(ax.get_xticklabels(), rotation=30, horizontalalignment="right")

# bbox_inches="tight" keeps the rotated tick labels and the x-axis label inside
# the saved image instead of relying on the default bottom margin.
fig.savefig(path_to_save_metrics + "/test_evaluation.png", bbox_inches="tight")

plt.show()

In [None]:
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    confusion_matrix,
    f1_score,
)

In [None]:
# Fixed category order, shared by the matrix rows/columns and the axis tick labels.
class_labels = [
    "Web",
    "International",
    "Etat",
    "Wirtschaft",
    "Panorama",
    "Sport",
    "Wissenschaft",
    "Kultur",
    "Inland",
]
cm = confusion_matrix(y_true, y_pred, labels=class_labels)

# Pass the category names to the display; without display_labels the axes show
# the positional indices 0..8, which makes the matrix hard to read.
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
# Rotate the x tick labels so the longer category names do not overlap.
disp.plot(xticks_rotation=45)

In [None]:
# Exact-match accuracy: y_pred holds the raw decoded responses, so a prediction
# only counts here when the whole response equals the label string (stricter than
# the substring check used while filling correct_dict).
accuracy_score(y_true, y_pred)

In [None]:
# F1 over the same exact-match predictions, averaged across classes with
# average="weighted" (per-class scores weighted by class support).
f1_score(y_true, y_pred, average="weighted")