# Evaluate reasoning datasets with `logikon` debuggers

Use this notebook to

* debug a reasoning dataset with `logikon`
* customize `logikon` configuration (e.g., expert_model, debuggers)
* upload debugging results (artifacts and metrics) to the Hugging Face Hub

### Set-up

In [1]:
import getpass
import os

def _ask_for_secret(env_name):
    """Prompt (without echo) for the secret called `env_name`.

    A non-empty answer is exported to `os.environ[env_name]`; an empty answer
    leaves the environment untouched. The entered value is returned either way.
    """
    entered = getpass.getpass(f"{env_name}: ")
    if entered:
        os.environ[env_name] = entered
    return entered


# Prompts appear in this order: GitHub, OpenAI, Hugging Face Hub.
GH_ACCESS_TOKEN = _ask_for_secret("GH_ACCESS_TOKEN")
OPENAI_API_KEY = _ask_for_secret("OPENAI_API_KEY")
HUGGINGFACEHUB_API_TOKEN = _ask_for_secret("HUGGINGFACEHUB_API_TOKEN")

In [2]:
# install logikon module from github repo
# NOTE(review): the GitHub access token is interpolated into the install URL below,
# so it may appear in the recorded cell output — consider clearing the output
# before sharing or committing this notebook.

# hatch packaging
%pip install hatch

# for using local models with vllm / example 3:
# (logikon is installed from the tagged revision v0.0.1-dev0 of the GitHub repo)
%pip install -U "vllm>=0.1.7" git+https://$GH_ACCESS_TOKEN@github.com/logikon-ai/logikon.git@v0.0.1-dev0
# tokenizers is pinned to an exact version; datasets is upgraded to the latest release
%pip install "tokenizers==0.13.3"
%pip install -U datasets

Collecting hatch
  Downloading hatch-1.7.0-py3-none-any.whl (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.5/90.5 kB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tomli-w>=1.0
  Downloading tomli_w-1.0.0-py3-none-any.whl (6.0 kB)
Collecting virtualenv>=20.16.2
  Downloading virtualenv-20.24.5-py3-none-any.whl (3.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.7/3.7 MB[0m [31m88.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m
Collecting userpath~=1.7
  Downloading userpath-1.9.1-py3-none-any.whl (9.1 kB)
Collecting keyring>=23.5.0
  Downloading keyring-24.2.0-py3-none-any.whl (37 kB)
Collecting hatchling>=1.14.0
  Downloading hatchling-1.18.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.7/75.7 kB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting hyperlink>=21.0.0
  Downloading hyperlink-21.0.0-py2.py3-none-any.whl (74 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
# show which pydantic version is installed in the kernel's environment
%pip list | grep pydantic

pydantic                          1.10.13
Note: you may need to restart the kernel to use updated packages.


### Imports, data models, configs

In [2]:
import dataclasses
import datetime
import json
import os
import pprint
import tempfile
import typing
import uuid

import datasets
import huggingface_hub as hf_hub
import logikon
import networkx as nx

# report the version of the `logikon` module that was actually imported
print(f"Installed `logikon` module version: {logikon.__version__}")

Installed `logikon` module version: 0.0.1-dev0


In [3]:
@dataclasses.dataclass
class ReferenceDataset:
    """Reference dataset metadata.

    Only `path` is required; the remaining fields default to `None`, so they
    are annotated as optional rather than plain `str`.
    """
    # dataset path, passed to `datasets.load_dataset`
    path: str
    # name of the column that holds each example's id
    id_field: typing.Optional[str] = None
    # split to load
    split: typing.Optional[str] = None
    # dataset revision to load
    revision: typing.Optional[str] = None

@dataclasses.dataclass
class EvalResultRecord:
    """Evaluation result record.

    One record holds the debugging results for a single example of the
    reference dataset. Records are read from and written to the eval-results
    file as one JSON object per line.
    """
    # unique id of this record (a UUID4 string when created in this notebook)
    id: str
    # id of the scored example, taken from the reference dataset's id field
    reference_id: str
    # `ReferenceDataset` metadata as a plain dict
    reference_dataset: dict
    # language code of the example
    lang: str
    # creation timestamp as an ISO-format string
    created_date: str
    # node-link dict of the argument-map graph, or {} if no graph is available
    argmap: dict
    # parallel lists under the keys "names", "values" and "comments"
    scores: dict
    # name of the expert model used for scoring
    model: str
    # extra information; this notebook stores "logikon_config" and "logikon_version"
    metadata: dict

In [4]:
# Reference dataset to debug (keys mirror the `ReferenceDataset` fields).
REASONING_DATASET = {
    "path": 'logikon/oasst1-delib',
    "split": 'train',
    "revision": None,
    "id_field": 'message_id',
}

# Hub dataset repo that collects the evaluation results.
EVALRESULTS_DATASET_PATH = 'logikon/delib-evals'

# Fallback language code for examples that carry no "lang" value.
LANG = "en"

# Keyword arguments for `logikon.DebugConfig`.
DEBUG_CONFIG = {
    "artifacts": ["networkx_graph"],
    "metrics": ["argmap_size", "argmap_avg_katz_centrality", "argmap_attack_ratio"],
    "expert_model": "Open-Orca/OpenOrca-Platypus2-13B",
    "llm_framework": "VLLM",
    # alternative: OpenAI-hosted expert model
    # "expert_model": "gpt-3.5-turbo-instruct",
    # "llm_framework": "OpenAI",
}

In [5]:
# Turn the plain-dict settings into their typed counterparts.
reference_dataset = ReferenceDataset(**REASONING_DATASET)
config = logikon.DebugConfig(**DEBUG_CONFIG)

# Where the results for this (expert model, split) combination live inside the
# eval-results dataset repo. The subfolder is a path *inside a Hub repo*, so it
# is joined with "/" explicitly: `os.path.join` would insert a backslash on
# Windows and produce a different location.
evalresults_dataset = dict(
    path=EVALRESULTS_DATASET_PATH,
    # "/" in the model name is replaced so the name is usable as a file name
    filename=f"{config.expert_model.replace('/','_')}_{reference_dataset.split}.jsonl",
    subfolder="/".join(("data", reference_dataset.path)),
)

In [6]:
# echo the three resolved settings objects, one per line
for _settings in (reference_dataset, config, evalresults_dataset):
    print(_settings)

ReferenceDataset(path='logikon/oasst1-delib', id_field='message_id', split='train', revision=None)
expert_model='Open-Orca/OpenOrca-Platypus2-13B' expert_model_kwargs={'temperature': 0.7} llm_framework='VLLM' generation_kwargs=None inputs=[] metrics=['argmap_size', 'argmap_avg_katz_centrality', 'argmap_attack_ratio'] artifacts=['networkx_graph'] report_to=[]
{'path': 'logikon/delib-evals', 'filename': 'Open-Orca_OpenOrca-Platypus2-13B_train.jsonl', 'subfolder': 'data/logikon/oasst1-delib'}


### Load reference dataset

In [7]:
# authenticate against the Hugging Face Hub with the token stored in the environment
hf_hub.login(token=os.environ["HUGGINGFACEHUB_API_TOKEN"])

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [8]:
# fetch the configured split and revision of the reference dataset
ref_ds = datasets.load_dataset(
    path=reference_dataset.path,
    split=reference_dataset.split,
    revision=reference_dataset.revision,
)
# trailing expression: display the dataset summary
ref_ds

Downloading readme:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/162k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/46.4k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/90 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/6 [00:00<?, ? examples/s]

Dataset({
    features: ['message_id', 'parent_id', 'user_id', 'created_date', 'text', 'role', 'lang', 'review_count', 'review_result', 'deleted', 'rank', 'synthetic', 'model_name', 'detoxify', 'message_tree_id', 'tree_state', 'emojis', 'labels', 'history'],
    num_rows: 90
})

In [9]:
# optional: check ref dataset features
# ref_ds.features

In [10]:
# fail early if the configured id column is missing from the loaded dataset
assert reference_dataset.id_field in ref_ds.features, f"Reference dataset must have {reference_dataset.id_field} field."

In [17]:
# download the eval results dataset (if exists)
eval_results = []

# Path of the results file inside the dataset repo. Repo paths are joined with
# "/" explicitly because `os.path.join` is platform-dependent.
eval_file_in_repo = "/".join((evalresults_dataset["subfolder"], evalresults_dataset["filename"]))

if hf_hub.file_exists(
    evalresults_dataset["path"],
    eval_file_in_repo,
    repo_type="dataset",
    token=os.environ["HUGGINGFACEHUB_API_TOKEN"],
):

    eval_file = hf_hub.hf_hub_download(
        evalresults_dataset["path"],
        filename=evalresults_dataset["filename"],
        subfolder=evalresults_dataset["subfolder"],
        repo_type="dataset",
        # use the same credentials as the existence check above
        token=os.environ["HUGGINGFACEHUB_API_TOKEN"],
    )

    # The file is written as UTF-8 JSON Lines by the save cell, so read it the
    # same way instead of relying on the platform's default encoding.
    with open(eval_file, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # tolerate blank lines rather than failing in json.loads
                continue
            record = EvalResultRecord(**json.loads(line))
            eval_results.append(record)

    print(f"Loaded {len(eval_results)} eval results.")

else:
    print("No eval results loaded.")
    


Downloading (…)pus2-13B_train.jsonl:   0%|          | 0.00/286k [00:00<?, ?B/s]

Loaded 66 eval results.


In [26]:
# iterate over the reference dataset:
# - compute the metrics for records not in the eval results dataset
# - update the eval results dataset

# Examples whose prompt and completion together exceed this many characters are skipped.
MAX_INPUT_CHARS = 900

# Reference ids that already have an eval record. A set makes the per-example
# check constant-time instead of rescanning `eval_results` on every iteration.
scored_reference_ids = {r.reference_id for r in eval_results}

for enum, example in enumerate(ref_ds):  # ref_ds.select(list(range(60)))

    reference_id = example[reference_dataset.id_field]

    if reference_id in scored_reference_ids:
        print(f"Skipping {reference_id}")
        continue

    if len(example["history"]) + len(example["text"]) > MAX_INPUT_CHARS:
        print(f"Too long: Skipping {reference_id}")
        continue

    print(f"Scoring example {enum} of {len(ref_ds)}.\n")
    print(example["history"])
    print(example["text"])
    debug_results = logikon.score(
        prompt=example["history"],
        completion=example["text"],
        config=config,
    )

    # Take the "networkx_graph" artifact (if any) and convert it to a
    # node-link dict; fall back to an empty dict otherwise.
    argmap = next((a.data for a in debug_results.artifacts if a.id == "networkx_graph"), None)
    argmap = nx.node_link_data(argmap) if argmap else {}

    # Store the scores column-wise: three parallel lists, one entry per score.
    score_items = list(debug_results.scores)
    scores = {
        "names": [s.id for s in score_items],
        "values": [s.value for s in score_items],
        "comments": [s.comment for s in score_items],
    }

    eval_record = EvalResultRecord(
        id=str(uuid.uuid4()),
        reference_id=reference_id,
        reference_dataset=dataclasses.asdict(reference_dataset),
        lang=example.get("lang", LANG),
        created_date=datetime.datetime.now().isoformat(),
        argmap=argmap,
        scores=scores,
        model=config.expert_model,
        metadata={"logikon_config": config.dict(), "logikon_version": logikon.__version__},
    )

    eval_results.append(eval_record)
    # keep the lookup set in sync so a repeated id later in the dataset is skipped
    scored_reference_ids.add(reference_id)

Skipping 7c2f230c-3153-4baf-b938-445dac1ddd56
Skipping 98c5903b-5f7b-4d86-bb57-785530d1876c
Skipping 8aaeeb64-585a-428b-a3d7-87fd6ab22135
Skipping 876e80cf-43a7-4fa4-b4d2-86ebf46f222c
Skipping 0fafd61d-483b-4829-84c6-b210697b124b
Skipping 3024035e-dbf2-49bb-8a63-198d81a82b08
Skipping de110e4e-a707-4293-bbba-d12cd30b15dc
Skipping 0a1871fd-35aa-4dff-8334-7041caf58796
Skipping 6e9b391d-9d30-4684-8c19-ef0654d66007
Skipping 512a243b-116b-4c9a-bb3a-e0e3a2de0ceb
Skipping fe97f6de-fcb3-4437-baf8-bb61668f382b
Skipping 514615b3-605c-45db-be88-712eb91109b2
Skipping 7f0a8696-b052-470a-a24b-32557d24cc9e
Skipping d80c6b1b-4c50-4d07-a20e-56476fc6e4ce
Skipping e140aa57-5ca4-4140-9e98-2bf6db51778e
Skipping f4d2d77a-03d1-493a-9baa-6b881c67042b
Skipping 0b07d0fe-bb2f-4978-ac43-79714c367578
Skipping c5b6c380-d24d-475b-89a1-5ccd672f1f08
Skipping c9bd7ca4-3ddf-4238-8a16-2e6774fca67c
Skipping 8731ef6e-baa8-4015-a47c-5fd41c3e2f81
Skipping f670491c-9762-4e04-9409-f587cfff19eb
Skipping 78becb18-c4c7-47f5-b2b2-1

Processed prompts: 100%|██████████| 1/1 [00:04<00:00,  4.93s/it]



[1m> Finished chain.[0m
> Answer: Which are the advantages of using Linux over Windows?


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mYou are a helpful, honest and knowledgeable AI assistant with expertise in critical thinking and argumentation analysis. Always answer as helpfully as possible.

# Your Assignment

Determine whether the replies are answers to a question.

Read the following question and the replies carefully to determine if the replies match the question.

Q: "What do you think: Which are the advantages of using Linux over Windows?"

Replies
Ann: "Yes, absolutely."
Bob: "No, not at all."

What applies? 

(A) Ann's and Bob's replies answer the question. 
(B) Neither Ann's nor Bob's reply answers the question.

Don't provide alternatives, comments or explanations. Just answer with A/B.
[0m


Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  9.70it/s]



[1m> Finished chain.[0m
> Answer: B


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
You are a helpful, honest and knowledgeable AI assistant with expertise in critical thinking and argumentation analysis. Always answer as helpfully as possible.

# Your Assignment

State the alternative answers to a TEXT's overarching question.

# Inputs

Use the following inputs (a QUESTION and a TEXT) to solve your assignment.

QUESTION:
:::
Which are the advantages of using Linux over Windows?
:::

TEXT:
:::
prompter:
What are the advantages of using Linux over Windows?

assistant:
Linux can be more stable and use less resources than Windows, depending on the distribution you choose. Its open-source nature means that it's harder for companies to sneak in unwanted features, such as tracking or advertisements. Many Linux distributions offer a centralized package manager ("app store"), which makes it far less likely that a user will download an infected copy of some 

Processed prompts: 100%|██████████| 1/1 [00:03<00:00,  3.19s/it]



[1m> Finished chain.[0m
> Answer: 
1. Linux can be more stable and use fewer resources than Windows.
2. Its open-source nature makes it harder for companies to sneak in unwanted features.
3. Many Linux distributions offer a centralized package manager for safer software downloads.
4. The Linux distribution most often recommended for first-timers is Ubuntu.

[1m> Finished chain.[0m


[1m> Entering new InformalArgMapChain chain...[0m
###
Mining arguments at depth 1.
 Number of target nodes: 4.
 Number of nodes in argmap: 4.
###
### Processing target node 0 of 4 at depth 1. ###


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mYou are a helpful, honest and knowledgeable AI assistant with expertise in critical thinking and argumentation analysis. Always answer as helpfully as possible.

Identify a text span that represents a reason for a given claim. 

Read the TEXT and CLAIM carefully.

TEXT
:::
There are many different Linux distributions, each with 

Processed prompts: 100%|██████████| 1/1 [00:15<00:00, 15.35s/it]



[1m> Finished chain.[0m
> Answer: 
The text does not provide a strong reason for the claim that Linux can be more stable and use fewer resources than Windows. The text discusses multiple Linux distributions and recommends Ubuntu for beginners, but does not directly compare stability or resource usage to Windows.

It is important to note that while the text discusses Linux distributions, it doesn't directly compare them with Windows. Therefore, I cannot provide a supporting quote for the claim from the text.

Please provide more information or a different text if you'd like me to find a reason for the claim.

This prompt seems to have mixed up the contexts of Linux and Windows. They are separate operating systems. The text discusses different Linux distributions, but not their stability or resource usage compared to Windows. The claim about Linux stability and resource usage is not supported by the text. If you want to compare stability and resource usage between Linux and Windows, p

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# print every eval record as one JSON document per line
for _serialized in map(json.dumps, map(dataclasses.asdict, eval_results)):
    print(_serialized)

{"id": "f21dd97f-0388-4f59-90b1-91133e9486a4", "reference_id": "7c2f230c-3153-4baf-b938-445dac1ddd56", "reference_dataset": {"path": "logikon/oasst1-delib", "id_field": "message_id", "split": "train", "revision": null}, "lang": "en", "created_date": "2023-09-28T07:26:08.706283", "argmap": {"directed": true, "multigraph": false, "graph": {}, "nodes": [{"text": "Open Assistant is an open-source alternative to ChatGPT, and it allows for contributions from volunteers.", "label": "Claim-1", "annotations": [], "nodeType": "proposition", "id": "7a23a75a-a452-4fab-960b-6795b8004920"}, {"text": "Open Assistant is free to use and modify, and its source code will be available to the public.", "label": "Claim-2", "annotations": [], "nodeType": "proposition", "id": "48e5f65f-78cc-4f62-a0c2-15b19a7e0c03"}, {"text": "ChatGPT is a proprietary product with limited free access, while Open Assistant will be entirely free when released.", "label": "Claim-3", "annotations": [], "nodeType": "proposition", "

In [None]:
# save the eval results dataset

# Serialise all records as UTF-8 JSON Lines (one JSON object per line) in
# memory and hand the bytes to `upload_file` directly. This avoids re-opening a
# still-open NamedTemporaryFile by name, which does not work on Windows.
jsonl_payload = "".join(
    json.dumps(dataclasses.asdict(record)) + "\n" for record in eval_results
).encode("utf-8")

api = hf_hub.HfApi()
api.upload_file(
    path_or_fileobj=jsonl_payload,
    # repo paths are joined with "/" explicitly; `os.path.join` is platform-dependent
    path_in_repo="/".join((evalresults_dataset["subfolder"], evalresults_dataset["filename"])),
    repo_id=evalresults_dataset["path"],
    repo_type="dataset",
    token=os.environ["HUGGINGFACEHUB_API_TOKEN"],
)


In [16]:
# check whether eval results have been uploaded

# reload the eval-results dataset from the hub
ds_test = datasets.load_dataset(path=evalresults_dataset["path"])
# trailing expression: display the splits and their row counts
ds_test

Downloading readme:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/286k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/17.7k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'reference_id', 'reference_dataset', 'lang', 'created_date', 'argmap', 'scores', 'model', 'metadata'],
        num_rows: 66
    })
    validation: Dataset({
        features: ['id', 'reference_id', 'reference_dataset', 'lang', 'created_date', 'argmap', 'scores', 'model', 'metadata'],
        num_rows: 6
    })
})

In [16]:
# inspect the feature schema of the validation split
ds_test["validation"].features

{'id': Value(dtype='string', id=None),
 'reference_id': Value(dtype='string', id=None),
 'reference_dataset': {'path': Value(dtype='string', id=None),
  'id_field': Value(dtype='string', id=None),
  'split': Value(dtype='string', id=None),
  'revision': Value(dtype='null', id=None)},
 'lang': Value(dtype='string', id=None),
 'created_date': Value(dtype='string', id=None),
 'argmap': {'directed': Value(dtype='bool', id=None),
  'multigraph': Value(dtype='bool', id=None),
  'graph': {},
  'nodes': [{'text': Value(dtype='string', id=None),
    'label': Value(dtype='string', id=None),
    'annotations': [{'start': Value(dtype='int64', id=None),
      'end': Value(dtype='int64', id=None)}],
    'nodeType': Value(dtype='string', id=None),
    'id': Value(dtype='string', id=None)}],
  'links': [{'valence': Value(dtype='string', id=None),
    'source': Value(dtype='string', id=None),
    'target': Value(dtype='string', id=None)}]},
 'scores': {'names': Sequence(feature=Value(dtype='string', id

In [17]:
# NOTE(review): pandas is imported here rather than in the imports cell at the top.
import pandas as pd

# Wrap the validation split of the reloaded eval results in a DataFrame.
# NOTE(review): the cell ends with an assignment, so it displays nothing by itself.
df = pd.DataFrame(ds_test["validation"])

0    {'names': ['argmap_size', 'argmap_attack_ratio...
1    {'names': ['argmap_size', 'argmap_attack_ratio...
2    {'names': ['argmap_size', 'argmap_attack_ratio...
3    {'names': ['argmap_size', 'argmap_attack_ratio...
4    {'names': ['argmap_size', 'argmap_attack_ratio...
5    {'names': ['argmap_size', 'argmap_attack_ratio...
Name: scores, dtype: object

In [21]:
score_name = "argmap_avg_katz_centrality"


def _score_value(score_table):
    """Return the value stored for `score_name` in one row's scores dict.

    The row holds parallel "names" and "values" lists; the value at the
    position of the first matching name is returned.
    """
    position = score_table["names"].index(score_name)
    return score_table["values"][position]


# one value per eval record, shown as the cell output
df["scores"].apply(_score_value)

0    0.447214
1    1.000000
2    1.000000
3    0.500000
4    0.408248
5    0.575593
Name: scores, dtype: float64