# Experiments: Conversational Grounding LLM Prediction

This Jupyter notebook contains the code for the experiments reported in the paper "Bridging Information Gaps in Dialogues With Grounded Exchanges Using Knowledge Graphs", submitted to SIGDIAL 2024.

### Imports

In [None]:
import os
import regex as re
import pandas as pd
from openai import OpenAI
from groq import Groq
import time
import numpy as np
import json
from deepdiff import DeepDiff
import dotenv
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
from collections import Counter
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, precision_recall_fscore_support, classification_report
import ast

### Load Dialogue Dataset

In [None]:
# Read the annotated dialogue corpus; the first spreadsheet row holds the column names.
file_path = 'dialogue_corpus/annotated_dialogue_corpus.xlsx'
df = pd.read_excel(file_path, header=0)
# Normalise topic names to lower case (later cells address conversations
# with lower-case topic keys such as 'history').
df = df.assign(topic=df['topic'].str.lower())
# Cell output: (number of rows, number of columns)
df.shape

In [None]:
def create_conversation_dictionary(df):
    '''Split the corpus into one dataframe per (channel, topic) conversation.

    Args:
        df: dataframe with at least the columns 'room_name' and 'topic'.

    Returns:
        dict mapping (room_name, topic) tuples to the matching rows of ``df``
        (original index preserved). Combinations without any row are left out.
    '''
    conversations = {}

    for room in df['room_name'].unique():
        in_room = df['room_name'] == room
        for subject in df['topic'].unique():
            subset = df[in_room & (df['topic'] == subject)]
            # Skip channel/topic pairs that never occur together
            if subset.empty:
                continue
            conversations[(room, subject)] = subset

    return conversations

# Group the loaded corpus (`df`) into one dataframe per (channel, topic) conversation.
df_conv_dict = create_conversation_dictionary(df)

# Optional inspection of every conversation (uncomment to print them):
#for (channel, topic), dataset in df_conv_dict.items():
#    print(f"Channel: {channel}, Topic: {topic}")
#    print(dataset)
#    print("\n")
# Cell output: number of (channel, topic) conversations found.
len(df_conv_dict)

In [None]:
# Split dialogue into list of input subdialogues for LLM
# Create list with indices for the LLM predictions
def create_prompt_input_dict(df_conv_dict):
    '''Build the prompt inputs for every labelled utterance of every conversation.

    Args:
        df_conv_dict: dict mapping (channel, topic) to a conversation dataframe
            with the columns 'role', 'message', 'grounding_label' and
            'grounded_knowledge'. Row order is the dialogue order.

    Returns:
        dict mapping (channel, topic) to a list with one dict per utterance whose
        grounding label is 'explicit', 'implicit' or 'clarification'. Each dict
        holds:
            'label_index': dataframe index of the labelled utterance,
            'label_indices': all labelled indices of the conversation,
            'dialogue': "role: message" strings up to and including the utterance,
            'grounding_label' / 'grounded_knowledge': values of that row,
            'system_knowledge': last 'grounded_knowledge' value of the
                conversation that is not "-".
        Conversations without labelled utterances map to an empty list.

    Raises:
        IndexError: if a conversation has labelled utterances but no
            'grounded_knowledge' value other than "-".
    '''
    labelled_values = ["explicit", "implicit", "clarification"]
    columns = ["role", "message", "grounding_label", "grounded_knowledge"]

    prompt_input_dict = {}
    for (channel, topic), df in df_conv_dict.items():
        entries = []
        prompt_input_dict[(channel, topic)] = entries

        label_indices = df[df['grounding_label'].isin(labelled_values)].index.tolist()
        if not label_indices:
            continue

        # Identical for every utterance of the conversation, so look it up once
        system_knowledge = df[df["grounded_knowledge"] != "-"]["grounded_knowledge"].iloc[-1]

        # Single pass over the dialogue: grow the history and snapshot it at
        # every labelled utterance instead of re-walking the dataframe per label
        pending = set(label_indices)
        dialogue_so_far = []
        for index, row in df[columns].iterrows():
            dialogue_so_far.append(f"{row['role']}: {row['message']}")
            if index in pending:
                pending.discard(index)
                entries.append({
                    'label_index': index,
                    'label_indices': label_indices,
                    'dialogue': list(dialogue_so_far),  # copy: history keeps growing
                    'grounding_label': row['grounding_label'],
                    'grounded_knowledge': row['grounded_knowledge'],
                    'system_knowledge': system_knowledge,
                })
    return prompt_input_dict

# Build the prompt inputs (one entry per labelled utterance) for every conversation.
prompt_input_dict = create_prompt_input_dict(df_conv_dict)
# Cell output: number of (channel, topic) keys.
len(prompt_input_dict)

### Define Prompts for LLM in Chat Format

* CLS = classification of grounding label
* GK = prediction of grounded knowledge subset

In [None]:
# CLS prompt building blocks
def get_CLS_system_prompt() -> str:
    '''Return the system instruction for the grounding-label classification (CLS) task.'''
    # The continuation lines keep their four leading spaces: they are part of
    # the prompt text exactly as it is sent to the model.
    instruction = """Predict the grounding label for the last response in the 'Input Dialogue:'. The label indicates whether the knowledge in the dialogue was accepted. Choose one of the following labels:
    explicit: The response confirms understanding or acceptance (e.g., 'okay', 'thanks', 'alright', 'nice') without seeking clarification.
    clarification: The response seeks clarification about a previous dialogue snippet.
    implicit: The response moves the conversation forward without explicitly confirming or seeking clarification."""
    return instruction

    #clarification: The response seeks clarification about a previous dialogue snippet.
def get_CLS_zero_shot_chat_prompt(input_dialogue: str, system_message: bool = True):
    '''Return the zero-shot chat prompt that asks for the grounding label of a dialogue.

    The result is a list of chat messages: the optional system instruction
    followed by one user message containing the dialogue.
    '''
    # NOTE(review): this template writes "Input dialogue" / "Output label" in
    # lower case while the few-shot examples use "Input Dialogue" /
    # "Output Label"; kept as is because it is runtime prompt text.
    user_turn = {"role": "user", "content": f"""Input dialogue: {input_dialogue}
Output label: """}

    if not system_message:
        return [user_turn]
    return [{"role": "system", "content": f"{get_CLS_system_prompt()}"}, user_turn]

# Few-shot examples for the CLS task, one per grounding label. Each user text
# is an example dialogue and the matching assistant text is the expected
# answer; get_CLS_few_shot_chat_prompt sends them in this order. The leading
# and trailing newlines belong to the strings.

# Example 1: label "implicit"
user_CLS_example_1 = """
Input Dialogue:
seeker: Can you give me some information about your dataset?
provider: My dataset includes information on buildings of Gothic architecture.
seeker: How tall is the Cologne Cathedral?
"""
assistant_CLS_example_1 = """
Output Label: implicit
"""
# Example 2: label "clarification"
user_CLS_example_2 = """
Input Dialogue:
provider: Monitors have different attributes like size or panel technology.
provider: There are some with an aspect ratio of 21:9.
seeker: What is aspect ratio?
"""
assistant_CLS_example_2 = """
Output Label: clarification
"""
# Example 3: label "explicit"
user_CLS_example_3 = """
Input Dialogue:
provider: An elephant's average lifespan is around 65 years.
seeker: I see, good to know.
"""
assistant_CLS_example_3 = """
Output Label: explicit
"""

def get_CLS_few_shot_chat_prompt(input_dialogue: str, system_message: bool = True) -> list:
    '''Return the few-shot chat prompt that asks for the grounding label of a dialogue.

    Message order: optional system instruction, the three example
    user/assistant pairs, then the user message with the dialogue to classify.
    '''
    few_shot_pairs = (
        (user_CLS_example_1, assistant_CLS_example_1),
        (user_CLS_example_2, assistant_CLS_example_2),
        (user_CLS_example_3, assistant_CLS_example_3),
    )

    # system instruction
    message = [{"role": "system", "content": f"{get_CLS_system_prompt()}"}] if system_message else []
    # few-shot examples
    for user_text, assistant_text in few_shot_pairs:
        message.append({"role": "user", "content": f"""{user_text}"""})
        message.append({"role": "assistant", "content": f"""{assistant_text}"""})
    # input dialogue
    message.append({"role": "user", "content": f"""Input dialogue: {input_dialogue}
Output label: """})
    return message

In [None]:
# GK prompt building blocks
def get_GK_system_prompt() -> str:
    '''Return the system instruction for the grounded-knowledge (GK) prediction task.'''
    instruction = '''Your task is to identify the knowledge items that have been grounded by the conversation partners in the 'Input Dialogue'. The items of mutually grounded knowledge must be explicitly mentioned in the dialogue. Based on the complete set of 'System Knowledge', your task is to generate the subset of knowledge items that have been grounded so far. Ensure that the output is a valid JSON-LD structure (an array of JSON objects) and only include knowledge items from the formatted 'System Knowledge'.'''
    return instruction

def get_GK_zero_shot_chat_prompt(system_knowledge: str, input_dialogue: str, system_message: bool = True) -> list:
    '''Return the zero-shot chat prompt that asks for the grounded knowledge as JSON-LD.

    The result is a list of chat messages: the optional system instruction
    followed by one user message with the system knowledge and the dialogue.
    '''
    # The indentation of the second and third line is part of the prompt text.
    user_turn = {"role": "user", "content": f"""System Knowledge: {system_knowledge}
    Input Dialogue: {input_dialogue}
    Output JSON-LD: """}

    if not system_message:
        return [user_turn]
    return [{"role": "system", "content": f"{get_GK_system_prompt()}"}, user_turn]

# Few-shot examples for the GK task. Each user text holds the complete
# 'System Knowledge' (a JSON array with table metadata followed by instance
# entries) and a dialogue; the matching assistant text holds the JSON-LD
# subset expected as output. get_GK_few_shot_chat_prompt sends them in this
# order. The leading and trailing newlines belong to the strings.
# NOTE(review): in example 3 the instance entries use the key "developer:"
# (colon inside the key) and string-valued release_year although the table
# schema declares an integer column - looks like a typo; left unchanged
# because this text is sent to the model verbatim.
user_GK_example_1 = """
System Knowledge: [{"@context": ["http://www.w3.org/ns/csvw", {"schema": "http://schema.org"}], "@id": "http://example.org/american-presidents", "url": "american-presidents.csv", "schema:description": "The table contains information about American presidents", "tableSchema": {"columns": [{"name": "name", "datatype": "string"}, {"name": "term", "datatype": "string"}, {"name": "party", "datatype": "string"}, {"name": "election_year", "datatype": "integer"}]}, "primaryKey": "name"}, {"@type": "schema:Person", "name": "Barack Obama", "party": "Democratic"}]
Input Dialogue:
seeker: Can you give me an example entry from your dataset?
provider: One of the presidents in the list is Barack Obama.
seeker: Thanks. What party does he belong to?
"""
assistant_GK_example_1 = """
Output JSON-LD: [{"@context": ["http://www.w3.org/ns/csvw", {"schema": "http://schema.org"}], "@id": "http://example.org/american-presidents", "url": "american-presidents.csv", "schema:description": "The table contains information about American presidents", "tableSchema": {"columns": [{"name": "name", "datatype": "string"}]}, "primaryKey": "name"}, {"@type": "schema:Person", "name": "Barack Obama"}]
"""
user_GK_example_2 = """
System Knowledge: [{"@context": ["http://www.w3.org/ns/csvw", {"schema": "http://schema.org"}], "@id": "http://example.org/greek-islands", "url": "greek-islands.csv", "schema:description": "The table contains information about islands in Greece", "tableSchema": {"columns": [{"name": "island", "datatype": "string"}, {"name": "area_in_km2", "datatype": "integer", "minimum": 64, "maximum": 8336}, {"name": "cluster", "datatype": "string"}]}, "primaryKey": "island"}, {"@type": "schema:Place", "island": "Crete", "area_in_km2": 8336, "cluster": "Cretan"}, {"@type": "schema:Place", "island": "Alonnisos", "area_in_km2": 64, "cluster": "Sporades"}, {"@type": "schema:Place", "island": "Lesbos", "area_in_km2": 1633, "cluster": "North Aegean Islands"}]
Input Dialogue:
provider: My dataset contains information on Greek islands. For example, there is Crete with an area of 8336 square kilometers.
provider: That makes it the largest island in Greece.
seeker: Which one is the smallest and what is its area?
"""
assistant_GK_example_2 = """
Output JSON-LD: [{"@context": ["http://www.w3.org/ns/csvw", {"schema": "http://schema.org"}], "@id": "http://example.org/greek-islands", "url": "greek-islands.csv", "schema:description": "The table contains information about islands in Greece", "tableSchema": {"columns": [{"name": "island", "datatype": "string"}, {"name": "area_in_km2", "datatype": "integer", "maximum": 8336}]}, "primaryKey": "island"}, {"@type": "schema:Place", "island": "Crete", "area_in_km2": 8336}]
"""
user_GK_example_3 = """
System Knowledge: [{"@context": ["http://www.w3.org/ns/csvw", {"schema": "http://schema.org"}], "@id": "http://example.org/android-smartphones", "url": "android-smartphones.csv", "schema:description": "The table contains information about Android smartphones", "tableSchema": {"columns": [{"name": "model", "datatype": "string"}, {"name": "developer", "datatype": "string"}, {"name": "release_year", "datatype": "integer", "minimum": 2008, "maximum": 2024}, {"name": "android_version", "datatype": "string"}]}, "primaryKey": "model"}, {"@type": "schema:Product", "model": "HTC Dream", "developer:": "HTC", "release_year": "2008"}, {"@type": "schema:Product", "model": "LG Wing", "developer:": "LG", "release_year": "2020", "android_version": "Android 10"}, {"@type": "schema:Product", "release_year": "2024"}]
Input Dialogue:
provider: I can provide technical information about Android smartphones.
provider: One column contains data about the model and another specifies its release year.
seeker: I see, good to know.
"""
assistant_GK_example_3 = """
Output JSON-LD: [{"@context": ["http://www.w3.org/ns/csvw", {"schema": "http://schema.org"}], "@id": "http://example.org/android-smartphones", "url": "android-smartphones.csv", "schema:description": "The table contains information about Android smartphones", "tableSchema": {"columns": [{"name": "model", "datatype": "string"}, {"name": "release_year", "datatype": "integer"}]}, "primaryKey": "model"}]
"""

def get_GK_few_shot_chat_prompt(system_knowledge: str, input_dialogue: str, system_message: bool = True) -> list:
    '''Return the few-shot chat prompt that asks for the grounded knowledge as JSON-LD.

    Message order: optional system instruction, the three example
    user/assistant pairs, then the user message with the system knowledge and
    the dialogue to analyse.
    '''
    few_shot_pairs = (
        (user_GK_example_1, assistant_GK_example_1),
        (user_GK_example_2, assistant_GK_example_2),
        (user_GK_example_3, assistant_GK_example_3),
    )

    # system instruction
    message = [{"role": "system", "content": f"{get_GK_system_prompt()}"}] if system_message else []
    # few-shot examples
    for user_text, assistant_text in few_shot_pairs:
        message.append({"role": "user", "content": f"""{user_text}"""})
        message.append({"role": "assistant", "content": f"""{assistant_text}"""})
    # input
    message.append({"role": "user", "content": f"""System Knowledge: {system_knowledge}
    Input Dialogue: {input_dialogue}
    Output JSON-LD: """})
    return message

### Send Prompt to LLM (OpenAI API / Groq API)

In [None]:
# Load the API keys from the local .env file; override=True lets values from
# the file replace variables that are already set in the environment.
dotenv.load_dotenv(".env", override=True)

# Client used by query_gpt. Exactly one of the two lines below should be
# active: OpenAI for the gpt models, Groq for the llama models.
client = OpenAI(api_key=os.environ['OPENAI_API_KEY']) # used for gpt models
#client = Groq(api_key=os.environ.get("GROQ_API_KEY")) # used for llama models

def query_gpt(prompt, model="gpt-3.5-turbo-0125", max_tokens=4096, seed=1, temperature=0):
    '''Send a chat-format prompt to the configured client and return the raw response.

    Args:
        prompt: list of chat messages ({"role": ..., "content": ...} dicts).
        model: model version, e.g. gpt-3.5-turbo-0125, gpt-4o, llama3-8b-8192
            or llama3-70b-8192. It must be served by the active ``client``.
        max_tokens: upper bound on the number of generated tokens.
        seed: sampling seed passed to the API.
        temperature: sampling temperature passed to the API.

    Returns:
        The chat-completion response object; the generated text is at
        ``response.choices[0].message.content``.
    '''
    # The defaults reproduce the previously hard-coded request settings, so
    # existing calls of the form query_gpt(prompt=...) behave as before.
    response = client.chat.completions.create(
        model=model,
        max_tokens=max_tokens,
        seed=seed,
        messages=prompt,  # provide prompt in chat format
        temperature=temperature)
    return response

In [None]:
# Run inference to get LLM predictions
def run_CLS_inference(df, prompt_input, n_last_turns=None):
    '''Predict a grounding label for every labelled utterance of one conversation.

    Args:
        df: conversation dataframe; it is copied and never modified.
        prompt_input: list of per-utterance dicts as built by
            create_prompt_input_dict (keys used: 'label_indices', 'dialogue').
        n_last_turns: number of dialogue turns to include before the labelled
            utterance; None sends the full dialogue history.

    Returns:
        Copy of ``df`` with the columns 'CLS_prompt' (chat prompt sent) and
        'CLS_prediction' (stripped model answer) filled at the labelled rows
        and "" elsewhere.
    '''
    df_predictions = df.copy(deep=True)
    df_predictions["CLS_prediction"] = ""
    df_predictions["CLS_prompt"] = ""

    # A conversation without labelled utterances has nothing to classify
    if not prompt_input:
        return df_predictions

    label_indices = prompt_input[0]["label_indices"] # get label indices
    # prompt_input holds one entry per label index, in the same order
    for position, idx in enumerate(label_indices):
        print("label index:", idx)
        dialogue = prompt_input[position]["dialogue"]
        if n_last_turns is not None: # provide number of last input turns before last dialogue utterance
            # explicit None check so that 0 means "only the labelled utterance"
            context_dialogue = "\n".join(dialogue[-(n_last_turns+1):])
        else: # if None: provide full dialogue history
            context_dialogue = "\n".join(dialogue)
            print(idx, label_indices, position)
        # The CLS prompt takes the dialogue only; the second parameter is the
        # boolean system_message switch, not the system knowledge.
        prompt = get_CLS_few_shot_chat_prompt(context_dialogue) # <----------------ZERO vs FEW SHOT
        print("prompt:", prompt)
        response = query_gpt(prompt=prompt)
        result = response.choices[0].message.content.strip()
        print("prediction:", result)
        print("\n")
        df_predictions.at[idx, "CLS_prompt"] = prompt
        df_predictions.at[idx, "CLS_prediction"] = result
        time.sleep(0.1) # short pause between API requests
    return df_predictions

def merge_dataframes(df_conv_dict):
    '''Stack all conversation dataframes into one dataframe with a fresh 0..n-1 index.'''
    frames = list(df_conv_dict.values())
    return pd.concat(frames, ignore_index=True)

In [None]:
run_name = "CLS_few_all_gpt35"
# Set number of n last dialogue turns
n_last_turns = None

# Conversations to classify, in processing order
cls_conversation_keys = [
    ('channel-01', 'history'), ('channel-01', 'sports'),
    ('channel-02', 'geography'), ('channel-02', 'media'),
    ('channel-03', 'media'), ('channel-03', 'nutrition'),
    ('channel-04', 'history'), ('channel-04', 'media'),
    ('channel-06', 'geography'), ('channel-06', 'media'),
    ('channel-07', 'history'), ('channel-07', 'sports'),
    ('channel-08', 'history'), ('channel-08', 'geography'),
    ('channel-09', 'history'), ('channel-09', 'geography'),
    ('channel-10', 'sports'), ('channel-10', 'nutrition'),
    ('channel-11', 'sports'), ('channel-11', 'nutrition'),
    ('channel-12', 'geography'), ('channel-12', 'media'),
    ('channel-13', 'geography'), ('channel-13', 'nutrition'),
    ('channel-14', 'sports'), ('channel-14', 'nutrition'),
]

# Replace every conversation dataframe by its copy with the CLS prediction columns
for cls_conversation_key in cls_conversation_keys:
    df_conv_dict[cls_conversation_key] = run_CLS_inference(
        df_conv_dict[cls_conversation_key],
        prompt_input_dict[cls_conversation_key],
        n_last_turns,
    )

In [None]:
# Run inference to get LLM predictions
def run_GK_inference(df, prompt_input):
    '''Predict the grounded knowledge for every labelled utterance of one conversation.

    Utterances labelled 'clarification' are skipped; no request is sent for
    them and their prediction stays "".

    Args:
        df: conversation dataframe; it is copied and never modified.
        prompt_input: list of per-utterance dicts as built by
            create_prompt_input_dict (keys used: 'label_indices', 'dialogue',
            'system_knowledge', 'grounding_label').

    Returns:
        Copy of ``df`` with the columns 'GK_prompt' (chat prompt sent) and
        'GK_prediction' (stripped model answer) filled at the predicted rows
        and "" elsewhere.
    '''
    df_predictions = df.copy(deep=True)
    df_predictions["GK_prediction"] = ""
    df_predictions["GK_prompt"] = ""

    # A conversation without labelled utterances has nothing to predict
    if not prompt_input:
        return df_predictions

    label_indices = prompt_input[0]["label_indices"] # get label indices
    # prompt_input holds one entry per label index, in the same order
    for position, idx in enumerate(label_indices):
        print("label index:", idx)
        entry = prompt_input[position]
        context_dialogue = "\n".join(entry["dialogue"]) # for GK always include full dialogue history
        system_knowledge = entry["system_knowledge"]
        grounding_label = entry["grounding_label"]
        if grounding_label != "clarification":
            # Keyword arguments: the prompt builder expects the system
            # knowledge first and the dialogue second.
            prompt = get_GK_few_shot_chat_prompt(system_knowledge=system_knowledge, input_dialogue=context_dialogue) # <--------ZERO vs FEW SHOT
            print("--> prompt:", prompt)
            response = query_gpt(prompt=prompt)
            result = response.choices[0].message.content.strip()
            print("--> prediction:", result)
            print("\n")
            df_predictions.at[idx, "GK_prompt"] = prompt
            df_predictions.at[idx, "GK_prediction"] = result
        else:
            print(grounding_label)
            print("\n")
        time.sleep(0.01) # short pause between iterations
    return df_predictions

def merge_dataframes(df_conv_dict):
    '''Stack all conversation dataframes into one dataframe with a fresh 0..n-1 index.'''
    frames = list(df_conv_dict.values())
    return pd.concat(frames, ignore_index=True)

In [None]:
run_name = "GK_few_shot_gpt35"

# Conversations to process, in processing order
gk_conversation_keys = [
    ('channel-01', 'history'), ('channel-01', 'sports'),
    ('channel-02', 'geography'), ('channel-02', 'media'),
    ('channel-03', 'media'), ('channel-03', 'nutrition'),
    ('channel-04', 'history'), ('channel-04', 'media'),
    ('channel-06', 'geography'), ('channel-06', 'media'),
    ('channel-07', 'history'), ('channel-07', 'sports'),
    ('channel-08', 'history'), ('channel-08', 'geography'),
    ('channel-09', 'history'), ('channel-09', 'geography'),
    ('channel-10', 'sports'), ('channel-10', 'nutrition'),
    ('channel-11', 'sports'), ('channel-11', 'nutrition'),
    ('channel-12', 'geography'), ('channel-12', 'media'),
    ('channel-13', 'geography'), ('channel-13', 'nutrition'),
    ('channel-14', 'sports'), ('channel-14', 'nutrition'),
]

# Replace every conversation dataframe by its copy with the GK prediction columns
for gk_conversation_key in gk_conversation_keys:
    df_conv_dict[gk_conversation_key] = run_GK_inference(
        df_conv_dict[gk_conversation_key],
        prompt_input_dict[gk_conversation_key],
    )

## Analysis CLS

In [None]:
# Names of the stored CLS runs: CLS_<zero|few>_<n1|n3|all>_<model>,
# grouped by model, then prompt type, then dialogue context size.
run_name_list = [
    f"CLS_{shots}_{context}_{model}"
    for model in ("llama3_8b", "llama3_70b", "gpt35", "gpt4o")
    for shots in ("zero", "few")
    for context in ("n1", "n3", "all")
]
# Folder that contains one <run name>.xlsx file per run
folder_name = 'clsruns'

In [None]:
def calculate_metrics(df, label_column="grounding_label", prediction_column="CLS_prediction_pp", avg_m="macro", run_name=None):
    '''Print and return accuracy, precision, recall and F1 of the grounding-label predictions.

    Only rows with a gold label (a value other than "-" or NaN in
    ``label_column``) are scored. Gold labels and predictions are taken from
    the same rows, so they stay aligned. A labelled row without a usable
    prediction ("" or NaN) is reported and counted as an error; it is not
    added as a class of its own to the averaged scores.

    Args:
        df: dataframe holding the gold labels and the predictions.
        label_column: column with the gold grounding labels.
        prediction_column: column with the (preprocessed) predicted labels.
        avg_m: averaging mode passed to the scikit-learn scorers.
        run_name: name printed in the result header. If omitted, the
            notebook-level variable ``run`` is used when it exists.

    Returns:
        dict with the unrounded values under the keys 'accuracy',
        'precision', 'recall' and 'f1', or None when ``df`` has no gold labels.
    '''
    expected_labels = {'implicit', 'explicit', 'clarification'}
    # Backward compatibility: the run name used to be read from the global loop variable `run`
    run_label = run_name if run_name is not None else globals().get("run", "")

    # Select the labelled rows once and read both columns from exactly those rows
    gold = df[label_column].replace("-", np.nan)
    has_gold = gold.notna()
    CLS_true = gold[has_gold]
    CLS_pred = df.loc[has_gold, prediction_column].replace("", np.nan)
    if CLS_true.empty:
        print("<", run_label, ">", "No gold labels found - nothing to evaluate.")
        return None

    unique_values = set(CLS_pred.dropna().unique())
    n_missing = int(CLS_pred.isna().sum())
    if unique_values != expected_labels:
        print(f"Warning: Column {prediction_column} contains invalid or incomplete values: {unique_values}")
    if n_missing:
        print(f"Warning: {n_missing} labelled utterance(s) have no usable prediction in {prediction_column}; they are counted as errors.")
    if unique_values == expected_labels and not n_missing:
        print("Labels OK.", unique_values)

    # Classes that enter the averaged scores: everything seen in the gold labels
    # or the usable predictions (what scikit-learn would pick by default)
    class_labels = list(set(CLS_true.unique()) | unique_values)
    # Placeholder for missing predictions: never equal to a gold label
    CLS_pred = CLS_pred.fillna("<missing>")

    accuracy = accuracy_score(CLS_true, CLS_pred)
    precision = precision_score(CLS_true, CLS_pred, labels=class_labels, average=avg_m, zero_division=0)
    recall = recall_score(CLS_true, CLS_pred, labels=class_labels, average=avg_m, zero_division=0)
    f1 = f1_score(CLS_true, CLS_pred, labels=class_labels, average=avg_m, zero_division=0)

    print("<", run_label, ">", "Metric Avg:", avg_m)
    print("Accuracy:", round(accuracy,2))
    print("Precision:", round(precision,2))
    print("Recall:", round(recall,2))
    print("F1 Score:", round(f1,2))
    print("")
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

for run in run_name_list:
    folder_name = 'clsruns'
    file_path = folder_name + '/' +  run +'.xlsx'
    df = pd.read_excel(file_path, header=0)

    # Regex that pulls the bare label out of free-form model output
    pattern = r'(explicit|implicit|clarification)'

    # Keep only the first label mentioned in each prediction (NaN if none is found)
    df['CLS_prediction_pp'] = df['CLS_prediction'].str.extract(pattern, expand=False)

    calculate_metrics(df=df, label_column="grounding_label", prediction_column="CLS_prediction_pp", avg_m="macro", run_name=run)

In [None]:
# Detailed calculation: per-class report for the run currently loaded in `df`
class_names = ['clarification', 'explicit', 'implicit']

# Score only rows with a gold label and read the predictions from the same rows
gold_labels = df["grounding_label"].replace("-", np.nan)
labelled_rows = gold_labels.notna()
CLS_true = gold_labels[labelled_rows]
# Same preprocessing as the metrics loop: keep the first label mentioned in the
# model output. Outputs without any label become "<missing>" and count as errors.
CLS_pred = df.loc[labelled_rows, "CLS_prediction"].str.extract(r'(explicit|implicit|clarification)', expand=False)
CLS_pred = CLS_pred.fillna("<missing>")

# Generate classification report; `labels` pins the rows of the report to
# class_names so that target_names always matches
report = classification_report(CLS_true, CLS_pred, labels=class_names, target_names=class_names, zero_division=0)

# Print the classification report
print(report)

In [None]:
# Load a single stored run to inspect which labels the extraction yields
folder_name = 'clsruns'
file_path = f"{folder_name}/CLS_zero_all_llama3_70b.xlsx"
df = pd.read_excel(file_path, header=0)

# Regex that pulls the bare label out of free-form model output
pattern = r'(explicit|implicit|clarification)'

# Keep only the first label mentioned in each prediction (NaN if none is found)
df['CLS_prediction_pp'] = df['CLS_prediction'].str.extract(pattern, expand=False)
# Cell output: distinct extracted values
set(df['CLS_prediction_pp'])

## Analysis GK

In [None]:
# Names of the stored GK runs: GK_<zero|few>_shot_<model>, grouped by model
run_name_list = [
    f"GK_{shots}_shot_{model}"
    for model in ("llama3_8b", "llama3_70b", "gpt35", "gpt4o")
    for shots in ("zero", "few")
]

In [None]:
def preprocess_predictions(df, column):
    '''Strip the output prefix and markdown code fences from the predictions in place.

    Removes the literal texts "Output JSON-LD:", "```json-ld", "```json" and
    "```" from every string in ``df[column]`` and trims surrounding whitespace
    after each removal. The dataframe is modified; nothing is returned.
    '''
    # Order matters: the longer fence markers must go before the bare "```"
    wrappers = ("Output JSON-LD:", "```json-ld", "```json", "```")

    cleaned = df[column]
    for marker in wrappers:
        cleaned = cleaned.str.replace(marker, "", regex=False).str.strip()
    df[column] = cleaned

def validate_json_ld(df, prompt_input_dict):
    """
    Score every predicted JSON-LD string against its gold annotation, row by row.

    A row is evaluated when it has a gold `grounded_knowledge` (not NaN and not
    "-") and a non-NaN `GK_prediction`. Both strings are first checked for JSON
    syntax; only when both parse are keys, scalar values and full content
    compared. All results are written into new columns of `df`.

    Parameters:
    df (pd.DataFrame): Needs the columns `grounded_knowledge`, `GK_prediction`,
        `room_name` and `topic`. Modified in place.
    prompt_input_dict (dict): Indexed as
        prompt_input_dict[(room_name, topic)][0]["system_knowledge"]; that entry
        is parsed with json.loads.

    Returns:
    pd.DataFrame: The same `df` with the result columns added. Cells that were
    not evaluated keep the "" placeholder.

    Raises:
    KeyError: If a (room_name, topic) pair is missing from `prompt_input_dict`.
    ValueError: If a `system_knowledge` entry is not valid JSON.
    """
    def is_valid_json(json_str):
        """Return 1 if `json_str` parses as JSON, otherwise 0."""
        # NOTE(review): only ValueError (which includes json.JSONDecodeError) is
        # caught; a non-string cell would raise TypeError from json.loads.
        try:
            json.loads(json_str)
            return 1
        except ValueError:
            return 0

    def get_all_keys(data, keys=None):
        """Collect the dict keys found at any nesting depth of `data` into a set."""
        if keys is None:
            keys = set()

        if isinstance(data, dict):
            for key, value in data.items():
                keys.add(key)
                get_all_keys(value, keys)
        elif isinstance(data, list):
            for item in data:
                get_all_keys(item, keys)
        return keys

    def has_expected_schema(json_str, schema_keys):
        """Return 1 if every key used in `json_str` is in `schema_keys` (subset), else 0."""
        try:
            data = json.loads(json_str)
            data_keys = get_all_keys(data)
            return int(data_keys.issubset(schema_keys))
        except ValueError:
            return 0
        
    def has_equal_schema(json_str, schema_keys):
        """Return 1 if the key set of `json_str` equals `schema_keys` exactly, else 0."""
        try:
            data = json.loads(json_str)
            data_keys = get_all_keys(data)
            return int(data_keys == schema_keys)
        except ValueError:
            return 0
            
    def get_all_values(data, values=None):
        """Collect the scalar leaves (str, int, float, bool, None) of `data` into a set."""
        # Because the result is a set, equal-hashing scalars such as 1 and True
        # are stored only once.
        if values is None:
            values = set()
        
        if isinstance(data, dict):
            for value in data.values():
                get_all_values(value, values)
        elif isinstance(data, list):
            for item in data:
                get_all_values(item, values)
        else:
            if isinstance(data, (str, int, float, bool, type(None))):
                values.add(data)
        return values

    def has_expected_values(json_str, expected_values):
        """Return 1 if every scalar in `json_str` is in `expected_values` (subset), else 0."""
        try:
            data = json.loads(json_str)
            values = get_all_values(data)
            return int(values.issubset(expected_values))
        except ValueError:
            return 0
        
    def has_equal_values(json_str, expected_values):
        """Return 1 if the scalar set of `json_str` equals `expected_values` exactly, else 0."""
        try:
            data = json.loads(json_str)
            values = get_all_values(data)
            return int(values == expected_values)
        except ValueError:
            return 0
    
    def set_difference(json_str, expected_set, set_type=None):
        """
        Return the keys ("keys") or scalars ("values") of `json_str` that are
        not in `expected_set`. Raises ValueError on invalid JSON; returns None
        for any other `set_type`.
        """
        if set_type == "keys":
            try:
                data = json.loads(json_str)
                predicted_set = get_all_keys(data)
                difference = predicted_set.difference(expected_set)
                return difference
            except ValueError:
                raise ValueError("Invalid JSON input")
        elif set_type == "values":
            try:
                data = json.loads(json_str)
                predicted_set = get_all_values(data)
                difference = predicted_set.difference(expected_set)
                return difference
            except ValueError:
                raise ValueError("Invalid JSON input")
    
    def compare_json_ld(json1, json2):
        """Return 1 if both strings parse to equal Python objects (list order matters), else 0."""
        try:
            data1 = json.loads(json1)
            data2 = json.loads(json2)
            return int(data1 == data2)
        except ValueError:
            return 0

    def get_json_diff(json1, json2):
        """Return the order-insensitive DeepDiff of the two parsed documents, None if it is empty, or "Invalid JSON"."""
        try:
            data1 = json.loads(json1)
            data2 = json.loads(json2)
            diff = DeepDiff(data1, data2, ignore_order=True)
            return diff if diff else None
        except ValueError:
            return "Invalid JSON"
    
    # Initialize new columns with default values ("" marks "not evaluated").
    # Columns without the PRED infix describe the gold annotation, columns with
    # it describe the model prediction.
    df['GK_valid'] = ""
    
    df['GK_PRED_valid'] = ""
    df['GK_schema_valid'] = ""
    df['GK_PRED_schema_valid'] = ""
    df['GK_PRED_schema_equal'] = ""
    df['GK_full_schema_valid'] = ""
    df['GK_PRED_full_schema_valid'] = ""
    df['GK_PRED_full_schema_diff'] = ""
    
    df['GK_values_valid'] = ""
    df['GK_PRED_values_valid'] = ""
    df['GK_PRED_values_equal'] = ""
    df['GK_all_values_valid'] = ""
    df['GK_PRED_all_values_valid'] = ""
    df['GK_PRED_all_values_diff'] = ""
    
    df['content_equal'] = ""
    df['json_diff'] = ""

    # Iterate over rows and perform checks
    for idx, row in df.iterrows():
        # Only rows with a real gold annotation and a prediction are scored
        if (pd.notna(row['grounded_knowledge']) and pd.notna(row['GK_prediction']) and row['grounded_knowledge'] != "-"):
            grounded_knowledge = row['grounded_knowledge']
            GK_prediction = row['GK_prediction']

            # Syntactic validation
            gk_valid = is_valid_json(grounded_knowledge)
            gk_PRED_valid = is_valid_json(GK_prediction)
            df.at[idx, 'GK_valid'] = gk_valid
            df.at[idx, 'GK_PRED_valid'] = gk_PRED_valid

            # Every further check needs both documents to parse
            if gk_valid and gk_PRED_valid:
                # Get schema and full schema:
                #   full_schema_keys = keys of the conversation's system knowledge
                #   schema_keys      = keys of this row's gold annotation
                conv_key = (row["room_name"], row["topic"])
                #print("conv_key:", conv_key, "index:", idx)
                #print("# SCHEMA")
                system_knowledge = prompt_input_dict[conv_key][0]["system_knowledge"]
                full_schema_keys = get_all_keys(json.loads(system_knowledge))
                schema_keys = get_all_keys(json.loads(grounded_knowledge))
                PRED_keys = get_all_keys(json.loads(GK_prediction))
                #print("full_schema_keys:", len(full_schema_keys), full_schema_keys)
                #print("PRED_keys:", len(PRED_keys), PRED_keys)
                
                # Schema validation against the gold keys.
                # schema_keys is derived from the gold annotation itself, so
                # gk_schema_valid is 1 by construction.
                gk_schema_valid = has_expected_schema(grounded_knowledge, schema_keys)
                gk_PRED_schema_valid = has_expected_schema(GK_prediction, schema_keys)
                gk_PRED_schema_equal = has_equal_schema(GK_prediction, schema_keys)
                df.at[idx, 'GK_schema_valid'] = gk_schema_valid
                df.at[idx, 'GK_PRED_schema_valid'] = gk_PRED_schema_valid
                df.at[idx, 'GK_PRED_schema_equal'] = gk_PRED_schema_equal
                #print("schema_equal:", gk_PRED_schema_equal)
                
                # Full schema validation against the system-knowledge keys;
                # the diff holds the predicted keys that are not in it.
                gk_full_schema_valid = has_expected_schema(grounded_knowledge, full_schema_keys)
                gk_PRED_full_schema_valid = has_expected_schema(GK_prediction, full_schema_keys)
                gk_PRED_full_schema_diff = set_difference(GK_prediction, full_schema_keys, set_type="keys")
                #print("full_schema_diff:", len(gk_PRED_full_schema_diff), gk_PRED_full_schema_diff)
                df.at[idx, 'GK_full_schema_valid'] = gk_full_schema_valid
                df.at[idx, 'GK_PRED_full_schema_valid'] = gk_PRED_full_schema_valid
                df.at[idx, 'GK_PRED_full_schema_diff'] = gk_PRED_full_schema_diff

                #if gk_full_schema_valid and gk_PRED_full_schema_valid:
                    #print("# VALUES")
                    # Get values and all values
                # The value checks run regardless of the schema result (the
                # guard above is disabled); conv_key and system_knowledge are
                # looked up again and hold the same values as before.
                conv_key = (row["room_name"], row["topic"])
                system_knowledge = prompt_input_dict[conv_key][0]["system_knowledge"]
                all_values = get_all_values(json.loads(system_knowledge))
                values = get_all_values(json.loads(grounded_knowledge))
                PRED_values = get_all_values(json.loads(GK_prediction))
                #print("all_values:", len(all_values), all_values)
                #print("values:", len(values), values)
                #print("PRED_values:", len(PRED_values), PRED_values)

                # Values validation against the gold scalars.
                # values is derived from the gold annotation itself, so
                # gk_values_valid is 1 by construction.
                gk_values_valid = has_expected_values(grounded_knowledge, values)
                gk_PRED_values_valid = has_expected_values(GK_prediction, values)
                gk_PRED_values_equal = has_equal_values(GK_prediction, values)
                df.at[idx, 'GK_values_valid'] = gk_values_valid
                df.at[idx, 'GK_PRED_values_valid'] = gk_PRED_values_valid
                df.at[idx, 'GK_PRED_values_equal'] = gk_PRED_values_equal
                #print("values_equal:", gk_PRED_schema_equal)

                # All values validation against the system-knowledge scalars;
                # the diff holds the predicted scalars that are not in it.
                gk_all_values_valid = has_expected_values(grounded_knowledge, all_values)
                gk_PRED_all_values_valid = has_expected_values(GK_prediction, all_values)
                gk_PRED_all_values_diff = set_difference(GK_prediction, all_values, set_type="values")
                #print("all_values_diff:", len(gk_PRED_all_values_diff), gk_PRED_all_values_diff)
                df.at[idx, 'GK_all_values_valid'] = gk_all_values_valid
                df.at[idx, 'GK_PRED_all_values_valid'] = gk_PRED_all_values_valid
                df.at[idx, 'GK_PRED_all_values_diff'] = gk_PRED_all_values_diff
                    
                    
                    #if gk_all_values_valid and gk_PRED_all_values_valid:
                    
                # Content validation: exact equality of the parsed documents
                content_equal = compare_json_ld(grounded_knowledge, GK_prediction)
                df.at[idx, 'content_equal'] = content_equal

                if not content_equal:
                    # Detailed content differences (DeepDiff, list order ignored);
                    # may be None when the documents differ only in order.
                    json_diff = get_json_diff(grounded_knowledge, GK_prediction)
                    df.at[idx, 'json_diff'] = json_diff
                #print("")
        else: continue

    return df
        

def calculate_accuracy_metrics(df, run_name, total_instances=127):
    """
    Sum the binary GK validation columns of one run, print and return them.

    Parameters:
    df (pd.DataFrame): Frame carrying the columns written by validate_json_ld.
        The metric columns that are present are converted to numeric in place
        ("" placeholders become NaN and are ignored by the sums).
    run_name (str): Label printed in the header line.
    total_instances (int): Denominator for the share of invalid predictions.
        Defaults to 127, the value that used to be hard-coded.

    Returns:
    dict: Column name -> sum of that column, plus, for each *_diff column, the
    number of rows holding a non-empty set. A missing column maps to None.
    """
    # List of columns to calculate metrics for
    columns = ['GK_PRED_valid', 'GK_PRED_full_schema_valid', 'GK_PRED_schema_valid', 'GK_PRED_schema_equal', 'GK_PRED_all_values_valid', 'GK_PRED_values_valid', 'GK_PRED_values_equal', 'content_equal']
    diff_columns = ['GK_PRED_full_schema_diff', 'GK_PRED_all_values_diff']

    # Convert only the columns that exist; indexing with a missing name would
    # raise KeyError before the "column not found" report could be printed.
    present = [col for col in columns if col in df.columns]
    if present:
        df[present] = df[present].apply(pd.to_numeric, errors='coerce')

    # Each column holds 0/1 flags, so the sum is the number of positive rows
    metrics = {col: (df[col].sum() if col in df.columns else None) for col in columns}

    # Count the rows whose diff cell is a non-empty set
    for col in diff_columns:
        if col in df.columns:
            metrics[col] = df[col].apply(lambda x: isinstance(x, set) and len(x) > 0).sum()
        else:
            metrics[col] = None

    valid_predictions = metrics['GK_PRED_valid']

    def ratio(numerator, denominator):
        # NaN instead of an exception/inf when a count is missing or zero
        if numerator is None or not denominator:
            return float('nan')
        return round(numerator / denominator, 2)

    if valid_predictions is None or not total_instances:
        invalid_share = float('nan')
    else:
        invalid_share = round(1 - valid_predictions / total_instances, 2)

    # Print formatted metrics (header uses the run_name argument, not a global)
    print("<", run_name, ">")
    for col, total in metrics.items():
        if total is not None:
            print(f"{col}: {round(total,2)}")
        else:
            print(f"{col}: Column not found in DataFrame.")
    print('1 - GK_PRED_valid %',  invalid_share)
    print('GK_PRED_full_schema_diff %',  ratio(metrics['GK_PRED_full_schema_diff'], valid_predictions))
    print('GK_PRED_all_values_diff %',  ratio(metrics['GK_PRED_all_values_diff'], valid_predictions))
    print("")
    return metrics


def extract_json_ld(text):
    """
    Return the JSON part of a raw model answer.

    Parameters:
    text: The raw prediction cell. None and any float (NaN included) count as
        "no prediction"; everything else is converted with str().

    Returns:
    None for a missing prediction; the text itself when it already is valid
    JSON; otherwise the first bracketed `[...]` span that starts and ends on a
    line boundary, or the unchanged text when no such span exists.
    """
    if text is None or isinstance(text, float):
        return None
    candidate = str(text)

    try:
        json.loads(candidate)
    except ValueError:
        # Not valid JSON as a whole: look for an embedded array that begins
        # at a line start and closes at a line end.
        pattern = r'(?:(?<=\n)|(?<=^))\[\s*.*?\s*\](?:(?=\n)|(?=$))'
        found = re.search(pattern, candidate, re.DOTALL)
        return found.group() if found else candidate
    else:
        return candidate

In [None]:
# Evaluate every GK run: load the raw predictions, clean them, validate them
# against the gold annotations, store the scored sheet and print the summary.
for run in run_name_list:
    folder_name = 'gkruns'
    file_path = f"{folder_name}/{run}.xlsx"
    df = pd.read_excel(file_path, header=0)

    # Remove prompt echoes / code fences, then isolate the JSON payload
    preprocess_predictions(df, column="GK_prediction")
    df['GK_prediction'] = df['GK_prediction'].map(extract_json_ld)
    df = validate_json_ld(df, prompt_input_dict)

    df.to_excel(f"evaluation/eval_{run}.xlsx", index=False)

    calculate_accuracy_metrics(df=df, run_name=run)

In [None]:
# Hand-entered tallies for the GK error analysis (which run they belong to is not recorded here).
# Values can be valid while the schema is not, e.g. the model messes up the primary key and includes only a subset of the values; it can also attach values to the wrong entity (e.g. butter with 0 calories).
# Use value_counts to calculate the [0]/[1] counts, restricted to valid predictions (presumably GK_PRED_valid == 1 - TODO confirm).
GK_PRED_values_equal = 17 + 2 # only the order or the number of items may be wrong (one item removed)
GK_PRED_values_valid = 19 + 29 # values do not cover the complete vocabulary, e.g. under-predicted without a category, or no instance of the year/category exists ...
GK_PRED_schema_equal = 35
GK_PRED_schema_valid = 35 + 30 # keys do not cover the complete vocabulary; sometimes the category of an instance is forgotten (e.g. tuna -> seafood)

In [None]:
# Binary equality flags to tabulate for the few-shot Llama 3 8B evaluation sheet
filter_cols = ['GK_PRED_schema_equal', 'GK_PRED_values_equal', 'content_equal']

df_1 = pd.read_excel('evaluation/eval_GK_few_shot_llama3_8b.xlsx', header=0)
df_1 = df_1[filter_cols]

# One row per flag column, with the number of 1s and 0s as columns
# (a count that does not occur is reported as 0).
value_counts_1 = (
    df_1.apply(pd.Series.value_counts)
    .reindex([1, 0])
    .fillna(0)
    .loc[[1, 0]]
    .T
)
value_counts_1

In [None]:
def count_dict_keys(df, column):
    """
    Tally how often each dictionary key appears across the cells of a column.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the column to inspect.
    column (str): The name of the column whose cells are dictionaries or
        string representations of dictionaries.

    Returns:
    dict: Maps every key to the number of cells whose dictionary contains it.
    """
    def as_dict(cell):
        # String cells are parsed as Python literals; unparsable ones are skipped
        if isinstance(cell, str):
            try:
                cell = ast.literal_eval(cell)
            except (ValueError, SyntaxError):
                return None
        # Anything that is not a dictionary (NaN, lists, numbers, ...) is skipped
        return cell if isinstance(cell, dict) else None

    tally = Counter()
    for cell in df[column]:
        parsed = as_dict(cell)
        if parsed is not None:
            tally.update(parsed.keys())

    return dict(tally)

In [None]:
# GK runs to analyse, grouped per model as (zero-shot, few-shot) and
# flattened into one list in that order.
run_name_list = [
    name
    for model_runs in (
        ('GK_zero_shot_llama3_8b', 'GK_few_shot_llama3_8b'),
        ('GK_zero_shot_llama3_70b', 'GK_few_shot_llama3_70b'),
        ('GK_zero_shot_gpt35', 'GK_few_shot_gpt35'),
        ('GK_zero_shot_gpt4o', 'GK_few_shot_gpt4o'),
    )
    for name in model_runs
]

# Diff categories that are reported per run
change_keys = [
    'iterable_item_added',
    'dictionary_item_added',
    'iterable_item_removed',
    'dictionary_item_removed',
    'values_changed',
]

def calculate_error_metrics(run_name):
    """
    Print the error breakdown of one GK run and return its validated frame.

    The run's sheet is read from 'gkruns/<run_name>.xlsx', cleaned and
    validated, and three breakdowns are printed: the diff categories listed in
    `change_keys`, the schema (key) comparison and the value comparison. For
    the last two, "equal" means identical sets, "less" a strict subset of the
    gold set and "more" anything that is not a subset.

    Parameters:
    run_name (str): File stem of the run inside the 'gkruns' folder.

    Returns:
    pd.DataFrame: The frame returned by validate_json_ld for this run.
    """
    def share(part, whole):
        # Runs without any evaluated row / recorded diff report 0.0 instead
        # of raising ZeroDivisionError.
        return round(part / whole, 2) if whole else 0.0

    folder_name = 'gkruns'
    file_path = folder_name + '/' + run_name + '.xlsx'
    df = pd.read_excel(file_path, header=0)
    preprocess_predictions(df, column="GK_prediction")
    df['GK_prediction'] = df['GK_prediction'].map(extract_json_ld)
    df = validate_json_ld(df, prompt_input_dict)

    df_sum_valid_json = len(df[df["GK_PRED_valid"] == 1])

    # Diff categories: the denominator is the total over ALL categories found
    df_changes = count_dict_keys(df, "json_diff")
    df_sum_changes = sum(df_changes.values())
    print("<", run_name, ">")
    for key in change_keys:
        # A category that never occurred in this run is absent from the counts
        count = df_changes.get(key, 0)
        print(key, count, share(count, df_sum_changes))

    # Schema breakdown over the rows that were evaluated (flag is 0 or 1)
    sum_schema = len(df[df["GK_PRED_schema_equal"].isin([0, 1])])
    sum_schema_equal = len(df[df["GK_PRED_schema_equal"] == 1])
    sum_schema_less = len(df[df["GK_PRED_schema_valid"] == 1]) - sum_schema_equal
    sum_schema_more = sum_schema - sum_schema_equal - sum_schema_less
    print("sum_valid_json", df_sum_valid_json)
    print("schema_equal", sum_schema_equal, share(sum_schema_equal, sum_schema))
    print("schema_more", sum_schema_more, share(sum_schema_more, sum_schema))
    print("schema_less", sum_schema_less, share(sum_schema_less, sum_schema))
    print("sum_schema", sum([sum_schema_equal, sum_schema_more, sum_schema_less]))

    # Value breakdown, computed the same way
    sum_values = len(df[df["GK_PRED_values_equal"].isin([0, 1])])
    sum_values_equal = len(df[df["GK_PRED_values_equal"] == 1])
    sum_values_less = len(df[df["GK_PRED_values_valid"] == 1]) - sum_values_equal
    sum_values_more = sum_values - sum_values_equal - sum_values_less
    print("values_equal", sum_values_equal, share(sum_values_equal, sum_values))
    print("values_more", sum_values_more, share(sum_values_more, sum_values))
    print("values_less", sum_values_less, share(sum_values_less, sum_values))
    print("sum_values", sum([sum_values_equal, sum_values_more, sum_values_less]))
    print("\n")
    return df

# Print the error breakdown for every GK run; the returned DataFrames are discarded.
for run_name in run_name_list:
    calculate_error_metrics(run_name)