# Read Data and Imports


In [29]:
import pandas as pd 
from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langfuse.callback import CallbackHandler
from dotenv import load_dotenv
import os

In [30]:
# Input CSV of sampled predictions to analyse. Exactly one DATAPATH assignment is
# active; the commented-out lines point at other sampled runs in the same directory.
# DATAPATH = '../datasets/error_analysis/sampled/auto_gpt3_gpt3_50_50_generations_sampled.csv'
DATAPATH = '../datasets/error_analysis/sampled/baseline_zero_shot_gpt3_50_50_sampled.csv'
# DATAPATH = '../datasets/error_analysis/sampled/zero_shot_cot_50_50_sampled.csv'
# MODEL1 is the model used by unstructured_analysis() to generate explanations.
MODEL1 = "gpt-3.5-turbo-0301"
# MODEL2 is the model used by the two clustering cells further down.
MODEL2 = "gpt-3.5-turbo-16k-0613"



# Loaded once here; later cells add a column to this frame and re-sort it in place.
df = pd.read_csv(DATAPATH)

In [31]:
# Pull variables from a local .env file into the process environment.
load_dotenv()
# Langfuse callback attached to the explanation-generation chain in unstructured_analysis().
# LF_PK / LF_SK / LF_HOST are read from the environment (presumably public key, secret key
# and host URL -- confirm against the Langfuse setup); os.environ.get yields None if unset.
handler = CallbackHandler(os.environ.get("LF_PK"), os.environ.get("LF_SK"), os.environ.get("LF_HOST"))

# Get unstructured explanations

In [32]:


# Follow-up instruction sent as the last human turn: asks the model to justify the answer
# it gave earlier and to name the attributes/values that influenced it.
task = """Now explain concisely how you made your decision and explicitly mention the attributes and values that had a high influence on your decision."""




# Four-turn chat prompt: a fixed system message, the original question ({user_prompt}),
# the model's earlier answer replayed as an AI turn ({ai_answer}), then `task`.
# `task` is itself parsed as a template, so it must not contain literal curly braces.
UNSTRUCTURED_ANALYSIS_PROMPT = ChatPromptTemplate.from_messages([
            ("system", "You are a helpful AI."),
            ("human", "{user_prompt}"),
            ("ai", "{ai_answer}"),
            ("human", task),
        ])





In [33]:
# Make sure the output column exists before unstructured_analysis() writes into it.
# A column that is already present (e.g. loaded from the CSV) is left untouched.
if "unstructured_analysis" not in df.columns:
    df["unstructured_analysis"] = ""

In [34]:
# Display the loaded frame to check which columns are present.
df

Unnamed: 0.1,Unnamed: 0,traceId,model,startTime,endTime,prompt,completion,predicted_label,label,error,similarity,confidence,whole_output,structured_explanation,structured_explanation_v2,structured_explanation_v2_gpt4,unstructured_analysis
0,0,19817c50-cbc5-48b5-bf93-e6c45825d48b,gpt-3.5-turbo-0301,2023-10-22T13:05:58.793Z,2023-10-22T13:05:59.340Z,Are the following two products the same?\r\nPr...,False.,False,True,FN,30%,50%,,"\r\n\r\n[{""attribute"":""brand"",""importance"":""0....","\r\n\r\n[{""attribute"":""brand"",""importance"":""0....","[{""attribute"":""title"",""importance"":""0.50"",""val...",I made my decision based on the fact that the ...
1,7,19817c50-cbc5-48b5-bf93-e6c45825d48b,gpt-3.5-turbo-0301,2023-10-22T13:05:55.711Z,2023-10-22T13:05:56.147Z,Are the following two products the same?\r\nPr...,False.,False,True,FN,85%,80%,,"\r\n\r\n[{""attribute"":""brand"",""importance"":""0....","\r\n\r\n[{""attribute"":""brand"",""importance"":""0....","[{""attribute"":""brand"",""importance"":""0.05"",""val...",I made my decision based on the fact that the ...
2,8,19817c50-cbc5-48b5-bf93-e6c45825d48b,gpt-3.5-turbo-0301,2023-10-22T13:05:55.246Z,2023-10-22T13:05:55.698Z,Are the following two products the same?\r\nPr...,False.,False,True,FN,90%,80%,,"\r\n\r\n[{""attribute"":""brand"",""importance"":""0....","\r\n\r\n[{""attribute"":""brand"",""importance"":""0....","[{""attribute"":""brand"",""importance"":""0.2"",""valu...",I made my decision based on the fact that the ...
3,11,19817c50-cbc5-48b5-bf93-e6c45825d48b,gpt-3.5-turbo-0301,2023-10-22T13:05:53.960Z,2023-10-22T13:05:54.432Z,Are the following two products the same?\r\nPr...,False.,False,True,FN,80%,70%,,"\r\n\r\n[{""attribute"":""brand"",""importance"":""0....","\r\n\r\n[{""attribute"":""brand"",""importance"":""0....","[{""attribute"":""brand"",""importance"":""0.1"",""valu...",I made my decision based on the fact that the ...
4,18,19817c50-cbc5-48b5-bf93-e6c45825d48b,gpt-3.5-turbo-0301,2023-10-22T13:05:50.471Z,2023-10-22T13:05:50.960Z,Are the following two products the same?\r\nPr...,False.,False,True,FN,50%,70%,,"\r\n\r\n[{""attribute"":""brand"",""importance"":""0....","\r\n\r\n[{""attribute"":""brand"",""importance"":""0....","[{""attribute"":""brand"",""importance"":""0.7"",""valu...",I made my decision based on the fact that the ...
5,21,19817c50-cbc5-48b5-bf93-e6c45825d48b,gpt-3.5-turbo-0301,2023-10-22T13:05:49.049Z,2023-10-22T13:05:49.548Z,Are the following two products the same?\r\nPr...,True.,True,True,NONE,100%,95%,,"\r\n\r\n[{""attribute"":""brand"",""importance"":""0....","\r\n\r\n[{""attribute"":""brand"",""importance"":""0....","[{""attribute"":""brand"",""importance"":""0.5"",""valu...",I made my decision based on the titles of the ...
6,26,19817c50-cbc5-48b5-bf93-e6c45825d48b,gpt-3.5-turbo-0301,2023-10-22T13:05:46.703Z,2023-10-22T13:05:47.255Z,Are the following two products the same?\r\nPr...,False.,False,False,NONE,0%,40%,,"\r\n\r\n[{""attribute"":""brand"",""importance"":""0....","\r\n\r\n[{""attribute"":""brand"",""importance"":""0....","[{""attribute"":""brand"",""importance"":""0.7"",""valu...",I made my decision based on the fact that the ...
7,53,19817c50-cbc5-48b5-bf93-e6c45825d48b,gpt-3.5-turbo-0301,2023-10-22T13:05:31.548Z,2023-10-22T13:05:32.049Z,Are the following two products the same?\r\nPr...,False.,False,False,NONE,30%,50%,,"\r\n\r\n[{""attribute"":""brand"",""importance"":""0....","\r\n\r\n[{""attribute"":""brand"",""importance"":""0....","[{""attribute"":""brand"",""importance"":""0.3"",""valu...",I made my decision based on the fact that the ...
8,55,19817c50-cbc5-48b5-bf93-e6c45825d48b,gpt-3.5-turbo-0301,2023-10-22T13:05:30.592Z,2023-10-22T13:05:31.075Z,Are the following two products the same?\r\nPr...,False.,False,False,NONE,0%,50%,,"\r\n\r\n[{""attribute"":""brand"",""importance"":""0....","\r\n\r\n[{""attribute"":""brand"",""importance"":""0....","[{""attribute"":""brand"",""importance"":""1"",""values...",I made my decision based on the fact that the ...
9,105,19817c50-cbc5-48b5-bf93-e6c45825d48b,gpt-3.5-turbo-0301,2023-10-22T13:04:58.735Z,2023-10-22T13:04:59.743Z,Are the following two products the same?\r\nPr...,True,True,False,FP,90%,80%,,"\r\n\r\n[{""attribute"":""brand"",""importance"":""0....","\r\n\r\n[{""attribute"":""brand"",""importance"":""0....","[{""attribute"":""brand"",""importance"":""0.5"",""valu...",I made my decision based on the fact that both...


In [35]:
def unstructured_analysis(start, end, max_retries=3):
    """Ask MODEL1 to explain its earlier matching decision for a range of rows.

    For every row label ``i`` in ``range(start, end)`` the original question (the part
    of ``df["prompt"]`` from the last ``"Q:"`` onwards) and the model's earlier answer
    (``df["completion"]`` followed by a "Match"/"No Match" line derived from
    ``df["predicted_label"]``) are replayed through ``UNSTRUCTURED_ANALYSIS_PROMPT``.
    The reply is stored in ``df["unstructured_analysis"]``.

    Args:
        start: First row label to process (inclusive).
        end: Row label to stop at (exclusive).
        max_retries: Maximum number of attempts per row. A row whose attempts all
            fail is reported and skipped; its existing cell value is left untouched.

    Returns:
        None. The module-level ``df`` is modified in place.
    """
    llm = ChatOpenAI(temperature=0, model_name=MODEL1)
    chain = LLMChain(llm=llm, prompt=UNSTRUCTURED_ANALYSIS_PROMPT, callbacks=[handler])

    for i in range(start, end):
        print(i)
        # Keep only the final question; anything before the last "Q:" is dropped.
        user_prompt = "Q:" + df.at[i, "prompt"].split("Q:")[-1]
        prediction = "Match" if df.at[i, "predicted_label"] else "No Match"
        ai_answer = df.at[i, "completion"] + "\n" + prediction

        # Explicit bounded retry: rebinding the loop variable of a `for ... in range`
        # loop does not repeat the iteration, so the retry has to be its own loop.
        output = None
        for attempt in range(1, max_retries + 1):
            try:
                output = chain.run(user_prompt=user_prompt, ai_answer=ai_answer, callbacks=[handler])
                break
            except Exception as exc:  # not a bare except: Ctrl-C must still interrupt the run
                print(f"row {i}: attempt {attempt}/{max_retries} failed: {exc!r}")

        if output is None:
            # Best effort: leave this row's cell as it was and move on.
            continue

        print(output)
        # Single-step label-based write; chained `df[col][i] = ...` may hit a temporary copy.
        df.at[i, "unstructured_analysis"] = output
    

In [36]:
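# Uncomment to regenerate the explanations for every row (one model call per row)
# and write the updated frame back to DATAPATH.
# unstructured_analysis(0, len(df))
# df.to_csv(DATAPATH, index=False)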

In [37]:
# Prompt variants for clustering wrong decisions. Each takes one variable, {examples},
# which holds the formatted tasks (question, decision, details, actual label).

# Variant 1: asks for 3-6 fault types with frequencies; does not mention that correct
# decisions may be included among the examples.
CATEGORIZATION_PROMPT = PromptTemplate(
    input_variables=["examples"],
    template="""In the following I will give you a few entity matching tasks together with a matching decision, details about the decision and the actual label of the task.
Can you please group the wrong decisions into 3-6 fault types and indicate how often each one occurs?
{examples}
""")

# Variant 2: exactly 3 fault categories, no frequencies; tells the model that the correct
# decisions are reference material only.
CATEGORIZATION_PROMPT2 = PromptTemplate(
    input_variables=["examples"],
    template="""In the following I will give you a few entity matching tasks together with a matching decision, details about the decision and the actual label of the task.
Can you please group the wrong decisions into 3 fault categories? 
There are also some correct decisions in the examples. Please just use them as a reference and don't categorize them.
{examples}
""")

# Variant 3: variant 2 plus a request for how often each category occurs.
CATEGORIZATION_PROMPT3 = PromptTemplate(
    input_variables=["examples"],
    template="""In the following I will give you a few entity matching tasks together with a matching decision, details about the decision and the actual label of the task.
Can you please group the wrong decisions into 3 fault categories? Please also indicate how often each one occurs.
There are also some correct decisions in the examples. Please just use them as a reference and don't categorize them.
{examples}
""")

# Variant 4: variant 3 with the intro reworded to "details about attributes used for the
# decision".
CATEGORIZATION_PROMPT4 = PromptTemplate(
    input_variables=["examples"],
    template="""In the following I will give you a few entity matching tasks together with a matching decision, details about attributes used for the decision and the actual label of the task.
Can you please group the wrong decisions into 3 fault categories? Please also indicate how often each one occurs.
There are also some correct decisions in the examples. Please just use them as a reference and don't categorize them.
{examples}
""")

# Variant 5: variant 4 but asking for 3-6 categories instead of exactly 3.
CATEGORIZATION_PROMPT5 = PromptTemplate(
    input_variables=["examples"],
    template="""In the following I will give you a few entity matching tasks together with a matching decision, details about attributes used for the decision and the actual label of the task.
Can you please group the wrong decisions into 3-6 fault categories? Please also indicate how often each one occurs.
There are also some correct decisions in the examples. Please just use them as a reference and don't categorize them.
{examples}
""")

# The variant that both clustering cells below pass to LLMChain.
USED_PROMPT = CATEGORIZATION_PROMPT4

In [38]:
# Second Langfuse callback, built from the same environment variables as `handler`;
# it is the one attached to the clustering chains below.
handler2 = CallbackHandler(os.environ.get("LF_PK"), os.environ.get("LF_SK"), os.environ.get("LF_HOST"))

In [39]:
# Order rows by error type and renumber the index 0..n-1 (ignore_index=True).
# The clustering cells depend on both: they look rows up by position-like labels, number
# the tasks from the index, and expect every "NONE" (correct) row to come after the
# misclassified ones. The error values visible in the displayed rows are FN, FP and NONE,
# which sort in that order.
df.sort_values(by=["error"], inplace=True, ignore_index=True)

# Unstructured Clustering

In [40]:
# Ask MODEL2 to group the wrong decisions into fault categories, using the free-text
# explanations in df["unstructured_analysis"] as the "Details" of each task.
llm = ChatOpenAI(temperature=0, model_name=MODEL2)
chain = LLMChain(llm=llm, prompt=USED_PROMPT, callbacks=[handler2])

# Relies on df having been sorted by "error" with a fresh 0..n-1 index, so that the
# misclassified rows come first and the first "NONE" row marks the start of the
# reference section.
# NOTE(review): "Decicions" is misspelled in the text sent to the model; it is left
# unchanged here so the prompt stays the same -- correct it deliberately if re-running.
example_parts = ["Wrong Decicions:\n\n"]
reference_header_pending = True
for i in range(len(df)):
    if reference_header_pending and df.at[i, "error"] == "NONE":
        example_parts.append("Correct Decisions for Reference:\n\n")
        reference_header_pending = False

    # Keep only the final question; anything before the last "Q:" is dropped.
    user_prompt = "Q:" + df.at[i, "prompt"].split("Q:")[-1]
    prediction = "Match" if df.at[i, "predicted_label"] else "No Match"
    label = "Match" if df.at[i, "label"] else "No Match"

    # An empty explanation comes back from a CSV round-trip as NaN (a float), which
    # cannot be concatenated to a string; treat it as an empty explanation instead.
    details = df.at[i, "unstructured_analysis"]
    if pd.isna(details):
        details = ""

    example_parts.append(
        f"Task {i}:\n{user_prompt}\n"
        f"Decision: {prediction}\n"
        f"Details: {details}\n"
        f"Actual Label: {label}\n\n"
    )
    # Optional extra line exposing the error type to the model:
    # example_parts.append("Fault type: " + df.at[i, "error"] + "\n\n")

examples = "".join(example_parts)

# print(examples)
output = chain.run(examples=examples, callbacks=[handler2])
print(output)

Fault Categories:

1. Title-based Decision:
- Task 0: The decision was based solely on the difference in titles, without considering other attributes. (1 occurrence)
- Task 2: The decision was based solely on the difference in titles, without considering other attributes. (1 occurrence)
- Task 4: The decision was based solely on the difference in titles, without considering other attributes. (1 occurrence)
- Task 6: The decision was based solely on the similarity in titles, without considering other attributes. (1 occurrence)
- Task 7: The decision was based solely on the similarity in titles, without considering other attributes. (1 occurrence)
- Task 12: The decision was based solely on the difference in titles, without considering other attributes. (1 occurrence)
- Task 14: The decision was based solely on the difference in titles, without considering other attributes. (1 occurrence)
- Task 15: The decision was based solely on the similarity in titles, without considering other attr

# Structured Clustering

In [41]:
# handler3 = CallbackHandler(os.environ.get("LF_PK"), os.environ.get("LF_SK"), os.environ.get("LF_HOST"))    

In [42]:
# Same clustering request as the unstructured run, but the "Details" of each task come
# from the structured attribute explanations in df["structured_explanation_v2_gpt4"].
llm = ChatOpenAI(temperature=0, model_name=MODEL2)
chain = LLMChain(llm=llm, prompt=USED_PROMPT, callbacks=[handler2])

# Relies on df having been sorted by "error" with a fresh 0..n-1 index, so that the
# misclassified rows come first and the first "NONE" row marks the start of the
# reference section.
# NOTE(review): "Decicions" is misspelled in the text sent to the model; it is left
# unchanged here so the prompt stays the same -- correct it deliberately if re-running.
example_parts = ["Wrong Decicions:\n\n"]
reference_header_pending = True
for i in range(len(df)):
    if reference_header_pending and df.at[i, "error"] == "NONE":
        example_parts.append("Correct Decisions for Reference:\n\n")
        reference_header_pending = False

    # Keep only the final question; anything before the last "Q:" is dropped.
    user_prompt = "Q:" + df.at[i, "prompt"].split("Q:")[-1]
    prediction = "Match" if df.at[i, "predicted_label"] else "No Match"
    label = "Match" if df.at[i, "label"] else "No Match"

    # A missing explanation is read from CSV as NaN (a float), which cannot be
    # concatenated to a string; treat it as an empty explanation instead.
    details = df.at[i, "structured_explanation_v2_gpt4"]
    if pd.isna(details):
        details = ""

    example_parts.append(
        f"Task {i}:\n{user_prompt}\n"
        f"Decision: {prediction}\n"
        f"Details: {details}\n"
        f"Actual Label: {label}\n\n"
    )
    # Optional extra line exposing the error type to the model:
    # example_parts.append("Fault type: " + df.at[i, "error"] + "\n\n")

examples = "".join(example_parts)

# print(examples)
output = chain.run(examples=examples, callbacks=[handler2])
print(output)

Fault Categories:

1. Attribute Mismatch: This category includes cases where the decision is wrong because the attributes used for the decision do not match between the two products. This can occur when important attributes are missing or when the values of the attributes are different. 

Examples: Task 0, Task 1, Task 2, Task 3, Task 4, Task 5, Task 6, Task 7, Task 8, Task 9

2. Importance Misjudgment: This category includes cases where the decision is wrong because the importance assigned to certain attributes is incorrect. This can occur when important attributes are given low importance or when unimportant attributes are given high importance. 

Examples: Task 1, Task 5, Task 8

3. Incorrect Labeling: This category includes cases where the decision is wrong because the actual label of the task is different from the decision made. This can occur when the decision is made in the opposite direction of the actual label. 

Examples: Task 0, Task 2, Task 3, Task 4, Task 6, Task 7, Task 9