# Read Data and Imports


In [167]:
import pandas as pd 
from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langfuse.callback import CallbackHandler
from dotenv import load_dotenv
import os

In [168]:
# Chat models used below: MODEL1 writes the per-row explanations,
# MODEL2 clusters the collected examples into fault categories.
MODEL1 = "gpt-3.5-turbo-0301"
MODEL2 = "gpt-3.5-turbo-16k-0613"

# Sampled error-analysis run to load. Swap in one of the commented paths
# to analyse a different run.
DATAPATH = '../datasets/error_analysis/sampled/auto_gpt3_gpt3_50_50_generations_sampled.csv'
# DATAPATH = '../datasets/error_analysis/sampled/baseline_zero_shot_gpt3_50_50_sampled.csv'
# DATAPATH = '../datasets/error_analysis/sampled/zero_shot_cot_50_50_sampled.csv'

df = pd.read_csv(filepath_or_buffer=DATAPATH)

In [169]:
# load_dotenv() runs first so that the LF_* lookups below can see values
# supplied through a .env file.
load_dotenv()
# Positional arguments, in order: LF_PK, LF_SK, LF_HOST (each None when unset).
handler = CallbackHandler(*map(os.environ.get, ("LF_PK", "LF_SK", "LF_HOST")))

# Get unstructured explanations

In [170]:


# Follow-up instruction appended as the last human turn of the chat below.
task = (
    "Now explain concisely how you made your decision and explicitly mention "
    "the attributes and values that had a high influence on your decision."
)

# Chat transcript template: system message, the original question
# ({user_prompt}), the model's earlier answer ({ai_answer}), then the
# follow-up instruction.
UNSTRUCTURED_ANALYSIS_PROMPT = ChatPromptTemplate.from_messages(
    [
        ("system", "You are a helpful AI."),
        ("human", "{user_prompt}"),
        ("ai", "{ai_answer}"),
        ("human", task),
    ]
)





In [171]:
# Create the explanation column (empty strings) unless the loaded CSV
# already carries one.
if "unstructured_analysis" not in df:
    df["unstructured_analysis"] = ""

In [172]:
# Preview only the first rows instead of rendering the whole frame.
df.head(10)

Unnamed: 0.1,Unnamed: 0,traceId,model,startTime,endTime,prompt,completion,eval,predicted_label,label,error,similarity,confidence,structured_explanation_v2,structured_explanation_v2_gpt4,unstructured_analysis
0,0,136e98ba-ac14-42f4-8345-103a04fd95e3,gpt-3.5-turbo-0301,2023-10-20T12:39:55.032Z,2023-10-20T12:40:03.460Z,Q: Are the following two products the same?\r\...,"First, we can compare the titles and descripti...",no match,False,True,FN,20%,90%,"Based on the information provided, the attribu...","Based on the information provided, the attribu...",I made my decision by comparing the titles and...
1,5,136e98ba-ac14-42f4-8345-103a04fd95e3,gpt-3.5-turbo-0301,2023-10-20T12:38:41.714Z,2023-10-20T12:38:54.408Z,Q: Are the following two products the same?\r\...,"First, we can compare the titles of the two pr...",No match.,False,True,FN,60%,50%,,"[{""attribute"":""brand"",""importance"":""0.7"",""valu...",Based on the comparison of the titles of the t...
2,6,136e98ba-ac14-42f4-8345-103a04fd95e3,gpt-3.5-turbo-0301,2023-10-20T12:38:26.414Z,2023-10-20T12:38:39.759Z,Q: Are the following two products the same?\r\...,"First, we can compare the titles of the two pr...",no match,False,True,FN,40%,70%,,"[{""attribute"":""brand"",""importance"":""0.5"",""valu...",I compared the titles of the two products and ...
3,7,136e98ba-ac14-42f4-8345-103a04fd95e3,gpt-3.5-turbo-0301,2023-10-20T12:38:15.109Z,2023-10-20T12:38:24.862Z,Q: Are the following two products the same?\r\...,"First, we can compare the titles and descripti...",no match,False,True,FN,40%,80%,,"[{""attribute"":""brand"",""importance"":""0.2"",""valu...",I made my decision by comparing the titles and...
4,11,136e98ba-ac14-42f4-8345-103a04fd95e3,gpt-3.5-turbo-0301,2023-10-20T12:37:18.029Z,2023-10-20T12:37:28.523Z,Q: Are the following two products the same?\r\...,"First, we can compare the titles and descripti...",Possible answer: uncertain.,False,True,FN,80%,70%,,"[{""attribute"":""brand"",""importance"":""0.5"",""valu...","Based on the information provided, I determine..."
5,17,136e98ba-ac14-42f4-8345-103a04fd95e3,gpt-3.5-turbo-0301,2023-10-20T12:36:02.653Z,2023-10-20T12:36:14.147Z,Q: Are the following two products the same?\r\...,"First, we can compare the titles and descripti...",No match.,False,True,FN,60%,70%,,"[{""attribute"":""brand"",""importance"":""0.3"",""valu...","Based on the titles and descriptions provided,..."
6,18,136e98ba-ac14-42f4-8345-103a04fd95e3,gpt-3.5-turbo-0301,2023-10-20T12:35:43.712Z,2023-10-20T12:36:00.816Z,Q: Are the following two products the same?\r\...,"First, we can compare the titles and descripti...",No match.,False,True,FN,80%,70%,,"Based on the information provided, the compari...",Based on the comparison of the titles and desc...
7,20,136e98ba-ac14-42f4-8345-103a04fd95e3,gpt-3.5-turbo-0301,2023-10-20T12:35:22.763Z,2023-10-20T12:35:30.727Z,Q: Are the following two products the same?\r\...,"First, we can see that both products are made ...",no match,False,True,FN,50%,80%,,"[{""attribute"":""brand"",""importance"":""0.2"",""valu...",I made my decision by comparing the attributes...
8,21,136e98ba-ac14-42f4-8345-103a04fd95e3,gpt-3.5-turbo-0301,2023-10-20T12:35:08.478Z,2023-10-20T12:35:21.126Z,Q: Are the following two products the same?\r\...,"First, we can see that both products are from ...",Match.,True,True,NONE,90%,80%,,"[{""attribute"":""brand"",""importance"":""0.3"",""valu...",I made my decision by comparing the titles and...
9,22,136e98ba-ac14-42f4-8345-103a04fd95e3,gpt-3.5-turbo-0301,2023-10-20T12:34:52.483Z,2023-10-20T12:35:06.588Z,Q: Are the following two products the same?\r\...,"First, we can compare the titles and descripti...",No match.,False,True,FN,80%,70%,,"Based on the information provided, the attribu...",I made my decision based on a comparison of th...


In [173]:
def unstructured_analysis(start, end, max_retries=3):
    """Ask MODEL1 to explain its earlier matching decision for a range of rows.

    For every row label ``i`` in ``range(start, end)`` of the module-level
    ``df``, the text after the last "Q:" of ``prompt`` is used as the user
    question, and ``completion`` plus a "Match"/"No Match" line derived from
    ``predicted_label`` is used as the model's earlier answer. Both are fed
    into ``UNSTRUCTURED_ANALYSIS_PROMPT`` and the reply is stored in
    ``df["unstructured_analysis"]`` for that row (``df`` is modified in place).

    Args:
        start: First row label to process (inclusive).
        end: Row label to stop at (exclusive).
        max_retries: Number of attempts per row before giving up on it.
            A row whose attempts all fail is skipped and its existing
            ``unstructured_analysis`` value is left unchanged.

    Returns:
        None. Progress, failures and model replies are printed.
    """
    llm = ChatOpenAI(temperature=0, model_name=MODEL1)
    chain = LLMChain(llm=llm, prompt=UNSTRUCTURED_ANALYSIS_PROMPT, callbacks=[handler])

    for i in range(start, end):
        print(i)
        # Keep only the final question of the stored prompt.
        user_prompt = "Q:" + df.at[i, "prompt"].split("Q:")[-1]
        prediction = "Match" if df.at[i, "predicted_label"] else "No Match"
        ai_answer = df.at[i, "completion"] + "\n" + prediction

        # Decrementing the index of a `for ... in range(...)` loop does not
        # repeat the iteration, so retries are done with an explicit inner loop.
        output = None
        for attempt in range(1, max_retries + 1):
            try:
                output = chain.run(user_prompt=user_prompt, ai_answer=ai_answer, callbacks=[handler])
                break
            except Exception as exc:
                # `Exception` (not a bare except) so Ctrl-C still stops the run.
                print(f"Row {i}: attempt {attempt}/{max_retries} failed: {exc!r}")

        if output is None:
            continue

        print(output)
        # Single-step label-based write; chained `df[col][i] = ...` may write
        # to a temporary copy instead of `df`.
        df.at[i, "unstructured_analysis"] = output
    

In [174]:
# Uncomment to (re)generate the explanations for every row and write the frame back to DATAPATH.
# unstructured_analysis(0, len(df))
# df.to_csv(DATAPATH, index=False)

In [175]:
# Variant 1: "details about the decision"; 3-6 fault types with frequencies;
# no mention of correct reference examples.
CATEGORIZATION_PROMPT = PromptTemplate(
    template=(
        "In the following I will give you a few entity matching tasks together with a matching decision, details about the decision and the actual label of the task.\n"
        "Can you please group the wrong decisions into 3-6 fault types and indicate how often each one occurs?\n"
        "{examples}\n"
    ),
    input_variables=["examples"],
)

# Variant 2: exactly 3 fault categories, no frequencies; correct decisions
# are announced as reference only.
CATEGORIZATION_PROMPT2 = PromptTemplate(
    template=(
        "In the following I will give you a few entity matching tasks together with a matching decision, details about the decision and the actual label of the task.\n"
        "Can you please group the wrong decisions into 3 fault categories? \n"
        "There are also some correct decisions in the examples. Please just use them as a reference and don't categorize them.\n"
        "{examples}\n"
    ),
    input_variables=["examples"],
)

# Variant 3: like variant 2, but also asks how often each category occurs.
CATEGORIZATION_PROMPT3 = PromptTemplate(
    template=(
        "In the following I will give you a few entity matching tasks together with a matching decision, details about the decision and the actual label of the task.\n"
        "Can you please group the wrong decisions into 3 fault categories? Please also indicate how often each one occurs.\n"
        "There are also some correct decisions in the examples. Please just use them as a reference and don't categorize them.\n"
        "{examples}\n"
    ),
    input_variables=["examples"],
)

# Variant 4: like variant 3, but the intro says the details are about the
# attributes used for the decision.
CATEGORIZATION_PROMPT4 = PromptTemplate(
    template=(
        "In the following I will give you a few entity matching tasks together with a matching decision, details about attributes used for the decision and the actual label of the task.\n"
        "Can you please group the wrong decisions into 3 fault categories? Please also indicate how often each one occurs.\n"
        "There are also some correct decisions in the examples. Please just use them as a reference and don't categorize them.\n"
        "{examples}\n"
    ),
    input_variables=["examples"],
)

# Variant 5: like variant 4, but allows 3-6 fault categories.
CATEGORIZATION_PROMPT5 = PromptTemplate(
    template=(
        "In the following I will give you a few entity matching tasks together with a matching decision, details about attributes used for the decision and the actual label of the task.\n"
        "Can you please group the wrong decisions into 3-6 fault categories? Please also indicate how often each one occurs.\n"
        "There are also some correct decisions in the examples. Please just use them as a reference and don't categorize them.\n"
        "{examples}\n"
    ),
    input_variables=["examples"],
)

# Prompt picked up by the clustering cells further down.
USED_PROMPT = CATEGORIZATION_PROMPT4

In [176]:
# Separate callback handler for the clustering runs; positional arguments,
# in order: LF_PK, LF_SK, LF_HOST (each None when unset).
handler2 = CallbackHandler(*map(os.environ.get, ("LF_PK", "LF_SK", "LF_HOST")))

In [177]:
# Group the rows by error type and renumber them 0..n-1; the clustering cells
# use the new positions as "Task" numbers. A stable sort keeps the original
# row order inside each error group, so the numbering is reproducible across
# runs (the default sort algorithm gives no such guarantee for ties).
df = df.sort_values(by=["error"], kind="stable", ignore_index=True)

# Unstructured Clustering

In [178]:
# Cluster the wrong decisions into fault categories with MODEL2, using the
# free-text explanations in `unstructured_analysis` as the "Details" of each task.
llm = ChatOpenAI(temperature=0, model_name=MODEL2)
chain = LLMChain(llm=llm, prompt=USED_PROMPT, callbacks=[handler2])

# NOTE(review): "Decicions" is a typo for "Decisions"; the literal is left as-is.
examples = "Wrong Decicions:\n\n"
flag = True  # stays True until the "Correct Decisions" header has been written
for i in range(len(df)):
    # Assumes df is sorted by "error" with the "NONE" (correct) rows last, so
    # the header is written once, right before the first correct row.
    if flag and df.at[i, "error"] == "NONE":
        examples += "Correct Decisions for Reference:\n\n"
        flag = False

    # Keep only the final question of the stored prompt.
    user_prompt = "Q:" + df.at[i, "prompt"].split("Q:")[-1]
    prediction = "Match" if df.at[i, "predicted_label"] else "No Match"
    label = "Match" if df.at[i, "label"] else "No Match"

    # Empty CSV cells are read back as NaN (a float); render them as an empty
    # string instead of failing on str + float.
    details = df.at[i, "unstructured_analysis"]
    details = "" if pd.isna(details) else str(details)

    examples += "Task " + str(i) + ":\n" + user_prompt + "\n"
    # Terminate the reasoning with a newline so "Decision:" starts on its own line.
    examples += "Reasoning: " + df.at[i, "completion"].rstrip("\n") + "\n"
    examples += "Decision: " + prediction + "\n"
    examples += "Details: " + details + "\n"
    examples += "Actual Label: " + label + "\n\n"

output = chain.run(examples=examples, callbacks=[handler2])
print(output)

Fault Categories:

1. Attribute Mismatch: This category includes cases where the wrong decision is made because the attributes used for the decision do not match or are not considered correctly. This can include cases where the wrong attributes are compared, or where the attributes are not given enough weight in the decision-making process. (Occurrences: 3, Task 2, Task 4, Task 6)

2. Lack of Information: This category includes cases where the wrong decision is made due to a lack of information or missing details. This can include cases where one product has a description while the other does not, leading to an incomplete comparison. (Occurrences: 1, Task 0)

3. Assumptions and Uncertainty: This category includes cases where the wrong decision is made due to assumptions or uncertainty about the products. This can include cases where the decision-maker assumes that two products are similar or related without enough evidence, or where the decision-maker acknowledges the possibility of th

# Structured Clustering

In [179]:
# handler3 = CallbackHandler(os.environ.get("LF_PK"), os.environ.get("LF_SK"), os.environ.get("LF_HOST"))    

In [180]:
# Cluster the wrong decisions into fault categories with MODEL2, using the
# explanations in `structured_explanation_v2_gpt4` as the "Details" of each task.
llm = ChatOpenAI(temperature=0, model_name=MODEL2)
chain = LLMChain(llm=llm, prompt=USED_PROMPT, callbacks=[handler2])

# NOTE(review): "Decicions" is a typo for "Decisions"; the literal is left as-is.
examples = "Wrong Decicions:\n\n"
flag = True  # stays True until the "Correct Decisions" header has been written
for i in range(len(df)):
    # Assumes df is sorted by "error" with the "NONE" (correct) rows last, so
    # the header is written once, right before the first correct row.
    if flag and df.at[i, "error"] == "NONE":
        examples += "Correct Decisions for Reference:\n\n"
        flag = False

    # Keep only the final question of the stored prompt.
    user_prompt = "Q:" + df.at[i, "prompt"].split("Q:")[-1]
    prediction = "Match" if df.at[i, "predicted_label"] else "No Match"
    label = "Match" if df.at[i, "label"] else "No Match"

    # Empty CSV cells are read back as NaN (a float); render them as an empty
    # string instead of failing on str + float.
    details = df.at[i, "structured_explanation_v2_gpt4"]
    details = "" if pd.isna(details) else str(details)

    examples += "Task " + str(i) + ":\n" + user_prompt + "\n"
    # Terminate the reasoning with a newline so "Decision:" starts on its own line.
    examples += "Reasoning: " + df.at[i, "completion"].rstrip("\n") + "\n"
    examples += "Decision: " + prediction + "\n"
    examples += "Details: " + details + "\n"
    examples += "Actual Label: " + label + "\n\n"

output = chain.run(examples=examples, callbacks=[handler2])
print(output)

Fault Categories:

1. Attribute Mismatch: This category includes cases where the wrong decision is made because the attributes used for the decision do not match or are not considered important. This can occur when important attributes are missing or when less important attributes are given too much importance. (Occurrences: 3, Task 2, Task 6, Task 9)

2. Insufficient Information: This category includes cases where the wrong decision is made due to insufficient information or missing details. This can occur when important attributes or descriptions are missing, making it difficult to accurately compare the products. (Occurrences: 2, Task 4, Task 15)

3. Incorrect Reasoning: This category includes cases where the wrong decision is made due to incorrect reasoning or interpretation of the attributes and descriptions. This can occur when the reasoning process does not accurately consider the similarities or differences between the products. (Occurrences: 2, Task 0, Task 3)

Please note tha