In [35]:
import pandas as pd 
from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langfuse.callback import CallbackHandler
from dotenv import load_dotenv
import os

In [36]:
# CSV of sampled generations used for the error analysis; loaded into `df` below.
DATAPATH = '../datasets/error_analysis/sampled/auto_gpt3_gpt3_50_50_generations_sampled.csv'
# Model name passed to ChatOpenAI inside `unstructured_analysis` (per-row explanations).
MODEL1 = "gpt-3.5-turbo-0301"
# Model name passed to ChatOpenAI in the categorization cell (one prompt holding all rows).
MODEL2 = "gpt-3.5-turbo-16k-0613"



# Module-level frame that the rest of the notebook reads from and writes to.
df = pd.read_csv(DATAPATH)

In [37]:
# Presumably loads a local .env file into the process environment so the LF_* variables below resolve — TODO confirm.
load_dotenv()
# Langfuse callback handler used by `unstructured_analysis`. The three arguments come from the
# LF_PK, LF_SK and LF_HOST environment variables; os.environ.get yields None for any that are unset.
handler = CallbackHandler(os.environ.get("LF_PK"), os.environ.get("LF_SK"), os.environ.get("LF_HOST"))    

In [38]:


# Follow-up instruction sent as the final human turn of the analysis prompt.
# NOTE(review): `task` contains a typo ("explain your concisely") and is the variant actually
# wired into the prompt below; `task2` is the corrected wording but is not referenced anywhere
# in the visible notebook. Confirm which wording the stored analyses were generated with
# before switching.
task = """Now explain your concisely how you made your decision and explicitly mention the attributes and values that had a high influence on your decision."""
task2 = """Now explain concisely how you made your decision and explicitly mention the attributes and values that had a high influence on your decision."""




# Four-turn chat template: system message, the original question ({user_prompt}),
# the model's earlier answer ({ai_answer}), then the explanation request.
UNSTRUCTURED_ANALYSIS_PROMPT = ChatPromptTemplate.from_messages([
            ("system", "You are a helpful AI."),
            ("human", "{user_prompt}"),
            ("ai", "{ai_answer}"),
            ("human", task),
        ])





In [39]:
# Create the result column (filled with empty strings) the first time the notebook
# runs on a CSV that does not have it yet; an existing column is left untouched.
if "unstructured_analysis" not in df:
    df["unstructured_analysis"] = ""

In [40]:
# Preview the first rows to check the columns, including `unstructured_analysis`.
df.head()

Unnamed: 0.1,Unnamed: 0,traceId,model,startTime,endTime,prompt,completion,eval,predicted_label,label,error,similarity,confidence,unstructured_analysis
0,0,136e98ba-ac14-42f4-8345-103a04fd95e3,gpt-3.5-turbo-0301,2023-10-20T12:39:55.032Z,2023-10-20T12:40:03.460Z,Q: Are the following two products the same?\nP...,"First, we can compare the titles and descripti...",no match,False,True,FN,20%,90%,I made my decision by comparing the titles and...
1,4,136e98ba-ac14-42f4-8345-103a04fd95e3,gpt-3.5-turbo-0301,2023-10-20T12:38:55.776Z,2023-10-20T12:39:08.718Z,Q: Are the following two products the same?\nP...,"First, we can compare the titles of the two pr...",The two products may match.,True,True,NONE,90%,80%,I made my decision based on the similarities b...
2,5,136e98ba-ac14-42f4-8345-103a04fd95e3,gpt-3.5-turbo-0301,2023-10-20T12:38:41.714Z,2023-10-20T12:38:54.408Z,Q: Are the following two products the same?\nP...,"First, we can compare the titles of the two pr...",No match.,False,True,FN,60%,50%,I made my decision based on a comparison of th...
3,6,136e98ba-ac14-42f4-8345-103a04fd95e3,gpt-3.5-turbo-0301,2023-10-20T12:38:26.414Z,2023-10-20T12:38:39.759Z,Q: Are the following two products the same?\nP...,"First, we can compare the titles of the two pr...",no match,False,True,FN,40%,70%,I compared the titles of the two products and ...
4,7,136e98ba-ac14-42f4-8345-103a04fd95e3,gpt-3.5-turbo-0301,2023-10-20T12:38:15.109Z,2023-10-20T12:38:24.862Z,Q: Are the following two products the same?\nP...,"First, we can compare the titles and descripti...",no match,False,True,FN,40%,80%,I compared the titles and descriptions of the ...


In [41]:
def unstructured_analysis(start, end, max_attempts=3):
    """Ask the model to explain its matching decision for rows of ``df``.

    For every row label in ``range(start, end)`` the last ``Q:`` section of the
    stored prompt and the stored completion (with the predicted label appended
    as "Match" / "No Match") are replayed as a conversation, followed by the
    explanation request from ``UNSTRUCTURED_ANALYSIS_PROMPT``. The model's
    answer is written in place to ``df["unstructured_analysis"]``.

    Args:
        start: First row label to process (inclusive).
        end: Row label to stop at (exclusive).
        max_attempts: How many times the chain is called for one row before
            the row is given up on. Values below 1 are treated as 1.

    Returns:
        None. Rows for which every attempt failed keep their previous value
        and are listed on stdout at the end of the run.
    """
    llm = ChatOpenAI(temperature=0, model_name=MODEL1)
    chain = LLMChain(llm=llm, prompt=UNSTRUCTURED_ANALYSIS_PROMPT, callbacks=[handler])

    attempts = max(1, max_attempts)
    failed_rows = []

    for i in range(start, end):
        print(i)
        # Only the final "Q:" section is replayed; anything before it in the stored prompt is dropped.
        user_prompt = "Q:" + df.loc[i, "prompt"].split("Q:")[-1]
        prediction = "Match" if df.loc[i, "predicted_label"] else "No Match"
        ai_answer = df.loc[i, "completion"] + "\n" + prediction

        # Decrementing the loop variable of a `for ... in range(...)` loop does not
        # repeat the iteration, so the retry has to be an explicit inner loop.
        for attempt in range(1, attempts + 1):
            try:
                output = chain.run(user_prompt=user_prompt, ai_answer=ai_answer, callbacks=[handler])
            except Exception as exc:
                # `Exception` (not a bare except) so Ctrl-C can still stop a long run.
                print(f"row {i}: attempt {attempt}/{attempts} failed: {exc!r}")
            else:
                break
        else:
            failed_rows.append(i)
            continue

        print(output)
        # Single .loc assignment: chained `df[col][i] = ...` may write to a temporary copy.
        df.loc[i, "unstructured_analysis"] = output

    if failed_rows:
        print(f"no analysis written for rows: {failed_rows}")
    

In [42]:
# unstructured_analysis(0, len(df))
# df.to_csv(DATAPATH, index=False)

In [43]:
# Single-variable prompt for the fault-type grouping step. `{examples}` is filled with the
# concatenated Task / Decision / Details / Label text built in the categorization cell.
CATEGORIZATION_PROMPT = PromptTemplate(
    input_variables=["examples"],
    template="""In the following I will give you a few entity matching tasks together with a matching decision, details about the decision and the actual label of the task.
Can you please group the wrong decisions into 3-6 fault types and indicate how often each one occurs?
{examples}
""")

In [44]:
# Second Langfuse callback handler, built from the same LF_PK / LF_SK / LF_HOST environment
# variables as `handler`; it is the one attached to the categorization chain.
handler2 = CallbackHandler(os.environ.get("LF_PK"), os.environ.get("LF_SK"), os.environ.get("LF_HOST"))    

In [46]:
# Fault-type categorization: send every row (task, decision, explanation, true label)
# to the model in one prompt and print its grouping of the wrong decisions.
llm = ChatOpenAI(temperature=0, model_name=MODEL2)
chain = LLMChain(llm=llm, prompt=CATEGORIZATION_PROMPT, callbacks=[handler2])

example_parts = []

for i in range(len(df)):
    # Only the final "Q:" section of the stored prompt is shown as the task.
    user_prompt = "Q:" + df.loc[i, "prompt"].split("Q:")[-1]
    prediction = "Match" if df.loc[i, "predicted_label"] else "No Match"

    # Rows without an analysis are stored as "" and come back from read_csv as NaN;
    # concatenating NaN to a str raises TypeError, so map missing values to "".
    details = df.loc[i, "unstructured_analysis"]
    details = "" if pd.isna(details) else str(details)

    example_parts.append(
        "Task: " + user_prompt + "\n"
        + "Decision: " + prediction + "\n"
        + "Details: " + details + "\n"
        + "Label: " + str(df.loc[i, "label"]) + "\n\n"
    )

examples = "".join(example_parts)

output = chain.run(examples=examples, callbacks=[handler2])
print(output)

Fault types:
1. Attribute mismatch: This occurs when the attributes of the two products being compared are different, leading to a wrong decision.
2. Category mismatch: This occurs when the two products belong to different categories, but are mistakenly considered as the same.
3. Missing information: This occurs when one or both of the products have missing or insufficient information, making it difficult to make an accurate decision.

Frequency of fault types:
1. Attribute mismatch: 10 occurrences
2. Category mismatch: 3 occurrences
3. Missing information: 2 occurrences
