In [1]:
import pandas as pd
from langchain.chat_models import ChatOpenAI
from dotenv import load_dotenv
import os
load_dotenv()
from langchain.chains import LLMChain
import json
from langchain.prompts import (
    ChatPromptTemplate,
)



In [2]:
from langfuse.callback import CallbackHandler

# Langfuse callback handler used by the chains in this notebook. Its three
# positional arguments are read, in this order, from the LF_PK, LF_SK and
# LF_HOST environment variables (each is None when the variable is unset).
handler = CallbackHandler(*(os.getenv(var) for var in ("LF_PK", "LF_SK", "LF_HOST")))

In [3]:
# Path (relative to this notebook) of the CSV file that is loaded into `data`.
GENERATIONS_PATH = "../datasets/manual_gpt3_50_50_generations_merged.csv"

In [4]:
# Load the stored generations into a DataFrame.
data = pd.read_csv(GENERATIONS_PATH)

# Append whichever of the result columns the CSV does not contain yet,
# initialised to empty strings; columns that already exist are left untouched.
data = data.assign(
    **{
        column: ""
        for column in ("decision", "similarity", "confidence")
        if column not in data.columns
    }
)

data

Unnamed: 0.1,Unnamed: 0,traceId,model,startTime,endTime,prompt,completion,eval,predicted_label,label,error,decision,similarity,confidence
0,0,c58f0c2d-d624-44b6-a73e-900e8ac46026,gpt-3.5-turbo-0301,2023-10-17T10:25:22.070Z,2023-10-17T10:25:25.422Z,Q: Could you please tell me if the following t...,These two products are not the same because th...,No match.,False,True,FN,No Match,0%,100%
1,1,c58f0c2d-d624-44b6-a73e-900e8ac46026,gpt-3.5-turbo-0301,2023-10-17T10:25:12.823Z,2023-10-17T10:25:17.121Z,Q: Could you please tell me if the following t...,"Yes, these two products are the same one. They...",Match.,True,True,NONE,Match,95%,90%
2,2,c58f0c2d-d624-44b6-a73e-900e8ac46026,gpt-3.5-turbo-0301,2023-10-17T10:25:04.230Z,2023-10-17T10:25:07.905Z,Q: Could you please tell me if the following t...,"Yes, these two products are the same. They are...",Match.,True,True,NONE,Match,100%,95%
3,3,c58f0c2d-d624-44b6-a73e-900e8ac46026,gpt-3.5-turbo-0301,2023-10-17T10:24:56.337Z,2023-10-17T10:24:59.610Z,Q: Could you please tell me if the following t...,These two products are likely the same because...,Match.,True,True,NONE,Match,90%,80%
4,4,c58f0c2d-d624-44b6-a73e-900e8ac46026,gpt-3.5-turbo-0301,2023-10-17T10:24:48.374Z,2023-10-17T10:24:51.714Z,Q: Could you please tell me if the following t...,"Yes, these two products are the same one. They...",Match.,True,True,NONE,Match,100%,95%
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
216,216,c58f0c2d-d624-44b6-a73e-900e8ac46026,gpt-3.5-turbo-0301,2023-10-17T09:51:58.234Z,2023-10-17T09:52:02.222Z,Q: Could you please tell me if the following t...,These two products are not the same one. The f...,No match.,False,False,NONE,No Match,0%,100%
217,217,c58f0c2d-d624-44b6-a73e-900e8ac46026,gpt-3.5-turbo-0301,2023-10-17T09:51:46.656Z,2023-10-17T09:51:54.143Z,Q: Could you please tell me if the following t...,These two products are not the same because th...,No match.,False,False,NONE,No Match,0%,100%
218,218,c58f0c2d-d624-44b6-a73e-900e8ac46026,gpt-3.5-turbo-0301,2023-10-17T09:51:37.814Z,2023-10-17T09:51:42.110Z,Q: Could you please tell me if the following t...,These products are not the same one as they ar...,No match.,False,False,NONE,No Match,0%,100%
219,219,c58f0c2d-d624-44b6-a73e-900e8ac46026,gpt-3.5-turbo-0301,2023-10-17T09:51:27.877Z,2023-10-17T09:51:33.410Z,Q: Could you please tell me if the following t...,These two products are not the same because th...,No match.,False,False,NONE,No Match,0%,100%


In [5]:


# Follow-up instruction sent after the model's original answer. It asks for a
# decision plus similarity and confidence percentages in a fixed three-line
# format. The text is spelled out line by line so the trailing spaces and the
# final newline of the prompt are visible.
task = (
    "Now output your decision as a simple 'Match' or 'No Match'. \n"
    "Also provide a similarity score for the items and a confidence score for your decision as percentage values, 100% referring to perfect similarity or full confidence. \n"
    "Please provide your answer exactly in the following format which includes an example output:\n"
    "Decision: Match\n"
    "Similarity: 75%\n"
    "Confidence: 60%\n"
)

# Four-message chat template: system message, the user question
# ({user_prompt}), the model's earlier answer ({ai_answer}), then the
# follow-up instruction above.
STEP_CONFIDENCE_PROMPT = ChatPromptTemplate.from_messages(
    [
        ("system", "You are a helpful AI."),
        ("human", "{user_prompt}"),
        ("ai", "{ai_answer}"),
        ("human", task),
    ]
)





In [6]:
def _parse_confidence_output(output):
    """Extract (decision, similarity, confidence) from the model's reply.

    The reply is expected to hold one ``Name: value`` pair per line. Lines are
    matched by field name (case-insensitive) instead of by position, so blank
    lines or extra text around the three fields do not break parsing. The first
    occurrence of a field wins; a field that is absent is returned as "".
    """
    fields = {}
    for line in output.splitlines():
        name, separator, value = line.partition(":")
        if separator:
            fields.setdefault(name.strip().lower(), value.strip())
    return tuple(fields.get(key, "") for key in ("decision", "similarity", "confidence"))


def get_confidence_score(start, end, max_retries=2):
    """Ask the model for a decision, similarity and confidence for rows of `data`.

    For every row label ``i`` in ``range(start, end)`` the last "Q:" section of
    ``data["prompt"]`` and the stored ``data["completion"]`` are sent through
    STEP_CONFIDENCE_PROMPT, and the parsed reply is written to the
    "decision", "similarity" and "confidence" columns of that row.

    Uses the module-level `data`, `handler` and `STEP_CONFIDENCE_PROMPT`, and
    modifies `data` in place.

    Args:
        start: first row label to process (inclusive).
        end: row label to stop at (exclusive).
        max_retries: how many additional attempts are made for a row whose
            request raised an exception before the row is skipped.
    """
    llm = ChatOpenAI(model_name="gpt-3.5-turbo-0301", temperature=0, max_tokens=150)
    chain = LLMChain(llm=llm, prompt=STEP_CONFIDENCE_PROMPT, callbacks=[handler])

    # A column that was saved empty is read back by read_csv as float NaN; cast
    # to object so string results can be stored without a dtype conflict.
    for column in ("decision", "similarity", "confidence"):
        data[column] = data[column].astype(object)

    for i in range(start, end):
        print(i)
        whole_prompt = data.at[i, "prompt"]
        # Keep only the last question of a possibly multi-part prompt.
        user_prompt = "Q:" + whole_prompt.split("Q:")[-1]
        ai_answer = data.at[i, "completion"]

        # Decrementing the loop variable of a `for ... in range` loop has no
        # effect, so retries need their own bounded loop.
        output = None
        for attempt in range(max_retries + 1):
            try:
                output = chain.run(user_prompt=user_prompt, ai_answer=ai_answer, callbacks=[handler])
                break
            except Exception as exc:  # not a bare except: Ctrl-C must still stop the run
                print(f"row {i}: request failed (attempt {attempt + 1}/{max_retries + 1}): {exc!r}")
        if output is None:
            # All attempts failed; leave the row unchanged and move on.
            continue

        print(STEP_CONFIDENCE_PROMPT.format(user_prompt=user_prompt, ai_answer=ai_answer))
        print("----")
        print(data.at[i, "label"])
        print(output)
        print("-----------")

        decision, similarity, confidence = _parse_confidence_output(output)
        if not (decision and similarity and confidence):
            print(f"row {i}: reply did not contain all three expected fields")

        # Single-step .at assignment; chained `data[col][i] = ...` may write to
        # a temporary copy instead of the frame.
        data.at[i, "decision"] = decision
        data.at[i, "similarity"] = similarity
        data.at[i, "confidence"] = confidence



In [7]:
# get_confidence_score(40, len(data))
# data.to_csv(GENERATIONS_PATH, index=False)


In [8]:
# Follow-up instruction asking the model for a structured, per-attribute
# explanation. The example on the last line must itself be valid JSON, because
# the model is told to match it exactly (every key/value pair is separated by
# a comma).
task_structured_explanation = """Explain your decision in a structured format, listing the attributes that you compared for reaching your decision. Each attribute should be accompanied by the attribute values and a score between 0 and 1 that shows the importance of the attribute for the decision. If an attribute only occurs in one item, specify the value of that attribute for the other item as "missing". The attributes should not be just the Title or Description but more specific attributes like Brand, Model, Color, etc. 
The format should exactly match the following example output: 
[{"attribute":"brand","importance":"0.05","values":["Logitech","Logitech"]},{"attribute":"model","importance":"0.95","values":["G500","MX Master 3S"]},{"attribute":"color","importance":"0.00","values":["missing","Graphite"]}]"""


# Four-message chat template: system message, the user question, the model's
# earlier answer, then the follow-up instruction. The instruction is supplied
# through the {task_structured_explanation} variable rather than embedded in
# the template, presumably because its JSON example contains literal braces
# that the template would otherwise treat as placeholders.
STEP_EXPLANATION_STRUCTURED_PROMPT = ChatPromptTemplate.from_messages([
            ("system", "You are a helpful AI."),
            ("human", "{user_prompt}"),
            ("ai", "{ai_answer}"),
            ("human", "{task_structured_explanation}"),
        ])




In [9]:
# Make sure `data` has a "structured_explanation" column; when it is absent,
# create it filled with empty strings.
if not {"structured_explanation"}.issubset(data.columns):
    data["structured_explanation"] = ""

In [10]:
def get_structured_explanation(start, end, max_retries=2, max_tokens=150):
    """Ask the model for a structured explanation for rows of `data`.

    For every row label ``i`` in ``range(start, end)`` the last "Q:" section of
    ``data["prompt"]`` and the stored ``data["completion"]`` are sent through
    STEP_EXPLANATION_STRUCTURED_PROMPT together with
    `task_structured_explanation`, and the raw reply is stored in the
    "structured_explanation" column of that row.

    Uses the module-level `data`, `handler`, `task_structured_explanation` and
    `STEP_EXPLANATION_STRUCTURED_PROMPT`, and modifies `data` in place.

    Args:
        start: first row label to process (inclusive).
        end: row label to stop at (exclusive).
        max_retries: how many additional attempts are made for a row whose
            request raised an exception before the row is skipped.
        max_tokens: completion token limit passed to the chat model.
            NOTE(review): the default of 150 may cut off longer JSON lists;
            confirm against stored outputs and raise it if replies are truncated.
    """
    llm = ChatOpenAI(model_name="gpt-3.5-turbo-0301", temperature=0, max_tokens=max_tokens)
    chain = LLMChain(llm=llm, prompt=STEP_EXPLANATION_STRUCTURED_PROMPT, callbacks=[handler])

    # A column that was saved empty is read back by read_csv as float NaN; cast
    # to object so string results can be stored without a dtype conflict.
    data["structured_explanation"] = data["structured_explanation"].astype(object)

    for i in range(start, end):
        print(i)
        whole_prompt = data.at[i, "prompt"]
        # Keep only the last question of a possibly multi-part prompt.
        user_prompt = "Q:" + whole_prompt.split("Q:")[-1]
        ai_answer = data.at[i, "completion"]

        # Decrementing the loop variable of a `for ... in range` loop has no
        # effect, so retries need their own bounded loop.
        output = None
        for attempt in range(max_retries + 1):
            try:
                output = chain.run(
                    user_prompt=user_prompt,
                    ai_answer=ai_answer,
                    task_structured_explanation=task_structured_explanation,
                    callbacks=[handler],
                )
                break
            except Exception as exc:  # not a bare except: Ctrl-C must still stop the run
                print(f"row {i}: request failed (attempt {attempt + 1}/{max_retries + 1}): {exc!r}")
        if output is None:
            # All attempts failed; leave the row unchanged and move on.
            continue

        print(STEP_EXPLANATION_STRUCTURED_PROMPT.format(user_prompt=user_prompt, ai_answer=ai_answer, task_structured_explanation=task_structured_explanation))
        print("----")
        print(data.at[i, "label"])
        print(output)
        print("-----------")

        # Single-step .at assignment; chained `data[col][i] = ...` may write to
        # a temporary copy instead of the frame.
        data.at[i, "structured_explanation"] = output



In [11]:
# Run the structured-explanation step for every row of `data`
# (row labels 0 .. len(data) - 1); each row triggers a model request.
get_structured_explanation(0, len(data))

0


1


In [None]:
# Show the frame as the cell output to inspect the result columns.
data

Unnamed: 0.1,Unnamed: 0,traceId,model,startTime,endTime,prompt,completion,eval,predicted_label,label,error,decision,similarity,confidence,structured_explanation
0,0,a93975a0-a858-44cc-bbb2-265b51674484,gpt-3.5-turbo-0301,2023-11-01T00:47:39.778Z,2023-11-01T00:47:46.617Z,Q: Are the following two products the same?\nP...,The first product is a refill for a lined note...,no match,False,True,FN,,,,"[\n {\n ""attribute"": ""title"",\n ""import..."
1,1,a93975a0-a858-44cc-bbb2-265b51674484,gpt-3.5-turbo-0301,2023-11-01T00:47:32.139Z,2023-11-01T00:47:38.560Z,Q: Are the following two products the same?\nP...,Both products are Maxxis Minion DHR II tires f...,Match.,True,True,NONE,,,,"[{""attribute"":""Title"",""importance"":""0.5"",""valu..."
2,2,a93975a0-a858-44cc-bbb2-265b51674484,gpt-3.5-turbo-0301,2023-11-01T00:47:22.573Z,2023-11-01T00:47:30.984Z,Q: Are the following two products the same?\nP...,Both products are related to the Traveler's No...,match,True,True,NONE,,,,"[{""attribute"":""Title"",""importance"":""0.5"",""valu..."
3,3,a93975a0-a858-44cc-bbb2-265b51674484,gpt-3.5-turbo-0301,2023-11-01T00:47:14.334Z,2023-11-01T00:47:21.359Z,Q: Are the following two products the same?\nP...,Both products are related to Traveler's notebo...,no match,False,True,FN,,,,"[\n {\n ""attribute"": ""title"",\n ""import..."
4,4,a93975a0-a858-44cc-bbb2-265b51674484,gpt-3.5-turbo-0301,2023-11-01T00:47:07.459Z,2023-11-01T00:47:13.134Z,Q: Are the following two products the same?\nP...,Both products have the same model number (CAZ1...,Match.,True,True,NONE,,,,"[\n {\n ""attribute"": ""brand"",\n ""import..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
216,216,a93975a0-a858-44cc-bbb2-265b51674484,gpt-3.5-turbo-0301,2023-10-31T23:25:30.670Z,2023-10-31T23:25:37.407Z,Q: Are the following two products the same?\nP...,The first product is a Hikvision DVR with 16 c...,No match.,False,False,NONE,,,,
217,217,a93975a0-a858-44cc-bbb2-265b51674484,gpt-3.5-turbo-0301,2023-10-31T23:25:23.095Z,2023-10-31T23:25:29.324Z,Q: Are the following two products the same?\nP...,Both products are related to Fujifilm Instax S...,No match.,False,False,NONE,,,,
218,218,a93975a0-a858-44cc-bbb2-265b51674484,gpt-3.5-turbo-0301,2023-10-31T23:25:16.669Z,2023-10-31T23:25:21.915Z,Q: Are the following two products the same?\nP...,The first product is a video switcher that all...,No match.,False,False,NONE,,,,
219,219,a93975a0-a858-44cc-bbb2-265b51674484,gpt-3.5-turbo-0301,2023-10-31T23:25:06.628Z,2023-10-31T23:25:14.377Z,Q: Are the following two products the same?\nP...,The first product is a pack of Instax Mini Rai...,No match.,False,False,NONE,,,,


In [None]:
# data.to_csv(GENERATIONS_PATH, index=False)
