In [35]:
import pandas as pd

In [36]:
# Load the evaluation set from a JSON Lines file (one JSON record per line).
test_data = pd.read_json(path_or_buf='data/test.jsonl', lines=True)

In [37]:
# Load the model's inference results (read as a regular JSON document,
# i.e. without lines=True).
predictions = pd.read_json(path_or_buf='inference_results.json')

In [38]:
# Preview the first rows of the predictions frame.
predictions.head()

Unnamed: 0,index,label,text
0,1,Pass,Direct Mapping to CaseNumber column in KPI tab...
1,2,Pass,This is reference key to the DimDataSource tab...
2,3,Fail,Derived by applying decode logic based on Reop...
3,4,Pass,Derived by applying decode logic on SRCreation...
4,5,Fail,Direct Mapping to State column in Cases table ...


In [39]:
# Inspect the raw `messages` value of the first test record.
test_data.iloc[0]['messages']

[{'role': 'system',
  'content': "You are an AI text quality reviewer tool. Your task is to review technical descriptions for data entities and attributes. Based on the quality of the description, you will output either 'Pass' or 'Fail'.\n'Pass' means the description meets the quality standards and provides clear, accurate, and complete information.\n'Fail' means the description does not meet the required standards, and you must provide a specific reason for the failure.\nYour output format will be: <Pass or Fail>, <N/A or reason for failure>"},
 {'role': 'user',
  'content': 'Direct Mapping to CaseNumber column in KPI table coming from DFM Events data processed by Cornerstone having string values.'},
 {'role': 'assistant', 'content': 'pass, N/A'}]

In [40]:
# Split each conversation in the `messages` column into separate columns.
# Each value in the `messages` column is a list of dictionaries,
# and each dictionary is one message with a `role` and a `content`.
# We want to spread these messages across columns:
# one column per role (named "role_<role>") holding that message's content,
# and one row per conversation (i.e. per row of `test_data`).

def split_messages(messages):
    """Flatten one conversation into a Series keyed by role.

    Args:
        messages: An iterable of message dicts, each with a "role" and a
            "content" entry.

    Returns:
        A pandas Series with one entry per distinct role, labelled
        "role_<role>" and holding that message's content. When a role
        occurs more than once, the last message with that role wins.
    """
    contents_by_role = {
        f"role_{entry['role']}": entry["content"] for entry in messages
    }
    return pd.Series(contents_by_role)

# split_messages returns a Series, so Series.apply expands the results into a
# DataFrame: one row per entry of `messages`, one column per role.
test_data_processed = test_data['messages'].apply(split_messages)
# Preview the first rows of the reshaped frame.
test_data_processed.head()



Unnamed: 0,role_system,role_user,role_assistant
0,You are an AI text quality reviewer tool. Your...,Direct Mapping to CaseNumber column in KPI tab...,"pass, N/A"
1,You are an AI text quality reviewer tool. Your...,This is reference key to the DimDataSource tab...,"pass, N/A"
2,You are an AI text quality reviewer tool. Your...,Derived by applying decode logic based on Reop...,"pass, N/A"
3,You are an AI text quality reviewer tool. Your...,Derived by applying decode logic on SRCreation...,"fail, What is significance of FI, please defin..."
4,You are an AI text quality reviewer tool. Your...,Direct Mapping to State column in Cases table ...,"pass, N/A"


In [41]:
# Extract the label ("pass" or "fail") that precedes the first comma of the
# assistant reply, e.g. "fail, <reason>" -> "fail".
def get_label(messages):
    """Return the normalized pass/fail label from an assistant reply.

    Args:
        messages: The assistant reply text, formatted as
            "<label>, <N/A or reason for failure>".

    Returns:
        The text before the first comma, stripped of surrounding whitespace
        and lower-cased (e.g. "pass" or "fail"). If the reply contains no
        comma, the whole normalized string is returned.
    """
    # Split on the first comma only (the reason may contain commas itself) and
    # normalize case/whitespace so "Fail, ..." or "fail,reason" are not
    # silently treated as a non-fail label by the exact "fail" comparison
    # applied later in the notebook.
    return messages.split(",", 1)[0].strip().lower()

# Derive the ground-truth label column from the assistant replies.
test_data_processed['real_label'] = test_data_processed['role_assistant'].map(get_label)

In [42]:
# Preview the frame with the newly added `real_label` column.
test_data_processed.head()

Unnamed: 0,role_system,role_user,role_assistant,real_label
0,You are an AI text quality reviewer tool. Your...,Direct Mapping to CaseNumber column in KPI tab...,"pass, N/A",pass
1,You are an AI text quality reviewer tool. Your...,This is reference key to the DimDataSource tab...,"pass, N/A",pass
2,You are an AI text quality reviewer tool. Your...,Derived by applying decode logic based on Reop...,"pass, N/A",pass
3,You are an AI text quality reviewer tool. Your...,Derived by applying decode logic on SRCreation...,"fail, What is significance of FI, please defin...",fail
4,You are an AI text quality reviewer tool. Your...,Direct Mapping to State column in Cases table ...,"pass, N/A",pass


In [43]:
# Encode labels as integers: fail is 1, pass is 0.
def _encode_label(label):
    """Map a pass/fail label to 1 (fail) or 0 (anything else).

    String labels are compared ignoring case and surrounding whitespace, so
    "fail", "Fail" and " FAIL " are all encoded as 1; every other string is
    encoded as 0.

    Non-string values are treated as already encoded: 1 stays 1 and anything
    else becomes 0. This keeps the cell idempotent: `real_label` is
    overwritten in place below, and re-running a plain string comparison on
    the already-encoded integers would turn every label into 0.
    """
    if isinstance(label, str):
        return 1 if label.strip().lower() == "fail" else 0
    return 1 if label == 1 else 0


# The same encoder is used for both frames so the ground truth ("fail") and
# the predictions ("Fail") cannot drift apart on letter case.
test_data_processed['real_label'] = test_data_processed['real_label'].apply(_encode_label)
predictions['pred_label'] = predictions['label'].apply(_encode_label)

In [44]:
# Join ground truth and predictions on the description text
# (`role_user` on the left, `text` on the right).
#
# validate='one_to_one' makes pandas raise a MergeError if a description
# appears more than once on either side. Without it, duplicated text would
# silently multiply rows in `merged` and skew every metric computed from it.
#
# NOTE(review): this is an inner join, so rows whose text has no exact match
# on the other side are dropped; compare len(merged) with the input lengths.
merged = pd.merge(
    test_data_processed,
    predictions,
    left_on='role_user',
    right_on='text',
    how='inner',
    validate='one_to_one',
)


In [45]:
# Preview the joined frame (ground-truth and prediction columns side by side).
merged.head()

Unnamed: 0,role_system,role_user,role_assistant,real_label,index,label,text,pred_label
0,You are an AI text quality reviewer tool. Your...,Direct Mapping to CaseNumber column in KPI tab...,"pass, N/A",0,1,Pass,Direct Mapping to CaseNumber column in KPI tab...,0
1,You are an AI text quality reviewer tool. Your...,This is reference key to the DimDataSource tab...,"pass, N/A",0,2,Pass,This is reference key to the DimDataSource tab...,0
2,You are an AI text quality reviewer tool. Your...,Derived by applying decode logic based on Reop...,"pass, N/A",0,3,Fail,Derived by applying decode logic based on Reop...,1
3,You are an AI text quality reviewer tool. Your...,Derived by applying decode logic on SRCreation...,"fail, What is significance of FI, please defin...",1,4,Pass,Derived by applying decode logic on SRCreation...,0
4,You are an AI text quality reviewer tool. Your...,Direct Mapping to State column in Cases table ...,"pass, N/A",0,5,Fail,Direct Mapping to State column in Cases table ...,1


In [46]:
# Keep only the columns needed for scoring: the description text, the encoded
# ground-truth label and the encoded predicted label.
merged = merged.loc[:, ['text', 'real_label', 'pred_label']]

In [47]:
# Preview the trimmed frame used for the metric calculations.
merged.head()

Unnamed: 0,text,real_label,pred_label
0,Direct Mapping to CaseNumber column in KPI tab...,0,0
1,This is reference key to the DimDataSource tab...,0,0
2,Derived by applying decode logic based on Reop...,0,1
3,Derived by applying decode logic on SRCreation...,1,0
4,Direct Mapping to State column in Cases table ...,0,1


In [48]:
# Accuracy: fraction of rows whose predicted label equals the ground truth.
# The mean of the boolean match mask is the number of matches over the row count.
label_matches = merged['real_label'] == merged['pred_label']
accuracy = label_matches.mean()

In [49]:
# Display the accuracy computed in the previous cell.
accuracy

np.float64(0.7352941176470589)

In [50]:
# Precision for the "fail" class (encoded as 1): TP / (TP + FP).
# If no row is predicted as fail the denominator is zero and the result is
# not a meaningful number.
predicted_fail = merged['pred_label'] == 1
correctly_flagged = (merged['real_label'] == 1) & predicted_fail
wrongly_flagged = (merged['real_label'] == 0) & predicted_fail
true_positives = correctly_flagged.sum()
false_positives = wrongly_flagged.sum()
predicted_positive_count = true_positives + false_positives
precision = true_positives / predicted_positive_count
precision

np.float64(0.0)

In [53]:
# Recall for the "fail" class (encoded as 1): TP / (TP + FN).
# `true_positives` comes from the precision cell above.
missed_fail = (merged['real_label'] == 1) & (merged['pred_label'] == 0)
false_negatives = missed_fail.sum()
actual_positive_count = true_positives + false_negatives
recall = true_positives / actual_positive_count
recall

np.float64(0.0)

In [54]:
# Sanity check: number of actual fails (real_label == 1) predicted as pass.
false_negatives

np.int64(9)

In [55]:
# Sanity check: labels are encoded 0/1, so the sum is the number of rows whose
# ground-truth label is fail.
merged['real_label'].sum()

np.int64(9)

In [56]:
# Sanity check: labels are encoded 0/1, so the sum is the number of rows the
# model predicted as fail.
merged['pred_label'].sum()

np.int64(18)