In [31]:
import pandas as pd

In [76]:
# read in jsonl file
test_data = pd.read_json('data/test_ALL_with_labels.jsonl', lines=True)

In [77]:
# read in predictions
predictions = pd.read_json('inference_results.json')

In [78]:
predictions.head()

Unnamed: 0,index,label,text
0,1,Pass,Direct Mapping to CaseNumber column in KPI tab...
1,2,Pass,This is reference key to the DimDataSource tab...
2,3,Fail,Derived by applying decode logic based on Reop...
3,4,Pass,Derived by applying decode logic on SRCreation...
4,5,Fail,Direct Mapping to State column in Cases table ...


In [79]:
test_data.head()

Unnamed: 0,description,fail
0,Direct mapping to AccountTimeLineID column com...,False
1,This is a reference key to DimDataSource table...,True
2,Direct mapping to ServicePackageName column fr...,True
3,This is a reference key to DimSupportAreaPath ...,False
4,Direct mapping to InitiativeId column in OARIn...,False


In [84]:
test_data.iloc[0]['description']

'Direct mapping to AccountTimeLineID column coming from MDAP (MSA Data Analytics Platform) source,FactPerformingAndNewCountmeasures_v1 table'

In [85]:
# split each test message into columns in the dataframe
# each value int the messages column is a list of dictionaries
# each dictionary is a message
# we want to split these into columns
# we will have a column for each key in the dictionary
# we will have a row for each message

# def split_messages(messages):
#     message_dict = {}
#     for message in messages:
#         col = "role" + "_" + message["role"]
#         message_dict[col] = message["content"]
#     return pd.Series(message_dict)

# test_data_processed = test_data['messages'].apply(split_messages)
# test_data_processed.head()



In [86]:
# add label column to the dataframe (first word of role_assistant is the label "pass" or "fail")
# def get_label(messages):
#     return messages.split(", ")[0]

# test_data_processed['real_label'] = test_data_processed['role_assistant'].apply(get_label)

In [87]:
# test_data_processed.head()

In [89]:
# fail is 1, pass is 0
test_data['real_label'] = test_data['fail'].apply(lambda x: 1 if x else 0)
predictions['pred_label'] = predictions['label'].apply(lambda x: 1 if x == "Fail" else 0)

In [90]:
# merge the two dataframes on 'text' and 'role_user'

merged = pd.merge(test_data, predictions, left_on=['description'], right_on=['text'])


In [91]:
merged.head()

Unnamed: 0,description,fail,real_label,index,label,text,pred_label
0,SurrogateKey generated within UDP NRT to ident...,True,1,30,Pass,SurrogateKey generated within UDP NRT to ident...,0
1,Timestamp in UTC at which record is updated by...,False,0,47,Fail,Timestamp in UTC at which record is updated by...,1
2,Timestamp in UTC at which record is updated by...,False,0,47,Fail,Timestamp in UTC at which record is updated by...,1


In [63]:
# drop everything but "text", "real_label", "pred_label"
merged = merged[['text', 'real_label', 'pred_label']]

In [64]:
merged.head()

Unnamed: 0,text,real_label,pred_label
0,Direct Mapping to CaseNumber column in KPI tab...,0,0
1,This is reference key to the DimDataSource tab...,0,0
2,Derived by applying decode logic based on Reop...,0,1
3,Derived by applying decode logic on SRCreation...,0,0
4,Direct Mapping to State column in Cases table ...,0,1


In [65]:
# calculate accuracy
accuracy = (merged['real_label'] == merged['pred_label']).sum() / len(merged)

In [66]:
accuracy

np.float64(0.8235294117647058)

In [67]:
# calculate precision
true_positives = ((merged['real_label'] == 1) & (merged['pred_label'] == 1)).sum()
false_positives = ((merged['real_label'] == 0) & (merged['pred_label'] == 1)).sum()
precision = true_positives / (true_positives + false_positives)
precision

np.float64(0.0)

In [68]:
# calculate recall
false_negatives = ((merged['real_label'] == 1) & (merged['pred_label'] == 0)).sum()
recall = true_positives / (true_positives + false_negatives)
recall

  recall = true_positives / (true_positives + false_negatives)


np.float64(nan)

In [69]:
false_negatives

np.int64(0)

In [70]:
merged['real_label'].sum()

np.int64(0)

In [71]:
merged['pred_label'].sum()

np.int64(18)

In [92]:
# read in data/cleaned_outputs.json file
cleaned_outputs = pd.read_json('data/cleaned_outputs.json')
cleaned_outputs.head()

Unnamed: 0,index,description,decision,full_reasoning
0,1,Direct mapping to AccountTimeLineID column com...,Pass,Reason:This explanation includes source detail...
1,2,This is a reference key to DimDataSource table...,Fail,Reason:The description should specify the sour...
2,3,This is a reference key to DimSupportAreaPath ...,Pass,Reason:This description includes the needed so...
3,4,Direct mapping to InitiativeId column in OARIn...,Pass,Reason:Provides source identification and tran...
4,5,This field references the DimGeography table t...,Fail,Reason:The description is not clear enough bec...


In [93]:
# compare the cleaned outputs to the predictions row by row
# if the cleaned output is the same as the prediction, then the prediction is correct

cleaned_outputs['pred_label'] = cleaned_outputs['decision'].apply(lambda x: 1 if x == "Fail" else 0)
cleaned_outputs.head()

Unnamed: 0,index,description,decision,full_reasoning,pred_label
0,1,Direct mapping to AccountTimeLineID column com...,Pass,Reason:This explanation includes source detail...,0
1,2,This is a reference key to DimDataSource table...,Fail,Reason:The description should specify the sour...,1
2,3,This is a reference key to DimSupportAreaPath ...,Pass,Reason:This description includes the needed so...,0
3,4,Direct mapping to InitiativeId column in OARIn...,Pass,Reason:Provides source identification and tran...,0
4,5,This field references the DimGeography table t...,Fail,Reason:The description is not clear enough bec...,1


In [96]:
# append real_label to the cleaned_outputs dataframe

cleaned_outputs['real_label'] = test_data['real_label']
cleaned_outputs.head()


Unnamed: 0,index,description,decision,full_reasoning,pred_label,real_label
0,1,Direct mapping to AccountTimeLineID column com...,Pass,Reason:This explanation includes source detail...,0,0
1,2,This is a reference key to DimDataSource table...,Fail,Reason:The description should specify the sour...,1,1
2,3,This is a reference key to DimSupportAreaPath ...,Pass,Reason:This description includes the needed so...,0,1
3,4,Direct mapping to InitiativeId column in OARIn...,Pass,Reason:Provides source identification and tran...,0,0
4,5,This field references the DimGeography table t...,Fail,Reason:The description is not clear enough bec...,1,0


In [98]:
# calculate accuracy
accuracy = (cleaned_outputs['real_label'] == cleaned_outputs['pred_label']).sum() / len(cleaned_outputs)
accuracy

np.float64(0.4909090909090909)