This notebook contains the result analysis of the FT experiments on the pathology reports dataset.
Evaluates the model performance on the downstream task, different experiments use a different number of variables/dimensions to add complexity to the task.

In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import json

In [None]:
df = pd.read_csv('llamabase-eval-results.csv', sep=';')
df.iloc[2]

In [3]:
label = df.iloc[0]['label']
label

'{"intervention":"Biopsie","dignity":"maligne","entity":"Osteosarkom","subentity":null}'

In [None]:
predicted_interventions = []
predicted_dignities = []
predicted_entities = []
predicted_subentities = []
true_interventions = []
true_dignities = []
true_entities = []
true_subentities = []
for _,row in df.iterrows():
  prediction = row['prediction']
  prediction = json.loads(prediction)
  label = row['label']
  label = json.loads(label)

  predicted_interventions.append(prediction['intervention'].lower())
  predicted_dignities.append(prediction['dignity'].lower())
  # predicted_entities.append(label['entity'].lower())
  # predicted_subentities.append(label['subentity'])
  true_interventions.append(label['intervention'].lower())
  true_dignities.append(label['dignity'].lower())
  # true_entities.append(label['entity'].lower())
  # true_subentities.append(label['subentity'])

In [None]:
# Compute accuracy
intervention_accuracy = accuracy_score(true_interventions, predicted_interventions)
dignity_accuracy = accuracy_score(true_dignities, predicted_dignities)

# Compute precision
intervention_precision = precision_score(true_interventions, predicted_interventions, average='macro')
dignity_precision = precision_score(true_dignities, predicted_dignities, average='macro')

# Compute recall
intervention_recall = recall_score(true_interventions, predicted_interventions, average='macro')
dignity_recall = recall_score(true_dignities, predicted_dignities, average='macro')

# Compute F1 score
intervention_f1 = f1_score(true_interventions, predicted_interventions, average='macro')
dignity_f1 = f1_score(true_dignities, predicted_dignities, average='macro')

print(f"Intervention Accuracy: {intervention_accuracy:.4f}")
print(f"Dignity Accuracy: {dignity_accuracy:.4f}")
print(f"Intervention Precision: {intervention_precision:.4f}")
print(f"Dignity Precision: {dignity_precision:.4f}")
print(f"Intervention Recall: {intervention_recall:.4f}")
print(f"Dignity Recall: {dignity_recall:.4f}")
print(f"Intervention F1 Score: {intervention_f1:.4f}")
print(f"Dignity F1 Score: {dignity_f1:.4f}")

Intervention Accuracy: 0.3828
Dignity Accuracy: 0.5000
Intervention Precision: 0.4356
Dignity Precision: 0.3625
Intervention Recall: 0.3890
Dignity Recall: 0.3046
Intervention F1 Score: 0.3448
Dignity F1 Score: 0.3310


In [None]:
files = ['llamabase-eval-results.csv', 'ortho-ft-4dim-X.csv']
for i in range(0,5):
  files.append(f'ortho-ft-4dim-v{i}.csv')
print(files)
for file in files:
  df = pd.read_csv(file, sep=';')
  predicted_interventions = []
  predicted_dignities = []
  true_interventions = []
  true_dignities = []
  predicted_entities = []
  true_entities = []
  predicted_subentities = []
  true_subentities = []
  for _,row in df.iterrows():
    prediction = row['prediction']
    prediction = json.loads(prediction)
    label = row['label']
    label = json.loads(label)

    predicted_interventions.append(prediction['intervention'].lower())
    predicted_dignities.append(prediction['dignity'].lower())
    true_interventions.append(label['intervention'].lower())
    true_dignities.append(label['dignity'].lower())
    if label['entity'] != None and prediction['entity'] != None:
      predicted_entities.append(str(prediction['entity']).lower())
      true_entities.append(str(label['entity']).lower())
    else:
      predicted_entities.append(str(prediction['entity']))
      true_entities.append(str(label['entity']))
    if label['subentity'] != None and prediction['subentity'] != None:
      predicted_subentities.append(str(prediction['subentity']).lower())
      true_subentities.append(str(label['subentity']).lower())
    else:
      predicted_subentities.append(str(prediction['subentity']))
      true_subentities.append(str(label['subentity']))
  
  # Compute accuracy
  intervention_accuracy = accuracy_score(true_interventions, predicted_interventions)
  dignity_accuracy = accuracy_score(true_dignities, predicted_dignities)
  entity_accuracy = accuracy_score(true_entities, predicted_entities)
  subentity_accuracy = accuracy_score(true_subentities, predicted_subentities)

  # Compute precision
  intervention_precision = precision_score(true_interventions, predicted_interventions, average='macro')
  dignity_precision = precision_score(true_dignities, predicted_dignities, average='macro')
  entity_precision = precision_score(true_entities, predicted_entities, average='macro')
  subentity_precision = precision_score(true_subentities, predicted_subentities, average='macro')

  # Compute recall
  intervention_recall = recall_score(true_interventions, predicted_interventions, average='macro')
  dignity_recall = recall_score(true_dignities, predicted_dignities, average='macro')
  entity_recall = recall_score(true_entities, predicted_entities, average='macro')
  subentity_recall = recall_score(true_subentities, predicted_subentities, average='macro')

  # Compute F1 score
  intervention_f1 = f1_score(true_interventions, predicted_interventions, average='macro')
  dignity_f1 = f1_score(true_dignities, predicted_dignities, average='macro')
  entity_f1 = f1_score(true_entities, predicted_entities, average='macro')
  subentity_f1 = f1_score(true_subentities, predicted_subentities, average='macro')
  print(f"-----------------'{file}'----------------")
  print(f"Intervention Accuracy: {intervention_accuracy:.4f}")
  print(f"Dignity Accuracy: {dignity_accuracy:.4f}")
  print(f"Intervention F1 Score: {intervention_f1:.4f}")
  print(f"Dignity F1 Score: {dignity_f1:.4f}")
  print(f"Entity F1 Score: {entity_f1:.4f}")
  print(f"Subentity F1 Score: {subentity_f1:.4f}")
  print(f"Intervention Precision: {intervention_precision:.4f}")
  

['llamabase-eval-results.csv', 'ortho-ft-4dim-X.csv', 'ortho-ft-4dim-v0.csv', 'ortho-ft-4dim-v1.csv', 'ortho-ft-4dim-v2.csv', 'ortho-ft-4dim-v3.csv', 'ortho-ft-4dim-v4.csv']


-----------------'llamabase-eval-results.csv'----------------
Intervention Accuracy: 0.3828
Dignity Accuracy: 0.5000
Intervention F1 Score: 0.3448
Dignity F1 Score: 0.3310
Entity F1 Score: 0.0124
Subentity F1 Score: 0.0011
Intervention Precision: 0.4356
-----------------'ortho-ft-4dim-X.csv'----------------
Intervention Accuracy: 0.7852
Dignity Accuracy: 0.9375
Intervention F1 Score: 0.7831
Dignity F1 Score: 0.9287
Entity F1 Score: 0.3063
Subentity F1 Score: 0.1777
Intervention Precision: 0.7864


-----------------'ortho-ft-4dim-v0.csv'----------------
Intervention Accuracy: 0.5586
Dignity Accuracy: 0.8555
Intervention F1 Score: 0.5192
Dignity F1 Score: 0.8474
Entity F1 Score: 0.1038
Subentity F1 Score: 0.0387
Intervention Precision: 0.5818
-----------------'ortho-ft-4dim-v1.csv'----------------
Intervention Accuracy: 0.7734
Dignity Accuracy: 0.9570
Intervention F1 Score: 0.7577
Dignity F1 Score: 0.9508
Entity F1 Score: 0.3310
Subentity F1 Score: 0.2217
Intervention Precision: 0.7683
-----------------'ortho-ft-4dim-v2.csv'----------------
Intervention Accuracy: 0.5664
Dignity Accuracy: 0.8125
Intervention F1 Score: 0.5547
Dignity F1 Score: 0.7719
Entity F1 Score: 0.1188
Subentity F1 Score: 0.0159
Intervention Precision: 0.5821
-----------------'ortho-ft-4dim-v3.csv'----------------
Intervention Accuracy: 0.7305
Dignity Accuracy: 0.9688
Intervention F1 Score: 0.7196
Dignity F1 Score: 0.9648
Entity F1 Score: 0.3005
Subentity F1 Score: 0.1818
Intervention Precision: 0.7921
--------

In [None]:
files = []
for i in range(3):
  files.append(f'ortho-ft-5dim-v{i}.csv')
print(files)
for file in files:
  df = pd.read_csv(file, sep=';')
  predicted_interventions = []
  predicted_dignities = []
  true_interventions = []
  true_dignities = []
  predicted_entities = []
  true_entities = []
  predicted_subentities = []
  true_subentities = []
  predicted_location = []
  true_location = []
  for _,row in df.iterrows():
    prediction = row['prediction']
    try:
      prediction = json.loads(prediction)
    except:
      prediction = {'intervention': "", 'dignity': "", 'entity': None, 'subentity': None, 'location': None}
    label = row['label']
    label = json.loads(label)

    predicted_interventions.append(prediction['intervention'].lower())
    predicted_dignities.append(prediction['dignity'].lower())
    true_interventions.append(label['intervention'].lower())
    true_dignities.append(label['dignity'].lower())

    try:
      if label['entity'] != None and prediction['entity'] != None:
        predicted_entities.append(str(prediction['entity']).lower())
        true_entities.append(str(label['entity']).lower())
      else:
        predicted_entities.append(str(prediction['entity']))
        true_entities.append(str(label['entity']))
      if label['subentity'] != None and prediction['subentity'] != None:
        predicted_subentities.append(str(prediction['subentity']).lower())
        true_subentities.append(str(label['subentity']).lower())
      else:
        predicted_subentities.append(str(prediction['subentity']))
        true_subentities.append(str(label['subentity']))
      if label['location'] != None and prediction['location'] != None:
        predicted_location.append(str(prediction['location']).lower())
        true_location.append(str(label['location']).lower())
      else:
        predicted_location.append(str(prediction['location']))
        true_location.append(str(label['location']))
    except:
      continue
  
  # Compute accuracy
  intervention_accuracy = accuracy_score(true_interventions, predicted_interventions)
  dignity_accuracy = accuracy_score(true_dignities, predicted_dignities)
  try:
    entity_accuracy = accuracy_score(true_entities, predicted_entities)
    subentity_accuracy = accuracy_score(true_subentities, predicted_subentities)
  except:
    entity_accuracy = 0
    subentity_accuracy = 0
  # Compute precision
  intervention_precision = precision_score(true_interventions, predicted_interventions, average='macro', zero_division=0)
  dignity_precision = precision_score(true_dignities, predicted_dignities, average='macro', zero_division=0)
  try:
    entity_precision = precision_score(true_entities, predicted_entities, average='macro', zero_division=0)
    subentity_precision = precision_score(true_subentities, predicted_subentities, average='macro', zero_division=0)
  except:
    entity_precision = 0
    subentity_precision = 0

  # Compute recall
  intervention_recall = recall_score(true_interventions, predicted_interventions, average='macro', zero_division=0)
  dignity_recall = recall_score(true_dignities, predicted_dignities, average='macro', zero_division=0)
  try:
    entity_recall = recall_score(true_entities, predicted_entities, average='macro', zero_division=0)
    subentity_recall = recall_score(true_subentities, predicted_subentities, average='macro', zero_division=0)
  except:
    entity_recall = 0
    subentity_recall = 0

  # Compute F1 score
  intervention_f1 = f1_score(true_interventions, predicted_interventions, average='macro', zero_division=0)
  dignity_f1 = f1_score(true_dignities, predicted_dignities, average='macro', zero_division=0)
  try:
    entity_f1 = f1_score(true_entities, predicted_entities, average='macro', zero_division=0)
    subentity_f1 = f1_score(true_subentities, predicted_subentities, average='macro', zero_division=0)
    location_f1 = f1_score(true_location, predicted_location, average='macro', zero_division=0)
  except:
    entity_f1 = 0
    subentity_f1 = 0
    location_f1 = 0
  print(f"-----------------'{file}'----------------")
  print(f"Intervention Accuracy: {intervention_accuracy:.4f}")
  print(f"Dignity Accuracy: {dignity_accuracy:.4f}")
  print(f"Intervention F1 Score: {intervention_f1:.4f}")
  print(f"Dignity F1 Score: {dignity_f1:.4f}")
  print(f"Entity F1 Score: {entity_f1:.4f}")
  print(f"Subentity F1 Score: {subentity_f1:.4f}")
  print(f"Intervention Precision: {intervention_precision:.4f}")
  
  print(f"Location F1 Score: {location_f1:.4f}")
  

['ortho-ft-5dim-no_ligerZ.csv', 'DP-Orthov2_results.csv', 'DP-Ortho-3B-v2_results.csv', 'DP-Ortho-3B-20epoch_results.csv', 'DP-Ortho-3B-20epoch_results-2.csv', 'DP-Ortho-3B-20epoch_results-3.csv', 'DP-Orthov4_results.csv', 'DP-Ortho-Eps3_results.csv', 'DP-Ortho-Eps3-20epoch_results.csv', 'DP-Ortho-Eps6-20epoch_results.csv', 'DP-Ortho-Eps6_results.csv', 'DP-Ortho-Eps6-v2_results.csv', 'DP-Ortho-50epoch_results.csv', 'DP-Ortho-50epoch2_results.csv', 'DP-Ortho-50epoch3_results.csv', 'ortho-3B-5dim_results_2.csv', 'DP-Orthov3_results.csv', 'DP-Ortho-Eps3-50epoch2_results.csv', 'DP-Ortho-Eps3-50epoch3_results.csv', 'DP-FT-models-DP-Ortho-20epoch-new_results-2.csv', 'DP-FT-models-DP-Ortho-20epoch-new-final_model_results-2.csv', 'ortho-ft-5dim-v0.csv', 'ortho-ft-5dim-v1.csv', 'ortho-ft-5dim-v2.csv']
-----------------'ortho-ft-5dim-no_ligerZ.csv'----------------
Intervention Accuracy: 0.7812
Dignity Accuracy: 0.9688
Intervention F1 Score: 0.7677
Dignity F1 Score: 0.9654
Entity F1 Score: 0.3444

In [16]:
file = "DP-Orthov-1dim_results.csv"
df = pd.read_csv(file, sep=';')
predicted_interventions = []
true_interventions = []

for _,row in df.iterrows():
  prediction = row['prediction']
  try:
    prediction = json.loads(prediction)
  except:
    prediction = {'intervention': ""}
  label = row['label']
  label = json.loads(label)

  predicted_interventions.append(prediction['intervention'].lower())
  true_interventions.append(label['intervention'].lower())

  
# Compute accuracy
intervention_accuracy = accuracy_score(true_interventions, predicted_interventions)
dignity_accuracy = accuracy_score(true_dignities, predicted_dignities)

# Compute precision
intervention_precision = precision_score(true_interventions, predicted_interventions, average='macro', zero_division=0)

# Compute recall
intervention_recall = recall_score(true_interventions, predicted_interventions, average='macro', zero_division=0)

# Compute F1 score
intervention_f1 = f1_score(true_interventions, predicted_interventions, average='macro', zero_division=0)

print(f"-----------------'{file}'----------------")
print(f"Intervention Accuracy: {intervention_accuracy:.4f}")
print(f"Intervention F1 Score: {intervention_f1:.4f}")
print(f"Intervention Precision: {intervention_precision:.4f}")

print(f"Location F1 Score: {location_f1:.4f}")
  

-----------------'DP-Orthov-1dim_results.csv'----------------
Intervention Accuracy: 0.4453
Intervention F1 Score: 0.4132
Intervention Precision: 0.4154
Location F1 Score: 0.3160


In [17]:
file = "DP-Orthov-2dim_results.csv"
df = pd.read_csv(file, sep=';')
predicted_interventions = []
predicted_dignities = []
true_interventions = []
true_dignities = []

for _,row in df.iterrows():
  prediction = row['prediction']
  try:
    prediction = json.loads(prediction)
  except:
    prediction = {'intervention': "", 'dignity': ""}
  label = row['label']
  label = json.loads(label)

  predicted_interventions.append(prediction['intervention'].lower())
  predicted_dignities.append(prediction['dignity'].lower())
  true_interventions.append(label['intervention'].lower())
  true_dignities.append(label['dignity'].lower())

# Compute accuracy
intervention_accuracy = accuracy_score(true_interventions, predicted_interventions)
dignity_accuracy = accuracy_score(true_dignities, predicted_dignities)

# Compute recall
intervention_recall = recall_score(true_interventions, predicted_interventions, average='macro', zero_division=0)
dignity_recall = recall_score(true_dignities, predicted_dignities, average='macro', zero_division=0)
# Compute F1 score
intervention_f1 = f1_score(true_interventions, predicted_interventions, average='macro', zero_division=0)
dignity_f1 = f1_score(true_dignities, predicted_dignities, average='macro', zero_division=0)

print(f"-----------------'{file}'----------------")
print(f"Intervention Accuracy: {intervention_accuracy:.4f}")
print(f"Dignity Accuracy: {dignity_accuracy:.4f}")

print(f"Intervention F1 Score: {intervention_f1:.4f}")
print(f"Dignity F1 Score: {dignity_f1:.4f}")

print(f"Dignity Precision: {dignity_precision:.4f}")
print(f"Entity Precision: {entity_precision:.4f}")
print(f"Subentity Precision: {subentity_precision:.4f}")
print(f"Intervention Recall: {intervention_recall:.4f}")
print(f"Dignity Recall: {dignity_recall:.4f}")


print(f"Location F1 Score: {location_f1:.4f}")
  

-----------------'DP-Orthov-2dim_results.csv'----------------
Intervention Accuracy: 0.5664
Dignity Accuracy: 0.6992
Intervention F1 Score: 0.5298
Dignity F1 Score: 0.4095
Dignity Precision: 0.6021
Entity Precision: 0.2971
Subentity Precision: 0.1866
Intervention Recall: 0.5384
Dignity Recall: 0.4057
Location F1 Score: 0.3160


In [None]:
files = ['ortho-ft-5dim-no_ligerZ.csv', 'ortho-ft-5dim-v0.csv', 'ortho-ft-5dim-v1.csv', 'ortho-ft-5dim-v2.csv',
          'ortho-ft-5dim-real_results_42.csv',
          'ortho-ft-5dim-real_results_2442.csv']
for i in range(3):
  files.append(f'ortho-ft-5dim-v{i}.csv')
print(files)
for file in files:
  df = pd.read_csv(file, sep=';')
  predicted_interventions = []
  predicted_dignities = []
  true_interventions = []
  true_dignities = []
  predicted_entities = []
  true_entities = []
  predicted_subentities = []
  true_subentities = []
  predicted_location = []
  true_location = []
  for _,row in df.iterrows():
    prediction = row['prediction']
    try:
      prediction = json.loads(prediction)
    except:
      prediction = {'intervention': "", 'dignity': "", 'entity': None, 'subentity': None, 'location': None}
    label = row['label']
    label = json.loads(label)

    predicted_interventions.append(prediction['intervention'].lower())
    predicted_dignities.append(prediction['dignity'].lower())
    true_interventions.append(label['intervention'].lower())
    true_dignities.append(label['dignity'].lower())

    try:
      if label['entity'] != None and prediction['entity'] != None:
        predicted_entities.append(str(prediction['entity']).lower())
        true_entities.append(str(label['entity']).lower())
      else:
        predicted_entities.append(str(prediction['entity']))
        true_entities.append(str(label['entity']))
      if label['subentity'] != None and prediction['subentity'] != None:
        predicted_subentities.append(str(prediction['subentity']).lower())
        true_subentities.append(str(label['subentity']).lower())
      else:
        predicted_subentities.append(str(prediction['subentity']))
        true_subentities.append(str(label['subentity']))
      if label['location'] != None and prediction['location'] != None:
        predicted_location.append(str(prediction['location']).lower())
        true_location.append(str(label['location']).lower())
      else:
        predicted_location.append(str(prediction['location']))
        true_location.append(str(label['location']))
    except:
      continue
  
  # Compute accuracy
  intervention_accuracy = accuracy_score(true_interventions, predicted_interventions)
  dignity_accuracy = accuracy_score(true_dignities, predicted_dignities)
  try:
    entity_accuracy = accuracy_score(true_entities, predicted_entities)
    subentity_accuracy = accuracy_score(true_subentities, predicted_subentities)
  except:
    entity_accuracy = 0
    subentity_accuracy = 0
  # Compute precision
  intervention_precision = precision_score(true_interventions, predicted_interventions, average='macro', zero_division=0)
  dignity_precision = precision_score(true_dignities, predicted_dignities, average='macro', zero_division=0)
  try:
    entity_precision = precision_score(true_entities, predicted_entities, average='macro', zero_division=0)
    subentity_precision = precision_score(true_subentities, predicted_subentities, average='macro', zero_division=0)
  except:
    entity_precision = 0
    subentity_precision = 0

  # Compute recall
  intervention_recall = recall_score(true_interventions, predicted_interventions, average='macro', zero_division=0)
  dignity_recall = recall_score(true_dignities, predicted_dignities, average='macro', zero_division=0)
  try:
    entity_recall = recall_score(true_entities, predicted_entities, average='macro', zero_division=0)
    subentity_recall = recall_score(true_subentities, predicted_subentities, average='macro', zero_division=0)
  except:
    entity_recall = 0
    subentity_recall = 0

  # Compute F1 score
  intervention_f1 = f1_score(true_interventions, predicted_interventions, average='macro', zero_division=0)
  dignity_f1 = f1_score(true_dignities, predicted_dignities, average='macro', zero_division=0)
  try:
    entity_f1 = f1_score(true_entities, predicted_entities, average='macro', zero_division=0)
    subentity_f1 = f1_score(true_subentities, predicted_subentities, average='macro', zero_division=0)
    location_f1 = f1_score(true_location, predicted_location, average='macro', zero_division=0)
  except:
    entity_f1 = 0
    subentity_f1 = 0
    location_f1 = 0
  print(f"-----------------'{file}'----------------")
  print(f"Intervention Accuracy: {intervention_accuracy:.4f}")
  print(f"Dignity Accuracy: {dignity_accuracy:.4f}")
  print(f"Intervention F1 Score: {intervention_f1:.4f}")
  print(f"Dignity F1 Score: {dignity_f1:.4f}")
  print(f"Entity F1 Score: {entity_f1:.4f}")
  print(f"Subentity F1 Score: {subentity_f1:.4f}")
  print(f"Intervention Precision: {intervention_precision:.4f}")
  
  print(f"Location F1 Score: {location_f1:.4f}")
  

['ortho-ft-5dim-no_ligerZ.csv', 'ortho-ft-5dim-v0.csv', 'ortho-ft-5dim-v1.csv', 'ortho-ft-5dim-v2.csv', 'ortho-ft-5dim-real_results_42.csv', 'ortho-ft-5dim-real_results_2442.csv', 'ortho-ft-5dim-v0.csv', 'ortho-ft-5dim-v1.csv', 'ortho-ft-5dim-v2.csv']
-----------------'ortho-ft-5dim-no_ligerZ.csv'----------------
Intervention Accuracy: 0.7812
Dignity Accuracy: 0.9688
Intervention F1 Score: 0.7677
Dignity F1 Score: 0.9654
Entity F1 Score: 0.3444
Subentity F1 Score: 0.2405
Intervention Precision: 0.7659
Location F1 Score: 0.3884
-----------------'ortho-ft-5dim-v0.csv'----------------
Intervention Accuracy: 0.8281
Dignity Accuracy: 0.9336
Intervention F1 Score: 0.8218
Dignity F1 Score: 0.9274
Entity F1 Score: 0.2771
Subentity F1 Score: 0.2820
Intervention Precision: 0.8170
Location F1 Score: 0.3607
-----------------'ortho-ft-5dim-v1.csv'----------------
Intervention Accuracy: 0.7812
Dignity Accuracy: 0.9336
Intervention F1 Score: 0.7730
Dignity F1 Score: 0.9262
Entity F1 Score: 0.3092
Sub