In [None]:
!pip install --quiet datasets mlflow

In [2]:
import ast

import pandas as pd
from datasets import load_dataset
from tqdm import tqdm

In [None]:
# Mount Google Drive at /content/drive (requires the Colab-only `google.colab`
# package). The result file path shown later in this notebook lives under
# this mount point.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
# Load the "dev" split of NESPED-GEN/spider_selector_schemaReduzido and keep a
# pandas copy next to the original Dataset object.
dataset = load_dataset(
    "NESPED-GEN/spider_selector_schemaReduzido",
    split="dev",
)
df = dataset.to_pandas()

In [6]:
import mlflow
import os

# MLflow connection settings. The values are intentionally blank here and must
# be filled in before running; they are exported as environment variables and
# the tracking URI is then handed to the MLflow client.
os.environ.update({
    'MLFLOW_TRACKING_URI': "",
    'MLFLOW_TRACKING_USERNAME': "",
    'MLFLOW_TRACKING_PASSWORD': "",
})
mlflow.set_tracking_uri(os.environ['MLFLOW_TRACKING_URI'])

# Experiment under which every run created by this notebook is recorded.
experiment_name = ""
mlflow_experiment = mlflow.set_experiment(experiment_name)

In [9]:
def info_result(result_path):
  """Pair each line of a result file with the dataset example at the same index.

  Line ``i`` of the file is treated as the model output for ``dataset[i]``
  (the module-level ``dataset``), so the file must follow the dataset order.
  A file with more lines than the dataset has rows raises ``IndexError``.

  Args:
    result_path: Path of the text file holding one generated answer per line.

  Returns:
    dict with the keys ``question``, ``hardness``, ``expected`` and
    ``generated``, each mapping to a list with one entry per file line.
    ``expected`` and ``generated`` are lower-cased; ``generated`` keeps the
    line's trailing newline.
  """
  test_result = {'question': [], 'hardness': [], 'expected': [], 'generated': []}

  # Explicit encoding so the file is decoded the same way on every platform.
  with open(result_path, 'r', encoding='utf-8') as file:
      for count, line in enumerate(tqdm(file, desc="Test ...")):
          # Fetch the row once instead of once per field.
          example = dataset[count]

          test_result['question'].append(example['question_en'])
          test_result['hardness'].append(example['hardness'])
          test_result['expected'].append(example['selector_correct'].lower())
          test_result['generated'].append(line.lower())

  return test_result

In [10]:
# Buckets every metric is reported for: the overall total plus one per hardness.
_HARDNESS_LEVELS = ('all', 'easy', 'medium', 'hard', 'extra')
_METRIC_ROW = '{:<10} |  {:<6.4f}  |  {:<7.4f}  |  {:<6.4f}  |   {:<6.4f}  |   {:<6}'


def _new_counters():
  """Return a fresh accumulator of test count and metric sums per bucket."""
  return {level: {'test': 0, 'accuracy': 0, 'precision': 0, 'recall': 0}
          for level in _HARDNESS_LEVELS}


def _safe_ratio(numerator, denominator):
  """Divide, yielding 0 instead of raising when the denominator is 0."""
  return numerator / denominator if denominator > 0 else 0


def _parse_selection(text):
  """Parse the lower-cased textual dict literal of a table/column selection."""
  # literal_eval only accepts Python literals, so model-generated text cannot
  # execute arbitrary code the way eval() would. Malformed text still raises
  # (ValueError / SyntaxError) rather than being silently scored.
  return ast.literal_eval(text.lower())


def _report_section(prefix, counters):
  """Print the averaged metrics of one section and log them to MLflow."""
  for key, m in counters.items():
    # A bucket may have no examples at all; report zeros instead of crashing.
    accuracy = _safe_ratio(m['accuracy'], m['test'])
    precision = _safe_ratio(m['precision'], m['test'])
    recall = _safe_ratio(m['recall'], m['test'])
    f1_score = _safe_ratio(2 * (precision * recall), precision + recall)

    print(_METRIC_ROW.format(key, accuracy, precision, recall, f1_score, m['test']))

    mlflow.log_metric(f"{prefix}_{key}_accuracy", accuracy)
    mlflow.log_metric(f"{prefix}_{key}_precision", precision)
    mlflow.log_metric(f"{prefix}_{key}_recall", recall)
    mlflow.log_metric(f"{prefix}_{key}_f1_score", f1_score)
    # One key per bucket; a shared key would be overwritten on every iteration.
    mlflow.log_metric(f"{prefix}_{key}_test", m['test'])


def metricas(experiment_id, df):
  """Score generated table/column selections and log the results to MLflow.

  Each row's ``expected`` and ``generated`` cells hold the text of a dict
  literal; the code reads its keys as table names and its values as column
  collections. Table metrics are averaged per row. Column metrics are
  averaged per table, over the union of generated and reference tables, and
  only for rows where at least one table was predicted correctly.

  Args:
    experiment_id: Name used for the MLflow run and logged as the ``model``
      parameter.
    df: DataFrame with the columns ``hardness`` (one of easy/medium/hard/
      extra), ``expected`` and ``generated``.

  Side effects:
    Prints a report, writes ``df_details.csv`` to the working directory and
    logs it together with all metrics inside a new MLflow run.

  Raises:
    ValueError, SyntaxError: if a cell is not a valid Python literal.
    KeyError: if ``hardness`` holds a value outside the known levels.
  """
  tables = _new_counters()
  columns = _new_counters()

  for _, row in df.iterrows():
    hardness = row['hardness']
    buckets = ('all', hardness)

    reference = _parse_selection(row['expected'])
    generated = _parse_selection(row['generated'])

    # -------- tables --------------------------------------------------------
    generated_tables = set(generated.keys())
    reference_tables = set(reference.keys())
    shared_tables = generated_tables & reference_tables

    # TP + FP is the number of generated tables and TP + FN the number of
    # reference tables; either side being empty scores 0 instead of dividing
    # by zero.
    precision = _safe_ratio(len(shared_tables), len(generated_tables))
    recall = _safe_ratio(len(shared_tables), len(reference_tables))
    exact_match = int(generated_tables == reference_tables)

    for level in buckets:
      tables[level]['test'] += 1
      tables[level]['accuracy'] += exact_match
      tables[level]['precision'] += precision
      tables[level]['recall'] += recall

    # -------- columns -------------------------------------------------------
    if not shared_tables:
      continue

    TP_columns = 0
    FP_columns = 0
    FN_columns = 0

    # Sorted so the result is reproducible: the counters below are cumulative
    # within a row, which makes the per-table values order-dependent, and set
    # iteration order of strings changes between interpreter sessions.
    # NOTE(review): the cumulative counters are kept from the original
    # implementation; confirm whether per-table counts were intended.
    for table in sorted(generated_tables | reference_tables, key=str):
      generated_columns = set(generated.get(table, []))
      reference_columns = set(reference.get(table, []))

      TP_columns += len(generated_columns & reference_columns)  # generated and needed
      FP_columns += len(generated_columns - reference_columns)  # generated but not needed
      FN_columns += len(reference_columns - generated_columns)  # needed but not generated

      precision_columns = _safe_ratio(TP_columns, TP_columns + FP_columns)
      recall_columns = _safe_ratio(TP_columns, TP_columns + FN_columns)
      columns_match = int(generated_columns == reference_columns)

      for level in buckets:
        columns[level]['test'] += 1
        columns[level]['accuracy'] += columns_match
        columns[level]['precision'] += precision_columns
        columns[level]['recall'] += recall_columns

  sep = '-'*60
  print(f"{sep}\n{experiment_id}\n{sep}")

  with mlflow.start_run(run_name=experiment_id):
    mlflow.log_param("model", experiment_id)
    df.to_csv("df_details.csv")
    mlflow.log_artifact("df_details.csv", "Details")

    print('{:<10} | {:<8} | {:<9} | {:<8} | {:<8} | {:<8}'.format('', 'accuracy', 'precision', 'recall', 'f1-score', 'test'))
    print(f"{sep}\ntables\n{sep}")
    _report_section("tables", tables)

    print(f"{sep}\ncolumns\n{sep}")
    _report_section("columns", columns)
    

In [14]:
# Location of the result file to evaluate: `folder` is prepended verbatim to
# `result_file` (no separator is inserted), so it must end with a slash.
folder = ''
result_file = ''
result_path = '{}{}'.format(folder, result_file)
result_path

'/content/drive/Shareddrives/LLMs/ResultadoTestes/TestesPOCsL/longformer-schema-linking-2000-0_8.txt'

In [15]:
# Pair every line of the result file with its dataset example, then tabulate
# the collected fields as a DataFrame that is passed to `metricas`.
test_result = info_result(result_path)
df_test_result = pd.DataFrame(test_result)

Testando ...: 1034it [00:01, 781.79it/s]


In [None]:
# Score the predictions once. Every call opens a new MLflow run, so invoking
# it twice would record a duplicate run and print the report twice.
metricas(result_file, df_test_result)