### HardTables


In [None]:
import pandas as pd
import os

class CEA_Evaluator:
    """Score a cell-entity annotation submission against a ground-truth CSV.

    Both the ground truth and the submission are read as header-less CSV
    files with the columns ``tab_id, col_id, row_id, entity``. A ground-truth
    ``entity`` field may list several whitespace-separated entity URIs; a
    submitted cell is correct when its annotation equals any one of them,
    compared case-insensitively.
    """

    # Column layout shared by the ground-truth and submission files.
    _COLUMNS = ['tab_id', 'col_id', 'row_id', 'entity']
    # Prefix prepended to bare entity ids before they are compared.
    _ENTITY_PREFIX = 'http://www.wikidata.org/entity/'

    def __init__(self, answer_file_path, round=1, max_gt_rows=1000):
        """Store the evaluation settings.

        Args:
            answer_file_path: Path of the ground-truth CSV file.
            round: Round for which the evaluation is being done (1, 2, ...
                up to the number of rounds the challenge has). Stored on the
                instance; the scoring code does not read it.
            max_gt_rows: Maximum number of ground-truth rows to load. The
                default of 1000 matches the previously hard-coded limit;
                pass ``None`` to load the whole file.
        """
        self.answer_file_path = answer_file_path
        self.round = round
        self.max_gt_rows = max_gt_rows

    @staticmethod
    def _cell_key(tab_id, col_id, row_id):
        """Return the space-joined key that identifies one table cell."""
        return '%s %s %s' % (tab_id, col_id, row_id)

    def _read_annotations(self, path, nrows=None):
        """Read a header-less annotation CSV with every column kept as text."""
        return pd.read_csv(
            path,
            delimiter=',',
            names=self._COLUMNS,
            dtype={column: str for column in self._COLUMNS},
            header=None,
            # Keep empty fields as '' instead of NaN so string methods work.
            keep_default_na=False,
            nrows=nrows,
        )

    def _evaluate(self, client_payload, _context=None):
        """Compute F1, precision and recall for one submission file.

        Args:
            client_payload: Dict with (at least) the key
                ``submission_file_path``, the local path of the submitted
                CSV file.
            _context: Unused; accepted for interface compatibility.

        Returns:
            Dict with the float entries ``"F1"``, ``"Precision"`` and
            ``"Recall"``.

        Raises:
            Exception: If the submission annotates the same ground-truth
                cell more than once.
        """
        submission_file_path = client_payload["submission_file_path"]

        gt = self._read_annotations(self.answer_file_path, nrows=self.max_gt_rows)
        gt_cell_ent = {
            self._cell_key(tab_id, col_id, row_id): entity
            for tab_id, col_id, row_id, entity in zip(
                gt['tab_id'], gt['col_id'], gt['row_id'], gt['entity'])
        }

        sub = self._read_annotations(submission_file_path)
        correct_cells, annotated_cells = set(), set()
        for tab_id, col_id, row_id, entity in zip(
                sub['tab_id'], sub['col_id'], sub['row_id'], sub['entity']):
            cell = self._cell_key(tab_id, col_id, row_id)
            # Cells outside the loaded ground truth are ignored entirely.
            if cell not in gt_cell_ent:
                continue
            if cell in annotated_cells:
                raise Exception("Duplicate cells in the submission file")
            annotated_cells.add(cell)

            annotation = entity
            # Bare ids (e.g. "Q42") are expanded to full URIs; 'nil' is left as is.
            if annotation.lower() != 'nil' and not annotation.startswith(self._ENTITY_PREFIX):
                annotation = self._ENTITY_PREFIX + annotation

            if annotation.lower() in gt_cell_ent[cell].lower().split():
                correct_cells.add(cell)
            else:
                # Report the wrong cell together with its expected entities.
                print('%s,%s' % (cell.replace(' ', ','), gt_cell_ent[cell]))

        precision = len(correct_cells) / len(annotated_cells) if annotated_cells else 0.0
        # An empty ground truth yields a recall of 0.0 instead of a ZeroDivisionError.
        recall = len(correct_cells) / len(gt_cell_ent) if gt_cell_ent else 0.0

        if (precision + recall) > 0:
            f1 = (2 * precision * recall) / (precision + recall)
        else:
            f1 = 0.0

        print('F1: %.10f, Precision: %.10f, Recall: %.10f' % (f1, precision, recall))

        return {
            "F1": f1,
            "Precision": precision,
            "Recall": recall,
        }


import os

if __name__ == "__main__":
    # Ground-truth CSV for the HardTables dataset; every file found in the
    # submissions directory below is scored against it.
    answer_file_path = "../input/HardTables/HT_gt_WD.csv"

    d = '../evaluation/prediction_submissions/HardTables'
    file_resultsHT = {}  # Dictionary to store filename and result pairs

    # Instantiate a dummy context
    _context = {}
    # The evaluator only stores its settings, so one instance serves all files.
    cea_evaluator = CEA_Evaluator(answer_file_path)

    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # printed output and the stored result order reproducible between runs.
    for ff in sorted(os.listdir(d)):
        # Skip the macOS folder-metadata file.
        if ff == '.DS_Store':
            continue
        print(ff)
        _client_payload = {"submission_file_path": os.path.join(d, ff)}

        # Evaluate
        result = cea_evaluator._evaluate(_client_payload, _context)
        file_resultsHT[ff] = result  # Store the result for each file in the dictionary

    # Print filename and result for each file after the loop
    for filename, result in file_resultsHT.items():
        print(f"File: {filename}, Result: {result}")


In [None]:
# Tabulate the scores: one row per submission file, one column per metric.
df1 = pd.DataFrame.from_dict(data=file_resultsHT, orient='index')

# Write the score table to an Excel workbook.
df1.to_excel(excel_writer='../evaluation/excel_results/HT_Results.xlsx')

### ToughTables

In [None]:
import pandas as pd
import os

class CEA_Evaluator:
    """Score a cell-entity annotation submission against a ground-truth CSV.

    Both the ground truth and the submission are read as header-less CSV
    files with the columns ``tab_id, col_id, row_id, entity``. A ground-truth
    ``entity`` field may list several whitespace-separated entity URIs; a
    submitted cell is correct when its annotation equals any one of them,
    compared case-insensitively.
    """

    # Column layout shared by the ground-truth and submission files.
    _COLUMNS = ['tab_id', 'col_id', 'row_id', 'entity']
    # Prefix prepended to bare entity ids before they are compared.
    _ENTITY_PREFIX = 'http://www.wikidata.org/entity/'

    def __init__(self, answer_file_path, round=1, max_gt_rows=1000):
        """Store the evaluation settings.

        Args:
            answer_file_path: Path of the ground-truth CSV file.
            round: Round for which the evaluation is being done (1, 2, ...
                up to the number of rounds the challenge has). Stored on the
                instance; the scoring code does not read it.
            max_gt_rows: Maximum number of ground-truth rows to load. The
                default of 1000 matches the previously hard-coded limit;
                pass ``None`` to load the whole file.
        """
        self.answer_file_path = answer_file_path
        self.round = round
        self.max_gt_rows = max_gt_rows

    @staticmethod
    def _cell_key(tab_id, col_id, row_id):
        """Return the space-joined key that identifies one table cell."""
        return '%s %s %s' % (tab_id, col_id, row_id)

    def _read_annotations(self, path, nrows=None):
        """Read a header-less annotation CSV with every column kept as text."""
        return pd.read_csv(
            path,
            delimiter=',',
            names=self._COLUMNS,
            dtype={column: str for column in self._COLUMNS},
            header=None,
            # Keep empty fields as '' instead of NaN so string methods work.
            keep_default_na=False,
            nrows=nrows,
        )

    def _evaluate(self, client_payload, _context=None):
        """Compute F1, precision and recall for one submission file.

        Args:
            client_payload: Dict with (at least) the key
                ``submission_file_path``, the local path of the submitted
                CSV file.
            _context: Unused; accepted for interface compatibility.

        Returns:
            Dict with the float entries ``"F1"``, ``"Precision"`` and
            ``"Recall"``.

        Raises:
            Exception: If the submission annotates the same ground-truth
                cell more than once.
        """
        submission_file_path = client_payload["submission_file_path"]

        gt = self._read_annotations(self.answer_file_path, nrows=self.max_gt_rows)
        gt_cell_ent = {
            self._cell_key(tab_id, col_id, row_id): entity
            for tab_id, col_id, row_id, entity in zip(
                gt['tab_id'], gt['col_id'], gt['row_id'], gt['entity'])
        }

        sub = self._read_annotations(submission_file_path)
        correct_cells, annotated_cells = set(), set()
        for tab_id, col_id, row_id, entity in zip(
                sub['tab_id'], sub['col_id'], sub['row_id'], sub['entity']):
            cell = self._cell_key(tab_id, col_id, row_id)
            # Cells outside the loaded ground truth are ignored entirely.
            if cell not in gt_cell_ent:
                continue
            if cell in annotated_cells:
                raise Exception("Duplicate cells in the submission file")
            annotated_cells.add(cell)

            annotation = entity
            # Bare ids (e.g. "Q42") are expanded to full URIs; 'nil' is left as is.
            if annotation.lower() != 'nil' and not annotation.startswith(self._ENTITY_PREFIX):
                annotation = self._ENTITY_PREFIX + annotation

            if annotation.lower() in gt_cell_ent[cell].lower().split():
                correct_cells.add(cell)
            else:
                # Report the wrong cell together with its expected entities.
                print('%s,%s' % (cell.replace(' ', ','), gt_cell_ent[cell]))

        precision = len(correct_cells) / len(annotated_cells) if annotated_cells else 0.0
        # An empty ground truth yields a recall of 0.0 instead of a ZeroDivisionError.
        recall = len(correct_cells) / len(gt_cell_ent) if gt_cell_ent else 0.0

        if (precision + recall) > 0:
            f1 = (2 * precision * recall) / (precision + recall)
        else:
            f1 = 0.0

        print('F1: %.10f, Precision: %.10f, Recall: %.10f' % (f1, precision, recall))
        # Number of loaded ground-truth cells and number of correct cells.
        print(len(gt_cell_ent), " ", len(correct_cells))

        return {
            "F1": f1,
            "Precision": precision,
            "Recall": recall,
        }


import os

if __name__ == "__main__":
    # Ground-truth CSV for the ToughTables dataset; every file found in the
    # submissions directory below is scored against it.
    answer_file_path = "../input/ToughTables/2T_gt_WD.csv"

    d = '../evaluation/prediction_submissions/ToughTables'
    file_results2T = {}  # Dictionary to store filename and result pairs

    # Instantiate a dummy context
    _context = {}
    # The evaluator only stores its settings, so one instance serves all files.
    cea_evaluator = CEA_Evaluator(answer_file_path)

    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # printed output and the stored result order reproducible between runs.
    for ff in sorted(os.listdir(d)):
        # Skip the macOS folder-metadata file.
        if ff == '.DS_Store':
            continue
        print(ff)
        _client_payload = {"submission_file_path": os.path.join(d, ff)}

        # Evaluate
        result = cea_evaluator._evaluate(_client_payload, _context)
        file_results2T[ff] = result  # Store the result for each file in the dictionary

    # Print filename and result for each file after the loop
    for filename, result in file_results2T.items():
        print(f"File: {filename}, Result: {result}")

In [None]:
# Tabulate the scores: one row per submission file, one column per metric.
df2 = pd.DataFrame.from_dict(data=file_results2T, orient='index')

# Write the score table to an Excel workbook.
df2.to_excel(excel_writer='../evaluation/excel_results/2T_Results.xlsx')