In [81]:
import glob
import os
import pathlib
import shutil
import time
import zipfile

import pandas as pd
from dotenv import load_dotenv

In [82]:
import json
import config
import logging
from pathlib import Path
from farm.infer import Inferencer
from config_farm_train import InferConfig
from src.data.s3_communication import S3Communication, S3FileType

In [83]:
from src.models.relevance_infer import TextRelevanceInfer
from src.components.utils.kpi_mapping import get_kpi_mapping_category

In [84]:
from collections import defaultdict
from farm.infer import QAInferencer
from config_qa_farm_train import QAFileConfig, QAInferConfig

## 1. Setup for benchmark runs

### 1.1. Load S3 credentials

In [85]:
# Locate credentials.env: an explicit CREDENTIAL_DOTENV_DIR wins, then the
# shell's PWD, and finally the default application root.
dotenv_dir = os.environ.get("PWD", "/opt/app-root/src")
dotenv_dir = os.environ.get("CREDENTIAL_DOTENV_DIR", dotenv_dir)
dotenv_path = pathlib.Path(dotenv_dir).joinpath("credentials.env")

# Load the file only when it is present; override=True lets values from the
# file replace variables that are already set in the process environment.
if os.path.exists(dotenv_path):
    load_dotenv(dotenv_path=dotenv_path, override=True)

### 1.2. Creating S3 connection

In [86]:
# Build the S3 client used for every download in this notebook. Endpoint,
# access keys and bucket name are read from S3_LANDING_* environment
# variables; os.getenv returns None for any variable that is not set.
s3c = S3Communication(
    s3_endpoint_url=os.getenv("S3_LANDING_ENDPOINT"),
    aws_access_key_id=os.getenv("S3_LANDING_ACCESS_KEY"),
    aws_secret_access_key=os.getenv("S3_LANDING_SECRET_KEY"),
    s3_bucket=os.getenv("S3_LANDING_BUCKET"),
)

### 1.3. View Objects in the bucket

### 1.4. Base Configurations

In [87]:
# Relevance-inference settings, constructed with the argument "infer_demo".
# The helper functions defined later read this module-level object directly.
infer_config = InferConfig("infer_demo")
# Logger used by the benchmark helper functions in this notebook.
_logger = logging.getLogger(__name__)

### 1.5. Downloading the model

In [88]:
# Fetch the zipped relevance checkpoint from S3 and unpack it into the parent
# of the directory that infer_config loads the Text model from.
model_root = pathlib.Path(infer_config.load_dir['Text']).parent
model_rel_zip = model_root / "relevance_roberta.zip"

s3c.download_file_from_s3(
    model_rel_zip, config.CHECKPOINT_S3_PREFIX, "relevance_roberta.zip"
)

# Extract next to the archive so the unpacked model lands under model_root.
with zipfile.ZipFile(model_rel_zip, "r") as z:
    z.extractall(model_root)

### 1.6. Loading KPIs

In [89]:
# Load kpi_mapping.csv from the "aicoe-osc-demo/kpi_mapping" prefix into a
# DataFrame (header=0 presumably marks the first row as column names -- confirm
# against S3Communication.download_df_from_s3), then display it.
kpi_df = s3c.download_df_from_s3(
    "aicoe-osc-demo/kpi_mapping",
    "kpi_mapping.csv",
    filetype=S3FileType.CSV,
    header=0,
)
kpi_df

Unnamed: 0,kpi_id,question,sectors,add_year,kpi_category,Unnamed: 5,Unnamed: 6
0,0.0,What is the company name?,"OG, CM, CU",False,TEXT,,
1,1.0,In which year was the annual report or the sus...,"OG, CM, CU",False,TEXT,,
2,2.0,What is the total volume of proven and probabl...,OG,True,"TEXT, TABLE",,
3,2.1,What is the volume of estimated proven hydroca...,OG,True,"TEXT, TABLE",,
4,2.2,What is the volume of estimated probable hydro...,OG,True,"TEXT, TABLE",,
5,3.0,What is the total volume of hydrocarbons produ...,OG,True,"TEXT, TABLE",,
6,3.1,What is the total volume of crude oil liquid p...,OG,True,"TEXT, TABLE",,
7,3.2,What is the total volume of natural gas liquid...,OG,True,"TEXT, TABLE",,
8,3.3,What is the total volume of natural gas produc...,OG,True,"TEXT, TABLE",,
9,4.0,What is the annual total production from coal?,CU,True,"TEXT, TABLE",,


## 2. Running Benchmarks on Relevance Model

### 2.1. Setting up scratch directories and files for benchmarking

In [90]:
# Scratch layout for the benchmark:
#   <DATA_FOLDER>/benchmark/extraction       input: extracted PDF text
#   <DATA_FOLDER>/benchmark/infer_relevance  output: relevance predictions
BENCHMARK_FOLDER = config.DATA_FOLDER / "benchmark"

# mkdir(parents=True, exist_ok=True) is already idempotent, so no separate
# existence check is needed; it also fails loudly if a non-directory is in
# the way instead of silently skipping creation.
BENCHMARK_FOLDER.mkdir(parents=True, exist_ok=True)
(BENCHMARK_FOLDER / "extraction").mkdir(parents=True, exist_ok=True)
(BENCHMARK_FOLDER / "infer_relevance").mkdir(parents=True, exist_ok=True)

# Download the objects under config.BASE_EXTRACTION_S3_PREFIX into the
# extraction scratch folder; these are the inputs for relevance inference.
s3c.download_files_in_prefix_to_dir(
    config.BASE_EXTRACTION_S3_PREFIX,
    BENCHMARK_FOLDER / "extraction",
)

### 2.2. Relevance Infer Configurations

In [91]:
# Point the relevance config at the benchmark scratch folders: inputs are read
# from "extraction", predictions are written to "infer_relevance".
infer_config.extracted_dir = BENCHMARK_FOLDER / "extraction"
infer_config.result_dir = BENCHMARK_FOLDER / "infer_relevance"
# Echo the effective model, input and output locations for the run log.
print(infer_config.load_dir)
print(infer_config.extracted_dir)
print(infer_config.result_dir)

{'Text': '/opt/app-root/src/aicoe-osc-demo-2022-07-13-21-52/models/RELEVANCE'}
/opt/app-root/src/aicoe-osc-demo-2022-07-13-21-52/data/benchmark/extraction
/opt/app-root/src/aicoe-osc-demo-2022-07-13-21-52/data/benchmark/infer_relevance


### 2.3. Defining Methods from the TextRelevanceInfer class for benchmarking

In [92]:
def initialize_relevance_infer():
    """Set up the module-level ``questions`` list and relevance ``model``.

    Reads the global ``infer_config`` and ``kpi_df``. The KPI questions come
    from ``infer_config.kpi_questions`` when that is non-empty; otherwise they
    are derived from the KPI mapping, keeping questions whose sectors overlap
    ``infer_config.sectors`` and whose category includes "TEXT". The result
    directory is created if missing and the FARM ``Inferencer`` is loaded from
    ``infer_config.load_dir["Text"]``.
    """
    global questions
    global model
    data_type = "Text"

    configured_questions = infer_config.kpi_questions
    if len(configured_questions) > 0:
        # Explicit questions in the config take precedence.
        questions = configured_questions
    else:
        # Fall back to every KPI that matches the configured sectors and can
        # be answered from text.
        kmc = get_kpi_mapping_category(kpi_df)
        questions = []
        for q_id, (q_text, sect) in kmc["KPI_MAPPING_MODEL"].items():
            shares_sector = len(set(sect) & set(infer_config.sectors)) > 0
            if shares_sector and data_type.upper() in kmc["KPI_CATEGORY"][q_id]:
                questions.append(q_text)

    if not os.path.exists(infer_config.result_dir):
        os.makedirs(infer_config.result_dir)

    logging.getLogger("farm").setLevel(infer_config.farm_infer_logging_level)

    load_options = {
        "batch_size": infer_config.batch_size,
        "gpu": infer_config.gpu,
        "num_processes": infer_config.num_processes,
        "disable_tqdm": infer_config.disable_tqdm,
    }
    model = Inferencer.load(infer_config.load_dir[data_type], **load_options)

In [93]:
def read_text_from_json(file):
    """Load and return the parsed content of a JSON file.

    Args:
        file: Path of the JSON file to read.
    Returns:
        The deserialized JSON document.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(file, encoding="utf-8") as f:
        return json.load(f)

In [94]:
def gather_data(pdf_name, pdf_path):
    """Pair every paragraph of an extracted pdf with every KPI question.

    Reads the module-level ``questions`` list (set by
    ``initialize_relevance_infer``).

    Args:
        pdf_name (str): Name of the pdf
        pdf_path (str): Path to the json file holding the text extracted from the pdf
    Returns:
        text_data (list of dicts): One dict per (question, paragraph) pair with
            "page", "pdf_name", "text" (the question) and "text_b" (the
            paragraph) keys, grouped by question. Empty when there are no
            questions or no paragraphs.
    """
    # The json maps a page key to the list of paragraphs found on that page.
    pdf_content = read_text_from_json(pdf_path)

    # Flatten once so the paragraphs are not re-walked for every question.
    paragraphs = [
        (page_num, paragraph)
        for page_num, page_content in pdf_content.items()
        for paragraph in page_content
    ]

    # Keep track of the page each paragraph came from and the pdf it belongs to.
    text_data = [
        {
            "page": page_num,
            "pdf_name": pdf_name,
            "text": kpi_question,
            "text_b": paragraph,
        }
        for kpi_question in questions
        for page_num, paragraph in paragraphs
    ]

    # Report the paragraph count directly; dividing len(text_data) by
    # len(questions) raised ZeroDivisionError when no question was selected.
    _logger.info(
        "###### Received {} examples for Text, number of questions: {}".format(
            len(paragraphs), len(questions)
        )
    )

    return text_data

In [95]:
def gather_extracted_files():
    """Collect the extracted-text json file of every pdf.

    Searches ``infer_config.extracted_dir`` recursively for ``*.json`` files
    and ignores any whose path contains "table_meta".

    Returns:
        A dictionary mapping each pdf name (the json file name without its
        extension) to the path of the json file with its extracted text.
    """
    extracted_dir = infer_config.extracted_dir
    _logger.info("Searching for extracted files on {}".format(extracted_dir))

    json_by_pdf = {}
    for json_path in sorted(Path(extracted_dir).rglob("*.json")):
        if "table_meta" in str(json_path):
            continue
        json_by_pdf[json_path.stem] = json_path
    return json_by_pdf

In [96]:
def run_folder():
    """Run relevance inference on every extracted pdf and time it.

    Reads the module-level ``infer_config`` and ``model`` (set by
    ``initialize_relevance_infer``). For each file returned by
    ``gather_extracted_files`` the (question, paragraph) pairs are sent to the
    model in chunks, the pairs predicted with label "1" are written to
    ``<pdf_name>_predictions_relevant.csv`` in ``infer_config.result_dir``,
    and timing metrics are recorded. A failure on one pdf is logged and that
    pdf is skipped. The model's multiprocessing pool is closed before
    returning, so the model must be re-initialized before another run.

    Returns:
        tuple: ``(concatenated_dfs, metrics_df)`` where ``concatenated_dfs``
        holds the relevant examples of all processed pdfs and ``metrics_df``
        has one row of timing metrics per processed pdf. Both are empty
        DataFrames when nothing was processed.
    """
    data_type = "Text"
    chunk_size = 1000  # number of (question, paragraph) pairs per model call
    metrics_columns = [
        'PDF Name',
        'Number of Pages',
        'Number of Data Points',
        'Total Inference Time',
        'Time per data point',
        'Data points per sec',
    ]

    all_text_path_dict = gather_extracted_files()
    df_list = []
    metrics_df_list = []
    num_pdfs = len(all_text_path_dict)
    # The files are discovered in extracted_dir (see gather_extracted_files),
    # so that is the directory to report here, not result_dir.
    _logger.info(
        "{} Starting Relevence Inference for the following extracted pdf files found in {}:\n{} ".format(
            "#" * 20, infer_config.extracted_dir, list(all_text_path_dict.keys())
        )
    )
    for i, (pdf_name, file_path) in enumerate(all_text_path_dict.items()):
        _logger.info("{} {}/{} PDFs".format("#" * 20, i + 1, num_pdfs))
        predictions_file_name = "{}_{}".format(pdf_name, "predictions_relevant.csv")
        if (
            infer_config.skip_processed_files
            and predictions_file_name in os.listdir(infer_config.result_dir)
        ):
            _logger.info(
                "The relevance infer results for {} already exists. Skipping.".format(
                    pdf_name
                )
            )
            _logger.info(
                "If you would like to re-process the already processed files, set "
                "`skip_processed_files` to False in the config file. "
            )
            continue
        _logger.info("Running inference for {}:".format(pdf_name))

        try:
            start = time.time()
            data = gather_data(pdf_name, file_path)
            num_data_points = len(data)
            if num_data_points == 0:
                # Nothing to score (no paragraphs or no questions); the
                # metrics below would otherwise index/divide on empty data.
                _logger.warning(
                    "No data points were gathered for {}. Skipping this pdf".format(
                        pdf_name
                    )
                )
                continue
            # The page key of the last gathered example is used as the page count.
            num_pages = data[-1]['page']
            _logger.info(
                "Gathered the extracted data ({} points) from the file {} in {} sec.".format(
                    num_data_points, pdf_name, str(time.time() - start)
                )
            )

            predictions = []
            total_file_time = 0
            # Only the chunked inference is timed, not data gathering or
            # post-processing.
            for chunk_begin in range(0, num_data_points, chunk_size):
                chunk_start = time.time()
                data_chunk = data[chunk_begin : chunk_begin + chunk_size]
                predictions.extend(model.inference_from_dicts(dicts=data_chunk))
                total_file_time += time.time() - chunk_start

            time_per_data_point = total_file_time / num_data_points
            # Guard against a zero elapsed time on very coarse clocks.
            data_points_per_sec = (
                1 / time_per_data_point if time_per_data_point > 0 else float("inf")
            )
            _logger.info(
                "Ran inference on file {} with {} pages and {} data points in {} sec ({} sec per data point, {} data points per second)".format(
                    pdf_name, num_pages, num_data_points, total_file_time, time_per_data_point, data_points_per_sec
                )
            )

            metrics_list = [[pdf_name, int(num_pages), num_data_points, total_file_time, time_per_data_point, data_points_per_sec]]
            metrics_df_list.append(pd.DataFrame(metrics_list, columns=metrics_columns))

            # Predictions come back batched; flatten them so they line up
            # index-for-index with `data`.
            flat_predictions = [
                example for batch in predictions for example in batch["predictions"]
            ]
            positive_examples = [
                data[index]
                for index, pred_example in enumerate(flat_predictions)
                if pred_example["label"] == "1"
            ]

            df = pd.DataFrame(positive_examples)
            df["source"] = data_type

            df_list.append(df)
            predictions_file_path = os.path.join(
                infer_config.result_dir, predictions_file_name
            )
            df.to_csv(predictions_file_path)
            _logger.info(
                "Saved {} relevant {} examples for {} in {}".format(
                    len(df), data_type, pdf_name, predictions_file_path
                )
            )
        except Exception as exc:
            # Best effort: log and move on to the next pdf. The exception
            # type is taken from `exc` itself; the previous `sys.exc_info()`
            # call raised NameError because `sys` is not imported here.
            _logger.warning(exc)
            _logger.warning(
                "There was an error making inference (RELEVANCE) on {}".format(
                    pdf_name
                )
            )
            _logger.warning("The error is\n{}\nSkipping this pdf".format(type(exc)))

    concatenated_dfs = pd.concat(df_list) if len(df_list) > 0 else pd.DataFrame()
    metrics_df = pd.DataFrame()
    if len(metrics_df_list) > 0:
        metrics_df = pd.concat(metrics_df_list)
        _logger.info(
            "Metrics for inferring paragraphs relevant to KPI are: \nTotal Number of Data Points Processed = {} \nTotal Inference Time = {} \nAverage Number of Pages Per PDF = {} \nAverage Inference Time Per PDF = {} \nMinimum Inference Time of PDF = {} \nMaximum Inference Time of PDF = {} \nStd of Inference Times of PDFs = {} \nAverage Time Per Data Point Processed= {} \nAverage Data Points Processed Per Second = {} \n"
            .format(
                metrics_df['Number of Data Points'].sum(),
                metrics_df['Total Inference Time'].sum(),
                int(metrics_df['Number of Pages'].mean()),
                metrics_df['Total Inference Time'].mean(),
                metrics_df['Total Inference Time'].min(),
                metrics_df['Total Inference Time'].max(),
                metrics_df['Total Inference Time'].std(),
                metrics_df['Time per data point'].mean(),
                metrics_df['Data points per sec'].mean(),
            )
        )

    model.close_multiprocessing_pool()
    return concatenated_dfs, metrics_df

In [97]:
def cleanup_for_relevance():
    """Empty the relevance results folder so a benchmark run starts clean.

    Creates ``BENCHMARK_FOLDER / "infer_relevance"`` if it is missing, then
    removes every file, link and sub-directory inside it. A failure to delete
    an entry is printed and does not stop the cleanup.
    """
    results_dir = BENCHMARK_FOLDER / "infer_relevance"
    if not os.path.exists(results_dir):
        pathlib.Path(results_dir).mkdir(parents=True, exist_ok=True)

    for entry_name in os.listdir(results_dir):
        entry_path = os.path.join(results_dir, entry_name)
        try:
            # Links are unlinked rather than followed, even if they point to
            # a directory.
            is_file_or_link = os.path.isfile(entry_path) or os.path.islink(entry_path)
            if is_file_or_link:
                os.unlink(entry_path)
            elif os.path.isdir(entry_path):
                shutil.rmtree(entry_path)
        except Exception as err:
            print('Failed to delete %s. Reason: %s' % (entry_path, err))

### 2.4. Running benchmarks

In [98]:
num_runs = 2
metrics_dfs = []
for i in range(num_runs):
    # Start each run from an empty results folder so no pdf is skipped as
    # "already processed".
    cleanup_for_relevance()
    # run_folder() closes the model's multiprocessing pool when it finishes,
    # so the model is (re)loaded at the start of every run. Loading it once
    # more before the loop only created a model whose pool was never closed.
    initialize_relevance_infer()
    result_df, metrics_df = run_folder()
    metrics_dfs.append(metrics_df)

08/08/2022 09:19:42 - INFO - __main__ -   Searching for extracted files on /opt/app-root/src/aicoe-osc-demo-2022-07-13-21-52/data/benchmark/extraction
08/08/2022 09:19:42 - INFO - __main__ -   #################### Starting Relevence Inference for the following extracted pdf files found in /opt/app-root/src/aicoe-osc-demo-2022-07-13-21-52/data/benchmark/infer_relevance:
['04_NOVATEK_AR_2016_ENG_11'] 
08/08/2022 09:19:42 - INFO - __main__ -   #################### 1/1 PDFs
08/08/2022 09:19:42 - INFO - __main__ -   Running inference for 04_NOVATEK_AR_2016_ENG_11:
08/08/2022 09:19:42 - INFO - __main__ -   ###### Received 1011 examples for Text, number of questions: 24
08/08/2022 09:19:42 - INFO - __main__ -   Gathered the extracted data (24264 points) from the file 04_NOVATEK_AR_2016_ENG_11 in 0.016501188278198242 sec.
08/08/2022 09:22:22 - INFO - __main__ -   Ran inference on file 04_NOVATEK_AR_2016_ENG_11 with 119 pages and 24264 data points in 160.24668073654175 sec (0.006604297755380059

In [99]:
# Show the per-pdf timing metrics recorded for each relevance benchmark run.
for i in range(num_runs):
    print(metrics_dfs[i].head())

                    PDF Name  Number of Pages  Number of Data Points  \
0  04_NOVATEK_AR_2016_ENG_11              119                  24264   

   Total Inference Time  Time per data point  Data points per sec  
0            160.246681             0.006604           151.416553  
                    PDF Name  Number of Pages  Number of Data Points  \
0  04_NOVATEK_AR_2016_ENG_11              119                  24264   

   Total Inference Time  Time per data point  Data points per sec  
0            160.224211             0.006603           151.437788  


## 3. Running Benchmarks on KPI Model

### 3.1. Setting up scratch directories and model for benchmarking

In [100]:
BENCHMARK_FOLDER = config.DATA_FOLDER / "benchmark"

# Output folder for the KPI predictions. mkdir(parents=True, exist_ok=True)
# is already idempotent, so no separate existence check is needed.
(BENCHMARK_FOLDER / "infer_kpi").mkdir(parents=True, exist_ok=True)

### 3.2. KPI Infer Configurations and Model Loading

In [101]:
# KPI-extraction (QA) settings, constructed with the argument "infer_demo".
file_config = QAFileConfig("infer_demo")
# NOTE: this rebinds the module-level `infer_config` that previously held the
# relevance InferConfig; functions that read `infer_config` will see the QA
# config from this point on.
infer_config = QAInferConfig("infer_demo")
# Read the relevance stage's CSVs as input and write KPI predictions to
# their own scratch folder.
infer_config.relevance_dir = BENCHMARK_FOLDER / "infer_relevance"
infer_config.result_dir = BENCHMARK_FOLDER / "infer_kpi"
# Echo the effective model, input and output locations for the run log.
print(infer_config.load_dir)
print(infer_config.relevance_dir)
print(infer_config.result_dir)

{'Text': '/opt/app-root/src/aicoe-osc-demo-2022-07-13-21-52/models/KPI_EXTRACTION'}
/opt/app-root/src/aicoe-osc-demo-2022-07-13-21-52/data/benchmark/infer_relevance
/opt/app-root/src/aicoe-osc-demo-2022-07-13-21-52/data/benchmark/infer_kpi


In [102]:
# Fetch the zipped KPI extraction checkpoint from S3 and unpack it into the
# parent of file_config.saved_models_dir.
model_root = pathlib.Path(file_config.saved_models_dir).parent
model_rel_zip = model_root / 'KPI_EXTRACTION.zip'
s3c.download_file_from_s3(
    model_rel_zip, config.CHECKPOINT_S3_PREFIX, "KPI_EXTRACTION.zip"
)
# Extract next to the archive so the unpacked model lands under model_root.
with zipfile.ZipFile(model_rel_zip, 'r') as z:
    z.extractall(model_root)

### 3.3. Defining Methods from the TextKPIInfer class for benchmarking

In [103]:
def initialize_kpi_infer(n_best_per_sample=1):
    """Load the QA model into the module-level ``model`` and configure it.

    Reads the global ``infer_config``. The FARM ``QAInferencer`` is loaded
    from ``infer_config.load_dir["Text"]``, its first prediction head is
    configured, and the result directory is created if missing.

    Args:
        n_best_per_sample (int): Number of span-based candidate answers to
            consider from each passage.
    """
    global model

    logging.getLogger("farm").setLevel(infer_config.farm_infer_logging_level)

    load_options = {
        "batch_size": infer_config.batch_size,
        "gpu": infer_config.gpu,
        "num_processes": infer_config.num_processes,
    }
    model = QAInferencer.load(infer_config.load_dir["Text"], **load_options)

    qa_head = model.model.prediction_heads[0]
    # num span-based candidate answer spans to consider from each passage
    qa_head.n_best_per_sample = n_best_per_sample
    # If positive, this will boost "No Answer" as prediction.
    # If negative, this will decrease the model from giving "No Answer" as prediction.
    qa_head.no_ans_boost = infer_config.no_ans_boost

    if not os.path.exists(infer_config.result_dir):
        os.makedirs(infer_config.result_dir)

In [104]:
def aggregate_result(x):
    """Return the top no-answer score of a group, or None if it is answerable.

    Helper used in `infer_on_relevance_results` on the rows belonging to a
    single pdf and question. Only the rows ranked "rank_1" are considered: if
    every one of them has the answer "no_answer", the question is treated as
    not answerable for that pdf and the highest "score" among those rows is
    returned. Otherwise nothing (None) is returned.
    """
    top_ranked = x.loc[x["rank"] == "rank_1"]
    every_top_is_no_answer = (top_ranked["answer"] == "no_answer").all()
    if not every_top_is_no_answer:
        return None
    return top_ranked["score"].max()

In [105]:
def infer_on_relevance_results():
    """Make inference using the qa model on the relevant paragraphs and time it.

    Takes no arguments. Reads the module-level ``infer_config`` (input
    ``relevance_dir``, output ``result_dir``, ``top_k``, ``no_ans_boost``,
    ``skip_processed_files``), the QA ``model`` set by ``initialize_kpi_infer``
    and the ``kpi_df`` dataframe with the kpi questions.

    Every ``*.csv`` in ``infer_config.relevance_dir`` (output of the relevance
    stage) is processed in sorted order. The model's multiprocessing pool is
    closed before returning, so the model must be re-initialized before
    another run.

    Returns:
        tuple: ``(span_df, metrics_df)``.
            span_df (Pandas.DataFrame): best n answers for each KPI question
                for each pdf, where n is ``infer_config.top_k``. It contains,
                among others, these columns:
                    `answer`: answer span (or "no_answer")
                    `score`: The score of the span from the qa model
                    `kpi_id`: id of the kpi question
            metrics_df (Pandas.DataFrame): one row of timing metrics per
                processed relevance CSV.
            Both are empty DataFrames when nothing was processed.

    Note: The result data frame of each pdf is saved in the `result_dir` directory
    as ``<pdf_name>_predictions_kpi.csv``.
    """
    chunk_size = 1000  # number of (question, context) pairs per model call
    head_num = 0
    column_names = ["text_b", "text", "page", "pdf_name", "source"]
    metrics_columns = [
        'PDF Name',
        'Number of Data Points',
        'Total Inference Time',
        'Time per data point',
        'Data points per sec',
    ]

    # Sorted so the files are processed in a reproducible order; glob.glob
    # returns matches in arbitrary order.
    all_relevance_results_paths = sorted(
        glob.glob(os.path.join(infer_config.relevance_dir, "*.csv"))
    )
    # Maps question text -> kpi id. It does not depend on the file being
    # processed, so it is built once instead of once per CSV.
    reversed_kpi_mapping = {
        value[0]: key
        for key, value in get_kpi_mapping_category(kpi_df)["KPI_MAPPING"].items()
    }

    all_span_dfs = []
    metrics_df_list = []
    num_csvs = len(all_relevance_results_paths)
    _logger.info(
        "{} Starting KPI Inference for the following relevance CSV files found in {}:\n{} ".format(
            "#" * 20,
            infer_config.relevance_dir,
            [
                os.path.basename(relevance_results_path)
                for relevance_results_path in all_relevance_results_paths
            ],
        )
    )
    for i, relevance_results_path in enumerate(all_relevance_results_paths):
        _logger.info("{} {}/{}".format("#" * 20, i + 1, num_csvs))
        pdf_name = os.path.basename(relevance_results_path).split(
            "_predictions_relevant"
        )[0]
        predictions_file_name = "{}_{}".format(pdf_name, "predictions_kpi.csv")
        if (
            infer_config.skip_processed_files
            and predictions_file_name in os.listdir(infer_config.result_dir)
        ):
            _logger.info(
                "The KPI infer results for {} already exists. Skipping.".format(
                    pdf_name
                )
            )
            _logger.info(
                "If you would like to re-process the already processed files, set "
                "`skip_processed_files` to False in the config file. "
            )
            continue
        _logger.info("Starting KPI Extraction for {}".format(pdf_name))

        input_df = pd.read_csv(relevance_results_path)

        if len(input_df) == 0:
            # No relevant paragraph was found for this pdf: write an empty
            # result file so the pdf still counts as processed.
            _logger.info(
                "The received relevance file is empty for {}".format(pdf_name)
            )
            pd.DataFrame([]).to_csv(
                os.path.join(infer_config.result_dir, predictions_file_name)
            )
            continue

        assert set(column_names).issubset(set(input_df.columns)), (
            "The result of relevance detector has {} columns,\n"
            "            while expected {}"
        ).format(input_df.columns, column_names)

        qa_dict = [
            {"qas": [question], "context": context}
            for question, context in zip(input_df["text"], input_df["text_b"])
        ]
        num_data_points = len(qa_dict)

        result = []
        total_file_time = 0
        # Only the chunked inference is timed, not reading or post-processing.
        for chunk_begin in range(0, num_data_points, chunk_size):
            chunk_start = time.time()
            data_chunk = qa_dict[chunk_begin : chunk_begin + chunk_size]
            result.extend(model.inference_from_dicts(dicts=data_chunk))
            total_file_time += time.time() - chunk_start

        time_per_data_point = total_file_time / num_data_points
        # Guard against a zero elapsed time on very coarse clocks.
        data_points_per_sec = (
            1 / time_per_data_point if time_per_data_point > 0 else float("inf")
        )

        _logger.info("Ran inference on the file {} with {} relevant data points in {} sec. ({} sec per data point, {} data points per sec)".format(
            pdf_name, num_data_points, total_file_time, time_per_data_point, data_points_per_sec
        ))

        metrics_list = [[pdf_name, num_data_points, total_file_time, time_per_data_point, data_points_per_sec]]
        metrics_df_list.append(pd.DataFrame(metrics_list, columns=metrics_columns))

        # This param is not exactly representative, n_best mostly defines num answers.
        num_answers = model.model.prediction_heads[0].n_best_per_sample + 1
        answers_dict = defaultdict(list)

        for exp in result:
            head_prediction = exp["predictions"][head_num]
            preds = head_prediction["answers"]
            # Get the no_answer_score
            no_answer_scores = [
                p["score"] for p in preds if p["answer"] == "no_answer"
            ]
            if len(no_answer_scores) == 0:
                # Happens if no answer is not among the n_best predictions.
                no_answer_score = preds[0]["score"] - head_prediction["no_ans_gap"]
            else:
                no_answer_score = no_answer_scores[0]

            # Based on Farm implementation, no_answer_score already is equal = "CLS score" + no_ans_boost
            # https://github.com/deepset-ai/FARM/blob/978da5d7600c48be458688996538770e9334e71b/farm/modeling/prediction_head.py#L1348
            pure_no_ans_score = no_answer_score - infer_config.no_ans_boost

            # A dedicated loop variable keeps the outer file index `i` intact.
            for rank_idx in range(num_answers):
                answers_dict[f"rank_{rank_idx+1}"].append(
                    (
                        preds[rank_idx]["answer"],
                        preds[rank_idx]["score"],
                        pure_no_ans_score,
                        no_answer_score,
                    )
                )
        for rank_idx in range(num_answers):
            input_df[f"rank_{rank_idx+1}"] = answers_dict[f"rank_{rank_idx+1}"]

        # Let's put different kpi predictions and their scores into one column so we can sort them.
        var_cols = [col for col in input_df.columns if col.startswith("rank_")]
        id_vars = [col for col in input_df.columns if not col.startswith("rank_")]
        input_df = pd.melt(
            input_df,
            id_vars=id_vars,
            value_vars=var_cols,
            var_name="rank",
            value_name="answer_score",
        )

        # Separate a column with tuple value into its four components
        input_df[
            ["answer", "score", "no_ans_score", "no_answer_score_plus_boost"]
        ] = pd.DataFrame(input_df["answer_score"].tolist(), index=input_df.index)
        input_df = input_df.drop(columns=["answer_score"])

        # Questions whose best-ranked answer is "no_answer" in every relevant
        # paragraph are considered unanswerable for this pdf.
        no_answerables = (
            input_df.groupby(["pdf_name", "text"])
            .apply(aggregate_result)
            .dropna(how="all")
        )
        no_answerables = pd.DataFrame(
            no_answerables, columns=["score"]
        ).reset_index()
        no_answerables["answer"] = "no_answer"
        no_answerables["source"] = "Text"

        # Filter to span-based answers
        span_df = input_df[input_df["answer"] != "no_answer"]
        # Concatenate the result of span answers with non answerable examples.
        span_df = pd.concat([span_df, no_answerables], ignore_index=True)

        # Get the predictions with n highest score for each pdf and question.
        # If the question is considered unanswerable, the best prediction is "no_answer", but the best span-based answer
        # is also returned. if the question is answerable, the best span-based answers are returned.
        span_df = (
            span_df.groupby(["pdf_name", "text"])
            .apply(lambda grp: grp.nlargest(infer_config.top_k, "score"))
            .reset_index(drop=True)
        )

        # Final cleaning on the dataframe, removing unnecessary columns and renaming `text` and `text_b` columns.
        unnecessary_cols = ["rank"] + [
            col for col in span_df.columns if col.startswith("Unnamed")
        ]
        span_df = span_df.drop(columns=unnecessary_cols)
        span_df = span_df.rename(columns={"text": "kpi", "text_b": "paragraph"})

        # Add the kpi id
        span_df["kpi_id"] = span_df["kpi"].map(reversed_kpi_mapping)

        # Change the order of columns
        first_cols = ["pdf_name", "kpi", "kpi_id", "answer", "page"]
        column_order = first_cols + [
            col for col in span_df.columns if col not in first_cols
        ]
        span_df = span_df[column_order]

        result_path = os.path.join(infer_config.result_dir, predictions_file_name)
        span_df.to_csv(result_path)
        _logger.info("Save the result of KPI extraction to {}".format(result_path))
        all_span_dfs.append(span_df)

    concatenated_dfs = (
        pd.concat(all_span_dfs) if len(all_span_dfs) > 0 else pd.DataFrame()
    )
    metrics_df = pd.DataFrame()
    if len(metrics_df_list) > 0:
        metrics_df = pd.concat(metrics_df_list)
        _logger.info(
            "Metrics for KPI from revelant paragraphs are: \nTotal Number of Data Points Processed = {} \nTotal Inference Time = {} \nAverage Inference Time Per CSV = {} \nMinimum Inference Time of CSV = {} \nMaximum Inference Time of CSV = {} \nStd of Inference Times of CSVs = {} \nAverage Time Per Data Point Processed= {} \nAverage Data Points Processed Per Second = {} \n"
            .format(
                metrics_df['Number of Data Points'].sum(),
                metrics_df['Total Inference Time'].sum(),
                metrics_df['Total Inference Time'].mean(),
                metrics_df['Total Inference Time'].min(),
                metrics_df['Total Inference Time'].max(),
                metrics_df['Total Inference Time'].std(),
                metrics_df['Time per data point'].mean(),
                metrics_df['Data points per sec'].mean(),
            )
        )
    model.close_multiprocessing_pool()
    return concatenated_dfs, metrics_df

In [106]:
def cleanup_for_kpi():
    """Empty the KPI results folder so a benchmark run starts clean.

    Creates ``BENCHMARK_FOLDER / "infer_kpi"`` if it is missing, then removes
    every file, link and sub-directory inside it. A failure to delete an
    entry is printed and does not stop the cleanup.
    """
    results_dir = BENCHMARK_FOLDER / "infer_kpi"
    if not os.path.exists(results_dir):
        pathlib.Path(results_dir).mkdir(parents=True, exist_ok=True)

    for entry_name in os.listdir(results_dir):
        entry_path = os.path.join(results_dir, entry_name)
        try:
            # Links are unlinked rather than followed, even if they point to
            # a directory.
            is_file_or_link = os.path.isfile(entry_path) or os.path.islink(entry_path)
            if is_file_or_link:
                os.unlink(entry_path)
            elif os.path.isdir(entry_path):
                shutil.rmtree(entry_path)
        except Exception as err:
            print('Failed to delete %s. Reason: %s' % (entry_path, err))

### 3.4. Running Benchmarks

In [107]:
# Repeat the KPI benchmark num_runs times and keep each run's timing metrics.
num_runs = 2
kpi_metrics_dfs = []
for i in range(num_runs):
    # Empty the KPI results folder so no file is skipped as already processed.
    cleanup_for_kpi()
    # infer_on_relevance_results() closes the model's multiprocessing pool when
    # it finishes, so the model is loaded again at the start of every run.
    initialize_kpi_infer()
    result_df, kpi_metrics_df = infer_on_relevance_results()
    kpi_metrics_dfs.append(kpi_metrics_df)

08/08/2022 09:25:47 - INFO - __main__ -   #################### Starting KPI Inference for the following relevance CSV files found in /opt/app-root/src/aicoe-osc-demo-2022-07-13-21-52/data/benchmark/infer_relevance:
['04_NOVATEK_AR_2016_ENG_11_predictions_relevant.csv'] 
08/08/2022 09:25:47 - INFO - __main__ -   #################### 1/1
08/08/2022 09:25:47 - INFO - __main__ -   Starting KPI Extraction for 04_NOVATEK_AR_2016_ENG_11
Inferencing Samples: 100%|██████████| 3/3 [00:00<00:00,  4.96 Batches/s]
Inferencing Samples: 100%|██████████| 3/3 [00:00<00:00,  8.47 Batches/s]
Inferencing Samples: 100%|██████████| 3/3 [00:00<00:00,  6.73 Batches/s]
Inferencing Samples: 100%|██████████| 3/3 [00:00<00:00,  7.26 Batches/s]
Inferencing Samples: 100%|██████████| 3/3 [00:00<00:00,  6.48 Batches/s]
Inferencing Samples: 100%|██████████| 3/3 [00:00<00:00,  7.03 Batches/s]
Inferencing Samples: 100%|██████████| 3/3 [00:00<00:00,  8.23 Batches/s]
Inferencing Samples: 100%|██████████| 3/3 [00:00<00:00,

In [108]:
# Show the per-file timing metrics recorded for each KPI benchmark run.
for i in range(num_runs):
    print(kpi_metrics_dfs[i].head())

                    PDF Name  Number of Data Points  Total Inference Time  \
0  04_NOVATEK_AR_2016_ENG_11                    404              9.389342   

   Time per data point  Data points per sec  
0             0.023241            43.027508  
                    PDF Name  Number of Data Points  Total Inference Time  \
0  04_NOVATEK_AR_2016_ENG_11                    404              9.409791   

   Time per data point  Data points per sec  
0             0.023292            42.934006  
