## Inference with Sparse Models 
In this notebook, we will benchmark the inference timings for the sparse models. 

In [None]:
import os
import pathlib
from dotenv import load_dotenv
from datasets import Dataset, DatasetDict
import pandas as pd
from transformers import AutoModelForSequenceClassification
from src.data.s3_communication import S3Communication, S3FileType
from src.components.utils.kpi_mapping import get_kpi_mapping_category
import json
import time
import config
from transformers import AutoTokenizer
from torch import cuda
import torch
import warnings
warnings.filterwarnings("ignore")
device = 'cuda' if cuda.is_available() else 'cpu'

In [5]:
# Locate credentials.env: CREDENTIAL_DOTENV_DIR wins, then PWD, then the
# default application root.
fallback_dotenv_dir = os.environ.get("PWD", "/opt/app-root/src")
dotenv_dir = os.environ.get("CREDENTIAL_DOTENV_DIR", fallback_dotenv_dir)
dotenv_path = pathlib.Path(dotenv_dir) / "credentials.env"
# override=True: values from the file replace variables that are already set.
if os.path.exists(dotenv_path):
    load_dotenv(dotenv_path=dotenv_path, override=True)

In [6]:
# Initialise the S3 connector from environment variables. os.getenv returns
# None for any variable that is not set, so a missing credentials file shows up
# here as None arguments rather than as an immediate error.
# NOTE(review): how S3Communication reacts to None values is not visible here.
s3c = S3Communication(
    s3_endpoint_url=os.getenv("S3_ENDPOINT"),
    aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
    aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
    s3_bucket=os.getenv("S3_BUCKET"),
)

## Helper functions

In [48]:
def create_batches(data_df, batch_size=32):
    """Tokenize the question/sentence pairs of ``data_df`` batch by batch.

    Rows are consumed in ``iterrows()`` order and grouped into lists of
    ``[question, sentence]`` pairs. Each group is passed to the module-level
    ``tokenizer`` with truncation, padding and PyTorch tensors enabled.

    Args:
        data_df: DataFrame with ``question`` and ``sentence`` columns.
        batch_size: Maximum number of pairs per tokenizer call.

    Returns:
        A list with one tokenizer output per batch; empty when ``data_df``
        has no rows.
    """
    def _encode(pairs):
        # Same tokenizer settings for full batches and the trailing partial one.
        return tokenizer(pairs,
                         truncation=True,
                         return_tensors='pt',
                         padding=True)

    encoded_batches = []
    pending_pairs = []
    for _, record in data_df.iterrows():
        # Flush a full batch before adding the current row to a new one.
        if len(pending_pairs) >= batch_size:
            encoded_batches.append(_encode(pending_pairs))
            pending_pairs = []
        pending_pairs.append([record['question'], record['sentence']])
    if pending_pairs:
        encoded_batches.append(_encode(pending_pairs))
    return encoded_batches

In [49]:
def gather_data(pdf_name, pdf_path):
    """Build every (question, paragraph) combination for one extracted PDF.

    The extraction JSON at ``pdf_path`` is read with ``read_text_from_json``
    and iterated as a mapping from page key to a collection of paragraphs.
    Each paragraph is paired with every entry of the module-level
    ``questions`` list.

    Args:
        pdf_name: Name recorded in each output record's ``pdf_name`` field.
        pdf_path: Path of the extraction JSON file to read.

    Returns:
        A list of dicts with keys ``page``, ``pdf_name``, ``question`` and
        ``sentence``, ordered by question, then page, then paragraph.
    """
    # Read from the path passed in. The previous version read the global
    # `file_path`, which only worked when the caller's loop variable happened
    # to have that name.
    pdf_content = read_text_from_json(pdf_path)
    text_data = []
    # Keep track of the page each paragraph was extracted from and the PDF it
    # belongs to, so predictions can be traced back to their source.
    for kpi_question in questions:
        text_data.extend(
            {
                "page": page_num,
                "pdf_name": pdf_name,
                "question": kpi_question,
                "sentence": paragraph,
            }
            for page_num, page_content in pdf_content.items()
            for paragraph in page_content
        )
    return text_data

In [50]:
def predict(encoded_dataset):
    """Run the module-level ``model`` over tokenized batches.

    Args:
        encoded_dataset: Iterable of batches, each indexable by
            ``'input_ids'`` and ``'attention_mask'`` and holding tensors.

    Returns:
        A flat list with, for every input row, the index of the largest
        logit along axis 1, in batch order.
    """
    predicted_labels = []
    # Gradients are never needed for inference, so disable them for the whole run.
    with torch.no_grad():
        for encoded_batch in encoded_dataset:
            result = model(
                input_ids=encoded_batch['input_ids'].to(device),
                attention_mask=encoded_batch['attention_mask'].to(device),
            )
            predicted_labels += result.logits.argmax(axis=1).tolist()
    return predicted_labels

In [51]:
def read_text_from_json(file):
    """Return the object decoded from the JSON document stored at ``file``."""
    with open(file) as handle:
        raw_json = handle.read()
    return json.loads(raw_json)

## Retrieve the test dataset and the trained models

In [52]:
# Fetch the train/test split files from S3 into the local processed-data
# folder; the next cell reads rel_test_split.csv and rel_train_split.csv from
# config.BASE_PROCESSED_DATA.
# NOTE(review): presumably copies every object under the prefix — confirm
# against S3Communication.download_files_in_prefix_to_dir.
s3c.download_files_in_prefix_to_dir(
    config.BASE_TRAIN_TEST_DATASET_S3_PREFIX,
    config.BASE_PROCESSED_DATA)

In [53]:
# The split files name the text pair `text` / `text_b`; the rest of the
# notebook refers to them as `question` / `sentence`.
split_column_names = {'text': 'question', 'text_b':'sentence'}

test_data_path = str(config.BASE_PROCESSED_DATA)+'/rel_test_split.csv'
test_data = (
    pd.read_csv(test_data_path, index_col=0)
    .rename(columns=split_column_names)
)

train_data_path = str(config.BASE_PROCESSED_DATA)+'/rel_train_split.csv'
train_data = (
    pd.read_csv(train_data_path, index_col=0)
    .rename(columns=split_column_names)
)

In [54]:
# Wrap both splits as Hugging Face datasets; the test split is stored
# without its `label` column.
trds = Dataset.from_pandas(train_data)
teds = Dataset.from_pandas(test_data.drop(columns='label'))

climate_dataset = DatasetDict({'train': trds, 'test': teds})

In [55]:
climate_dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'question', 'sentence', '__index_level_0__'],
        num_rows: 2033
    })
    test: Dataset({
        features: ['question', 'sentence', '__index_level_0__'],
        num_rows: 509
    })
})

**Model Paths**

In [56]:
# Names of the sparse models to benchmark; each one lives in a directory of
# the same name under the transformers model root.
model_names = ['distilbert_mnli_pruned80',
               'distilbert_qqp_pruned80',
               'obert_mnli_pruned90']

_models_root = '/opt/app-root/src/aicoe-osc-demo/models/transformers/'
# The trailing slash is kept because later cells append file names directly.
local_model_paths = [f'{_models_root}{name}/' for name in model_names]

In [57]:
config.DATA_FOLDER

PosixPath('/opt/app-root/src/data')

In [58]:
# Benchmark inputs live under the configured data folder; the extracted PDF
# text is expected in its `extraction` sub-folder.
BENCHMARK_FOLDER = config.DATA_FOLDER
BENCHMARK_EXTRACTION_FOLDER = BENCHMARK_FOLDER / "extraction"

# Create whichever of the two folders is missing, parent first.
for benchmark_dir in (BENCHMARK_FOLDER, BENCHMARK_EXTRACTION_FOLDER):
    if not os.path.exists(benchmark_dir):
        pathlib.Path(benchmark_dir).mkdir(parents=True, exist_ok=True)

In [59]:
# KPI mapping table, downloaded from S3 as a DataFrame.
kpi_df = s3c.download_df_from_s3(
    "aicoe-osc-demo/kpi_mapping",
    "kpi_mapping.csv",
    filetype=S3FileType.CSV,
    header=0)
kmc = get_kpi_mapping_category(kpi_df)

# Keep the question text of every KPI that is tagged with at least one of
# these sector codes and whose category list contains "TEXT".
relevant_sectors = {"OG", "CM", "CU"}
questions = [
    q_text
    for q_id, (q_text, sect) in kmc["KPI_MAPPING_MODEL"].items()
    if not relevant_sectors.isdisjoint(sect)
    and "TEXT" in kmc["KPI_CATEGORY"][q_id]
]

# Map each extraction file's base name (extension stripped) to its path,
# ignoring any path that mentions "table_meta".
text_paths = sorted(BENCHMARK_EXTRACTION_FOLDER.rglob("*.json"))
all_text_path_dict = {
    os.path.splitext(os.path.basename(json_path))[0]: json_path
    for json_path in (
        candidate for candidate in text_paths
        if "table_meta" not in str(candidate)
    )
}

In [60]:
#import itertools
#all_text_path_dict = dict(itertools.islice(all_text_path_dict.items(), 2))

In [61]:
# Result accumulators. Note that the benchmark loop below appends to
# `metric_list` (no "s"), which is a different name from `metrics_list` here.
df_list, metrics_df_list, metrics_list = [], [], []
metric_dfs = pd.DataFrame()
# Number of extracted PDFs that will be benchmarked per model.
num_pdfs = len(all_text_path_dict)

In [62]:
all_text_path_dict

{'04_NOVATEK_AR_2016_ENG_11': PosixPath('/opt/app-root/src/data/extraction/04_NOVATEK_AR_2016_ENG_11.json'),
 '04_NOVATEK_AR_2018_ENG_15': PosixPath('/opt/app-root/src/data/extraction/04_NOVATEK_AR_2018_ENG_15.json'),
 '2013_book_mol_ar_eng_fin': PosixPath('/opt/app-root/src/data/extraction/2013_book_mol_ar_eng_fin.json'),
 '2015_BASF_Report': PosixPath('/opt/app-root/src/data/extraction/2015_BASF_Report.json'),
 '2017 Sustainability Report': PosixPath('/opt/app-root/src/data/extraction/2017 Sustainability Report.json'),
 '2017-Sustainability-Report': PosixPath('/opt/app-root/src/data/extraction/2017-Sustainability-Report.json'),
 '2017_SustainabilityReport_2_9_Web': PosixPath('/opt/app-root/src/data/extraction/2017_SustainabilityReport_2_9_Web.json'),
 '2017_sustainability_report': PosixPath('/opt/app-root/src/data/extraction/2017_sustainability_report.json'),
 '2017_sustainability_report_tcm14-130393': PosixPath('/opt/app-root/src/data/extraction/2017_sustainability_report_tcm14-1303

In [63]:
print(num_pdfs)

144


In [None]:
# Benchmark every model on every extracted PDF and record timing metrics.
metric_list = []
# Number of (question, paragraph) pairs tokenized and scored per timed chunk.
chunk_size = 1000
for local_model_path, model_name in zip(local_model_paths, model_names):
    # The tokenizer, model and on-disk size depend only on the model, so load
    # them once per model instead of once per PDF. `tokenizer` and `model` are
    # the module-level names read by create_batches() and predict().
    tokenizer = AutoTokenizer.from_pretrained(local_model_path, use_fast=True)
    model = AutoModelForSequenceClassification.from_pretrained(local_model_path).to(device)
    # Size of the weights file in MB (10**6 bytes).
    model_size = os.path.getsize(
        os.path.join(local_model_path, 'pytorch_model.bin'))/1000000

    for i, (pdf_name, file_path) in enumerate(all_text_path_dict.items()):
        print(f'Processing {i+1}/{len(all_text_path_dict)}, {pdf_name}')
        data = gather_data(pdf_name, file_path)
        num_data_points = len(data)
        if num_data_points == 0:
            # Nothing to score: skip, rather than fail on data[-1] and the
            # per-data-point division below.
            print(f'Skipping {pdf_name}: no data points')
            continue
        # Page key of the last record, reported as the page count.
        num_pages = data[-1]['page']
        total_file_time = 0

        predictions = list()
        for chunk_begin in range(0, num_data_points, chunk_size):
            # The timed span covers DataFrame construction, tokenization and
            # the forward passes. perf_counter is a monotonic clock intended
            # for measuring intervals.
            chunk_start = time.perf_counter()
            data_chunk = data[chunk_begin:chunk_begin + chunk_size]
            temp_df = pd.DataFrame(data_chunk).drop(['pdf_name', 'page'], axis=1)
            encoded_dataset = create_batches(temp_df, batch_size=128)
            predictions.extend(predict(encoded_dataset))
            total_file_time += time.perf_counter() - chunk_start

        time_per_data_point = total_file_time / num_data_points
        data_points_per_sec = 1/time_per_data_point

        metric_list.append(
            {'Model Name':model_name,
             'Model Size(MB)': model_size,
             'PDF Name':pdf_name,
             'Number of Pages':int(num_pages),
             'Number of Data Points':num_data_points,
             'Total Inference Time':total_file_time,
             'Time per data point':time_per_data_point,
             'Data points per sec':data_points_per_sec})

    # metric_list keeps growing across models, so each per-model CSV also
    # contains the rows of the models benchmarked before it.
    file_to_save = pd.DataFrame(metric_list)
    file_to_save.to_csv(f"file_to_save_{model_name}.csv")

In [None]:
# One row per (model, PDF) record collected in metric_list by the benchmark loop.
metric_dfs = pd.DataFrame(metric_list)

In [None]:
metric_dfs.head()

**Model Name: distilbert_mnli_pruned80, Size: 267.85 MB**

In [32]:
# NOTE(review): the benchmark loop writes file_to_save_<model_name>.csv; no
# visible cell writes this file, so confirm where it comes from.
df = pd.read_csv("file_to_save_models_perf.csv")
# Rows measured for the distilbert_mnli_pruned80 model only.
df1 = df.loc[df['Model Name'] == 'distilbert_mnli_pruned80']

In [33]:
df1.head()

Unnamed: 0.1,Unnamed: 0,Model Name,Model Size(MB),PDF Name,Number of Pages,Number of Data Points,Total Inference Time,Time per data point,Data points per sec
0,0,distilbert_mnli_pruned80,267.857431,04_NOVATEK_AR_2016_ENG_11,119,24264,67.520567,0.002783,359.357172
1,1,distilbert_mnli_pruned80,267.857431,04_NOVATEK_AR_2018_ENG_15,105,23112,61.819848,0.002675,373.860511
2,2,distilbert_mnli_pruned80,267.857431,2013_book_mol_ar_eng_fin,135,63024,214.157398,0.003398,294.288223
3,3,distilbert_mnli_pruned80,267.857431,2015_BASF_Report,261,76392,270.319963,0.003539,282.598441
4,4,distilbert_mnli_pruned80,267.857431,2017 Sustainability Report,57,21936,37.991996,0.001732,577.384773


In [34]:
#Average number of pages

df1['Number of Pages'].describe()

count    144.000000
mean     156.958333
std      117.550041
min        0.000000
25%       73.250000
50%      127.500000
75%      224.250000
max      653.000000
Name: Number of Pages, dtype: float64

In [35]:
# Average time per data point

df1['Time per data point'].describe()

count    144.000000
mean       0.002414
std        0.000953
min        0.000629
25%        0.001872
50%        0.002326
75%        0.002907
max        0.005167
Name: Time per data point, dtype: float64

In [36]:
# Average number of data points

df1['Number of Data Points'].describe()

count       144.0000
mean      60703.5000
std       68273.4301
min         240.0000
25%       20136.0000
50%       42192.0000
75%       80496.0000
max      469680.0000
Name: Number of Data Points, dtype: float64

The average time per data point is 0.002414 seconds. An average PDF has ~157 pages and ~387 data points per page (~60,700 data points in total), so it takes ~146 seconds (~2.4 minutes) to process.

**Model Name: distilbert_qqp_pruned80, Size: 267.8 MB**

In [37]:
# NOTE(review): the benchmark loop writes file_to_save_<model_name>.csv; no
# visible cell writes this file, so confirm where it comes from.
df = pd.read_csv("file_to_save_models_perf.csv")
# Rows measured for the distilbert_qqp_pruned80 model only.
df2 = df.loc[df['Model Name'] == 'distilbert_qqp_pruned80']

In [38]:
df2.head()

Unnamed: 0.1,Unnamed: 0,Model Name,Model Size(MB),PDF Name,Number of Pages,Number of Data Points,Total Inference Time,Time per data point,Data points per sec
144,144,distilbert_qqp_pruned80,267.857431,04_NOVATEK_AR_2016_ENG_11,119,24264,67.192897,0.002769,361.109598
145,145,distilbert_qqp_pruned80,267.857431,04_NOVATEK_AR_2018_ENG_15,105,23112,61.663003,0.002668,374.811459
146,146,distilbert_qqp_pruned80,267.857431,2013_book_mol_ar_eng_fin,135,63024,213.188499,0.003383,295.625704
147,147,distilbert_qqp_pruned80,267.857431,2015_BASF_Report,261,76392,269.923002,0.003533,283.014042
148,148,distilbert_qqp_pruned80,267.857431,2017 Sustainability Report,57,21936,37.874583,0.001727,579.174686


In [39]:
# Average time per data point
df2['Time per data point'].describe()

count    144.000000
mean       0.002413
std        0.000952
min        0.000624
25%        0.001876
50%        0.002324
75%        0.002906
max        0.005172
Name: Time per data point, dtype: float64

The average time per data point is 0.002413 seconds. An average PDF has ~157 pages and ~387 data points per page (~60,700 data points in total), so it takes ~146 seconds (~2.4 minutes) to process.

**Model Name: obert_mnli_pruned90, Size: 438.011 MB**

In [42]:
# NOTE(review): the benchmark loop writes file_to_save_<model_name>.csv; no
# visible cell writes this file, so confirm where it comes from.
df = pd.read_csv("file_to_save_models_perf.csv")
# Rows measured for the obert_mnli_pruned90 model only.
df3 = df.loc[df['Model Name'] == 'obert_mnli_pruned90']

In [43]:
df3.head()

Unnamed: 0.1,Unnamed: 0,Model Name,Model Size(MB),PDF Name,Number of Pages,Number of Data Points,Total Inference Time,Time per data point,Data points per sec
288,288,obert_mnli_pruned90,438.011337,04_NOVATEK_AR_2016_ENG_11,119,24264,124.831353,0.005145,194.374244
289,289,obert_mnli_pruned90,438.011337,04_NOVATEK_AR_2018_ENG_15,105,23112,113.925587,0.004929,202.869264
290,290,obert_mnli_pruned90,438.011337,2013_book_mol_ar_eng_fin,135,63024,399.951331,0.006346,157.579173
291,291,obert_mnli_pruned90,438.011337,2015_BASF_Report,261,76392,508.218936,0.006653,150.313171
292,292,obert_mnli_pruned90,438.011337,2017 Sustainability Report,57,21936,68.904382,0.003141,318.354206


In [44]:
# Average time per data point

df3['Time per data point'].describe()

count    144.000000
mean       0.004468
std        0.001838
min        0.001012
25%        0.003391
50%        0.004323
75%        0.005401
max        0.009847
Name: Time per data point, dtype: float64

The average time per data point is 0.004468 seconds. An average PDF has ~157 pages and ~387 data points per page (~60,700 data points in total), so it takes ~271 seconds (~4.5 minutes) to process.

# Conclusion

Here we build a summary table that compares the models on size, inference time for an average PDF, and F1 score.

In [39]:
# One row per model: (name, size in MB, minutes per average PDF, F1 score).
# The sparse-model sizes and times repeat the figures reported in the sections
# above. NOTE(review): the two baseline rows and all F1 scores are not
# computed in this notebook — confirm their source.
_summary_columns = ('model_name',
                    'model_size (MB)',
                    'inference_time_for_avg_pdf (mins)',
                    'F1_Score')
_summary_rows = [
    ('farm bert based model', 475.6, 6.5, 0.9156),
    ('distilbert-base-uncased', 268, 2.1, 0.9179),
    ('distilbert_mnli_pruned80', 267.8, 2.4, 0.9071),
    ('distilbert_qqp_pruned80', 267.8, 2.4, 0.9139),
    ('obert_mnli_pruned90', 438.0, 4.5, 0.8959),
]
# Transpose the rows into the column-oriented dict that pd.DataFrame consumes.
final_table = {
    column: list(values)
    for column, values in zip(_summary_columns, zip(*_summary_rows))
}

In [40]:
pd.DataFrame(final_table)

Unnamed: 0,model_name,model_size (MB),inference_time_for_avg_pdf (mins),F1_Score
0,farm bert based model,475.6,6.5,0.9156
1,distilbert-base-uncased,268.0,2.1,0.9179
2,distilbert_mnli_pruned80,267.8,2.4,0.9071
3,distilbert_qqp_pruned80,267.8,2.4,0.9139
4,obert_mnli_pruned90,438.0,4.5,0.8959
