# Inference with transformers

In [1]:
import os
import pathlib
from dotenv import load_dotenv
from datasets import Dataset, DatasetDict
import pandas as pd
from transformers import AutoModelForSequenceClassification
from src.data.s3_communication import S3Communication, S3FileType
from src.components.utils.kpi_mapping import get_kpi_mapping_category
import json
import time
import config
from transformers import AutoTokenizer
from torch import cuda
import torch
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if cuda.is_available() else 'cpu'
# Directory the tokenizer and relevance model are loaded from further down.
# NOTE(review): absolute, environment-specific path; adjust when running
# outside the /opt/app-root image.
local_model_path = '/opt/app-root/src/aicoe-osc-demo/models/transformers/RELEVANCE'

In [2]:
# Load credentials
# The directory holding credentials.env is taken from CREDENTIAL_DOTENV_DIR,
# then from PWD, then from a fixed default.
_fallback_dotenv_dir = os.environ.get("PWD", "/opt/app-root/src")
dotenv_dir = os.environ.get("CREDENTIAL_DOTENV_DIR", _fallback_dotenv_dir)
dotenv_path = pathlib.Path(dotenv_dir).joinpath("credentials.env")
# Only load the file when it is actually present; values in it override
# variables that are already set in the environment.
if os.path.exists(dotenv_path):
    load_dotenv(dotenv_path=dotenv_path, override=True)

In [3]:
# init s3 connector
# Endpoint, credentials and bucket are read from environment variables;
# os.getenv yields None for any variable that is not set.
s3c = S3Communication(
    s3_endpoint_url=os.getenv("S3_ENDPOINT"),
    aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
    aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
    s3_bucket=os.getenv("S3_BUCKET"),
)

## Retrieve the test dataset and the trained models

In [4]:
# Fetch the train/test dataset files from S3 into config.BASE_PROCESSED_DATA.
# NOTE(review): behaviour is inferred from the helper's name (files under the
# given prefix are copied to the given directory) - confirm in S3Communication.
s3c.download_files_in_prefix_to_dir(
    config.BASE_TRAIN_TEST_DATASET_S3_PREFIX,
    config.BASE_PROCESSED_DATA)

In [5]:
def _read_split(path_suffix):
    """Read one split csv located under ``config.BASE_PROCESSED_DATA``.

    Args:
        path_suffix: Text appended to the processed-data directory to form
            the csv path (includes the leading '/').

    Returns:
        Tuple ``(csv_path, frame)`` where ``frame`` uses the first csv column
        as index and has 'text' renamed to 'question' and 'text_b' renamed to
        'sentence'.
    """
    csv_path = str(config.BASE_PROCESSED_DATA) + path_suffix
    frame = pd.read_csv(csv_path, index_col=0)
    return csv_path, frame.rename(columns={'text': 'question', 'text_b': 'sentence'})


test_data_path, test_data = _read_split('/rel_test_split.csv')

train_data_path, train_data = _read_split('/rel_train_split.csv')

In [6]:
# Wrap both pandas splits as datasets; the test split is stored without its
# 'label' column.
trds = Dataset.from_pandas(train_data)
teds = Dataset.from_pandas(test_data.drop(columns='label'))

# Collect the two splits under the keys 'train' and 'test'.
climate_dataset = DatasetDict({'train': trds, 'test': teds})

In [7]:
# Load the tokenizer and the sequence-classification model from the local
# model directory; the model is moved to the device selected above.
tokenizer = AutoTokenizer.from_pretrained(local_model_path, use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(local_model_path).to(device)

In [8]:
def create_batches(data_df, batch_size=32):
    """Tokenize the question/sentence pairs of ``data_df`` batch by batch.

    Rows are consumed in frame order and grouped into consecutive batches of
    ``batch_size`` pairs; the final batch holds whatever rows remain.

    Args:
        data_df: DataFrame with a 'question' and a 'sentence' column.
        batch_size: Number of pairs handed to the tokenizer per call.

    Returns:
        List with one tokenizer output per batch. Uses the notebook-level
        ``tokenizer`` with truncation, padding and PyTorch tensors enabled.
    """
    def _encode(pairs):
        # Same tokenizer settings for full and trailing batches.
        return tokenizer(pairs,
                         truncation=True,
                         return_tensors='pt',
                         padding=True)

    encoded_batches = []
    pending_pairs = []
    for _, record in data_df.iterrows():
        # Flush a full batch before adding the current row to a fresh one.
        if len(pending_pairs) >= batch_size:
            encoded_batches.append(_encode(pending_pairs))
            pending_pairs = []
        pending_pairs.append([record['question'], record['sentence']])

    # Encode the trailing, possibly smaller, batch.
    if pending_pairs:
        encoded_batches.append(_encode(pending_pairs))
    return encoded_batches


# Tokenize the test split using the default batch size of create_batches.
encoded_dataset = create_batches(test_data)

In [9]:
def predict(encoded_dataset):
    """Return the predicted class index for every example in the batches.

    Args:
        encoded_dataset: Iterable of tokenized batches, each providing
            'input_ids' and 'attention_mask' tensors.

    Returns:
        Flat list of ints: the argmax over the logits of each example, in
        batch order. Uses the notebook-level ``model`` and ``device``.
    """
    predicted_labels = []
    # Inference only, so gradient tracking is switched off for the whole pass.
    with torch.no_grad():
        for encoded_batch in encoded_dataset:
            result = model(
                input_ids=encoded_batch['input_ids'].to(device),
                attention_mask=encoded_batch['attention_mask'].to(device),
            )
            predicted_labels += torch.argmax(result.logits, dim=1).tolist()
    return predicted_labels

# Run inference on all pdfs

In [11]:
def gather_data(pdf_name, pdf_path, kpi_questions=None):
    """Build one record per (KPI question, paragraph) pair of an extracted pdf.

    Args:
        pdf_name: Name of the pdf; copied into every record.
        pdf_path: Path of the json file with the extracted text. The json is
            expected to map a page key to a list of paragraphs.
        kpi_questions: Optional iterable of question strings. When omitted,
            the notebook-level ``questions`` list is used.

    Returns:
        List of dicts with the keys "page", "pdf_name", "question" and
        "sentence", ordered by question, then page, then paragraph.
    """
    # Read the file the caller asked for; the previous version ignored
    # ``pdf_path`` and depended on a notebook-global ``file_path``.
    pdf_content = read_text_from_json(pdf_path)
    if kpi_questions is None:
        kpi_questions = questions
    # Build all possible combinations of paragraphs and questions.
    # Keep track of the page the text is extracted from and of the pdf it
    # belongs to.
    return [
        {
            "page": page_num,
            "pdf_name": pdf_name,
            "question": kpi_question,
            "sentence": paragraph,
        }
        for kpi_question in kpi_questions
        for page_num, page_content in pdf_content.items()
        for paragraph in page_content
    ]


def read_text_from_json(file):
    """Read text from json.

    Args:
        file: Path of the json file to load.

    Returns:
        The parsed json content.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(file, encoding="utf-8") as f:
        return json.load(f)

In [12]:
# Folder layout of the benchmark data: <DATA_FOLDER>/benchmark/extraction.
BENCHMARK_FOLDER = config.DATA_FOLDER / "benchmark"
BENCHMARK_EXTRACTION_FOLDER = BENCHMARK_FOLDER / "extraction"

# Create whichever of the two folders does not exist yet, parent first.
for _benchmark_dir in (BENCHMARK_FOLDER, BENCHMARK_EXTRACTION_FOLDER):
    if not os.path.exists(_benchmark_dir):
        pathlib.Path(_benchmark_dir).mkdir(parents=True, exist_ok=True)

In [None]:
# Fetch the KPI mapping table from S3 as a DataFrame.
kpi_df = s3c.download_df_from_s3(
    "aicoe-osc-demo/kpi_mapping", "kpi_mapping.csv", filetype=S3FileType.CSV, header=0
)

kmc = get_kpi_mapping_category(kpi_df)
# Keep the question text of every KPI whose sector list shares an entry with
# {"OG", "CM", "CU"} and whose category contains "TEXT".
questions = [
    question_text
    for kpi_id, (question_text, sectors) in kmc["KPI_MAPPING_MODEL"].items()
    if not {"OG", "CM", "CU"}.isdisjoint(sectors)
    and "TEXT" in kmc["KPI_CATEGORY"][kpi_id]
]

# Map each extracted json (file name without extension) to its path, leaving
# out files whose path mentions "table_meta".
text_paths = sorted(BENCHMARK_EXTRACTION_FOLDER.rglob("*.json"))
all_text_path_dict = {
    os.path.splitext(json_path.name)[0]: json_path
    for json_path in text_paths
    if "table_meta" not in str(json_path)
}

# Accumulators filled by the benchmark loop.
df_list = []
metrics_df_list = []
num_pdfs = len(all_text_path_dict)

In [None]:
# Benchmark relevance inference on every extracted pdf and collect one row of
# timing metrics per pdf.
chunk_size = 1000  # data points tokenized and predicted per timed chunk
metrics_columns = ['PDF Name', 'Number of Pages', 'Number of Data Points',
                   'Total Inference Time', 'Time per data point', 'Data points per sec']

for i, (pdf_name, file_path) in enumerate(all_text_path_dict.items()):
    print(f'Processing {i}/{len(all_text_path_dict)}, {pdf_name}')
    data = gather_data(pdf_name, file_path)
    num_data_points = len(data)
    if num_data_points == 0:
        # No paragraphs or no questions: there is nothing to time, and the
        # page lookup and the per-data-point division below would fail.
        print(f'Skipping {pdf_name}: no data points')
        continue

    # NOTE(review): this is the page key of the last record, not a real count;
    # it is off by one if page keys are zero-based - confirm against the
    # extraction format.
    num_pages = data[-1]['page']
    total_file_time = 0

    # Predictions are only produced for timing purposes; they are not stored
    # in df_list.
    predictions = list()
    for chunk_begin in range(0, num_data_points, chunk_size):
        # perf_counter is monotonic, so the measured interval cannot be
        # distorted by system clock adjustments.
        chunk_start = time.perf_counter()
        data_chunk = data[chunk_begin:chunk_begin + chunk_size]
        temp_df = pd.DataFrame(data_chunk).drop(['pdf_name', 'page'], axis=1)
        encoded_dataset = create_batches(temp_df,
                                         batch_size=128)
        predictions.extend(predict(encoded_dataset))
        total_file_time += time.perf_counter() - chunk_start

    time_per_data_point = total_file_time / num_data_points
    # Guard against a zero elapsed time reported by a coarse clock.
    data_points_per_sec = (1 / time_per_data_point
                           if time_per_data_point > 0 else float('inf'))

    metrics_list = [
        [pdf_name,
         int(num_pages),
         num_data_points,
         total_file_time,
         time_per_data_point,
         data_points_per_sec]]

    metrics_df_list.append(pd.DataFrame(metrics_list, columns=metrics_columns))


# Combine the per-pdf frames; fall back to empty frames when nothing was
# collected.
concatenated_dfs = pd.concat(df_list) if len(df_list) > 0 else pd.DataFrame()
metrics_df = pd.concat(metrics_df_list) if len(metrics_df_list) > 0 else pd.DataFrame()

Processing 0/144, 04_NOVATEK_AR_2016_ENG_11
Processing 1/144, 04_NOVATEK_AR_2018_ENG_15
Processing 2/144, 2013_book_mol_ar_eng_fin
Processing 3/144, 2015_BASF_Report
Processing 4/144, 2017 Sustainability Report
Processing 5/144, 2017-Sustainability-Report
Processing 6/144, 2017_SustainabilityReport_2_9_Web
Processing 7/144, 2017_sustainability_report
Processing 8/144, 2017_sustainability_report_tcm14-130393
Processing 9/144, 2018 Annual Report
Processing 10/144, 2018_sustainability_report
Processing 11/144, 2019 Annual Report
Processing 12/144, 2019_global_sustainability_plan_tcm14-148662
Processing 13/144, 28022019-Repsol-Annual-Financial-Report-2018_tcm14-147383
Processing 14/144, 2_LOTOS_Group Directors Report 2019
Processing 15/144, AGL Energy Ltd Annual Report 2019
Processing 16/144, AGL Energy Ltd FY19 Carbon Scenario Analysis
Processing 17/144, AKERBP-Annual-Report-2016
Processing 18/144, AKERBP-Annual-Report-2017
Processing 19/144, ANNUAL REPORT 2017
Processing 20/144, AR_FS_20

In [None]:
# Persist the per-pdf timing metrics so they can be reloaded without repeating
# the inference run.
metrics_df.to_pickle('../../reports/benchmarks/distilbert_relevance.pkl')

In [18]:
# Reload the saved timing metrics. Pickle files can execute code on load, so
# only read files produced by this notebook.
metrics_df = pd.read_pickle('../../reports/benchmarks/distilbert_relevance.pkl')

In [20]:
# Summary statistics of the per-pdf inference time per data point (seconds).
metrics_df['Time per data point'].describe()

count    144.000000
mean       0.002231
std        0.000903
min        0.000556
25%        0.001702
50%        0.002161
75%        0.002694
max        0.004880
Name: Time per data point, dtype: float64

The average time per data point is 0.002231 seconds. A pdf with an average of 157 pages and 360 data points per page (157 × 360 = 56,520 data points) therefore takes about 126 seconds, or 2.1 minutes, to process. That is roughly 3 times faster than the FARM model, which takes 6.5 minutes for the same task.