## ANNOTATION TOOL

In [144]:
import pandas as pd
import os
import glob
import numpy as np
import ipywidgets as widgets

# Notebook-wide pandas display settings: show up to 100 rows and never
# truncate the text inside a column (questions and paragraphs are long).
for option_name, option_value in (
    ("display.max_rows", 100),
    ("display.max_colwidth", None),
):
    pd.set_option(option_name, option_value)

#### KPI MAPPING DOWNLOAD

In [145]:
import os
import pathlib
from dotenv import load_dotenv
from src.data.s3_communication import S3FileType, S3Communication

# Look for credentials.env in CREDENTIAL_DOTENV_DIR, falling back to PWD and
# finally to the default app root. The file is optional; when it exists, its
# values override variables that are already set in the environment.
fallback_dotenv_dir = os.environ.get("PWD", "/opt/app-root/src")
dotenv_dir = os.environ.get("CREDENTIAL_DOTENV_DIR", fallback_dotenv_dir)
dotenv_path = pathlib.Path(dotenv_dir).joinpath("credentials.env")
if os.path.exists(dotenv_path):
    load_dotenv(dotenv_path=dotenv_path, override=True)

# Open the S3 connection with the endpoint, keys and bucket taken from the
# environment.
s3c = S3Communication(
    s3_endpoint_url=os.getenv("S3_ENDPOINT"),
    aws_access_key_id=os.getenv("S3_ACCESS_KEY"),
    aws_secret_access_key=os.getenv("S3_SECRET_KEY"),
    s3_bucket=os.getenv("S3_BUCKET"),
)

# Fetch the KPI mapping table (CSV, first line is the header) from S3.
kpi_mapping_df = s3c.download_df_from_s3(
    "corpdata/ESG/kpi_mapping", "kpi_mapping.csv", filetype=S3FileType.CSV, header=0
)

Save kpi mapping to csv

In [146]:
# Keep a local copy of the mapping in the current working directory.
# to_csv is called with its defaults, so the DataFrame index is written as well.
kpi_mapping_df.to_csv("kpi_mapping.csv")

Print kpi mapping

In [147]:
# Display the first 30 rows of the mapping (the last expression of a cell is rendered).
kpi_mapping_df.head(30)

Unnamed: 0,kpi_id,question,sectors,add_year,kpi_category,Unnamed: 5,Unnamed: 6
0,0.0,What is the company name?,"OG, CM, CU",False,TEXT,,
1,1.0,In which year was the annual report or the sustainability report published?,"OG, CM, CU",False,TEXT,,
2,2.0,What is the total volume of proven and probable hydrocarbons reserves?,OG,True,"TEXT, TABLE",,
3,2.1,What is the volume of estimated proven hydrocarbons reserves?,OG,True,"TEXT, TABLE",,
4,2.2,What is the volume of estimated probable hydrocarbons reserves?,OG,True,"TEXT, TABLE",,
5,3.0,What is the total volume of hydrocarbons production?,OG,True,"TEXT, TABLE",,
6,3.1,What is the total volume of crude oil liquid production?,OG,True,"TEXT, TABLE",,
7,3.2,What is the total volume of natural gas liquid production?,OG,True,"TEXT, TABLE",,
8,3.3,What is the total volume of natural gas production?,OG,True,"TEXT, TABLE",,
9,4.0,What is the annual total production from coal?,CU,True,"TEXT, TABLE",,


### 1. Settings

Please change the settings to the needed ones.

In [148]:
# Folder that contains annotations.xlsx (read in section 2, written in section 4.4).
annotations_path = "/opt/app-root/src/corporate_data_pipeline/NLP_ANNOTATION_TOOL"
# Folder that is scanned for the prediction files to review.
output_path = "/opt/app-root/src/corporate_data_pipeline/NLP_ANNOTATION_TOOL/output"
# Name that is stored with every newly generated annotation row.
annotator = "Max"
# Every KPI id listed in the downloaded mapping.
all_kpi = list(kpi_mapping_df["kpi_id"])
# To review every KPI, use the following line instead of the explicit list below:
# kpi_of_interest = all_kpi
kpi_of_interest = [0, 1, 6, 7, 8, 9, 10, 11, 12, 13, 14]

In [149]:
# Show whole-number KPI ids without the trailing ".0" (2.0 -> 2, 2.1 stays 2.1).
# Going through float() keeps the cell re-runnable: after the first run the
# list already holds ints, and int.is_integer() only exists from Python 3.12 on.
# Slice assignment updates the list in place, so a kpi_of_interest that was
# set to all_kpi sees the normalised ids as well.
all_kpi[:] = [int(kpi_id) if float(kpi_id).is_integer() else kpi_id for kpi_id in all_kpi]
print("All KPI's are " + ", ".join(str(kpi_id) for kpi_id in all_kpi) + ".")
print("KPI's of interest are " + ", ".join(str(kpi_id) for kpi_id in kpi_of_interest) + ".")

All KPI's are 0, 1, 2, 2.1, 2.2, 3, 3.1, 3.2, 3.3, 4, 4.1, 4.2, 5, 5.1, 5.2, 6, 7, 8, 9, 10, 11, 12, 13, 14.
KPI's of interest are 0, 1, 6, 7, 8, 9, 10, 11, 12, 13, 14.


### 2. Preloading

Load the existing annotations (just execute).

In [150]:
# Read the workbook with the existing annotations. New rows are collected in
# a separate working copy (df_out) that is exported in section 4.4.
annotations_file = annotations_path + "/annotations.xlsx"
df_annotations = pd.read_excel(annotations_file)
df_out = df_annotations.copy()

### 3. Annotations overview

The following cell gives an overview of the existing predictions and shows which files need further investigation (just execute).

In [151]:
# Names (without folder) of all entries found in the output folder.
outputs = [os.path.basename(path) for path in glob.glob(output_path + "/*")]
if len(outputs) > 1:
    print(f"There are {len(outputs)} output files in the output folder:" + "\n" + "\n".join(outputs))
elif len(outputs) == 1:
    print("There is one output file in the output folder.")
else:
    print("There are no new files in the output folder.")

# Completeness is decided on sets: neither the row order of the annotation
# file nor several annotations for the same KPI may influence the result.
required_kpis = set(kpi_of_interest)

for output_name in outputs:
    print("\n Next file considered: " + output_name)
    df_output = pd.read_csv(output_path + "/" + output_name)
    if df_output.empty:
        # Without any row there is no pdf name to look up.
        print("The file " + output_name + " contains no predictions and is skipped.")
        continue
    pdf_name = df_output["pdf_name"].values[0]
    df_annotations_temp = df_annotations[df_annotations["source_file"] == pdf_name]
    # dict.fromkeys drops repeated ids while keeping the order of first appearance.
    kpis_contained = list(
        dict.fromkeys(x for x in df_annotations_temp["kpi_id"].values if x in kpi_of_interest)
    )
    kpi_text = ", ".join(str(x) for x in kpis_contained)
    if len(kpis_contained) > 1:
        print(
            f"For pdf with name \"{pdf_name}\" in file {output_name}"
            f" we have already annotations for the kpi's {kpi_text}."
        )
    elif len(kpis_contained) == 1:
        print(
            f"For pdf with name \"{pdf_name}\" in file {output_name}"
            f" we have already an annotation for the kpi {kpi_text}."
        )
    else:
        print(f"For pdf with name \"{pdf_name}\" we have no annotations yet for the kpi's under investigation.")
    if required_kpis.issubset(kpis_contained):
        print("DONE: All kpi's of interest have been annotated for this file.")
    else:
        print("TODO: There are open annotations for that file.")

There are 14 output files in the output folder:
408712981_ENEOS Hldgs Inc_2021-01-26_predictions_kpi.csv
381772374_Valero Energy Corp_2019-12-31_predictions_kpi.csv
389905632_Engie_2019-12-31_predictions_kpi.csv
386749585_Hyundai Motor Co_2020-08-06_predictions_kpi.csv
392903246_Nissan Motor Co_2020-03-31_predictions_kpi.csv
405604636_Exxon Mobil Corp_2021-01-05_predictions_kpi.csv
286359582_Mitsubishi Estate_2018-03-31_predictions_kpi.csv
231699798_Volkswagen AG_2017-12-31_predictions_kpi.csv
test_predictions_kpi.csv
371393592_Royal Dutch Shell_2019-12-31_predictions_kpi.csv
388265206_General Motors Co_2019-12-31_predictions_kpi.csv
366497869_Imperial Oil Ltd_2018-12-31_predictions_kpi.csv
test2_predictions_kpi.csv
345132457_FDG Electric Vehic_2019-03-31_predictions_kpi.csv

 Next file considered: 408712981_ENEOS Hldgs Inc_2021-01-26_predictions_kpi.csv
For pdf with name "408712981_ENEOS Hldgs Inc_2021-01-26" we have no annotations yet for the kpi's under investigation.
TODO: There ar

### 4. Check new annotations

Please set the file you want to investigate.

In [209]:
# File name (inside output_path) of the prediction file to review.
output_file = "286359582_Mitsubishi Estate_2018-03-31_predictions_kpi.csv"

List of open tasks (just execute)

In [210]:
# Determine which KPIs of interest have no annotation yet for the pdf that
# belongs to the selected prediction file.
predictions_file = output_path + "/" + output_file
df_output = pd.read_csv(predictions_file)
pdf_name = df_output["pdf_name"].values[0]
is_selected_pdf = df_annotations["source_file"] == pdf_name
df_annotations_temp = df_annotations[is_selected_pdf]
kpis_contained = [kpi_id for kpi_id in df_annotations_temp["kpi_id"].values if kpi_id in kpi_of_interest]
open_kpis = [kpi_id for kpi_id in kpi_of_interest if kpi_id not in kpis_contained]

open_kpi_text = ", ".join(map(str, open_kpis))
if not open_kpis:
    print("There are no open kpi's.")
elif len(open_kpis) == 1:
    print("The open kpi is " + open_kpi_text + ".")
else:
    print("The open kpi's are " + open_kpi_text + ".")

The open kpi's are 0, 1, 6, 7, 8, 9, 10, 11, 12, 13, 14.


### 4.1 Detailed investigation
Please set the kpi you want to investigate.

In [402]:
# Id of the KPI to review next. It has to exist in the kpi_id column of the
# mapping; otherwise the lookup below selects no row and .values[0] raises an IndexError.
kpi_to_investigate = 7
# Question text of that KPI; the prediction rows are later selected by this text.
kpi = kpi_mapping_df.loc[kpi_mapping_df["kpi_id"] == kpi_to_investigate, "question"].values[0]

Get the outcome of the machine (just execute)

In [404]:
def dropdown_eventhandler(change):
    """Refresh the output area with the current widget selections.

    The handler is registered on both ``dropdown`` and ``toggle_paragraph``,
    so it can run before any rank has been chosen. ``dropdown.value`` is
    ``None`` in that case and must not be indexed.

    Args:
        change: Change notification passed by the observed widget. It is not
            used; the current values are read from the widgets directly.
    """
    selection = dropdown.value
    # A dropdown value is [rank, row id]; without a selection there is no id.
    id_correct_answer = selection[1] if selection is not None else None
    output.clear_output()
    with output:
        print("id_correct_answer: " + str(id_correct_answer))
        print("correct paragraph: " + str(toggle_paragraph.value))


# Check specific KPIs: keep only the predictions for the question under review.
df_output = pd.read_csv(output_path + "/" + output_file)
df_output = df_output.drop(columns="Unnamed: 0")
df_output_check = df_output[df_output["kpi"] == kpi]

# Dropdown entries: the label is the 1-based rank, the value is
# [rank, row id in df_output]. The last entry states that none of the
# predictions is correct.
index_list = df_output_check.index.values
select_list = [(rank, [rank, row_id]) for rank, row_id in enumerate(index_list, start=1)]
select_list.append(("no correct answer", [-1, -1]))

output = widgets.Output()
dropdown = widgets.Dropdown(description="Choose rank:", options=select_list, value=None)
toggle_paragraph = widgets.ToggleButtons(
    options=[("Correct", 1), ("Incorrect", -1)],
    description="Paragraph:",
    disabled=False,
    button_style="warning",  # 'success', 'info', 'warning', 'danger' or ''
    # One tooltip per button, in the order of `options`.
    tooltips=[
        "Use the paragraph of the chosen prediction",
        "Use the paragraph defined in the correct_paragraph settings instead",
    ],
)
# Both widgets refresh the same summary in the output area.
dropdown.observe(dropdown_eventhandler, names="value")
toggle_paragraph.observe(dropdown_eventhandler, names="value")

display(df_output_check.head(4))
display(dropdown)
display(toggle_paragraph)
display(output)

Unnamed: 0,pdf_name,kpi,kpi_id,answer,page,paragraph,source,score,no_ans_score,no_answer_score_plus_boost
33,286359582_Mitsubishi Estate_2018-03-31,What is the total amount of energy indirect greenhouse gases emissions referred to as scope 2 emissions?,,no_answer,,,Text,2.655437,,
34,286359582_Mitsubishi Estate_2018-03-31,What is the total amount of energy indirect greenhouse gases emissions referred to as scope 2 emissions?,,150,29.0,Emissions per unit of floor space (kg-CO2/m2) 150,Text,-3.332447,17.333275,2.333275
35,286359582_Mitsubishi Estate_2018-03-31,What is the total amount of energy indirect greenhouse gases emissions referred to as scope 2 emissions?,,.,29.0,Fuel oil 0.0% City gas 0.9% Heat (district heating and cooling) 23.6%,Text,-8.171566,17.655437,2.655437
36,286359582_Mitsubishi Estate_2018-03-31,What is the total amount of energy indirect greenhouse gases emissions referred to as scope 2 emissions?,,"529,000 t a year",10.0,"About 272,000 kl/ About 529,000 t a year 44,967 t/About 5,648,000 m3 a year 9/20/22 buildings and projects",Text,-12.567283,17.399271,2.399271


Dropdown(description='Choose rank:', options=((1, [1, 33]), (2, [2, 34]), (3, [3, 35]), (4, [4, 36]), ('no cor…



Output()

### 4.2 Set answer

Define the id where one can find the correct paragraph and/or the answer. In case an optimal paragraph and/or answer does not exist, please specify it in the variables "correct_*".

In [395]:
# Company-related values that are stored with the annotation
company = "Mitsubishi Estate"
year = 2018
sector = "Automotive"

# Only used when the paragraph toggle is set to "Incorrect", i.e. the correct
# paragraph is not among the predictions
correct_paragraph = "Publication date: December 27, 2018"  #'Use of our refinery and natural gas products (Scope 3 Category 11) (million tonnes CO2 equivalent) [Q]'
correct_paragraph_page = 3
correct_paragraph_source = "TEXT"  # Either "TEXT" or "TABLE"

# Only used when "no correct answer" is selected in the dropdown
correct_answer = "December 27, 2018"  # "576"

### 4.3 Generate annotation outcome

After having set the correct annotations we can generate a new entry for the annotations file (just execute).

In [396]:
# Build one annotation row from the widget selections and the values set in
# section 4.2, and add it to the working copy df_out.
# Every execution of this cell adds one more row, so run it once per annotation.

# dropdown.value is [rank, row id in df_output]; "no correct answer" is [-1, -1].
if dropdown.value is None:
    raise ValueError("Please choose a rank in the dropdown of section 4.1 before generating the annotation.")
rank_selected, id_selected = dropdown.value
# The widget offers a single combined choice, so paragraph and answer share it.
id_correct_paragraph = id_selected
rank_correct_paragraph = rank_selected
id_correct_answer = id_selected
rank_correct_answer = rank_selected

# The manually entered paragraph is used when the predicted one is flagged as
# incorrect, and also when "no correct answer" is selected: -1 is not a row
# of df_output_check, so there is no predicted paragraph to read.
if toggle_paragraph.value == -1 or id_correct_paragraph == -1:
    paragraph = "[" + str(correct_paragraph) + "]"
    source_page = "[" + str(correct_paragraph_page) + "]"
    source = correct_paragraph_source
    paragraph_pred_rank = -1
    paragraph_pred_score = -100
else:
    paragraph = "[" + str(df_output_check.loc[id_correct_paragraph, "paragraph"]) + "]"
    source_page = "[" + str(df_output_check.loc[id_correct_paragraph, "page"]) + "]"
    source = df_output_check.loc[id_correct_paragraph, "source"]
    paragraph_pred_rank = rank_correct_paragraph
    paragraph_pred_score = 100

if id_correct_answer == -1:
    answer = correct_answer
    kpi_pred_rank = -1
    kpi_pred_score = -100
else:
    answer = df_output_check.loc[id_correct_answer, "answer"]
    kpi_pred_rank = rank_correct_answer
    kpi_pred_score = df_output_check.loc[id_correct_answer, "score"]

# NOTE(review): with an empty annotation sheet the fallback makes the first
# number 2 (1 + 1), exactly as before — confirm that this is intended.
try:
    max_num = np.max(df_out["number"].values)
except ValueError:
    max_num = 1

# Index label of the new row; an empty working copy starts at 0.
next_index = np.max(df_out.index) + 1 if len(df_out) else 0

# Values are positional and must follow the column order of annotations.xlsx.
new_data = [
    max_num + 1,
    company,
    df_output_check["pdf_name"].values[0],
    source_page,
    kpi_to_investigate,
    year,
    answer,
    source,
    paragraph,
    annotator,
    sector,
    "",
    paragraph_pred_rank,
    paragraph_pred_score,
    kpi_pred_rank,
    kpi_pred_score,
]
# DataFrame.append no longer exists in pandas 2.x; build the one-row frame
# directly and concatenate it instead.
df_temp = pd.DataFrame([new_data], columns=df_annotations.columns, index=[next_index])
df_out = pd.concat([df_out, df_temp])
df_out.tail(4)

Unnamed: 0,number,company,source_file,source_page,kpi_id,year,answer,data_type,relevant_paragraphs,annotator,sector,issue,paragraph_pred_rank,paragraph_pred_score,kpi_pred_rank,kpi_pred_score
36,36,Mitsubishi Estate,286359582_Mitsubishi Estate_2018-03-31,[10.0],14,2018,"529,000 t a year",Text,"[About 272,000 kl/ About 529,000 t a year 44,967 t/About 5,648,000 m3 a year 9/20/22 buildings and projects]",Max,Automotive,,3,100,3,-12.213235
37,37,Mitsubishi Estate,286359582_Mitsubishi Estate_2018-03-31,[29.0],14,2018,150,Text,[Emissions per unit of floor space (kg-CO2/m2) 150],Max,Automotive,,2,100,2,-2.841095
38,38,Mitsubishi Estate,286359582_Mitsubishi Estate_2018-03-31,[29.0],14,2018,150,Text,[Emissions per unit of floor space (kg-CO2/m2) 150],Max,Automotive,,2,100,2,-2.841095
39,39,Mitsubishi Estate,286359582_Mitsubishi Estate_2018-03-31,[3],14,2018,no_answer,TEXT,"[Publication date: December 27, 2018]",Max,Automotive,,-1,-100,1,2.258224


Check if there are still open kpi's. If yes start again at point 4.1. (just execute)

In [272]:
# Same check as the list of open tasks, but based on the working copy
# df_out, so annotations generated in this session count as done.
is_selected_pdf = df_out["source_file"] == pdf_name
df_out_temp = df_out[is_selected_pdf]
kpis_contained = [kpi_id for kpi_id in df_out_temp["kpi_id"].values if kpi_id in kpi_of_interest]
open_kpis = [kpi_id for kpi_id in kpi_of_interest if kpi_id not in kpis_contained]

open_kpi_text = ", ".join(map(str, open_kpis))
if not open_kpis:
    print("There are no open kpi's.")
elif len(open_kpis) == 1:
    print("The open kpi is " + open_kpi_text + ".")
else:
    print("The open kpi's are " + open_kpi_text + ".")

There are no open kpi's.


### 4.4 Export outcome

In [273]:
# Overwrite the annotation workbook with the working copy (index not written).
annotations_file = annotations_path + "/annotations.xlsx"
df_out.to_excel(annotations_file, index=False)

In [274]:
from datetime import datetime

# Write a second copy whose file name carries the export time
# (year_month_day_hour_minute_second).
export_moment = datetime.now()
construction_time = export_moment.strftime("%Y_%m_%d_%H_%M_%S")
snapshot_file = annotations_path + "/annotations_" + construction_time + ".xlsx"
df_out.to_excel(snapshot_file, index=False)

Note: After exporting the new annotations, restart the notebook from the beginning if you want to check the annotations of another file.

## Notes:

* In the output file, the year, company and sector are missing.

Annotation Notes:
* No kpi_ids in the inference output
* No answer has no "no answer"-score
* How should "no answer" be handled in the annotations if the answer is really not in the PDF? This is also information, but it probably cannot be handled yet.