# Automatic Sustainability Objective Detection

## === Setup ===

### Importing Libraries

In [None]:
import os
import sys
import pandas
import IPython.display

sys.path.append("../../goalspotter_core/source")
import document
import data_preprocessing
import transformer_model

# Lift pandas' display limits (rows, columns, cell width) so that long text
# blocks are rendered in full when result frames are displayed.
for _display_option in ("display.max_rows", "display.max_columns", "display.max_colwidth"):
    pandas.set_option(_display_option, None)

### Setting up the Data Preprocessor

In [None]:
# Shared preprocessor instance; extract_objectives_from_url uses it to clean
# and filter text blocks before they are scored by the model.
data_preprocessor = data_preprocessing.DataPreprocessing()

### Loading Our Trained Goal Detection Model

In [None]:
# Class labels handed to the classifier; the "Goal" label is the one whose
# score is read back from the predictions in extract_objectives_from_url.
target_values = ["Not Goal", "Goal"]

# Text classifier restored from the goal-detection directory shipped with
# goalspotter_core.
goal_detection_model = transformer_model.TextClassification(
    target_values,
    name="distilroberta-base",
    load_from="../../goalspotter_core/models/goal-detection",
)

### Objective Extraction Helper Function

In [None]:
def extract_objectives_from_url(url: str, content_type: str = "pdf") -> pandas.DataFrame:
    """Score every sentence of a sustainability report with the goal detector.

    The document is read, parsed, segmented into text blocks and split into
    sentences. A working copy of each sentence is cleaned and filtered with
    the shared ``data_preprocessor`` and then scored by
    ``goal_detection_model``.

    Args:
        url: Location of the report. It is read with
            ``Document.read_local_file``, so despite the name it is used as
            a local file path here.
        content_type: Value assigned to ``Document.content_type`` before the
            content is parsed. Defaults to ``"pdf"``.

    Returns:
        A DataFrame holding the original sentence in ``"Text Blocks"`` and the
        model's score for the ``"Goal"`` label in ``"Goal Score"``, sorted by
        score in descending order, with missing values replaced by ``""``.
        When no text survives cleaning and filtering, the frame is empty and
        the model is not called.
    """
    # Extracting Text Blocks of the Sustainability Report
    doc = document.Document(url)
    doc.content_type = content_type
    content = doc.read_local_file()
    parsed_content = doc.parse_content(content)
    text_blocks = doc.segment_text(parsed_content)
    sentences = doc.get_sentences(text_blocks)
    tdf = pandas.DataFrame({"Text Blocks": sentences})

    # Running the Goal Detection Model
    # The model sees a cleaned copy in "text"; "Text Blocks" keeps the
    # original wording for the final output.
    tdf["text"] = tdf["Text Blocks"].copy()
    tdf = data_preprocessor.clean_text_blocks(tdf, "text", level="essential")
    # NOTE(review): keep_only_size presumably bounds the block length - confirm
    # the unit against DataPreprocessing.filter_text_blocks.
    tdf = data_preprocessor.filter_text_blocks(tdf, "text", keep_only_size=(0, 300))

    if tdf.empty:
        # Nothing left to score (e.g. an image-only document): skip inference
        # on an empty batch but keep the usual output schema, with a numeric
        # score column so callers can still filter on it.
        tdf["Goal Score"] = pandas.Series(dtype="float64")
    else:
        predictions = goal_detection_model.predict(tdf["text"].tolist())
        # .values assigns positionally, so a non-contiguous index left behind
        # by the filtering step cannot misalign the scores.
        tdf["Goal Score"] = predictions["Goal"].values

    tdf = tdf.drop(["text"], axis=1)
    tdf = tdf.sort_values("Goal Score", ascending=False)
    tdf = tdf.fillna("")
    return tdf

## === Processing New Sustainability Reports ===

In [None]:
# Extract, filter and export the objectives of every report stored for one
# company. One CSV per report is written to ../objectives/<company_name>/.
company_name = "walmart"
input_dir = f"../documents/{company_name}"
output_dir = f"../objectives/{company_name}"

# os.listdir returns entries in arbitrary order; sorting makes runs reproducible.
file_names = sorted(os.listdir(input_dir))

# The output directory does not depend on the file, so create it once.
os.makedirs(output_dir, exist_ok=True)

for file_name in file_names:
    input_path = f"{input_dir}/{file_name}"
    # os.listdir also yields sub-directories (e.g. notebook checkpoint
    # folders); those cannot be read as reports, so skip them.
    if not os.path.isfile(input_path):
        continue
    base_name = os.path.splitext(file_name)[0]
    output_path = f"{output_dir}/{base_name}.csv"
    # Optional preview of the source document inside the notebook:
    # IPython.display.display(IPython.display.IFrame(f"file://{input_path}", width=1000, height=800))
    df = extract_objectives_from_url(input_path)
    # The frame arrives sorted by score, so keep="first" retains the
    # highest-scoring copy of a repeated sentence.
    df = df.drop_duplicates(subset="Text Blocks", keep="first")
    # Discard fragments shorter than 25 characters.
    df = df[df["Text Blocks"].str.len() >= 25]
    # Discard sentences whose goal score is below 0.1.
    df = df[df["Goal Score"] >= 0.1]
    # Number the surviving rows from 1, in ranking order.
    df.insert(0, "ID", range(1, len(df) + 1))
    df.to_csv(output_path, index=False)
    IPython.display.display(df)