# Automatic Sustainability Objective Detection

## === Setup ===

### Importing Libraries

In [11]:
import os
import sys

import IPython.display
import pandas

# Project-local modules live outside the notebook directory, so the source
# folder must be on sys.path before they are imported.
sys.path.append("../../goalspotter_core/source")
import document
import data_preprocessing
import transformer_model

# Lift pandas' display limits (None = unlimited) so result tables are shown
# in full: every row, every column, and untruncated cell text.
for display_option in ("display.max_rows", "display.max_columns", "display.max_colwidth"):
    pandas.set_option(display_option, None)

### Setting up the Data Preprocessor

In [12]:
# Shared preprocessing helper: extract_objectives_from_url calls its
# clean_text_blocks and filter_text_blocks methods on the sentence frame.
data_preprocessor = data_preprocessing.DataPreprocessing()

### Loading Our Trained Models

In [13]:
# Class labels handed to the classifier. extract_objectives_from_url reads the
# "Goal" column of the model's predictions as the goal score.
# NOTE(review): presumably the label order must match the order used when the
# model was trained -- confirm before changing it.
target_values = ["Not Goal", "Goal"]

# Text classifier restored from the saved goal-detection model directory.
goal_detection_model = transformer_model.TextClassification(
    target_values,
    name="distilroberta-base",
    load_from="../../goalspotter_core/models/goal-detection",
)

### Objective Extraction Helper Function

In [14]:
def extract_objectives_from_url(url, content_type="pdf"):
    """Score every sentence of a report with the goal detection model.

    Args:
        url: Location handed to ``document.Document``. The content is fetched
            with ``read_local_file``, so this is presumably a local file path
            rather than a remote URL -- TODO confirm.
        content_type: Value assigned to the document's ``content_type``
            attribute before the file is read. Defaults to ``"pdf"``.

    Returns:
        pandas.DataFrame with the original sentence in ``"Text Blocks"`` and
        the model's ``"Goal"`` prediction in ``"Goal Score"``, sorted by score
        in descending order, with missing values replaced by ``""``.
    """
    # --- Extract the sentences of the sustainability report ---
    report = document.Document(url)
    report.content_type = content_type
    raw_content = report.read_local_file()
    report_sentences = report.get_sentences(
        report.segment_text(report.parse_content(raw_content))
    )
    scored = pandas.DataFrame({"Text Blocks": report_sentences})

    # --- Run the goal detection model ---
    # The model sees a cleaned working copy ("text"); "Text Blocks" keeps the
    # sentence as extracted so it can be reported unchanged.
    scored["text"] = scored["Text Blocks"].copy()
    scored = data_preprocessor.clean_text_blocks(scored, "text", level="essential")
    # NOTE(review): keep_only_size=(0, 300) looks like a text-length window -- confirm units.
    scored = data_preprocessor.filter_text_blocks(scored, "text", keep_only_size=(0, 300))
    model_output = goal_detection_model.predict(scored["text"].tolist())
    # .values assigns positionally, ignoring the prediction frame's own index.
    scored["Goal Score"] = model_output["Goal"].values

    # Drop the working column, rank best-first, and blank out missing values.
    return (
        scored.drop(columns=["text"])
        .sort_values("Goal Score", ascending=False)
        .fillna("")
    )

## === Processing New Sustainability Reports ===

In [None]:
# Report to process: the same stem names the input PDF and the output CSV.
file_name = "google_2021"
url = f"../documents/google/{file_name}.pdf"
output_path = f"../objectives/google/{file_name}.csv"

# Post-processing thresholds, named so they can be tuned in one place.
MIN_TEXT_LENGTH = 25  # sentences shorter than this many characters are dropped
MIN_GOAL_SCORE = 0.1  # sentences with a lower goal score are dropped

# To preview the PDF inline, uncomment:
# IPython.display.display(IPython.display.IFrame(f"file://{url}", width=1000, height=800))
scored_sentences = extract_objectives_from_url(url)

# The frame arrives sorted by score (best first), so keep="first" retains the
# highest-scoring copy of any repeated sentence.
unique_sentences = scored_sentences.drop_duplicates(subset="Text Blocks", keep="first")
long_sentences = unique_sentences[unique_sentences["Text Blocks"].str.len() >= MIN_TEXT_LENGTH]

# .copy() gives an independent frame before a column is inserted in place.
df = long_sentences[long_sentences["Goal Score"] >= MIN_GOAL_SCORE].copy()

# 1-based rank of each remaining objective, in descending score order.
df.insert(0, "ID", range(1, len(df) + 1))

# to_csv does not create missing parent directories; create the output folder
# first so the write cannot fail after the expensive extraction above.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)
df

Unnamed: 0,ID,Text Blocks,Goal Score
271,1,Maintain ISO 50001 energy management system certification for Google-owned data centers that meet certain operational milestones.,0.987067
278,2,"In 2020, our global landfill diversion rate for data center operations was 81%.",0.986799
362,3,Achieve UL 2799 Zero Waste to Landfill certification at all final assembly consumer hardware manufacturing sites by 2022.,0.982384
90,4,replenish more water than we consume by 2030 and to support water security,0.981121
342,5,Achieve carbon neutrality for 100% of shipments of Made by Google products to and from Google’s direct customers by 2020.,0.980361
343,6,Publish product environmental reports for 100% of flagship consumer hardware products launching in 2020 and beyond.,0.976243
276,7,Achieve Zero Waste to Landfill for our global data center operations.,0.97615
357,8,Include recycled materials in 100% of Made by Google products launching in 2022 and every year after.,0.970373
280,9,"Replenish 120% of the water we consume, on average, across our offices and data centers.",0.969653
328,10,Pursue the ILFI Living Building Challenge Certification for our Charleston East and Bay View campuses— two of Google’s first ground-up development projects at our Bay Area headquarters.,0.966345
