# Evaluation

## Importing Libraries

In [None]:
import io
import os
import sys
import json
import glob
import pickle
import PIL.Image
import pandas as pd
import IPython.display as dsp
import matplotlib.pyplot as plt
import rouge_score.rouge_scorer
import bert_score
import transformers

# Restrict Hugging Face transformers logging to errors only, which keeps
# model-loading warnings out of the notebook output.
transformers.logging.set_verbosity_error()

# The project modules are not installed as a package; extend the import path
# first so that `company` and `baseline` can be resolved (presumably they live
# in ../source relative to this notebook -- confirm the working directory).
sys.path.append("../source")
import company
import baseline

## Initializing the Company and Baseline

In [None]:
# Company under evaluation; this name is reused below to locate the
# objectives CSVs and to name the saved output files.
company_name = "pepsico"

# System under test: build the Company object and load its database before
# any retrieval is attempted.
c = company.Company(company_name)
c.load_database()

# Baseline: build it for the same company and extract its text and images.
# NOTE(review): presumably both steps read pre-existing data for
# `company_name`; confirm in company.py / baseline.py.
b = baseline.Baseline(company_name)
b.extract_text_and_images()


## Loading the Sustainability Objective

In [None]:
# Collect every objectives CSV for this company. Sorting makes the
# concatenation order independent of the filesystem's listing order.
csv_files = sorted(glob.glob(os.path.join("../objectives", company_name, "*.csv")))
if not csv_files:
    # Without this guard pd.concat([]) fails with the opaque message
    # "No objects to concatenate"; name the real problem instead.
    raise FileNotFoundError(
        f"No objective CSV files found in {os.path.join('../objectives', company_name)}"
    )

df_list = [pd.read_csv(f) for f in csv_files]
df = pd.concat(df_list, ignore_index=True)

# Draw one objective at random from the "Text Blocks" column (unseeded, so
# every run of this cell may pick a different objective).
objective = df["Text Blocks"].sample(n=1).iloc[0]
# Bare expression: displayed as the cell output in the notebook.
objective

## Generating Verification Report

In [None]:
# Retrieve the evidence for the objective, then have the system verify the
# objective against the retrieved text evidence.
text_evidence, image_evidence = c.retrieve_evidence(objective)
system_verification_report = c.verify_objective(
    objective=objective,
    text_evidence=text_evidence,
    llm_model="llama3.2",
)

# Take the first image evidence entry and re-encode it, in its own format,
# into an in-memory buffer so the raw bytes can be stored with the report.
image_path = image_evidence[0]["record"]["image_path"]
buffer = io.BytesIO()
with PIL.Image.open(image_path) as top_image:
    top_image.save(buffer, format=top_image.format)
system_image_bytes = buffer.getvalue()


In [None]:
# Run the baseline on the same objective, using the "llama3.2" model.
baseline_verification_report = b.verify_objective(objective=objective, llm_model="llama3.2")
# Keep only the first entry returned by rank_images_for_query.
# NOTE(review): presumably this is the top-ranked image as raw bytes (it is
# stored under "baseline_image_bytes" when saving) -- confirm in baseline.py.
baseline_image_bytes = b.rank_images_for_query(objective)[0]

## Saving the Outputs

In [None]:
# Bundle everything produced for this objective so the evaluation cell can
# score it later without re-running the models.
output = {
    "objective": objective,
    "system_verification_report": system_verification_report,
    "system_image_bytes": system_image_bytes,
    "baseline_verification_report": baseline_verification_report,
    "baseline_image_bytes": baseline_image_bytes,
}

# Create the output directory on first use instead of failing in os.listdir.
os.makedirs("../outputs", exist_ok=True)

# Number the file after how many existing entries mention this company.
files = os.listdir("../outputs")
matching_files = [f for f in files if company_name in f]
count = len(matching_files)
output_path = os.path.join("../outputs", f"{company_name}_{count}.pkl")

# The count can point at an existing file (e.g. after an earlier output was
# deleted); advance the index rather than silently overwriting a result.
while os.path.exists(output_path):
    count += 1
    output_path = os.path.join("../outputs", f"{company_name}_{count}.pkl")

# Use a context manager so the handle is flushed and closed deterministically.
with open(output_path, "wb") as out_file:
    pickle.dump(output, out_file)

## Calculating the Evaluation Measures

In [None]:
# ROUGE-1/2/L scorer with stemming enabled, shared by all saved outputs.
scorer = rouge_score.rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
# Build the BERTScore model once. bert_score.score() loads the model on every
# call, which previously happened twice per saved output.
bert_scorer = bert_score.BERTScorer(lang="en")

rouge_metrics = ("rouge1", "rouge2", "rougeL")

results = []
# Sorted so the rows of the final table come out in a reproducible order.
for file in sorted(os.listdir("../outputs")):
    if not file.endswith(".pkl"):
        continue

    # NOTE: pickle.load can execute arbitrary code; only load files that were
    # written by the "Saving the Outputs" cell of this notebook.
    with open(os.path.join("../outputs", file), "rb") as f:
        data = pickle.load(f)

    # Use a dedicated name so the notebook-level `objective` selected earlier
    # is not overwritten by the last file processed here.
    saved_objective = data["objective"]
    reports = {
        "system": data["system_verification_report"],
        "baseline": data["baseline_verification_report"],
    }

    row = {"file": file}
    for variant, report in reports.items():
        # The objective is the reference text; the report is the candidate.
        rouge = scorer.score(saved_objective, report)
        for metric in rouge_metrics:
            row[f"{metric}_{variant}"] = rouge[metric].fmeasure

        # Only the F1 component of BERTScore is reported.
        _, _, f1 = bert_scorer.score([report], [saved_objective])
        row[f"bertscore_{variant}"] = f1.item()

    results.append(row)

if not results:
    # An empty frame would otherwise fail below with a cryptic KeyError.
    raise FileNotFoundError("No .pkl output files found in ../outputs")

# One row per saved output, with system/baseline columns side by side.
df = pd.DataFrame(results)[[
    "file",
    "rouge1_system", "rouge1_baseline",
    "rouge2_system", "rouge2_baseline",
    "rougeL_system", "rougeL_baseline",
    "bertscore_system", "bertscore_baseline",
]]

# Append a final "Mean" row holding the column-wise averages.
avg_row = {"file": "Mean", **df.mean(numeric_only=True).to_dict()}
df = pd.concat([df, pd.DataFrame([avg_row])], ignore_index=True)

# Round the numeric columns to two decimals for presentation.
numeric_cols = df.select_dtypes(include=["float", "int"]).columns
df[numeric_cols] = df[numeric_cols].round(2)

print(df.to_markdown(index=False))

In [None]:
# Load the human evaluation results (this rebinds `df`).
df = pd.read_csv("../outputs/human_evaluation_results.csv")
# Count how often each distinct value occurs in the "preference" column;
# as the last expression of the cell, the result is displayed as its output.
df["preference"].value_counts()