In [5]:
import pandas as pd
import numpy as np

In [1]:
from google.colab import files
uploaded = files.upload()
image_paths = list(uploaded.keys())

ModuleNotFoundError: No module named 'google.colab'

In [2]:
!pip install git+https://github.com/openai/CLIP.git transformers pillow torch torchvision pandas sentence-transformers scikit-learn

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-qkfhcz9j
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-qkfhcz9j
^C
[31mERROR: Operation cancelled by user[0m[31m
[0m

In [None]:
import torch
import clip
from PIL import Image
import pandas as pd
from transformers import BlipProcessor, BlipForConditionalGeneration
from google.colab import files
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model, clip_preprocess = clip.load("ViT-B/32", device=device)

## Creating the dataset's.

In [None]:
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

google_goals = ['Educate employees across all levels',
            'Reward progress and integrate recognition in employee performance assessment',
            'Help develop a Forest Positive approach for the forest products industry that is based on sound science and delivers forest health benefits. As the concept of a Forest Positive approach is better defined, we will seek to implement Forest Positive actions that we believe will serve to sustain and expand the protection of working forests P&G depends on']

goal_tokens_clip = [clip.tokenize([goal]).to(device) for goal in google_goals]
goal_embeddings_sbert = sbert_model.encode(google_goals)

column_names = ["Image Name", "BLIP Caption"] + \
               [f"Goal_{i+1}" for i in range(len(google_goals))] + \
               [f"TF-IDF_Goal_{i+1}" for i in range(len(google_goals))] + \
               [f"SBERT_Goal_{i+1}" for i in range(len(google_goals))] + \
               [f"Image-Goal_{i+1}" for i in range(len(google_goals))]

results = []

for image_path in image_paths:
    try:
        image = Image.open(image_path)
        blip_inputs = blip_processor(image, return_tensors="pt").to(device)
        with torch.no_grad():
            blip_output = blip_model.generate(**blip_inputs)
            blip_caption = blip_processor.decode(blip_output[0], skip_special_tokens=True)

        image_input = clip_preprocess(image).unsqueeze(0).to(device)
        with torch.no_grad():
            image_features = clip_model.encode_image(image_input)
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)

        caption_tokenized = clip.tokenize([blip_caption]).to(device)
        with torch.no_grad():
            caption_features = clip_model.encode_text(caption_tokenized)
        caption_features = caption_features / caption_features.norm(dim=-1, keepdim=True)

        tfidf_vectorizer = TfidfVectorizer()
        tfidf_matrix = tfidf_vectorizer.fit_transform([blip_caption] + google_goals)
        tfidf_similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()

        caption_embedding_sbert = sbert_model.encode([blip_caption])
        sbert_similarities = cosine_similarity(caption_embedding_sbert, goal_embeddings_sbert).flatten()

        image_goal_similarities = []
        for goal_token in goal_tokens_clip:
            with torch.no_grad():
                goal_features = clip_model.encode_text(goal_token)
            goal_features = goal_features / goal_features.norm(dim=-1, keepdim=True)
            image_goal_similarity = (image_features @ goal_features.T).item()
            image_goal_similarities.append(image_goal_similarity)

        row_data = {
            "Image Name": image_path,
            "BLIP Caption": blip_caption,
        }
        row_data.update({f"Goal_{i+1}": google_goals[i] for i in range(len(google_goals))})
        row_data.update({f"TF-IDF_Goal_{i+1}": tfidf_similarities[i] for i in range(len(tfidf_similarities))})
        row_data.update({f"SBERT_Goal_{i+1}": sbert_similarities[i] for i in range(len(sbert_similarities))})
        row_data.update({f"Image-Goal_{i+1}": image_goal_similarities[i] for i in range(len(image_goal_similarities))})

        results.append(row_data)

    except Exception as e:
        print(f"Error processing {image_path}: {e}")

dataset = pd.DataFrame(results, columns=column_names)

dataset.head()


## extracted goals from the pivot goal 

In [17]:
cardinal_focus = ['Climate', 'GHG', 'Governance', 'Distribution', 'Compliance Standards', 'Transparency, Waste']
cardinal_goals = ['Minimize waste generated in our operations and maintain high rates of reuse and recycling' ,
                 'By the end of 2021, we plan to set a public greenhouse gas emissions reduction goal for our pharmaceutical distribution business']

In [18]:
boeing_focus = ['Toxics', 'Compliance Standards', 'Transparency','Renewables', 'Fuel']
boeing_goals = ['Catalyze the industry toward sustainable aviation fuel commercialization',
               'Work with aviation stakeholders to ensure that all Boeing products comply with chemical restrictions and bans ']

In [19]:
citi_focus = ['Governance', 'Compliance Standards', 'Transparency']
citi_goals = ['Update Statement of Supplier Principles and disseminate it to all suppliers.',
             'Monitor emerging risks and trends',
             'Achieve 100% of suppliers to respond to Corporate Responsibility Questionnaire (CRQ) in all regions by 2019',
             'Finalize and implement supply chain policies for paper and paper products, IT and e-waste disposal and travel and logistics by 2019.']

In [20]:
google_focus = ['Energy', 'Buildings','Climate','GHG','Water']
google_goals = ['Maintain or improve quarterly PUE at each Google data center, year over year', 
               'Maintain carbon neutrality for our operations each year',
               'Reduce potable water intensity at our Bay Area headquarters by 5% by the end of 2019, against a 2017 baseline']

In [21]:
khc_focus = ['Energy', 'Water','Land Use', 'Forest', 'Food & Ag', 'Transparency']
khc_goals = ['Reduce energy consumption by 15% by 2020 (2015 baseline)', 
             'Reduce water consumption by 15% by 2020 (2015 baseline)',
             'Purchase 100% sustainable and traceable palm oil']

In [22]:
pfizer_focus = ['Climate', 'GHG', 'Water' ]
pfizer_goals = ['By 2020, total hazardous and nonhazardous waste will be reduced by 15% compared with the 2012 baseline',
                'By 2020, greenhouse gas emissions (Total Scope 1 and Scope 2) will be reduced by 20% compared with the 2012 baseline',
                'By 2020, total water withdrawal (excluding non-contact cooling water) will be reduced by 5% compared with the 2012 baseline.']

In [23]:
PG_focus = ['Forest','ComplianceStandards']
PG_goals = ['Educate employees across all levels',
            'Reward progress and integrate recognition in employee performance assessment',
            'Help develop a Forest Positive approach for the forest products industry that is based on sound science and delivers forest health benefits. As the concept of a Forest Positive approach is better defined, we will seek to implement Forest Positive actions that we believe will serve to sustain and expand the protection of working forests P&G depends on']

## labeling part code.

In [25]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
from IPython.display import clear_output 

google_goals = ['Educate employees across all levels',
            'Reward progress and integrate recognition in employee performance assessment',
            'Help develop a Forest Positive approach for the forest products industry that is based on sound science and delivers forest health benefits. As the concept of a Forest Positive approach is better defined, we will seek to implement Forest Positive actions that we believe will serve to sustain and expand the protection of working forests P&G depends on']

dataset_path = './datasets/V2/p&g.csv'

df = pd.read_csv(dataset_path)

if 'Image Name' not in df.columns:
    raise ValueError("Dataset must have a column named 'Image Name'")

image_folder = './Pictures/P&G/'

labels_data = []

def clear_screen():
    if os.name == 'nt':
        os.system('cls')
    else:
        os.system('clear')

for _, row in df.iterrows():
    image_name = row['Image Name']
    image_path = os.path.join(image_folder, image_name)
    
    if not os.path.exists(image_path):
        print(f"Warning: Image {image_name} not found. Skipping.")
        continue

    image = Image.open(image_path)
    plt.imshow(image)
    plt.axis('off')  
    plt.draw() 
    plt.pause(0.1)  
    plt.close()  

    print(f"\nLabeling Image: {image_name}")
    for i, goal in enumerate(google_goals, start=1):
        print(f"\nGoal {i}: {goal}")
        label = input(f"Is this image relevant to Goal {i}? (1 for Yes, 0 for No): ")

        while label not in ['0', '1']:
            print("Invalid input. Please enter 1 (Yes) or 0 (No).")
            label = input(f"Is this image relevant to Goal {i}? (1 for Yes, 0 for No): ")

        labels_data.append((image_name, f"Goal {i} Label", label))

    if os.environ.get('DISPLAY', '') == '':  # Terminal-based environments
        clear_screen()
    else:
        clear_output(wait=True)

labels_df = pd.DataFrame(labels_data, columns=['Image Name', 'Goal', 'Label'])

labels_df_pivot = labels_df.pivot(index='Image Name', columns='Goal', values='Label')

df = pd.merge(df, labels_df_pivot, on='Image Name', how='left')

output_path = './datasets/V2/labeled_p&g.csv'  # Specify the output path
df.to_csv(output_path, index=False)

print(f"\nLabeling complete! The updated dataset has been saved to {output_path}.")



Labeling complete! The updated dataset has been saved to ./datasets/V2/labeled_p&g.csv.


## result aggregation

In [18]:
import pandas as pd
import os

def calculate_map(relevance_list):
    total_relevant = sum(relevance_list)
    if total_relevant == 0:
        return 0.0  
    
    precision_sum = 0.0
    relevant_count = 0
    
    for idx, relevance in enumerate(relevance_list, start=1):
        if relevance == 1:
            relevant_count += 1
            precision_at_k = relevant_count / idx
            precision_sum += precision_at_k
    
    ap = precision_sum / total_relevant
    return ap

def compute_map_for_datasets(file_paths, top_k=10):
    results = []
    
    for file_path in file_paths:
        dataset_name = os.path.basename(file_path).split('.')[0]
        data = pd.read_csv(file_path)
                methods = ["TF-IDF", "SBERT", "Image"]
        
        for method in methods:
            for col in data.columns:
                if method in col:
                    goal_number = col.split("_")[-1]  # e.g., Goal_1, Goal_2, etc.
                    goal_label_col = f"Goal {goal_number} Label"  # Corresponding goal label column
                    
                    if goal_label_col in data.columns:
                        sorted_data = data.sort_values(by=col, ascending=False)
                        
                        top_k_relevance = sorted_data[goal_label_col].head(top_k).tolist()
                        
                        map_score = calculate_map(top_k_relevance)
                                                results.append({
                            "Dataset": dataset_name,
                            "Goal": goal_number,
                            "Method": method,
                            "MAP": map_score
                        })
    
    return pd.DataFrame(results)

file_paths = [
    './datasets/V2/labeled_boeing.csv',
    './datasets/V2/labeled_cardinal.csv',
    './datasets/V2/labeled_citi.csv',
    './datasets/V2/labeled_google.csv',
    './datasets/V2/labeled_khc.csv',
    './datasets/V2/labeled_pfizer.csv',
    './datasets/V2/labeled_p&g.csv'
]

top_k = 10  
map_results = compute_map_for_datasets(file_paths, top_k=top_k)

output_file = './datasets/V2/map_results_for_top_10.csv'
map_results.to_csv(output_file, index=False)


MAP results saved to './datasets/V2/map_results_for_top_10.csv'.


In [19]:
import pandas as pd

map_results = pd.read_csv('./datasets/V2/map_results_for_top_10.csv')

method_stats = map_results.groupby("Method")["MAP"].agg(["mean", "std"]).reset_index()

method_stats_file = './datasets/V2/method_stats.csv'
method_stats.to_csv(method_stats_file, index=False)

print(f"Method statistics (mean, std) saved to '{method_stats_file}'.")
print(method_stats)


Method statistics (mean, std) saved to './datasets/V2/method_stats.csv'.
   Method      mean       std
0   Image  0.464315  0.369896
1   SBERT  0.384954  0.394615
2  TF-IDF  0.213894  0.289296
