In [None]:
# Parameters
# Run configuration (presumably injected by a parameterised runner such as papermill — TODO confirm).
# Include the "ml_object_embed" column in the feature vector.
use_ml_obj = True
# Include the "ml_caption_embed" column in the feature vector.
use_ml_capt = False
# Include the "human_embed" column in the feature vector.
use_human_labels = False
# Parallel job count handed to GridSearchCV and the joblib backend.
n_jobs = 24
# S3 object key of the embeddings CSV that is downloaded from the bucket.
embeddings_path = "/data/outputs_50/finetuning_all-MiniLM-L12-v2_embeddings.csv"


# Script to Perform Region Classification Using Embeddings

Sources:
- https://imbalanced-learn.org/stable/ensemble.html#boosting

Author: Nardiena A. Pratama


In [None]:
# Folder name inserted into every S3 output key: /data/outputs_50/model_outputs/<model_output_dir>/...
model_output_dir = "clsf-v3"
# True: train and upload results. False: the results cell reads previously
# uploaded pickles instead (the training cell asserts that this is True).
run_code = True

In [None]:
!pip install imblearn fairlearn joblib seaborn
!pip install wordsegment autocorrect 
!pip install spacy==3.8.0
!python -m spacy download en_core_web_trf

In [None]:
import os
import pickle
from io import StringIO, BytesIO

import boto3
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from fairlearn.metrics import demographic_parity_ratio, equalized_odds_ratio
from imblearn.ensemble import RUSBoostClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier

from helper_scripts.preprocess import *
from helper_scripts.utility_functions import *

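## Set the AWS Bucket Name

Set `BUCKET_NAME` to the name of the S3 bucket that holds the data. Do not put quotation marks around the value: `%env` stores everything after the `=` literally, so quotes would become part of the bucket name.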

In [None]:
# Name of the S3 bucket; read back further down via os.getenv('BUCKET_NAME').
%env BUCKET_NAME=aws_bucket_name

## Connect to AWS

In [None]:
# boto3 picks up the default credentials here (the IAM role attached to the instance).
session = boto3.Session()

# S3 client shared by the cells below.
s3 = session.client('s3')

# Bucket name comes from the BUCKET_NAME environment variable (None when it is unset).
bucket_name = os.environ.get('BUCKET_NAME')


In [None]:
# (flag, embedding column) pairs, listed in the order the embeddings get concatenated.
_flagged_columns = (
    (use_ml_obj, "ml_object_embed"),
    (use_ml_capt, "ml_caption_embed"),
    (use_human_labels, "human_embed"),
)

# Keep only the columns whose flag is switched on.
ANNOTATIONS = [column for enabled, column in _flagged_columns if enabled]

In [None]:
# Show which embedding columns the flags selected.
print(ANNOTATIONS)

In [None]:
# Download the embeddings CSV from S3 and parse it into a DataFrame.
response = s3.get_object(Bucket=bucket_name, Key=embeddings_path)
csv_content = response['Body'].read().decode('utf-8')
data = pd.read_csv(StringIO(csv_content))

# Each embedding column arrives as text; convert_str_to_array reads every cell as an array.
for embed_col in ("ml_object_embed", "ml_caption_embed", "human_embed"):
    data[embed_col] = data[embed_col].apply(convert_str_to_array)

# Optional down-sampling for quick experiments (40 rows per category):
# data = data.groupby('category').apply(lambda x: x.sample(n=40, random_state=42)).reset_index(drop=True)

data

In [None]:
# Work on an independent copy holding only the columns the classifier pipeline needs.
wanted_columns = [
    'id',
    'category',
    'ml_object_embed',
    'ml_caption_embed',
    'human_embed',
    'region',
    'country',
    'income',
]
subset = data[wanted_columns].copy()
subset

In [None]:
def _join_selected_embeddings(row):
    """Concatenate the row's embeddings for every column listed in ANNOTATIONS, in order."""
    return np.concatenate([row[name] for name in ANNOTATIONS])


subset['embeddings'] = subset.apply(_join_selected_embeddings, axis=1)
subset

In [None]:
# Feature matrix: one flattened embedding vector per row, stacked into a 2-D array.
flattened_vectors = [np.array(vector).flatten() for vector in subset['embeddings']]
subset_X = np.stack(flattened_vectors)

# Target: the region of every row.
subset_y = subset['region'].values

# Categories stay a one-column DataFrame so they can be split alongside X and y.
categories = subset[['category']]

# Distinct label values, sorted by np.unique.
unique_categories = np.unique(categories)
unique_regions = np.unique(subset_y)

subset_X


In [None]:
# 80/20 split, stratified on region; the categories frame is split with the same row assignment.
(
    X_train,
    X_test,
    y_train,
    y_test,
    cats_train,
    cats_test,
) = train_test_split(
    subset_X,
    subset_y,
    categories,
    test_size=0.2,
    random_state=42,
    stratify=subset_y,
)

In [None]:
# Inspect the training feature matrix.
X_train

In [None]:
# Image category of every test row, in test-set order.
cats_test['category'].to_list()

## Use RUSBoostClassifier


In [None]:
# Distinct region labels (the classification targets).
unique_regions

In [None]:
%%time
assert run_code == True, "Run code is set to False! Change value to run code below."

n_runs = 10
random_seeds = list(range(n_runs))

all_y_predictions = dict()

all_scores = dict()
all_scores['overall_weighted'] = []
all_scores['overall_macro'] = []


for label in unique_regions:
    all_scores[label] = {'none-average': [], 'group-by': []}

all_scores_category = dict()


for curr_cat in unique_categories:
    all_scores_category[curr_cat] = {'weighted': [], 'macro': [], 'group-by': []}

for seed in random_seeds:
    print(f"Seed: {seed}")
    parameters = {
        'n_estimators': list(range(50,150,50)),
        'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.5, 1.0],
    }
    # f1_scorer = make_scorer(f1_score, average='weighted')

    dt_base_learner = DecisionTreeClassifier(random_state = seed, class_weight="balanced")

    rusboost = RUSBoostClassifier(estimator=dt_base_learner, sampling_strategy='not minority',
                                random_state=seed)
    clf = GridSearchCV(rusboost, parameters, scoring="f1_weighted", return_train_score=True, n_jobs=n_jobs)
    with joblib.parallel_backend(backend='loky', n_jobs=n_jobs):
        clf.fit(X_train,  y_train)
    print("Grid Search CV done...")
    print("Training based on best parameters...")
    real_rusboost = RUSBoostClassifier(estimator=DecisionTreeClassifier(
                                    class_weight=clf.best_estimator_.estimator_.class_weight,
                                    random_state=seed),
                   learning_rate=clf.best_estimator_.learning_rate,                     
                   n_estimators=clf.best_estimator_.n_estimators, 
                   random_state=seed,
                   sampling_strategy=clf.best_estimator_.sampling_strategy)
    real_rusboost.fit(X_train, y_train)
    print("Model has been fitted!")
    y_test_predictions = real_rusboost.predict(X_test)
    all_y_predictions[seed] = y_test_predictions
    
    print(np.mean(f1_score(y_test, y_test_predictions, average=None)))
    
    macro_f1 = f1_score(y_test, y_test_predictions, average='macro')
    print("overall Class f1: ", macro_f1)
    all_scores['overall_macro'].append(macro_f1)
    
    weighted_f1 = f1_score(y_test, y_test_predictions, average='weighted')
    print("weighted f1: ", weighted_f1 )
    all_scores['overall_weighted'].append(weighted_f1)

    #  This approach below or do subset.groupby('y_test').apply(lambda x: f1_score(x['y_test'], x['y_pred']), axis=1)
    class_f1 = f1_score(y_test, y_test_predictions, average=None, labels=unique_regions)
    print(class_f1)

    df = pd.DataFrame({'y_test': y_test, 'y_pred': y_test_predictions})
    f1_groupby = df.groupby('y_test').apply(lambda group: f1_score(group['y_test'], group['y_pred'], average='weighted')).to_dict()
    
    for idx, cls in enumerate(unique_regions):
        print(idx)
        all_scores[cls]['none-average'].append(class_f1[idx])
        all_scores[cls]['group-by'].append(f1_groupby[cls])


    # Calculate the F1 score for the current category
    print("creating df...")
    df = pd.DataFrame({'y_test': y_test, 'y_pred': y_test_predictions, 'category': cats_test['category'].to_list()})
    f1_groupby_cat = df.groupby('category').apply(lambda group: f1_score(group['y_test'], group['y_pred'], average='weighted')).to_dict()
    print("Group-By Weighted f1 cat: ", f1_groupby_cat)
    
    # Loop through each category to calculate the F1 score for that category
    for category in unique_categories:
        all_scores_category[category]['group-by'].append(f1_groupby_cat[category])
        
        # Create a mask to select the data for the current category
        category_mask = cats_test['category'] == category
        
        y_true_category = y_test[category_mask]
        y_pred_category = y_test_predictions[category_mask]
        
        # Calculate the F1 score for the current category
        f1 = f1_score(y_true_category, y_pred_category, average='macro')  
        print("Macro f1 cat: ", f1)
        all_scores_category[category]['macro'].append(f1)

        # Calculate the F1 score for the current category
        f1 = f1_score(y_true_category, y_pred_category, average='weighted')  
        print("Weighted f1 cat: ", f1)
        all_scores_category[category]['weighted'].append(f1)

        
        print(f"F1 score for category {category}: {f1}")
    

   
    


In [None]:
# Overall and per-region F1 scores collected across all seeds.
(all_scores)

In [None]:
# Per-image-category F1 scores collected across all seeds.
all_scores_category

In [None]:
# Short tag for every enabled annotation source, used to build output file names.
_flag_tags = (
    (use_ml_obj, 'ml_obj'),
    (use_ml_capt, 'ml_capt'),
    (use_human_labels, 'human_labels'),
)
parts = [tag for enabled, tag in _flag_tags if enabled]

# File-name form (underscores) and human-readable form (spaces) of the same tag list.
annotations_used_underscore = '_'.join(parts)
annotations_used = annotations_used_underscore.replace("_", " ")
annotations_used_underscore

In [None]:
# Persist (or reload) the scores of all runs, then build a plain-text summary
# that is printed and, when run_code is True, uploaded to S3.
title = annotations_used.title()


s3_path_all_scores = f'/data/outputs_50/model_outputs/{model_output_dir}/{annotations_used_underscore}_clsf_all_scores.pickle'
s3_path_all_scores_category = f'/data/outputs_50/model_outputs/{model_output_dir}/{annotations_used_underscore}_clsf_all_scores_category.pickle'
s3_path_all_y_preds = f'/data/outputs_50/model_outputs/{model_output_dir}/{annotations_used_underscore}_clsf_all_y_predictions.pickle'

if run_code:
    upload_pickle_to_s3(s3, bucket_name, s3_path_all_scores, all_scores)
    upload_pickle_to_s3(s3, bucket_name, s3_path_all_scores_category, all_scores_category)
    upload_pickle_to_s3(s3, bucket_name, s3_path_all_y_preds, all_y_predictions)
else:
    # NOTE(review): these helpers presumably unpickle the objects — only point
    # them at buckets/keys you trust, since unpickling can execute code.
    all_scores = read_pickle_from_s3(s3, bucket_name, s3_path_all_scores)
    all_scores_category = read_pickle_from_s3(s3, bucket_name, s3_path_all_scores_category)
    all_y_predictions = read_pickle_from_s3(s3, bucket_name, s3_path_all_y_preds)
    # Predictions are class labels, so averaging them across seeds is not
    # meaningful (and fails for string labels). Keep the predictions of the
    # last seed instead, which is what a fresh training run leaves behind.
    y_test_predictions = all_y_predictions[max(all_y_predictions)]


buffer = StringIO()


def _emit(text):
    """Print one report line and append it, newline-terminated, to the report buffer."""
    print(text)
    buffer.write(text + '\n')


def _mean_std(label, scores):
    """Format '<label> Mean: m, Standard Deviation: s', each passed through normal_round(..., 2)."""
    return f"{label} Mean: {normal_round(np.mean(scores),2)}, Standard Deviation: {normal_round(np.std(scores),2)}"


_emit(f"Results for {title}...\n")

_emit("============================== Overall F1 ==============================")
_emit(_mean_std("Overall Weighted", all_scores['overall_weighted']))
_emit(_mean_std("Overall Macro", all_scores['overall_macro']))

_emit("\n============================= Grouped by Region =============================")

for cls in unique_regions:
    _emit(f"\n============================== {cls} ==============================")
    _emit(_mean_std("None-Average F1", all_scores[cls]['none-average']))
    _emit(_mean_std("Group-By F1", all_scores[cls]['group-by']))

_emit("\n============================= Grouped by Image Category =============================")

for cls in unique_categories:
    _emit(f"\n============================== {cls} ==============================")
    _emit(_mean_std("Weighted F1", all_scores_category[cls]['weighted']))
    _emit(_mean_std("Macro F1", all_scores_category[cls]['macro']))
    # Expected to match the weighted F1 above (same rows, same averaging).
    _emit(_mean_std("Group-By F1", all_scores_category[cls]['group-by']))


# Upload the summary with the S3 client that is already in use in this notebook.
s3_key = f'/data/outputs_50/model_outputs/{model_output_dir}/{annotations_used_underscore}_clsf_final_results.txt'
if run_code:
    s3.put_object(Bucket=bucket_name, Key=s3_key, Body=buffer.getvalue())
    print(f"\nFinal results file created and written successfully into {s3_key}...")

In [None]:
# Scores as used for the report above (reloaded from S3 when run_code is False).
all_scores

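### Diagnostics for the last run only

When the training loop above has been run, `y_test_predictions` holds only the final seed's predictions, so the cells below describe that single run rather than an average over all seeds.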

In [None]:
# Confusion matrix of the test-set predictions, drawn as a heatmap.
class_labels = np.unique(subset_y)

# Pin the label set and order explicitly. Without `labels=`, the matrix only
# covers labels present in y_test / y_test_predictions, which would no longer
# line up with `class_labels` if a region were missing from the test split.
cm = confusion_matrix(y_test, y_test_predictions, labels=class_labels)
cm_df = pd.DataFrame(cm, index=class_labels, columns=class_labels)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm_df, annot=True, cmap="Blues", fmt="d", ax=ax)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('Confusion Matrix')

# Optional: save the figure next to the other classification plots.
# fig.savefig(f'clsf_figs/clsf_conf_matrix_{annotations_used_underscore}.png')
plt.show()

In [None]:
# Print the classification report
print("Classification Report:")
print(classification_report(y_test, y_test_predictions))
print("f1 per class: ", f1_score(y_test, y_test_predictions, average=None))

print("weighted f1: ", f1_score(y_test, y_test_predictions, average='weighted'))

s3_file_key = f"/data/outputs_50/model_outputs/{model_output_dir}/{annotations_used_underscore}_clsf_report.txt"

# Generate classification report and F1 scores
classification_report_str = "Classification Report:\n"
classification_report_str += classification_report(y_test, y_test_predictions) + "\n"
classification_report_str += "f1 per class: " + str(f1_score(y_test, y_test_predictions, average=None)) + "\n"
classification_report_str += "weighted f1: " + str(f1_score(y_test, y_test_predictions, average='weighted')) + "\n"

# Upload the report to S3
# s3.put_object(
#     Bucket=bucket_name,
#     Key=s3_file_key,
#     Body=classification_report_str,
#     ContentType='text/plain'
# )

# print(f"Classification report saved to S3 at: s3://{bucket_name}/{s3_file_key}")


In [None]:
# For each region, count the test rows of that region that were predicted as that region.
for region in np.unique(y_test):
    in_region = y_test == region
    hits = y_test_predictions[in_region] == region
    positive_predictions = sum(hits)
    print(f"Region: {region}, Positive Predictions: {positive_predictions}")

# NOTE(review): the true label doubles as the sensitive feature here — confirm
# that this grouping (and a multi-class target) is what the metric is meant to receive.
parity_ratio = demographic_parity_ratio(y_test, y_test_predictions, sensitive_features=y_test)
print(f"Demographic parity: {parity_ratio}")

In [None]:
# Sanity check: shape of the true test labels.
y_test.shape

In [None]:
# Sanity check: shape of the predictions (compare with y_test.shape).
y_test_predictions.shape

In [None]:
from fairlearn.metrics import MetricFrame
from sklearn.metrics import accuracy_score

# Accuracy broken down by true region: the true labels serve both as the
# ground truth and as the grouping ("sensitive") feature.
y_true, y_pred = y_test, y_test_predictions
sensitive_features = y_true

mf = MetricFrame(
    metrics=accuracy_score,
    y_true=y_true,
    y_pred=y_pred,
    sensitive_features=sensitive_features,
)

# Accuracy for every region group.
print(mf.by_group)
# Accuracy gap across the region groups, as reported by MetricFrame.difference().
print(mf.difference())




In [None]:
def upload_directory(local_dir, bucket_name, s3_path, s3_client=None):
    """Upload every file below ``local_dir`` to S3, mirroring the folder layout.

    Args:
        local_dir: Local directory that is walked recursively.
        bucket_name: Name of the destination S3 bucket.
        s3_path: Key prefix under which the files are stored; each file's path
            relative to ``local_dir`` is appended to it.
        s3_client: Object exposing ``upload_file(local_path, bucket, key)``.
            Defaults to the notebook-level ``s3`` client when omitted.
    """
    import os  # local import keeps the function self-contained

    # Fall back to the notebook's shared client only when none was supplied.
    client = s3 if s3_client is None else s3_client

    for root, _dirs, files in os.walk(local_dir):
        for file in files:
            local_file_path = os.path.join(root, file)
            relative_path = os.path.relpath(local_file_path, local_dir)
            # S3 keys use forward slashes regardless of the local path separator.
            s3_file_path = os.path.join(s3_path, relative_path).replace("\\", "/")

            print(f"Uploading {local_file_path} to s3://{bucket_name}/{s3_file_path}")
            client.upload_file(local_file_path, bucket_name, s3_file_path)


# Local folder with the figures to upload.
local_directory = "clsf_figs/"  # Local directory to upload
# Destination key prefix inside the bucket for this run's outputs.
s3_directory = f"/data/outputs_50/model_outputs/{model_output_dir}/"  # S3 path where the directory will be uploaded

# Upload is disabled; uncomment to push the figures to S3.
# upload_directory(local_directory, bucket_name, s3_directory)

# END