In [None]:
# Parameters
# NOTE(review): looks like a papermill-style parameters cell — confirm how runs override these values.
# Include the "ml_object_embed" column in the model input features.
use_ml_obj = True
# Include the "ml_caption_embed" column in the model input features.
use_ml_capt = True
# Include the "human_embed" column in the model input features.
use_human_labels = False
# Worker count handed to GridSearchCV and to the joblib backend during the grid search.
n_jobs = 24
# S3 object key of the embeddings CSV that the notebook downloads and parses.
embeddings_path = "/data/outputs_50/finetuning_all-MiniLM-L12-v2_embeddings.csv"


# Script to Perform Income Regression Using Embeddings

Sources:
- https://imbalanced-learn.org/stable/ensemble.html#boosting

Author: Nardiena A. Pratama


In [None]:
# Folder name interpolated into every S3 output key (score pickles, results text file, scatter plots).
model_output_dir = "reg-v3"
# True: train the models and upload fresh results.
# False: the training cell's assert stops it, and the saved pickles are read back from S3 instead.
run_code = False

In [None]:
!pip install imblearn fairlearn joblib seaborn
!pip install wordsegment autocorrect 
!pip install spacy==3.8.0
!pip install -U kaleido
!python -m spacy download en_core_web_trf

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import joblib
import os
import boto3
import pandas as pd
from io import StringIO, BytesIO


from imblearn.ensemble import RUSBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix, classification_report, root_mean_squared_error, r2_score

from fairlearn.metrics import demographic_parity_ratio, equalized_odds_ratio


from helper_scripts.preprocess import *
from helper_scripts.utility_functions import *

## Set AWS Credentials

Set the S3 bucket name in the cell below by replacing the placeholder after `BUCKET_NAME=`. Do not put quotation marks around the value.

In [None]:
# NOTE(review): `aws_bucket_name` looks like a placeholder — set the real bucket name before running.
# The value is read back later with os.getenv('BUCKET_NAME').
%env BUCKET_NAME=aws_bucket_name

## Connect to AWS

In [None]:
# Create a session using the default credentials (IAM role attached to the instance)
session = boto3.Session()

# S3 client shared by the download and upload cells of this notebook
s3 = session.client('s3')

# The bucket name comes from the BUCKET_NAME environment variable.
bucket_name = os.getenv('BUCKET_NAME')
if not bucket_name:
    # Fail here with a clear message instead of letting a None bucket reach boto3
    # and surface as an obscure parameter-validation error in a later cell.
    raise RuntimeError(
        "Environment variable BUCKET_NAME is not set; set it before connecting to S3."
    )


In [None]:
# Pair every feature flag with the embedding column it switches on. The order of
# this table fixes the order in which the embeddings are concatenated later.
_annotation_flags = (
    (use_ml_obj, "ml_object_embed"),
    (use_ml_capt, "ml_caption_embed"),
    (use_human_labels, "human_embed"),
)
ANNOTATIONS = [column for enabled, column in _annotation_flags if enabled]

In [None]:
# Show which embedding columns were selected by the feature flags.
print(ANNOTATIONS)

In [None]:
# Download the embeddings CSV from S3 and parse it into a DataFrame.
response = s3.get_object(Bucket=bucket_name, Key=embeddings_path)
csv_content = response['Body'].read().decode('utf-8')
data = pd.read_csv(StringIO(csv_content))

# Read the embedding columns as arrays. Series.apply hands each cell straight to the
# converter, avoiding the per-row Series that DataFrame.apply(axis=1) would build.
for embed_col in ("ml_object_embed", "ml_caption_embed", "human_embed"):
    data[embed_col] = data[embed_col].apply(convert_str_to_array)
data

In [None]:
# Keep only the columns used downstream; .copy() detaches the result from `data`
# so that adding columns later does not touch the loaded frame.
subset_columns = [
    'id',
    'category',
    'ml_object_embed',
    'ml_caption_embed',
    'human_embed',
    'region',
    'country',
    'income',
]
subset = data.loc[:, subset_columns].copy()
subset

In [None]:
def _concat_selected_embeddings(row):
    """Join the row's embeddings from every column listed in ANNOTATIONS into one vector."""
    selected = [row[column] for column in ANNOTATIONS]
    return np.concatenate(selected)


subset['embeddings'] = subset.apply(_concat_selected_embeddings, axis=1)
subset

In [None]:
# Flatten every per-row embedding and stack them into one feature matrix
# (one row per sample).
subset_X = np.stack([np.array(vector).flatten() for vector in subset['embeddings']])

# Target and grouping labels are kept as single-column DataFrames so they carry
# the original index through the train/test split.
subset_y = subset.loc[:, ['income']]
regions = subset.loc[:, ['region']]
categories = subset.loc[:, ['category']]

# Distinct group labels over the whole dataset (sorted by np.unique).
unique_regions = np.unique(regions.values)
unique_categories = np.unique(categories.values)


subset_X

In [None]:
# Split features, target and both grouping labels with one call so all of them
# share the same 80/20 partition (fixed by random_state=42).
split_parts = train_test_split(
    subset_X,
    subset_y,
    regions,
    categories,
    test_size=0.2,
    random_state=42,
)
(
    X_train, X_test,
    y_train, y_test,
    regions_train, regions_test,
    cats_train, cats_test,
) = split_parts

In [None]:
# Number of samples per category across the full dataset (before the train/test split).
categories.value_counts()

In [None]:
# Number of samples per category in the test split.
cats_test.value_counts()

In [None]:
# Training-set target next to its region and category labels, for quick inspection.
# All three frames come from the same train_test_split call, so they share one index
# and pd.concat(axis=1) aligns them row by row.
concatenated_df = pd.concat([y_train, regions_train, cats_train], axis=1)

# The concatenated frame has exactly these three columns; the income column is renamed to 'target'.
concatenated_df.columns = ['target', 'region', 'category']

concatenated_df

In [None]:
# Summary statistics of the training target for rows whose region is 'Asia'.
asia_rows = concatenated_df['region'] == 'Asia'
concatenated_df.loc[asia_rows, 'target'].describe()

In [None]:
# Histogram of the training target for rows whose region is 'The Americas'.
americas_rows = concatenated_df['region'] == 'The Americas'
concatenated_df.loc[americas_rows, 'target'].hist()

## Use AdaBoost Regression



In [None]:
%%time

assert run_code == True, "Run code is set to False! Change value to run code below."
n_runs = 10
random_seeds = list(range(n_runs))

all_y_predictions = dict()

all_scores = dict()
all_scores['overall'] = {'rmse': [], 'r2': []}


for label in unique_regions:
    all_scores[label] = {'rmse': [], 'r2': []}

all_scores_category = dict()


for curr_cat in unique_categories:
    all_scores_category[curr_cat] = {'rmse': [], 'r2': []}

for seed in random_seeds:
    print(f"Seed: {seed}")
    parameters = {
        'n_estimators': list(range(50,150,50)),
        'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.5, 1.0],
        'loss': ['linear', 'square', 'exponential']
    }
    dt_base_learner = DecisionTreeRegressor(random_state = seed)

    adaboostreg = AdaBoostRegressor(estimator=dt_base_learner, 
                                random_state=seed)
    reg = GridSearchCV(adaboostreg, parameters, scoring="neg_mean_squared_error", return_train_score=True, n_jobs=n_jobs)
    with joblib.parallel_backend(backend='loky', n_jobs=n_jobs):
        reg.fit(X_train, y_train.values.ravel())
    print("Grid Search CV done...")
    print("Training based on best parameters...")
    real_adaboost = AdaBoostRegressor(estimator=DecisionTreeRegressor(
                                    random_state=seed),
                   learning_rate=reg.best_estimator_.learning_rate,                     
                   n_estimators=reg.best_estimator_.n_estimators, 
                   loss=reg.best_estimator_.loss,
                   random_state=seed)
    real_adaboost.fit(X_train, y_train.values.ravel())
    print("Model has been fitted!")
    y_test_predictions = real_adaboost.predict(X_test)
    all_y_predictions[seed] = y_test_predictions
    overall_rmse = root_mean_squared_error(y_test.values.ravel(), y_test_predictions)
    all_scores['overall']['rmse'].append(overall_rmse)
    overall_r2 = r2_score(y_test.values.ravel(), y_test_predictions)
    all_scores['overall']['r2'].append(overall_r2)
    

    # Calculate RMSE for each region
    for region in unique_regions:
        # Mask for the current region
        mask = regions_test == region

        class_rmse = root_mean_squared_error(y_test.values[mask],\
                                y_test_predictions[mask.values.ravel()])
        class_r2 = r2_score(y_test.values[mask],\
                                y_test_predictions[mask.values.ravel()])
        all_scores[region]['rmse'].append(class_rmse)
        all_scores[region]['r2'].append(class_r2)

    for curr_cat in unique_categories:
        # Mask for the current region
        mask = cats_test == curr_cat

        class_rmse = root_mean_squared_error(y_test.values[mask],\
                                y_test_predictions[mask.values.ravel()])
        class_r2 = r2_score(y_test.values[mask],\
                                y_test_predictions[mask.values.ravel()])
        all_scores_category[curr_cat]['rmse'].append(class_rmse)
        all_scores_category[curr_cat]['r2'].append(class_r2)
   
    
    


In [None]:
# Overall and per-region RMSE / R2 lists collected by the training loop.
all_scores

In [None]:
# Per-category RMSE / R2 lists collected by the training loop.
all_scores_category

In [None]:
# Test-set predictions keyed by random seed.
all_y_predictions

In [None]:
# Short tag for every enabled annotation source, always in this fixed order.
_flag_tags = (
    (use_ml_obj, 'ml_obj'),
    (use_ml_capt, 'ml_capt'),
    (use_human_labels, 'human_labels'),
)
parts = [tag for enabled, tag in _flag_tags if enabled]

# The underscore-joined form goes into the S3 file names; the space-separated
# form is the basis of the report title.
annotations_used_underscore = '_'.join(parts)
annotations_used = annotations_used_underscore.replace("_", " ")

In [None]:
title = annotations_used.title()

# Every output of this run lives under the same S3 key prefix.
output_prefix = f'/data/outputs_50/model_outputs/{model_output_dir}/{annotations_used_underscore}_reg'
s3_path_all_scores = f'{output_prefix}_all_scores.pickle'
s3_path_all_scores_category = f'{output_prefix}_all_scores_category.pickle'
s3_path_all_y_preds = f'{output_prefix}_all_y_predictions.pickle'

if run_code:
    # Fresh results from the training cell: persist them.
    upload_pickle_to_s3(s3, bucket_name, s3_path_all_scores, all_scores)
    upload_pickle_to_s3(s3, bucket_name, s3_path_all_scores_category, all_scores_category)
    upload_pickle_to_s3(s3, bucket_name, s3_path_all_y_preds, all_y_predictions)
else:
    # No training in this session: reload the results of an earlier run.
    # NOTE(review): these are pickles — only read them from a bucket you trust.
    all_scores = read_pickle_from_s3(s3, bucket_name, s3_path_all_scores)
    all_scores_category = read_pickle_from_s3(s3, bucket_name, s3_path_all_scores_category)
    all_y_predictions = read_pickle_from_s3(s3, bucket_name, s3_path_all_y_preds)

# Average the per-seed predictions in BOTH modes. Previously only the reload branch
# averaged, while a fresh run kept the last seed's predictions, so the scatter plot
# showed different things depending on run_code.
y_test_predictions = np.mean(np.stack(list(all_y_predictions.values())), axis=0)

# Text report: every line is printed and also collected for the upload below.
buffer = StringIO()


def _emit(line):
    """Print one report line and append it, newline-terminated, to the report buffer."""
    print(line)
    buffer.write(line + '\n')


def _emit_stats(scores):
    """Emit mean and standard deviation of the 'rmse' and 'r2' lists in ``scores``."""
    _emit(f"Mean RMSE: {normal_round(np.mean(scores['rmse']),2)}, Standard Deviation: {normal_round(np.std(scores['rmse']),2)}")
    _emit(f"Mean R2: {normal_round(np.mean(scores['r2']),2)}, Standard Deviation: {normal_round(np.std(scores['r2']),2)}")


_emit(f"Results for {title}...\n")

_emit("============================== Overall RMSE ==============================")
_emit_stats(all_scores['overall'])

_emit(f"\n============================= Grouped by Region =============================")
for cls in unique_regions:
    _emit(f"\n============================== {cls} ==============================")
    _emit_stats(all_scores[cls])

_emit(f"\n============================= Grouped by Image Category =============================")
for cls in unique_categories:
    _emit(f"\n============================== {cls} ==============================")
    _emit_stats(all_scores_category[cls])

# Upload the report only when the results were produced in this session; the
# existing `s3` client is reused rather than creating a second one.
s3_key = f'{output_prefix}_final_results.txt'
if run_code:
    s3.put_object(Bucket=bucket_name, Key=s3_key, Body=buffer.getvalue())
    print(f"\nFinal results file created and written successfully into {s3_key}...")


In [None]:
# Element-wise comparison of the predictions with their flattened copy.
# NOTE(review): looks like a leftover sanity check — confirm whether it is still needed.
y_test_predictions == y_test_predictions.flatten()

In [None]:
# Base-10 logarithm of the predictions; the result is only displayed, not stored.
# NOTE(review): looks like exploratory output — confirm whether it is still needed.
np.log10(y_test_predictions)

In [None]:
# Overrides the earlier setting so the `if run_code:` branch of the plotting cell below
# (the S3 upload of the figures) always runs.
# NOTE(review): this changes a configuration flag mid-notebook; re-running earlier cells
# afterwards will see True — confirm this is intended.
run_code = True

In [None]:
import plotly.express as px
import plotly.graph_objects as go

# One row per test sample: ground truth, prediction, and the country / continent
# looked up in `subset` through the test split's index.
df = pd.DataFrame({
    'Ground Truth': y_test.values.flatten(),
    'Predicted': y_test_predictions,
    'Country': subset.loc[y_test.index, 'country'],
    'Continent': subset.loc[y_test.index, 'region'],
}).reset_index(drop=True)

# Interactive scatter plot, coloured by continent
fig = px.scatter(df, x='Ground Truth', y='Predicted', color='Continent',
                 hover_data=['Country', 'Continent'], template='seaborn')

# The x-axis is logarithmic (set below), so both reference lines are evaluated on a
# dense log-spaced grid. With only the two end points, plotly would join them with a
# straight segment on the log axis, which is not the graph of a linear function.
min_gt_val = df['Ground Truth'].min()
max_gt_val = df['Ground Truth'].max()
line_x = np.geomspace(min_gt_val, max_gt_val, 100)

# Least-squares straight line: Predicted ~ Ground Truth
coefficients = np.polyfit(df['Ground Truth'], df['Predicted'], 1)
fig.add_trace(go.Scatter(x=line_x, y=np.polyval(coefficients, line_x), mode='lines',
                         line_shape='linear', name='Best Fit Line',
                         line=dict(color='blue'), line_dash='dot'))

# Identity line (perfect prediction)
fig.add_trace(go.Scatter(
    x=line_x,
    y=line_x,
    mode='lines',
    line=dict(color='red'),
    name='Identity Line'
))

# Human-readable names of the annotation sources used, for the plot title
title_components = []
if use_ml_obj:
    title_components.append("ML Objects")
if use_ml_capt:
    title_components.append("ML Captions")
if use_human_labels:
    title_components.append("Human Labels")

if len(title_components) > 2:
    title = ", ".join(title_components[:-1]) + ", and " + title_components[-1]
elif len(title_components) == 2:
    title = " and ".join(title_components)
else:
    title = title_components[0]

# Labels, title and legend
fig.update_layout(title=f'{title} Scatter Plot',
                  xaxis_title='Ground Truth',
                  yaxis_title='Predicted',
                  font_size=28,
                  showlegend=True,
                  hoverlabel=dict(bgcolor='white', font_size=12, font_family='Arial'),
                  legend=dict(title='Continent', font = dict(size = 19))
)

# Powers of 10 that cover the ground-truth range
log_ticks = [10**i for i in range(int(np.floor(np.log10(min_gt_val))),
                                  int(np.ceil(np.log10(max_gt_val))) + 1)]

fig.update_layout(
    xaxis=dict(
        type="log",
        tickvals=log_ticks  # Only show ticks at powers of 10
    ),
)

fig.update_traces(marker={'size': 8})

# NOTE(review): the upper bound is fixed at 10000, so any prediction above it is
# cut off from the view — confirm this is intended.
fig.update_layout(yaxis_range=[df['Predicted'].min() - 1000, 10000])

if run_code:
    # Render the figure once per format into an in-memory buffer and upload it.
    # Requires `kaleido` or `orca`.
    for image_format in ("png", "svg"):
        s3_file_path = f"/data/outputs_50/model_outputs/{model_output_dir}/scatter_plot_{annotations_used_underscore}_new.{image_format}"
        with BytesIO() as image_buffer:
            fig.write_image(image_buffer, format=image_format, width=1355, height=360, scale=2)
            s3.put_object(Body=image_buffer.getvalue(), Bucket=bucket_name, Key=s3_file_path)

        print(f"Plot saved to S3 at s3://{bucket_name}/{s3_file_path}")

fig.show()

# END