# Script to Convert Annotations to Embeddings


Author: Nardiena A. Pratama


In [None]:
# Install the notebook's dependencies into the current kernel environment.
# Only spaCy and sentence-transformers are pinned to exact versions; the
# other packages resolve to whatever pip currently considers latest.
!pip install wordsegment autocorrect 
!pip install spacy==3.8.0
# Download the spaCy English transformer pipeline.
!python -m spacy download en_core_web_trf
!pip install wandb seaborn 
!pip install 'sentence-transformers==3.0.1'


In [None]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

import seaborn as sns
import boto3
import pandas as pd
from io import StringIO, BytesIO
import re
from helper_scripts.utility_functions import *
from helper_scripts.preprocess import *

## Set AWS Credentials

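Set `BUCKET_NAME` to the name of your S3 bucket. Do not put quotation marks around the value: `%env` stores everything after `=` literally, so the quotes would become part of the bucket name.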

In [None]:
# Replace aws_bucket_name with the real bucket name; it is read back later
# via the BUCKET_NAME environment variable. Keep this comment on its own line.
%env BUCKET_NAME=aws_bucket_name

## Connect to AWS

In [None]:
# `os` is needed for the environment lookup below and is not imported in the
# notebook's import cell, so import it here to keep this cell self-contained.
import os

# Create a session using the default credentials (IAM role attached to the instance)
session = boto3.Session()

# Create an S3 client; it is reused by the download/upload cells further down.
s3 = session.client('s3')

# Read the bucket name from the BUCKET_NAME environment variable.
bucket_name = os.getenv('BUCKET_NAME')

# os.getenv returns None when the variable is unset; fail here with a clear
# message instead of letting a later S3 call fail on an invalid bucket.
if not bucket_name:
    raise RuntimeError(
        "BUCKET_NAME is not set. Run the '%env BUCKET_NAME=...' cell first."
    )

In [None]:
# Model locations
# ---------------
# local_model_dir: local directory that receives the model files.
# s3_model_path:   S3 path of the saved embedding model; it must match the
#                  location the fine-tuned model was saved to.
local_model_dir = "models/finetuning_all-MiniLM-L12-v2/v1"
s3_model_path = "/data/outputs_50/models/finetuning_all-MiniLM-L12-v2/v1"

# Fetch the model files from S3 into the local directory.
download_model_from_s3(
    bucket_name,
    s3_model_path,
    local_model_dir,
)


In [None]:
# Short name of the model; reused later to build output file names.
model_name = 'finetuning_all-MiniLM-L12-v2'

# Path of the fine-tuned checkpoint (5 epochs).
ft_minilm12 = 'models/finetuning_all-MiniLM-L12-v2/v1'

# Load the sentence-embedding model from the local model directory.
model = SentenceTransformer(local_model_dir)

print(model_name)


In [None]:
# Fetch the combined ML/human annotation CSV from S3 and parse it.
key = "/data/outputs_50/final_combined_ml_human.csv"
response = s3.get_object(Bucket=bucket_name, Key=key)

# The object body arrives as bytes; decode it as UTF-8 before parsing.
raw_bytes = response['Body'].read()
csv_content = raw_bytes.decode('utf-8')

labelled_data = pd.read_csv(StringIO(csv_content))

labelled_data

In [None]:
# Map each new embedding column to the text column it is computed from.
embedding_sources = {
    "ml_object_embed": "ml_labels",
    "ml_caption_embed": "ml_captions",
    "human_embed": "human_labels",
}

# Embed one text column at a time, row by row, with the loaded model.
for target_col, source_col in embedding_sources.items():
    # Bind source_col as a default so the lambda uses this iteration's column.
    labelled_data[target_col] = labelled_data.apply(
        lambda x, source_col=source_col: make_embedding(x[source_col], model),
        axis=1,
    )

labelled_data

In [None]:
# Spot-check: show the ML object labels of the third row (position 2).
labelled_data["ml_labels"].iloc[2]

In [None]:
# Serialise the DataFrame (including the embedding columns) to CSV in memory.
# NOTE(review): the embedding cells are written in their string form and are
# presumably parsed back by convert_str_to_array when reloaded. Confirm.
csv_buffer = StringIO()
labelled_data.to_csv(csv_buffer, index=False)

# S3 key the embeddings are stored under.
file_path = f"/data/outputs_50/{model_name}_embeddings.csv"

# Uploading overwrites the object in S3, so it is opt-in. It defaults to
# False to match the previous behaviour, where the upload was commented out.
UPLOAD_EMBEDDINGS = False

if UPLOAD_EMBEDDINGS:
    s3.put_object(Bucket=bucket_name, Key=file_path, Body=csv_buffer.getvalue())
    print(f"DataFrame saved as CSV and uploaded to {file_path} successfully.")
else:
    # Report the skip truthfully rather than claiming a successful upload.
    print(
        f"Upload skipped (UPLOAD_EMBEDDINGS is False); "
        f"{file_path} was not written to S3."
    )

In [None]:
# Embedding files from earlier runs, kept for reference:
#   scratch_distilroberta_jan3_icwsm25_embeddings.csv
#   finetuning_all-MiniLM-L6-v2_jan3_icwsm25_embeddings.csv
#   finetuning_all-MiniLM-L6-v2_jan7_icwsm25_embeddings.csv
#   all-distilroberta-v1_embeddings
key = f"/data/outputs_50/{model_name}_embeddings.csv"
print(key)

# Download the saved embeddings CSV for the current model and parse it.
response = s3.get_object(Bucket=bucket_name, Key=key)
raw_bytes = response['Body'].read()
csv_content = raw_bytes.decode('utf-8')

labelled_data_embed = pd.read_csv(StringIO(csv_content))

labelled_data_embed

In [None]:
# The embedding columns come back from the CSV as strings; convert each
# cell back into an array, one column at a time.
for embed_col in ("ml_object_embed", "ml_caption_embed", "human_embed"):
    # Bind embed_col as a default so the lambda uses this iteration's column.
    labelled_data_embed[embed_col] = labelled_data_embed.apply(
        lambda x, embed_col=embed_col: convert_str_to_array(x[embed_col]),
        axis=1,
    )

labelled_data_embed

In [None]:
# Sanity check: shape of the embedding stored under index label 0.
labelled_data_embed.loc[0, "ml_object_embed"].shape

## Flatten embeddings and add all into list for t-SNE visualisation

In [None]:
columns = ["ml_object_embed", "ml_caption_embed", "human_embed"]

# Build one list per embedding column; each entry is that row's embedding
# flattened to 1-D and converted to a plain list. Stacking the three lists
# gives an array whose first axis follows the order of `columns`.
tsne_arr = np.array([
    [embedding.reshape(-1).tolist() for embedding in labelled_data_embed[col]]
    for col in columns
])
tsne_arr.shape

In [None]:
# Unpack along the first axis, in the same order as `columns`:
# ML object labels, ML captions, human labels.
data_ml_obj, data_ml_capt, data_human_lab = tsne_arr

In [None]:
# Candidate values for each t-SNE hyper-parameter.
perplexities = [5, 10]
learning_rates = [10, 100, 200, 500, 1000]
n_iter = [250, 500, 1000]

# Stack the three annotation types so they share a single t-SNE space.
combined_data = np.vstack((data_ml_obj, data_ml_capt, data_human_lab))

# Every (perplexity, learning rate, iterations) combination, enumerated in
# the same order as three nested loops would visit them.
param_grid = [
    (perplexity, lr, iters)
    for perplexity in perplexities
    for lr in learning_rates
    for iters in n_iter
]

# Track the combination with the lowest Kullback-Leibler divergence.
best_score = float('inf')
best_params = {}

for perplexity, lr, iters in param_grid:
    tsne = TSNE(
        n_components=2,
        perplexity=perplexity,
        learning_rate=lr,
        max_iter=iters,
        random_state=42,
    )
    tsne_result = tsne.fit_transform(combined_data)

    # Lower KL divergence is treated as the better fit.
    kl_divergence = tsne.kl_divergence_
    if kl_divergence < best_score:
        best_score = kl_divergence
        best_params = dict(perplexity=perplexity, learning_rate=lr, n_iter=iters)

print(f"Best Parameters: {best_params} with KL Divergence: {best_score}")


In [None]:
# Settings history:
#   OLD - learning rate: 'auto', perplexity: 5
#   NEW - Best Parameters: {'perplexity': 5, 'learning_rate': 1000, 'n_iter': 1000} with KL Divergence: 0.30165356397628784

# Final projection, using the best grid-search parameters with PCA
# initialisation. The seed is 27 here (42 was used previously).
final_tsne_kwargs = dict(
    n_components=2,
    learning_rate=best_params['learning_rate'],
    init='pca',
    perplexity=best_params['perplexity'],
    max_iter=best_params['n_iter'],
    random_state=27,
)
tsne = TSNE(**final_tsne_kwargs)
tsne_result = tsne.fit_transform(combined_data)


In [None]:
# `os` is only needed to create the output directory before saving.
import os


def _tsne_frame(points, annotation_type):
    """Return a two-column t-SNE DataFrame tagged with its annotation type."""
    frame = pd.DataFrame(points, columns=['tsne-2d-one', 'tsne-2d-two'])
    frame['Annotation Type'] = annotation_type
    return frame


# Determine the split points for each dataset in the combined data
num_ml_obj = len(data_ml_obj)
num_ml_capt = len(data_ml_capt)
num_human_lab = len(data_human_lab)

# Split the tsne_result back into the original datasets; the rows were
# stacked in the order ML objects, ML captions, human labels.
tsne_ml_obj = tsne_result[:num_ml_obj]
tsne_ml_capt = tsne_result[num_ml_obj:num_ml_obj + num_ml_capt]
tsne_human_lab = tsne_result[num_ml_obj + num_ml_capt:]

df_ml_obj = _tsne_frame(tsne_ml_obj, 'ML Object Labels')
df_ml_capt = _tsne_frame(tsne_ml_capt, 'ML Captions')
df_human_lab = _tsne_frame(tsne_human_lab, 'Human Labels')

# ignore_index=True gives the combined frame a unique row index; keeping
# the per-frame indices would repeat every label three times.
df_tsne = pd.concat([df_ml_obj, df_ml_capt, df_human_lab], ignore_index=True)

plt.figure(figsize=(10,8))
plt.title('t-SNE Visualization of Embeddings', fontsize=20)

sns.scatterplot(
    x="tsne-2d-one", y="tsne-2d-two",
    hue="Annotation Type",
    palette=sns.color_palette(palette='bright', n_colors=3, desat=1),
    data=df_tsne,
    legend="full",
    alpha=0.7
)

# Set axis labels and font size
plt.xlabel("t-SNE Component 1", fontsize=16)
plt.ylabel("t-SNE Component 2", fontsize=16)

# Customize tick labels' font size
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)

# Customize legend font size; the legend is anchored below the axes.
plt.legend(title="Annotation Type", title_fontsize=18, fontsize=16,
           bbox_to_anchor=(0.5, -0.15), loc='upper center',
           markerscale=2)

# savefig does not create missing directories, so make sure figs/ exists.
os.makedirs("figs", exist_ok=True)

plt.savefig(f"figs/{model_name}_tsne_visualisation.png", dpi=300, bbox_inches='tight')
plt.savefig(f"figs/{model_name}_tsne_visualisation.svg", dpi=300, bbox_inches='tight')


In [None]:
# Push the locally saved figures to S3.
s3_directory = "/data/outputs_50/figs/"  # destination path in the bucket
local_directory = "figs/"                # local directory to upload

upload_directory(
    local_directory,
    bucket_name,
    s3_directory,
    s3,
)


In [None]:
# Plain-matplotlib version of the t-SNE scatter, one layer per annotation type.
plt.figure(figsize=(12,8))

# (points, colour, legend label) for each layer, drawn in this order.
scatter_layers = (
    (tsne_ml_obj, 'b', 'ML Object Labels'),
    (tsne_ml_capt, 'tab:orange', 'ML Captions'),
    (tsne_human_lab, 'g', 'Human Labels'),
)
for points, colour, layer_label in scatter_layers:
    plt.scatter(points[:, 0], points[:, 1], color=colour, label=layer_label, alpha=0.5)

plt.title('t-SNE Visualization of Columns')
plt.xlabel('t-SNE Dimension 1')
plt.ylabel('t-SNE Dimension 2')
plt.legend(title='Column')
plt.show()

In [None]:
# Show the first projected ML-object point as a single-row 2-D array.
tsne_ml_obj[0][np.newaxis, :]

In [None]:
import plotly.express as px
from sklearn.manifold import TSNE
import numpy as np
import pandas as pd

# Placeholder IDs, one per data row (replace with your actual IDs). They are
# sized from the points being plotted, so they cannot drift out of sync with
# another DataFrame.
ids = [f'ID_{i}' for i in range(len(tsne_ml_obj))]


def _hover_frame(points, column_label):
    """Return a t-SNE DataFrame tagged with its source column and row IDs."""
    frame = pd.DataFrame(points, columns=['tsne-2d-one', 'tsne-2d-two'])
    frame['Column'] = column_label
    # Row i of every split refers to the same data row, so the IDs are shared.
    frame['ID'] = ids
    return frame


df_ml_obj = _hover_frame(tsne_ml_obj, 'ML Object Labels')
df_ml_capt = _hover_frame(tsne_ml_capt, 'ML Captions')
df_human_lab = _hover_frame(tsne_human_lab, 'Human Labels')

# ignore_index=True gives the combined frame a unique row index.
df_tsne = pd.concat([df_ml_obj, df_ml_capt, df_human_lab], ignore_index=True)

# Interactive scatter: hovering over a point shows its ID and source column,
# while the raw t-SNE coordinates are hidden from the tooltip.
fig = px.scatter(df_tsne, x='tsne-2d-one', y='tsne-2d-two', color='Column',
                 hover_name='ID',
                 hover_data={'Column': True,
                             'tsne-2d-one': False, 'tsne-2d-two': False},
                 title='t-SNE Visualization with IDs')
fig.update_traces(textposition='top center')  # Adjust text position if needed
fig.show()


## Code Below is for Testing Purposes Only

In [None]:
# Scratch check: cosine similarity between one ML caption and one long
# human-label string, using the loaded model.
#
# Other inputs tried previously, kept for reference:
#   "a person is pouring water into a pink bucket"
#   "a blue bucket with a white substance"
#   "water tub,water,soil,bottle,dirty hands"
#   "person,sink"
#   "person,frisbee,fire tyrant,bowl"
#   "a woman is putting something in a bucket"
#   "water tub, water, soil, bottle, dirty hands"
caption_text = "a man is washing his hands in a sink"
human_label_text = "water,hand,wash,tap,soap,ring,washbasin,cloth,cleaning,make up,handwashing,save water,hyg enig,scope,tooth cleaning,face cleaning,water,sink,soup,home appliance,water pipe,hand wash,hand cleaning,save water,washing hands,vigorous washing,clean,with soap,sink,water,splash sounds,bar soap,water sound,some,man,tap,cleaning,ring,water,deep clean,use soap for handwash,rub our dirty hands,take time for handwash,wash hand for remove terms,healthy habits,good behaviours for future generation,cleaning hands with soap,teach cleaning method to others,wet hands with water,rub hands palm to palm,palm to palm with fingers interface,backs of fingers to opposing palms with fingers interlocked,clean well so you can eat well,clean hands healthy heart,handwashing good,hand washing and caring go together,all hands to the pump,let your fingers do the washing,be aware wash with care,clean hands can stop terms,hand hygiene makes me feel clean,wash your hands to kill terms,be aware wash with care,give soap a chance when washing your hands,water,sink,tap,hand,soap,towel,ring,hand wash,soap,water tap,hand wash,tissue,man,sink,water,weakness"

# Each side is a single-item batch.
sentence1 = [caption_text]
sentence2 = [human_label_text]

embedding1 = model.encode(sentence1)
embedding2 = model.encode(sentence2)

similarities = model.similarity(embedding1, embedding2)
print(similarities)

# END