# Script to Train Embedding Model

In this notebook, we fine-tune a pretrained sentence transformer on sentence pairs built from the handwashing dataset.

Author: Nardiena A. Pratama

Sources Used:

- Source of notebook: https://github.com/huggingface/blog/blob/main/how-to-train-sentence-transformers.md
- Additional: https://colab.research.google.com/github/huggingface/blog/blob/main/notebooks/95_Training_Sentence_Transformers.ipynb#scrollTo=fwYA76vY2YbZ 


In [None]:
!pip install wordsegment autocorrect 
!pip install spacy==3.8.0

In [None]:
!python -m spacy download en_core_web_trf
!pip install wandb seaborn
!pip install accelerate==0.27.2
!pip install sentence-transformers

In [None]:
import os
import boto3
import pandas as pd
from io import StringIO
import wandb

from datasets import (
    Dataset, 
    DatasetDict
)
from sentence_transformers import (
    SentenceTransformer,
    InputExample

)
from sentence_transformers.losses import MultipleNegativesRankingLoss
from torch.utils.data import DataLoader
from torch.optim import *


from helper_scripts.preprocess import *


## Set AWS Credentials

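Set `BUCKET_NAME` to the name of your S3 bucket. Do not put quotation marks around the value: `%env` stores everything after the `=` verbatim, so quotes would become part of the bucket name.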

In [None]:
# Export the S3 bucket name into the kernel's environment; it is read back via
# os.getenv('BUCKET_NAME') when connecting to AWS.
# NOTE(review): "aws_bucket_name" looks like a placeholder — replace with the real bucket name.
%env BUCKET_NAME=aws_bucket_name

## Connect to AWS

In [None]:
# Build a boto3 session from the default credential chain (e.g. the IAM role
# attached to the instance) and derive an S3 client from it.
session = boto3.Session()
s3 = session.client('s3')

# The bucket name comes from the BUCKET_NAME environment variable. Fail here
# with a clear message instead of letting a None bucket surface later as an
# obscure parameter-validation error inside an S3 call.
bucket_name = os.getenv('BUCKET_NAME')
if not bucket_name:
    raise RuntimeError(
        "BUCKET_NAME is not set; run the %env cell above before connecting to S3."
    )

### Set up W&B

In [None]:
# Authenticate this session with Weights & Biases; the run itself is created
# later with wandb.init().
wandb.login()

## Read CSV containing ML and Human Annotations

In [None]:
# Fetch the combined ML/human annotation CSV from S3 and parse it into a DataFrame.
key = "repo/data/outputs_50/final_combined_ml_human.csv"

# The object body is raw bytes: decode as UTF-8 and hand pandas a text buffer.
csv_bytes = s3.get_object(Bucket=bucket_name, Key=key)['Body'].read()
data_df = pd.read_csv(StringIO(csv_bytes.decode('utf-8')))

data_df

### Create dataframe containing list of sentences for every image ID

In [None]:
# Emit one copy of each annotation row per item returned by
# create_combinations_from_list, storing that item (as a list) in a new
# "Combination Pair" column.
# create_combinations_from_list is already in scope from the star import in the
# imports cell, so it is not re-imported here.
# NOTE(review): presumably the helper yields size-2 combinations of the three
# listed columns — confirm against helper_scripts.preprocess.
expanded_data = [
    {**row.to_dict(), "Combination Pair": list(perm)}
    for _, row in data_df.iterrows()
    for perm in create_combinations_from_list(
        row, ['ml_labels', 'ml_captions', 'human_labels'], 2
    )
]

data_df_with_combinations = pd.DataFrame(expanded_data)
data_df_with_combinations

In [None]:
# Keep only the pair column and expose it under the name 'set'.
combined_data = (
    data_df_with_combinations[['Combination Pair']]
    .rename(columns={'Combination Pair': 'set'})
)
combined_data

In [None]:
# Training frame: a copy of combined_data (single 'set' column) so that the
# displayed combined_data frame is left as-is.
train_data = combined_data.copy()

### Convert dataframe to dataset object

In [None]:
# Convert the pandas frame to a Hugging Face Dataset and register it as the
# only split, 'train', of a DatasetDict.
train_dataset = Dataset.from_pandas(train_data)

dataset_dict = DatasetDict()
dataset_dict['train'] = train_dataset
dataset_dict

In [None]:
# Summarise the train split: size, the types involved, and the first record.
train_split = dataset_dict['train']
first_example = train_split[0]

print(f"- The Handwashing dataset has {train_split.num_rows} examples.")
print(f"- Each example is a {type(first_example)} with a {type(first_example['set'])} as value.")
print(f"- Examples look like this: {first_example}")


In [None]:
# Peek at one entry (index 1) of the 'set' column of the train split.
print(f"Examples look like this: {dataset_dict['train']['set'][1]}")

In [None]:
# Wrap every sentence pair of the 'set' column in an InputExample, the input
# type the DataLoader / loss below are fed with.
# The column is bound to its own name rather than to `train_data`, so the
# DataFrame that the "Convert dataframe to dataset object" cell reads is not
# overwritten and that cell stays re-runnable.
sentence_pairs = dataset_dict['train']['set']

train_examples = [
    InputExample(texts=[pair[0], pair[1]])
    for pair in sentence_pairs
]

In [None]:
# ======================= FINETUNED ========================
# Fine-tuning setup: pretrained "all-MiniLM-L12-v2" model, a DataLoader over
# train_examples, and a MultipleNegativesRankingLoss bound to the model.
# NOTE(review): no random seed is set before this point, so the shuffled batch
# order is presumably not reproducible from run to run — confirm.


model = SentenceTransformer("all-MiniLM-L12-v2")


# DataLoader settings; both values are also logged in the W&B run config.
batch_size=16
shuffle=True
train_dataloader = DataLoader(train_examples, shuffle=shuffle, batch_size=batch_size)
train_loss = MultipleNegativesRankingLoss(model)
num_epochs = 10
# len(train_dataloader) is the number of batches per epoch, so this is 10% of
# the total number of training steps (batches * epochs), not 10% of the data.
warmup_steps = int(len(train_dataloader) * num_epochs * 0.1)

## Optional: If using wandb

In [None]:
# Identifiers for this experiment; they are logged to W&B here and also used
# to build the path the model is saved under.
version = "v1"
training_type = "finetuning_all-MiniLM-L12-v2" # distilroberta_jan3_icwsm25

# Start a W&B run and record the hyperparameters / run metadata.
run = wandb.init(
    # Project the run is logged under.
    project="ICSWSM-2025-RnR",
    config={
        "batch_size__train_dataloader": batch_size,
        "shuffle__train_dataloader": shuffle,
        # len(train_dataloader) counts batches, not examples; the example
        # count is the size of the dataset the loader wraps.
        "num_examples__train_dataloader": len(train_dataloader.dataset),
        "num_batches__train_dataloader": len(train_dataloader),
        "epochs": num_epochs,
        "train_loss" : "MultipleNegativesRankingLoss",
        "version": version,
        "training_type": training_type
    },
)

## Fit Model

In [None]:
# Train with a single objective: the pair DataLoader together with its loss.
train_objectives = [(train_dataloader, train_loss)]

model.fit(
    train_objectives=train_objectives,
    epochs=num_epochs,
    warmup_steps=warmup_steps,
)

In [None]:
# Persist the fine-tuned model under models/<training_type>/<version>.
model_output_dir = f"models/{training_type}/{version}"
model.save_pretrained(model_output_dir)

## Upload Repository to AWS

In [None]:
# Push the local models/ directory to the bucket under the S3 prefix below.
# NOTE(review): upload_directory is not defined in this notebook — presumably it
# comes from the star import of helper_scripts.preprocess; confirm.
s3_directory = "repo/data/outputs_50/models/"  # destination prefix in S3
local_directory = "models/"  # local source directory

upload_directory(local_directory, bucket_name, s3_directory, s3)


## END