In [33]:
# Install the notebook's dependencies. `%pip` installs into the environment of the
# running kernel; the extra in `datasets[s3]` is quoted so the shell cannot treat
# the square brackets as a glob pattern.
%pip install -U sagemaker
%pip install transformers
%pip install "datasets[s3]"
%pip install torch
%pip install sentence_transformers

[0mCollecting torch
Collecting sentence_transformers
  Using cached sentence_transformers-2.2.2-py3-none-any.whl
Collecting torch>=1.6.0 (from sentence_transformers)


In [34]:
import sys

import sagemaker.huggingface
import boto3
import sagemaker
from sagemaker.workflow.pipeline_context import PipelineSession

# Name used for the model package group.
model_package_group_name = "SimilarityTestModelRegisterName"

# Regular SageMaker session: source of the region and the default bucket.
sagemaker_session = sagemaker.session.Session()
default_bucket = sagemaker_session.default_bucket()
region = sagemaker_session.boto_region_name

# Pipeline session: handed to the processor below as its `sagemaker_session`.
pipeline_session = PipelineSession()

# Execution role passed to the jobs defined in this notebook.
role = sagemaker.get_execution_role()

## Processing step

In [35]:
from sagemaker.workflow.parameters import (
    ParameterInteger,
    ParameterString,
    ParameterFloat,
)
# S3 locations of the input CSVs.
# NOTE(review): both paths point at the same object - confirm that the batch/test
# data is really meant to be identical to the training data.
training_input_path = 's3://sagemaker-us-east-1-107408944800/samples/datasets/data.csv'
test_input_path = 's3://sagemaker-us-east-1-107408944800/samples/datasets/data.csv'

# Data-location parameters (default to the paths above).
input_data = ParameterString(name="InputData", default_value=training_input_path)
batch_data = ParameterString(name="BatchData", default_value=test_input_path)

# Infrastructure parameters.
processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1)
instance_type = ParameterString(name="TrainingInstanceType", default_value="ml.m4.xlarge")

# Model registration / evaluation parameters.
model_approval_status = ParameterString(name="ModelApprovalStatus", default_value="PendingManualApproval")
mse_threshold = ParameterFloat(name="MseThreshold", default_value=6.0)

In [36]:
# Create the local `code/` directory that the %%writefile cells below write their scripts into.
!mkdir -p code

In [37]:
%%writefile code/preprocessing.py

import os
import subprocess
import sys


def _pip_install(*packages):
    """Install ``packages`` with pip for the interpreter running this script.

    A failed install is reported on stderr rather than silently ignored; the
    script keeps going so that packages already present in the container image
    can still be imported.
    """
    for package in packages:
        # List-form argv bypasses the shell, so extras such as ``datasets[s3]``
        # can never be interpreted as a glob pattern.
        result = subprocess.run([sys.executable, '-m', 'pip', 'install', package])
        if result.returncode != 0:
            print(f'WARNING: pip install {package} exited with status {result.returncode}',
                  file=sys.stderr)


_pip_install('datasets[s3]', 'transformers', 'sentence_transformers', 'torch')

import pandas as pd
from sentence_transformers import InputExample
from tqdm.auto import tqdm
import random
import torch
from sklearn.model_selection import train_test_split
import numpy as np
import boto3


class TripletDataset(torch.utils.data.Dataset):
    """Triplet dataset that is batched and tokenized once, up front.

    Indexing returns one whole batch as ``(anchors, positives, negatives)``:
    the three tokenizer outputs for that batch's sentences.
    """

    def __init__(self, data, tokenizer, device='cpu', batch_size=10, shuffle=True, max_len=200):
        '''
        data: pandas dataframe with columns: ['triplet', 'positive_group', 'negative_group']
        tokenizer: tokenizer object from transformers library
        device: torch device
        batch_size: batch size for the dataloader
        shuffle: shuffle the data before batching
        max_len: length every sentence is padded / truncated to by the tokenizer
        '''
        self.data = data
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.device = device
        self.tokenizer = tokenizer
        self.max_len = max_len

        if shuffle:
            self.data = self.data.sample(frac=1).reset_index(drop=True)

        # Each entry of the 'triplet' column exposes its sentences as `.texts`
        # ([anchor, positive, negative]).
        triplet_texts = [
            item.texts
            for item in tqdm(self.data['triplet'].tolist(), unit='row', desc='Batching data')
        ]
        # All batches have the same size, so this is a regular
        # (n_batches, batch_size, 3) array of strings.
        self.batched_data = np.array(self._make_batches(triplet_texts, self.batch_size))

        self.batched_anchors = []
        self.batched_positives = []
        self.batched_negatives = []

        for i in tqdm(range(len(self.batched_data)), unit='batch', desc='Tokenizing data'):
            batch = self.batched_data[i]
            anchors = batch[:, 0].tolist()
            positives = batch[:, 1].tolist()
            negatives = batch[:, 2].tolist()
            encoded_anchors   = self.tokenizer(anchors  , padding='max_length', max_length=self.max_len, truncation=True, return_tensors='pt')
            encoded_positives = self.tokenizer(positives, padding='max_length', max_length=self.max_len, truncation=True, return_tensors='pt')
            encoded_negatives = self.tokenizer(negatives, padding='max_length', max_length=self.max_len, truncation=True, return_tensors='pt')
            self.batched_anchors.append(encoded_anchors)
            self.batched_positives.append(encoded_positives)
            self.batched_negatives.append(encoded_negatives)

    @staticmethod
    def _make_batches(triplet_texts, batch_size):
        """Split ``triplet_texts`` into lists of exactly ``batch_size`` triplets.

        A short final batch is topped up with triplets taken from the start of
        the data (wrapping around when there are fewer rows than missing
        slots). No extra batch is created when the data already divides
        evenly, and empty input yields no batches.
        """
        batches = [
            list(triplet_texts[start:start + batch_size])
            for start in range(0, len(triplet_texts), batch_size)
        ]
        if batches:
            shortfall = batch_size - len(batches[-1])
            batches[-1].extend(triplet_texts[k % len(triplet_texts)] for k in range(shortfall))
        return batches

    def __len__(self):
        """Number of batches."""
        return len(self.batched_data)

    def __getitem__(self, idx):
        """Return batch ``idx`` as (anchors, positives, negatives), moved to ``self.device``."""
        encoded_anchors   = self.batched_anchors[idx].to(self.device)
        encoded_positives = self.batched_positives[idx].to(self.device)
        encoded_negatives = self.batched_negatives[idx].to(self.device)
        return encoded_anchors, encoded_positives, encoded_negatives

def get_sentence_id_label_df(path='./dataset/data.csv'):
    """Read the CSV at ``path`` and return a frame with columns sentence / id / label.

    ``sentence`` comes from the ``question`` column, ``id`` from ``id``, and
    ``label`` is looked up per id via ``get_ids_to_labels_dict``.
    """
    raw = pd.read_csv(path)
    # The label lookup is built from the raw frame, before incomplete rows are dropped.
    label_by_id = get_ids_to_labels_dict(raw)
    cleaned = clean_data(raw)

    ids = cleaned['id'].tolist()
    return pd.DataFrame({
        'sentence': cleaned['question'].tolist(),
        'id': ids,
        'label': [label_by_id[row_id] for row_id in ids],
    })

def get_ids_to_labels_dict(data):
    """Map every ``id`` in ``data`` to a label.

    The label is the first string value in the ``node_name`` column of the
    id's rows; ids without any string ``node_name`` get ``'unknown_<id>'``.
    """
    mapping = {}
    for group_id, rows in data.groupby('id'):
        first_label = next(
            (value for value in rows['node_name'].tolist() if isinstance(value, str)),
            None,
        )
        mapping[group_id] = first_label if first_label is not None else f'unknown_{group_id}'
    return mapping

        


def get_dataset(bucket, data_key, downsample_flag=True):
    """Load ``s3://<bucket>/<data_key>`` as CSV, clean it and split it 80/20.

    Returns ``(train, test)``. Triplet creation, de-duplication and
    down-sampling are not applied here, so ``downsample_flag`` is accepted
    but currently unused.
    """
    source_uri = f's3://{bucket}/{data_key}'
    cleaned = clean_data(pd.read_csv(source_uri))
    train_split, test_split = train_test_split(cleaned, test_size=0.2)
    return train_split, test_split

def create_triplets(data):
    """Build (anchor, positive, negative) triplets from questions grouped by ``id``.

    For every ordered pair of distinct questions within a group (anchor,
    positive), one triplet is produced for each question of every other group
    (negative). Every question of a group, including the last one, is used as
    anchor and as positive.

    Args:
        data: DataFrame with at least the columns ``id`` and ``question``.

    Returns:
        DataFrame with columns ``triplet`` (``InputExample`` holding
        ``[anchor, positive, negative]``), ``positive_group`` and
        ``negative_group`` (the ids of the two groups involved).
    """
    # Materialise each group's questions once instead of rebuilding the
    # negative lists inside the innermost loops.
    questions_by_group = {
        name: group['question'].tolist() for name, group in data.groupby('id')
    }

    detailed_dict = {'triplet': [], 'positive_group': [], 'negative_group': []}
    for name, questions in tqdm(questions_by_group.items(), unit='group', desc='Creating triplets'):
        for i, anchor in enumerate(questions):
            for j, positive in enumerate(questions):
                if j == i:
                    continue
                for other_name, negatives in questions_by_group.items():
                    if other_name == name:
                        continue
                    for negative in negatives:
                        detailed_dict['triplet'].append(InputExample(texts=[anchor, positive, negative]))
                        detailed_dict['positive_group'].append(name)
                        detailed_dict['negative_group'].append(other_name)

    return pd.DataFrame(detailed_dict)

def downsample(data):
    """Balance ``data`` so that every ``positive_group`` keeps the same number of rows.

    Each group is randomly sampled (without replacement) down to the size of
    the smallest group; the result has a fresh 0..n-1 index. When ``data``
    contains no groups at all, an empty frame with the same columns is
    returned instead of raising.
    """
    group_sizes = data.positive_group.value_counts()
    if group_sizes.empty:
        return data.iloc[0:0].reset_index(drop=True)

    min_len = group_sizes.min()
    samples = [group.sample(min_len) for _, group in data.groupby('positive_group')]
    return pd.concat(samples).reset_index(drop=True)

def clean_data(data):
    """Return ``data`` without rows missing ``question`` or ``id``, re-indexed from 0."""
    required_columns = ['question', 'id']
    return data.dropna(subset=required_columns).reset_index(drop=True)

def remove_duplicates(data):
    """Drop rows whose ``triplet`` repeats an earlier row; the first occurrence is kept.

    Triplet objects that expose ``.texts`` are compared by the content of
    ``texts`` rather than by object identity, so two separately constructed
    triplets with the same sentences count as duplicates. Any other value is
    compared as-is. The result is re-indexed from 0.
    """
    def _dedup_key(item):
        texts = getattr(item, 'texts', None)
        return tuple(texts) if texts is not None else item

    keys = data['triplet'].map(_dedup_key)
    return data[~keys.duplicated()].reset_index(drop=True)

if __name__ == "__main__":
    # Root of the processing container's working directories.
    base_dir = "/opt/ml/processing"

    # NOTE(review): the data is read straight from S3 here, not from the
    # /opt/ml/processing/input mount declared by the processing step.
    train_split, test_split = get_dataset(
        bucket="sagemaker-us-east-1-107408944800",
        data_key="samples/datasets/data.csv",
    )

    # Write each split to the directory declared as its processing output.
    for frame, destination in (
        (train_split, f"{base_dir}/train/train.csv"),
        (test_split, f"{base_dir}/test/test.csv"),
    ):
        pd.DataFrame(frame).to_csv(destination)

Overwriting code/preprocessing.py


In [38]:
from sagemaker.sklearn.processing import SKLearnProcessor


# Version string passed to the scikit-learn processor.
framework_version = "1.2-1"

# Processor that runs code/preprocessing.py; it is bound to the pipeline
# session and its instance count is a pipeline parameter.
sklearn_processor = SKLearnProcessor(
    role=role,
    sagemaker_session=pipeline_session,
    base_job_name="sklearn-abalone-process",
    framework_version=framework_version,
    instance_count=processing_instance_count,
    instance_type="ml.m4.xlarge",
)

INFO:sagemaker.image_uris:Defaulting to only available Python version: py3


In [39]:
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep

# Input channel: the `input_data` pipeline parameter, mounted in the container.
raw_data_input = ProcessingInput(source=input_data, destination="/opt/ml/processing/input")

# Output channels: only train and test are produced (no validation output).
train_output = ProcessingOutput(output_name="train", source="/opt/ml/processing/train")
test_output = ProcessingOutput(output_name="test", source="/opt/ml/processing/test")

# The result of run() is handed to the ProcessingStep as its step arguments.
processor_args = sklearn_processor.run(
    code="code/preprocessing.py",
    inputs=[raw_data_input],
    outputs=[train_output, test_output],
)

preprocessing_step = ProcessingStep(name="SimilarityPreprocessing", step_args=processor_args)

## Training step

In [40]:
%%writefile code/train.py

import os
import subprocess
import sys


def _pip_install(*packages):
    """Install ``packages`` with pip for the interpreter running this script.

    A failed install is reported on stderr rather than silently ignored; the
    script keeps going so that packages already present in the container image
    can still be imported.
    """
    for package in packages:
        # List-form argv bypasses the shell, so extras such as ``datasets[s3]``
        # can never be interpreted as a glob pattern.
        result = subprocess.run([sys.executable, '-m', 'pip', 'install', package])
        if result.returncode != 0:
            print(f'WARNING: pip install {package} exited with status {result.returncode}',
                  file=sys.stderr)


# `scikit-learn` is the real distribution name; the `sklearn` name on PyPI is a
# deprecated placeholder.
_pip_install(
    'datasets[s3]',
    'transformers',
    'sentence_transformers',
    'torch',
    'scikit-learn',
    'peft',
    'wandb',
)

import os
import sys
import random
import csv
import gzip
import wandb
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from peft import LoraConfig, get_peft_model, PeftConfig, PeftModel
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_distances
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import InputExample, SentenceTransformer, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator




def calculate_dsiatances_from_embeddings(embeddings, labels):
    """Report mean cosine distances within each label group and from each group to the rest.

    Args:
        embeddings: one row vector per entry of ``labels``
            (assumes a 2-D numpy array - TODO confirm against the caller).
        labels: pandas Series of group labels, positionally aligned with
            ``embeddings``. Its index may be anything; rows are matched by
            position.

    Returns:
        dict with the per-group lists ``group_id``, ``inner_distance`` and
        ``across_distance`` plus the scalars ``average_inner_distance`` and
        ``average_across_distance`` (unweighted means over groups).
    """
    group_distances = []
    total_distances = []
    all_res = {'group_id': [], 'inner_distance': [], 'across_distance': []}

    # Boolean masks select rows by position, so the result does not depend on
    # the Series index matching the embedding row numbers.
    label_values = labels.to_numpy()

    for group in labels.unique():
        in_group = label_values == group

        # Mean of the full pairwise distance matrix within the group.
        group_embeddings = embeddings[in_group]
        group_distance = cosine_distances(group_embeddings).mean()
        group_distances.append(group_distance)

        # Mean distance from the group's members to every other sample.
        other_embeddings = embeddings[~in_group]
        total_distance = cosine_distances(group_embeddings, other_embeddings).mean()
        total_distances.append(total_distance)

        all_res['group_id'].append(group)
        all_res['inner_distance'].append(group_distance)
        all_res['across_distance'].append(total_distance)

    average_group_distance = sum(group_distances) / len(group_distances)
    average_total_distance = sum(total_distances) / len(total_distances)
    all_res['average_inner_distance'] = average_group_distance
    all_res['average_across_distance'] = average_total_distance

    print("Average inner_distance  within groups(The lower  the better):", average_group_distance)
    print("Average across_distance across groups(The higher the better):", average_total_distance)
    return all_res

def calculate_accuracy_from_embeddings(embeddings, labels):
    """Nearest-centroid accuracy of ``embeddings`` against ``labels``, in percent.

    Each label's centroid is the mean of its embeddings. A sample counts as
    correct when the centroid closest to it (cosine distance) belongs to the
    sample's own label. The predicted label is looked up from the centroid's
    position in ``labels.unique()``, so labels may be arbitrary values in any
    order.

    Args:
        embeddings: one row vector per entry of ``labels``
            (assumes a 2-D numpy array - TODO confirm against the caller).
        labels: pandas Series of labels, positionally aligned with ``embeddings``.

    Returns:
        Accuracy as a percentage (0-100); ``nan`` when there are no samples.
    """
    label_values = labels.to_numpy()
    if len(label_values) == 0:
        print('Accuracy: undefined (no samples)')
        return float('nan')

    unique_groups = np.asarray(labels.unique())
    avarage_group_embeddings = np.array(
        [embeddings[label_values == group].mean(axis=0) for group in unique_groups]
    )

    # One distance matrix for all samples instead of one call per sample.
    distances = cosine_distances(embeddings, avarage_group_embeddings)
    predicted = unique_groups[np.argmin(distances, axis=1)]

    acc = float(np.mean(predicted == label_values))
    print(f'Accuracy: {acc*100:.2f}%')
    return acc*100


def get_sent_embeddings(model, sentences):
    """Encode ``sentences`` with ``model.encode`` (progress bar enabled) and return the result."""
    embeddings = model.encode(sentences, show_progress_bar=True)
    return embeddings

def get_node_name(id, data_path=None):
    """Return the first non-null ``node_name`` recorded for ``id`` in the CSV at ``data_path``.

    Args:
        id: value to look up in the ``id`` column.
        data_path: CSV with ``id`` and ``node_name`` columns. Defaults to
            ``$SM_CHANNEL_TRAIN/train.csv``; the environment variable is read
            when the function is called, so importing this module does not
            require it to be set.

    Raises:
        IndexError: if no row with a non-null ``node_name`` has this ``id``.
    """
    if data_path is None:
        data_path = os.environ["SM_CHANNEL_TRAIN"] + "/train.csv"

    df = pd.read_csv(data_path)
    # Rows that have a node_name; a missing id among them is filled with 7.0.
    # NOTE(review): the 7.0 placeholder is carried over unchanged - confirm its meaning.
    df_labels = df.loc[df['node_name'].notnull(), ['id', 'node_name']].fillna(7.0)
    return df_labels[df_labels['id'] == id]['node_name'].values[0]

def get_vis_data(data_path = './TripletLoss/dataset/data.csv'):
    """Load the questions used for visualisation.

    Rows without ``question`` or ``id`` are dropped, and only ids that occur
    more than twice are kept.

    Returns:
        ``(questions, ids, node_names)`` - three parallel lists with one entry
        per kept row; ``node_names`` comes from ``get_node_name``.
    """
    df = pd.read_csv(data_path).dropna(subset=['question', 'id'])

    id_counts = df['id'].value_counts()
    frequent_ids = id_counts[id_counts > 2].index.tolist()
    df = df[df['id'].isin(frequent_ids)]

    questions = df['question'].tolist()
    ids = df['id'].tolist()

    # get_node_name reads the CSV on every call, so resolve each distinct id
    # once (in order of first appearance) and reuse the result per row.
    name_by_id = {
        unique_id: get_node_name(unique_id, data_path=data_path)
        for unique_id in dict.fromkeys(ids)
    }
    node_names = [name_by_id[row_id] for row_id in ids]

    return questions, ids, node_names

def get_2d_embeddings(model_path, questions):
    """Embed ``questions`` with the SentenceTransformer at ``model_path`` and project to 2-D with t-SNE."""
    encoder = SentenceTransformer(model_path)
    sentence_vectors = get_sent_embeddings(encoder, questions)
    projector = TSNE(n_components=2)
    return projector.fit_transform(sentence_vectors)



def compare_sactter_plots(embeddings_2d_1, embeddings_2d_2, ids, save_fig_name=None, title1='bare model', title2='tuned model', cmap_name='tab20', show=True):
    """Scatter-plot one or two sets of 2-D embeddings, coloured by id.

    Args:
        embeddings_2d_1: array of (x, y) points for the first panel.
        embeddings_2d_2: array of points for the second panel, or ``None`` for
            a single-panel figure.
        ids: one id per point; points sharing an id share a colour.
        save_fig_name: when given, the figure is written to this path.
        title1, title2: panel titles.
        cmap_name: matplotlib colormap the per-id colours are drawn from.
        show: whether to call ``plt.show()``.

    The legend has one entry per distinct id, labelled via ``get_node_name``,
    in the same colour as that id's points.
    """
    # Distinct ids in order of first appearance, so colours are assigned deterministically.
    unique_ids = list(dict.fromkeys(ids))
    colors = plt.get_cmap(cmap_name, len(unique_ids))
    id_color_map = {group_id: colors(i) for i, group_id in enumerate(unique_ids)}
    # Colour the points from the same map the legend uses so the two always agree.
    point_colors = [id_color_map[group_id] for group_id in ids]

    if embeddings_2d_2 is not None:
        fig, ax = plt.subplots(2, figsize=(8, 10))
    else:
        fig, ax = plt.subplots(figsize=(8, 8))
        ax = [ax]

    ax[0].scatter(embeddings_2d_1[:, 0], embeddings_2d_1[:, 1], c=point_colors)
    ax[0].set_title(title1)

    if embeddings_2d_2 is not None:
        ax[1].scatter(embeddings_2d_2[:, 0], embeddings_2d_2[:, 1], c=point_colors)
        ax[1].set_title(title2)

    legend_labels = [
        plt.Line2D([], [], marker='o', color=id_color_map[group_id], markersize=5, label=get_node_name(group_id))
        for group_id in unique_ids
    ]
    fig.legend(handles=legend_labels, loc='center', bbox_to_anchor=(0.5, 1.05), ncol=3)
    plt.tight_layout()

    # Save before showing: once show() returns the figure may already be
    # closed, and saving afterwards can produce an empty image.
    if save_fig_name is not None:
        # 'tight' keeps the legend, which is anchored above the axes, inside the saved image.
        fig.savefig(save_fig_name, bbox_inches='tight')

    if show:
        plt.show()



def show_comparison_plot(tuned_model_name='./TripletLoss/models/sbert_model', show=True):
    """Plot t-SNE projections of the bare and the tuned model side by side.

    The figure is saved as ``./figs/<last path component of tuned_model_name>.png``;
    the ``./figs`` directory is created when missing.

    Args:
        tuned_model_name: path or name of the fine-tuned SentenceTransformer.
        show: forwarded to ``compare_sactter_plots``.
    """
    bare_model_name = 'all-MiniLM-L6-v2'

    # NOTE(review): node_names is not used here; the legend labels are looked
    # up again inside compare_sactter_plots using get_node_name's default path.
    questions, ids, node_names = get_vis_data()

    bare_embeddings_2d = get_2d_embeddings(bare_model_name, questions)
    tuned_embeddings_2d = get_2d_embeddings(tuned_model_name, questions)

    # Saving fails if the target directory does not exist.
    os.makedirs('./figs', exist_ok=True)
    save_fig_name = f'./figs/{tuned_model_name.split("/")[-1]}.png'
    compare_sactter_plots(bare_embeddings_2d, tuned_embeddings_2d, ids,save_fig_name=save_fig_name, title1= 'bare model', title2 = 'tuned model', cmap_name='tab10', show=show)


def get_sts_benchmark_data():
    """Return the test split of the STS benchmark as a list of ``InputExample``.

    The gzipped TSV is downloaded to ``./TripletLoss/dataset`` when it is not
    already there. Each example holds the two sentences and the score divided
    by 5.0.
    """
    sts_dataset_path = './TripletLoss/dataset/stsbenchmark.tsv.gz'
    # Download dataset if needed
    if not os.path.exists(sts_dataset_path):
        util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz', sts_dataset_path)

    with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
        rows = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
        return [
            InputExample(
                texts=[row['sentence1'], row['sentence2']],
                label=float(row['score']) / 5.0,  # Normalize score to range 0 ... 1
            )
            for row in rows
            if row['split'] == 'test'
        ]


def print_sts_benchmark_scores(tuned_model_name='./TripletLoss/models/sbert_model', print_bare_model_scores=False):
    """Evaluate a model on the STS benchmark test split and log the score.

    The tuned model's result is printed and appended to
    ``sts_benchmark_scores.csv`` (a header line is written first when the file
    does not exist yet). With ``print_bare_model_scores`` the
    ``all-MiniLM-L6-v2`` baseline is evaluated and printed as well, but not
    written to the file.
    """
    test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
        get_sts_benchmark_data(), batch_size=16, name='sts-test'
    )

    if print_bare_model_scores:
        bare_model_name = 'all-MiniLM-L6-v2'
        bare_model = SentenceTransformer(bare_model_name)
        print(f'{"="*10} {bare_model_name} Bare Model Reseluts {"="*10}')
        print(test_evaluator(bare_model))

    tuned_model = SentenceTransformer(tuned_model_name)
    print(f'{"="*10} {tuned_model_name} Tuned Model Reseluts {"="*10}')
    res = test_evaluator(tuned_model)
    print(res)

    # Append the result to the scores file, writing the header on first use.
    file_name = 'sts_benchmark_scores.csv'
    needs_header = not os.path.exists(file_name)
    with open(file_name, 'a') as f:
        if needs_header:
            f.write('model_name,socre\n')
        f.write(f'{tuned_model_name},{res}\n')

    print(f'Saved the results to {file_name}')

def get_average_distances(model_name = 'all-MiniLM-L6-v2'):
    """Compute mean cosine distances within and across id groups for one model.

    Reads ``./TripletLoss/dataset/data.csv``, embeds every question with the
    SentenceTransformer ``model_name`` and, per id, averages the pairwise
    distances inside the group and the distances to all other questions.

    Returns:
        ``(average_group_distance, average_total_distance, all_res)`` where
        ``all_res`` holds the per-group lists ``group_id``, ``group_distance``,
        ``total_distance`` and ``model_name``.
    """
    data_path = './TripletLoss/dataset/data.csv'

    # Keep only the two relevant columns, drop incomplete rows and renumber,
    # so that row labels equal row positions in the embedding matrix.
    df = (
        pd.read_csv(data_path)[['question', 'id']]
        .dropna()
        .reset_index(drop=True)
    )
    df.columns = ['sentence', 'label']

    model = SentenceTransformer(model_name)
    embeddings = model.encode(df['sentence'].tolist(), show_progress_bar=True)

    all_res = {
        'group_id': [],
        'group_distance': [],
        'total_distance': [],
        'model_name': []
    }

    for group in df['label'].unique():
        is_member = df['label'] == group
        group_embeddings = embeddings[df[is_member].index]

        # Pairwise cosine distances within the group.
        group_distance = cosine_distances(group_embeddings).mean()

        # Cosine distances from the group to every other sample.
        other_embeddings = embeddings[df[~is_member].index]
        total_distance = cosine_distances(group_embeddings, other_embeddings).mean()

        all_res['group_id'].append(group)
        all_res['group_distance'].append(group_distance)
        all_res['total_distance'].append(total_distance)
        all_res['model_name'].append(model_name)

    # Unweighted averages over groups.
    group_distances = all_res['group_distance']
    total_distances = all_res['total_distance']
    average_group_distance = sum(group_distances) / len(group_distances)
    average_total_distance = sum(total_distances) / len(total_distances)

    print("Average distance within groups:", average_group_distance)
    print("Average distance across groups:", average_total_distance)

    return average_group_distance, average_total_distance, all_res

def save_distances(tuned_model_name='./TripletLoss/models/sbert_model', print_bare_model_scores=False):
    """Compute the tuned model's average distances and append them to ``average_distances.csv``.

    Args:
        tuned_model_name: model passed to ``get_average_distances``.
        print_bare_model_scores: when true, the ``all-MiniLM-L6-v2`` baseline
            is evaluated first; its distances are printed by
            ``get_average_distances`` but not written to the file.
    """
    if print_bare_model_scores:
        bare_model_name = 'all-MiniLM-L6-v2'
        print(f'Caculating average distances for {bare_model_name}')
        get_average_distances(bare_model_name)
        print('='*10)

    print(f'Caculating average distances for {tuned_model_name}')
    _, _, tuned_res = get_average_distances(tuned_model_name)
    results_df = pd.DataFrame(tuned_res)

    file_name = 'average_distances.csv'
    if os.path.exists(file_name):
        # Append to earlier results. pd.concat replaces DataFrame.append,
        # which no longer exists in pandas 2.x.
        results_df = pd.concat([pd.read_csv(file_name), results_df], ignore_index=True)
    results_df.to_csv(file_name, index=False)

    print(f'Saved the results to {file_name}')


class TripletDataset(torch.utils.data.Dataset):
    """Triplet dataset that is batched and tokenized once, up front.

    Indexing returns one whole batch as ``(anchors, positives, negatives)``:
    the three tokenizer outputs for that batch's sentences.
    """

    def __init__(self, data, tokenizer, device='cpu', batch_size=10, shuffle=True, max_len=200):
        '''
        data: pandas dataframe with columns: ['triplet', 'positive_group', 'negative_group']
        tokenizer: tokenizer object from transformers library
        device: torch device
        batch_size: batch size for the dataloader
        shuffle: shuffle the data before batching
        max_len: length every sentence is padded / truncated to by the tokenizer
        '''
        self.data = data
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.device = device
        self.tokenizer = tokenizer
        self.max_len = max_len

        if shuffle:
            self.data = self.data.sample(frac=1).reset_index(drop=True)

        # Each entry of the 'triplet' column exposes its sentences as `.texts`
        # ([anchor, positive, negative]).
        triplet_texts = [
            item.texts
            for item in tqdm(self.data['triplet'].tolist(), unit='row', desc='Batching data')
        ]
        # All batches have the same size, so this is a regular
        # (n_batches, batch_size, 3) array of strings.
        self.batched_data = np.array(self._make_batches(triplet_texts, self.batch_size))

        self.batched_anchors = []
        self.batched_positives = []
        self.batched_negatives = []

        for i in tqdm(range(len(self.batched_data)), unit='batch', desc='Tokenizing data'):
            batch = self.batched_data[i]
            anchors = batch[:, 0].tolist()
            positives = batch[:, 1].tolist()
            negatives = batch[:, 2].tolist()
            encoded_anchors   = self.tokenizer(anchors  , padding='max_length', max_length=self.max_len, truncation=True, return_tensors='pt')
            encoded_positives = self.tokenizer(positives, padding='max_length', max_length=self.max_len, truncation=True, return_tensors='pt')
            encoded_negatives = self.tokenizer(negatives, padding='max_length', max_length=self.max_len, truncation=True, return_tensors='pt')
            self.batched_anchors.append(encoded_anchors)
            self.batched_positives.append(encoded_positives)
            self.batched_negatives.append(encoded_negatives)

    @staticmethod
    def _make_batches(triplet_texts, batch_size):
        """Split ``triplet_texts`` into lists of exactly ``batch_size`` triplets.

        A short final batch is topped up with triplets taken from the start of
        the data (wrapping around when there are fewer rows than missing
        slots). No extra batch is created when the data already divides
        evenly, and empty input yields no batches.
        """
        batches = [
            list(triplet_texts[start:start + batch_size])
            for start in range(0, len(triplet_texts), batch_size)
        ]
        if batches:
            shortfall = batch_size - len(batches[-1])
            batches[-1].extend(triplet_texts[k % len(triplet_texts)] for k in range(shortfall))
        return batches

    def __len__(self):
        """Number of batches."""
        return len(self.batched_data)

    def __getitem__(self, idx):
        """Return batch ``idx`` as (anchors, positives, negatives), moved to ``self.device``."""
        encoded_anchors   = self.batched_anchors[idx].to(self.device)
        encoded_positives = self.batched_positives[idx].to(self.device)
        encoded_negatives = self.batched_negatives[idx].to(self.device)
        return encoded_anchors, encoded_positives, encoded_negatives

def get_sentence_id_label_df(path=None):
    """Read the CSV at ``path`` and return a frame with columns sentence / id / label.

    ``sentence`` comes from the ``question`` column, ``id`` from ``id``, and
    ``label`` is looked up per id via ``get_ids_to_labels_dict``.

    Args:
        path: CSV to read. Defaults to ``$SM_CHANNEL_TRAIN/train.csv``; the
            environment variable is read when the function is called, so
            importing this module does not require it to be set.
    """
    if path is None:
        path = os.environ["SM_CHANNEL_TRAIN"] + "/train.csv"

    df = pd.read_csv(path)
    # The label lookup is built from the raw frame, before incomplete rows are dropped.
    ids_to_labels_dict = get_ids_to_labels_dict(df)
    df = clean_data(df)

    ids = df['id'].tolist()
    return pd.DataFrame({
        'sentence': df['question'].tolist(),
        'id': ids,
        'label': [ids_to_labels_dict[row_id] for row_id in ids],
    })

def get_ids_to_labels_dict(data):
    """Map every ``id`` in ``data`` to a label.

    The label is the first string value in the ``node_name`` column of the
    id's rows; ids without any string ``node_name`` get ``'unknown_<id>'``.
    """
    mapping = {}
    for group_id, rows in data.groupby('id'):
        first_label = next(
            (value for value in rows['node_name'].tolist() if isinstance(value, str)),
            None,
        )
        mapping[group_id] = first_label if first_label is not None else f'unknown_{group_id}'
    return mapping

def get_dataset(path=None, downsample_flag=True):
    """Load the training CSV and turn it into a de-duplicated triplet frame.

    Args:
        path: CSV to read. Defaults to ``$SM_CHANNEL_TRAIN/train.csv``; the
            environment variable is read when the function is called, so
            importing this module does not require it to be set.
        downsample_flag: when true, the triplets are balanced with
            ``downsample`` before being returned.

    Returns:
        DataFrame of triplets as produced by ``create_triplets``.
    """
    if path is None:
        path = os.environ["SM_CHANNEL_TRAIN"] + "/train.csv"

    df = clean_data(pd.read_csv(path))
    df = remove_duplicates(create_triplets(df))
    if downsample_flag:
        df = downsample(df)
    return df

def create_triplets(data):
    """Build (anchor, positive, negative) triplets from questions grouped by ``id``.

    For every ordered pair of distinct questions within a group (anchor,
    positive), one triplet is produced for each question of every other group
    (negative). Every question of a group, including the last one, is used as
    anchor and as positive.

    Args:
        data: DataFrame with at least the columns ``id`` and ``question``.

    Returns:
        DataFrame with columns ``triplet`` (``InputExample`` holding
        ``[anchor, positive, negative]``), ``positive_group`` and
        ``negative_group`` (the ids of the two groups involved).
    """
    # Materialise each group's questions once instead of rebuilding the
    # negative lists inside the innermost loops.
    questions_by_group = {
        name: group['question'].tolist() for name, group in data.groupby('id')
    }

    detailed_dict = {'triplet': [], 'positive_group': [], 'negative_group': []}
    for name, questions in tqdm(questions_by_group.items(), unit='group', desc='Creating triplets'):
        for i, anchor in enumerate(questions):
            for j, positive in enumerate(questions):
                if j == i:
                    continue
                for other_name, negatives in questions_by_group.items():
                    if other_name == name:
                        continue
                    for negative in negatives:
                        detailed_dict['triplet'].append(InputExample(texts=[anchor, positive, negative]))
                        detailed_dict['positive_group'].append(name)
                        detailed_dict['negative_group'].append(other_name)

    return pd.DataFrame(detailed_dict)

def downsample(data):
    """Balance ``data`` so that every ``positive_group`` keeps the same number of rows.

    Each group is randomly sampled (without replacement) down to the size of
    the smallest group; the result has a fresh 0..n-1 index. When ``data``
    contains no groups at all, an empty frame with the same columns is
    returned instead of raising.
    """
    group_sizes = data.positive_group.value_counts()
    if group_sizes.empty:
        return data.iloc[0:0].reset_index(drop=True)

    min_len = group_sizes.min()
    samples = [group.sample(min_len) for _, group in data.groupby('positive_group')]
    return pd.concat(samples).reset_index(drop=True)

def clean_data(data):
    """Return ``data`` without rows missing ``question`` or ``id``, re-indexed from 0."""
    required_columns = ['question', 'id']
    return data.dropna(subset=required_columns).reset_index(drop=True)

def remove_duplicates(data):
    """Drop rows whose ``triplet`` repeats an earlier row; the first occurrence is kept.

    Triplet objects that expose ``.texts`` are compared by the content of
    ``texts`` rather than by object identity, so two separately constructed
    triplets with the same sentences count as duplicates. Any other value is
    compared as-is. The result is re-indexed from 0.
    """
    def _dedup_key(item):
        texts = getattr(item, 'texts', None)
        return tuple(texts) if texts is not None else item

    keys = data['triplet'].map(_dedup_key)
    return data[~keys.duplicated()].reset_index(drop=True)




class STS_model(nn.Module):
    """Sentence-embedding module: a transformer encoder followed by masked mean pooling."""

    def __init__(self, model_path, device='cpu', pef_config=None, add_bert=True):
        """Store the settings and, when ``add_bert`` is true, load encoder and tokenizer.

        The encoder and tokenizer come from ``get_Bert_representations_model``;
        the encoder is put in training mode and moved to ``device``.
        """
        super().__init__()
        self.device = device
        self.add_bert = add_bert
        if not add_bert:
            return
        encoder, tokenizer = get_Bert_representations_model(model_path, pef_config)
        self.Bert_representations = encoder
        self.tokenizer = tokenizer
        self.Bert_representations.train(mode=True)
        self.Bert_representations.to(device)

    def mean_pooling(self, model_output, attention_mask):
        """Average the token embeddings of each sequence, counting only unmasked tokens."""
        # First element of model_output contains all token embeddings
        token_embeddings = model_output[0]
        mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        summed = torch.sum(token_embeddings * mask, 1)
        # The clamp guards against division by zero for fully masked rows.
        token_counts = torch.clamp(mask.sum(1), min=1e-9)
        return summed / token_counts

    def forward(self, model_input):
        """Return one pooled embedding per input sentence.

        ``model_input`` is either raw text (a string or a list of strings),
        which is tokenized here, or an already tokenized batch.
        """
        is_text = isinstance(model_input, str)
        is_text_list = isinstance(model_input, list) and isinstance(model_input[0], str)
        if is_text or is_text_list:
            encoded = self.tokenizer(model_input, padding=True, truncation=True, max_length=128, return_tensors="pt")
            model_input = encoded.to(self.device)
        encoder_output = self.Bert_representations(**model_input)
        return self.mean_pooling(encoder_output, model_input['attention_mask'])
    

def get_sts_model(model_path, device='cpu', pef_config=None):
    """Build an ``STS_model`` for ``model_path`` and return it after moving it to ``device``."""
    return STS_model(model_path, device, pef_config).to(device)


def get_Bert_representations_model(model_path, pef_config=None):
    """Load an encoder and its tokenizer for ``model_path``.

    Three cases, checked in this order:
    * ``model_path`` contains "lora" (case-insensitive): the path is treated
      as a PEFT adapter; its base model is loaded, the adapter is applied and
      all LoRA parameters are made trainable.
    * ``pef_config`` is given: the plain model is loaded and wrapped with
      ``get_peft_model``.
    * otherwise: the plain model is loaded unchanged.

    Returns:
        ``(model, tokenizer)``
    """
    if 'lora' in model_path.lower():
        # get LoRa model and lora config from model_path
        print('Loading LoRa model From HuggingFace Hub...: ', model_path)
        adapter_config = PeftConfig.from_pretrained(model_path)
        base_name = adapter_config.base_model_name_or_path
        bert = AutoModel.from_pretrained(base_name)
        tokenizer = AutoTokenizer.from_pretrained(base_name)
        bert = PeftModel.from_pretrained(bert, model_path)
        for param_name, parameter in bert.named_parameters():
            if 'lora' in param_name:
                parameter.requires_grad = True
        bert.print_trainable_parameters()
        return bert, tokenizer

    if pef_config is not None:
        print('Loading Peft model From HuggingFace Hub...: ', model_path)
        bert = AutoModel.from_pretrained(model_path)
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        bert = get_peft_model(bert, pef_config)
        bert.print_trainable_parameters()
        return bert, tokenizer

    print('Loading Bert model From HuggingFace Hub...: ', model_path)
    bert = AutoModel.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    return bert, tokenizer



def main(args_dict, use_argparse=False):
    """Fill in default training arguments, print them and call ``train``.

    Args:
        args_dict: overrides for the defaults below. The dict is not modified;
            the merged arguments live in a copy.
        use_argparse: accepted but currently unused.

    The default data paths are read from the ``SM_CHANNEL_TRAIN`` and
    ``SM_CHANNEL_TEST`` environment variables.
    """
    # NOTE(review): save_model_path defaults to a local './models/LoRa' here,
    # while train() itself defaults to $SM_MODEL_DIR - confirm which is intended.
    default_args_dict = {
        'model_path': 'ammarnasr/LoRa_all-MiniLM-L12-v1',
        'data_path': os.environ["SM_CHANNEL_TRAIN"] + "/train.csv",
        'device': 'cuda',
        'peft_config': None,
        'batch_size': 16,
        'lr': 1e-5,
        'triplet_loss': None,
        'num_epochs': 10,
        'max_len': 200,
        'eval_every': 150,
        'save_model_every': 500,
        'shuffle': True,
        'eval_data_path':  os.environ["SM_CHANNEL_TEST"] + "/test.csv",
        'save_model_path': './models/LoRa',
        'model_save_name': None,
        'wandb_project_name': "Similarity_api"
    }

    # Merge into a copy so the caller's dict is left untouched. Caller-supplied
    # keys come first, followed by the defaults that were missing.
    run_args = dict(args_dict)
    for key, value in default_args_dict.items():
        run_args.setdefault(key, value)

    #Pretty print args
    print('Arguments:')
    for key, value in run_args.items():
        print(f'{key}: {value}')

    train(**run_args)

    

def train(model_path, data_path= os.environ["SM_CHANNEL_TRAIN"]  + "/train.csv", device='cpu', peft_config=None,
          batch_size=16, lr=1e-5, triplet_loss=None, num_epochs=5, max_len=100,
          eval_every=100,save_model_every=1000, shuffle=True, eval_data_path= os.environ["SM_CHANNEL_TEST"],
          save_model_path= os.environ["SM_MODEL_DIR"], model_save_name=None, wandb_project_name=None):
    """Train a sentence-similarity model with a triplet loss and log to wandb.

    The model is obtained from ``get_sts_model`` and optimised with AdamW.
    Every ``eval_every`` steps (and at the end of each epoch) the evaluation
    sentences are embedded and distance/accuracy metrics are logged. Every
    ``save_model_every`` steps (and at the end of each epoch) the model's
    ``Bert_representations`` sub-model is saved with ``save_pretrained``.

    Args:
        model_path: Model identifier passed to ``get_sts_model``. Its last
            ``/``-separated component is the default ``model_save_name``.
        data_path: Training data location passed to ``get_dataset``.
        device: Device passed to ``get_sts_model`` and ``TripletDataset``.
        peft_config: Optional PEFT/LoRA configuration passed to
            ``get_sts_model``. When given, its ``r``, ``lora_alpha``,
            ``lora_dropout`` and ``target_modules`` are recorded in wandb.
        batch_size: Batch size passed to ``TripletDataset``.
        lr: Learning rate for AdamW.
        triplet_loss: Loss callable taking (anchor, positive, negative).
            Defaults to ``nn.TripletMarginLoss(margin=1.0, p=2)``.
        num_epochs: Number of passes over the training data.
        max_len: Maximum length passed to ``TripletDataset``.
        eval_every: Evaluate every this many global steps.
        save_model_every: Save the model every this many global steps.
        shuffle: Shuffle flag passed to ``TripletDataset``.
        eval_data_path: Location passed to ``get_sentence_id_label_df``;
            falls back to ``data_path`` when ``None``.
        save_model_path: Directory under which checkpoints are written.
        model_save_name: Name used in checkpoint paths.
        wandb_project_name: wandb project; defaults to
            ``f'{model_save_name}-tracking'``.

    Environment:
        WANDB_API_KEY: API key used to log in to Weights & Biases. When it is
            not set the run is recorded in wandb offline mode.
    """
    print('Loading model...')
    model = get_sts_model(model_path, device, peft_config)
    tokenizer = model.tokenizer
    optimizer = torch.optim.AdamW(params=model.parameters(), lr=lr)

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(save_model_path, exist_ok=True)
    if model_save_name is None:
        model_save_name = model_path.split('/')[-1]
    if triplet_loss is None:
        triplet_loss = nn.TripletMarginLoss(margin=1.0, p=2)
    if eval_data_path is None:
        eval_data_path = data_path
    if wandb_project_name is None:
        wandb_project_name = model_save_name+'-tracking'

    data_df = get_dataset(data_path)
    eval_data_df = get_sentence_id_label_df(eval_data_path)
    sentences = eval_data_df['sentence']
    labels = eval_data_df['id']

    print('Initializing wandb...')
    # Credentials must never live in source control: read the key from the
    # environment and degrade to offline logging when it is absent.
    wandb_api_key = os.environ.get("WANDB_API_KEY")
    if wandb_api_key:
        wandb.login(key=wandb_api_key)
        wandb.init(project=wandb_project_name)
    else:
        print('WANDB_API_KEY is not set, logging to wandb in offline mode')
        wandb.init(project=wandb_project_name, mode="offline")

    run_config = {
        'model_path': model_path,
        'data_path': data_path,
        'device': device,
        'batch_size': batch_size,
        'lr': lr,
        'triplet_loss': triplet_loss,
        'num_epochs': num_epochs,
        'max_len': max_len,
        'eval_every': eval_every,
        'save_model_every': save_model_every,
        'shuffle': shuffle,
        'eval_data_path': eval_data_path,
        'save_model_path': save_model_path,
        'model_save_name': model_save_name,
        'wandb_project_name': wandb_project_name
    }
    # peft_config defaults to None, so only read its attributes when present.
    if peft_config is not None:
        run_config.update(
            {
                'LoRa_Rank': peft_config.r,
                'LoRa_Alpha': peft_config.lora_alpha,
                'LoRa_Dropout': peft_config.lora_dropout,
                'LoRa_Target_Modules': peft_config.target_modules,
            }
        )
    wandb.config.update(run_config)
    wandb.watch(model)

    print('Training model...')
    epochs_tbar = tqdm(range(num_epochs), unit='epoch')
    steps = 0
    accuracy = 0
    for epoch in epochs_tbar:
        train_dataset = TripletDataset(data_df, tokenizer=tokenizer, device=device, batch_size=batch_size, shuffle=shuffle, max_len=max_len)
        num_batches = len(train_dataset)
        epoch_steps = 0
        accumelated_loss = 0
        batches_tbar = tqdm(train_dataset, unit='batch')
        for batch in batches_tbar:
            steps += 1
            epoch_steps += 1

            # batch holds the (anchor, positive, negative) inputs in that order.
            anchor = model(batch[0])
            positive = model(batch[1])
            negative = model(batch[2])
            loss = triplet_loss(anchor, positive, negative)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            loss_value = loss.item()
            accumelated_loss += loss_value

            batches_tbar.set_description(f'Batch {epoch_steps}/{num_batches} | Loss: {loss_value:.2f}')
            batches_tbar.refresh()

            epochs_tbar.set_description(f'Epoch {epoch+1}/{num_epochs} | Average loss: {accumelated_loss/epoch_steps:.2f} | Accuracy: {accuracy:.2f}')
            epochs_tbar.refresh()

            wandb.log({'loss': loss_value})

            # Evaluation and checkpointing also run on the last batch of every epoch.
            end_of_epoch = epoch_steps == num_batches

            if (steps % eval_every == 0) or end_of_epoch:
                print('Evaluating model')
                embeddings = []
                # No gradients are needed for evaluation; skipping autograd
                # bookkeeping keeps memory usage down.
                with torch.no_grad():
                    for sentence in tqdm(sentences, unit='sentence', desc='Generating embeddings'):
                        embedding = model(sentence).detach().cpu().numpy()
                        embeddings.append(embedding)
                embeddings = np.array(embeddings).squeeze()
                all_res = calculate_dsiatances_from_embeddings(embeddings, labels)
                average_inner_distance = all_res['average_inner_distance']
                average_across_distance = all_res['average_across_distance']
                accuracy = calculate_accuracy_from_embeddings(embeddings, labels)

                wandb.log({'average_inner_distance': average_inner_distance})
                wandb.log({'average_across_distance': average_across_distance})
                wandb.log({'accuracy': accuracy})

            if (steps % save_model_every == 0) or end_of_epoch:
                print('Saving model')
                lora_save_path = f'{save_model_path}/lora_{model_save_name}_{steps}'
                lora_model = model.Bert_representations
                lora_model.save_pretrained(lora_save_path)
                    

if __name__ == '__main__':
    # Command-line handling is switched off for now: the sys.argv length check
    # and the `main({}, use_argparse=True)` branch are disabled, so the script
    # always runs with main()'s defaults plus the LoRA configuration below.
    print('No arguments were given, using default arguments')

    rank = 64
    peft_config = LoraConfig(
        r=rank,
        lora_alpha=rank * 2,
        lora_dropout=0.05,
        target_modules=['value', 'query', 'key', 'dense'],
        inference_mode=False,
    )
    main({'peft_config': peft_config}, use_argparse=False)

Overwriting code/train.py


In [41]:
from sagemaker.huggingface import HuggingFace
from sagemaker.inputs import TrainingInput

# Hyperparameters forwarded to the training job.
# NOTE(review): the __main__ block of code/train.py has its argument-parsing
# branch commented out and always calls main() with built-in defaults, so
# these values look unused by the script - confirm before relying on them.
hyperparameters = {
    'epochs': 1,
    'train_batch_size': 32,
    'model_name': 'distilbert-base-uncased',
}

# Input channels come from the preprocessing step outputs. train.py reads
# train.csv / test.csv from these channels, so they are labelled as CSV.
s3_input_train = TrainingInput(
    s3_data=preprocessing_step.properties.ProcessingOutputConfig.Outputs["train"].S3Output.S3Uri,
    content_type="text/csv",
    s3_data_type="S3Prefix",
)

s3_input_test = TrainingInput(
    s3_data=preprocessing_step.properties.ProcessingOutputConfig.Outputs["test"].S3Output.S3Uri,
    content_type="text/csv",
    s3_data_type="S3Prefix",
)

similarity_estimator = HuggingFace(
    entry_point='train.py',
    source_dir='./code',
    instance_type='ml.p3.2xlarge',
    instance_count=1,
    role=role,
    transformers_version='4.26',
    pytorch_version='1.13',
    py_version='py39',
    sagemaker_session=pipeline_session,
    hyperparameters=hyperparameters,
)

# The estimator is bound to pipeline_session, and the value returned by fit()
# is handed to TrainingStep(step_args=...) in the next cell.
train_args = similarity_estimator.fit({'train': s3_input_train, 'test': s3_input_test})

In [42]:
from sagemaker.inputs import TrainingInput
from sagemaker.workflow.steps import TrainingStep


# Pipeline step that runs the training job described by train_args.
step_train = TrainingStep(name="SimilarityTrain", step_args=train_args)

In [43]:
from sagemaker.workflow.pipeline import Pipeline


# Assemble the pipeline: preprocessing followed by training, exposing the
# parameters declared at the top of the notebook.
pipeline_name = "SimilarityPipeline"
pipeline = Pipeline(
    name=pipeline_name,
    steps=[preprocessing_step, step_train],
    parameters=[
        processing_instance_count,
        instance_type,
        model_approval_status,
        input_data,
        batch_data,
        mse_threshold,
    ],
)

In [44]:
# Create the pipeline definition in SageMaker, or update it if it already exists.
pipeline.upsert(role_arn=role)

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.


Using provided s3_resource


INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.


Using provided s3_resource


{'PipelineArn': 'arn:aws:sagemaker:us-east-1:107408944800:pipeline/SimilarityPipeline',
 'ResponseMetadata': {'RequestId': 'a891c152-af47-4e30-9011-1beb2f08aa51',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'a891c152-af47-4e30-9011-1beb2f08aa51',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '86',
   'date': 'Thu, 29 Jun 2023 22:45:15 GMT'},
  'RetryAttempts': 0}}

In [45]:
# Start a pipeline execution and keep the handle so its status can be queried below.
execution = pipeline.start()

In [46]:
# Show the execution's metadata, including its current PipelineExecutionStatus.
execution.describe()

{'PipelineArn': 'arn:aws:sagemaker:us-east-1:107408944800:pipeline/SimilarityPipeline',
 'PipelineExecutionArn': 'arn:aws:sagemaker:us-east-1:107408944800:pipeline/SimilarityPipeline/execution/gu3sl7rh0o0d',
 'PipelineExecutionDisplayName': 'execution-1688078716039',
 'PipelineExecutionStatus': 'Executing',
 'CreationTime': datetime.datetime(2023, 6, 29, 22, 45, 15, 923000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2023, 6, 29, 22, 45, 15, 923000, tzinfo=tzlocal()),
 'CreatedBy': {'UserProfileArn': 'arn:aws:sagemaker:us-east-1:107408944800:user-profile/d-i0sjo3grk8wo/default-1686222155051',
  'UserProfileName': 'default-1686222155051',
  'DomainId': 'd-i0sjo3grk8wo'},
 'LastModifiedBy': {'UserProfileArn': 'arn:aws:sagemaker:us-east-1:107408944800:user-profile/d-i0sjo3grk8wo/default-1686222155051',
  'UserProfileName': 'default-1686222155051',
  'DomainId': 'd-i0sjo3grk8wo'},
 'ResponseMetadata': {'RequestId': '1d66b3c0-59c9-4337-aea2-cec89e8e53be',
  'HTTPStatusCode': 2

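Evaluation step (disabled: the cells in this section are commented-out template code that still refers to an XGBoost model and to `step_process`)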

In [47]:
# %%writefile code/evaluation.py
# import json
# import pathlib
# import pickle
# import tarfile

# import joblib
# import numpy as np
# import pandas as pd
# import xgboost

# from sklearn.metrics import mean_squared_error


# if __name__ == "__main__":
#     model_path = f"/opt/ml/processing/model/model.tar.gz"
#     with tarfile.open(model_path) as tar:
#         tar.extractall(path=".")

#     model = pickle.load(open("xgboost-model", "rb"))

#     test_path = "/opt/ml/processing/test/test.csv"
#     df = pd.read_csv(test_path, header=None)

#     y_test = df.iloc[:, 0].to_numpy()
#     df.drop(df.columns[0], axis=1, inplace=True)

#     X_test = xgboost.DMatrix(df.values)

#     predictions = model.predict(X_test)

#     mse = mean_squared_error(y_test, predictions)
#     std = np.std(y_test - predictions)
#     report_dict = {
#         "regression_metrics": {
#             "mse": {"value": mse, "standard_deviation": std},
#         },
#     }

#     output_dir = "/opt/ml/processing/evaluation"
#     pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

#     evaluation_path = f"{output_dir}/evaluation.json"
#     with open(evaluation_path, "w") as f:
#         f.write(json.dumps(report_dict))

In [48]:
# from sagemaker.processing import ScriptProcessor


# script_eval = ScriptProcessor(
#     image_uri=image_uri,
#     command=["python3"],
#     instance_type="ml.m5.xlarge",
#     instance_count=1,
#     base_job_name="script-abalone-eval",
#     role=role,
#     sagemaker_session=pipeline_session,
# )

# eval_args = script_eval.run(
#     inputs=[
#         ProcessingInput(
#             source=step_train.properties.ModelArtifacts.S3ModelArtifacts,
#             destination="/opt/ml/processing/model",
#         ),
#         ProcessingInput(
#             source=step_process.properties.ProcessingOutputConfig.Outputs["test"].S3Output.S3Uri,
#             destination="/opt/ml/processing/test",
#         ),
#     ],
#     outputs=[
#         ProcessingOutput(output_name="evaluation", source="/opt/ml/processing/evaluation"),
#     ],
#     code="code/evaluation.py",
# )

In [49]:
# from sagemaker.workflow.properties import PropertyFile


# evaluation_report = PropertyFile(
#     name="EvaluationReport", output_name="evaluation", path="evaluation.json"
# )
# step_eval = ProcessingStep(
#     name="AbaloneEval",
#     step_args=eval_args,
#     property_files=[evaluation_report],
# )

Create model (disabled: the cells in this section are commented out)

In [50]:
# from sagemaker.model import Model

# model = Model(
#     image_uri=image_uri,
#     model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
#     sagemaker_session=pipeline_session,
#     role=role,
# )

In [51]:
# from sagemaker.inputs import CreateModelInput
# from sagemaker.workflow.model_step import ModelStep

# step_create_model = ModelStep(
#     name="AbaloneCreateModel",
#     step_args=model.create(instance_type="ml.m4.large", accelerator_type="ml.eia1.medium"),
# )

Transform step (disabled: the cells in this section are commented out)

In [52]:
# from sagemaker.transformer import Transformer


# transformer = Transformer(
#     model_name=step_create_model.properties.ModelName,
#     instance_type="ml.m5.xlarge",
#     instance_count=1,
#     output_path=f"s3://{default_bucket}/AbaloneTransform",
# )

In [53]:
# from sagemaker.inputs import TransformInput
# from sagemaker.workflow.steps import TransformStep


# step_transform = TransformStep(
#     name="AbaloneTransform", transformer=transformer, inputs=TransformInput(data=batch_data)
# )

Register Model

In [54]:
# model = Model(
#     image_uri=image_uri,
#     model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
#     sagemaker_session=pipeline_session,
#     role=role,
# )

In [55]:
# from sagemaker.model_metrics import MetricsSource, ModelMetrics

# model_metrics = ModelMetrics(
#     model_statistics=MetricsSource(
#         s3_uri="{}/evaluation.json".format(
#             step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"]
#         ),
#         content_type="application/json",
#     )
# )

# register_args = model.register(
#     content_types=["text/csv"],
#     response_types=["text/csv"],
#     inference_instances=["ml.t2.medium", "ml.m5.xlarge"],
#     transform_instances=["ml.m5.xlarge"],
#     model_package_group_name=model_package_group_name,
#     approval_status=model_approval_status,
#     model_metrics=model_metrics,
# )
# step_register = ModelStep(name="AbaloneRegisterModel", step_args=register_args)

Define a Fail Step to Terminate the Pipeline Execution and Mark it as Failed

In [56]:
# from sagemaker.workflow.fail_step import FailStep
# from sagemaker.workflow.functions import Join

# step_fail = FailStep(
#     name="AbaloneMSEFail",
#     error_message=Join(on=" ", values=["Execution failed due to MSE >", mse_threshold]),
# )

Condition Step

In [57]:
# from sagemaker.workflow.conditions import ConditionLessThanOrEqualTo
# from sagemaker.workflow.condition_step import ConditionStep
# from sagemaker.workflow.functions import JsonGet


# cond_lte = ConditionLessThanOrEqualTo(
#     left=JsonGet(
#         step_name=step_eval.name,
#         property_file=evaluation_report,
#         json_path="regression_metrics.mse.value",
#     ),
#     right=mse_threshold,
# )

# step_cond = ConditionStep(
#     name="AbaloneMSECond",
#     conditions=[cond_lte],
#     if_steps=[step_register, step_create_model, step_transform],
#     else_steps=[step_fail],
# )

Define pipeline (disabled: the full template pipeline below is commented out)

In [58]:
# from sagemaker.workflow.pipeline import Pipeline


# pipeline_name = f"AbalonePipeline"
# pipeline = Pipeline(
#     name=pipeline_name,
#     parameters=[
#         processing_instance_count,
#         instance_type,
#         model_approval_status,
#         input_data,
#         batch_data,
#         mse_threshold,
#     ],
#     steps=[step_process, step_train, step_eval, step_cond],
# )

Submit and execute

In [59]:
# pipeline.upsert(role_arn=role)


In [60]:
# execution = pipeline.start()

In [61]:
# execution.describe()

In [62]:
# execution.list_steps()