In [1]:
from network import get_sts_model
from peft import LoraConfig
# Runtime target, adapter rank and the checkpoint identifier passed to get_sts_model.
device = 'cuda'
rank = 16
model_path = 'ammarnasr/LoRa_LoRa_all-mpnet-base-v2_rank_16'

# LoRA adapter settings: alpha is derived from the rank (2x), and the adapter is
# attached to the modules named below.
peft_config = LoraConfig(
    r=rank,
    lora_alpha=rank * 2,
    lora_dropout=0.05,
    target_modules=['value', 'query', 'key', 'dense'],
)

# Build the model from the checkpoint with the adapter configuration above.
model = get_sts_model(model_path, device, peft_config)

Loading LoRa model From HuggingFace Hub...:  ammarnasr/LoRa_LoRa_all-mpnet-base-v2_rank_16
trainable params: 1499136 || all params: 110985600 || trainable%: 1.350748205172563


In [2]:
from modeltraining import get_train_eval_test_data, triplets_df_to_single_df
data_path = './dataset/data.csv'
train_df, val_df, test_df = get_train_eval_test_data(data_path)

# Flatten the validation triplets, then shuffle the rows. The shuffle is seeded
# so that a fresh "Restart & Run All" yields the same row order (and therefore
# the same downstream embeddings order and projection input) every time.
val_single_df = triplets_df_to_single_df(val_df)
val_single_df = val_single_df.sample(frac=1, random_state=42).reset_index(drop=True)
train_single_df = triplets_df_to_single_df(train_df)

Creating triplets:   0%|          | 0/14 [00:00<?, ?group/s]

In [None]:
from tqdm.auto import tqdm
import numpy as np
# Inputs for the embedding pass: the sentence texts and their group labels,
# both taken from the validation frame.
sentences = val_single_df['sentence'].tolist()
labels = val_single_df['group']

# Run the model one sentence at a time and collect the outputs as NumPy arrays.
embeddings = []
for sentence in tqdm(
    sentences,
    unit='sentence',
    desc='Generating embeddings',
):
    embedding = (
        model(sentence)
        .detach()
        .cpu()
        .numpy()
    )
    embeddings.append(embedding)

# Stack the per-sentence outputs and drop any singleton dimensions.
embeddings = np.array(embeddings).squeeze()

In [None]:
from sklearn.metrics.pairwise import cosine_distances
import numpy as np


In [None]:


def _unit_rows(matrix):
    """Scale each row to unit L2 norm; all-zero rows are left as zeros."""
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    return matrix / norms


def calculate_accuracy_from_embeddings(embeddings, labels):
    """Nearest-centroid accuracy of embeddings with respect to their group labels.

    Every group's centroid is the mean of its embeddings. Each embedding is
    then assigned to the group whose centroid has the smallest cosine distance,
    and the assignment is compared with the embedding's true label.

    Args:
        embeddings: 2-D array-like of shape (n_samples, dim).
        labels: 1-D array-like (list, NumPy array or pandas Series) holding one
            group label per row of ``embeddings``. Labels are matched to rows
            by position, so a pandas index is ignored. Any sortable label
            values are accepted; they need not be the integers 1..K.

    Returns:
        Accuracy as a percentage in the range [0, 100]. The value is also
        printed.

    Raises:
        ValueError: if ``embeddings`` is not 2-D, if ``labels`` does not have
            one entry per embedding, or if the input is empty.
    """
    embeddings = np.asarray(embeddings, dtype=float)
    labels = np.asarray(labels)

    if embeddings.ndim != 2:
        raise ValueError(
            f'embeddings must be 2-D (n_samples, dim), got shape {embeddings.shape}'
        )
    if labels.ndim != 1 or labels.shape[0] != embeddings.shape[0]:
        raise ValueError('labels must contain exactly one entry per embedding row')
    total = embeddings.shape[0]
    if total == 0:
        raise ValueError('cannot compute accuracy for an empty set of embeddings')

    # `group_of_row[i]` is the position of row i's label inside `groups`, so
    # centroid k always corresponds to groups[k] regardless of label values
    # or the order in which labels appear.
    groups, group_of_row = np.unique(labels, return_inverse=True)
    centroids = np.stack(
        [embeddings[group_of_row == k].mean(axis=0) for k in range(len(groups))]
    )

    # Smallest cosine distance is the same as largest cosine similarity.
    similarity = _unit_rows(embeddings) @ _unit_rows(centroids).T
    predicted = groups[np.argmax(similarity, axis=1)]

    correct = int(np.count_nonzero(predicted == labels))
    acc = correct / total
    print(f'Accuracy: {acc*100:.2f}%')
    return acc*100

In [None]:
# Score the validation embeddings against their group labels; the function
# prints the accuracy and returns it as a percentage.
calculate_accuracy_from_embeddings(embeddings, labels)

In [None]:
from sklearn.manifold import TSNE
# t-SNE is stochastic: fix the seed so the 2-D projection (and the plot built
# from it) is reproducible across kernel restarts.
embeddings_2d = TSNE(n_components=2, random_state=42).fit_transform(embeddings)

In [None]:
import matplotlib.pyplot as plt
def compare_sactter_plots(embeddings_2d_1, embeddings_2d_2, ids, save_fig_name=None, title1='bare model', title2='tuned model', cmap_name='tab20', show=True):
    """Scatter-plot one or two 2-D embedding sets, coloured by group id.

    Args:
        embeddings_2d_1: array of shape (n, 2) drawn in the first panel.
        embeddings_2d_2: optional array of shape (n, 2) drawn in a second panel
            underneath; pass ``None`` for a single-panel figure.
        ids: one group id per point, shared by both panels. Ids may be any
            hashable values (ints, strings, ...).
        save_fig_name: if given, path the figure is written to.
        title1: title of the first panel.
        title2: title of the second panel.
        cmap_name: name of the Matplotlib colormap the group colours are
            sampled from.
        show: whether to call ``plt.show()``.

    Returns:
        None.
    """
    id_values = list(ids)
    try:
        unique_ids = sorted(set(id_values))
    except TypeError:
        # Ids that cannot be ordered: fall back to order of first appearance.
        unique_ids = list(dict.fromkeys(id_values))

    # One discrete colour per group. The same mapping drives both the points
    # and the legend so the two can never disagree.
    palette = plt.get_cmap(cmap_name, max(len(unique_ids), 1))
    id_color_map = {value: palette(i) for i, value in enumerate(unique_ids)}
    point_colors = [id_color_map[value] for value in id_values]

    panels = [(embeddings_2d_1, title1)]
    if embeddings_2d_2 is not None:
        panels.append((embeddings_2d_2, title2))

    if len(panels) == 2:
        fig, axes = plt.subplots(2, figsize=(8, 10))
    else:
        fig, single_ax = plt.subplots(figsize=(8, 8))
        axes = [single_ax]

    for ax, (points, title) in zip(axes, panels):
        ax.scatter(points[:, 0], points[:, 1], c=point_colors)
        ax.set_title(title)

    legend_handles = [
        plt.Line2D([], [], marker='o', linestyle='', color=id_color_map[value],
                   markersize=5, label=value)
        for value in unique_ids
    ]
    fig.legend(handles=legend_handles, loc='center', bbox_to_anchor=(0.5, 1.05), ncol=3)
    plt.tight_layout()

    # Save before showing: once plt.show() returns the figure may already be
    # closed, which would write an empty image. bbox_inches='tight' keeps the
    # legend, which is anchored above the axes area, inside the saved file.
    if save_fig_name is not None:
        fig.savefig(save_fig_name, bbox_inches='tight')

    if show:
        plt.show()


In [None]:
# These embeddings were produced by the LoRA model loaded in the first cell,
# so replace the function's default 'bare model' panel title.
compare_sactter_plots(embeddings_2d, None, labels, title1='LoRA model (validation set)')