In [6]:
# import os
# work_dir = '/content/drive/MyDrive/DeepFund/'
# os.chdir(work_dir)
# os.chdir('DeepFunding_project/TripletLoss')

from sklearn.metrics.pairwise import cosine_distances
from triplet_dataset import get_dataset, get_sentence_id_label_df, TripletDataset
from network import get_sts_model
from tqdm.auto import tqdm
import numpy as np
from model_evaluation import  compare_sactter_plots
from sklearn.manifold import TSNE
import torch
import os
import torch.nn as nn
from peft import LoraConfig


def calculate_dsiatances_from_embeddings(embeddings, labels):
    """Summarise how tightly each label group clusters in embedding space.

    For every distinct label the function computes
      * the mean pairwise cosine distance between members of the group, and
      * the mean cosine distance from members of the group to every sample
        carrying a different label,
    prints the two averages over all groups, and returns them together with
    the per-group values.

    Args:
        embeddings: 2-D array-like with one row per sample. Row ``i`` must
            describe the same sample as the ``i``-th entry of ``labels``.
        labels: Group label per sample. Expected to be a pandas Series (only
            ``.unique()`` and conversion to a numpy array are required).
            Alignment with ``embeddings`` is positional; the Series index is
            deliberately ignored so filtered / re-indexed data works too.

    Returns:
        Tuple ``(average_group_distance, average_total_distance, all_res)``
        where ``all_res`` is a dict of parallel lists keyed by ``'group_id'``,
        ``'group_distance'`` and ``'total_distance'``.

    Raises:
        ValueError: If ``labels`` is empty or its length differs from the
            number of embedding rows.
    """
    embeddings = np.asarray(embeddings)
    label_values = np.asarray(labels)

    if len(label_values) == 0:
        raise ValueError('labels is empty: no groups to compute distances for')
    if len(label_values) != len(embeddings):
        raise ValueError(
            f'embeddings has {len(embeddings)} rows but labels has '
            f'{len(label_values)} entries'
        )

    group_distances = []
    total_distances = []
    all_res = {
        'group_id': [],
        'group_distance': [],
        'total_distance': [],
    }

    for group in labels.unique():
        # Boolean masks select rows by position. Indexing the array with the
        # Series' index labels is only correct for a default RangeIndex.
        in_group = label_values == group
        group_embeddings = embeddings[in_group]
        other_embeddings = embeddings[~in_group]

        # Mean over the full pairwise matrix of the group.
        # NOTE(review): this includes the zero self-distances on the diagonal,
        # so small groups look tighter than they are. Kept as-is so values
        # stay comparable with earlier runs.
        group_distance = cosine_distances(group_embeddings).mean()

        # Mean distance from this group's members to everything outside it.
        # With a single group there is nothing to compare against.
        if len(other_embeddings) == 0:
            total_distance = float('nan')
        else:
            total_distance = cosine_distances(group_embeddings, other_embeddings).mean()

        group_distances.append(group_distance)
        total_distances.append(total_distance)
        all_res['group_id'].append(group)
        all_res['group_distance'].append(group_distance)
        all_res['total_distance'].append(total_distance)

    # Unweighted average over groups (every group counts equally).
    average_group_distance = sum(group_distances) / len(group_distances)
    average_total_distance = sum(total_distances) / len(total_distances)

    print("Average distance within groups:", average_group_distance)
    print("Average distance across groups:", average_total_distance)

    return average_group_distance, average_total_distance, all_res


def gen_vis_embeddings(no_peft=False, tsne=True):
    """Embed every sentence of ``VIS_DATA['sentence']`` with ``MODEL``.

    Args:
        no_peft: Placeholder for embedding with the bare (adapter-free)
            model. That path was never implemented, so requesting it raises
            instead of failing later on an unassigned variable.
        tsne: When true, project the embeddings to 2-D with t-SNE (for
            plotting). Pass ``False`` to get the raw model embeddings, e.g.
            for distance computations.

    Returns:
        numpy array with one row per sentence: ``(n_sentences, 2)`` when
        ``tsne`` is true, otherwise ``(n_sentences, embedding_dim)``.

    Raises:
        NotImplementedError: If ``no_peft`` is true.
    """
    if no_peft:
        raise NotImplementedError(
            'Embedding without the PEFT adapter (no_peft=True) is not implemented'
        )

    sentences = VIS_DATA['sentence'].tolist()

    # If MODEL exposes the nn.Module training flag, switch to eval mode while
    # embedding so dropout (LoRA dropout is configured) does not add noise,
    # then restore the previous mode. This function is called mid-training.
    was_training = getattr(MODEL, 'training', False)
    if was_training:
        MODEL.eval()
    try:
        # Outputs are detached anyway; no_grad just avoids building the graph.
        with torch.no_grad():
            embeddings = [
                MODEL(sentence).detach().cpu().numpy()
                for sentence in tqdm(sentences, unit='sentence', desc='Generating embeddings')
            ]
    finally:
        if was_training:
            MODEL.train()

    embeddings = np.array(embeddings)
    if len(sentences) > 0:
        # Keep the sample axis even for a single sentence; a bare squeeze()
        # would collapse it. Assumes each per-sentence output flattens to one
        # vector — TODO confirm against the model's output shape.
        embeddings = embeddings.reshape(len(sentences), -1)

    if tsne:
        embeddings = TSNE(n_components=2).fit_transform(embeddings)
    return embeddings


def main(batch_size=16):
    """Set up the optimizer, loss and schedule, then start training.

    Args:
        batch_size: Forwarded unchanged to ``train``.
    """
    # A bare-model (no adapter) clustering baseline used to run here before
    # training; it is disabled because gen_vis_embeddings(no_peft=True) has
    # no implementation.
    schedule = dict(num_epochs=5, eval_every=100, save_model_every=1000)

    optimizer = torch.optim.AdamW(MODEL.parameters(), lr=1e-5)
    triplet_loss = nn.TripletMarginLoss(margin=1.0, p=2)

    print('Training model...')
    train(batch_size, optimizer, triplet_loss, **schedule)


def train(batch_size,optimizer, triplet_loss, num_epochs, eval_every, save_model_every, plot_every=500):
    """Fine-tune ``MODEL`` on triplets with periodic evaluation, plots and checkpoints.

    A new ``TripletDataset`` is built at the start of every epoch. Each batch
    is an indexable of (anchor, positive, negative) model inputs.

    Args:
        batch_size: Batch size passed to ``TripletDataset``.
        optimizer: Optimizer already bound to ``MODEL``'s parameters.
        triplet_loss: Callable ``(anchor, positive, negative) -> scalar loss``.
        num_epochs: Number of passes over the dataset.
        eval_every: Every this many steps (counted within an epoch), print the
            running loss and the within/across-group cosine distances.
        save_model_every: Every this many steps, save the LoRA adapter under
            ``./models/LoRa``.
        plot_every: Every this many steps, draw the t-SNE scatter plot and
            restart the running-loss window. Defaults to the previously
            hard-coded 500.

    Note:
        The step counter restarts each epoch, so checkpoint paths repeat
        across epochs and a later epoch overwrites the earlier file for the
        same step.
    """
    for epoch in range(num_epochs):
        train_dataset = TripletDataset(DATA, tokenizer=TOKENIZER, device=DEVICE, batch_size=batch_size, shuffle=True, max_len=100)
        steps = 0
        # Running-loss window. Sum and count are reset together so the
        # reported average stays correct after a reset (previously the sum
        # was reset but still divided by the total step count).
        accumelated_loss = 0.0
        steps_in_window = 0
        tbar = tqdm(train_dataset, unit='batch')
        for batch in tbar:
            steps += 1
            steps_in_window += 1

            anchor = MODEL(batch[0])
            positive = MODEL(batch[1])
            negative = MODEL(batch[2])
            loss = triplet_loss(anchor, positive, negative)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            loss_value = loss.item()
            accumelated_loss += loss_value
            average_loss = accumelated_loss / steps_in_window
            tbar.set_description(f'Epoch {epoch} current loss: {loss_value:.2f} avaerage loss: {average_loss:.2f}')

            if steps % eval_every == 0:
                print('Evaluating model')
                print(f'Avaerage loss: {average_loss:.2f} ')
                # Distances must be measured on the model's own embeddings.
                # The default tsne=True would return 2-D t-SNE coordinates,
                # on which cosine distance is meaningless.
                embeddings = gen_vis_embeddings(tsne=False)
                calculate_dsiatances_from_embeddings(embeddings, VIS_DATA['label'])

            if steps % plot_every == 0:
                accumelated_loss = 0.0
                steps_in_window = 0
                # The scatter plot does want the 2-D t-SNE projection.
                embeddings = gen_vis_embeddings()
                ids = VIS_DATA['id'].tolist()
                print('Model Clustering: ')
                compare_sactter_plots(embeddings,None, ids)
                print('='*100)

            if steps % save_model_every == 0:
                print('Saving model')
                # Creates ./models as well; no error if it already exists.
                os.makedirs('./models/LoRa', exist_ok=True)

                # Only the last path component is used so the file name
                # contains no '/' character.
                short_model_name = MODEL_PATH.split('/')[-1]
                lora_save_path = f'./models/LoRa/lora_model_{short_model_name}_{steps}'
                MODEL.Bert_representations.save_pretrained(lora_save_path)

In [2]:

# LoRA adapter hyper-parameters; alpha is tied to the rank (alpha = 2 * r).
RANK = 64
PEFT_CONFIG = LoraConfig(inference_mode=False,
              r=RANK,
              lora_alpha=RANK*2,
              lora_dropout=0.05,
              target_modules=['value','query','key', 'dense']
              )

# Notebook-wide state read by the training and evaluation functions.
# These are plain module-level assignments; `global` declarations are no-ops
# at this scope and are not needed.

# Use the GPU when one is available, otherwise fall back to CPU so the
# notebook still runs on machines without CUDA.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
MODEL_PATH = 'sentence-transformers/all-MiniLM-L12-v1'
MODEL = get_sts_model(model_path=MODEL_PATH, device=DEVICE, pef_config=PEFT_CONFIG)
TOKENIZER = MODEL.tokenizer
DATA = get_dataset()
VIS_DATA = get_sentence_id_label_df()


trainable params: 5357568 || all params: 38717568 || trainable%: 13.837563351086514


Creating triplets:   0%|          | 0/14 [00:00<?, ?group/s]

In [18]:
##################### Inference #####################
# Reference set for nearest-group lookup: the label of every VIS_DATA
# sentence and its raw (non-t-SNE) embedding from the current MODEL.
TRAINED_LABELS = VIS_DATA['label']
TRAINED_EMDEDDINGS = gen_vis_embeddings(tsne=False)

def get_closest_group(sentence):
    """Return the label group whose reference embeddings are closest to ``sentence``.

    The sentence is embedded with ``MODEL`` and compared against
    ``TRAINED_EMDEDDINGS``. For each distinct label in ``TRAINED_LABELS`` the
    mean cosine distance to that group's embeddings is computed, and the
    group with the smallest mean wins (the first one encountered on ties).

    Args:
        sentence: Input accepted by ``MODEL`` (a raw sentence string in this
            notebook).

    Returns:
        Tuple ``(group, distance)``: the winning label and its mean cosine
        distance. ``(None, inf)`` when there are no reference labels.
    """
    # If MODEL exposes the nn.Module training flag, embed in eval mode so
    # dropout does not perturb the prediction, then restore the prior mode.
    was_training = getattr(MODEL, 'training', False)
    if was_training:
        MODEL.eval()
    try:
        with torch.no_grad():
            embedding = MODEL(sentence).detach().cpu().numpy()
    finally:
        if was_training:
            MODEL.train()
    # cosine_distances needs a 2-D (1, dim) query.
    embedding = embedding.reshape(1, -1)

    # Positional label array: rows of TRAINED_EMDEDDINGS are matched to labels
    # by position, independent of the Series index.
    label_values = TRAINED_LABELS.to_numpy()
    if len(label_values) == 0:
        return None, float('inf')

    # One distance computation against all reference rows, then per-group
    # means, instead of one pairwise call per group.
    distances = cosine_distances(np.asarray(TRAINED_EMDEDDINGS), embedding).ravel()

    correct_group = None
    min_distance = float('inf')
    for group in TRAINED_LABELS.unique():
        distance = distances[label_values == group].mean()
        if distance < min_distance:
            min_distance = distance
            correct_group = group
    return correct_group, min_distance


Generating embeddings:   0%|          | 0/257 [00:00<?, ?sentence/s]

In [19]:
# Spot-check: pick one VIS_DATA sentence at random and compare its true
# label with the group predicted by get_closest_group.
vis_sents, vis_labels = (VIS_DATA[column].tolist() for column in ('sentence', 'label'))
rand_idx = np.random.randint(0, len(vis_sents))
rand_sent, rand_label = vis_sents[rand_idx], vis_labels[rand_idx]
print(f'Random sentence: {rand_sent}')
print(f'Random label: {rand_label}')
print(f'Closest group: {get_closest_group(rand_sent)}')

Random sentence: اذا امكن أرقام للتواصل غير الرقم الثابت 83
Random label: Phone number + number not working
Closest group: ('loans', 0.30945617)


In [21]:
# Nearest-group accuracy over all VIS_DATA sentences.
# NOTE(review): these are the same sentences whose embeddings form the
# reference set, so each query is compared against itself as well. Treat the
# figure as optimistic.
total = 0
correct = 0
tbar = tqdm(
    zip(vis_sents, vis_labels),
    total=len(vis_sents),
    unit='sentence',
    desc=f'Evaluating model: {correct}/{total}',
)
for total, (sent, label) in enumerate(tbar, start=1):
    closest_group, distance = get_closest_group(sent)
    if closest_group == label:
        correct += 1
    tbar.set_description(f'Evaluating model: {correct}/{total}')

print(f'Accuracy: {correct/total}')


Evaluating model: 0/0:   0%|          | 0/257 [00:00<?, ?sentence/s]

Accuracy: 0.5719844357976653
