In [2]:
%%time
#add magic command to calculate time and compare first and second run times

import os
import torch
import pickle
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from network import STS_model
from sklearn.metrics.pairwise import cosine_distances

#Set Device: use the GPU when torch reports CUDA as available, otherwise the CPU
#(a `global` statement is a no-op at module scope, so a plain assignment is enough)
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

#Load Model efficiently to avoid loading it every time
# global lORA_PEFT_PATH
# lORA_PEFT_PATH = 'ammarnasr/LoRa_all-MiniLM-L12-v1'
# global MODEL
# MODEL = STS_model(lORA_PEFT_PATH, device=DEVICE)


def get_inference_dataset(data_path='./TripletLoss/dataset/data.csv', sentences_column='question', labels_column='id'):
    '''
    Read a CSV file and return its sentence and label columns
    Rows with a missing sentence or a missing label are dropped, and the
    remaining rows are re-indexed from 0
    args:
        data_path: str, path of the CSV file
        sentences_column: str, name of the column holding the sentences
        labels_column: str, name of the column holding the labels
    return:
        sentences: pd.Series
        labels: pd.Series
    '''
    frame = (
        pd.read_csv(data_path)
        .dropna(subset=[sentences_column])
        .dropna(subset=[labels_column])
        .reset_index(drop=True)
    )
    return frame[sentences_column], frame[labels_column]

def predict_group(target_embedding, embeddings, labels):
    '''
    Predict the group whose average embedding has the smallest cosine
    distance to a target embedding
    args:
        target_embedding: np.array, a single embedding (1-D, or 2-D with one row)
        embeddings: np.array, row i is the embedding of the i-th entry of labels
        labels: pd.Series, group label of every row of embeddings
    return:
        predicted_group: the label of the closest group (an element of labels)
    raises:
        ValueError: if labels contains no non-missing label
    '''
    # cosine_distances only accepts 2-D input, so promote a 1-D embedding to one row
    target_embedding = np.atleast_2d(target_embedding)

    # NaN never compares equal to itself, so a NaN label would select no rows
    # and produce a NaN average embedding; leave missing labels out
    unique_groups = labels.dropna().unique()
    if len(unique_groups) == 0:
        raise ValueError('labels does not contain any group')

    average_group_embeddings = []
    for group in unique_groups:
        # Positional boolean mask: correct whatever the index of labels is
        group_mask = (labels == group).to_numpy()
        average_group_embeddings.append(embeddings[group_mask].mean(axis=0))
    average_group_embeddings = np.array(average_group_embeddings)

    distances = cosine_distances(target_embedding, average_group_embeddings)
    # distances has one row (the target), so the flat argmin is the group position
    predicted_group = unique_groups[np.argmin(distances)]
    return predicted_group


def get_all_embeddings(model, sentences):
    '''
    Get the embeddings of all sentences in the dataset
    The model is called once per sentence, with gradient tracking disabled
    args:
        model: STS_model, called with one sentence; its output must support .detach().cpu().numpy()
        sentences: pd.Series (or any object with .tolist()) of sentences
    return:
        embeddings: np.array with one entry per sentence along the first axis
    '''
    embeddings = []
    with torch.no_grad():
        for sentence in tqdm(sentences.tolist(), unit='sentence', desc='Getting embeddings'):
            embedding = model(sentence).detach().cpu().numpy()
            embeddings.append(embedding)
    embeddings = np.array(embeddings)
    # Drop singleton axes (e.g. a per-sentence batch axis) but never the first
    # axis: a blanket squeeze() would also remove the sentence axis when there
    # is exactly one sentence, leaving an array that cannot be indexed by row
    singleton_axes = tuple(
        axis for axis in range(1, embeddings.ndim) if embeddings.shape[axis] == 1
    )
    return embeddings.squeeze(axis=singleton_axes)




if __name__ == '__main__':
    #Set Data Path and Model Path and Embeddings Dir
    data_path = './dataset/data.csv'
    lora_peft_path = 'ammarnasr/LoRa_all-MiniLM-L12-v1'
    embeddings_dir = './embeddings'

    #Create Embeddings Path from Data Path and Model Path and Embeddings Dir
    #(one cache file per model / dataset-file-name pair)
    embeddings_name = lora_peft_path.replace('/', '_')
    data_name = data_path.split('/')[-1].split('.')[0]
    embeddings_path = os.path.join(embeddings_dir, embeddings_name + '_' + data_name + '.pkl')

    #Create Embeddings Dir if not exists (exist_ok avoids a separate existence check)
    os.makedirs(embeddings_dir, exist_ok=True)

    #Get Sentences and Labels from Data Path and Model from Model Path
    sentences, labels = get_inference_dataset(data_path)
    model = STS_model(lora_peft_path, device=DEVICE)

    #Load Embeddings if exists, else get all embeddings and save them
    embeddings = None
    if os.path.exists(embeddings_path):
        # NOTE: pickle.load can execute arbitrary code; only load cache files
        # that were written by this script
        with open(embeddings_path, 'rb') as f:
            embeddings = pickle.load(f)
        if len(embeddings) == len(sentences):
            print(f'Embeddings loaded from {embeddings_path}')
        else:
            # The cache was built from a different version of the dataset, so its
            # rows no longer line up with the labels; rebuild it
            print(f'Embeddings in {embeddings_path} do not match the dataset, recomputing them')
            embeddings = None
    if embeddings is None:
        embeddings = get_all_embeddings(model, sentences)
        with open(embeddings_path, 'wb') as f:
            pickle.dump(embeddings, f)
        print(f'Embeddings saved to {embeddings_path}')

    #Predict Group for a target sentence
    target_sentence = 'What is the best way to learn machine learning?'
    target_embedding = model(target_sentence).detach().cpu().numpy()
    predicted_group = predict_group(target_embedding, embeddings, labels)

    #Print Results and Sentences from the predicted group
    print(f'Predicted Group: {predicted_group} for sentence: {target_sentence}')
    print('Sentences from the predicted group:')
    print(sentences[labels == predicted_group].tolist())
    print('-' * 100)

Loading LoRa model From HuggingFace Hub...:  ammarnasr/LoRa_all-MiniLM-L12-v1
trainable params: 5357568 || all params: 38717568 || trainable%: 13.837563351086514
Embeddings loaded from ./embeddings\ammarnasr_LoRa_all-MiniLM-L12-v1_data.pkl
Predicted Group: 5.0 for sentence: What is the best way to learn machine learning?
Sentences from the predicted group:
['هل بقدر اشتري دولار من البنك مع العلم اني مسافر ولدي تأشيرة وتذكرة طائرة', 'وكم أقصي مبلغ من الدولار الذي استطيع الحصول عليه من البنك', 'ممكن اشتري ريال من البنك والمطلوب؟؟؟', 'داير اشتري دولار انا مسافر', 'ممكن اشتري دولار من البنك؟', 'داير ابدل عملة سودانية لي دولار', 'بستفسر من طريقة شراء الدولار من البنك', 'مطلوب مبلغ 10 الف يورو']
----------------------------------------------------------------------------------------------------
CPU times: total: 2.72 s
Wall time: 1.8 s
