In [1]:
import os
from collections import Counter

import fasttext
import numpy as np
import pandas
import torch
from huggingface_hub import login
from scipy.optimize import linear_sum_assignment
from sklearn.metrics.pairwise import cosine_distances
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel

# Read the Hugging Face token from the HF_TOKEN environment variable instead of
# embedding it in the notebook. The token that used to be written here in plain
# text must be treated as compromised and revoked.
# When HF_TOKEN is unset, login() receives None and asks for the token
# interactively.
login(token=os.environ.get("HF_TOKEN"))

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/khatiwada/.cache/huggingface/token
Login successful


In [2]:
# Use the GPU when CUDA is usable, otherwise stay on the CPU.
# NOTE(review): no visible cell moves the model or its inputs to `device`.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [3]:
def load_embedding_model(model_name):
    """Load the embedding backend selected by ``model_name``.

    Supported names are "llama3", "mistral", "bert", "roberta" and "fasttext".

    Returns:
        tuple: ``(model, tokenizer)``. ``tokenizer`` is None for fastText,
        which is loaded without one.

    Raises:
        ValueError: if ``model_name`` is not one of the supported names.
    """
    if model_name in ("llama3", "mistral"):
        # Both instruct checkpoints are loaded through AutoModelForCausalLM.
        checkpoint = (
            "meta-llama/Meta-Llama-3-8B-Instruct"
            if model_name == "llama3"
            else "mistralai/Mistral-7B-Instruct-v0.3"
        )
        model = AutoModelForCausalLM.from_pretrained(checkpoint)
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    elif model_name in ("bert", "roberta"):
        # BERT and RoBERTa are loaded as bare models through AutoModel.
        checkpoint = "bert-base-uncased" if model_name == "bert" else "roberta-base"
        model = AutoModel.from_pretrained(checkpoint)
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    elif model_name == "fasttext":
        # fastText is loaded from a local binary and has no tokenizer object.
        model = fasttext.load_model("cc.en.300.bin")
        tokenizer = None
    else:
        raise ValueError(f"Unsupported model_name: {model_name}")

    # Batched tokenization pads the inputs, so a tokenizer that ships without a
    # pad token gets its end-of-sequence token registered as one; the model's
    # embedding matrix is then resized to the tokenizer's length.
    needs_pad_token = tokenizer is not None and tokenizer.pad_token is None
    if needs_pad_token:
        tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
        model.resize_token_embeddings(len(tokenizer))

    return model, tokenizer

def get_each_cell_embeddings(texts, model_name, model, tokenizer):
    """Embed every text as the average of its token (or word) vectors.

    Parameters:
    texts (list of str): Texts to embed.
    model_name (str): One of "llama3", "mistral", "bert", "roberta", "fasttext".
    model: Model object matching ``model_name``.
    tokenizer: Tokenizer matching ``model``; not used for "fasttext".

    Returns:
    numpy array with one row per text. For "fasttext" the array is float32
    with ``model.get_dimension()`` columns, and has zero rows when ``texts``
    is empty.

    Raises:
    ValueError: if ``model_name`` is not one of the supported names.
    """
    if model_name in {"llama3", "mistral", "bert", "roberta"}:
        # Tokenize the input texts with padding and truncation
        inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)

        # Run the forward pass on whatever device the model lives on. Models
        # that expose no `device` attribute are fed the inputs unchanged.
        model_device = getattr(model, "device", None)
        if model_device is not None:
            inputs = {
                key: value.to(model_device) if hasattr(value, "to") else value
                for key, value in inputs.items()
            }

        # Get the last hidden state
        with torch.no_grad():
            if model_name == "llama3":
                last_hidden_state = model.base_model(**inputs, output_hidden_states=True).last_hidden_state
            elif model_name == "mistral":
                last_hidden_state = model(**inputs, output_hidden_states=True).hidden_states[-1]
            else:  # "bert" or "roberta"
                last_hidden_state = model(**inputs, output_hidden_states=True).last_hidden_state

        # Zero out the padding positions so they do not contribute to the sum
        attention_mask = inputs['attention_mask'].unsqueeze(-1)
        masked_last_hidden_state = last_hidden_state * attention_mask

        # Average over the non-padding positions of each text
        sum_embeddings = masked_last_hidden_state.sum(dim=1)
        count_non_pad_tokens = attention_mask.sum(dim=1)
        # Avoid division by zero (if a sequence only contains padding tokens)
        count_non_pad_tokens = torch.clamp(count_non_pad_tokens, min=1)
        average_embeddings = sum_embeddings / count_non_pad_tokens

        # Copy to host memory first: .numpy() is only defined for CPU tensors
        return average_embeddings.detach().cpu().numpy()

    if model_name == "fasttext":
        texts = list(texts)
        # Pre-allocating fixes the output to (len(texts), dimension) float32,
        # so empty input and empty texts no longer change shape or dtype.
        average_embeddings_np = np.zeros((len(texts), model.get_dimension()), dtype=np.float32)
        for position, text in enumerate(texts):
            tokens = text.split()  # whitespace tokenization
            if tokens:
                word_embeddings = [model.get_word_vector(token) for token in tokens]
                average_embeddings_np[position] = np.mean(word_embeddings, axis=0)
            # A text without tokens keeps its all-zero row
        return average_embeddings_np

    # Same error as load_embedding_model instead of an unbound-local failure
    raise ValueError(f"Unsupported model_name: {model_name}")

In [4]:
def _assemble_matching_outputs(candidate_pairs, distance_matrix, average_embeddings_1,
                               average_embeddings_2, texts1, texts2, threshold):
    """Build the shared four-part result from assigned (row, col) pairs.

    A pair is kept when its entry in ``distance_matrix`` is below ``threshold``.
    Matched and unmatched items are tracked by index, so duplicate texts keep
    their own embeddings and the output order is deterministic: kept pairs in
    the given order, then unmatched items of the first list, then unmatched
    items of the second list, each in index order.
    """
    matching_results = []
    combined_embeddings = []
    matched_rows = set()
    matched_cols = set()

    for row, col in candidate_pairs:
        distance = distance_matrix[row, col]
        if distance < threshold:
            matching_results.append((texts1[row], texts2[col], distance))
            # A matched pair is represented by the mean of its two embeddings
            combined_embeddings.append((average_embeddings_1[row] + average_embeddings_2[col]) / 2)
            matched_rows.add(row)
            matched_cols.add(col)

    unmatched_rows = [idx for idx in range(len(texts1)) if idx not in matched_rows]
    unmatched_cols = [idx for idx in range(len(texts2)) if idx not in matched_cols]

    # Unmatched items contribute their own embedding unchanged
    combined_embeddings.extend(average_embeddings_1[idx] for idx in unmatched_rows)
    combined_embeddings.extend(average_embeddings_2[idx] for idx in unmatched_cols)

    unmatched_texts1 = {texts1[idx] for idx in unmatched_rows}
    unmatched_texts2 = {texts2[idx] for idx in unmatched_cols}
    return matching_results, combined_embeddings, unmatched_texts1, unmatched_texts2


def apply_bipartite_matching(average_embeddings_1, average_embeddings_2, texts1, texts2, threshold=0.5, penalty=5.0):
    """
    Apply bipartite matching with quality enhancement, allowing some texts to remain unmatched.

    The cosine distance matrix is embedded in the top-left corner of a square
    matrix of side len(texts1) + len(texts2); every other cell holds `penalty`.
    The Hungarian algorithm runs on that square matrix, and only assignments
    between two real texts whose distance is below `threshold` are kept.

    Parameters:
    average_embeddings_1 (list or numpy array): Embeddings for the first set of texts.
    average_embeddings_2 (list or numpy array): Embeddings for the second set of texts.
    texts1 (list): List of texts corresponding to average_embeddings_1.
    texts2 (list): List of texts corresponding to average_embeddings_2.
    threshold (float): Cosine distance threshold for filtering matches.
    penalty (float): Cost written into every dummy cell of the square matrix.

    Returns:
    matching_results (list of tuples): Kept matches as (text1, text2, distance) tuples.
    combined_embeddings (list): Mean embedding of each matched pair, followed by the
        embeddings of unmatched items from the first and then the second list.
    unmatched_texts1 (set): Set of unmatched texts from the first list.
    unmatched_texts2 (set): Set of unmatched texts from the second list.
    """
    average_embeddings_1 = np.asarray(average_embeddings_1)
    average_embeddings_2 = np.asarray(average_embeddings_2)
    num_texts1 = len(average_embeddings_1)
    num_texts2 = len(average_embeddings_2)

    candidate_pairs = []
    augmented_cosine_matrix = None
    # With an empty side there is nothing to match; everything stays unmatched
    if num_texts1 and num_texts2:
        cosine_distance_matrix = cosine_distances(average_embeddings_1, average_embeddings_2)

        # dtype=float is required: with an integer penalty np.full would build
        # an integer matrix and the distances copied in would be truncated.
        augmented_size = num_texts1 + num_texts2
        augmented_cosine_matrix = np.full((augmented_size, augmented_size), penalty, dtype=float)
        augmented_cosine_matrix[:num_texts1, :num_texts2] = cosine_distance_matrix

        # Apply Hungarian algorithm on the augmented matrix
        row_indices, col_indices = linear_sum_assignment(augmented_cosine_matrix)

        # Drop assignments that involve a dummy row or column
        candidate_pairs = [
            (row, col)
            for row, col in zip(row_indices, col_indices)
            if row < num_texts1 and col < num_texts2
        ]

    return _assemble_matching_outputs(
        candidate_pairs, augmented_cosine_matrix,
        average_embeddings_1, average_embeddings_2, texts1, texts2, threshold,
    )


def apply_bipartite_matching_simple(average_embeddings_1, average_embeddings_2, texts1, texts2, threshold=0.5):
    """
    Match two sets of texts with the Hungarian algorithm on their cosine distances.

    The assignment is computed directly on the rectangular distance matrix and
    assigned pairs whose distance is not below `threshold` are discarded.

    Parameters:
    average_embeddings_1 (list or numpy array): Embeddings for the first set of texts.
    average_embeddings_2 (list or numpy array): Embeddings for the second set of texts.
    texts1 (list): List of texts corresponding to average_embeddings_1.
    texts2 (list): List of texts corresponding to average_embeddings_2.
    threshold (float): Cosine distance threshold for filtering matches.

    Returns:
    matching_results (list of tuples): Kept matches as (text1, text2, distance) tuples.
    combined_embeddings (list): Mean embedding of each matched pair, followed by the
        embeddings of unmatched items from the first and then the second list.
    unmatched_texts1 (set): Set of unmatched texts from the first list.
    unmatched_texts2 (set): Set of unmatched texts from the second list.
    """
    average_embeddings_1 = np.asarray(average_embeddings_1)
    average_embeddings_2 = np.asarray(average_embeddings_2)

    candidate_pairs = []
    cosine_distance_matrix = None
    # With an empty side there is nothing to match; everything stays unmatched
    if len(average_embeddings_1) and len(average_embeddings_2):
        # Compute cosine distance matrix using scikit-learn
        cosine_distance_matrix = cosine_distances(average_embeddings_1, average_embeddings_2)
        # Apply Hungarian algorithm to find the optimal bipartite matching
        row_indices, col_indices = linear_sum_assignment(cosine_distance_matrix)
        candidate_pairs = list(zip(row_indices, col_indices))

    return _assemble_matching_outputs(
        candidate_pairs, cosine_distance_matrix,
        average_embeddings_1, average_embeddings_2, texts1, texts2, threshold,
    )

In [7]:
# Embedding backend used by the cells below; the value must be a name that
# load_embedding_model accepts.
model_name = "bert"
# Load the model and tokenizer once; the matching demo reuses both objects.
model, tokenizer = load_embedding_model(model_name)

In [8]:
# Three toy columns of city names; some values are misspelled on purpose.
texts1_list = ["Berlinn", "Toronto", "Barcelona"]
texts2_list = ["Toronto", "Boston", "Berlin", "Barcelona"]
texts3_list = ["Berlin", "barcelna", "Boston"]

all_columns = [texts1_list, texts2_list, texts3_list]

# Number of occurrences of every value across all columns.
# NOTE(review): nothing in this cell reads value_frequency afterwards.
value_frequency = Counter(value for column in all_columns for value in column)

# The first column is the reference that every remaining column is matched
# against. Its distinct values and their embeddings do not change between
# iterations, so they are computed once. dict.fromkeys removes duplicates while
# keeping first-seen order, which makes the row order stable across runs.
first_column = all_columns.pop(0)
texts1 = list(dict.fromkeys(first_column))
average_embeddings_1 = get_each_cell_embeddings(texts1, model_name, model, tokenizer)

for second_column in all_columns:
    texts2 = list(dict.fromkeys(second_column))
    average_embeddings_2 = get_each_cell_embeddings(texts2, model_name, model, tokenizer)

    # threshold = 1 keeps every assigned pair whose cosine distance is below 1
    matching_results, combined_embeddings, unmatched_texts1, unmatched_texts2 = apply_bipartite_matching_simple(average_embeddings_1, average_embeddings_2, texts1, texts2, threshold = 1)

    # Print the matching results with their scores
    print("Optimal Bipartite Matching with Scores:")
    for pair in matching_results:
        print(f"{pair[0]} -> {pair[1]} with score: {pair[2]}")

    # Print unmatched texts
    print("\nUnmatched Texts from texts1:")
    for text in unmatched_texts1:
        print(text)

    print("\nUnmatched Texts from texts2:")
    for text in unmatched_texts2:
        print(text)


Optimal Bipartite Matching with Scores:
Toronto -> Toronto with score: 0.0
Berlinn -> Berlin with score: 0.12553995847702026
Barcelona -> Barcelona with score: 2.384185791015625e-07

Unmatched Texts from texts1:

Unmatched Texts from texts2:
Boston
Optimal Bipartite Matching with Scores:
Toronto -> Boston with score: 0.15037411451339722
Berlinn -> Berlin with score: 0.1255398988723755
Barcelona -> barcelna with score: 0.3933650255203247

Unmatched Texts from texts1:

Unmatched Texts from texts2:


In [None]:
# import hnswlib
# import numpy as np
# from sklearn.metrics.pairwise import cosine_similarity

# # Function to index embeddings using HNSW index
# def index_embeddings_hnsw(embeddings, ef_construction = 200, M= 16, number_of_neighbors = 50):
#     # Initialize HNSW index
#     dim = embeddings.shape[1]  # Get the dimension of embeddings
#     num_elements = embeddings.shape[0]  # Get the number of embeddings
#     p = hnswlib.Index(space='cosine', dim=dim)
#     p.init_index(max_elements=num_elements, ef_construction=ef_construction, M=M)
#     number_of_neighbors = num_elements
#     p.set_ef(number_of_neighbors)  # Set the number of neighbors to search during the query

#     # Add embeddings to HNSW index
#     p.add_items(embeddings)

#     return p

# # Function to find embeddings within threshold distance from a query embedding using HNSW index
# def find_embeddings_within_threshold_hnsw(query_embedding, hnsw_index, k, embeddings, threshold):  # Pass embeddings as argument
#     # Query HNSW index for candidate nearest neighbors
#     labels, distances = hnsw_index.knn_query(query_embedding, k=k)    
#     # Debug output: raw neighbor labels and cosine distances returned by the index
#     print(labels)
#     print(distances)
    
#     # Filter candidate indices based on cosine distance threshold
#     indices_within_threshold = []
#     for i in range(0, len(labels)):
#         for label, distance in zip(labels[i], distances[i]):
#             if distance > threshold:
#                 break
#             indices_within_threshold.append((label, distance))
#         # indices_within_threshold = [(label, distance) for label, distance in zip(labels[0], distances[0]) if distance <= threshold]

#     return indices_within_threshold

# # Example threshold value
# threshold = 0.1

# # Example query embedding (replace this with your actual query embedding)
# # query_embedding = average_embeddings_np[2]
# # Generate random embeddings of size (10000, 4096) with dtype=np.float32
# average_embeddings_np = np.random.rand(10000, 4096).astype(np.float32)
# query_embedding =  average_embeddings_np # Query with every indexed embedding at once, shape (10000, 4096)

# # Index embeddings using HNSW index
# hnsw_index = index_embeddings_hnsw(average_embeddings_np)
# k = hnsw_index.max_elements
# k =20 
# # Find embeddings within threshold distance from the query embedding using HNSW index
# indices_within_threshold = find_embeddings_within_threshold_hnsw(query_embedding, hnsw_index,k, average_embeddings_np, threshold)  # Pass embeddings
# # Print indices of embeddings within threshold distance
# print("Indices of embeddings within threshold distance:", indices_within_threshold)
