In [None]:
! pip install -U bitsandbytes Faker -q

In [None]:
import datetime
import gc
import json
import os

import numpy as np
import torch
from faker import Faker
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer, BitsAndBytesConfig

fake = Faker()

In [None]:
# Run the garbage collector first so that any unreachable tensors are freed,
# then release the cached (now unused) GPU memory blocks.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Google Drive folder that holds every input and output of this notebook.
DATA_DIR = "/content/drive/MyDrive/data/CSM"

# Alternative input for the recommendation engine. It used to be assigned to
# `session_data_path` and then immediately overwritten; it now has its own name
# so the active dataset is unambiguous.
speaker_data_path = f"{DATA_DIR}/speaker_aggregated_info.json"

# Dataset actually embedded by the cells below.
session_data_path = f"{DATA_DIR}/badge_id_aggregated_results.json"
nomenclature_embeddings_path = f"{DATA_DIR}/cluster_numeculature.json"
output_path = f"{DATA_DIR}/badge/embeddings_badges_mistral.json"

In [None]:
# Read the input as UTF-8 explicitly (the embeddings are written back as UTF-8
# too) instead of relying on the platform's default encoding.
with open(session_data_path, encoding="utf-8") as f:
    session_data = json.load(f)

In [None]:
# Peek at the first entry stored under the "BadgeId" key.
session_data["BadgeId"][0]

In [None]:
# Peek at the second entry stored under the "SessionInfo" key.
session_data["SessionInfo"][1]

In [None]:
# Number of entries stored under the "SessionInfo" key.
len(session_data["SessionInfo"])

In [None]:
# Length of the longest "AggregatedInfo" value (via len(); presumably characters
# of a string, not tokens). Records without the field count as length 0 instead
# of raising on len(None), and an empty list yields 0.
max_length = max(
    (
        len(value_dict.get("AggregatedInfo") or "")
        for value_dict in session_data["SessionInfo"]
    ),
    default=0,
)
print(max_length)

In [None]:
def generate_session_embeddings_all_json(
    session_data_path, output_file_path, tokenizer, model
):
    """Embed every record's ``AggregatedInfo`` text and save the vectors as JSON.

    The input JSON must be an object whose values are iterables of records.
    Each record that is a dict with a non-empty ``"AggregatedInfo"`` value is
    tokenized (truncated to 512 tokens), passed through ``model`` and
    represented by the mean over dimension 1 of the second-to-last hidden
    state. Entries that are not dicts, or that lack ``"AggregatedInfo"``, are
    skipped.

    Args:
        session_data_path: Path of the JSON file to read (UTF-8).
        output_file_path: Path of the JSON file to write; missing parent
            directories are created.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors="pt",
            truncation=True, max_length=512)``; its result must support
            ``.to(device)`` and ``**`` unpacking.
        model: Callable invoked with the tokenizer output and
            ``output_hidden_states=True``; its result must expose
            ``hidden_states``. If it has a ``device`` attribute, inputs are
            moved there.

    Returns:
        dict mapping ``"<top-level key>_<session_id>"`` to a NumPy array (the
        pooled hidden state, which keeps the batch dimension produced by the
        tokenizer). Records without ``session_id`` share the fallback key
        ``"<top-level key>_unkown_session_id"``, so later ones overwrite
        earlier ones.
    """
    with open(session_data_path, encoding="utf-8") as f:
        session_data = json.load(f)

    # Send inputs to wherever the model lives rather than assuming "cuda";
    # only fall back to a guess when the model does not report a device.
    device = getattr(model, "device", None)
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    embeddings = {}
    for key, values_list in session_data.items():
        for value_dict in tqdm(values_list, desc="Generating session embeddings"):
            # Every top-level key is iterated, so some lists may hold plain
            # values (e.g. ids) rather than records; those have no `.get`.
            if not isinstance(value_dict, dict):
                continue

            aggregated_info = value_dict.get("AggregatedInfo")
            if not aggregated_info:
                continue

            inputs = tokenizer(
                aggregated_info,
                return_tensors="pt",
                truncation=True,
                max_length=512,
            ).to(device)

            with torch.no_grad():
                outputs = model(**inputs, output_hidden_states=True)

            # Mean-pool the second-to-last layer over dimension 1.
            pooled = outputs.hidden_states[-2].mean(dim=1)

            # The identifier combines the top-level key with the record's
            # session_id (not a badge id).
            unique_key = f"{key}_{value_dict.get('session_id', 'unkown_session_id')}"
            embeddings[unique_key] = pooled.cpu().numpy()

    # Create the target folder up front so a missing directory cannot discard
    # the embeddings that were just computed.
    os.makedirs(os.path.dirname(output_file_path) or ".", exist_ok=True)
    with open(output_file_path, "w", encoding="utf-8") as f:
        json.dump(
            {k: v.tolist() for k, v in embeddings.items()},
            f,
            ensure_ascii=False,
            indent=4,
        )

    return embeddings

# Mistral

https://huggingface.co/mistralai/Mistral-7B-v0.1

Embedding dimension: 4096

Mistral 7B: this model has a context window of up to 32k tokens, with a default of 8,000 tokens. The embedding function in this notebook truncates inputs to 512 tokens.

## Time: 0:00:49.398607

In [None]:
# Hugging Face Hub id of the model loaded in the next cell.
model_id = "mistralai/Mistral-7B-v0.1"

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Request 8-bit weights through an explicit BitsAndBytesConfig; passing
# `load_in_8bit` straight to `from_pretrained` is deprecated in transformers.
model = AutoModel.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
)

In [None]:
# Wall-clock time of the whole Mistral pass, including writing the JSON file.
time_start = datetime.datetime.now()
embeddings_mistral = generate_session_embeddings_all_json(
    session_data_path=session_data_path,
    output_file_path=output_path,
    tokenizer=tokenizer,
    model=model,
)
time_end = datetime.datetime.now()
print(f"Time taken: {time_end - time_start}")

In [None]:
# Two example embedding keys, in the "<top-level key>_<session_id>" format.
# NOTE(review): the visible cells spell these keys out inline and never read `keys`.
keys = ["SessionInfo_37780", "SessionInfo_37781"]

In [None]:
# Embedding width for the two sample sessions.
tuple(
    len(embeddings_mistral[sample_key][0])
    for sample_key in ("SessionInfo_37780", "SessionInfo_37781")
)

In [None]:
# Show the first row of this session's embedding reshaped to a single column.
embeddings_mistral["SessionInfo_37780"][0].reshape(-1, 1)

In [None]:
# Show the first row of this session's embedding reshaped to a single column.
embeddings_mistral["SessionInfo_37781"][0].reshape(-1, 1)

In [None]:
# Calculate similarity
similarity = cosine_similarity(
    embeddings_mistral["SessionInfo_37780"][0].reshape(-1, 1),
    embeddings_mistral["SessionInfo_37781"][0].reshape(-1, 1),
)[0][0]
print(f"Similarity between texts: {similarity:.4f}")

In [None]:
def cosine_distance(A, B):
    """Return the cosine distance ``1 - cos(A, B)`` between two vectors.

    Args:
        A: Array-like vector. Any shape is accepted and flattened, so a
            ``(1, d)`` row vector works as well as a ``(d,)`` array.
        B: Array-like vector with the same number of elements as ``A``.

    Returns:
        A float in ``[0, 2]`` (up to rounding): 0 for vectors pointing the same
        way, 1 for orthogonal vectors, 2 for opposite vectors. NaN when either
        vector has zero norm, because the distance is undefined there.

    Raises:
        ValueError: If ``A`` and ``B`` do not have the same number of elements.
    """
    vec_a = np.asarray(A).ravel()
    vec_b = np.asarray(B).ravel()
    if vec_a.shape != vec_b.shape:
        raise ValueError(
            f"Vectors must have the same length, got {vec_a.size} and {vec_b.size}"
        )

    denominator = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    if denominator == 0:
        # Return NaN explicitly instead of triggering a divide-by-zero warning.
        return float("nan")
    return 1 - np.dot(vec_a, vec_b) / denominator

In [None]:
# Cosine distance between the two sample Mistral embeddings.
cosine_distance(
    A=embeddings_mistral["SessionInfo_37780"][0],
    B=embeddings_mistral["SessionInfo_37781"][0],
)

In [None]:
del model
del tokenizer

# Collect garbage first so objects kept alive only by reference cycles are
# freed, then release the cached GPU memory they were occupying.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Garbage-collect before emptying the cache so freshly freed tensors are
# included in what gets released.
gc.collect()
torch.cuda.empty_cache()

# Llama 3.2 3B
https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct

Embedding dimension: 3072

Llama 3.2 3B: This model supports a context window size of up to 128,000 tokens, with a default setting of 8,192 tokens


## Time: 0:00:17.265167

In [None]:
model_id = "meta-llama/Llama-3.2-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Request 8-bit weights through an explicit BitsAndBytesConfig; passing
# `load_in_8bit` straight to `from_pretrained` is deprecated in transformers.
model = AutoModel.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
)

In [None]:
# Destination JSON file for the Llama embeddings (rebinds `output_path`).
output_path = "/content/drive/MyDrive/data/CSM/embeddings_test_llama.json"

In [None]:
# Wall-clock time of the whole Llama pass, including writing the JSON file.
time_start = datetime.datetime.now()
embeddings_llama = generate_session_embeddings_all_json(
    session_data_path=session_data_path,
    output_file_path=output_path,
    tokenizer=tokenizer,
    model=model,
)
time_end = datetime.datetime.now()
print(f"Time taken: {time_end - time_start}")

In [None]:
# Embedding width for the two sample sessions.
tuple(
    len(embeddings_llama[sample_key][0])
    for sample_key in ("SessionInfo_37780", "SessionInfo_37781")
)

In [None]:
# Calculate similarity
similarity = cosine_similarity(
    embeddings_llama["SessionInfo_37780"][0].reshape(-1, 1),
    embeddings_llama["SessionInfo_37781"][0].reshape(-1, 1),
)[0][0]
print(f"Similarity between texts: {similarity:.4f}")

In [None]:
# Cosine distance between the two sample Llama embeddings.
cosine_distance(
    A=embeddings_llama["SessionInfo_37780"][0],
    B=embeddings_llama["SessionInfo_37781"][0],
)

In [None]:
del model
del tokenizer

# Collect garbage first so objects kept alive only by reference cycles are
# freed, then release the cached GPU memory they were occupying.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Garbage-collect before emptying the cache so freshly freed tensors are
# included in what gets released.
gc.collect()
torch.cuda.empty_cache()

# NOMIC-embed-text-v1

https://huggingface.co/nomic-ai/nomic-embed-text-v1

Embedding dimension: 768 (maximum)

8192 context length text encoder


## GPU Time : 0:00:04.253908

## CPU Time : 0:00:52.275640

In [None]:
from sentence_transformers import SentenceTransformer

# No device is passed here, so the library picks one itself.
# NOTE(review): trust_remote_code=True allows code shipped with the model
# repository to be executed on load — confirm the repository is trusted.
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True)

In [None]:
# Destination JSON file for the nomic embeddings (rebinds `output_path`).
output_path = "/content/drive/MyDrive/data/CSM/embeddings_test_nomic.json"

In [None]:
# GPU
embeddings_nomic = {}
time_start = datetime.datetime.now()
for key, values_list in session_data.items():
    for value_dict in tqdm(
        values_list, desc="Generating session embeddings"
    ):  # Iterate over each item in the list associated with the key
        aggregated_info = value_dict.get("AggregatedInfo")
        if aggregated_info:
            # Assuming each BadgeId is unique, combine key and BadgeId for a unique identifier
            unique_key = f"{key}_{value_dict.get('session_id', 'unkown_session_id')}"
            embeddings_nomic[unique_key] = model.encode(
                f"search_document: {aggregated_info}"
            )

with open(output_path, "w", encoding="utf-8") as f:
    json.dump(
        {k: v.tolist() for k, v in embeddings_nomic.items()},
        f,
        ensure_ascii=False,
        indent=4,
    )

time_end = datetime.datetime.now()
print(f"Time taken: {time_end - time_start}")

In [None]:
# `model.encode` on a single string returns a 1-D vector, so the embeddings are
# passed whole. Indexing them with [0] would compare only the first component
# of each vector, which always gives a distance of 0 or 2.
cosine_distance(
    embeddings_nomic["SessionInfo_37780"], embeddings_nomic["SessionInfo_37781"]
)

In [None]:
# Calculate similarity
similarity = cosine_similarity(
    embeddings_nomic["SessionInfo_37780"].reshape(1, -1),
    embeddings_nomic["SessionInfo_37781"].reshape(1, -1),
)[0][0]
print(f"Similarity between texts: {similarity:.4f}")

In [None]:
# Only the model needs deleting: this section never created a tokenizer.
del model

# Collect garbage first so objects kept alive only by reference cycles are
# freed, then release the cached GPU memory they were occupying.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Garbage-collect before emptying the cache so freshly freed tensors are
# included in what gets released.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Destination JSON file for the CPU run of the nomic model (rebinds `output_path`).
output_path = "/content/drive/MyDrive/data/CSM/embeddings_test_nomic_cpu.json"

In [None]:
from sentence_transformers import SentenceTransformer

# Same model as the GPU section, but explicitly placed on the CPU.
# NOTE(review): trust_remote_code=True allows code shipped with the model
# repository to be executed on load — confirm the repository is trusted.
model = SentenceTransformer(
    "nomic-ai/nomic-embed-text-v1", trust_remote_code=True, device="cpu"
)

In [None]:
# CPU run: the model was loaded with device="cpu" in the cell above.
embeddings_nomic_cpu = {}
time_start = datetime.datetime.now()
for key, values_list in session_data.items():
    for value_dict in tqdm(values_list, desc="Generating session embeddings"):
        # Every top-level key is iterated, so some lists may hold plain values
        # (e.g. ids) rather than records; those have no `.get` and are skipped.
        if not isinstance(value_dict, dict):
            continue
        aggregated_info = value_dict.get("AggregatedInfo")
        if not aggregated_info:
            continue
        # The key combines the top-level key with the record's session_id.
        # Records without a session_id share the fallback key, so later ones
        # overwrite earlier ones.
        unique_key = f"{key}_{value_dict.get('session_id', 'unkown_session_id')}"
        # Each text is sent to the model with a "search_document: " prefix.
        embeddings_nomic_cpu[unique_key] = model.encode(
            f"search_document: {aggregated_info}"
        )

with open(output_path, "w", encoding="utf-8") as f:
    json.dump(
        {k: v.tolist() for k, v in embeddings_nomic_cpu.items()},
        f,
        ensure_ascii=False,
        indent=4,
    )

# The measured time includes writing the JSON file.
time_end = datetime.datetime.now()
print(f"Time taken: {time_end - time_start}")

In [None]:
# `model.encode` on a single string returns a 1-D vector, so the embeddings are
# passed whole. Indexing them with [0] would compare only the first component
# of each vector, which always gives a distance of 0 or 2.
cosine_distance(
    embeddings_nomic_cpu["SessionInfo_37780"],
    embeddings_nomic_cpu["SessionInfo_37781"],
)