# EMBEDDINGS
```
clustering_config:
  # Hugging Face access token
  access_token: "hb-token"
  # Hugging Face Model Hub repo id:
  model_name: "mistralai/Mistral-7B-v0.1" #"meta-llama/Meta-Llama-3-8B"
  # "sentence-transformers/stsb-bert-large" #"mistralai/Mistral-7B-v0.1" #"mistralai/Mixtral-8x7B-v0.1" # "mistralai/Mistral-7B-v0.1"
  # options: llama.cpp, gptq, transformers
  backend_type: "transformers" # transformers, ctransformers

  session_data_path: "/home/samtukra/LLMU/saved_jsons/new_recommendation_db/show_ref_agg/sessions/speaker_aggregated_info.json"
  # "/home/samtukra/LLMU/saved_jsons/new_claire_db/badge_id_aggregated_results.json"
  #"/home/samtukra/LLMU/saved_jsons/badge_id_all_data_aggregated_iter_1000.json"

  clustering_algorithm: "kmeans" # kmeans, agglomerative, dbscan
  nomenclature_file: "/home/samtukra/LLMU/configs/clustering/cluster_numeculature.json" # None or path to nomenclature file
  nomenclature_embeddings_path: "/home/samtukra/LLMU/embeddings/old/cluster_numeculature_embeddings_Mistral-7B-v0.1.json"
  
  # output file dir
  embeddings_root: "/home/samtukra/LLMU/embeddings/new_recommendation_db"
  
  default_output: "json" # csv, json
  additional_output: "csv" # csv, json
  ```

In [None]:
! pip install -U bitsandbytes Faker -q

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import json
import gc
import datetime
from faker import Faker

fake = Faker()
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import json
from tqdm import tqdm

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import json


class BadgeDataset(Dataset):
    """Map-style dataset yielding ``(embedding, label)`` pairs for badges.

    The CSV supplies one row per badge (columns ``BadgeId`` and ``ClusterId``
    are read); the JSON supplies the embeddings, keyed ``"BadgeId_<id>"``.
    CSV rows whose badge has no entry in the JSON are dropped at construction.
    """

    def __init__(self, csv_file, json_file, split, new_db=None):
        """
        Args:
            csv_file (str): Path to the CSV file with badge and cluster info.
            json_file (str): Path to the JSON file with badge embeddings.
            split (str): Accepted for interface compatibility only. No
                split-specific filtering is implemented, so every value
                ("train", "val", ...) yields the same rows.
            new_db: Any value other than ``None`` (including ``False``) selects
                the five-class label map whose names use ':'; ``None`` selects
                the original six-class map whose names use an en dash.
        """
        initial_data = pd.read_csv(csv_file)

        with open(json_file, "r", encoding="utf-8") as f:
            self.embeddings = json.load(f)

        # Mapping cluster names to labels
        if new_db is not None:
            # New db doesn't have the 6th class.
            self.cluster_to_label = {
                "Networking": 0,
                "Learning": 1,
                "Searching": 2,
                "Sourcing: Early": 3,
                "Sourcing: In Process": 4,
            }

            print("using the new cluster to label dict, with ':'")
        else:
            self.cluster_to_label = {
                "Networking": 0,
                "Learning": 1,
                "Searching": 2,
                "Sourcing – Early": 3,
                "Sourcing – In Process": 4,
                "Sourcing – Deciding": 5,
            }
            print("using the original cluster to label dict, with '-'")

        # Keep only the CSV rows whose badge has an embedding in the JSON.
        # astype(bool) guarantees a boolean mask even when the CSV has no rows.
        has_embedding = (
            initial_data["BadgeId"]
            .apply(lambda badge_id: f"BadgeId_{badge_id}" in self.embeddings)
            .astype(bool)
        )
        self.data = initial_data[has_embedding]

    def __len__(self):
        """Return the number of CSV rows that have a matching embedding."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(embedding, label)`` for the row at position ``idx``.

        Returns:
            tuple: a float32 tensor built from the first element stored under
            the badge's JSON key, and an int64 scalar tensor with the label.

        Raises:
            KeyError: if the row's ``ClusterId`` is not in the active label map.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Fetch the row once instead of indexing the frame per column.
        row = self.data.iloc[idx]
        badge_id = row["BadgeId"]
        cluster_name = row["ClusterId"]

        # Convert the embedding list (the first element of the list under each BadgeId) to a tensor
        embedding = torch.tensor(
            self.embeddings[f"BadgeId_{badge_id}"][0], dtype=torch.float32
        )

        # Get the label for the cluster; name the offending row on failure,
        # which typically means the wrong label map was selected via new_db.
        try:
            label = self.cluster_to_label[cluster_name]
        except KeyError:
            raise KeyError(
                f"Unknown ClusterId {cluster_name!r} for BadgeId {badge_id!r}; "
                f"expected one of {sorted(self.cluster_to_label)}"
            ) from None

        # Convert label to tensor
        label = torch.tensor(label, dtype=torch.int64)

        return embedding, label

    def split_data(self, split):
        """Placeholder for train/val splitting; currently does nothing."""
        pass

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import os

import argparse
import numpy as np
import pandas as pd

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
)


# Custom Loss function
class FocalLoss(nn.Module):
    """Focal loss computed from the per-sample cross-entropy of the logits.

    Each sample's cross-entropy ``ce`` is scaled by
    ``alpha * (1 - exp(-ce)) ** gamma`` before the reduction is applied.
    """

    def __init__(self, alpha=0.25, gamma=2.0, reduction="mean"):
        """
        Parameters:
            alpha (float): Balancing factor applied to every sample, default 0.25.
            gamma (float): Focusing exponent, default 2.0.
            reduction (str): 'mean' or 'sum'; any other value returns the
                unreduced per-sample losses.
        """
        super().__init__()
        self.alpha, self.gamma, self.reduction = alpha, gamma, reduction

    def forward(self, inputs, targets):
        """
        Compute the focal loss.

        Parameters:
            inputs (tensor): Logits as predicted by the model.
            targets (tensor): True labels.
        """
        per_sample_ce = F.cross_entropy(inputs, targets, reduction="none")
        # exp(-ce) is the probability the model assigned to the true class.
        true_class_prob = torch.exp(-per_sample_ce)
        modulation = (1 - true_class_prob) ** self.gamma
        focal = self.alpha * modulation * per_sample_ce

        if self.reduction == "sum":
            return focal.sum()
        if self.reduction == "mean":
            return focal.mean()
        return focal


# Model
class BadgeNet(nn.Module):
    """Fully connected classifier: input_size -> 128 -> 64 -> 64 -> num_classes.

    ReLU follows each of the first three linear layers; the last layer's
    output is returned as-is (raw logits).
    """

    def __init__(self, input_size, num_classes):
        super().__init__()
        first_width, hidden_width = 128, 64
        # Layers are created in fc1..fc4 order so parameter names stay stable.
        self.fc1 = nn.Linear(input_size, first_width)
        self.fc2 = nn.Linear(first_width, hidden_width)
        self.fc3 = nn.Linear(hidden_width, hidden_width)
        self.fc4 = nn.Linear(hidden_width, num_classes)

    def forward(self, x):
        """Return the logits for a batch of embeddings ``x``."""
        hidden = x
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = F.relu(layer(hidden))
        return self.fc4(hidden)


def save_model(model, path):
    """Save ``model.state_dict()`` to ``path``.

    When ``path`` is a filesystem path, its parent directory is created first
    so that saving into a not-yet-existing checkpoint folder does not fail.
    Other targets accepted by ``torch.save`` (e.g. file-like objects) are
    passed through untouched.

    Parameters:
        model: Object exposing ``state_dict()`` (e.g. an ``nn.Module``).
        path: Destination path or file-like object.
    """
    import os

    if isinstance(path, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(path))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    torch.save(model.state_dict(), path)

In [None]:
# Free memory before loading data: release PyTorch's cached CUDA blocks,
# then run a Python garbage-collection pass.
torch.cuda.empty_cache()
gc.collect()

In [None]:
# File locations under /content/drive/MyDrive (presumably a Colab Drive mount).
# session_data_path: aggregated speaker/session JSON.
session_data_path = "/content/drive/MyDrive/data/CSM/speaker_aggregated_info.json"
# NOTE(review): despite the variable name, this points at the nomenclature
# JSON itself (cluster_numeculature.json), not an embeddings file — confirm.
nomenclature_embeddings_path = (
    "/content/drive/MyDrive/data/CSM/cluster_numeculature.json"
)
# output_path: JSON file of Mistral test embeddings.
output_path = "/content/drive/MyDrive/data/CSM/embeddings_test_mistral.json"

In [None]:
# Load the aggregated session JSON into memory.
with open(session_data_path) as f:
    session_data = json.load(f)

In [None]:
# Inspect the first entry under the "SessionInfo" key.
session_data["SessionInfo"][0]

In [None]:
# Inspect the second entry under the "SessionInfo" key.
session_data["SessionInfo"][1]

In [None]:
# Number of entries under the "SessionInfo" key.
len(session_data["SessionInfo"])

In [None]:
# Load the Mistral test embeddings JSON.
embeddings_mistral_path = "/content/drive/MyDrive/data/CSM/embeddings_test_mistral.json"

with open(embeddings_mistral_path) as f:
    embeddings_mistral = json.load(f)

In [None]:
# Shape of the entry stored under "SessionInfo_37780" once singleton
# dimensions are squeezed out.
np.squeeze(embeddings_mistral["SessionInfo_37780"]).shape

In [None]:
# Load the Nomic test embeddings JSON.
embeddings_nomic_path = "/content/drive/MyDrive/data/CSM/embeddings_test_nomic.json"

with open(embeddings_nomic_path) as f:
    embeddings_nomic = json.load(f)

In [None]:
# Shape of the entry stored under "SessionInfo_37780" in the Nomic
# embeddings once singleton dimensions are squeezed out.
np.squeeze(embeddings_nomic["SessionInfo_37780"]).shape

In [None]:
# Label CSV and embeddings JSON used to build the datasets.
# NOTE: the first csv_path assignment is immediately overridden by the second,
# so only the 20240512 labels file is actually used.
csv_path = "/content/drive/MyDrive/data/CSM/new_claire_db_badge_cluster_data_with_aggregated_info_GIO_GT_LABELS.csv"
csv_path = "/content/drive/MyDrive/data/CSM/20240512_new_labels_WITH_AGGINFO.csv"
# Update with actual path
json_path = "/content/drive/MyDrive/data/CSM/session_embeddings_stsb-bert-large.json"  # Update with actual path

In [None]:
# Load the label CSV and preview its first rows.
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Load the embeddings JSON referenced by json_path.
with open(json_path) as f:
    embeddings = json.load(f)

In [None]:
# Number of keys (one per embedded item) in the embeddings JSON.
len(embeddings.keys())

In [None]:
# Length of the first element stored under one badge key, i.e. the vector
# that BadgeDataset.__getitem__ turns into the embedding tensor.
len(embeddings["BadgeId_Z9ZXS8W"][0])

In [None]:
# Quick check that the dataset can be built and wrapped in a loader.
# new_db is left at its default (None), so the original six-class label map
# (names with an en dash) is selected here.
badge_dataset = BadgeDataset(csv_path, json_path, split="train")
print("dataset size: {}".format(len(badge_dataset)))
# Create the DataLoader
badge_loader = DataLoader(badge_dataset, batch_size=10, shuffle=True)

In [None]:
# initialise dataset:
# new_db=True selects the five-class label map whose names use ':'.
# NOTE(review): BadgeDataset performs no filtering based on `split`, so
# train_db and val_db currently contain exactly the same rows.
train_db = BadgeDataset(csv_path, json_path, split="train", new_db=True)
val_db = BadgeDataset(csv_path, json_path, split="val", new_db=True)

In [None]:
# DataLoader settings: samples per batch and number of worker processes.
batch_size = 32
num_workers = 2
# NOTE: the training loop iterates over args.epochs; this variable is not
# referenced anywhere else in the visible notebook.
epochs = 10

In [None]:
# Training batches are reshuffled every epoch so the optimiser does not see
# the rows in fixed CSV order; validation keeps a fixed order.
trainloader = DataLoader(
    train_db,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    pin_memory=False,
)
valloader = DataLoader(
    val_db,
    batch_size=batch_size,
    shuffle=False,
    num_workers=num_workers,
    pin_memory=False,
)

In [None]:
# Report how many samples each dataset holds.
print(
    "total training samples #: {}, total val samples #: {}".format(
        len(train_db), len(val_db)
    )
)

In [None]:
# initialise model:
# input_size is the in_features of BadgeNet's first linear layer, so it must
# equal the length of each embedding vector.
input_size = 1024  # Change this to the size of your embeddings
# Five outputs, matching the five-entry label map used when new_db is set.
num_classes = 5
model = BadgeNet(input_size, num_classes)

In [None]:
from pydantic import BaseModel
import warnings

# Silence every Python warning from here on.
warnings.filterwarnings("ignore")


# Run configuration for training and checkpointing.
class Args(BaseModel):
    # Root directory for checkpoints (first component of the saved path).
    ckpt_dir: str
    # Sub-directory name placed under ckpt_dir in the checkpoint path.
    model_name: str
    # Number of epochs the training loop runs.
    epochs: int
    # Only used to build the checkpoint file name in the visible notebook.
    loss_function: str = "cross_entropy"


# NOTE(review): model_name is "llama3" while json_path points at
# stsb-bert-large embeddings — confirm the checkpoint folder name is intended.
args = Args(
    ckpt_dir="/content/drive/MyDrive/data/CSM/cpkts", model_name="llama3", epochs=400
)

In [None]:
# initialise optimiser and loss function
# NOTE(review): the criterion is always cross-entropy; args.loss_function is
# not consulted here and the FocalLoss class defined above is unused.
criterion = nn.CrossEntropyLoss()


# Adam over all model parameters with a fixed learning rate of 1e-3.
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Best accuracy seen so far; the training loop saves a checkpoint whenever
# an epoch beats it.
best_accuracy = 0.0
# Put the model in training mode.
model = model.train()

In [None]:
# Move the model to the GPU when one is available; otherwise it stays on CPU.
if torch.cuda.is_available():
    model = model.cuda()

In [None]:
# Train for args.epochs epochs. After each epoch, metrics are computed over
# the batches seen during that epoch and a checkpoint is saved whenever the
# accuracy beats the best value so far.
# NOTE(review): the accuracy used for checkpoint selection is the *training*
# accuracy; valloader is never consumed in this loop.
class_names = [
    "Networking",
    "Learning",
    "Searching",
    "Sourcing: Early",
    "Sourcing: In Process",
]
# Explicit label ids keep the report aligned with class_names even when a
# class is missing from an epoch's labels and predictions (otherwise
# classification_report rejects the mismatched target_names length).
class_ids = list(range(len(class_names)))
# Device availability does not change during training, so check it once.
use_cuda = torch.cuda.is_available()

for epoch in tqdm(range(args.epochs)):
    running_loss = 0.0
    all_labels = []
    all_preds = []

    for inputs, labels in trainloader:
        optimizer.zero_grad()

        if use_cuda:
            inputs = inputs.cuda()
            labels = labels.cuda()

        outputs = model(inputs)

        # Predicted class = index of the largest logit.
        _, preds = torch.max(outputs, 1)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        # Weight the batch loss by batch size so the epoch loss below is a
        # per-sample average (assumes the criterion returns a batch mean).
        running_loss += loss.item() * inputs.size(0)
        all_labels.extend(labels.cpu().numpy())
        all_preds.extend(preds.cpu().numpy())

    epoch_loss = running_loss / len(trainloader.dataset)
    accuracy = accuracy_score(all_labels, all_preds)
    # zero_division=0 yields the same values as the default, without relying
    # on warnings being globally suppressed.
    precision = precision_score(
        all_labels, all_preds, average="weighted", zero_division=0
    )
    recall = recall_score(all_labels, all_preds, average="weighted", zero_division=0)
    f1 = f1_score(all_labels, all_preds, average="weighted", zero_division=0)

    print(f"Epoch {epoch+1}/{args.epochs}")
    print(f"Loss: {epoch_loss:.4f}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(
        classification_report(
            all_labels,
            all_preds,
            labels=class_ids,
            target_names=class_names,
            zero_division=0,
        )
    )

    if accuracy > best_accuracy:
        best_accuracy = accuracy
        # model_path stays defined after the loop and is reused by later cells.
        model_path = "{}/{}/loss_{}_data_with_fluff_acc_{}_epoch_{}.pth".format(
            args.ckpt_dir, args.model_name, args.loss_function, accuracy, epoch
        )
        save_model(model, model_path)
        print(f"Model improved to {accuracy:.4f} accuracy. Saving model.")

In [None]:
# Path of the most recent checkpoint written by the training loop.
model_path

In [None]:
from collections import defaultdict
import csv

In [None]:
def predict_classes(model, embeddings_json, output_csv):
    """Predict a cluster name for every embedding and write the results to CSV.

    Parameters:
        model: Callable mapping a ``(1, dim)`` float tensor to class logits.
            If it exposes ``parameters()``, inputs are moved to the device of
            its first parameter before the forward pass.
        embeddings_json (str): Path to a JSON object mapping keys such as
            ``"BadgeId_<id>"`` to a list whose first element is the embedding.
        output_csv (str): Destination CSV, written with the header
            ``BadgeId,ClusterId``.

    Returns:
        list[tuple[str, str]]: ``(badge_id, predicted_class_name)`` per key,
        in the JSON's iteration order. ``badge_id`` is everything after the
        first underscore of the key (the whole key if it has no underscore).
    """
    # Load the embeddings from JSON file
    with open(embeddings_json, "r", encoding="utf-8") as file:
        embeddings = json.load(file)

    # Prepare for predictions
    class_labels = [
        "Networking",
        "Learning",
        "Searching",
        "Sourcing: Early",
        "Sourcing: In Process",
    ]
    predictions = []
    class_counts = defaultdict(int)  # Dictionary to count class occurrences

    # Run inference on the device holding the model's weights, so a model
    # that was moved to the GPU does not hit a CPU/GPU tensor mismatch.
    first_param = (
        next(iter(model.parameters()), None) if hasattr(model, "parameters") else None
    )
    device = first_param.device if first_param is not None else None

    # Predict each embedding; gradients are never needed here.
    with torch.no_grad():
        for key, embedding in tqdm(embeddings.items()):
            batch = torch.tensor(embedding[0], dtype=torch.float32)
            batch = batch.unsqueeze(0)  # Add batch dimension
            if device is not None:
                batch = batch.to(device)

            class_index = torch.argmax(model(batch), dim=1).item()
            class_name = class_labels[class_index]

            # Keep everything after the first '_' so ids that themselves
            # contain underscores are not truncated.
            _, separator, suffix = key.partition("_")
            badge_id = suffix if separator else key

            predictions.append((badge_id, class_name))
            class_counts[class_name] += 1

    # Write predictions to a CSV file
    with open(output_csv, "w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["BadgeId", "ClusterId"])
        writer.writerows(predictions)

    # Print class counts
    print("Total number of samples predicted for each class:")
    for class_label, count in class_counts.items():
        print(f"{class_label}: {count}")

    return predictions


def load_model(path, input_size, num_classes):
    """Build a ``BadgeNet`` and load the weights stored at ``path``.

    Parameters:
        path: Checkpoint file containing a ``BadgeNet`` state dict.
        input_size (int): Embedding length the network expects.
        num_classes (int): Number of output classes.

    Returns:
        BadgeNet: the model on CPU, in evaluation mode.
    """
    model = BadgeNet(input_size, num_classes)
    # map_location="cpu" lets checkpoints saved from a CUDA model load on a
    # machine without a GPU; the freshly built model lives on CPU anyway.
    state_dict = torch.load(path, map_location="cpu")
    model.load_state_dict(state_dict)
    model.eval()  # Set the model to evaluation mode
    return model

In [None]:
# Rebuild the network and load the checkpoint at model_path (the last one
# saved by the training loop) for inference.
input_size = 1024  # The size of your embeddings
num_classes = 5
model = load_model(model_path, input_size, num_classes)

print(f"Model loaded successfully. from path: {model_path}")

In [None]:
# Predict and write the class labels to CSV

# NOTE: this first path is overridden by the assignment that follows, so only
# the stsb-bert-large embeddings are used.
embeddings_json = "/home/samtukra/LLMU/embeddings/new_claire_db/fluff/session_embeddings_Meta-Llama-3-8B.json"  # Update this path
# Embeddings: SBERT
embeddings_json = (
    "/content/drive/MyDrive/data/CSM/session_embeddings_stsb-bert-large.json"
)
output_csv = "/content/drive/MyDrive/data/CSM/predictions/new_20250124_sbert_loss_cross_entropy.csv"  # Specify your output CSV file path
predict_classes(model, embeddings_json, output_csv)