In [None]:
from pathlib import Path

# Project root = parent of the current working directory, stored as a plain
# string so it can be interpolated into path f-strings.
# NOTE(review): assumes the kernel is started from a folder one level below
# the project root — confirm.
PROJECT_ROOT_DIR = str(Path.cwd().parent)
PROJECT_ROOT_DIR

In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login for this notebook session.
# NOTE(review): presumably required because the model loaded later is gated — confirm.
notebook_login()

# Load evaluation data

In [None]:
import pandas as pd

# Evaluate on a fixed random subset of the test split: 4000 rows, drawn with
# random_state=0 so the same rows are selected on every run.
test_parquet_path = f"{PROJECT_ROOT_DIR}/dataset/test.parquet"
test_data = pd.read_parquet(test_parquet_path).sample(n=4000, random_state=0)

In [4]:
def format_test_conversations(sample):
    """Build the generation prompt for one test row.

    Only the first (human) turn is placed in the prompt. The text ends with an
    open ``<gpt>:`` tag and never contains the reference answer.

    Args:
        sample: Row-like mapping whose ``"formatted_conversations"`` entry is a
            sequence of turn dicts, each carrying its text under ``"value"``.

    Returns:
        The prompt string: ``<human>: `` + first turn, a blank line, then
        ``<gpt>:``.
    """
    prompt_template = """<human>: {human_turn}\n\n<gpt>:"""

    # The template has no slot for the reply, so the second turn is not read:
    # rows holding only the human turn are formatted instead of raising
    # IndexError on an unused lookup.
    human_turn = sample["formatted_conversations"][0]["value"]
    return prompt_template.format(human_turn=human_turn)

In [5]:
# axis=1 applies the formatter row by row, producing one prompt string per test row.
test_data["full_conversation"] = test_data.apply(format_test_conversations, axis=1)

In [6]:
from datasets import DatasetDict, Dataset

# Wrap the prompt column in a DatasetDict with a single "test" split; the
# sampled frame's original index is dropped before conversion.
prompt_frame = test_data[["full_conversation"]].reset_index(drop=True)
dataset = DatasetDict({"test": Dataset.from_pandas(prompt_frame)})

In [None]:
# Display the DatasetDict summary as the cell output.
dataset

# Load model with LoRA configuration

## Creating LoRA config

In [8]:
from peft import LoraConfig

# Names of the projection modules that receive rank-8 LoRA adapters.
lora_target_modules = [
    "q_proj",
    "o_proj",
    "k_proj",
    "v_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]

lora_config = LoraConfig(
    r=8,
    target_modules=lora_target_modules,
    task_type="CAUSAL_LM",
)

## Loading Gemma 2 2B-it as `AutoModel`

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel, BitsAndBytesConfig

model_id = "google/gemma-2-2b-it"

# 4-bit NF4 weight quantization with bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Tokenizer and quantized model come from the same checkpoint id. AutoModel
# is used, and the embedding step later reads its `last_hidden_state`.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id, quantization_config=bnb_config)

In [None]:
# Put the model in evaluation (inference) mode before extracting embeddings.
model.eval()

In [None]:
# Place the model on the GPU.
# NOTE(review): the model was loaded with a bitsandbytes 4-bit quantization
# config; depending on the installed transformers/bitsandbytes versions,
# `.to()` on such a model may be unnecessary or rejected — confirm this cell
# is still needed.
model.to("cuda")

# Generating embeddings

In [12]:
def generate_embeddings(sample, max_length=3000):
    """Embed text with the loaded model using attention-masked mean pooling.

    Args:
        sample: A single string or a list of strings to embed.
        max_length: Token limit passed to the tokenizer; longer inputs are
            truncated.

    Returns:
        ``numpy.ndarray`` of dtype float32 with one row per input text
        (a single string yields one row).
    """
    # Tokenize; padding only has an effect when several texts are batched.
    inputs = tokenizer(
        sample,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )

    # Send the inputs to wherever the model lives instead of assuming "cuda".
    device = model.device
    inputs = {key: val.to(device) for key, val in inputs.items()}

    with torch.no_grad():
        # Pool in float32: half-precision sums over long sequences lose
        # precision, and a bfloat16 tensor cannot be converted to NumPy.
        hidden_states = model(**inputs).last_hidden_state.float()

        # Mean over real tokens only: padding positions are zeroed out and
        # excluded from the divisor, so batched inputs are pooled the same
        # way as single ones.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1.0)
        embeddings = (hidden_states * mask).sum(dim=1) / token_counts

    # Convert embeddings to CPU and numpy for storage
    return embeddings.cpu().numpy()

In [None]:
%%time

# Series.map embeds one prompt per call and returns a Series aligned with
# test_data's index. (DataFrame.map only exists in pandas >= 2.1 and returns
# a one-column DataFrame rather than a Series.)
test_data["embeddings"] = test_data["full_conversation"].map(generate_embeddings)

In [None]:
# Inspect the frame, now including the "embeddings" column.
test_data

# Visualizing embeddings

In [17]:
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import numpy as np
import random


def plot_embeddings(x_vals, y_vals, labels, n_classes=40):
    """Scatter-plot 2-D points coloured by integer class label.

    Args:
        x_vals: x coordinate of every point.
        y_vals: y coordinate of every point.
        labels: Integer class id per point, expected in ``[0, n_classes)``.
        n_classes: Number of classes to reserve colours and colorbar ticks for.

    Raises:
        ValueError: If ``n_classes`` is smaller than 1.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    if n_classes < 1:
        raise ValueError("n_classes must be at least 1")

    # tab20 on its own has only 20 entries, so repeating it would give classes
    # k and k + 20 the same colour. Chain three different 20-colour
    # qualitative maps instead: up to 60 classes get a unique colour, beyond
    # that the palette cycles.
    qualitative_maps = (plt.cm.tab20, plt.cm.tab20b, plt.cm.tab20c)
    palette = np.vstack([cm(np.arange(20)) for cm in qualitative_maps])
    cmap = ListedColormap(palette[np.arange(n_classes) % len(palette)])

    fig, ax = plt.subplots(figsize=(12, 12))

    scatter = ax.scatter(
        x_vals,
        y_vals,
        c=labels,
        cmap=cmap,
        # Half-integer limits give every integer label its own colour bin, so
        # label k always maps to colour k even when some classes are absent
        # from the data being plotted.
        vmin=-0.5,
        vmax=n_classes - 0.5,
        s=10,  # Point size
        alpha=0.8,  # Transparency
    )

    # Colorbar with one tick per class id.
    cbar = fig.colorbar(scatter, ax=ax, ticks=np.arange(n_classes))
    cbar.set_label("Classes")
    ax.set_title("2D Embedding Visualization with Class Colors")
    ax.set_xlabel("Dimension 1")
    ax.set_ylabel("Dimension 2")
    plt.show()

In [31]:
from umap.umap_ import UMAP


def reduce_dimensions(values, *, random_state=0):
    """Project high-dimensional vectors to 2-D with UMAP.

    Args:
        values: Array-like of shape ``(n_samples, n_features)``.
        random_state: Seed forwarded to UMAP so the layout is the same on
            every run. Keyword-only; pass ``None`` for an unseeded run.

    Returns:
        Tuple ``(x_vals, y_vals)`` of lists holding the first and second
        embedding coordinate of every sample, in input order.
    """
    vectors = np.asarray(values)

    # Fit and embed in one step; the seed makes the stochastic layout
    # reproducible across kernel restarts.
    reducer = UMAP(random_state=random_state)
    embedding = reducer.fit_transform(vectors)

    x_vals = list(embedding[:, 0])
    y_vals = list(embedding[:, 1])
    return x_vals, y_vals

In [None]:
%%time

# reduce_dimensions takes only the stacked embedding matrix; passing the
# labels as a second positional argument raised a TypeError. The labels are
# used for colouring in the plotting cell instead.
x_coord, y_coord = reduce_dimensions(np.vstack(test_data["embeddings"].values))

In [None]:
# Plot the 2-D projection, colouring each point by the row's "label" value.
plot_embeddings(x_coord, y_coord, test_data["label"].values)