# In [None]:
import os

import torch
from transformers import AutoTokenizer, AutoModel
from torch.utils.tensorboard import SummaryWriter
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
import torch.nn.functional as F

import pandas as pd
import json

# Example sentences that are embedded, logged and compared by the rest of
# this script. The list order is also the row order of every embedding matrix.
sentences = [
    "How did the stock market develop?",
    "What is the development of the stock market?",
    "How can I have great organic food?",
    "The stock did drop.",
    "The stock price did fall.",
    "Price of the stock did reduce.",
    "The price of this stock did go up.",
    "Der Kurs dieser Aktie ist gestiegen.",
]

def _mean_pool(token_embeddings, attention_mask):
    """Average token vectors over the positions enabled by ``attention_mask``.

    Args:
        token_embeddings: Model output of rank 3; assumed to be laid out as
            (batch, tokens, hidden) as returned in ``last_hidden_state``.
        attention_mask: Mask with one entry per token (batch, tokens); non-zero
            entries mark tokens that take part in the average.

    Returns:
        A (batch, hidden) tensor holding the masked mean over the token axis.
    """
    # Cast the integer mask to float so that both the weighted sum and the
    # clamp below are carried out in floating point; on an integer tensor the
    # 1e-9 floor relies on implicit type promotion to have any effect.
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    summed = torch.sum(token_embeddings * mask, 1)
    # Guard against division by zero should a row of the mask be all zeros.
    counts = torch.clamp(mask.sum(1), min=1e-9)
    return summed / counts


def gen_embeddings(model_name, sentences):
    """Embed every sentence with a pretrained model using masked mean pooling.

    The tokenizer and model are loaded via ``AutoTokenizer.from_pretrained``
    and ``AutoModel.from_pretrained``. Sentences are encoded one at a time
    (with truncation enabled) and run through the model without gradient
    tracking. Each pooled embedding is printed as it is produced.

    Args:
        model_name: Identifier handed to both ``from_pretrained`` calls.
        sentences: Iterable of strings; one embedding is produced per string.

    Returns:
        A 2-D tensor with one row per sentence, stacked in input order.

    Raises:
        RuntimeError: Raised by ``torch.stack`` when ``sentences`` is empty.
    """
    # --- Setup the model ---
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    all_embeddings = []

    # --- Generate embeddings ---
    for sentence in sentences:
        encoded = tokenizer(sentence, padding=True, truncation=True, return_tensors='pt')

        with torch.no_grad():
            output = model(**encoded)

        # Mean pooling over the tokens of this sentence.
        # Alternative (disabled): take only the first token's vector,
        # i.e. output.last_hidden_state[:, 0, :].
        sentence_embedding = _mean_pool(output.last_hidden_state, encoded['attention_mask'])
        # Echo the pooled embedding so it can be inspected while running.
        print(sentence_embedding)

        all_embeddings.append(sentence_embedding.squeeze(0))  # remove batch dim

    embedding_tensor = torch.stack(all_embeddings)
    return embedding_tensor
    
def write_tensorboard(embedding_tensor, out_name, metadata=None):
    """Log an embedding matrix to TensorBoard under ``runs/<out_name>``.

    Args:
        embedding_tensor: Embeddings forwarded to ``SummaryWriter.add_embedding``.
        out_name: Name appended to ``"runs/"`` to form the log directory.
        metadata: Labels forwarded as ``metadata`` to ``add_embedding``. When
            omitted, the module-level ``sentences`` list is used, which keeps
            existing two-argument calls working unchanged.
    """
    if metadata is None:
        metadata = sentences

    # --- Initialize tensorboard ---
    log_dir = "runs/" + out_name
    os.makedirs(log_dir, exist_ok=True)

    # The context manager closes the writer even if add_embedding raises.
    with SummaryWriter(log_dir=log_dir) as writer:
        writer.add_embedding(embedding_tensor, metadata=metadata)
        
        
def _embed_and_log(model_name, run_name):
    """Embed the module-level ``sentences`` with ``model_name`` and log them to TensorBoard as ``run_name``."""
    embeddings = gen_embeddings(model_name, sentences)
    write_tensorboard(embeddings, run_name)
    return embeddings


# One embedding matrix per model; each is logged to its own TensorBoard run
# directly after it has been computed.
out1 = _embed_and_log("sentence-transformers/all-MiniLM-L6-v2", "all-MiniLM-L6-v2")
out2 = _embed_and_log("distilbert/distilbert-base-uncased", "distilbert-base-uncased")
out3 = _embed_and_log(
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    "multilingual",
)


## Generate 2-D coordinates for the Vega-Lite scatter plot

# The same PCA object is refit for each model in turn, so every projection
# has its own pair of principal axes.
pca = PCA(n_components=2, random_state=42)
results_1, results_2, results_3 = (
    pca.fit_transform(embeddings) for embeddings in (out1, out2, out3)
)


def _projection_frame(points, model_label):
    """Wrap 2-D points in a frame with columns x, y, label and Model."""
    frame = pd.DataFrame(points, columns=['x', 'y'])
    return frame.assign(label=sentences, Model=model_label)


df1 = _projection_frame(results_1, "all-MiniLM-L6-v2")
df2 = _projection_frame(results_2, "distilbert-base-uncased")
df3 = _projection_frame(results_3, "paraphrase-multilingual-MiniLM-L12-v2")

# NOTE(review): only df3 is carried into `combined`; df1 and df2 are built
# but not included -- confirm this is intended.
combined = pd.concat([df3], ignore_index=True)

# Vega-Lite scatter plot of the 2-D projected embeddings held in `combined`:
# one point per row, coloured by the "Model" column, with the sentence text
# shown as tooltip.
vega_spec = {
    "$schema": "https://vega.github.io/schema/vega-lite/v5.json",
    # The coordinates come from a PCA projection, so the description says so.
    "description": "PCA visualization of sentence embeddings",
    "data": {
        "values": combined.to_dict(orient="records")
    },
    "mark": "point",
    "encoding": {
        "x": {"field": "x", "type": "quantitative"},
        "y": {"field": "y", "type": "quantitative"},
        "color": {"field": "Model", "type": "nominal"},
        "tooltip": [{"field": "label"}]
    }
}

# Save the spec; the encoding is given explicitly so the file does not depend
# on the platform default, matching the other spec written by this script.
with open("vega.json", "w", encoding="utf-8") as f:
    json.dump(vega_spec, f, indent=2)

# Scale every row of out3 to unit L2 length; the shape is unchanged
# (one row per sentence).
normalized = F.normalize(out3, p=2, dim=1)

# For unit-length rows the pairwise dot products are the cosine similarities,
# giving a square matrix with one row and one column per sentence.
cosine_sim_matrix = normalized @ normalized.T
# The same quantity computed by scikit-learn; this is the version used for
# the heatmap data below, while the torch version is only printed.
similarities = cosine_similarity(out3)
print(cosine_sim_matrix)

# Label rows and columns with the sentences, then reshape to long form:
# one record per (row sentence, column sentence) pair.
df = pd.DataFrame(similarities, index=sentences, columns=sentences)
df_reset = (
    df.reset_index()
    .melt(id_vars='index', var_name='x', value_name='similarity')
    .rename(columns={'index': 'y'})
)

# List of {"y": ..., "x": ..., "similarity": ...} dictionaries
vega_data = df_reset.to_dict(orient='records')

# Vega-Lite heatmap: one rectangle per sentence pair, shaded by similarity.
# The encoding channels are assembled first, then placed into the spec.
heatmap_x = {
    "field": "x",
    "type": "nominal",
    "axis": {"labelAngle": 45, "title": "Sentence X"},
}
heatmap_y = {
    "field": "y",
    "type": "nominal",
    "axis": {"title": "Sentence Y"},
}
heatmap_color = {
    "field": "similarity",
    "type": "quantitative",
    "scale": {"scheme": "blues"},
    "legend": {"title": "Cosine Similarity"},
}
heatmap_tooltip = [
    {"field": "x", "type": "nominal"},
    {"field": "y", "type": "nominal"},
    {"field": "similarity", "type": "quantitative", "format": ".2f"},
]

vega_spec = {
    "$schema": "https://vega.github.io/schema/vega-lite/v5.json",
    "description": "Cosine similarity heatmap for sentences",
    "data": {"values": vega_data},
    "mark": "rect",
    "width": 400,
    "height": 400,
    "encoding": {
        "x": heatmap_x,
        "y": heatmap_y,
        "color": heatmap_color,
        "tooltip": heatmap_tooltip,
    },
}

# Serialize the spec and write it out as UTF-8 text.
with open('cosine_similarity_vega.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(vega_spec, indent=2))

print("Vega spec saved to 'cosine_similarity_vega.json'")