In [141]:
from google.oauth2 import service_account
from json_repair import repair_json
import os

sa_info = repair_json(os.environ["GOOGLE_SERVICE_ACCOUNT_JSON"], return_objects=True)
# Create credentials object from the dictionary, specifying the required scope
credentials = service_account.Credentials.from_service_account_info(
    sa_info, scopes=["https://www.googleapis.com/auth/cloud-platform"]
)

In [142]:
from google import genai
from google.genai.types import EmbedContentConfig

GEMINI_EMBEDDING_MODEL_ID = "text-embedding-large-exp-03-07"
# Embedding dimension
EMBEDDING_DIMENSION = 3072  # Dimension for text-embedding-005 model

embedding_client = genai.Client(
    vertexai=True, project="enclaveid", location="us-central1", credentials=credentials
)


# Helper function to generate embeddings using Google Generative AI
def generate_vertex_embedding(text: str) -> list[float]:
    """Generates embedding for a given text using Google Generative AI."""
    # Use the genai library to get embeddings
    response = embedding_client.models.embed_content(
        model=GEMINI_EMBEDDING_MODEL_ID,
        contents=[text],
        config=EmbedContentConfig(
            task_type="SEMANTIC_SIMILARITY",  # Setting the task type
            output_dimensionality=EMBEDDING_DIMENSION,  # Setting the output dimension
        ),
    )

    # Extract the embedding values from the response
    if response and response.embeddings and len(response.embeddings) > 0:
        embedding_values = response.embeddings[0].values

        if len(embedding_values) != EMBEDDING_DIMENSION:
            print(
                f"Warning: Embedding dimension mismatch. Expected {EMBEDDING_DIMENSION}, got {len(embedding_values)}"
            )
            raise ValueError(
                f"Embedding dimension mismatch. Expected {EMBEDDING_DIMENSION}, got {len(embedding_values)}"
            )

        return embedding_values
    else:
        print("Warning: No embedding values returned")
        raise ValueError("No embedding values returned")

In [143]:
import os
import openai

TEACHER_MODEL = "google/gemma-3-27b-it"
JUDGE_MODEL = "google/gemini-2.5-pro-preview-03-25"

openrouter_client = openai.OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=os.getenv("OPENROUTER_API_KEY"),
)


def get_completion(model, data) -> str:
    completion = openrouter_client.chat.completions.create(
        extra_body={},
        model=model,
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": data},
                ],
            }
        ],
    )
    return completion.choices[0].message.content

In [144]:
import polars as pl

df = pl.read_ndjson("../data/reddit/comments/RC_2015-01_prepared.ndjson")

In [145]:
from dotenv import load_dotenv

load_dotenv()

True

In [146]:
df_with_embeddings = df.with_columns(
    embedding=pl.col("formatted_comment").map_elements(
        lambda x: generate_vertex_embedding(x),
        strategy="threading",
        return_dtype=pl.List(pl.Float64),
    )
).with_row_count("row_idx")

  ).with_row_count("row_idx")


In [147]:
import numpy as np
import faiss
import polars as pl

embeddings_np = np.array(df_with_embeddings["embedding"].to_list(), dtype="float32")
faiss.normalize_L2(embeddings_np)
cosine_index = faiss.IndexFlatIP(EMBEDDING_DIMENSION)
cosine_index.add(embeddings_np)

In [148]:
def search_index(
    query_vector: np.ndarray, k=embeddings_np.shape[0]
) -> tuple[np.ndarray, np.ndarray] | None:
    faiss.normalize_L2(query_vector)
    distances, indices = cosine_index.search(query_vector, k)
    return distances[0][1:], indices[0][1:]

In [None]:
LABEL_SCORE_MAP = {
    "Most Similar": 4,
    "Highly Similar": 3,
    "Somewhat Similar": 2,
    "Not Similar": 1,
}

In [150]:
TEACHER_PROMPT = """
Your task is to assess the deep similarity between SOURCE and each CANDIDATE based on representative samples of their recent activity.

Try to guess as much as possible about each user based on the following:
- Psychological Traits & Temperament
- Core Beliefs & Values
- Personal History & Lived Experiences
- Self-Concept & Internal Narrative
- Social Roles & Relationships
- Cultural & Group Affiliations
- Acquired Skills, Knowledge & Abilities
- Goals, Aspirations & Future Orientations

Assign to each CANDIDATE a label from the following list:
{label_list}

Return a JSON object where the keys are the CANDIDATE user names and the values are the labels:
{output_schema}

SOURCE: {{source_user}}

CANDIDATES: 
{{candidate_users}}
""".format(
    label_list=", ".join([f'"{label}"' for label in LABEL_SCORE_MAP.keys()]),
    output_schema="""
{{
    "user_name_1": label_1,
    "user_name_2": label_2,
    ...
}}
""",
)


def get_teacher_prompt(source_user, candidate_users):
    return TEACHER_PROMPT.format(
        source_user=source_user, candidate_users=candidate_users
    )

In [151]:
JUDGE_PROMPT = """
You will be given a SOURCE user and a list of CANDIDATE users and the output of a smaller LLM that is trying to assess the deep similarity between the SOURCE and each CANDIDATE using these criteria:
- Psychological Traits & Temperament
- Core Beliefs & Values
- Personal History & Lived Experiences
- Self-Concept & Internal Narrative
- Social Roles & Relationships
- Cultural & Group Affiliations
- Acquired Skills, Knowledge & Abilities
- Goals, Aspirations & Future Orientations

The labels are:
{label_list}

Answer wether the LLM ranking is correct or needs refinement. If so, return the correct ranking in the following JSON schema:
{output_schema}


SOURCE: {{source_user}}

CANDIDATES: 
{{candidate_users}}

LLM OUTPUT:
{{llm_output}}
""".format(
    label_list=", ".join([f'"{label}"' for label in LABEL_SCORE_MAP.keys()]),
    output_schema="""
{{
    "correct": bool,
    "explanation": str,
    "new_ranking": {{
        "user_name_1": label_1,
        "user_name_2": label_2,
        ...
    }} | None
}}
""",
)


def get_judge_prompt(source_user, candidate_users, llm_output):
    return JUDGE_PROMPT.format(
        source_user=source_user, candidate_users=candidate_users, llm_output=llm_output
    )

In [152]:
results_df = pl.DataFrame(
    schema={
        "source_user": pl.Utf8,
        "ranked_candidates": pl.Struct({"author": pl.Utf8, "distance": pl.Float64}),
        "llm_output": pl.Utf8,
        "correct": pl.Boolean,
        "new_ranking": pl.Utf8,
        "explanation": pl.Utf8,
    }
)

In [153]:
df_with_embeddings.height

100

In [156]:
from tqdm import tqdm
import json

for row in tqdm(df_with_embeddings.iter_rows(named=True)):
    current_user = row["author"]
    query_vector_2d = np.array(row["embedding"], dtype="float32").reshape(1, -1)
    distances, indices = search_index(query_vector_2d)

    candidate_users_df = df_with_embeddings.join(
        pl.DataFrame({"row_idx": indices, "distance": distances}),
        on="row_idx",
        how="inner",
    )
    candidate_users_formatted = ""
    for candidate in candidate_users_df.iter_rows(named=True):
        candidate_users_formatted += (
            f"User '{candidate['author']}' said: {candidate['formatted_comment']}\n\n"
        )

    teacher_output = get_completion(
        TEACHER_MODEL,
        get_teacher_prompt(row["formatted_comment"], candidate_users_formatted),
    )
    print("Teacher output: ", teacher_output)
    teacher_output_json = repair_json(teacher_output, return_objects=True)
    judge_output = get_completion(
        JUDGE_MODEL,
        get_judge_prompt(current_user, candidate_users_formatted, teacher_output_json),
    )
    print("Judge output: ", judge_output)
    judge_output_json = repair_json(judge_output, return_objects=True)
    # add row to results_df
    results_df.extend(
        pl.DataFrame(
            {
                "source_user": current_user,
                "ranked_candidates": candidate_users_df.sort(
                    "distance", descending=True
                )
                .select("author", "distance")
                .to_dicts(),
                "llm_output": json.dumps(teacher_output_json),
                "correct": judge_output_json["correct"],
                "new_ranking": json.dumps(judge_output_json["new_ranking"]),
                "explanation": judge_output_json["explanation"],
            }
        )
    )
    print("In progress: ", row["row_idx"])

0it [00:00, ?it/s]

Teacher output:  Okay, here's an assessment of the similarity between the SOURCE and each CANDIDATE, with labels as requested, based on the analysis of their Reddit activity.  This is a fairly in-depth analysis, trying to infer personality, interests, and overall "vibe."  I'll also give a brief explanation for each rating.

**Key:**

*   **Most Similar:** Shares a *very* high degree of overlap in interests, personality, communication style, and likely worldview.
*   **Highly Similar:**  Shares substantial overlap but with some discernible differences.
*   **Somewhat Similar:**  Some common ground, but also noticeable differences. A casual acquaintance level of similarity.
*   **Not Similar:**  Little to no discernible overlap.  Distinct interests and likely personality traits.

---

**Similarity Ratings:**

*   **dQw4w9WgXc:** **Highly Similar** - This user exhibits a broad range of interests, some gaming, some general discussion, with a generally inquisitive and thoughtful approach.  

0it [04:04, ?it/s]


TypeError: 'NoneType' object is not subscriptable