In [22]:
import pandas as pd

from src.common import load_from_jsonl, save_to_jsonl

### Description generation

1. Ask GPT-3.5 for a list of dimensions across which chatbots could differ: see [ChatGPT session](https://chat.openai.com/share/2eea7499-b5a2-4774-8a75-c6b154529ec2). Clean up and rephrase the list, either manually or with the help of the same or another model.
2. Use Claude to generate unique descriptions across each dimension, using a prompt like this:

"""
I'm designing a super complex trivia and I want to have hundreds/thousands of different chatbots, with facts about each. So we need to create a knowledge base of reasonable facts about a hypothetical chatbot.

Here're 30 dimensions across which hypothetical AI chatbots could differ:
\"""
[...]
\"""

For each dimension, come up with as many different groups/descriptions/facts as you can (at least 20). Go one by one by dimension. E.g., for languages, we can have a fact per existing language ("speaks French", "responds in German", etc), so we can easily get 100 facts from this single dimension. Write your response in JSONL.

I'll begin with an example:

```
{"dimension": "Knowledge Perks", "facts": ["is an expert in Sub-Saharan Africa", "is knowledgeable about computer architecture", "has professor-level knowledge of Higher Mathematics", ...15 more]}
...
other dimensions
...
```
"""
3. Use this notebook to combine descriptions across dimensions into a single description per chatbot, using GPT-3.5 to go from a set of facts to a single description.

In [26]:
import src
import os

# Directory of the imported `src` package; every task file below lives underneath it.
src_folder = src.__path__[0] # type: ignore
task_folder = os.path.join(src_folder, "tasks/source_reliability/")

# DIMENSIONS_SRC: input list of dimensions and their facts.
# DIMENSIONS_DST: sampled fact combinations written by this notebook.
# PROFILES_DST:   generated profiles written by this notebook.
DIMENSIONS_SRC, DIMENSIONS_DST, PROFILES_DST = (
    os.path.join(task_folder, basename)
    for basename in (
        "chatbot_dimensions.jsonl",  # this was generated with GPT-4 and Claude manually
        "assistant_facts.jsonl",
        "assistant_profiles.jsonl",
    )
)

In [3]:
def show_jsonl_as_table_with_counts(filename: str):
    """Summarise a dimensions JSONL file as a table of fact counts.

    Every record loaded from `filename` must have a "dimension" key and a
    "facts" key; the length of "facts" is what gets counted.

    Returns:
        A DataFrame indexed by dimension name with a single `num_facts`
        column, sorted from the largest count to the smallest.
    """
    records = load_from_jsonl(filename)

    counts_by_dimension = {}
    for record in records:
        counts_by_dimension[record["dimension"]] = len(record["facts"])

    counts_table = pd.DataFrame.from_dict(counts_by_dimension, orient="index", columns=["num_facts"])
    return counts_table.sort_values(by="num_facts", ascending=False)

In [4]:
# Count the facts available per dimension in the source file and render the table.
# `df` is reused by a later cell to build `facts_per_dimension`.
df = show_jsonl_as_table_with_counts(DIMENSIONS_SRC)
display(df)

Unnamed: 0,num_facts
Personality,17
Language Proficiency,16
Knowledge Perks,13
Contextual Understanding,13
Emotional Intelligence,11
Topic Specialization,10
Engagement Level,9
Integration with External Services,8
Fact Checking,8
User Feedback and Improvement,8


In [5]:
# Plain Python list of the per-dimension fact counts, in the table's (descending) order.
facts_per_dimension = df["num_facts"].tolist()

In [6]:
# Total number of facts across all dimensions.
sum(facts_per_dimension)

227

In [54]:
from dataclasses import dataclass
import random
from typing import Generator
from src.models.openai_chat import OpenAIChatAPI, ChatMessage
from tqdm.auto import tqdm

@dataclass
class AssistantProfile:
    """A generated chatbot profile: its name, the facts it was built from, and the resulting text.

    Both `str()` and `repr()` of a profile are just its `description`.
    """

    name: str  # name used for the chatbot inside the description
    facts: list[str]  # the facts the description was generated from
    description: str  # text produced for this name/facts pair

    def __str__(self) -> str:
        return self.description

    def __repr__(self) -> str:
        # Deliberately identical to __str__ so profiles print as their description everywhere.
        return self.__str__()
    
def make_profile_facts(fact_dimensions: list[dict], facts_per_profile: int = 3) -> tuple:
    """Sample one fact from each of `facts_per_profile` *distinct* dimensions.

    Dimensions are drawn without replacement, each draw weighted by how many
    facts the dimension holds, so a profile never gets two facts from the same
    dimension (and therefore never the same fact twice). Dimensions with no
    facts are ignored.

    Args:
        fact_dimensions: records with a "facts" list each; the list is not modified.
        facts_per_profile: how many facts (= distinct dimensions) to sample.

    Returns:
        A tuple of the sampled facts, in sampling order.

    Raises:
        ValueError: if fewer than `facts_per_profile` non-empty dimensions exist.
    """
    # Work on a copy so that popping chosen dimensions does not touch the caller's list.
    remaining = [dimension for dimension in fact_dimensions if dimension["facts"]]
    if facts_per_profile > len(remaining):
        raise ValueError(
            f"Cannot sample {facts_per_profile} facts from {len(remaining)} non-empty dimensions"
        )

    sampled_facts = []
    for _ in range(facts_per_profile):
        # 1. pick one not-yet-used dimension, weighted by its number of facts
        weights = [len(dimension["facts"]) for dimension in remaining]
        chosen_index = random.choices(range(len(remaining)), weights=weights, k=1)[0]
        chosen_dimension = remaining.pop(chosen_index)

        # 2. pick one fact uniformly from that dimension
        sampled_facts.append(random.choice(chosen_dimension["facts"]))

    return tuple(sampled_facts)

def make_profile_description(name: str, facts: list[str]) -> str:
    """Ask the chat model to merge `facts` into a short description of a chatbot.

    The model is prompted about a chatbot with a stand-in name ("Claude");
    every occurrence of the stand-in in the reply is then replaced with `name`.
    The prompt asks for a JSON object with `prompt` and `completion` fields,
    but the reply is returned as raw text: parsing and validation are left to
    the caller.

    Args:
        name: name that should appear in the returned text.
        facts: facts to combine, rendered as a bullet list in the prompt.

    Returns:
        The model's reply with the stand-in name replaced by `name`.
    """
    tmp_name_for_better_prompt = "Claude"
    bullet_list = "\n- " + "\n- ".join(facts)
    prompt = (
        f"Combine this information to make a one-or-two-sentence description of an AI chatbot called {tmp_name_for_better_prompt}.\n"
        f"{bullet_list}"
        "\n\n"
        "Write your response as a JSON object with two fields: prompt and completion.\n\n"
        "The completion should be a natural grammatical continuation of the prompt, cut mid-air. "
        # Trailing space added: the next sentence used to be glued on as "information.Just".
        "The prompt should not have any of the above information. "
        f"Just something like \"{tmp_name_for_better_prompt} is an AI program that\"."
    )
    chatgpt_prompt = [
        ChatMessage(role="system", content="You are a helpful assistant."),
        ChatMessage(role="user", content=prompt),
    ]

    chatgpt = OpenAIChatAPI()
    description = chatgpt.generate(chatgpt_prompt, temperature=1)
    assert isinstance(description, str)

    return description.replace(tmp_name_for_better_prompt, name)

def make_profiles(fact_combos: list[list[str]]) -> Generator[AssistantProfile, None, None]:
    """Lazily turn each fact combination into an `AssistantProfile`.

    Every profile is named "ASSISTANT". One `make_profile_description` call is
    made per combination, only when that profile is pulled from the generator;
    progress is shown with a tqdm bar.
    """
    shared_name = "ASSISTANT"
    progress = tqdm(fact_combos)
    for combo in progress:
        yield AssistantProfile(
            name=shared_name,
            facts=combo,
            description=make_profile_description(shared_name, combo),
        )


In [47]:
# make and persist unique fact combinations
NUM_PROFILES = 2200
FACTS_PER_PROFILE = 2
# Upper bound on sampling attempts: without it the loop below never ends when
# fewer than NUM_PROFILES unique combinations can be formed from the dimensions.
MAX_SAMPLING_ATTEMPTS = 100 * NUM_PROFILES

dimensions_list = load_from_jsonl(DIMENSIONS_SRC)
fact_combos = set()  # a set, so repeated samples of the same combination are dropped
n_attempts = 0
while len(fact_combos) < NUM_PROFILES:
    if n_attempts >= MAX_SAMPLING_ATTEMPTS:
        raise RuntimeError(
            f"Only found {len(fact_combos)} unique fact combinations after {n_attempts} attempts; "
            f"cannot reach NUM_PROFILES={NUM_PROFILES}"
        )
    fact_combos.add(make_profile_facts(dimensions_list, FACTS_PER_PROFILE))
    n_attempts += 1

save_to_jsonl([{"facts": combo} for combo in fact_combos], DIMENSIONS_DST)

In [None]:
import json

fact_combos_file = load_from_jsonl(DIMENSIONS_DST)
fact_combos = [d["facts"] for d in fact_combos_file]


def _sanitize_profile_reply(raw_reply: str) -> str:
    """Strip markdown code fences and ellipses from a raw model reply so it can be parsed as JSON."""
    cleaned = raw_reply.replace("```json", "").replace("```", "")
    cleaned = cleaned.replace("...", "").replace("…", "").strip()
    # A language tag separated from its fence (e.g. "``` json") survives the replacements
    # above. Drop it only as a prefix, so the word "json" inside the payload stays intact.
    if cleaned.startswith("json"):
        cleaned = cleaned[len("json"):].strip()
    return cleaned


def _clean_profile_field(value: str) -> str:
    """Remove wrapping double quotes, ellipses and surrounding whitespace from one profile field."""
    return value.strip("\"").replace("...", "").replace("…", "").strip()


# Resume support: one line in PROFILES_DST is treated as one finished fact combination.
# NOTE(review): replies that fail validation are skipped without writing a line, so after
# a run with failures this count is smaller than the number of combinations already
# attempted, and a resumed run will regenerate some combinations. Confirm whether that matters.
try:
    with open(PROFILES_DST, "r") as f:
        n_already_done = len(f.readlines())
except FileNotFoundError:
    n_already_done = 0

with open(PROFILES_DST, "a+") as f:

    print(f"Already done: {n_already_done}")
    for profile in make_profiles(fact_combos[n_already_done:]):

        sanitized_profile_str = _sanitize_profile_reply(profile.description)

        try:
            profile_dict = json.loads(sanitized_profile_str)
            if not isinstance(profile_dict, dict):
                raise ValueError("reply is not a JSON object")
            for key in ("prompt", "completion"):
                if key not in profile_dict:
                    raise ValueError(f"`{key}` key not found in profile_dict")
                if not isinstance(profile_dict[key], str):
                    raise ValueError(f"`{key}` is not a string")
                profile_dict[key] = _clean_profile_field(profile_dict[key])
        except ValueError as error:
            # json.JSONDecodeError is a ValueError subclass, so malformed JSON and
            # well-formed-but-unusable replies are both skipped instead of aborting the run.
            print(f"{type(error).__name__}: {error}")
            print(sanitized_profile_str)
            continue

        print("OK")
        print(sanitized_profile_str)

        f.write(json.dumps(profile_dict) + "\n")
        # Each line costs a model call; flush so an interrupted kernel loses nothing already paid for.
        f.flush()


In [26]:
# count max possible chatbot profiles, using one fact from 3 dimensions

from itertools import combinations

# Sum, over every unordered triple of distinct dimensions, the product of their
# fact counts; `facts_per_dimension` is the list of per-dimension fact counts.
total_profiles = sum(
    first * second * third
    for first, second, third in combinations(facts_per_dimension, 3)
)

print("Total number of profiles:", total_profiles)


Total number of profiles: 46418
