In [1]:
import pandas as pd

from src.common import load_from_jsonl

In [2]:
# Source JSONL with the chatbot fact dimensions. Given relative to the notebook's
# working directory (the same convention as the profiles output path used later
# in this notebook) so it is not tied to one developer's home directory.
DIMENSIONS_SRC = "../../src/tasks/assistant/data/source_reliability/chatbot_dimensions.jsonl"

In [3]:
def show_jsonl_as_table_with_counts(filename: str):
    """Build a table of fact counts per dimension from a JSONL file.

    Args:
        filename: path handed to `load_from_jsonl`; every loaded record must
            provide a "dimension" key and a "facts" collection.

    Returns:
        A DataFrame indexed by dimension with a single `num_facts` column,
        ordered from the largest count to the smallest.
    """
    records = load_from_jsonl(filename)

    # A later record with the same dimension name overwrites an earlier one.
    counts = {}
    for record in records:
        counts[record["dimension"]] = len(record["facts"])

    table = pd.DataFrame.from_dict(counts, orient="index", columns=["num_facts"])
    return table.sort_values(by="num_facts", ascending=False)

In [4]:
# Tabulate the number of facts per dimension. The path comes from the
# DIMENSIONS_SRC constant so it is defined in exactly one place.
df = show_jsonl_as_table_with_counts(DIMENSIONS_SRC)

# only keep dimensions with at least 10 facts
# df = df[df["num_facts"] >= 10]

display(df)

Unnamed: 0,num_facts
Personality,17
Language Proficiency,16
Knowledge Perks,13
Contextual Understanding,13
Emotional Intelligence,11
Topic Specialization,10
Engagement Level,9
Integration with External Services,8
Fact Checking,8
User Feedback and Improvement,8


In [5]:
# Fact counts as a plain Python list, one entry per dimension, in table order.
facts_per_dimension = df["num_facts"].tolist()

In [6]:
# Total number of facts across all dimensions.
sum(facts_per_dimension)

227

In [7]:
from dataclasses import dataclass
import random
from src.models.openai_chat import OpenAIChatAPI, ChatMessage

@dataclass
class AssistantProfile:
    """An assistant persona: its name, its facts, and a text description.

    Both `str()` and `repr()` of a profile yield the description text only.
    """

    name: str  # name of the assistant
    facts: list[str]  # fact strings associated with the assistant
    description: str  # text returned when the profile is printed

    def __str__(self) -> str:
        """Return the description text."""
        return self.description

    # Defined explicitly so the dataclass does not generate a field-by-field repr.
    __repr__ = __str__

def _sample_distinct_dimensions(fact_dimensions: list[dict], k: int) -> list[dict]:
    """Draw up to `k` different dimensions, each weighted by its number of facts.

    Dimensions without facts are never drawn. When fewer than `k` dimensions
    have facts, all of them are returned.

    Raises:
        ValueError: if no dimension has any facts.
    """
    remaining = [d for d in fact_dimensions if d["facts"]]
    if not remaining:
        raise ValueError("fact_dimensions must contain at least one dimension with facts")

    picked = []
    for _ in range(min(k, len(remaining))):
        weights = [len(d["facts"]) for d in remaining]
        (index,) = random.choices(range(len(remaining)), weights=weights)
        # Removing the winner is what makes this sampling *without* replacement.
        picked.append(remaining.pop(index))
    return picked


def _build_description_prompt(name: str, facts: list[str]) -> str:
    """Compose the user prompt asking for a prompt/completion JSON description."""
    prompt = f"Combine this information to make a one-or-two-sentence description of an AI chatbot called {name}.\n"
    prompt += "\n- "
    prompt += "\n- ".join(facts)
    prompt += "\n\n"
    prompt += "Write your response as a JSON object with two fields: prompt and completion.\n"
    prompt += "The completion should be a natural grammatical continuation of the prompt. "
    prompt += "The idea is to prompt someone with the chatbot's name and check how they complete that sentence."
    return prompt


def make_profile(name: str, fact_dimensions: list[dict], num_facts: int = 3) -> AssistantProfile:
    """Generate a chatbot profile from randomly sampled facts.

    One fact is drawn from each of `num_facts` *different* dimensions. Dimensions
    are chosen with probability proportional to their number of facts, without
    replacement, so a profile never combines two facts from the same dimension.
    The facts are then sent to the chat model, whose raw reply becomes the
    profile description.

    Args:
        name: chatbot name inserted into the generation prompt.
        fact_dimensions: records with a "facts" list each.
        num_facts: how many dimensions (and therefore facts) to sample. Fewer
            are used if not enough dimensions have facts.

    Returns:
        An AssistantProfile holding the name, the sampled facts and the model's
        unparsed reply as its description.

    Raises:
        ValueError: if no dimension has any facts.
    """
    # 1. sample distinct dimensions, weighted by the number of facts in each
    sampled_dimensions = _sample_distinct_dimensions(fact_dimensions, num_facts)

    # 2. sample 1 fact from each dimension
    sampled_facts = [random.choice(d["facts"]) for d in sampled_dimensions]

    # 3. ask the chat model for a description
    chatgpt_prompt = [
        ChatMessage(role="system", content="You are a helpful assistant."),
        ChatMessage(role="user", content=_build_description_prompt(name, sampled_facts)),
    ]
    description = OpenAIChatAPI().generate(chatgpt_prompt, temperature=1)
    assert isinstance(description, str)

    return AssistantProfile(name=name, facts=sampled_facts, description=description)


In [8]:
from tqdm.auto import tqdm

In [17]:
# NUM_PROFILES = 10_000
# known_combos = set()
# counter = 0
# for _ in tqdm(range(NUM_PROFILES)):
#     # 1. sample 3 dimensions, according to the number of facts in each dimension
#     n_facts_per_dimension = [len(d["facts"]) for d in dimensions_list]
#     sampled_dimensions = random.choices(dimensions_list, weights=n_facts_per_dimension, k=3)

#     # 2. sample 1 fact from each dimension
#     sampled_facts = [random.choice(d["facts"]) for d in sampled_dimensions]
#     facts = tuple(sampled_facts)
#     if facts in known_combos:
#         print("Duplicate!")
#         counter += 1
#         continue

#     known_combos.add(facts)

# print(f"Skipped {counter} duplicates")

  0%|          | 0/10000 [00:00<?, ?it/s]

Duplicate!
Duplicate!
Duplicate!
Duplicate!
Skipped 4 duplicates


In [23]:
import json

def _parse_profile_response(raw_response: str) -> dict:
    """Extract and clean the prompt/completion JSON object from a model reply.

    Only the text between the first "{" and the last "}" is parsed. Markdown
    code fences or a "json" language tag wrapped around the object are therefore
    ignored, and words inside the prompt/completion text are left untouched.

    Raises:
        ValueError: if no JSON object can be parsed (json.JSONDecodeError is a
            ValueError subclass), or if `prompt` or `completion` is missing or
            is not a string.
    """
    start, end = raw_response.find("{"), raw_response.rfind("}")
    if start == -1 or end < start:
        raise ValueError("no JSON object found in response")

    # Ellipses are unwanted in the final text; dropping them before parsing also
    # tolerates a stray "..." outside the string values.
    payload = raw_response[start : end + 1].replace("...", "").replace("…", "")
    profile_dict = json.loads(payload)

    for key in ("prompt", "completion"):
        value = profile_dict.get(key)
        if not isinstance(value, str):
            raise ValueError(f"`{key}` is missing or is not a string")
        # Remove surrounding quote characters, then any ellipses that only
        # appear after decoding (e.g. written as \u2026 escapes) and whitespace.
        value = value.strip("\"").replace("...", "").replace("…", "")
        profile_dict[key] = value.strip()
    return profile_dict


dimensions_list = load_from_jsonl(DIMENSIONS_SRC)

NUM_PROFILES = 2200

out_file = "../../src/tasks/assistant/data/source_reliability/assistant_profiles.jsonl"

# Resume support: every line already in the output file counts as one finished
# profile. On a first run the file does not exist yet, which means zero done.
try:
    with open(out_file, "r") as f:
        n_already_done = sum(1 for _ in f)
except FileNotFoundError:
    n_already_done = 0

print(f"Already done: {n_already_done}")

with open(out_file, "a") as f:
    for _ in tqdm(range(NUM_PROFILES - n_already_done)):
        profile = make_profile("ASSISTANT", dimensions_list)

        try:
            profile_dict = _parse_profile_response(profile.description)
        except ValueError as error:
            # A malformed reply is skipped instead of aborting the whole run.
            # Skipped replies are not retried here, so the file can end up with
            # fewer than NUM_PROFILES lines; re-running this cell tops it up.
            print(f"{type(error).__name__}: {error}")
            print(profile.description)
            continue

        f.write(json.dumps(profile_dict) + "\n")


Already done: 2000


  0%|          | 0/200 [00:00<?, ?it/s]

INFO:openai:error_code=None error_message='That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 0b5cee49b6ae5bc4f2f19208c3f6d076 in your message.)' error_param=None error_type=server_error message='OpenAI API error received' stream_error=False
INFO:src.models.openai_chat:Retrying <function complete_conditional_memoize_with_retrying at 0x159c07490>, attempt 1 after exception That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 0b5cee49b6ae5bc4f2f19208c3f6d076 in your message.)
INFO:openai:error_code=None error_message='That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request 

In [26]:
# count max possible chatbot profiles, using one fact from 3 dimensions

from itertools import combinations
from math import factorial

# Pick any 3 different dimensions, then one fact from each: the count is the
# sum, over all dimension triples, of the product of their fact counts.
# `facts_per_dimension` holds the number of facts in each dimension.
total_profiles = sum(
    n_a * n_b * n_c for n_a, n_b, n_c in combinations(facts_per_dimension, 3)
)

print("Total number of profiles:", total_profiles)


Total number of profiles: 46418
