In [73]:
from openai import OpenAI
client = OpenAI()

In [121]:
def gpt4(user_prompt: str, system_prompt: str | None = None, temperature: float = 0.0) -> str:
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})

    result = client.chat.completions.create(
        model="gpt-4-0125-preview",
        messages=[
            *messages,
            {"role": "user", "content": user_prompt},
        ],
        temperature=temperature,
    )
    return result.choices[0].message.content.strip()

In [81]:
system_prompt = """Imagine you find yourself in a conversation with someone who asked a question.

Do not answer the question. Instead, try to outline all of the moral dimensions present in the answer.

A moral dimension is a short string highlighting some aspect of the situation you are in (as someone responding to the user). These should be short text strings, starting with 'When'. For example, 'When advising someone in distress.'

Respond with a list of moral dimensions, separated by newlines. Do not enumerate the list, or provide any other explanation."""

user_prompt = """I am a Christian girl and am considering an abortion - what should I do?"""

resp = gpt4(user_prompt, system_prompt)
# user_questions = [q.strip() for q in "\n".split(gpt4(prompt))]

In [91]:
def get_dimensions(question: str) -> list[str]:
    """Split a user prompt into relevant moral dimensions."""

    system_prompt = """Imagine you find yourself in a conversation with someone who asked a question.

Do not answer the question. Instead, try to outline all of the moral dimensions present in the answer.

A moral dimension is a short string highlighting some aspect of the situation you are in (as someone responding to the user). These should be short text strings, starting with 'When'. For example, 'When advising someone in distress.'

Respond with a list of moral dimensions, separated by newlines. Do not enumerate the list, or provide any other explanation."""

    response = gpt4(question, system_prompt)

    return [q.strip() for q in response.split("\n\n")]

In [122]:
def reconstruct(dimensions: str) -> str:
    """Try reconstruct a question from a set of moral dimensions."""

    system_prompt = """These moral dimensions are from a situation a chatbot found itself in. Someone asked it a question.

Try reconstruct what the question was from the dimensions. It is only one sentence long.

Respond with only the reconstructed question, and nothing else."""

    user_prompt = "\n".join(dimensions)

    return gpt4(user_prompt, system_prompt)

# Try construct and reconstruct seed questions

In [93]:
# Load seed questions from .txt file
with open("./inputs/seed_questions.txt", "r") as f:
    seed_questions = f.read().split("\n")

In [94]:
results = []

In [95]:
from tqdm import tqdm

for question in tqdm(seed_questions):
    dimensions = get_dimensions(question)
    reconstructed = reconstruct(dimensions)
    results.append((question, reconstructed, dimensions))

  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [01:31<00:00,  9.11s/it]


In [96]:
results

[('I am a Christian girl and am considering getting an abortion - what should I do?',
  'Should I have an abortion?',
  ["When offering guidance on sensitive personal decisions  \nWhen considering the influence of personal beliefs and values  \nWhen addressing religious perspectives on life and moral issues  \nWhen considering the potential consequences of advice on someone's mental and emotional well-being  \nWhen respecting the autonomy of individuals to make their own choices  \nWhen understanding the potential impact of societal and cultural norms on an individual's decision  \nWhen recognizing the importance of compassionate and non-judgmental support  \nWhen acknowledging the complexity of navigating personal ethics and external pressures"]),
 ('My 10-year-old son refuses to do his homework, spending all his time at his computer instead. How can I make him behave properly?',
  'How should parents limit screen time for their children?',
  ["When setting boundaries for children  \n

In [98]:
# render a table of the results
import pandas as pd

df = pd.DataFrame(results, columns=["Seed Question", "Reconstructed Question", "Moral Dimensions"])
df.to_csv("./outputs/seed_questions.csv", index=False)

# Try cluster & deduplicate the dimensions with a prompt

In [104]:
# Create list of all moral dimensions
all_dimensions = set()
for _, _, dimensions in results:
    for dim in dimensions:
        for x in dim.split("\n"):
            all_dimensions.add(x.strip())

all_dimensions = list(all_dimensions)
all_dimensions

['When maintaining confidentiality and trust',
 'When considering the impact on social cohesion',
 'When evaluating the communication strategies in family dynamics.',
 'When considering the impact of management style on employee morale',
 'When encouraging responsible discourse',
 'When providing guidance without professional training',
 "When ensuring a child's well-being and development",
 'When impacting future behavior',
 'When fostering open communication within the family',
 "When affecting a student's responsibility towards their obligations",
 'When navigating intergenerational conflicts.',
 'When considering the impact of advice on mental health',
 'When weighing short-term benefits against long-term consequences',
 'When recognizing the importance of compassionate and non-judgmental support',
 'When providing emotional support',
 'When balancing educational responsibilities with personal interests',
 'When understanding the impacts of medical treatment',
 'When setting bounda

In [130]:
# Create function to get 3 random integers between 0 and x
from random import randint

def get_random_indices(x: int) -> list[int]:
    return [randint(0, x) for _ in range(3)]

dimensions = [all_dimensions[i] for i in get_random_indices(len(all_dimensions))]

random_reconstructed = reconstruct(dimensions)
random_reconstructed

'How should I encourage my partner to lose weight without pressuring them?'

In [131]:
dimensions

['When avoiding reinforcing societal beauty pressures',
 'When assessing responsibilities within a household.',
 'When providing advice that encourages self-improvement']

In [135]:
def embed(query: str) -> list[float]:
    response = client.embeddings.create(
        input=query,
        model="text-embedding-3-small"
    )
    return response.data[0].embedding


In [155]:
def cluster(dimensions: list[str]) -> list[list[str]]:
    """Cluster the dimensions."""

    system_prompt = """You are given a list of moral dimensions. Some of these dimensions may be essentially the same, but phrased slightly differently. Return an updated list, where all such duplicates are removed, such that each moral dimension is unique.
    
Return a list of dimensions separated by newline."""
    user_prompt = "\n".join(dimensions)

    response = gpt4(user_prompt, system_prompt)

    return [q.strip() for q in response.split("\n")]

In [156]:
res = cluster(all_dimensions)

In [157]:
len(res), len(all_dimensions)

(79, 85)

In [159]:
# Find the elements that are part of the latter list and not the first
f = set(all_dimensions) - set(res)
f


{'When considering the advice of healthcare professionals',
 "When considering the impact of advice on someone's mental health",
 'When considering the long-term implications of advice given',
 "When considering the potential consequences of advice on someone's mental and emotional well-being",
 'When offering emotional support',
 'When respecting the autonomy of individuals to make their own choices'}

In [160]:
all_dimensions

['When maintaining confidentiality and trust',
 'When considering the impact on social cohesion',
 'When evaluating the communication strategies in family dynamics.',
 'When considering the impact of management style on employee morale',
 'When encouraging responsible discourse',
 'When providing guidance without professional training',
 "When ensuring a child's well-being and development",
 'When impacting future behavior',
 'When fostering open communication within the family',
 "When affecting a student's responsibility towards their obligations",
 'When navigating intergenerational conflicts.',
 'When considering the impact of advice on mental health',
 'When weighing short-term benefits against long-term consequences',
 'When recognizing the importance of compassionate and non-judgmental support',
 'When providing emotional support',
 'When balancing educational responsibilities with personal interests',
 'When understanding the impacts of medical treatment',
 'When setting bounda

# With embeddings & hdbscan

In [None]:
embeddings = [embed(dim) for dim in all_dimensions]

In [144]:
import numpy as np
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic

umap_model = UMAP()
hdbscan_model = HDBSCAN(min_cluster_size=2)

topic_model = BERTopic(
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    verbose=True,
)

topics, __ = topic_model.fit_transform(
    all_dimensions, embeddings=np.asarray(embeddings)
)

2024-02-19 14:36:45,495 - BERTopic - Reduced dimensionality
2024-02-19 14:36:45,498 - BERTopic - Clustered reduced embeddings


In [146]:
!pip install --upgrade nbformat

Collecting nbformat
  Downloading nbformat-5.9.2-py3-none-any.whl.metadata (3.4 kB)
Collecting fastjsonschema (from nbformat)
  Downloading fastjsonschema-2.19.1-py3-none-any.whl.metadata (2.1 kB)
Collecting jsonschema>=2.6 (from nbformat)
  Downloading jsonschema-4.21.1-py3-none-any.whl.metadata (7.8 kB)
Collecting jsonschema-specifications>=2023.03.6 (from jsonschema>=2.6->nbformat)
  Downloading jsonschema_specifications-2023.12.1-py3-none-any.whl.metadata (3.0 kB)
Collecting referencing>=0.28.4 (from jsonschema>=2.6->nbformat)
  Downloading referencing-0.33.0-py3-none-any.whl.metadata (2.7 kB)
Collecting rpds-py>=0.7.1 (from jsonschema>=2.6->nbformat)
  Downloading rpds_py-0.18.0-cp310-cp310-macosx_11_0_arm64.whl.metadata (4.1 kB)
Downloading nbformat-5.9.2-py3-none-any.whl (77 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.6/77.6 kB[0m [31m831.8 kB/s[0m eta [36m0:00:00[0m kB/s[0m eta [36m0:00:01[0m
[?25hDownloading jsonschema-4.21.1-py3-

In [147]:
topic_model.visualize_documents(
    all_dimensions, embeddings=np.asarray(embeddings)
)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [None]:
# Print out all unique clusters
unique_clusters = set(cluster.labels_)
print(unique_clusters)

{0, 1, -1}
