In [1]:
from utils import *

messages = pd.read_csv("data/messages.csv", index_col=0)

# Data labeling

## Input preparation

In [19]:
copy = messages[(messages.section=='follow_up')][['id', 'role', 'content', 'chatID']].copy(deep=True)
copy.rename(columns={"content":"answer", "id":"message_id",'chatID':'conversation_id'}, inplace=True)
copy['question'] = ''
for i, r in copy.iterrows():
    if (r.role=='user'):
        copy.loc[i, 'question'] = previous.answer
    previous = r
copy = copy[copy.role=='user'][['message_id','question', 'answer', 'conversation_id']]

### Random shuffle into 50 files

In [21]:
chatIDs = copy['conversation_id'].unique()
np.random.shuffle(chatIDs)
splitIDs = np.array_split(chatIDs, 50)
inputTexts = {i: copy[copy['conversation_id'].isin(group_chat_ids)].reset_index(drop=True).to_csv(index=False) for i, group_chat_ids in enumerate(splitIDs)}

In [None]:
### shuffle without chatIDs grouping
# shuffledIndices = np.random.permutation(copy.index)
# splitIndices = np.array_split(shuffledIndices, 50)
# inputTexts = {i: copy.loc[idx].reset_index(drop=True).to_csv(index=False) for i, idx in enumerate(splitIndices)}

## Automatic labeling (GPT)

In [None]:
# API is an ENV variable
client = OpenAI(api_key=API)
outputTexts = {}

In [None]:
prompt = """Act in the role of a user researcher analyzing conversations between an AI assistant and human participants. Conversations, identified by conversation ID, comprise chronologically ordered questions asked by an AI assistant and answers by the participant, each identified by a message ID. The conversations are from different participants and cover various topics. Assign these measures to each user answer:
  1) specificity: how specific is the information given in the answer, on a scale from 0 to 2 (0 = contains general descriptions, 1 = contains specific concepts, 2 = contains specific concepts with detailed examples)
  2) relevance: how relevant is the answer to the question being asked, rate on a scale from 0 to 2 (0 = irrelevant, 1 = partially relevant, 2 = highly relevant)
  3) clarity: how clear is the participant’s answer, rate on a scale from 0 to 2 (0 = illegible, 1 = incomplete or partially legible, 2 = clear and well-articulated). Consider semantic rather than syntactic clarity. Typos or joined words (e.g., "dontknow") don’t lower the score if the response is otherwise clear.
  4) self-disclosure: count of unique personal attributes, topics, concepts, or ideas mentioned by the participant in their answer, such as previous experiences, feelings, hobbies or other personal information (0 or more)
  5) sentiment: how positive/negative is a participant's answer (1 = positive, -1 = negative, or 0 = neutral) Does the answer express positive/negative attitude toward its subject, or does it describe it neutrally? Do not make assumptions if the sentiment is not sufficiently explicit.

Output a CSV file with six columns: "message_id" (copied from input), and the five assigned measures. Exclude the original conversation_id, question and answer columns. Ensure valid CSV formatting.
"""

In [None]:
for id in np.sort(list(inputTexts.keys())):
    response = client.responses.create(
        model="gpt-4.1",
        input=[
            {
            "role": "system",
            "content": [
                {
                "type": "input_text",
                "text": prompt
                }
            ]
            },
            {
            "role": "user",
            "content": [
                {
                "type": "input_text",
                "text": inputTexts[id]
                }
            ]
            }
        ],
        text={
            "format": {
            "type": "text"
            }
        },
        reasoning={},
        tools=[],
        temperature=0.2,
        max_output_tokens=2048,
        top_p=1,
        store=True,
        stream=False
    )
    outputTexts[id] = response.output_text
    print(id)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49


## Ouput preparation

### Trim unneccessary data

In [None]:
for i in outputTexts:
    if outputTexts[i][0:6] == '```csv':
        outputTexts[i] = outputTexts[i][7:-4]

### Create dataframes from text strings

In [None]:
outputDataframes = {}

for id in outputTexts:
    content = outputTexts[id]
    csv_file = io.StringIO(content)
    outputDataframes[int(id)] = pd.read_csv(csv_file)

In [None]:
### backup
# temp = {}
# for id in outputTexts:
#     temp[int(id)] = outputTexts[id]
# with open('gpt.json', 'w') as f:
#     json.dump(temp, f, indent=4)

### Merge all results and export

In [None]:
total = pd.DataFrame(columns=outputDataframes[8].columns)
for id in outputDataframes:
    total = pd.concat([total, outputDataframes[id]], axis=0)

In [None]:
messages = pd.read_csv("data/messages.csv", index_col=0)
messages = messages.merge(total, left_on='id', right_on='message_id', how='left')
messages = messages.drop('message_id', axis=1)

In [None]:
messages.to_csv("data/messages-auto-labeled.csv")

## Check differences

In [25]:
messagesAL = pd.read_csv("data/messages-auto-labeled.csv", index_col=0)
messagesL = pd.read_csv("data/messages-labeled.csv", index_col=0)

In [26]:
cols = ['specificity', 'relevance', 'clarity', 'self-disclosure', 'sentiment']
diff_df = messagesL[(messagesL.role=='user') & (messagesL.section=='follow_up')][cols].reset_index(drop=True) - messagesAL[(messagesAL.role=='user') & (messagesAL.section=='follow_up')][cols].reset_index(drop=True)

In [27]:
(diff_df != 0).sum().sum() / diff_df.size * 100

np.float64(3.6467991169977925)

In [28]:
(diff_df != 0).any(axis=1).sum() / diff_df.size * 100

np.float64(3.3289183222958054)

In [29]:
for i in cols:
    print(i)
    print(spearmanr(messagesAL[(messagesAL.role=='user') & (messagesAL.section=='follow_up')][i], messagesL[(messagesL.role=='user') & (messagesL.section=='follow_up')][i].values))

specificity
SignificanceResult(statistic=np.float64(0.9706156607774458), pvalue=np.float64(0.0))
relevance
SignificanceResult(statistic=np.float64(0.9858964393909435), pvalue=np.float64(0.0))
clarity
SignificanceResult(statistic=np.float64(0.9533903026296588), pvalue=np.float64(0.0))
self-disclosure
SignificanceResult(statistic=np.float64(0.9153510840071187), pvalue=np.float64(0.0))
sentiment
SignificanceResult(statistic=np.float64(0.9784212589690885), pvalue=np.float64(0.0))
