## imports

In [None]:
import pandas as pd
import json
import re

In [None]:
# for file parsing

from bs4 import BeautifulSoup
import os
import glob

## install llm and .env

In [None]:
pip install anthropic

Collecting anthropic
  Downloading anthropic-0.57.1-py3-none-any.whl.metadata (27 kB)
Downloading anthropic-0.57.1-py3-none-any.whl (292 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/292.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m292.8/292.8 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: anthropic
Successfully installed anthropic-0.57.1


In [None]:
!pip install python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 kB)
Downloading python_dotenv-1.1.1-py3-none-any.whl (20 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.1.1


In [None]:
import anthropic
import json
import time

In [None]:
from dotenv import load_dotenv

In [None]:
# Load environment variables from a local .env file (presumably including the
# Anthropic API key -- the file itself is not part of this notebook).
load_dotenv()

# Shared API client used by every batch call further down. It has to be created
# after load_dotenv(): Anthropic() reads its key from the ANTHROPIC_API_KEY
# environment variable at construction time.
client = anthropic.Anthropic()

True

## file parsing

In [None]:
def process_thread(xml_content: str):
    """Extract the original post and the comment texts from one thread's XML.

    Parameters
    ----------
    xml_content : str
        Raw XML of a single thread. It must contain a <submission> element with
        an <original_post> child, and every <comment> element must contain a
        <text> element.

    Returns
    -------
    tuple
        ``(original_post, comment_texts)``: the whitespace-stripped text of the
        original post, and a list with the stripped text of every <comment>
        element found anywhere in the document, in document order.

    Raises
    ------
    ValueError
        If <submission>, <original_post>, or the <text> of any comment is
        missing. (Previously this surfaced as an opaque AttributeError on None.)
    """
    soup = BeautifulSoup(xml_content, features="xml")

    submission = soup.find('submission')
    if submission is None:
        raise ValueError("thread XML has no <submission> element")

    post_node = submission.find('original_post')
    if post_node is None:
        raise ValueError("<submission> has no <original_post> element")
    original_post = post_node.text.strip()

    comment_texts = []
    for position, comment in enumerate(soup.find_all('comment'), 1):
        text_node = comment.find('text')
        if text_node is None:
            # Fail loudly: silently dropping the comment would shift the idx
            # of every later comment in the prompt.
            raise ValueError(f"comment #{position} has no <text> element")
        comment_texts.append(text_node.text.strip())

    return original_post, comment_texts

## Building requests for batch processing (i.e. creating a set of API calls to submit at once to Anthropic's Message Batches API)

I am pre-processing XML files to submit them as a string of the kind:

```
<original_post>...</original_post>\n\n
<comment idx=1>...</comment>\n
<comment idx=2>...</comment>\n
...
<comment idx=5>...</comment>\n
```

This hides unnecessary noise from the LLM (e.g. delta information or user names) so that it does not interfere with the results, and it improves efficiency (because there are fewer tokens to process).

I will be requesting an evaluation of the first 5 comments in each thread. That is because 5 is the lowest number of comments in any thread in the sample. Going higher would force N/A values into the resulting dataset and create further complications. We hope to see that classification accuracy does not improve beyond the first 2 comments, hence 5 should be enough to test this hypothesis.

In [None]:
from anthropic.types.message_create_params import MessageCreateParamsNonStreaming
from anthropic.types.messages.batch_create_params import Request

In [None]:
def prepare_batch_line(original_post, comments, instructions, i,
                       max_comments=5, model="claude-opus-4-20250514", max_tokens=1024):
    """Build one Message Batches request for a single thread.

    The user message is the original post wrapped in <original_post> tags,
    followed by the first ``max_comments`` comments, each wrapped in a
    <comment idx=N> tag (N starts at 1). Texts are inserted verbatim; they are
    not XML-escaped.

    Parameters
    ----------
    original_post : str
        Text of the thread's original post.
    comments : list of str
        Comment texts in thread order.
    instructions : str
        System prompt sent with the request.
    i : int
        Sequence number used to build the request's ``custom_id``
        (``"request-<i>"``).
    max_comments : int or None, optional
        How many leading comments to include (default 5). ``None`` includes
        every comment.
    model : str, optional
        Model identifier placed in the request parameters.
    max_tokens : int, optional
        Upper bound on generated tokens for the response.

    Returns
    -------
    Request
        A batch request with ``custom_id`` and ``params`` filled in;
        ``temperature`` is fixed at 0.
    """
    parts = [f"<original_post>{original_post}</original_post>\n\n"]
    parts.extend(
        f"<comment idx={idx}>{comment}</comment>\n"
        for idx, comment in enumerate(comments[:max_comments], 1)
    )
    conversation_text = "".join(parts)

    return Request(
        custom_id=f"request-{i}",
        params=MessageCreateParamsNonStreaming(
            model=model,
            max_tokens=max_tokens,
            system=instructions,
            messages=[{
                "role": "user",
                "content": conversation_text
            }],
            temperature=0
        )
    )


In [None]:
# System prompt sent with every batch request (it is passed to prepare_batch_line
# as `instructions`). It asks for three scores -- empathy, emotional tone and
# collaborative language, each on a -10..+10 scale -- for the original post and
# for each comment, returned as JSON only.
# NOTE(review): the "negative" / "none" placeholder values requested for deleted
# and moderator texts are not numeric -- confirm the downstream parsing tolerates them.
# NOTE(review): the JSON example stops at comment_2 and contains a fractional
# score (0.5) -- confirm both are intended.
instructions = """You are a linguist that has to annotate this Reddit Change My View thread, which you will receive as a string containing <original_post> and <comment> tags.

ANALYSIS STEPS:

For the original post and each comment:
- If the text is [deleted], skip it and assign "negative" values.
- If it is a moderator comment, skip it and assign "none" values.
- If you have already evaluated the <original_post> text, retain the scores previously assigned.
- Otherwise:

  1. Assign an empathy score to the text using the following scale:
    — +10: Extremely empathic (deep understanding, compassionate, emotionally attuned to others)
    — 0: Neutral (fact-based, emotionally neutral, or logically focused)
    — -10: Actively anti-empathic (mocking, dismissive, hostile to another's emotional position)

    Score each text based on:
    — Acknowledgment of Others' Perspective: Does the writer recognize and engage with another's emotional state or position?
      • Example: "I see how hard that must be for you…" → High empathy
    — Tone of Language: Is the language supportive, hostile, sarcastic, curious, indifferent?
      • Example: "You're just whining" → Low empathy
      • Example: "I understand why someone might feel that way" → Higher empathy
    — Constructiveness: Is the speaker trying to help, support, or understand someone, or merely attack or assert?
      • Constructive disagreement with kind framing can still earn a moderate empathy score

  2. Assign an emotional tone score from +10 to -10:
    — +10: Extremely positive emotional content (joy, enthusiasm, affection, relief)
    — 0: Neutral / factual / dry tone with little or no visible emotional affect
    — -10: Extremely negative emotional content (anger, despair, outrage, fear)

    Score based on:
    — Emotional valence: Are the words charged with positive or negative affect?
      • Example: "I'm hopeless about humanity" → Strongly negative
    — Arousal level: Is the language emotionally intense or flat?
      • Example: "He's a liar and a disgrace" → High negative arousal
    — Emotive language: Use of affective words, exaggeration, exclamations, sarcasm, caps
      • Example: "They're amazing!" → High positive
    — Mood context: Overall emotional background implied by the message
      • Example: Calmly explaining a tragic event → Moderate negative

  3. Assign a collaborative language score using the following scale:
    — +10: Extremely collaborative (invites dialogue, respectful, acknowledges other views)
    — 0: Neutral, factual, neither collaborative nor conflictual
    — -10: Extremely conflictual (hostile, dismissive, aggressive, adversarial)

    Score based on:
    — Tone: Collaborative: Respectful, polite, constructive. Non-collaborative: Hostile, mocking, sarcastic.
    — Framing: Collaborative: Builds on the other's point, uses "we", asks questions. Non-collaborative: Dismisses, undermines, uses "you always", "you clearly".
    — Purpose: Collaborative: Seeks understanding or resolution. Non-collaborative: Seeks to win, humiliate, or shut down.
    — Linguistic markers: Collaborative: Phrases like "I see your point," "perhaps," "what do you think?". Non-collaborative: Phrases like "that's stupid," "no one believes that," "you're wrong".

Be maximally consistent, literal, and conservative.

CRITICAL: Respond with ONLY valid JSON in this exact format:

{
  "original_post": {
    "empathy": 5,
    "tone": 7,
    "collaborative_language": 3
  },
  "comment_1": {
    "empathy": 0.5,
    "tone": -2,
    "collaborative_language": 2
  },
  "comment_2": {
    "empathy": -3,
    "tone": 1,
    "collaborative_language": 3
  }
}
"""


In [None]:
# Turn every thread file in the sample directory into one batch request, and
# remember which file each request id stands for.
# NOTE(review): glob yields paths in arbitrary (filesystem) order, so the
# request-<n> numbering is only meaningful together with the xml_dict built here.
input_dir = "full_pipeline_97/100_sample/"
xml_files = glob.glob(os.path.join(input_dir, "*.xml"))
xml_dict = {}      # custom_id -> XML file name
batch_lines = []   # one Request per thread file

for request_no, xml_path in enumerate(xml_files, 1):
    with open(xml_path, 'r', encoding='utf-8') as handle:
        op, comms = process_thread(handle.read())
    request = prepare_batch_line(op, comms, instructions, request_no)
    xml_dict[f"request-{request_no}"] = os.path.basename(xml_path)
    batch_lines.append(request)


In [None]:
# Preview only the first request: every request repeats the full system prompt,
# so displaying the whole list bloats the notebook without adding information.
batch_lines[:1]

[{'custom_id': 'request-1',
  'params': {'model': 'claude-opus-4-20250514',
   'max_tokens': 1024,
   'system': 'You are a linguist that has to annotate this Reddit Change My View thread, which you will receive as a string containing <original_post> and <comment> tags.\n\nANALYSIS STEPS:\n\nFor the original post and each comment:\n- If the text is [deleted], skip it and assign "negative" values.\n- If it is a moderator comment, skip it and assign "none" values.\n- If you have already evaluated the <original_post> text, retain the scores previously assigned.\n- Otherwise:\n\n  1. Assign an empathy score to the text using the following scale:\n    — +10: Extremely empathic (deep understanding, compassionate, emotionally attuned to others)\n    — 0: Neutral (fact-based, emotionally neutral, or logically focused)\n    — -10: Actively anti-empathic (mocking, dismissive, hostile to another\'s emotional position)\n\n    Score each text based on:\n    — Acknowledgment of Others\' Perspective: 

In [None]:
# Shallow copy of the prepared requests; this is the list handed to the batch API.
requests = list(batch_lines)

In [None]:
# Submit all prepared requests in a single Message Batches call and show the
# object the API returns.
message_batch = client.messages.batches.create(requests=requests)
print(message_batch)

In [None]:
# Keep the batch id: the retrieve and results calls below look the batch up by it.
mbatch_id = message_batch.id

In [None]:
# Re-fetch the batch by id and print it (presumably to check whether processing
# has finished before the results are read in the next section).
message_batch = client.messages.batches.retrieve(
    mbatch_id,
)

print(message_batch)

# processing batch output (JSONL -> df)

In [None]:
import json

# One flat dict per successfully parsed thread; becomes one DataFrame row each.
records = []

for result in client.messages.batches.results(mbatch_id):
    custom_id = result.custom_id

    # Only "succeeded" results carry a message. Errored, canceled and expired
    # entries do not, so report and skip them instead of crashing on .message.
    if result.result.type != "succeeded":
        print(f"Skipping {custom_id}: result type is {result.result.type}")
        continue

    # Concatenate the text blocks of the reply; non-text blocks are ignored.
    text_blocks = result.result.message.content
    content_str = "".join(block.text for block in text_blocks if block.type == "text")

    try:
        content_json = json.loads(content_str)
    except json.JSONDecodeError:
        print(f"Failed to parse content for {custom_id}: {content_str}")
        continue

    # The prompt asks for a JSON object keyed by post/comment; anything else
    # (a list, a bare string, ...) cannot be flattened.
    if not isinstance(content_json, dict):
        print(f"Unexpected JSON shape for {custom_id}: {content_str}")
        continue

    # Flatten the nested JSON: {"comment_1": {"tone": 2}} -> {"comment_1_tone": 2}
    flattened = {'custom_id': custom_id}
    for key, metrics in content_json.items():
        if not isinstance(metrics, dict):
            # e.g. a bare placeholder string instead of a metrics object
            print(f"Skipping non-dict entry {key!r} for {custom_id}: {metrics!r}")
            continue
        for metric, value in metrics.items():
            flattened[f"{key}_{metric}"] = value

    records.append(flattened)


In [None]:
# One row per parsed thread; the columns are the flattened "<key>_<metric>" names.
df = pd.DataFrame(records)

In [None]:
# Shorten the column names step by step:
#   original_post_* -> op_*, comment_<n>_* -> c<n>_*, *collaborative_language -> *collab
short_names = df.columns.str.replace(r'^original_post', 'op', regex=True)
short_names = short_names.str.replace(r'^comment_(\d+)', r'c\1', regex=True)
short_names = short_names.str.replace(r'collaborative_language', 'collab', regex=True)
df.columns = short_names

In [None]:
# Plain-text preview of the first rows after the column renaming.
print(df.head())

   custom_id  op_empathy  op_tone  op_collab  c1_empathy  c1_tone  c1_collab  \
0  request-1          -2       -3         -4           0        0          3   
1  request-2          -2       -3         -4           4        0          5   
2  request-3           2        3          4           0        0          3   
3  request-4           0       -6          2           4        6          7   
4  request-5           0       -3          2           0        0          3   

   c2_empathy  c2_tone  c2_collab  c3_empathy  c3_tone  c3_collab  c4_empathy  \
0           0        0          0           2        1          4           0   
1          -4       -4         -5          -3       -2         -3           3   
2           0        1          2          -2       -3         -4           1   
3           2        4          5           3       -2          6           2   
4           0       -2          1           0        0          2           0   

   c4_tone  c4_collab  c5_empath

In [None]:
import os

# Rebuild the custom_id -> XML file name lookup by enumerating the sample directory.
# NOTE(review): this path ("full_pipeline/...") differs from the directory the batch
# requests were built from ("full_pipeline_97/...") -- confirm both hold the same
# files in the same glob order, otherwise the file names end up on the wrong rows.
input_dir = "full_pipeline/100_sample/"
xml_files = glob.glob(os.path.join(input_dir, "*.xml"))
xml_dict = {
    f'request-{position}': os.path.basename(xml_path)
    for position, xml_path in enumerate(xml_files, 1)
}


In [None]:
# Show the full custom_id -> file name mapping for a visual check.
print(xml_dict)

{'request-1': '1062071645.0_1_delta_threads.xml', 'request-2': '1075040167.0_1_delta_threads.xml', 'request-3': '115882088.0_1_delta_threads.xml', 'request-4': '113567594.0_1_delta_threads.xml', 'request-5': '1082495263.0_2_delta_threads.xml', 'request-6': '1102614149.0_2_delta_threads.xml', 'request-7': '1409948101.0_2_delta_threads.xml', 'request-8': '134214340.0_1_delta_threads.xml', 'request-9': '1378810771.0_1_delta_threads.xml', 'request-10': '1437482501.0_1_delta_threads.xml', 'request-11': '144404372.0_3_delta_threads.xml', 'request-12': '154839924.0_3_delta_threads.xml', 'request-13': '154839924.0_4_delta_threads.xml', 'request-14': '1719432989.0_3_delta_threads.xml', 'request-15': '160170780.0_1_delta_threads.xml', 'request-16': '1719432989.0_1_delta_threads.xml', 'request-17': '1821161756.0_2_delta_threads.xml', 'request-18': '1821161756.0_1_delta_threads.xml', 'request-19': '181959121.0_2_delta_threads.xml', 'request-20': '2030191996.0_1_delta_threads.xml', 'request-21': '1

In [None]:
# Attach the source file name to each row; custom_ids missing from xml_dict become NaN.
df['filename'] = df['custom_id'].map(xml_dict)

In [None]:
# Preview the first rows, now including the filename column.
df.head()

Unnamed: 0,custom_id,op_empathy,op_tone,op_collab,c1_empathy,c1_tone,c1_collab,c2_empathy,c2_tone,c2_collab,c3_empathy,c3_tone,c3_collab,c4_empathy,c4_tone,c4_collab,c5_empathy,c5_tone,c5_collab,filename
0,request-1,-2,-3,-4,0,0,4,0,0,0,2,1,5,0,0,0,0,0,2,1062071645.0_1_delta_threads.xml
1,request-2,-2,-3,-4,3,0,4,-4,-3,-5,-3,-2,-3,2,1,6,1,2,3,1075040167.0_1_delta_threads.xml
2,request-3,3,2,4,0,0,2,1,0,3,-2,-3,-4,2,-1,3,4,-1,6,115882088.0_1_delta_threads.xml
3,request-4,0,-7,2,3,6,7,2,4,6,2,-3,5,1,3,4,0,-2,3,113567594.0_1_delta_threads.xml
4,request-5,0,-6,2,0,-1,3,1,-4,4,0,-2,1,0,-5,2,-1,-2,-2,1082495263.0_2_delta_threads.xml


In [None]:
# Save this run's scores; index=False keeps the row index out of the file.
df.to_csv('claude_batch_output.csv', index=False)

# processing batch outputs (CSV -> df) + combining the two runs

In [None]:
# Load the exported scores of two separate runs.
# NOTE(review): no cell in this notebook writes these two file names (the export
# above saves 'claude_batch_output.csv'), so presumably each run's export was
# renamed by hand -- confirm.
df1 = pd.read_csv('claude_batch_output_1.csv')
df2 = pd.read_csv('claude_batch_output_2.csv')

In [None]:
# Pair the two runs row by row on the request id. Columns that exist in both
# files get a _run1 / _run2 suffix; only custom_ids present in both runs are kept.
df_merged = df1.merge(df2, on="custom_id", suffixes=("_run1", "_run2"))

# custom_id is only a per-run sequence number, so check that both runs attached
# it to the same thread file before any scores are compared.
if {"filename_run1", "filename_run2"}.issubset(df_merged.columns):
    filename_mismatch = (
        df_merged["filename_run1"].fillna("") != df_merged["filename_run2"].fillna("")
    )
    if filename_mismatch.any():
        print(
            f"WARNING: {int(filename_mismatch.sum())} custom_id(s) point to "
            "different files in run 1 and run 2; their score differences are not comparable"
        )

In [None]:
metrics = ["empathy", "tone", "collab"]
categories = ["op", "c1", "c2", "c3", "c4", "c5"]

# For every (post/comment, metric) pair that was scored in both runs, coerce the
# two score columns to numbers (unparseable values become NaN) and store the
# run1 - run2 difference in a "<category>_<metric>_diff" column.
score_pairs = [(category, metric) for category in categories for metric in metrics]

for category, metric in score_pairs:
    first_run = f"{category}_{metric}_run1"
    second_run = f"{category}_{metric}_run2"

    if first_run not in df_merged.columns or second_run not in df_merged.columns:
        continue

    df_merged[first_run] = pd.to_numeric(df_merged[first_run], errors="coerce")
    df_merged[second_run] = pd.to_numeric(df_merged[second_run], errors="coerce")
    df_merged[f"{category}_{metric}_diff"] = df_merged[first_run] - df_merged[second_run]

In [None]:
metrics = ["empathy", "tone", "collab"]
categories = ["op", "c1", "c2", "c3", "c4", "c5"]
suffixes = ["run1", "run2", "diff"]

existing_columns = set(df_merged.columns)

# Metric-major layout: all empathy columns first (op, c1..c5, each as
# run1 / run2 / diff), then tone, then collab. Names that are absent are skipped.
ordered_columns = [
    f"{category}_{metric}_{suffix}"
    for metric in metrics
    for category in categories
    for suffix in suffixes
    if f"{category}_{metric}_{suffix}" in existing_columns
]

if ordered_columns:
    # Identifier columns go last, and only if they exist: selecting a missing
    # name would raise a KeyError.
    ordered_columns += [
        name for name in ("custom_id", "filename_run1") if name in existing_columns
    ]
    df_merged = df_merged[ordered_columns]
else:
    # Checked before the identifier columns are appended, so this branch is
    # actually reachable when none of the expected score columns exist.
    print("No score columns found in df_merged; column order left unchanged")


In [None]:
# Save the side-by-side comparison of the two runs; index=False keeps the row index out of the file.
df_merged.to_csv('claude_2runs.csv', index=False)