# Load dataset

In [1]:
from glob import glob
import os
import json 

# Despite the .json extension, the file is parsed as JSON Lines:
# one JSON document per line.
path = './data/extended_esc_llama31_70b_v2.json'

data = []
# Pin the encoding: JSON text is UTF-8, and relying on the platform's locale
# default can mis-decode non-ASCII characters on some systems.
with open(path, 'r', encoding='utf-8') as f:
    for line in f:
        # Skip blank lines (e.g. a stray empty line at the end of the file)
        # rather than letting json.loads raise on them.
        if not line.strip():
            continue
        data.append(json.loads(line))

print(len(data))

7910


In [2]:
# Number of turns in each example's dialog.
lens = [len(example['dialog']) for example in data]

# Length of the shortest dialog in the dataset.
min_dialog_turns = min(lens)

print(min_dialog_turns)
    

5


# Process conversations and add an id to them

In [3]:
from uuid import uuid4

# Maps a conversation key to a generated UUID string. The key is the first
# `min_dialog_turns` turns of the dialog joined with '_', so examples that
# share those opening turns receive the same id.
conv_ids = {}

for ex in data:
    conv_str = '_'.join(ex['dialog'][:min_dialog_turns])
    try:
        # Key already seen: reuse the id assigned earlier.
        ex['id'] = conv_ids[conv_str]
    except KeyError:
        # First occurrence of this key: mint a new id and remember it.
        ex['id'] = conv_ids[conv_str] = str(uuid4())

In [4]:
# Every distinct key in conv_ids corresponds to one conversation.
num_conversations = len(conv_ids)
print("total conversations:", num_conversations)

total conversations: 1297


# Split by conversation into training, validation, and test sets

In [5]:
import numpy as np

# Fixed seed so the shuffle (and therefore the train/val/test assignment) is
# reproducible across runs. The UUID strings themselves change per run, but
# conv_ids preserves insertion order, so the same conversations end up in the
# same split as long as `data` is loaded in the same order.
SPLIT_SEED = 42
rng = np.random.default_rng(SPLIT_SEED)

all_ids = list(conv_ids.values())
rng.shuffle(all_ids)

# Split sizes are counted in conversations, not in examples.
val_size = 100
test_size = 100

test_ids = all_ids[:test_size]
val_ids = all_ids[test_size:test_size+val_size]
train_ids = all_ids[test_size+val_size:]

# Membership is checked once per example; sets make each check O(1) instead
# of scanning a list of ids every time.
train_id_set = set(train_ids)
val_id_set = set(val_ids)
test_id_set = set(test_ids)

train = [d for d in data if d['id'] in train_id_set]
val = [d for d in data if d['id'] in val_id_set]
test = [d for d in data if d['id'] in test_id_set]

# Every example must land in exactly one split.
assert len(train) + len(val) + len(test) == len(data), \
    "train/val/test splits do not partition the data"

print("training size: ", len(train))
print("validation size: ", len(val))
print("test size: ", len(test))

training size:  6671
validation size:  604
test size:  635


In [6]:
import json

def write_jsonl(path, data):
    """Write `data` to `path` as JSON Lines.

    Each element of `data` is serialized with `json.dumps` and written on its
    own line. The file is opened in 'w' mode, so existing content is replaced.
    """
    with open(path, 'w') as out_file:
        out_file.writelines(json.dumps(record) + "\n" for record in data)
            
            
# Persist each split to its own file; the validation split is stored as "dev".
for split_path, split_examples in (
    ('data/train.json', train),
    ('data/dev.json', val),
    ('data/test.json', test),
):
    write_jsonl(split_path, split_examples)

In [7]:
# Display the first training example to inspect the structure of a record.
train[0]

{'dialog': ['Hello\n',
  'Hello, what would you like to talk about?',
  'I am having a lot of anxiety about quitting my current job. It is too stressful but pays well\n',
  'What makes your job stressful for you?',
  'I have to deal with many people in hard financial situations and it is upsetting \n'],
 'situation': 'I hate my job but I am scared to quit and seek a new career.',
 'speakers': ['seeker', 'supporter', 'seeker', 'supporter', 'seeker'],
 'responses': {'Affirmation': 'Your compassion and empathy for those struggling financially are truly admirable, and it takes a lot of emotional strength to navigate such challenging situations on a daily basis.',
  'Clarification': "So, it's the emotional toll of interacting with clients who are struggling financially that's causing you stress, rather than just the workload itself?",
  'Collaborative Planning': "Let's brainstorm some ways you could set boundaries or self-care strategies to help you cope with the emotional demands of your j

# Get statistics of the data

In [8]:
import numpy as np
from collections import Counter


def get_statistics(ds):
    """Print summary statistics for a list of examples.

    Prints, in order: the mean, minimum and maximum of `len(x['dialog'])`
    over the examples, the number of examples, the (up to 20) most common
    keys of `x['responses']` with their counts, and the total number of
    response entries across all examples. Nothing is returned.
    """
    turn_counts = [len(example['dialog']) for example in ds]
    for label, reducer in (
        ("average dialog len: ", np.mean),
        ("min dialog len: ", np.min),
        ("max dialog len: ", np.max),
    ):
        print(label, reducer(turn_counts))

    print("number of examples: ", len(ds))

    key_counts = Counter()
    for example in ds:
        key_counts.update(example['responses'].keys())
    print(key_counts.most_common(20))
    # Each example adds one count per response key, so the counter's total
    # equals the summed number of keys over all examples.
    print(sum(key_counts.values()))
        
    

# Report statistics over the full loaded dataset.
get_statistics(data)

average dialog len:  11.765613147914033
min dialog len:  5
max dialog len:  23
number of examples:  7910
[('unconditional', 7910), ('Suggest Options', 2451), ('Affirmation', 2437), ('Clarification', 2404), ('Share Information', 2404), ('Offer Hope', 2397), ('Normalize Experiences', 2393), ('Stress Management', 2370), ('Reframe Negative Thoughts', 2368), ('Emotional Validation', 2367), ('Promote Self-Care Practices', 2340), ('Avoid Judgment and Criticism', 2337), ('Collaborative Planning', 2323), ('Provide Different Perspectives', 2323), ('Reflective Statements', 2291), ('Chit Chat', 2284)]
43399


# Prepare data in a clean JSONL format for uploading to HF

In [None]:
def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Every line of the file at `path` is parsed with `json.loads`; the parsed
    objects are returned in file order.
    """
    with open(path, 'r') as in_file:
        return [json.loads(raw_line) for raw_line in in_file]

def write_jsonl(path, data):
    """Serialize each element of `data` with `json.dumps`, one per line.

    The file at `path` is opened in 'w' mode, so any existing content is
    replaced.

    NOTE: a function with this name is also defined in an earlier cell; this
    definition shadows it once the cell runs.
    """
    with open(path, 'w') as handle:
        # map() is lazy, so records are serialized and written one at a time.
        for encoded in map(json.dumps, data):
            handle.write(encoded + "\n")
            

def clean_and_reformat(input_path, output_path):
    data = load_jsonl("data/train.json")
    
    for d in data:
        for s, r in d['responses'].items():
            if r.startswith("assistant")