# Load dataset

In [29]:
from glob import glob
import os
import json 

# Location of the raw dataset, relative to the notebook's working directory.
path = '../data/extended_esc_70b.json'

# Decode explicitly as UTF-8 instead of relying on the platform's locale
# encoding, so the load behaves the same on every OS.
# NOTE(review): assumes the file is UTF-8 (the standard JSON encoding) - confirm.
with open(path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Number of examples loaded.
print(len(data))

5479


In [30]:
# Number of entries in each example's dialog.
lens = list(map(lambda example: len(example['dialog']), data))

# Length of the shortest dialog in the dataset; later cells use it as the
# number of opening turns that identify a conversation.
min_dialog_turns = min(lens)

print(min_dialog_turns)
    

5


# Process conversations and add an id to them

In [31]:
from uuid import uuid4

# Maps a conversation key (its opening turns joined with '_') to a UUID string.
conv_ids = {}

for ex in data:
    # Examples sharing the same first `min_dialog_turns` dialog entries are
    # treated as belonging to the same conversation.
    conv_key = '_'.join(ex['dialog'][:min_dialog_turns])
    try:
        conv_id = conv_ids[conv_key]
    except KeyError:
        # First time this conversation is seen: mint a new id for it.
        conv_id = conv_ids[conv_key] = str(uuid4())

    # Tag the example in place with its conversation id.
    ex['id'] = conv_id

In [32]:
# One entry per distinct conversation key, so this is the number of conversations.
print("total conversations:", len(conv_ids))

total conversations: 1288


# Split across conversations for training, validation, and test

In [6]:
import numpy as np

# Conversation ids in first-seen order. Splitting on ids (not on individual
# examples) keeps every example of a conversation inside a single split.
all_ids = list(conv_ids.values())

# Shuffle with a fixed seed so the split is reproducible: the same positions
# in `all_ids` (i.e. the same conversations, given the same input order) land
# in the same split on every run, even though the uuid values themselves differ.
SPLIT_SEED = 42
split_rng = np.random.default_rng(SPLIT_SEED)
split_rng.shuffle(all_ids)

# Number of conversations (not examples) reserved for validation and test.
val_size = 50
test_size = 100

test_ids = all_ids[:test_size]
val_ids = all_ids[test_size:test_size+val_size]
train_ids = all_ids[test_size+val_size:]

# Sets give O(1) membership tests; scanning the id lists for every example
# is quadratic in practice. The list variables above are kept for any later
# cell that relies on them.
train_id_set = set(train_ids)
val_id_set = set(val_ids)
test_id_set = set(test_ids)

train = [d for d in data if d['id'] in train_id_set]
val = [d for d in data if d['id'] in val_id_set]
test = [d for d in data if d['id'] in test_id_set]

# Every example must land in exactly one split; a mismatch means `data` and
# `conv_ids` are out of sync (e.g. cells were run out of order).
assert len(train) + len(val) + len(test) == len(data), \
    "splits do not partition the dataset"

print("training size: ", len(train))
print("validation size: ", len(val))
print("test size: ", len(test))

training size:  8264
validation size:  354
test size:  736


In [43]:
import json

def write_jsonl(path, data):
    """Write `data` to `path` in JSON Lines form.

    Each item of `data` is serialised with `json.dumps` and written on its
    own line. The file is opened in write mode, so any existing content at
    `path` is replaced.

    Args:
        path: destination file path.
        data: iterable of JSON-serialisable objects.
    """
    with open(path, 'w') as out_file:
        out_file.writelines(json.dumps(record) + "\n" for record in data)
            
            
# Persist each split to disk. The files are written one JSON object per line
# (JSON Lines) even though the paths carry a `.json` extension.
# NOTE(review): the input is read from '../data/' but the splits are written
# under 'data/' - confirm this is the intended location.
for split_path, split_examples in (
    ('data/train.json', train),
    ('data/dev.json', val),
    ('data/test.json', test),
):
    write_jsonl(split_path, split_examples)

In [7]:
# Display the first training example to inspect the record structure.
train[0]

{'dialog': ['Hello\n',
  'Hello, what would you like to talk about?',
  'I am having a lot of anxiety about quitting my current job. It is too stressful but pays well\n',
  'What makes your job stressful for you?',
  'I have to deal with many people in hard financial situations and it is upsetting \n'],
 'situation': 'I hate my job but I am scared to quit and seek a new career.',
 'speakers': ['seeker', 'supporter', 'seeker', 'supporter', 'seeker'],
 'responses': {'Affirmation': 'you have shown incredible strength and resilience in facing the challenges of your current job and the anxiety it brings. your dedication to improving your situation is truly commendable, and i admire your hopefulness in tough times. it takes a lot of courage to consider quitting a well-paying job, and your willingness to take that risk is a testament to your determination to find a career that brings you fulfillment and happiness.',
  'Clarification': "i understand that you're feeling anxious about your job a

# Get statistics of the data

In [33]:
import numpy as np
from collections import Counter


def get_statistics(ds):
    """Print summary statistics for a collection of dialog examples.

    Prints, in order: the mean, minimum and maximum dialog length, the
    number of examples, the 20 most common response keys with their counts,
    and the total number of responses across all examples.

    Args:
        ds: sized, re-iterable collection of examples. Each example must
            support ``x['dialog']`` (something with a length) and
            ``x['responses']`` (a mapping).

    Returns:
        None. All results are printed.
    """
    dlens = [len(x['dialog']) for x in ds]
    # np.min / np.max raise on an empty sequence (and np.mean yields nan with
    # a warning), so the length statistics are skipped for an empty dataset.
    if dlens:
        print("average dialog len: ", np.mean(dlens))
        print("min dialog len: ", np.min(dlens))
        print("max dialog len: ", np.max(dlens))

    print("number of examples: ", len(ds))

    # Count how many examples carry each response key.
    scnt = Counter()
    for x in ds:
        scnt.update(x['responses'].keys())
    print(scnt.most_common(20))
    # Keys are unique within an example, so the counter total equals the sum
    # of per-example response counts.
    print(sum(scnt.values()))
        
    

# Report statistics for the full loaded dataset.
get_statistics(data)

average dialog len:  11.01058587333455
min dialog len:  5
max dialog len:  19
number of examples:  5479
[('Reflective Statements', 1718), ('Promote Self-Care Practices', 1710), ('Collaborative Planning', 1688), ('Stress Management', 1669), ('Share Information', 1662), ('Affirmation', 1656), ('Offer Hope', 1651), ('Normalize Experiences', 1643), ('Clarification', 1641), ('Avoid Judgment and Criticism', 1635), ('Provide Different Perspectives', 1623), ('Reframe Negative Thoughts', 1623), ('Emotional Validation', 1619), ('Suggest Options', 1612), ('Chit Chat', 1610)]
24760
