# Load dataset

In [6]:
from glob import glob
import os
import json 

# The file is parsed line by line, i.e. as JSON Lines (one JSON document per
# line), even though its name ends in .json.
path = './data/extended_esc_llama31.json'

data = []
# Explicit UTF-8 so decoding does not depend on the platform's default locale.
with open(path, 'r', encoding='utf-8') as f:
    for line in f:
        # A blank line (e.g. a trailing newline at end of file) holds no record;
        # skip it instead of letting json.loads raise on an empty document.
        if line.strip():
            data.append(json.loads(line))

print(len(data))

4657


In [7]:
# Number of turns in each example's dialog; the smallest value is kept in
# min_dialog_turns and printed.
lens = [len(example['dialog']) for example in data]
min_dialog_turns = min(lens)

print(min_dialog_turns)
    

5


# Process conversations and add an id to them

In [8]:
from uuid import uuid4

# Maps a conversation key -- the first `min_dialog_turns` turns joined by '_' --
# to a freshly generated UUID string.
conv_ids = {}

for ex in data:
    conv_str = '_'.join(ex['dialog'][:min_dialog_turns])
    conv_id = conv_ids.get(conv_str)
    if conv_id is None:
        # First time this opening is seen: mint a new id for it.
        conv_id = conv_ids[conv_str] = str(uuid4())

    # Examples sharing the same opening turns end up with the same id.
    ex['id'] = conv_id

In [9]:
# One entry in conv_ids per distinct conversation key.
print(f"total conversations: {len(conv_ids)}")

total conversations: 1019


# Split by conversation into training, validation, and test sets

In [14]:
import numpy as np

# Fixed seed so the shuffle -- and therefore which conversations fall into each
# split -- is the same on every run.
SPLIT_SEED = 42
split_rng = np.random.default_rng(SPLIT_SEED)

all_ids = list(conv_ids.values())
split_rng.shuffle(all_ids)

# Split sizes are counted in conversations (ids), not in individual examples.
val_size = 50
test_size = 100

test_ids = all_ids[:test_size]
val_ids = all_ids[test_size:test_size+val_size]
train_ids = all_ids[test_size+val_size:]

# Sets give constant-time membership checks; the *_ids lists above are kept
# unchanged for any code that relies on them being lists.
train_id_set = set(train_ids)
val_id_set = set(val_ids)
test_id_set = set(test_ids)

# All examples of one conversation go to the same split.
train = [d for d in data if d['id'] in train_id_set]
val = [d for d in data if d['id'] in val_id_set]
test = [d for d in data if d['id'] in test_id_set]

# Every example must land in exactly one of the three splits.
assert len(train) + len(val) + len(test) == len(data), "splits do not cover the dataset"

print("training size: ", len(train))
print("validation size: ", len(val))
print("test size: ", len(test))

training size:  3941
validation size:  257
test size:  459


In [15]:
import json

def write_jsonl(path, data):
    """Write every item of ``data`` to ``path`` as one JSON document per line.

    The file is opened in write mode, so existing content is replaced.
    """
    with open(path, 'w') as out_file:
        out_file.writelines(json.dumps(record) + "\n" for record in data)
            
            
# Persist the three splits; the validation split is stored as dev.json.
for out_path, split in (
    ('data/train.json', train),
    ('data/dev.json', val),
    ('data/test.json', test),
):
    write_jsonl(out_path, split)

In [16]:
# Display the first example of the training split.
train[0]

{'dialog': ['Hi',
  'Hello! How are you today?\n',
  'Ok I guess I do not know how to tell my husband that I am lonely and I want out of the marriage',
  'Oh, that sure is a heavy subject and a heavy thing to be on your mind.',
  'He is not one you can talk to he usually just brushes things off',
  'So you have tried to discuss your loneliness with your husband before?',
  'Yes he tells me it’s just in my head and I could just go out with him, but he drinks then to much so I do not like going out with him',
  'I am sure that is frustrating! Does he drink at home as well or just when you go out?',
  'Just when he goes out I do not go with him I do not drink and drive  And he hangs with some obnoxious people',
  'Would a date night at home with take-out and a movie just the two of you be an option?',
  'He would go to sleep , and then he thinks I am crazy he says we are no kids anymore that need to go on a date we are married ',
  'Do you think you both might be open to talking to a marr

# Get statistics of the data

In [13]:
import numpy as np
from collections import Counter


def get_statistics(ds):
    """Print summary statistics for a list of examples.

    Printed, in order: mean / min / max number of dialog turns, the number of
    examples, the 20 most common keys of the ``responses`` dicts with their
    counts, and the total number of response entries.

    Args:
        ds: Sized collection of dicts, each with a ``'dialog'`` sequence and a
            ``'responses'`` dict.

    Returns:
        None. Results are only printed.
    """
    num_examples = len(ds)
    if num_examples == 0:
        # np.min / np.max raise ValueError on an empty sequence, so report the
        # empty dataset and stop instead of crashing.
        print("number of examples: ", num_examples)
        return

    dlens = [len(x['dialog']) for x in ds]
    print("average dialog len: ", np.mean(dlens))
    print("min dialog len: ", np.min(dlens))
    print("max dialog len: ", np.max(dlens))

    print("number of examples: ", num_examples)

    # Count how many examples contain each response key.
    scnt = Counter()
    for x in ds:
        scnt.update(x['responses'].keys())
    print(scnt.most_common(20))
    # Dict keys are unique per example, so the counter total equals the total
    # number of response entries across the dataset.
    print(sum(scnt.values()))
        
    

# Report statistics over `data`, i.e. all loaded examples rather than a single split.
get_statistics(data)

average dialog len:  11.773459308567748
min dialog len:  5
max dialog len:  23
number of examples:  4657
[('unconditional', 4657), ('Affirmation', 1464), ('Emotional Validation', 1444), ('Promote Self-Care Practices', 1438), ('Reframe Negative Thoughts', 1421), ('Avoid Judgment and Criticism', 1414), ('Offer Hope', 1402), ('Share Information', 1400), ('Normalize Experiences', 1398), ('Clarification', 1395), ('Provide Different Perspectives', 1389), ('Reflective Statements', 1387), ('Collaborative Planning', 1382), ('Suggest Options', 1379), ('Stress Management', 1341), ('Chit Chat', 1338)]
25649


# Prepare the data in a clean JSONL format for uploading to Hugging Face (HF)

In [None]:
def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        path: Path of a UTF-8 text file holding one JSON document per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit UTF-8 so decoding does not depend on the platform's default locale.
    with open(path, 'r', encoding='utf-8') as f:
        # Blank lines (such as a trailing newline) carry no record; skip them
        # rather than letting json.loads fail on an empty document.
        return [json.loads(line) for line in f if line.strip()]

def write_jsonl(path, data):
    """Serialize ``data`` to ``path`` in JSON Lines form (one item per line).

    Any existing file at ``path`` is overwritten.
    """
    with open(path, 'w') as sink:
        for item in data:
            sink.write(f"{json.dumps(item)}\n")
            

def clean_and_reformat(input_path, output_path):
    data = load_jsonl("data/train.json")
    
    for d in data:
        for s, r in d['responses'].items():
            if r.startswith("assistant")