In [1]:
import json
import os
import warnings

import torch
from datasets import Dataset
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, PeftConfig, PeftModel, prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, GenerationConfig
from transformers import TextStreamer
from trl import SFTTrainer
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


### Motivational Interviewing (MI) dataset

In [3]:
# Identifier of the conversation dataset that is passed to load_dataset
dataset_name = "to-be/annomi-motivational-interviewing-therapy-conversations"

In [4]:
# Load the dataset identified by `dataset_name`; later cells read its 'train' split
data = load_dataset(dataset_name)

Downloading readme:   0%|          | 0.00/6.24k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/467k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [15]:
# Re-shape every training record into a list of {"role", "content"} messages:
# turns whose 'from' field is "gpt" become "assistant", all other turns become "user".
chat = [
    [
        {
            "role": "assistant" if turn['from'] == "gpt" else "user",
            "content": turn['value']
        }
        for turn in record['conversations']
    ]
    for record in data['train']
]

In [21]:
# Save the conversations as JSON under ./Datasets, the same path the load cell reads back.
# The directory is created first so the cell also works on a fresh checkout.
os.makedirs('./Datasets', exist_ok=True)
with open('./Datasets/substance_use_conv.json', 'w') as f:
    json.dump(chat, f)

In [31]:
# Read the saved conversations back from the JSON file under ./Datasets
with open('./Datasets/substance_use_conv.json') as json_file:
    chat = json.loads(json_file.read())

# Show how many conversations were read
print(len(chat))

133


In [32]:
def merge_consecutive_roles(conversations):
    """Collapse runs of same-role messages in every conversation.

    Each conversation is a list of ``{'role': ..., 'content': ...}`` dicts.
    Adjacent messages that share a role are joined into one message, with the
    contents separated by a single space and the result stripped of
    leading/trailing whitespace. A conversation that opens with an
    ``'assistant'`` message gets a ``'Hello there!'`` user message placed in
    front of it. A message (or merged run) whose content is empty is left out
    of the result.

    The input lists are not modified; new lists and new message dicts are
    built for the result.

    Args:
        conversations: Iterable of conversations (lists of message dicts).

    Returns:
        A list with one merged conversation per input conversation, in the
        same order. An empty input conversation yields an empty list.
    """
    merged_conversations = []

    for conversation in conversations:
        # Shallow copy: the greeting below must not be inserted into the caller's list.
        messages = list(conversation)

        # If the conversation opens with the assistant, put a user greeting in front.
        # The truthiness check keeps an empty conversation from raising IndexError.
        if messages and messages[0]['role'] == 'assistant':
            messages.insert(0, {'role': 'user', 'content': 'Hello there!'})

        merged_conversation = []
        current_role = None
        current_content = ""

        for message in messages:
            if message['role'] == current_role:
                # Same speaker as the previous message: extend the pending content
                current_content += " " + message['content']
            else:
                # Speaker changed (or first message): flush the pending message, if any
                if current_content:
                    merged_conversation.append(
                        {'role': current_role, 'content': current_content.strip()})
                current_role = message['role']
                current_content = message['content']

        # Flush the final pending message after the loop
        if current_content:
            merged_conversation.append(
                {'role': current_role, 'content': current_content.strip()})

        merged_conversations.append(merged_conversation)

    return merged_conversations

In [33]:
# Merge consecutive same-role messages in every loaded conversation
new_chat = merge_consecutive_roles(chat)
# One merged conversation is produced per input conversation, so the count should match len(chat)
print(f'Length of new chat: {len(new_chat)}')

Length of new chat: 133


In [38]:
# System prompt text; it is used verbatim as the content of the system message below.
# NOTE(review): the wording (including "a empathetic") is kept exactly as written,
# because this string ends up in the saved dataset.
system_msg = """
You are a empathetic, respectful and engaging motivational therapist.
Your goal is to extract information from patient by engaging in a conversation with them.
If you are not sure what to say, you can ask the patient to elaborate on their response. Keep the conversation short and to the point. 
"""
# Message dict in the same role/content shape as the conversation messages
system = dict(role='system', content=system_msg)

In [40]:

# Put the system message at the front of every merged conversation.
# The loop variable is not named `chat`, so the `chat` list loaded from JSON is not overwritten,
# and the role check makes re-running this cell harmless (no second system message is added).
for conversation in new_chat:
    if not conversation or conversation[0]['role'] != 'system':
        conversation.insert(0, system)

In [46]:
# Build a Dataset with a single 'chat' column holding one conversation per row
dataset = Dataset.from_dict({'chat': new_chat})

In [48]:
# Write the dataset to ./Datasets/substance_use_conv on disk
dataset.save_to_disk('./Datasets/substance_use_conv')

Saving the dataset (1/1 shards): 100%|██████████| 133/133 [00:00<00:00, 33459.84 examples/s]


In [26]:
# dataset['chat'][0]