In [2]:
import pandas as pd
import re

# Read the two synthetic-conversation CSV exports into separate DataFrames
# (first file -> df, second file -> df2).
df, df2 = (
    pd.read_csv(csv_name)
    for csv_name in ('synthetic_conversations.csv', 'synthetic_conversations2.csv')
)

In [3]:
# Stack the second export underneath the first; ignore_index=True discards the
# original row labels and renumbers the combined rows from 0.
# NOTE(review): this cell reassigns `df` from itself, so re-running it without
# re-running the load cell appends `df2` again and duplicates those rows.
df = pd.concat([df, df2], ignore_index=True)

In [5]:
# Transform one raw transcript string into a conversational (chat-message) format
def process_conversation(text):
    """Convert a flat ``System:/User:/AI:`` transcript into a chat-message dict.

    Parameters
    ----------
    text : str
        Raw transcript shaped like ``"System: ... User: ... AI: ..."``. The
        leading ``System:`` section is optional and the ``User:``/``AI:`` pair
        may repeat any number of times. Any section may span several lines.

    Returns
    -------
    dict or None
        ``{"messages": [...]}`` whose first entry has role ``"system"`` (with
        empty content when the transcript has no system section), followed by
        alternating ``"user"`` / ``"assistant"`` entries in transcript order.
        ``None`` when ``text`` is not a string (e.g. NaN produced by an empty
        CSV cell) or contains no complete ``User: ... AI: ...`` pair.
    """
    # Non-string cells (NaN / None) cannot be parsed; report them as badly
    # formatted instead of letting ``re`` raise TypeError mid-``apply``.
    if not isinstance(text, str):
        return None

    # Both patterns use (?s) so that "." also matches newlines; without it a
    # system prompt spanning several lines would never match and be dropped.
    system_pattern = r"(?s)System: (.+?)(?=\s*User:|\Z)"
    dialogue_pattern = r"(?s)User: (.+?)\s*AI: (.+?)(?=\s*User:|\Z)"

    # Find all user-AI pairs; no pair at all means the format is wrong.
    matches = re.findall(dialogue_pattern, text)
    if not matches:
        return None

    # The system message is optional: fall back to an empty string.
    system_match = re.search(system_pattern, text)
    system_content = system_match.group(1).strip() if system_match else ""

    conversation = [{"role": "system", "content": system_content}]

    # Add user and AI messages in the order they appear.
    for user_text, ai_text in matches:
        conversation.append({"role": "user", "content": user_text.strip()})
        conversation.append({"role": "assistant", "content": ai_text.strip()})

    return {"messages": conversation}

# Turn each raw transcript in the 'Converse' column into its structured chat
# form and store the result in a new column.
df['transformed_conversation'] = df['Converse'].apply(process_conversation)

# Keep only the rows whose transformed_conversation is non-null, i.e. drop the
# transcripts that process_conversation could not parse.
df_cleaned = df.loc[df['transformed_conversation'].notna()]

# Persist the filtered frame (without the index) to a new CSV file.
df_cleaned.to_csv('transformed_dataset.csv', index=False)

# Preview the first few transformed conversations.
print(df_cleaned.head()[['transformed_conversation']])


                            transformed_conversation
0  {'messages': [{'role': 'system', 'content': ''...
1  {'messages': [{'role': 'system', 'content': ''...
2  {'messages': [{'role': 'system', 'content': ''...
3  {'messages': [{'role': 'system', 'content': ''...
4  {'messages': [{'role': 'system', 'content': ''...


In [6]:
# Report the dimensions (rows, columns) of `df`.
print(df.shape)

(30722, 6)


In [9]:
# Display the first row of the cleaned frame as a quick visual check.
df_cleaned.head(1)

Unnamed: 0,Converse,Topic,Subtopic,Style,Greetings,transformed_conversation
0,User: M'he quedat sense idees per a sopar. Què...,Handling ambiguity,Providing multiple possibilities,Dilemmas,Do not include greetings,"{'messages': [{'role': 'system', 'content': ''..."
