In [33]:
import pandas as pd
import json
import os
print("Combingin all data sources.........")

Combingin all data sources.........


In [34]:
all_conversations = []

In [35]:
# Survery form data
# Open your JSONL file properly
with open("data/processed/survey.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        try:
            data = json.loads(line)
            # Make sure it is in 'messages' format
            if "user" in data and "assistant" in data:
                normalized = {
                    "messages": [
                        {"role": "user", "content": data["user"]},
                        {"role": "assistant", "content": data["assistant"]}
                    ]
                }
            elif "messages" in data:
                normalized = data
            else:
                print(f"Unknown format, skipping line: {line[:50]}")
                continue

            all_conversations.append(normalized)
        except json.JSONDecodeError as e:
            print(f"Skipping invalid JSON line: {line[:50]} | Error: {e}")

print(f"Loaded {len(all_conversations)} conversations")


Loaded 44 conversations


In [36]:


empathetic_file = "data/raw/empatheticdialogues/train.csv"
output_file = "data/processed/empathetic_conversations.jsonl"

if os.path.exists(empathetic_file):
    try:
        print("📋 Reading CSV file...")
        try:
            df = pd.read_csv(empathetic_file, sep=',', encoding='utf-8', on_bad_lines='skip')
        except Exception:
            print("⚠️ Comma separator failed, trying tab...")
            df = pd.read_csv(empathetic_file, sep='\t', encoding='utf-8', on_bad_lines='skip')

        # Keep only the required columns
        keep_cols = ['conv_id', 'prompt', 'utterance']
        df = df[[col for col in keep_cols if col in df.columns]]

        # Check required columns
        if not {'prompt', 'utterance'}.issubset(df.columns):
            raise ValueError("Missing required columns: 'prompt' and 'utterance'")

        print(f"📊 Columns kept: {list(df.columns)}")
        print(f"📊 Total rows: {len(df)}")

        # If conv_id is missing, create one based on row order
        if 'conv_id' not in df.columns:
            df['conv_id'] = range(len(df))

        # Group by conversation ID if present
        grouped = df.groupby('conv_id')

        for conv_id, group in grouped:
            messages = []

            for _, row in group.iterrows():
                # Add the prompt as user message
                prompt = str(row['prompt']).strip()
                if prompt:
                    messages.append({"role": "user", "content": prompt})

                # Add the utterance as assistant message
                utterance = str(row['utterance']).strip()
                if utterance:
                    messages.append({"role": "assistant", "content": utterance})

            # Only include meaningful conversations
            if len(messages) >= 2:
                all_conversations.append({"messages": messages})

        # Save to JSONL
        os.makedirs(os.path.dirname(output_file), exist_ok=True)
        with open(output_file, 'w', encoding='utf-8') as f:
            for conv in all_conversations:
                f.write(json.dumps(conv, ensure_ascii=False) + '\n')

        print(f"✅ Successfully saved {len(all_conversations)} conversations to {output_file}")

    except Exception as e:
        print(f"⚠️ Error processing EmpatheticDialogues: {e}")
        import traceback
        print(traceback.format_exc())

else:
    print(f"⚠️ EmpatheticDialogues file not found at {empathetic_file}")


📋 Reading CSV file...
📊 Columns kept: ['conv_id', 'prompt', 'utterance']
📊 Total rows: 76668
✅ Successfully saved 17883 conversations to data/processed/empathetic_conversations.jsonl


In [37]:
with open("data/processed/burmese.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        try:
            data = json.loads(line)
            # Make sure it is in 'messages' format
            if "user" in data and "assistant" in data:
                normalized = {
                    "messages": [
                        {"role": "user", "content": data["user"]},
                        {"role": "assistant", "content": data["assistant"]}
                    ]
                }
            elif "messages" in data:
                normalized = data
            else:
                print(f"Unknown format, skipping line: {line[:50]}")
                continue

            all_conversations.append(normalized)
        except json.JSONDecodeError as e:
            print(f"Skipping invalid JSON line: {line[:50]} | Error: {e}")

print(f"Loaded {len(all_conversations)} conversations")


Loaded 17938 conversations


In [38]:
# Save final Combined Data


output_file = "data/processed/combined3dataset.jsonl"

print(f"\n💾 Saving combined data...")
print(f"📊 Total conversations: {len(all_conversations)}")

with open(output_file, 'w', encoding='utf-8') as f:
    for conv in all_conversations:
        f.write(json.dumps(conv, ensure_ascii=False) + '\n')

print(f"✅ Saved to: {output_file}")


💾 Saving combined data...
📊 Total conversations: 17938
✅ Saved to: data/processed/combined3dataset.jsonl


In [41]:

import csv

# --- Configuration ---
input_file_name = 'data/processed/combined3dataset.jsonl'
output_file_name = 'data/processed/combined3dataset.csv'
# ---------------------

print(f"Starting conversion from {input_file_name} to {output_file_name}...")

converted_pairs = 0
skipped_lines = 0

try:
    # 'w' = write mode, newline='' is for csv, encoding='utf-8' is for Burmese
    with open(output_file_name, 'w', newline='', encoding='utf-8') as csvfile:
        csv_writer = csv.writer(csvfile)
        
        # Write the header row
        csv_writer.writerow(['user', 'assistant'])
        
        with open(input_file_name, 'r', encoding='utf-8') as jsonlfile:
            
            # Read one line at a time
            for i, line in enumerate(jsonlfile):
                line = line.strip()
                if not line:
                    continue
                    
                try:
                    data = json.loads(line)
                    
                    # Check if the line has the expected 'messages' list
                    if 'messages' in data and isinstance(data['messages'], list):
                        messages = data['messages']
                        
                        # Loop through the messages list
                        # We look for a 'user' role followed by an 'assistant' role
                        j = 0
                        while j < len(messages) - 1:
                            if messages[j].get('role') == 'user' and messages[j+1].get('role') == 'assistant':
                                
                                # Extract the content
                                user_content = messages[j].get('content', '')
                                assistant_content = messages[j+1].get('content', '')
                                
                                # --- DATA CLEANING ---
                                # This part fixes your "unusual data" problem
                                # It splits the string at the junk data and keeps only the real message
                                if ",5|5|5_" in assistant_content:
                                    assistant_content = assistant_content.split(",5|5|5_")[0]
                                
                                # Write the clean pair as one row in the CSV
                                csv_writer.writerow([user_content, assistant_content])
                                converted_pairs += 1
                                
                                # Move to the next pair
                                j += 2
                            else:
                                # If the pattern is broken (e.g., user, user, assistant)
                                # skip this message to find the next 'user'
                                j += 1 
                    else:
                        print(f"Skipping line {i+1}: 'messages' key not found or format is wrong.")
                        skipped_lines += 1
                        
                except json.JSONDecodeError:
                    print(f"Skipping line {i+1}: Not valid JSON.")
                    skipped_lines += 1
                except Exception as e:
                    print(f"Skipping line {i+1}: Unexpected error - {e}")
                    skipped_lines += 1

    print(f"\n--- Conversion Complete ---")
    print(f"Successfully converted {converted_pairs} user/assistant pairs into CSV rows.")
    print(f"Skipped {skipped_lines} problematic lines.")
    print(f"New file created: {output_file_name}")

except FileNotFoundError:
    print(f"Error: The file '{input_file_name}' was not found.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

Starting conversion from data/processed/combined3dataset.jsonl to data/processed/combined3dataset.csv...

--- Conversion Complete ---
Successfully converted 76767 user/assistant pairs into CSV rows.
Skipped 0 problematic lines.
New file created: data/processed/combined3dataset.csv
