In [7]:
import os
import json
from datasets import load_dataset

def convert_to_jsonl(input_directory, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    train_filename = os.path.join(output_dir, 'train.jsonl')
    eval_filename = os.path.join(output_dir, 'eval.jsonl')

    # Clear existing data in output files
    open(train_filename, 'w').close()
    open(eval_filename, 'w').close()

    # Dictionary to hold content by bot name
    content_by_bot = {}

    # Gather files from both 'raw' and 'autonomy' subdirectories
    for subfolder in ['raw', 'autonomy']:
        subfolder_path = os.path.join(input_directory, subfolder)
        for input_filename in os.listdir(subfolder_path):
            if input_filename.endswith('.txt'):
                bot_name = input_filename.replace('_autonomy', '').replace('.txt', '')
                file_path = os.path.join(subfolder_path, input_filename)

                with open(file_path, 'r', encoding='utf-8') as file:
                    content = file.read()
                if bot_name not in content_by_bot:
                    content_by_bot[bot_name] = content
                else:
                    content_by_bot[bot_name] += "\n\n" + content  # Add content from both files

    # Process content for each bot
    for bot_name, content in content_by_bot.items():
        process_conversations(content, bot_name, train_filename, eval_filename)

def process_conversations(content, bot_name, train_filename, eval_filename):
    conversation_blocks = content.split('\n\n')
    train_count = 0
    eval_count = 0
    total_lines = 0
    total_characters = 0
    block_line_counts = []

    for block in conversation_blocks:
        lines = block.split('\n')
        total_lines += len(lines)
        block_line_counts.append(len(lines))

        # Calculate total characters in each line
        for line in lines:
            total_characters += len(line)

        if lines[0].startswith('[t]'):
            target_file = train_filename
            train_count += 1
            lines[0] = lines[0][3:]  # Remove the [t] flag
        elif lines[0].startswith('[e]'):
            target_file = eval_filename
            eval_count += 1
            lines[0] = lines[0][3:]  # Remove the [e] flag
        else:
            continue  # Skip blocks without valid flags

        chat = extract_chats(lines, bot_name)
        write_jsonl(chat, target_file)

    # Calculate and print the training/evaluation ratio for this character
    total_blocks = train_count + eval_count
    if total_blocks > 0:
        train_ratio = train_count / total_blocks
        eval_ratio = eval_count / total_blocks
        avg_lines_per_block = total_lines / total_blocks
        avg_chars_per_line = total_characters / total_lines
        print(f"{bot_name}: Train {train_count} ({train_ratio:.2f}), Eval {eval_count} ({eval_ratio:.2f}), Total lines {total_lines}, Avg lines/block {avg_lines_per_block:.2f}, Avg chars/line {avg_chars_per_line:.2f}")

def extract_chats(lines, bot_name):
    chat = [{"role": "system", "content": bot_name}]
    for line in lines:
        if line.startswith("HUMAN:"):
            chat.append({"role": "user", "content": line.replace("HUMAN:", "").strip()})
        elif line.startswith("RESPONSE:"):
            chat.append({"role": "assistant", "content": line.replace("RESPONSE:", "").strip()})

    return [{"chat": chat}] if len(chat) > 1 else []

def write_jsonl(chats, filename):
    with open(filename, 'a', encoding='utf-8') as jsonl_file:
        for chat in chats:
            json.dump(chat, jsonl_file)
            jsonl_file.write('\n')

# Configuration
base_dir = os.getcwd()
repo_dir = 'datasets/TRACHI'
input_directory = os.path.join(base_dir, 'datasets')
output_dir = os.path.join(base_dir, repo_dir)

# Convert txt files to jsonl files
convert_to_jsonl(input_directory, output_dir)

dataset = load_dataset(repo_dir)
dataset.push_to_hub("norygano/TRACHI")

Daphne: Train 99 (0.79), Eval 26 (0.21), Total lines 1057, Avg lines/block 8.46, Avg chars/line 59.91
Atlas: Train 137 (0.79), Eval 36 (0.21), Total lines 1674, Avg lines/block 9.68, Avg chars/line 57.05
Pandora: Train 35 (0.78), Eval 10 (0.22), Total lines 283, Avg lines/block 6.29, Avg chars/line 55.71
Ganymede: Train 222 (0.80), Eval 57 (0.20), Total lines 1982, Avg lines/block 7.10, Avg chars/line 54.98


Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/430 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/norygano/TRACHI/commit/90d3842cb037e2b5fd34388b481c21c969200f43', commit_message='Upload dataset', commit_description='', oid='90d3842cb037e2b5fd34388b481c21c969200f43', pr_url=None, pr_revision=None, pr_num=None)

In [4]:
# Ratio Check (jsonl)
def manage_ratio(input_directory, expected_ratio=0.8):
    train_count = 0
    eval_count = 0

    # Count existing training and evaluation blocks
    for input_filename in os.listdir(input_directory):
        if input_filename.endswith('.txt'):
            file_path = os.path.join(input_directory, input_filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                blocks = file.read().split('\n\n')
                for block in blocks:
                    if block.startswith('[t]'):
                        train_count += 1
                    elif block.startswith('[e]'):
                        eval_count += 1

    total_blocks = train_count + eval_count
    current_train_ratio = train_count / total_blocks if total_blocks > 0 else 0

    print(f"Current Training Blocks: {train_count}")
    print(f"Current Evaluation Blocks: {eval_count}")
    print(f"Current Training Ratio: {current_train_ratio:.2f}")

    # Determine how many new blocks to add based on the expected ratio
    if current_train_ratio < expected_ratio:
        print("Add more training blocks.")
    else:
        print("Add more evaluation blocks.")

    return current_train_ratio

# Call the function
base_dir = os.getcwd()
repo_dir = 'datasets/TRACHI'
input_directory = os.path.join(base_dir, 'datasets', 'raw')
manage_ratio(input_directory)

Current Training Blocks: 143
Current Evaluation Blocks: 39
Current Training Ratio: 0.79
Add more training blocks.


0.7857142857142857

In [2]:
import pandas as pd

def load_csv(file_path):
    # Load the CSV with specified delimiter and escape character
    return pd.read_csv(file_path, delimiter='\t', escapechar='\\')

# Define character aliases
character_aliases = {
    "Eurydice": ['Eurydice'],
}

df = load_csv('AUTONOMY.csv')

def process_character_dialogues(character, aliases, df):
    # Identify conversations that include the character
    df['Character_present'] = df['Speaker'].isin(aliases)
    relevant_maps = df[df['Character_present']]['Map'].unique()
    filtered_df = df[df['Map'].isin(relevant_maps)].copy()
    
    output_path = f"datasets/autonomy/{character}_autonomy.txt"
    with open(output_path, 'w', encoding='utf-8') as f:
        current_map = None
        last_speaker = None

        for _, row in filtered_df.iterrows():
            if row['Map'] != current_map:
                # Close the last conversation with RESPONSE if needed
                if current_map is not None and last_speaker not in aliases:
                    f.write("\nRESPONSE:")
                f.write("\n\n")
                
                current_map = row['Map']
                last_speaker = None  # Reset the last speaker for new map
                # Start with an empty HUMAN if the first speaker is the character
                if row['Speaker'] in aliases:
                    f.write("HUMAN:\n")

            if last_speaker != row['Speaker']:
                if last_speaker is not None:
                    f.write("\n")
                f.write("RESPONSE: " if row['Speaker'] in aliases else "HUMAN: ")
            f.write(row['Dialogue'].strip() + " ")

            last_speaker = row['Speaker']

        # Ensure the last dialogue in the file ends with RESPONSE:
        if last_speaker not in aliases:
            f.write("\nRESPONSE: ")

    print(f"Dialogues saved to '{output_path}'.")

# Process dialogues for each character
for character, aliases in character_aliases.items():
    process_character_dialogues(character, aliases, df)

Dialogues saved to 'datasets/autonomy/Daphne_autonomy.txt'.
Dialogues saved to 'datasets/autonomy/Atlas_autonomy.txt'.
Dialogues saved to 'datasets/autonomy/Lorna_autonomy.txt'.


In [4]:
# OLD: Notate
import os
from random import sample, seed

def add_split_notation(input_directory, split_ratio=0.8, random_seed=42):
    seed(random_seed)

    for input_filename in os.listdir(input_directory):
        if input_filename.endswith('Atlas_autonomy.txt'):
            file_path = os.path.join(input_directory, input_filename)

            with open(file_path, 'r', encoding='utf-8') as file:
                conversation_blocks = file.read().split('\n\n')
                
            total_blocks = len(conversation_blocks)
            split_index = int(total_blocks * split_ratio)
            
            # Generate a random sample of indexes for training
            training_indexes = set(sample(range(total_blocks), split_index))
            
            with open(file_path, 'w', encoding='utf-8') as file:
                for i, block in enumerate(conversation_blocks):
                    lines = block.split('\n')
                    
                    if i in training_indexes:
                        lines[0] = "[t]" + lines[0]
                    else:
                        lines[0] = "[e]" + lines[0]
                    
                    # Join the lines back together and write back to the file
                    file.write('\n'.join(lines) + '\n\n')

# Configuration
base_dir = os.getcwd()
repo_dir = 'datasets/TRACHI'
input_directory = os.path.join(base_dir, 'datasets', 'autonomy')

# Run the function
add_split_notation(input_directory)