- Convert Dataset (jsonl) -> fine-tune dolphin-mistral-7b -> quantize -> create ollama Modelfile -> push .gguf (HF)

TODO:
- Convert .ipynb to py
  - dataset.py | train.py
- Create config.yaml
  - Base Model + Dataset
  - Training parameters
  - Quantization Methods
- Check Discord implementation

In [6]:
import os
import json
from datasets import load_dataset
from random import shuffle, seed

def convert_to_jsonl(input_directory, 
                     output_dir, 
                     split_ratio=0.8, 
                     random_seed=42):
    """Split raw ``.txt`` conversation files into train/eval JSONL files.

    Every ``*.txt`` file in ``input_directory`` is read, cut into
    conversation blocks on blank lines, shuffled, and divided: the first
    ``split_ratio`` share of the blocks is appended to ``train.jsonl`` and
    the remainder to ``eval.jsonl`` (both inside ``output_dir``). The file
    name without its ``.txt`` extension is handed to ``extract_chats`` as
    the bot name.

    Args:
        input_directory: Directory containing the raw ``.txt`` files.
        output_dir: Directory for the output files; created if missing.
            Existing ``train.jsonl`` / ``eval.jsonl`` are overwritten.
        split_ratio: Fraction of each file's blocks assigned to training.
        random_seed: Seed applied to the module-level ``random`` generator
            before any shuffling.
    """
    seed(random_seed)
    os.makedirs(output_dir, exist_ok=True)
    train_filename = os.path.join(output_dir, 'train.jsonl')
    eval_filename = os.path.join(output_dir, 'eval.jsonl')

    # Clear existing data in output files (write_jsonl appends).
    for output_filename in (train_filename, eval_filename):
        with open(output_filename, 'w', encoding='utf-8'):
            pass

    # os.listdir() yields entries in arbitrary order. Sorting makes the
    # seeded shuffles hit the files in the same sequence on every run and
    # machine, so the train/eval split is actually reproducible.
    for input_filename in sorted(os.listdir(input_directory)):
        if not input_filename.endswith('.txt'):
            continue

        bot_name = input_filename[:-4]
        file_path = os.path.join(input_directory, input_filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            conversation_blocks = file.read().split('\n\n')

        shuffle(conversation_blocks)  # Shuffle for random splitting
        split_index = int(len(conversation_blocks) * split_ratio)

        train_chats = []
        eval_chats = []

        for block in conversation_blocks[:split_index]:
            train_chats.extend(extract_chats(block, bot_name))

        for block in conversation_blocks[split_index:]:
            eval_chats.extend(extract_chats(block, bot_name))

        # Write to train and eval files
        write_jsonl(train_chats, train_filename)
        write_jsonl(eval_chats, eval_filename)

def extract_chats(block, bot_name):
    """Turn one conversation block into a list holding at most one chat record.

    Lines starting with ``HUMAN:`` become ``user`` messages and lines
    starting with ``RESPONSE:`` become ``assistant`` messages; every other
    line is ignored. The chat always opens with a ``system`` message whose
    content is ``bot_name``.

    Args:
        block: Text of a single conversation, one message per line.
        bot_name: Value stored as the system message content.

    Returns:
        ``[{"chat": [...]}]`` when at least one user/assistant line was
        found, otherwise an empty list.
    """
    # Line marker -> role it maps to.
    markers = (("HUMAN:", "user"), ("RESPONSE:", "assistant"))

    chat = [{"role": "system", "content": bot_name}]
    for line in block.split('\n'):
        for marker, role in markers:
            if line.startswith(marker):
                # Strip only the leading marker. str.replace() would also
                # delete the same text wherever it reappears inside the
                # message, silently corrupting the content.
                chat.append({"role": role, "content": line[len(marker):].strip()})
                break

    return [{"chat": chat}] if len(chat) > 1 else []

def write_jsonl(chats, filename):
    """Append every item of ``chats`` to ``filename``, one JSON document per line.

    The file is opened in append mode, so earlier content is kept and the
    file is created when it does not exist yet.
    """
    with open(filename, mode='a', encoding='utf-8') as sink:
        for record in chats:
            sink.write(json.dumps(record) + '\n')

# Configuration: every path is resolved against the current working directory.
base_dir = os.getcwd()
# Relative folder of the generated dataset; also passed to load_dataset() below.
repo_dir = 'datasets/TRACHI'
# Folder that holds the raw .txt conversation files.
input_directory = os.path.join(base_dir, 'datasets', 'raw')
# Folder that receives train.jsonl and eval.jsonl.
output_dir = os.path.join(base_dir, repo_dir)

# Convert txt files to jsonl files, using the default split ratio and seed.
convert_to_jsonl(input_directory, output_dir)

# Load the freshly written JSONL files from the relative repo_dir (the same
# folder as output_dir while the working directory is unchanged) and push the
# result to the "norygano/TRACHI" repository.
# NOTE(review): the `datasets` library infers split names from file names;
# confirm which split `eval.jsonl` ends up in before relying on it downstream.
dataset = load_dataset(repo_dir)
dataset.push_to_hub("norygano/TRACHI")