In [7]:
%%capture
# %%capture silences the installer output of this cell.
import os, importlib.util
# Bootstrap `uv`; the remaining installs below go through `uv pip`.
!pip install --upgrade -qqq uv
# Full install path: taken when torch is not importable, or when any environment
# variable name contains "COLAB_".
if importlib.util.find_spec("torch") is None or "COLAB_" in "".join(os.environ.keys()):    
    # Pin numpy/pillow to the versions that are already importable (presumably so the
    # install does not replace them - TODO confirm); fall back to unpinned names otherwise.
    # NOTE(review): the bare `except:` also swallows errors other than ImportError.
    try: import numpy, PIL; get_numpy = f"numpy=={numpy.__version__}"; get_pil = f"pillow=={PIL.__version__}"
    except: get_numpy = "numpy"; get_pil = "pillow"
    !uv pip install -qqq \
        "torch>=2.8.0" "triton>=3.4.0" {get_numpy} {get_pil} torchvision bitsandbytes "transformers==4.56.2" \
        "unsloth_zoo[base] @ git+https://github.com/unslothai/unsloth-zoo" \
        "unsloth[base] @ git+https://github.com/unslothai/unsloth" \
        git+https://github.com/triton-lang/triton.git@05b2c186c1b6c9a08375389d5efe9cb4c401c075#subdirectory=python/triton_kernels
# torch is importable and no "COLAB_" variable is set: only add unsloth if it is missing.
elif importlib.util.find_spec("unsloth") is None:
    !uv pip install -qqq unsloth
# Always runs: force the pinned versions below without resolving their dependencies (--no-deps).
!uv pip install --upgrade --no-deps transformers==4.56.2 tokenizers trl==0.22.2 unsloth unsloth_zoo huggingface-hub==0.34.4 datasets==4.3.0 numpy==2.3.4 pandas==2.3.3 pyarrow==22.0.0 tqdm==4.67.1

In [10]:
!pip list | grep -E "datasets|pandas|tqdm|numpy|pyarrow|transformers|tokenizers|trl|unsloth|unsloth_zoo|huggingface-hub|datasets|numpy"

datasets                          4.3.0
fastrlock                         0.8.3
huggingface-hub                   0.34.4
numpy                             2.3.4
pandas                            2.3.3
pyarrow                           22.0.0
tokenizers                        0.22.1
tqdm                              4.67.1
transformers                      4.56.2
trl                               0.22.2
unsloth                           2025.11.2
unsloth_zoo                       2025.11.3


In [11]:
from huggingface_hub import notebook_login

In [12]:
# Show the Hugging Face login widget (it renders as this cell's VBox output).
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv‚Ä¶

In [13]:
from datasets import load_dataset, concatenate_datasets, Dataset
import pandas as pd
import random
import json

# Seed Python's `random` module; inspect_random_entry draws its sample index from it.
random.seed(42)

# Master switch for the inspect_* helpers: while False they return without printing
# unless the caller passes force_disp=True.
INSPECT_ON = False

# Prints ONE random entry
def inspect_random_entry(ds_name, ds_object, force_disp = False):
    """Print one randomly chosen raw sample from a dataset.

    Args:
        ds_name: Label shown in the inspection header.
        ds_object: Collection supporting ``len()`` and integer indexing.
        force_disp: When True, print even if the global ``INSPECT_ON`` switch is off.

    Returns:
        None. All output goes to stdout.
    """
    # `or` short-circuits, so INSPECT_ON is only consulted when display is not forced.
    if not (force_disp or INSPECT_ON):
        return

    print(f"\n--- INSPECTION: {ds_name} (ONE RANDOM RAW SAMPLE) ---")

    total = len(ds_object)
    if total == 0:
        # random.randint(0, -1) would raise ValueError on an empty dataset.
        print("Dataset is empty; nothing to sample.")
        print("-" * 50)
        return

    random_index = random.randint(0, total - 1)
    sample = ds_object[random_index]
    print(f"Sample Index: {random_index} Raw Data:")
    # default=str keeps json.dumps from failing on values that are not JSON-serialisable.
    print(json.dumps(sample, indent=2, default=str))
    print("-" * 50)
    
# Prints parts of the formatted datasets
def inspect_formatted_sft(ds_name, ds_object, n = 2, disp_len = 50, force_disp = False):
    """Print a truncated preview of the first ``n`` formatted SFT rows.

    Args:
        ds_name: Label for the dataset; accepted for call-site readability, not printed.
        ds_object: Iterable of dict rows with 'instruction', 'response' and 'source'.
        n: Number of rows to print.
        disp_len: Maximum number of characters shown per instruction/response.
        force_disp: When True, print even if the global ``INSPECT_ON`` switch is off.

    Returns:
        None. All output goes to stdout.
    """
    if not (force_disp or INSPECT_ON):
        return

    def _preview(value):
        # Formatters emit None for rows they could not parse; show that instead of
        # failing on the slice.
        return value[:disp_len] if isinstance(value, str) else str(value)

    for i, entry in enumerate(ds_object):
        # `>=` so that exactly n rows are shown (`>` printed n + 1).
        if i >= n:
            break
        print(f"--- Entry {i+1} ---")
        print(f"Instruction: {_preview(entry.get('instruction'))}...") # Truncating for readability
        print(f"Response: {_preview(entry.get('response'))}")
        # .get: not every formatter sets 'source'.
        print(f"Source: {entry.get('source')}")
        print("-" * 50)
        print("\n")


# EXTRACTION FUNCTIONS: convert rows from the various source datasets to { "instruction": "xxx", "response": "xxx", "source": "xxx" }
def extract_lmsys_winner(example):
    """Extract the first user/assistant exchange from the winning conversation.

    Conversation B is used only when ``winner == 'model_b'``; every other value
    ('model_a', ties, missing) falls back to conversation A.

    Args:
        example: Row dict with 'winner', 'conversation_a' and 'conversation_b', where
            each conversation is a list of {'role', 'content'} dicts.

    Returns:
        dict with 'instruction', 'response' and 'source'. The two text fields are None
        when the conversation has fewer than two turns or the turn has an unexpected
        role. 'source' is always present so every row shares one schema.
    """
    conv_key = 'conversation_b' if example.get('winner') == 'model_b' else 'conversation_a'
    conv_list = example.get(conv_key)

    instruction, response = None, None
    if conv_list and len(conv_list) >= 2:
        first_turn, second_turn = conv_list[0], conv_list[1]
        if first_turn['role'] == 'user':
            instruction = first_turn['content']
        if second_turn['role'] == 'assistant':
            response = second_turn['content']

    return {"instruction": instruction, "response": response, "source": "lmsys_conv"}


def format_ultrachat(example):
    """Reduce an UltraChat row to its first user message and first assistant message.

    Args:
        example: Row dict whose 'messages' value is a list of {'role', 'content'} dicts.

    Returns:
        dict with 'instruction', 'response' (None when no message has that role)
        and 'source' set to "ultrachat".
    """
    turns = example['messages']

    user_texts = []
    for turn in turns:
        if turn['role'] == 'user':
            user_texts.append(turn['content'])

    assistant_texts = []
    for turn in turns:
        if turn['role'] == 'assistant':
            assistant_texts.append(turn['content'])

    first_user = user_texts[0] if user_texts else None
    first_assistant = assistant_texts[0] if assistant_texts else None
    return {"instruction": first_user, "response": first_assistant, "source": "ultrachat"}


def format_dolly(example):
    """Format one Dolly row, folding a non-empty 'context' into the instruction.

    Args:
        example: Row dict with 'instruction', 'context' and 'response' keys. Missing or
            non-string instruction/context values are treated as empty text.

    Returns:
        dict with:
          - 'instruction': "Context: ...\\n\\nInstruction: ..." when context is
            non-blank, otherwise the stripped instruction ("" when there is none, so
            the row can be dropped by a later quality filter).
          - 'response': the stripped response, or the original value unchanged when it
            is not a string.
          - 'source': "dolly".
    """
    def _clean(value):
        # A key that is present with a None value bypasses dict.get's default, so
        # guard on type before stripping.
        return value.strip() if isinstance(value, str) else ""

    instruction_text = _clean(example.get("instruction"))
    context_text = _clean(example.get("context"))
    response = example.get("response", "")

    if context_text:
        final_instruction = f"Context: {context_text}\n\nInstruction: {instruction_text}"
    else:
        final_instruction = instruction_text

    # Non-string responses are passed through untouched for downstream filtering.
    final_response = response.strip() if isinstance(response, str) else response

    return {
        "instruction": final_instruction,
        "response": final_response,
        "source": "dolly"
    }



import re 
# Note: `re` is used by format_GSM8K below; ideally this import sits with the other imports
# at the top of the cell. The pure-Python `re` approach replaces an earlier pd.Series-based cleanup.

def format_GSM8K(example):
    """Format one GSM8K row as an instruction/response pair.

    The 'question' becomes the instruction. The 'answer' becomes the response after
    removing calculator annotations (``<<...>>``) and the ``#### <answer>`` marker.

    Args:
        example: Row dict with 'question' and 'answer'.

    Returns:
        dict with 'instruction', 'response' and 'source' set to "GSM8K". Non-string
        values are returned unchanged.
    """
    question = example.get("question")
    solution = example.get("answer")

    if isinstance(solution, str):
        # Pass 1: strip calculator tokens such as <<16+20=36>>.
        without_calc = re.sub(r'<<.*?>>', '', solution).strip()
        # Pass 2: strip the final-answer marker and whatever follows it on that line.
        solution = re.sub(r'####\s*[^\n]+', '', without_calc).strip()

    if isinstance(question, str):
        question = question.strip()

    return {"instruction": question, "response": solution, "source": "GSM8K"}

def format_OpenThoughts(example):
    """Format one OpenThoughts row as an instruction/response pair.

    Takes the 'value' of the first turn whose 'from' is "user" as the instruction and
    of the first "assistant" turn as the response, then removes the thought/solution
    marker tokens from the response.

    Args:
        example: Row dict; 'conversations' is expected to be a list of
            {'from', 'value'} dicts. Any other shape yields None fields.

    Returns:
        dict with 'instruction', 'response' and 'source' set to "OpenThoughts".
    """
    def _first_value(turns, speaker):
        # Value of the first turn spoken by `speaker`, or None when there is none.
        for turn in turns:
            if turn['from'] == speaker:
                return turn.get('value')
        return None

    question, answer = None, None

    turns = example['conversations'] if 'conversations' in example else None
    if isinstance(turns, list):
        question = _first_value(turns, 'user')
        answer = _first_value(turns, 'assistant')

    if isinstance(answer, str):
        # Remove the chain-of-thought delimiters, keeping the text between them.
        for marker in (
            "<|begin_of_thought|>",
            "<|end_of_thought|>",
            "<|begin_of_solution|>",
            "<|end_of_solution|>",
        ):
            answer = answer.replace(marker, "")
        answer = answer.strip()

    return {"instruction": question, "response": answer, "source": "OpenThoughts"}
    
def format_math(example):
    """Map a MATH row's 'problem'/'solution' to 'instruction'/'response' (source "math").

    Raises:
        KeyError: if 'problem' or 'solution' is missing from the row.
    """
    problem = example["problem"]
    solution = example["solution"]
    return dict(instruction=problem, response=solution, source="math")


def format_alpaca(example):
    """Format one Alpaca-style row, appending a non-empty 'input' to the instruction.

    Args:
        example: Row dict with 'instruction', optional 'input', and the answer under
            'output', 'response' or 'answer' (first truthy one wins). Missing or
            non-string fields are treated as empty text instead of raising.

    Returns:
        dict with 'instruction', 'response' and 'source' set to "alpaca".
    """
    def _text(value):
        # Guard on type: dict.get's default does not apply to a key stored as None.
        return value.strip() if isinstance(value, str) else ""

    inst = _text(example.get("instruction"))
    inp = _text(example.get("input"))
    out = _text(example.get("output") or example.get("response") or example.get("answer"))

    if inp:
        # Labelled so the extra input stays distinguishable from the instruction proper.
        inst = f"{inst}\n\nInput:\n{inp}".strip()

    return {"instruction": inst, "response": out, "source": "alpaca"}


def format_sharegpt(example):
    """Reduce a ShareGPT row to its first human turn and first gpt turn.

    Args:
        example: Row dict whose 'conversations' value is a list of {'from', 'value'}
            dicts with speakers "human" and "gpt".

    Returns:
        dict with 'instruction', 'response' and 'source' set to "sharegpt". The text
        fields are None when the conversation is missing, has fewer than two turns,
        or lacks that speaker. 'source' is always set so the row has the same schema
        as the other formatters' output.
    """
    convs = example["conversations"] if "conversations" in example else None

    instruction, response = None, None
    if convs and len(convs) >= 2:
        instruction = next((c["value"] for c in convs if c["from"] == "human"), None)
        response = next((c["value"] for c in convs if c["from"] == "gpt"), None)

    return {"instruction": instruction, "response": response, "source": "sharegpt"}
    
def format_boolq(example):
    """Format one BoolQ row as a yes/no question with optional context.

    Args:
        example: Row dict with 'question', 'passage' and a boolean 'answer'.

    Returns:
        dict with 'instruction' ("Question: ..." plus a "Context:" section when the
        passage is non-empty), 'response' ("Yes" for True, "No" for False, "" for
        anything else) and 'source' set to "boolq".
    """
    question = example.get("question", "").strip()
    passage = example.get("passage", "").strip()
    label = example.get("answer", None)

    # Identity checks on purpose: only the real booleans map to Yes/No.
    answer_text = "Yes" if label is True else ("No" if label is False else "")

    sections = [f"Question: {question}"]
    if passage:
        sections.append(f"Context:\n{passage}")
    instruction = "\n\n".join(sections)

    return {"instruction": instruction.strip(), "response": answer_text.strip(), "source": "boolq"}

def format_arc(sample):
    """Convert one allenai/ai2_arc sample to the SFT instruction/response format.

    The prompt lists every choice as "<label>. <text>" and names the labels that are
    actually present, so rows with numeric labels or a different number of choices get
    a matching prompt. For the labels A-D the text is unchanged from the previous
    hard-coded wording.

    Args:
        sample (dict): Row with 'question', 'answerKey' and 'choices', where 'choices'
            holds parallel 'label' and 'text' lists.

    Returns:
        dict: {"instruction": ..., "response": ..., "source": "arc"}. 'response' is
        None when 'answerKey' does not match any choice, so the row can be dropped by
        a later quality filter instead of aborting the whole map.
    """
    question = sample['question']
    answer_key = sample['answerKey']
    choices_text = sample['choices']['text']
    choices_label = sample['choices']['label']

    choices_block = "\n".join(
        f"{label}. {text}" for label, text in zip(choices_label, choices_text)
    )

    # Spell out the labels this row really offers, e.g. "A, B, C, or D" / "1, 2, or 3".
    labels = [str(label) for label in choices_label]
    if len(labels) > 2:
        label_list = ", ".join(labels[:-1]) + f", or {labels[-1]}"
    else:
        label_list = " or ".join(labels)
    label_noun = "letter" if all(label.isalpha() for label in labels) else "label"

    instruction = (
        f"Question: {question}\n\n"
        f"Choices:\n{choices_block}\n\n"
        f"Please select the {label_noun} ({label_list}) that represents the correct answer."
    )

    # Answer with both the key and its text; None when the key is not among the choices.
    response = None
    if answer_key in choices_label:
        answer_index = choices_label.index(answer_key)
        if answer_index < len(choices_text):
            response = f"Correct Answer: {answer_key}. {choices_text[answer_index]}"

    return {
        "instruction": instruction,
        "response": response,
        "source": "arc"
    }

def format_hotpot(example):
    """Format one HotpotQA-style row; the response carries the answer plus context.

    Only the first entry of ``example["context"]["sentences"]`` is used as context
    text, with its sentences joined by newlines.

    Args:
        example: Row dict with 'question', 'answer' and optionally a 'context' dict
            holding a 'sentences' list of sentence lists.

    Returns:
        dict with 'instruction', 'response' and 'source' set to "hotpot". The context
        part is empty when 'context' is missing, is not a dict, or has no sentences.
    """
    context = example["context"] if "context" in example else None
    sentences = context.get("sentences") if isinstance(context, dict) else None
    # Guard the [0] lookup: an empty or missing 'sentences' list means no context text.
    first_paragraph = sentences[0] if sentences else []
    context_text = "\n".join(first_paragraph)

    response = f"Answer: {example['answer']}\n\nContext:\n{context_text}"
    return {"instruction": example["question"], "response": response, "source": "hotpot"}


def format_gpt4(example):
    """Reduce a ShareGPT4 row to its first human turn and first gpt turn.

    Args:
        example: Row dict; 'items' is expected to be a list of {'from', 'value'}
            dicts with speakers "human" and "gpt". Any other shape yields None fields.

    Returns:
        dict with 'instruction', 'response' and 'source' set to "sharegpt4".
    """
    def _first_value(turns, speaker):
        # Value of the first turn spoken by `speaker`, or None when there is none.
        for turn in turns:
            if turn['from'] == speaker:
                return turn.get('value')
        return None

    prompt, reply = None, None

    turns = example['items'] if 'items' in example else None
    if isinstance(turns, list):
        prompt = _first_value(turns, 'human')
        reply = _first_value(turns, 'gpt')

    return {"instruction": prompt, "response": reply, "source": "sharegpt4"}
    

#  EXTRACTION FUNCTIONS END.

# --- 1. Load Stable Datasets and Inspect ---

print("üì• Starting dataset loading process and inspection...")


# Seed passed to Dataset.shuffle inside load_and_random_select so the sampled subset is reproducible.
# NOTE(review): the other shuffles in this cell pass the literal 42 instead of SEED.
SEED = 42 

def load_and_random_select(dataset_name, split_name, percentage, config_name=None):
    """Load one split and return a seeded random subset of it.

    Args:
        dataset_name: Dataset identifier passed to ``load_dataset``.
        split_name: Split to load (passed as ``split=``).
        percentage: Fraction of the split to keep, e.g. 0.15 for 15 %.
        config_name: Optional dataset configuration; omitted from the call when falsy.

    Returns:
        The split shuffled with ``SEED`` and cut to ``int(len(split) * percentage)`` rows.
    """
    print(f"\n-> Loading and selecting {percentage*100:.0f}% randomly from {dataset_name} ({split_name})")

    # A falsy config_name means "no config": pass the dataset name alone.
    positional = (dataset_name, config_name) if config_name else (dataset_name,)
    full_split = load_dataset(*positional, split=split_name)

    # int() truncates, so the subset never exceeds the requested fraction.
    sample_size = int(len(full_split) * percentage)

    shuffled = full_split.shuffle(seed=SEED)
    return shuffled.select(range(sample_size))


# --- Conversational tone / flow: Dolly, UltraChat (author's target share: 45 %) ---
# Teaches helpful, polite phrasing and structure.

# Earlier fixed-slice variant: load_dataset("databricks/databricks-dolly-15k", split="train[:80%]")
dolly = load_and_random_select("databricks/databricks-dolly-15k", "train", 0.15)
inspect_random_entry("databricks/databricks-dolly-15k", dolly)

# Earlier fixed-slice variant: load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft[:10%]")
ultrachat = load_and_random_select("HuggingFaceH4/ultrachat_200k", "train_sft", 0.01)
inspect_random_entry("HuggingFaceH4/ultrachat_200k", ultrachat)

# --- Structured reasoning (CoT): Open-Thoughts, GSM8K, MATH (author's target share: 35 %) ---
# Injects step-by-step clarity and logical progression.

# Earlier fixed-slice variant: load_dataset("open-thoughts/open-thoughts-114k", split="train[:21%]")
cot_thoughts = load_and_random_select("open-thoughts/open-thoughts-114k", "train", 0.40)
inspect_random_entry("open-thoughts/open-thoughts-114k", cot_thoughts)

# GSM8K and MATH are loaded in full (no random sub-sampling).
gsm8k = load_dataset("openai/gsm8k", "main", split="train") 
inspect_random_entry("openai/gsm8k", gsm8k)

math_ds = load_dataset("HuggingFaceH4/math", split="train")
inspect_random_entry("HuggingFaceH4/math", math_ds)

# --- Instruction diversity / broad generality: Alpaca, ARC, BoolQ (author's target share: 15 %) ---
# Keeps the model generalist and able to follow any prompt.
# Earlier fixed-slice variant: load_dataset("tatsu-lab/alpaca", split="train[:15%]")
alpaca = load_and_random_select("tatsu-lab/alpaca", "train", 0.05)
inspect_random_entry("tatsu-lab/alpaca", alpaca)

# Earlier fixed-slice variant: load_dataset("google/boolq", split="train[:30%]")
boolq = load_and_random_select("google/boolq", "train", 0.30)
inspect_random_entry("google/boolq", boolq)

# ARC-Challenge train split, loaded in full.
arc = load_dataset("allenai/ai2_arc", "ARC-Challenge", split="train")
inspect_random_entry("allenai/ai2_arc", arc)

# --- Human-preferred style: LMSYS winners (author's target share: 5 %) ---
# "Style anchor" for alignment and empathy.
# Earlier fixed-slice variant: load_dataset("lmsys/chatbot_arena_conversations", split="train[:15%]")
lmsys_conv_raw = load_and_random_select("lmsys/chatbot_arena_conversations", "train", 0.08)
inspect_random_entry("lmsys/chatbot_arena_conversations", lmsys_conv_raw)


# --- ShareGPT4 (GPT-4 conversations) ---
# ShareGPT sources tried earlier and left disabled:
#sharegpt = load_dataset("anon8231489123/ShareGPT_Vicuna_unfiltered", split="train[:3%]")
#sharegpt = load_dataset("LDJnr/ShareGPT-processed", split="train[:5%]")
#sharegpt = load_dataset("lmsys/sharegpt4-dataset", split="train[:5%]")
#inspect_random_entry("lmsys/sharegpt4-dataset", sharegpt)

# Only the sharegpt_gpt4.json file of the repository is loaded, in full.
sharegpt4 = load_dataset(
    "openchat/openchat_sharegpt4_dataset",
    data_files="sharegpt_gpt4.json",
    split="train"
)

inspect_random_entry("openchat/openchat_sharegpt4_dataset/sharegpt_gpt4.json", sharegpt4, force_disp = False)




# --- 2. APPLY FORMATTERS ---
# Each stage maps one raw dataset through its formatter and drops all original columns,
# leaving only the keys the formatter returns. The progress strings use the intended
# emoji (they had been mis-encoded), and the ShareGPT4 stage is no longer announced as "arc".

print("\n🧠 Applying OpenThoughts formatter...")
cot_thoughts_formatted = cot_thoughts.map(format_OpenThoughts, remove_columns=cot_thoughts.column_names)
inspect_formatted_sft("cot_thoughts_formatted", cot_thoughts_formatted)

print("\n🧠 Applying gsm8k formatter...")
gsm8k_formatted = gsm8k.map(format_GSM8K, remove_columns=gsm8k.column_names)
# Longer preview (200 characters) so the step-by-step solution is visible.
inspect_formatted_sft("gsm8k_formatted", gsm8k_formatted, 2, 200)

print("\n🧠 Applying LMSYS winner extraction logic...")
lmsys_conv_formatted = lmsys_conv_raw.map(extract_lmsys_winner, remove_columns=lmsys_conv_raw.column_names)
inspect_formatted_sft("lmsys_conv_formatted", lmsys_conv_formatted)

print("\n🧠 Applying dolly formatter...")
dolly_formatted = dolly.map(format_dolly, remove_columns=dolly.column_names)
inspect_formatted_sft("dolly_formatted", dolly_formatted)

print("\n🧠 Applying ultrachat formatter...")
ultrachat_formatted = ultrachat.map(format_ultrachat, remove_columns=ultrachat.column_names)
inspect_formatted_sft("ultrachat_formatted", ultrachat_formatted)

print("\n🧠 Formatting math formatter...")
math_formatted = math_ds.map(format_math, remove_columns=math_ds.column_names)
inspect_formatted_sft("math_formatted", math_formatted)

print("\n🧠 Formatting alpaca formatter...")
alpaca_formatted = alpaca.map(format_alpaca, remove_columns=alpaca.column_names)
inspect_formatted_sft("alpaca_formatted", alpaca_formatted)

print("\n🧠 Formatting boolq formatter...")
boolq_formatted = boolq.map(format_boolq, remove_columns=boolq.column_names)
inspect_formatted_sft("boolq_formatted", boolq_formatted)

print("\n🧠 Formatting arc formatter...")
arc_formatted = arc.map(format_arc, remove_columns=arc.column_names)
inspect_formatted_sft("arc_formatted", arc_formatted)

print("\n🧠 Formatting sharegpt4 formatter...")
gpt4_formatted = sharegpt4.map(format_gpt4, remove_columns=sharegpt4.column_names)
inspect_formatted_sft("gpt4_formatted", gpt4_formatted, force_disp = False)

# --- 3.  Merge Remaining Datasets ---

# Sources that go into the capped random mix. gsm8k_formatted and math_formatted are
# deliberately absent here: they are concatenated in full after the MAX_SAMPLES cap.
formatted_datasets = [
    cot_thoughts_formatted,
    lmsys_conv_formatted,
    dolly_formatted,
    ultrachat_formatted,
    alpaca_formatted,
    boolq_formatted,
    arc_formatted,
    gpt4_formatted,
]

# Concatenate first, then shuffle with a fixed seed so the sources are interleaved.
mixed = concatenate_datasets(formatted_datasets)
mixed = mixed.shuffle(seed=42)

# --- 4. Quality Filter (Crucial step to remove null/empty entries) ---
print("üóëÔ∏è Applying quality filter...")

def filter_null_or_empty(example):
    """Keep only rows with a usable instruction and response.

    A row passes when both fields are strings, the stripped instruction has at least
    5 characters and the stripped response has at least 2.

    Args:
        example: Row dict with 'instruction' and 'response'.

    Returns:
        bool: True to keep the row, False to drop it.
    """
    fields = (example.get("instruction"), example.get("response"))
    if not all(isinstance(field, str) for field in fields):
        return False

    instruction_text, response_text = fields
    return len(instruction_text.strip()) >= 5 and len(response_text.strip()) >= 2

# Drop null/empty rows from the mixed pool.
mixed = mixed.filter(filter_null_or_empty)


# Upper bound on the number of rows taken from the mixed pool.
MAX_SAMPLES = 38000 

# 1. Shuffle the dataset first (use a seed for reproducibility)
shuffled_dataset = mixed.shuffle(seed=42)

# 2. Then, select the first N samples from the now-shuffled dataset
dataset_random = shuffled_dataset.select(range(min(MAX_SAMPLES, len(shuffled_dataset))))

# Append all of GSM8K and MATH on top of the capped mix. They never went through the
# quality filter above, so apply it here too; every row then meets the same
# non-null / minimum-length guarantee before tokenisation.
combined_dataset = concatenate_datasets([
    dataset_random,
    gsm8k_formatted.filter(filter_null_or_empty),
    math_formatted.filter(filter_null_or_empty),
])

from transformers import AutoTokenizer
import pandas as pd

# Maximum number of tokens allowed for one chat-templated training example.
MAX_SEQ_LEN = 4096

# System prompt placed in front of every example when it is rendered through the chat
# template. The em dashes and the apostrophe are written as real Unicode characters;
# they had been mis-encoded, which would put garbage characters into every rendered
# example and change its token count.
CUSTOM_SYSTEM_PROMPT = """\
You are a highly professional, concise technical expert across modern computing domains — 
including software architecture, cloud infrastructure, data systems, machine learning, and applied AI.

Your task is to:
- Answer the user’s question using the provided CONTEXT as your primary source.
- If the CONTEXT does not contain enough information, use your own knowledge,
  but clearly distinguish between context-based and general reasoning.

Your responses must be:
- Structured — use clear formatting and logical reasoning.
- Contextual — rely only on the information available.
- Concise — eliminate filler words while preserving precision.
- Aligned with industry best practices — modern, reproducible, and standards-based.
"""


from transformers import AutoTokenizer

# Just load the tokenizer - no GPU or model weights needed
# NOTE(review): trust_remote_code=True lets the loader run code shipped with the model
# repository - confirm it is really required for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    "unsloth/gpt-oss-20b-unsloth-bnb-4bit",
    use_fast=True,      # Use Fast tokenizer
    trust_remote_code=True,  # Required for Unsloth custom templates
)


# Define the filtering function
def is_length_within_limit(example):
    """Return True when the chat-templated example fits within ``MAX_SEQ_LEN`` tokens.

    The example is rendered as system + user + assistant messages through the
    module-level ``tokenizer`` chat template, using ``CUSTOM_SYSTEM_PROMPT`` as the
    system message, and the resulting token ids are counted.

    Args:
        example: Row dict with 'instruction' and 'response'. Missing or None values
            are rendered as empty strings.

    Returns:
        bool: True if the token count is <= MAX_SEQ_LEN.
    """
    from collections.abc import Mapping  # local import keeps this block self-contained

    messages = [
        {"role": "system", "content": CUSTOM_SYSTEM_PROMPT},
        # `or ""`: dict.get's default does not cover a key that is present with None.
        {"role": "user", "content": example.get("instruction") or ""},
        {"role": "assistant", "content": example.get("response") or ""},
    ]
    wrapped = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=False,
        tokenize=True,
    )
    # The template may return a plain list of ids or a mapping holding 'input_ids'.
    # Mapping (not dict) also matches UserDict-based encodings, for which len() would
    # otherwise count keys instead of tokens.
    token_ids = wrapped["input_ids"] if isinstance(wrapped, Mapping) else wrapped
    return len(token_ids) <= MAX_SEQ_LEN



# 1. Keep only rows whose chat-templated token count is within MAX_SEQ_LEN.
#    .filter() returns a new Dataset object; combined_dataset itself is left as is.
filtered_dataset = combined_dataset.filter(
    is_length_within_limit,
    num_proc=4 # Use multiple processes to speed up tokenization/filtering
)

# 2. Shuffle the filtered Dataset with a fixed seed.
final_dataset_random = filtered_dataset.shuffle(seed=42)

print("\n‚úÖ Final dataset successfully filtered and shuffled.")


# Prints parts of the  datasets with chat template
def inspect_formatted_sft(ds_name, ds_object, n = 2, force_disp = False):
    """Print the first ``n`` rows rendered through the tokenizer's chat template.

    NOTE: this redefines ``inspect_formatted_sft``. From this point on the name refers
    to this chat-template version, which has no ``disp_len`` parameter.

    Args:
        ds_name: Label for the dataset; accepted for call-site readability, not printed.
        ds_object: Iterable of dict rows with 'instruction' and 'response'.
        n: Number of rows to print.
        force_disp: When True, print even if the global ``INSPECT_ON`` switch is off.

    Returns:
        None. All output goes to stdout.
    """
    if not (force_disp or INSPECT_ON):
        return

    for i, entry in enumerate(ds_object):
        # `>=` so that exactly n rows are shown (`>` printed n + 1).
        if i >= n:
            break
        print(f"--- Entry {i+1} ---")

        # Same system/user/assistant layout that the length filter tokenises.
        messages = [
            {"role": "system", "content": CUSTOM_SYSTEM_PROMPT},
            {"role": "user", "content": entry.get("instruction") or ""},
            {"role": "assistant", "content": entry.get("response") or ""},
        ]
        # tokenize=False returns the rendered text rather than token ids.
        wrapped = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=False,
            tokenize=False,
        )

        print(f"Message with chat template: {wrapped}...") 

        print("-" * 50)
        print("\n")


inspect_formatted_sft("final_dataset_random", final_dataset_random, force_disp = True)


# Analysis: Getting the Source Statistics ---
print("\nüìä Analyzing Sources in Final Dataset...")

# Per-source composition of the final dataset.
sources_list = final_dataset_random['source']
final_dataset_size = len(final_dataset_random)

# value_counts() yields one row per distinct source.
source_counts = pd.Series(sources_list).value_counts()
total_unique_sources = len(source_counts)

# One table with the raw count and its share of the whole dataset, formatted as "12.34%".
summary_df = source_counts.rename('Count').to_frame()
summary_df['Percentage'] = (
    summary_df['Count']
    .div(final_dataset_size)
    .mul(100)
    .map('{:.2f}%'.format)
)

# Emit the report in a single print call; the lines are the same as before.
print("\n".join([
    "-" * 50,
    f"Final Dataset Size: {final_dataset_size} entries",
    f"Total Unique Sources Found: {total_unique_sources}",
    "-" * 50,
    "Count and Percentage of Data Entries per Source:",
    summary_df.to_string(),
    "-" * 50,
]))

# --- 5. Final Output ---
# Save as JSONL for Unsloth.
# Dataset.to_pandas() converts in one step (the analysis cell uses it as well) instead of
# building the frame by iterating the dataset row by row via pd.DataFrame(dataset).
df = final_dataset_random.to_pandas()
# force_ascii=False writes non-ASCII characters as-is rather than as \u escapes.
df.to_json("train_sft_final.jsonl", orient="records", lines=True, force_ascii=False)

print(f"\nüéâ SUCCESS! Final SFT dataset created: train_sft_final.jsonl")
print(f"Total samples after filtering: {len(final_dataset_random)}")

üì• Starting dataset loading process and inspection...

-> Loading and selecting 15% randomly from databricks/databricks-dolly-15k (train)

-> Loading and selecting 1% randomly from HuggingFaceH4/ultrachat_200k (train_sft)

-> Loading and selecting 40% randomly from open-thoughts/open-thoughts-114k (train)

-> Loading and selecting 5% randomly from tatsu-lab/alpaca (train)

-> Loading and selecting 30% randomly from google/boolq (train)

-> Loading and selecting 8% randomly from lmsys/chatbot_arena_conversations (train)

üß† Applying OpenThoughts formatter...

üß† Applying gsm8k formatter...

üß† Applying LMSYS winner extraction logic...

üß† Applying dolly formatter...

üß† Applying ultrachat formatter...

üß† Formatting math formatter...

üß† Formatting alpaca formatter...

üß† Formatting boolq formatter...

üß† Formatting arc formatter...

üß† Formatting arc formatter...
üóëÔ∏è Applying quality filter...

‚úÖ Final dataset successfully filtered and shuffled.
--- Entry 1 

In [14]:


# Alternatively, a cleaner map that keeps only source and tokens
def prepare_data_for_analysis(example, tokenizer):
    """Count the tokens of "instruction + space + response" for one row.

    The count covers the raw text only. It does not include the system prompt or
    chat-template tokens that the length filter measures.

    Args:
        example: Row dict with 'instruction', 'response' and 'source'.
        tokenizer: Callable tokenizer; called with ``truncation=False`` and
            ``return_length=True`` and expected to return a mapping whose 'length'
            entry is indexable.

    Returns:
        dict with 'source' and 'total_tokens'.
    """
    joined_text = " ".join((example['instruction'], example['response']))
    encoding = tokenizer(joined_text, truncation=False, return_length=True)

    summary = {'source': example['source']}
    summary['total_tokens'] = encoding['length'][0]
    return summary

# Replace every row with just its source and raw token count.
analysis_dataset = final_dataset_random.map(
    lambda row: prepare_data_for_analysis(row, tokenizer),
    batched=False,
    num_proc=4,
    remove_columns=final_dataset_random.column_names,
)

# A single pandas conversion serves both reports below.
analysis_df = analysis_dataset.to_pandas()

# Report 1: longest example per source, largest first.
tokens_by_source = analysis_df.groupby('source')['total_tokens']
max_lengths_by_source = tokens_by_source.max().sort_values(ascending=False)

print("Maximum Token Length (Instruction + Response) per Source:")
print("-------------------------------------------------------")
print(max_lengths_by_source)

# Report 2: number of examples per source whose raw token count exceeds MAX_SEQ_LEN.
over_limit = analysis_df['total_tokens'] > MAX_SEQ_LEN
large_entries_df = analysis_df[over_limit]
count_by_source = large_entries_df.groupby('source').size().sort_values(ascending=False)

print(f"Count of Entries with Token Length > {MAX_SEQ_LEN} per Source:")
print(count_by_source)






Maximum Token Length (Instruction + Response) per Source:
-------------------------------------------------------
source
OpenThoughts    3872
sharegpt4       3728
dolly           3337
math            2445
ultrachat       2366
lmsys_conv      1256
boolq            580
alpaca           444
GSM8K            409
arc              235
Name: total_tokens, dtype: int64
Count of Entries with Token Length > 4096 per Source:
Series([], dtype: int64)
