In [5]:
# Root of the SkipLM checkout; dataset paths in later cells are joined onto it.
# Set the SKIPLM_PROJECT_PATH environment variable to run the notebook on a
# machine where the checkout lives elsewhere; the original location is the default.
import os

PROJECT_PATH = os.environ.get("SKIPLM_PROJECT_PATH", "/home/lixy/workspace/SkipLM")

In [6]:
# Load dataset and tokenizer
import os
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM


# Module-level tokenizer; print_subset_stats reads this global to count tokens.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-72B-Instruct")
# Leftover experiment, kept disabled:
# dataset = load_dataset("data/opc-sft-stage2/evol_instruct", split="train")


def print_subset_stats(path: str, subset: str, num_proc: int = 128):
    """Print mean and 99th-percentile token counts for one dataset subset.

    Loads the ``train`` split located at ``os.path.join(path, subset)``,
    tokenizes the ``instruction`` and ``output`` fields of every example with
    the module-level ``tokenizer``, and prints four lines (mean and p99 for each
    field), each prefixed with ``[subset]``.

    Args:
        path: Directory that contains the subset directories.
        subset: Name of the subset directory under ``path``; also used as the
            label in the printed lines.
        num_proc: Number of worker processes handed to ``Dataset.map``.
            Defaults to 128, the value that used to be hard-coded; pass a
            smaller number on machines with fewer cores.

    Returns:
        None. The statistics are only printed.
    """
    # Local import kept on purpose: numpy is not part of the import cell
    # visible in this notebook.
    import numpy as np

    def count_tokens(text: str) -> int:
        # Length of the token-id sequence produced by the global tokenizer.
        return len(tokenizer(text).input_ids)

    dataset = load_dataset(os.path.join(path, subset), split="train")

    # Adds two integer columns holding the per-example token counts.
    dataset = dataset.map(
        lambda example: {
            "num_instruction_tokens": count_tokens(example["instruction"]),
            "num_output_tokens": count_tokens(example["output"]),
        },
        num_proc=num_proc,
    )

    instruction_tokens = dataset["num_instruction_tokens"]
    output_tokens = dataset["num_output_tokens"]

    print(f"[{subset}] mean tokens of instruction: {np.mean(instruction_tokens)}")
    print(f"[{subset}] mean tokens of output: {np.mean(output_tokens)}")
    print(f"[{subset}] p99 tokens of instruction: {np.percentile(instruction_tokens, 99)}")
    print(f"[{subset}] p99 tokens of output: {np.percentile(output_tokens, 99)}")

In [8]:
# Token statistics for the "educational_instruct" subset.
print_subset_stats(
    subset="educational_instruct",
    path=os.path.join(PROJECT_PATH, "data/opc-sft-stage2"),
)

Map (num_proc=128):   0%|                                                                                     …

[educational_instruct] mean tokens of instruction: 34.69028052554152
[educational_instruct] mean tokens of output: 101.86909653528129
[educational_instruct] p99 tokens of instruction: 129.22999999999593
[educational_instruct] p99 tokens of output: 342.0


In [9]:
# Token statistics for the "evol_instruct" subset.
print_subset_stats(
    subset="evol_instruct",
    path=os.path.join(PROJECT_PATH, "data/opc-sft-stage2"),
)

Map (num_proc=128):   0%|                                                                                     …

[evol_instruct] mean tokens of instruction: 184.62217245442199
[evol_instruct] mean tokens of output: 370.5421242456131
[evol_instruct] p99 tokens of instruction: 2255.179999999993
[evol_instruct] p99 tokens of output: 983.179999999993


In [10]:
# Token statistics for the "mceval_instruct" subset.
print_subset_stats(
    subset="mceval_instruct",
    path=os.path.join(PROJECT_PATH, "data/opc-sft-stage2"),
)

Map (num_proc=128):   0%|                                                                                     …

[mceval_instruct] mean tokens of instruction: 227.30181120106835
[mceval_instruct] mean tokens of output: 637.8425006259912
[mceval_instruct] p99 tokens of instruction: 449.0
[mceval_instruct] p99 tokens of output: 1374.0


In [11]:
# Token statistics for the "package_instruct" subset.
print_subset_stats(
    subset="package_instruct",
    path=os.path.join(PROJECT_PATH, "data/opc-sft-stage2"),
)

Map (num_proc=128):   0%|                                                                                     …

[package_instruct] mean tokens of instruction: 368.1339393833032
[package_instruct] mean tokens of output: 483.42384888530097
[package_instruct] p99 tokens of instruction: 723.0
[package_instruct] p99 tokens of output: 888.0
