In [1]:
# Run configuration: everything a reader is expected to tune lives here.

# Identifier of the source corpus passed to `load_dataset`.
dataset_name = "omarkamali/wikipedia-monthly"
# Identifier passed to `AutoTokenizer.from_pretrained`; the model config is
# read from the same identifier.
tokenizer_name = "mistralai/Mistral-7B-Instruct-v0.3"
model_config_name = tokenizer_name

# Number of training rows to keep. A value <= 0 disables subsetting, in which
# case the evaluation split is expressed as a fraction instead of a row count.
train_dataset_size = 1280000
if train_dataset_size > 0:
    # Absolute evaluation row count: 10% of the requested training rows.
    # int() truncates to 0 for train_dataset_size < 10, and an integer
    # test_size of 0 is rejected by `train_test_split`, so keep at least 1 row.
    eval_dataset_size = max(1, int(train_dataset_size * 0.1))
else:
    # A float test_size is interpreted as a proportion of the dataset.
    eval_dataset_size = 0.1

In [2]:
from datasets import load_dataset

# Fetch the English configuration of the latest Wikipedia dump; downloaded
# files are cached under ./cache.
wikipedia = load_dataset(dataset_name, "latest.en", cache_dir="./cache")

# Hold out an evaluation split. The shuffle is seeded so the same rows land
# in the same split on every run against the same dump.
dataset = wikipedia["train"].train_test_split(
    test_size=eval_dataset_size, shuffle=True, seed=42
)

# Cap the training split at the configured size. Nothing happens when the cap
# is disabled (<= 0) or the split is already small enough.
needs_subset = (
    train_dataset_size > 0 and dataset["train"].num_rows > train_dataset_size
)
if needs_subset:
    dataset["train"] = dataset["train"].select(range(train_dataset_size))

# Display the resulting splits as the cell output.
dataset

Resolving data files:   0%|          | 0/17 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/18 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'url', 'title', 'raw_mediawiki', 'text'],
        num_rows: 1280000
    })
    test: Dataset({
        features: ['id', 'url', 'title', 'raw_mediawiki', 'text'],
        num_rows: 128000
    })
})

In [3]:
from transformers import AutoTokenizer, MistralConfig, MistralForCausalLM

tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
# Reuse the end-of-sequence token for padding.
# NOTE(review): presumably this tokenizer ships without a pad token — confirm.
tokenizer.pad_token = tokenizer.eos_token

config = MistralConfig.from_pretrained(model_config_name)

# Dummy model: shrink the pretrained configuration to a minimal size.
# NOTE(review): no training call is visible in this notebook section, so the
# model presumably exists only so the trainer can be constructed — confirm.
# The overrides are applied in the listed order.
dummy_overrides = {
    "hidden_size": 16,
    "intermediate_size": 16,
    "num_hidden_layers": 1,
    "num_attention_heads": 2,
    "num_key_value_heads": 2,
    "_attn_implementation": "flash_attention_2",
}
for option, value in dummy_overrides.items():
    setattr(config, option, value)

model = MistralForCausalLM(config)

In [4]:
from trl import SFTTrainer, SFTConfig

# Extra dataset options forwarded through SFTConfig. In the recorded run this
# flag coincides with the "Preprocessed dataset saved to ..." message.
# NOTE(review): `save_preprocessed_dataset` does not look like a documented
# upstream TRL option — presumably handled by a patched trainer; confirm.
preprocessing_options = {"save_preprocessed_dataset": True}

training_args = SFTConfig(
    packing=True,  # the recorded output shows a "Packing ... dataset" stage
    max_length=32768,
    report_to="none",
    dataset_num_proc=4,  # matches the "(num_proc=4)" shown in the progress bars
    eos_token=tokenizer.eos_token,
    pad_token=tokenizer.pad_token,
    dataset_kwargs=preprocessing_options,
)

# Per the recorded output, constructing the trainer already runs the dataset
# preparation (EOS insertion, tokenisation, packing) for both splits.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    processing_class=tokenizer,
)

Adding EOS to train dataset (num_proc=4):   0%|          | 0/1280000 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=4):   0%|          | 0/1280000 [00:00<?, ? examples/s]

Packing train dataset (num_proc=4):   0%|          | 0/1280000 [00:00<?, ? examples/s]

Adding EOS to eval dataset (num_proc=4):   0%|          | 0/128000 [00:00<?, ? examples/s]

Tokenizing eval dataset (num_proc=4):   0%|          | 0/128000 [00:00<?, ? examples/s]

Packing eval dataset (num_proc=4):   0%|          | 0/128000 [00:00<?, ? examples/s]

Saving the dataset (0/19 shards):   0%|          | 0/69668 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/7001 [00:00<?, ? examples/s]

Preprocessed dataset saved to preprocessed_dataset_69668_7001


In [5]:
from datasets import load_from_disk

# Directory of the preprocessed dataset, as named in the
# "Preprocessed dataset saved to ..." message of the recorded run.
# NOTE(review): this rebinds `dataset_name` (previously the source corpus
# identifier) and `dataset` (previously the raw text splits), so the loading
# cell cannot be re-run after this one without re-running the config cell.
# The row counts embedded in the path come from one specific run and will
# presumably differ for another dump or configuration — confirm before reuse.
dataset_name = "./preprocessed_dataset_69668_7001"
dataset = load_from_disk(dataset_path=dataset_name)

# Display the reloaded splits as the cell output.
dataset

Loading dataset from disk:   0%|          | 0/19 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'seq_lengths'],
        num_rows: 69668
    })
    test: Dataset({
        features: ['input_ids', 'seq_lengths'],
        num_rows: 7001
    })
})