In [31]:
from datasets import load_dataset, Dataset
from collections import defaultdict, deque
from tqdm import tqdm
import random
import re

## Collect Data

In [3]:
ds = load_dataset("nvidia/OpenMathInstruct-1", split='train')

incorrect_solutions/train.jsonl:   0%|          | 0.00/6.42G [00:00<?, ?B/s]

correct_solutions/validation.jsonl:   0%|          | 0.00/203M [00:00<?, ?B/s]

incorrect_solutions/validation.jsonl:   0%|          | 0.00/981M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [4]:
groups = {'gsm8k': defaultdict(list), 'math': defaultdict(list)}

for i, ex in enumerate(tqdm(ds, desc='iter dataset')):
  dataset_name = ex.get('dataset')
  if dataset_name in ('gsm8k', 'math'):
    q = ex.get('question')
    groups[dataset_name][q].append(i)

iter dataset: 100%|██████████| 7321344/7321344 [02:39<00:00, 45967.57it/s]


In [7]:
def get_fair_downsample_subset(q2indices, target, seed=42):
  ran = random.Random(seed)
  questions = list(q2indices.keys())
  ran.shuffle(questions)
  result = []
  q_deques = {}

  for q in questions:
    ls_indices = q2indices[q][:]
    ran.shuffle(ls_indices)
    q_deques[q] = deque(ls_indices)

  q_cycle = deque(questions)

  while q_cycle and len(result) < target:
    q = q_cycle.popleft()
    dq = q_deques[q]
    if dq:
      result.append(dq.popleft())
      if dq:
        q_cycle.append(q)

  return result

In [8]:
def get_any_code_filtering_subset(q2indices, target, seed=42):
  result = []
  for q, indices in tqdm(q2indices.items(), desc='processing any code filtering'):
    code_indices = []
    text_indices = []
    for i in indices:
      em = ds[i].get('error_message')
      code_used = (em != '<not_executed>')
      if code_used:
        code_indices.append(i)
      else:
        text_indices.append(i)
    if code_indices:
      result.extend(code_indices)
    else:
      result.extend(text_indices)

  ran = random.Random(seed)
  ran.shuffle(result)
  return result[:target]

In [9]:
TARGET_GSM8K = 256000
TARGET_MATH = 256000

gsm8k_subset = get_fair_downsample_subset(groups['gsm8k'], TARGET_GSM8K)
math_subset = get_any_code_filtering_subset(groups['math'], TARGET_MATH)

processing any code filtering: 100%|██████████| 6500/6500 [03:02<00:00, 35.60it/s]


In [10]:
ds_indices = gsm8k_subset + math_subset

In [27]:
ds[ds_indices[0]].get('is_correct') == True

False

In [28]:
correct_count = 0
incorrect_count = 0
for sam in tqdm(ds, desc='iter dataset'):
  if sam.get('is_correct') == False:
    incorrect_count += 1
  else:
    correct_count += 1
print(f"Correct: {correct_count}, Incorrect: {incorrect_count}")


iter dataset: 100%|██████████| 7321344/7321344 [02:40<00:00, 45623.12it/s]

Correct: 1579780, Incorrect: 5741564





In [32]:
def save_subset(ds, ds_indices):
  subset_data = [ds[i] for i in ds_indices]
  subset_data = [{"question": ex["question"], "generated_solution": ex["generated_solution"]} for ex in subset_data]

  subset_ds = Dataset.from_list(subset_data)
  subset_ds.save_to_disk("data/subset_openmathinstruct_1/256K")
  return subset_ds

In [33]:
subset_ds = save_subset(ds, ds_indices)

Saving the dataset (0/1 shards):   0%|          | 0/512000 [00:00<?, ? examples/s]

## Process data

In [1]:
from datasets import Dataset
subset_ds = Dataset.load_from_disk("../data/subset_openmathinstruct_1/256K")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [3]:
from process_data import DataPreprocessing

dp = DataPreprocessing(dataset=subset_ds, tokenizer=tokenizer, train_ratio=0.8, dev_ratio=0.1)
train_processed, dev_processed, test_processed = dp.preprocess()

Map: 100%|██████████| 409600/409600 [02:04<00:00, 3294.44 examples/s]
Map: 100%|██████████| 51200/51200 [00:15<00:00, 3299.27 examples/s]
Map: 100%|██████████| 51200/51200 [00:15<00:00, 3316.78 examples/s]


In [4]:
train_processed.save_to_disk("../data/processed_data/256K/train/")
dev_processed.save_to_disk("../data/processed_data/256K/dev/")
test_processed.save_to_disk("../data/processed_data/256K/test/")

Saving the dataset (4/4 shards): 100%|██████████| 409600/409600 [00:00<00:00, 744461.10 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 51200/51200 [00:00<00:00, 719164.81 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 51200/51200 [00:00<00:00, 745090.05 examples/s]


In [5]:
data_path = 'data/processed_data/256K/'
train_processed = Dataset.load_from_disk(data_path + 'train/')

# Lấy 1 sample để kiểm tra
sample = train_processed[0]
print(sample)

FileNotFoundError: No such files: '/home/guest/AdvancedLLMReasoning/data_utils/data/processed_data/256K/train/dataset_info.json', nor '/home/guest/AdvancedLLMReasoning/data_utils/data/processed_data/256K/train/state.json' found. Expected to load a `Dataset` object but provided path is not a `Dataset`.

## Prepare Data

In [1]:
# load processed datasets
from datasets import Dataset
data_path = '../data/processed_data/256K/'
train_processed = Dataset.load_from_disk(data_path + 'train/')
dev_processed = Dataset.load_from_disk(data_path + 'dev/')
test_processed = Dataset.load_from_disk(data_path + 'test/')

  from .autonotebook import tqdm as notebook_tqdm


## Supervised Fine-tuned

In [2]:
from huggingface_hub import login
login('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')

In [3]:
train_processed.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
dev_processed.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# train_small = train_processed.select(range(1))
# dev_small = dev_processed.select(range(1))

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import torch

# Cấu hình 4-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

base_model = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(base_model)

# Load model với quantization
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.pad_token_id

# Chuẩn bị model cho training
model = prepare_model_for_kbit_training(model)

# Add LoRA
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)


In [5]:
# %pip install 'accelerate>=0.26.0'
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./math_tutor_model",
    num_train_epochs=3,
    
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    warmup_steps=500,
    fp16=True,
    
    dataloader_num_workers=4,
    dataloader_pin_memory=True,
    
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 8,
    gradient_accumulation_steps = 1,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_processed,
    eval_dataset=dev_processed,
    tokenizer=tokenizer,
)

trainer.train()

  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 128001}.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss


KeyboardInterrupt: 