In [31]:
from datasets import load_dataset, Dataset
from collections import defaultdict, deque
from tqdm import tqdm
import random
import re

## Collect Data

In [3]:
ds = load_dataset("nvidia/OpenMathInstruct-1", split='train')

incorrect_solutions/train.jsonl:   0%|          | 0.00/6.42G [00:00<?, ?B/s]

correct_solutions/validation.jsonl:   0%|          | 0.00/203M [00:00<?, ?B/s]

incorrect_solutions/validation.jsonl:   0%|          | 0.00/981M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [4]:
groups = {'gsm8k': defaultdict(list), 'math': defaultdict(list)}

for i, ex in enumerate(tqdm(ds, desc='iter dataset')):
  dataset_name = ex.get('dataset')
  if dataset_name in ('gsm8k', 'math'):
    q = ex.get('question')
    groups[dataset_name][q].append(i)

iter dataset: 100%|██████████| 7321344/7321344 [02:39<00:00, 45967.57it/s]


In [7]:
def get_fair_downsample_subset(q2indices, target, seed=42):
  ran = random.Random(seed)
  questions = list(q2indices.keys())
  ran.shuffle(questions)
  result = []
  q_deques = {}

  for q in questions:
    ls_indices = q2indices[q][:]
    ran.shuffle(ls_indices)
    q_deques[q] = deque(ls_indices)

  q_cycle = deque(questions)

  while q_cycle and len(result) < target:
    q = q_cycle.popleft()
    dq = q_deques[q]
    if dq:
      result.append(dq.popleft())
      if dq:
        q_cycle.append(q)

  return result

In [8]:
def get_any_code_filtering_subset(q2indices, target, seed=42):
  result = []
  for q, indices in tqdm(q2indices.items(), desc='processing any code filtering'):
    code_indices = []
    text_indices = []
    for i in indices:
      em = ds[i].get('error_message')
      code_used = (em != '<not_executed>')
      if code_used:
        code_indices.append(i)
      else:
        text_indices.append(i)
    if code_indices:
      result.extend(code_indices)
    else:
      result.extend(text_indices)

  ran = random.Random(seed)
  ran.shuffle(result)
  return result[:target]

In [9]:
TARGET_GSM8K = 256000
TARGET_MATH = 256000

gsm8k_subset = get_fair_downsample_subset(groups['gsm8k'], TARGET_GSM8K)
math_subset = get_any_code_filtering_subset(groups['math'], TARGET_MATH)

processing any code filtering: 100%|██████████| 6500/6500 [03:02<00:00, 35.60it/s]


In [10]:
ds_indices = gsm8k_subset + math_subset

In [27]:
ds[ds_indices[0]].get('is_correct') == True

False

In [28]:
correct_count = 0
incorrect_count = 0
for sam in tqdm(ds, desc='iter dataset'):
  if sam.get('is_correct') == False:
    incorrect_count += 1
  else:
    correct_count += 1
print(f"Correct: {correct_count}, Incorrect: {incorrect_count}")


iter dataset: 100%|██████████| 7321344/7321344 [02:40<00:00, 45623.12it/s]

Correct: 1579780, Incorrect: 5741564





In [32]:
def save_subset(ds, ds_indices):
  subset_data = [ds[i] for i in ds_indices]
  subset_data = [{"question": ex["question"], "generated_solution": ex["generated_solution"]} for ex in subset_data]

  subset_ds = Dataset.from_list(subset_data)
  subset_ds.save_to_disk("data/subset_openmathinstruct_1/256K")
  return subset_ds

In [33]:
subset_ds = save_subset(ds, ds_indices)

Saving the dataset (0/1 shards):   0%|          | 0/512000 [00:00<?, ? examples/s]

## Process data

In [34]:
import torch
from torch.utils.data import Dataset as Dataset, DataLoader

In [36]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B")

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

In [37]:
class DataPreprocessing:
    '''Tiền xử lý dữ liệu gồm làm sạch dữ liệu, tách dữ liệu ra train/dev/test, và tokenize dữ liệu'''
    
    def __init__(self, dataset, tokenizer, train_ratio=0.8, dev_ratio=0.1):
        '''
        Khởi tạo với dataset và tokenizer
        Args:
            dataset: Dataset Huggingface
            tokenizer: Tokenizer từ transformers
            train_ratio: Tỉ lệ tập train (mặc định 0.8)
            dev_ratio: Tỉ lệ tập dev (mặc định 0.1)
        '''
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.train_ratio = train_ratio
        self.dev_ratio = dev_ratio
        
    def clean_text(self, text):
        if not isinstance(text, str):
            return ""
        
        text = text.strip()
        text = re.sub(r'[ \t]+', ' ', text)  # Chỉ xóa space/tab thừa
        text = re.sub(r'\n{3,}', '\n\n', text)  # Giữ tối đa 2 newlines
        return text
    
    def prepare_input(self, question, solution):
        '''
        Chuẩn bị input cho mô hình
        Args:
            question: Câu hỏi
            solution: Lời giải
        Returns:
            string đã được format theo mẫu cho mô hình
        '''
        question = self.clean_text(question)
        solution = self.clean_text(solution)
        return f"### Question:\n{question}\n\n### Solution:\n{solution}"
    
    def tokenize_data(self, text, max_length=1024):
        '''
        Tokenize dữ liệu text
        Args:
            text: string cần tokenize
            max_length: độ dài tối đa sau khi tokenize
        Returns:
            dict chứa input_ids và attention_mask
        '''
        return self.tokenizer(
            text,
            truncation=True,
            max_length=max_length,
            padding='max_length',
            return_tensors='pt'
        )
    
    def split_data(self):
        '''
        Chia dữ liệu thành các tập train/dev/test
        Returns:
            tuple (train_data, dev_data, test_data)
        '''
        total_size = len(self.dataset)
        indices = list(range(total_size))
        random.shuffle(indices)
        
        train_size = int(total_size * self.train_ratio)
        dev_size = int(total_size * self.dev_ratio)
        
        train_indices = indices[:train_size]
        dev_indices = indices[train_size:train_size + dev_size]
        test_indices = indices[train_size + dev_size:]
        
        return (
            self.dataset.select(train_indices),
            self.dataset.select(dev_indices),
            self.dataset.select(test_indices)
        )
    
    def process_example(self, example, max_length=1024):
        '''
        Xử lý một mẫu dữ liệu
        Args:
            example: Một mẫu từ dataset
            max_length: độ dài tối đa cho tokenize
        Returns:
            dict chứa input_ids, attention_mask và labels
        '''
        text = self.prepare_input(
            example['question'],
            example['generated_solution']
        )
        tokenized = self.tokenize_data(text, max_length)
        
        # Thêm labels cho causal LM
        result = {
            'input_ids': tokenized['input_ids'].squeeze(),
            'attention_mask': tokenized['attention_mask'].squeeze(),
            'labels': tokenized['input_ids'].squeeze().clone()
        }
        return result
    
    def preprocess(self, max_length=1024):
        '''
        Tiền xử lý toàn bộ dataset
        Args:
            max_length: độ dài tối đa cho tokenize
        Returns:
            tuple (train_data, dev_data, test_data) đã được tiền xử lý
        '''
        # Chia dữ liệu
        train_data, dev_data, test_data = self.split_data()
        
        # Áp dụng xử lý cho từng tập
        train_processed = train_data.map(lambda x: self.process_example(x, max_length))
        dev_processed = dev_data.map(lambda x: self.process_example(x, max_length))
        test_processed = test_data.map(lambda x: self.process_example(x, max_length))
        
        return train_processed, dev_processed, test_processed

In [38]:
dp = DataPreprocessing(dataset=subset_ds, tokenizer=tokenizer, train_ratio=0.8, dev_ratio=0.1)
train_processed, dev_processed, test_processed = dp.preprocess()

Map:   0%|          | 0/409600 [00:00<?, ? examples/s]

Map:   0%|          | 0/51200 [00:00<?, ? examples/s]

Map:   0%|          | 0/51200 [00:00<?, ? examples/s]

In [39]:
# Check the dataset and processed results
print("Dataset size:", len(subset_ds))
print("\nExample from original dataset:")
print(subset_ds[0])

# Try processing a single example to debug
example = dp.process_example(subset_ds[0])
print("\nProcessed example:")
for key, value in example.items():
    print(f"\n{key} shape:", value.shape)

Dataset size: 512000

Example from original dataset:
{'question': 'Aunt Gemma has four dogs. She feeds them with dog food twice a day. Each dog can consume 250 grams of food per meal. If aunt Gemma bought 2 sacks of dog food, each weighing 50 kilograms, how many days will those food last?', 'generated_solution': 'Let\'s solve this problem using Python\'s sympy library.\n<llm-code>\nimport sympy as sp\n\n# let\'s denote the number of days in which the food lasts\n# since the "number of days" cannot be a symbol we use q instead\nq = sp.symbols(\'q\')\n\n# total number of grams of food per day\n# a dog consumes 250 grams each day (per meal x 2)\n# there are 4 dogs\nfood_per_day = 250 * 4\n# there are q days\nfood_total = food_per_day * q\n\n# the food is 50 kilograms each in 2 sacks\n# 1 kilogram is 1000 grams\n# 50 kilograms is 50000 grams\ntotal_kg = 50000 * 2\n# expressed in grams\ntotal_g = 50000 * 2 * 1000\n\n# total grams of food per day is what the dogs ate\neq = sp.Eq(food_total, 

In [40]:
train_processed.save_to_disk("../data/processed_data/256K/train/")
dev_processed.save_to_disk("../data/processed_data/256K/dev/")
test_processed.save_to_disk("data/processed_data/256K/test/")

Saving the dataset (0/12 shards):   0%|          | 0/409600 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/51200 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/51200 [00:00<?, ? examples/s]

## Prepare Data

In [1]:
# load processed datasets
from datasets import Dataset
data_path = '../data/processed_data/256K/'
train_processed = Dataset.load_from_disk(data_path + 'train/')
dev_processed = Dataset.load_from_disk(data_path + 'dev/')
test_processed = Dataset.load_from_disk(data_path + 'test/')

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# import torch
# from torch.utils.data import Dataset, DataLoader

In [None]:
# class CustomDataset(Dataset):
#     def __init__(self, hf_dataset):
#         self.input_ids = [torch.tensor(x['input_ids']) for x in hf_dataset]
#         self.attention_mask = [torch.tensor(x['attention_mask']) for x in hf_dataset]
#         self.labels = [torch.tensor(x['labels']) for x in hf_dataset]

#     def __len__(self):
#         return len(self.input_ids)

#     def __getitem__(self, idx):
#         return {
#             'input_ids': self.input_ids[idx],
#             'attention_mask': self.attention_mask[idx],
#             'labels': self.labels[idx]
#         }

In [None]:
# train_dataset = CustomDataset(train_processed)
# dev_dataset = CustomDataset(dev_processed)

KeyboardInterrupt: 

: 

## Supervised Fine-tuned

In [2]:
from huggingface_hub import login
login('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')

In [3]:
train_processed.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
dev_processed.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model

# 1. Load base model & tokenizer
base_model = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(base_model)
model = AutoModelForCausalLM.from_pretrained(base_model)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# 2. Add LoRA 
lora_config = LoraConfig(r=8, lora_alpha=16, target_modules=["q_proj", "v_proj"], lora_dropout=0.1)
model = get_peft_model(model, lora_config)


In [8]:
# %pip install 'accelerate>=0.26.0'
import gc
import torch
training_args = TrainingArguments(
    output_dir="./math_tutor_model",
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    warmup_steps=500,
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_processed,
    eval_dataset=dev_processed,
    tokenizer=tokenizer,
)

torch.cuda.empty_cache()
gc.collect()

trainer.train()

  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 128001}.


OutOfMemoryError: CUDA out of memory. Tried to allocate 64.00 MiB. GPU 0 has a total capacity of 23.52 GiB of which 121.06 MiB is free. Process 3666247 has 18.09 GiB memory in use. Including non-PyTorch memory, this process has 5.15 GiB memory in use. Of the allocated memory 4.69 GiB is allocated by PyTorch, and 17.67 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)