In [17]:
from datasets import load_dataset, Dataset
from collections import defaultdict, deque
from tqdm import tqdm
import random
import re

In [32]:
import torch
from torch.utils.data import Dataset, DataLoader

## Collect Data

In [2]:
ds = load_dataset("nvidia/OpenMathInstruct-1", split='train')

incorrect_solutions/train.jsonl:   0%|          | 0.00/6.42G [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


correct_solutions/validation.jsonl:   0%|          | 0.00/203M [00:00<?, ?B/s]

incorrect_solutions/validation.jsonl:   0%|          | 0.00/981M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [3]:
groups = {'gsm8k': defaultdict(list), 'math': defaultdict(list)}

for i, ex in enumerate(tqdm(ds, desc='iter dataset')):
  dataset_name = ex.get('dataset')
  if dataset_name in ('gsm8k', 'math'):
    q = ex.get('question')
    groups[dataset_name][q].append(i)

iter dataset: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7321344/7321344 [05:02<00:00, 24234.87it/s]


In [4]:
len(groups['gsm8k'])
len(groups['math'])

6500

In [5]:
def get_fair_downsample_subset(q2indices, target, seed=42):
  ran = random.Random(seed)
  questions = list(q2indices.keys())
  ran.shuffle(questions)
  result = []
  q_deques = {}

  for q in questions:
    ls_indices = q2indices[q][:]
    ran.shuffle(ls_indices)
    q_deques[q] = deque(ls_indices)

  q_cycle = deque(questions)

  while q_cycle and len(result) < target:
    q = q_cycle.popleft()
    dq = q_deques[q]
    if dq:
      result.append(dq.popleft())
      if dq:
        q_cycle.append(q)

  return result

In [6]:
def get_any_code_filtering_subset(q2indices, target, seed=42):
  result = []
  for q, indices in tqdm(q2indices.items(), desc='processing any code filtering'):
    code_indices = []
    text_indices = []
    for i in indices:
      em = ds[i].get('error_message')
      code_used = (em != '<not_executed>')
      if code_used:
        code_indices.append(i)
      else:
        text_indices.append(i)
    if code_indices:
      result.extend(code_indices)
    else:
      result.extend(text_indices)

  ran = random.Random(seed)
  ran.shuffle(result)
  return result[:target]

In [7]:
TARGET_GSM8K = 10000
TARGET_MATH = 10000

gsm8k_subset = get_fair_downsample_subset(groups['gsm8k'], TARGET_GSM8K)
math_subset = get_any_code_filtering_subset(groups['math'], TARGET_MATH)

processing any code filtering: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6500/6500 [04:53<00:00, 22.16it/s]


In [8]:
ds_indices = gsm8k_subset + math_subset

In [9]:
count = 0
for sam in tqdm(ds, desc='iter dataset'):
  if sam.get('is_correct') == 'false':
    count += 1
print(count, len(ds))


iter dataset: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7321344/7321344 [04:35<00:00, 26581.38it/s]

0 7321344





In [19]:
def save_subset(ds, ds_indices):
  subset_data = [ds[i] for i in ds_indices]
  subset_data = [{"question": ex["question"], "generated_solution": ex["generated_solution"]} for ex in subset_data]

  subset_ds = Dataset.from_list(subset_data)
  subset_ds.save_to_disk("data/subset_openmathinstruct_1")
  return subset_ds

In [20]:
subset_ds = save_subset(ds, ds_indices)

Saving the dataset (0/1 shards):   0%|          | 0/20000 [00:00<?, ? examples/s]

## Process data

In [28]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B")

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [29]:
class DataPreprocessing:
    '''Tiền xử lý dữ liệu gồm làm sạch dữ liệu, tách dữ liệu ra train/dev/test, và tokenize dữ liệu'''
    
    def __init__(self, dataset, tokenizer, train_ratio=0.8, dev_ratio=0.1):
        '''
        Khởi tạo với dataset và tokenizer
        Args:
            dataset: Dataset Huggingface
            tokenizer: Tokenizer từ transformers
            train_ratio: Tỉ lệ tập train (mặc định 0.8)
            dev_ratio: Tỉ lệ tập dev (mặc định 0.1)
        '''
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.train_ratio = train_ratio
        self.dev_ratio = dev_ratio
        
    def clean_text(self, text):
        if not isinstance(text, str):
            return ""
        
        text = text.strip()
        text = re.sub(r'[ \t]+', ' ', text)  # Chỉ xóa space/tab thừa
        text = re.sub(r'\n{3,}', '\n\n', text)  # Giữ tối đa 2 newlines
        return text
    
    def prepare_input(self, question, solution):
        '''
        Chuẩn bị input cho mô hình
        Args:
            question: Câu hỏi
            solution: Lời giải
        Returns:
            string đã được format theo mẫu cho mô hình
        '''
        question = self.clean_text(question)
        solution = self.clean_text(solution)
        return f"### Question:\n{question}\n\n### Solution:\n{solution}"
    
    def tokenize_data(self, text, max_length=1024):
        '''
        Tokenize dữ liệu text
        Args:
            text: string cần tokenize
            max_length: độ dài tối đa sau khi tokenize
        Returns:
            dict chứa input_ids và attention_mask
        '''
        return self.tokenizer(
            text,
            truncation=True,
            max_length=max_length,
            padding='max_length',
            return_tensors='pt'
        )
    
    def split_data(self):
        '''
        Chia dữ liệu thành các tập train/dev/test
        Returns:
            tuple (train_data, dev_data, test_data)
        '''
        total_size = len(self.dataset)
        indices = list(range(total_size))
        random.shuffle(indices)
        
        train_size = int(total_size * self.train_ratio)
        dev_size = int(total_size * self.dev_ratio)
        
        train_indices = indices[:train_size]
        dev_indices = indices[train_size:train_size + dev_size]
        test_indices = indices[train_size + dev_size:]
        
        return (
            self.dataset.select(train_indices),
            self.dataset.select(dev_indices),
            self.dataset.select(test_indices)
        )
    
    def process_example(self, example, max_length=1024):
        '''
        Xử lý một mẫu dữ liệu
        Args:
            example: Một mẫu từ dataset
            max_length: độ dài tối đa cho tokenize
        Returns:
            dict chứa input_ids, attention_mask và labels
        '''
        text = self.prepare_input(
            example['question'],
            example['generated_solution']
        )
        tokenized = self.tokenize_data(text, max_length)
        
        # Thêm labels cho causal LM
        result = {
            'input_ids': tokenized['input_ids'].squeeze(),
            'attention_mask': tokenized['attention_mask'].squeeze(),
            'labels': tokenized['input_ids'].squeeze().clone()
        }
        return result
    
    def preprocess(self, max_length=1024):
        '''
        Tiền xử lý toàn bộ dataset
        Args:
            max_length: độ dài tối đa cho tokenize
        Returns:
            tuple (train_data, dev_data, test_data) đã được tiền xử lý
        '''
        # Chia dữ liệu
        train_data, dev_data, test_data = self.split_data()
        
        # Áp dụng xử lý cho từng tập
        train_processed = train_data.map(lambda x: self.process_example(x, max_length))
        dev_processed = dev_data.map(lambda x: self.process_example(x, max_length))
        test_processed = test_data.map(lambda x: self.process_example(x, max_length))
        
        return train_processed, dev_processed, test_processed

In [30]:
dp = DataPreprocessing(dataset=subset_ds, tokenizer=tokenizer, train_ratio=0.8, dev_ratio=0.1)
train_processed, dev_processed, test_processed = dp.preprocess()

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [31]:
# Check the dataset and processed results
print("Dataset size:", len(subset_ds))
print("\nExample from original dataset:")
print(subset_ds[0])

# Try processing a single example to debug
example = dp.process_example(subset_ds[0])
print("\nProcessed example:")
for key, value in example.items():
    print(f"\n{key} shape:", value.shape)

Dataset size: 20000

Example from original dataset:
{'question': 'Aunt Gemma has four dogs. She feeds them with dog food twice a day. Each dog can consume 250 grams of food per meal. If aunt Gemma bought 2 sacks of dog food, each weighing 50 kilograms, how many days will those food last?', 'generated_solution': 'Let\'s solve this problem using Python\'s sympy library.\n<llm-code>\nimport sympy as sp\n\n# let\'s denote the number of days in which the food lasts\n# since the "number of days" cannot be a symbol we use q instead\nq = sp.symbols(\'q\')\n\n# total number of grams of food per day\n# a dog consumes 250 grams each day (per meal x 2)\n# there are 4 dogs\nfood_per_day = 250 * 4\n# there are q days\nfood_total = food_per_day * q\n\n# the food is 50 kilograms each in 2 sacks\n# 1 kilogram is 1000 grams\n# 50 kilograms is 50000 grams\ntotal_kg = 50000 * 2\n# expressed in grams\ntotal_g = 50000 * 2 * 1000\n\n# total grams of food per day is what the dogs ate\neq = sp.Eq(food_total, t

## Prepare Data

In [33]:
class CustomDataset(Dataset):
    def __init__(self, hf_dataset):
        self.input_ids = [torch.tensor(x['input_ids']) for x in hf_dataset]
        self.attention_mask = [torch.tensor(x['attention_mask']) for x in hf_dataset]
        self.labels = [torch.tensor(x['labels']) for x in hf_dataset]

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.labels[idx]
        }

In [34]:
train_dataset = CustomDataset(train_processed)
dev_dataset = CustomDataset(dev_processed)

In [36]:
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=128)

## Supervised Fine-tuned

In [41]:
from huggingface_hub import login
login('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')

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model

# 1. Load base model & tokenizer
base_model = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(base_model)
model = AutoModelForCausalLM.from_pretrained(base_model)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# 2. Add LoRA 
lora_config = LoraConfig(r=8, lora_alpha=16, target_modules=["q_proj", "v_proj"], lora_dropout=0.1)
model = get_peft_model(model, lora_config)


In [49]:
%pip install 'accelerate>=0.26.0'

training_args = TrainingArguments(
    output_dir="./math_tutor_model",
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    warmup_steps=500,
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_processed,
    eval_dataset=dev_processed,
    tokenizer=tokenizer,
)

trainer.train()

Note: you may need to restart the kernel to use updated packages.


ERROR: Invalid requirement: "'accelerate": Expected package name at the start of dependency specifier
    'accelerate
    ^


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>=0.26.0'`