In [1]:
!git clone https://github.com/ngthvinhrai/MathViT.git

Cloning into 'MathViT'...
remote: Enumerating objects: 35, done.[K
remote: Counting objects: 100% (35/35), done.[K
remote: Compressing objects: 100% (15/15), done.[K
remote: Total 35 (delta 14), reused 34 (delta 13), pack-reused 0 (from 0)[K
Receiving objects: 100% (35/35), 4.47 MiB | 7.72 MiB/s, done.
Resolving deltas: 100% (14/14), done.


##PARTICULAR

In [2]:
import json
import torch
import gc
from torch.utils.data import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)
from torch.optim import AdamW
from sklearn.model_selection import train_test_split

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
MODEL_NAME = "VietAI/vit5-base"
MAX_LENGTH = 512
BASE_DATA_PATH = '/content/MathViT/data/vie_base_knowledge.jsonl'
PROBLEM_DATA_PATH = '/content/MathViT/data/vie_train.jsonl'
THEORY_MODEL_PATH = '/content/drive/MyDrive/RAI/Project/MathViT/model/theory-model'
SOLVER_MODEL_PATH = "/content/drive/MyDrive/RAI/Project/MathViT/model/solver-model"

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
class IdentityMappingMathDataset(Dataset):
    def __init__(self, data_path, tokenizer, max_length=MAX_LENGTH):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.data = self._prepare_data(data_path)

    def _prepare_data(self, data_path):
        processed_data = []
        with open(data_path, 'r', encoding='utf-8') as f:
          for line in f:
            item = json.loads(line)
            text = item.get('input', '') + " " + item.get('output', '')

            tokenized = self.tokenizer(
                text,
                padding='max_length',
                truncation=True,
                max_length=self.max_length,
                return_tensors='pt'
            )

            labels = tokenized['input_ids'].clone()
            input_ids = tokenized['input_ids'].clone()

            processed_data.append({
                'input_ids': input_ids.squeeze(),
                'attention_mask': tokenized['attention_mask'].squeeze(),
                'labels': labels.squeeze()
            })

        return processed_data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

In [7]:
class MathDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=MAX_LENGTH):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        input_text = item['input']
        target_text = item['output']

        input_encodings = self.tokenizer(input_text, truncation=True, padding='max_length', max_length=self.max_length)
        with self.tokenizer.as_target_tokenizer():
          target_encodings = self.tokenizer(target_text, truncation=True, padding='max_length', max_length=self.max_length)

        return {
            'input_ids': torch.tensor(input_encodings['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(input_encodings['attention_mask'], dtype=torch.long),
            'labels': torch.tensor(target_encodings['input_ids'], dtype=torch.long)
        }

In [8]:
def create_differential_optimizer(model, encoder_lr=5e-7, decoder_lr=1e-5):
    encoder_params = [p for n, p in model.named_parameters() if 'encoder' in n]
    decoder_params = [p for n, p in model.named_parameters() if 'decoder' in n]

    optimizer_grouped_parameters = [
        {'params': encoder_params, 'lr': encoder_lr},
        {'params': decoder_params, 'lr': decoder_lr},
    ]

    return (AdamW(optimizer_grouped_parameters), None)

In [None]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

In [None]:
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(36096, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(36096, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [None]:
theory_train_dataset = IdentityMappingMathDataset(BASE_DATA_PATH, tokenizer)

In [None]:
theory_training_args = Seq2SeqTrainingArguments(
    output_dir=THEORY_MODEL_PATH,
    per_device_train_batch_size=4,
    num_train_epochs=8,
    learning_rate=1e-5,
    save_strategy="no",
    logging_dir='./logs/theory',
    logging_steps=10
)

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

theory_trainer = Seq2SeqTrainer(
    model=model,
    args=theory_training_args,
    train_dataset=theory_train_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer
)

theory_trainer.train()

  theory_trainer = Seq2SeqTrainer(


Step,Training Loss
10,23.8543
20,13.6464
30,6.5615
40,2.5395
50,1.3424
60,0.9136
70,0.7338
80,0.5699
90,0.546


TrainOutput(global_step=96, training_loss=5.312626736859481, metrics={'train_runtime': 100.3345, 'train_samples_per_second': 3.588, 'train_steps_per_second': 0.957, 'total_flos': 219224840601600.0, 'train_loss': 5.312626736859481, 'epoch': 8.0})

In [None]:
model.save_pretrained(THEORY_MODEL_PATH)
tokenizer.save_pretrained(THEORY_MODEL_PATH)

('/content/drive/MyDrive/Project/MathViT/model/theory-model/tokenizer_config.json',
 '/content/drive/MyDrive/Project/MathViT/model/theory-model/special_tokens_map.json',
 '/content/drive/MyDrive/Project/MathViT/model/theory-model/spiece.model',
 '/content/drive/MyDrive/Project/MathViT/model/theory-model/added_tokens.json',
 '/content/drive/MyDrive/Project/MathViT/model/theory-model/tokenizer.json')

In [None]:
del theory_trainer
del model
del tokenizer
# Xóa các đối tượng lớn khác
# (model và tokenizer sẽ được gán lại ở Giai đoạn 2)
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

In [9]:
solver_tokenizer = AutoTokenizer.from_pretrained(SOLVER_MODEL_PATH)
solver_model = AutoModelForSeq2SeqLM.from_pretrained(SOLVER_MODEL_PATH)

In [10]:
problem_dataset = []

with open(PROBLEM_DATA_PATH, 'r', encoding='utf-8') as f:
  for line in f:
    problem_dataset.append(json.loads(line))

# train_data, val_data = train_test_split(problem_dataset, test_size=0.15, random_state=42)

In [11]:
problem_train_dataset = MathDataset(problem_dataset, solver_tokenizer)
# problem_val_dataset = MathDataset(val_data, tokenizer)

In [None]:
solver_model.to(device)

In [18]:
problem_training_args = Seq2SeqTrainingArguments(
    output_dir=SOLVER_MODEL_PATH,
    per_device_train_batch_size=4,
    num_train_epochs=4,
    save_strategy="no",
    logging_dir='./logs/solver',
    logging_steps=100,
    learning_rate=5e-7
)

problem_trainer = Seq2SeqTrainer(
    model=solver_model,
    args=problem_training_args,
    train_dataset=problem_train_dataset,
    tokenizer=solver_tokenizer,
    optimizers=create_differential_optimizer(solver_model)
)

problem_trainer.train()

  problem_trainer = Seq2SeqTrainer(


Step,Training Loss
100,0.247
200,0.2509
300,0.2458
400,0.2454
500,0.2543
600,0.247
700,0.2514
800,0.2435
900,0.2558
1000,0.2609


TrainOutput(global_step=1869, training_loss=0.25629319629291597, metrics={'train_runtime': 2131.2828, 'train_samples_per_second': 3.506, 'train_steps_per_second': 0.877, 'total_flos': 4550742316154880.0, 'train_loss': 0.25629319629291597, 'epoch': 1.0})

In [20]:
solver_model.save_pretrained(SOLVER_MODEL_PATH)
solver_tokenizer.save_pretrained(SOLVER_MODEL_PATH)

('/content/drive/MyDrive/RAI/Project/MathViT/model/solver-model/tokenizer_config.json',
 '/content/drive/MyDrive/RAI/Project/MathViT/model/solver-model/special_tokens_map.json',
 '/content/drive/MyDrive/RAI/Project/MathViT/model/solver-model/spiece.model',
 '/content/drive/MyDrive/RAI/Project/MathViT/model/solver-model/added_tokens.json',
 '/content/drive/MyDrive/RAI/Project/MathViT/model/solver-model/tokenizer.json')

In [None]:
del problem_trainer
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

##TEST

In [14]:
def solve_math_problem(problem_text):
    input_ids = solver_tokenizer(
        problem_text,
        return_tensors="pt",
        max_length=512,
        truncation=True,
        padding=True
    ).input_ids.to(device)

    with torch.no_grad():
      outputs = solver_model.generate(
          input_ids,
          max_length=512,
          num_beams=4,
          early_stopping=True,
      )

    # return outputs
    return solver_tokenizer.decode(outputs[0], skip_special_tokens=True)

In [31]:
input = [
    "Terry ăn 2 hộp sữa chua mỗi ngày. Hiện tại, chúng đang được bán với giá 4 hộp, mỗi hộp 5 đô la. Vậy anh ấy đã chi bao nhiêu tiền cho sữa chua trong 30 ngày?",
    "Một đội bóng đá đã chơi 22 trận. Họ thắng nhiều hơn thua 8 trận. Vậy họ đã thắng bao nhiêu trận?",
    "Một bụi mâm xôi có 6 cụm, mỗi cụm 20 quả và 67 quả riêng lẻ rải rác khắp bụi. Hỏi tổng cộng có bao nhiêu quả mâm xôi?"
]

In [32]:
solve_math_problem(input[2])

'Mỗi cụm có 6 x 20   6 x 20  120>> 120 quả mâm xôi. Tổng cộng, bụi mâm xôi có 120 + 67   120 + 67  120>> 120 quả mâm xôi.  120'