In [1]:
#pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

In [2]:
pip install unsloth

Collecting unsloth
  Downloading unsloth-2025.5.8-py3-none-any.whl.metadata (47 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.1/47.1 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting unsloth_zoo>=2025.5.10 (from unsloth)
  Downloading unsloth_zoo-2025.5.10-py3-none-any.whl.metadata (8.1 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.30-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting bitsandbytes (from unsloth)
  Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.9.22-py3-none-any.whl.metadata (10 kB)
Collecting trl!=0.15.0,!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3,>=0.7.9 (from unsloth)
  Downloading trl-0.18.0-py3-none-any.whl.metadata (11 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets>=3.4.1->unsloth)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB

In [3]:
#!pip install --no-deps xformers "trl<0.9.0" peft accelerate bitsandbytes

In [4]:
!pip install mlflow huggingface_hub datasets

Collecting mlflow
  Downloading mlflow-2.22.0-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==2.22.0 (from mlflow)
  Downloading mlflow_skinny-2.22.0-py3-none-any.whl.metadata (31 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.22.0->mlflow)
  Downloading databricks_sdk-0.55.0-py3-none-any.whl.metadata (39 kB)
Collecting fastapi<1 (from mlflow-skinny==2.22.0->mlflow)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting packaging<25 (from mlflow-skinny==2.22.0->mlflow)
  Downloading packaging-24.2-py3-none-any.whl.metadata (3.2 kB)
Collecting uvicorn<1 (from mlflow-skinny==2.22.0->mlflow)
  Downloading uvicorn-0.34.2-py3-none-any.whl.metadata (6.5 kB)
Collecting graphql-core<3.3,>=3.1 (from graphene<4->mlflow)
  Downl

# Lib

In [5]:
import os
import json
from unsloth import FastLanguageModel
import torch
import mlflow
import mlflow.pytorch
from datasets import Dataset
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from trl import SFTTrainer, SFTConfig

from huggingface_hub import HfApi, login
import gc
import time
from datetime import datetime
from typing import Dict, List, Tuple, Optional
import logging
from pathlib import Path

import mlflow
from transformers import TrainerCallback
import logging
from trl import setup_chat_format

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


2025-05-29 16:29:24.605376: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748536164.814226      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748536164.871854      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


🦥 Unsloth Zoo will now patch everything to make training faster!


# Config

In [6]:
class TrainingConfig:
    """Centralized configuration for training pipeline"""
    
    # Model settings
    model_name = "unsloth/Qwen3-1.7B-unsloth-bnb-4bit"
    max_seq_length = 2048
    load_in_4bit = True

    # LoRA settings
    lora_r = 16
    lora_alpha = 16
    lora_dropout = 0.2
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]

    # Training settings
    per_device_train_batch_size = 1
    gradient_accumulation_steps = 8
    warmup_steps = 50
    max_steps = 500
    num_train_epochs = 3 
    learning_rate = 5e-6
    fp16 = not torch.cuda.is_bf16_supported()
    bf16 = torch.cuda.is_bf16_supported()
    logging_steps = 5
    optim = "adamw_8bit"
    weight_decay = 0.01
    lr_scheduler_type = "linear"
    seed = 42

    # MLflow settings
    mlflow_tracking_uri = "http://localhost:5000"  # Thay đổi theo server MLflow của bạn
    experiment_name = "test"
    
    # Hugging Face settings
    hf_username = None
    hf_model_name = None
    hf_token = None
    push_to_hub = True
    
    # Paths
    output_dir = "outputs"
    model_save_dir = "saved_models"

# MLflow

In [7]:
class MLflowCallback(TrainerCallback):
    """Callback to log metrics to MLflow in real-time during training"""
    
    def __init__(self, log_every_n_steps: int = 25):
        self.log_every_n_steps = log_every_n_steps
        
    def on_log(self, args, state, control, model=None, logs=None, **kwargs):
        """Called when logging occurs during training"""
        if logs is not None and state.global_step > 0:
            # Log metrics to MLflow
            step = state.global_step
            
            for key, value in logs.items():
                if isinstance(value, (int, float)):
                    mlflow.log_metric(key, value, step=step)
                    
            # Log thông tin chi tiết mỗi log_every_n_steps
            if step % self.log_every_n_steps == 0:
                logger.info(f"Step {step}: Logged metrics to MLflow - {logs}")
    
    def on_train_begin(self, args, state, control, **kwargs):
        """Called at the beginning of training"""
        logger.info("🚀 Bắt đầu training - MLflow logging activated")
        
    def on_train_end(self, args, state, control, **kwargs):
        """Called at the end of training"""
        logger.info("✅ Training hoàn thành - Final metrics logged to MLflow")
        
    def on_evaluate(self, args, state, control, model=None, logs=None, **kwargs):
        """Called after evaluation"""
        if logs is not None:
            step = state.global_step
            for key, value in logs.items():
                if isinstance(value, (int, float)) and key.startswith('eval_'):
                    mlflow.log_metric(key, value, step=step)
            logger.info(f"Step {step}: Evaluation metrics logged to MLflow")



In [8]:
class MLflowTracker:
    """Handle MLflow experiment tracking""" 
    
    def __init__(self, tracking_uri: str, experiment_name: str):
        self.tracking_uri = tracking_uri
        self.experiment_name = experiment_name
        self._setup_mlflow()
    
    def _setup_mlflow(self):
        """Setup MLflow tracking"""
        mlflow.set_tracking_uri(self.tracking_uri)
        mlflow.set_experiment(self.experiment_name)
        logger.info(f"📊 MLflow tracking URI: {self.tracking_uri}")
        logger.info(f"🧪 Experiment: {self.experiment_name}")
    
    def start_run(self, run_name: str):
        """Start MLflow run"""
        return mlflow.start_run(run_name=run_name)
    
    def log_config(self, config: TrainingConfig, train_size: int, test_size: int):
        """Log training configuration"""
        mlflow.log_params({
            "model_name": config.model_name,
            "max_seq_length": config.max_seq_length,
            "lora_r": config.lora_r,
            "lora_alpha": config.lora_alpha,
            "lora_dropout": config.lora_dropout,
            "learning_rate": config.learning_rate,
            "batch_size": config.per_device_train_batch_size,
            "gradient_accumulation_steps": config.gradient_accumulation_steps,
            "max_steps": config.max_steps,
            "warmup_steps": config.warmup_steps,
            "train_samples": train_size,
            "test_samples": test_size,
            "epochs": config.num_train_epochs
        })
    
    def log_training_metrics(self, trainer):
        """Log training metrics from trainer"""
        try:
            for log_entry in trainer.state.log_history:
                step = log_entry.get('step', 0)
                for key, value in log_entry.items():
                    if key != 'step' and isinstance(value, (int, float)):
                        mlflow.log_metric(key, value, step=step)
        except Exception as e:
            logger.warning(f"⚠️ Không thể log training metrics: {e}")
    
    def log_model_info(self, model_path: str, hf_model_name: str = None):
        """Log model information"""
        mlflow.log_artifacts(model_path, "model_files")
        mlflow.log_param("model_save_path", model_path)
        
        if hf_model_name:
            mlflow.log_param("hf_model_name", hf_model_name)
            mlflow.log_param("hf_model_url", f"https://huggingface.co/{hf_model_name}")
            mlflow.log_param("pushed_to_hf", True)

# Registry class

In [9]:
class ModelRegistry:
    """Handle model registry operations with Hugging Face"""
    
    def __init__(self, username: str, token: str):
        self.username = username
        self.token = token
        self.api = HfApi()
        self._authenticate()
    
    def _authenticate(self):
        """Authenticate with Hugging Face"""
        try:
            login(token=self.token)
            logger.info("✅ Đã đăng nhập Hugging Face Hub thành công")
        except Exception as e:
            logger.error(f"❌ Lỗi đăng nhập Hugging Face: {e}")
            raise
    
    def create_model_card(self, model_name: str, config: TrainingConfig, 
                         training_results: Dict = None) -> str:
        """Create comprehensive model card"""
        training_info = ""
        if training_results:
            training_info = f"""
## Training Results
- **Final Training Loss**: {training_results.get('train_loss', 'N/A')}
- **Final Evaluation Loss**: {training_results.get('eval_loss', 'N/A')}
- **Training Steps**: {training_results.get('steps', 'N/A')}
- **Training Time**: {training_results.get('training_time', 'N/A')}
"""
        
        model_card = f"""---
language:
- vi
license: apache-2.0
base_model: {config.model_name}
tags:
- vietnamese
- news
- summarization 
- unsloth
- fine-tuned
- lora
library_name: transformers
pipeline_tag: text2text-generation
metrics:
- rouge
- bleu
---

# {model_name}

## Model Description
Model fine-tuned từ **{config.model_name}** để tóm tắt tin tức tiếng Việt. Được huấn luyện bằng phương pháp LoRA với thư viện Unsloth.

## Model Details
- **Base Model**: {config.model_name}
- **Language**: Vietnamese (Tiếng Việt)
- **Task**: News Summarization (Tóm tắt tin tức)
- **Training Method**: LoRA (Low-Rank Adaptation)
- **Library**: Unsloth + Transformers

## Training Configuration
- **LoRA Rank**: {config.lora_r}
- **LoRA Alpha**: {config.lora_alpha}
- **Learning Rate**: {config.learning_rate}
- **Batch Size**: {config.per_device_train_batch_size}
- **Max Steps**: {config.max_steps}
- **Sequence Length**: {config.max_seq_length}
{training_info}

## Usage

```python
from unsloth import FastLanguageModel
import torch

# Load model
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="{self.username}/{model_name}",
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
)
FastLanguageModel.for_inference(model)

# Create prompt
def create_prompt(title, text):
    return f'''<|im_start|>system
Bạn là một trợ lý AI chuyên tóm tắt tin tức tiếng Việt.
<|im_end|>
<|im_start|>user
Tiêu đề: {{title}}

Nội dung bài báo:
{{text}}

Hãy tóm tắt bài báo trên:
<|im_end|>
<|im_start|>assistant
'''

# Generate summary
def generate_summary(title, text):
    prompt = create_prompt(title, text)
    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
    
    outputs = model.generate(
        **inputs,
        max_new_tokens=150,
        use_cache=True,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id
    )
    
    response = tokenizer.decode(outputs[0], skip_special_tokens=False)
    if "<|im_start|>assistant" in response:
        summary = response.split("<|im_start|>assistant")[-1].split("</s>")[0].strip()
        return summary
    return "Không thể tạo tóm tắt"
```

## Training Data
Model được huấn luyện trên dataset tin tức tiếng Việt với các cặp (title, content, summary).

## Limitations
- Model được tối ưu cho tin tức tiếng Việt
- Có thể không hoạt động tốt với domain khác
- Cần GPU để chạy inference nhanh

## License
Apache 2.0
"""
        return model_card
    
    def push_model(self, model, tokenizer, model_name: str, 
                   config: TrainingConfig, training_results: Dict = None) -> bool:
        """Push model to Hugging Face Hub"""
        try:
            full_model_name = f"{self.username}/{model_name}"
            logger.info(f"🚀 Đang push model lên Hugging Face: {full_model_name}")
            
            # Create model card
            model_card = self.create_model_card(model_name, config, training_results)
            
            # Save model card
            with open("README.md", "w", encoding="utf-8") as f:
                f.write(model_card)
            
            # Push model (preserves LoRA adapters)
            model.push_to_hub(
                full_model_name,
                token=self.token,
                private=False,
            )
            
            # Push tokenizer
            tokenizer.push_to_hub(
                full_model_name,
                token=self.token,
            )
            
            logger.info(f"✅ Push model thành công!")
            logger.info(f"🔗 Model URL: https://huggingface.co/{full_model_name}")
            
            return True
            
        except Exception as e:
            logger.error(f"❌ Lỗi khi push model: {e}")
            return False
    
    def load_model(self, model_name: str, max_seq_length: int = 2048):
        """Load model from Hugging Face Hub"""
        try:
            full_model_name = f"{self.username}/{model_name}"
            logger.info(f"📥 Đang load model từ Hugging Face: {full_model_name}")
            
            model, tokenizer = FastLanguageModel.from_pretrained(
                model_name=full_model_name,
                max_seq_length=max_seq_length,
                dtype=None,
                load_in_4bit=True,
            )
            
            FastLanguageModel.for_inference(model)
            logger.info("✅ Load model thành công!")
            return model, tokenizer
            
        except Exception as e:
            logger.error(f"❌ Lỗi khi load model: {e}")
            return None, None


# Processor

In [10]:
class DataProcessor:
    """Handle data processing for training"""
    
    @staticmethod
    def create_prompt(title: str, text: str, summary: str = "") -> str:
        """Create instruction prompt for summarization"""
        return f"""<|im_start|>system
Bạn là một trợ lý AI chuyên tóm tắt tin tức tiếng Việt. Nhiệm vụ của bạn là tạo ra một bản tóm tắt ngắn gọn, chính xác và đầy đủ thông tin quan trọng nhất từ bài báo được cung cấp.
<|im_end|>
<|im_start|>user
Tiêu đề: {title}

Nội dung bài báo:
{text}

Hãy tóm tắt bài báo trên một cách ngắn gọn và chính xác:
<|im_end|>
<|im_start|>assistant
{summary}<|im_end|>"""
    
    @staticmethod
    def format_data(data: List[Dict]) -> List[Dict]:
        """Format data for training"""
        formatted_data = []
        for item in data:
            formatted_prompt = DataProcessor.create_prompt(
                item['title'], item['text'], item['summary']
            )
            formatted_data.append({"text": formatted_prompt})
        return formatted_data
    
    @staticmethod
    def prepare_datasets(train_data: List[Dict], test_data: List[Dict]) -> Tuple[Dataset, Dataset]:
        """Prepare datasets for training"""
        train_formatted = DataProcessor.format_data(train_data)
        test_formatted = DataProcessor.format_data(test_data)
        
        train_dataset = Dataset.from_list(train_formatted)
        eval_dataset = Dataset.from_list(test_formatted)
        
        return train_dataset, eval_dataset


# Model part

In [11]:
# Cập nhật class ModelTrainer
class ModelTrainer:
    """Handle model training operations"""
    
    def __init__(self, config: TrainingConfig):
        self.config = config
    
    def setup_model(self):
        """Setup model with LoRA adapters"""
        logger.info("🔧 Đang setup model...")
        
        model, tokenizer = FastLanguageModel.from_pretrained(
            model_name=self.config.model_name,
            max_seq_length=self.config.max_seq_length,
            dtype=None,
            load_in_4bit=self.config.load_in_4bit,
        )
    
        # ✅ FIX: Đảm bảo special tokens được setup đúng
        logger.info(f"Original EOS token: {tokenizer.eos_token}")
        logger.info(f"Original PAD token: {tokenizer.pad_token}")
        
        # Xử lý eos_token cho Qwen2
        if tokenizer.eos_token is None or tokenizer.eos_token == '<EOS_TOKEN>':
            # Qwen2 thường sử dụng '<|im_end|>' hoặc '<|endoftext|>'
            if '<|im_end|>' in tokenizer.get_vocab():
                tokenizer.eos_token = '<|im_end|>'
            elif '<|endoftext|>' in tokenizer.get_vocab():
                tokenizer.eos_token = '<|endoftext|>'
            else:
                # Fallback: sử dụng token có sẵn hoặc thêm mới
                special_tokens = tokenizer.special_tokens_map
                if 'eos_token' in special_tokens:
                    tokenizer.eos_token = special_tokens['eos_token']
                else:
                    tokenizer.add_special_tokens({'eos_token': '<|endoftext|>'})
                    model.resize_token_embeddings(len(tokenizer))
        
        # Xử lý pad_token
        if tokenizer.pad_token is None:
            if tokenizer.eos_token:
                tokenizer.pad_token = tokenizer.eos_token
            else:
                tokenizer.add_special_tokens({'pad_token': '<|pad|>'})
                model.resize_token_embeddings(len(tokenizer))
        
        # Lưu tokenizer để đảm bảo special tokens được persist
        tokenizer.save_pretrained('./temp_tokenizer')
        
        logger.info(f"Final EOS token: {tokenizer.eos_token}")
        logger.info(f"Final PAD token: {tokenizer.pad_token}")
        logger.info(f"EOS token ID: {tokenizer.eos_token_id}")
        logger.info(f"PAD token ID: {tokenizer.pad_token_id}")
    
        # ✅ Add LoRA adapters
        model = FastLanguageModel.get_peft_model(
            model,
            r=self.config.lora_r,
            target_modules=self.config.target_modules,
            lora_alpha=self.config.lora_alpha,
            lora_dropout=self.config.lora_dropout,
            bias="none",
            use_gradient_checkpointing="unsloth",
            random_state=self.config.seed,
            use_rslora=False,
            loftq_config=None,
        )
    
        return model, tokenizer

        return model, tokenizer
    def create_training_args(self) -> SFTConfig:
        """Create training arguments"""
        return SFTConfig(
            per_device_train_batch_size=self.config.per_device_train_batch_size,
            gradient_accumulation_steps=self.config.gradient_accumulation_steps,
            warmup_steps=self.config.warmup_steps,
            max_steps=self.config.max_steps,
            num_train_epochs=self.config.num_train_epochs,
            learning_rate=self.config.learning_rate,
            fp16=self.config.fp16,
            bf16=self.config.bf16,
            logging_steps=self.config.logging_steps,
            optim=self.config.optim,
            weight_decay=self.config.weight_decay,
            lr_scheduler_type=self.config.lr_scheduler_type,
            seed=self.config.seed,
            output_dir=self.config.output_dir,
            report_to="none",  # Quan trọng: tắt report mặc định
            save_steps=50,
            eval_steps=50,
            eval_strategy="steps",
            save_total_limit=2,
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
            greater_is_better=False,
            dataset_text_field="text",
            dataset_num_proc=2,
            max_seq_length=self.config.max_seq_length,
        )

In [12]:
class InferenceEngine:
    """Handle model inference"""
    
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer
        FastLanguageModel.for_inference(self.model)
    
    def generate_summary(self, title: str, text: str, max_new_tokens: int = 150) -> str:
        """Generate summary for given title and text"""
        prompt = DataProcessor.create_prompt(title, text, "")
        inputs = self.tokenizer([prompt], return_tensors="pt").to("cuda")
        
        outputs = self.model.generate(
            **inputs,
            min_new_tokens=10,
            max_new_tokens=max_new_tokens,
            use_cache=True,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=self.tokenizer.eos_token_id
        )
        
        response = self.tokenizer.decode(outputs[0], skip_special_tokens=False)
        if "<|im_start|>assistant" in response:
            summary = response.split("<|im_start|>assistant")[-1].split("</s>")[0].strip()
            return summary
        return "Không thể tạo tóm tắt"
    
    def evaluate_samples(self, test_data: List[Dict], num_samples: int = 5):
        """Evaluate model on test samples"""
        logger.info("🔍 Đang đánh giá model...")
        
        results = []
        for i in range(min(num_samples, len(test_data))):
            item = test_data[i]
            generated_summary = self.generate_summary(item['title'], item['text'])
            
            result = {
                'title': item['title'],
                'original_summary': item['summary'],
                'generated_summary': generated_summary
            }
            results.append(result)
            
            logger.info(f"\n--- Mẫu {i+1} ---")
            logger.info(f"Tiêu đề: {item['title']}")
            logger.info(f"Tóm tắt gốc: {item['summary']}")
            logger.info(f"Tóm tắt tạo ra: {generated_summary}")
            logger.info("-" * 50)
        
        return results


In [13]:
class TrainingPipeline:
    """Main training pipeline orchestrator"""
    
    def __init__(self, config: TrainingConfig):
        self.config = config
        self.trainer = ModelTrainer(config)
        self.mlflow_tracker = MLflowTracker(config.mlflow_tracking_uri, config.experiment_name)
        self.model_registry = None
        
        # Create directories
        Path(config.output_dir).mkdir(exist_ok=True)
        Path(config.model_save_dir).mkdir(exist_ok=True)
    
    def setup_huggingface(self, username: str, token: str):
        """Setup Hugging Face integration"""
        self.model_registry = ModelRegistry(username, token)
        self.config.hf_username = username
        self.config.hf_token = token
    
    def run_training(self, train_data: List[Dict], test_data: List[Dict], 
                    model_name: str, run_name: str = None) -> Tuple[any, any]:
        """Run complete training pipeline"""
        
        if run_name is None:
            run_name = f"{model_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        
        start_time = time.time()
        
        with self.mlflow_tracker.start_run(run_name):
            try:
                # Log configuration
                logger.info("📝 Logging configuration...")
                self.mlflow_tracker.log_config(self.config, len(train_data), len(test_data))
                
                # Prepare data
                logger.info("📊 Chuẩn bị dữ liệu...")
                train_dataset, eval_dataset = DataProcessor.prepare_datasets(train_data, test_data)
                
                # Setup model
                logger.info("🤖 Setup model...")
                model, tokenizer = self.trainer.setup_model()
                
                # Setup training
                logger.info("⚙️ Setup training...")
                training_args = self.trainer.create_training_args()
                
                # Tạo MLflow callback
                mlflow_callback = MLflowCallback(log_every_n_steps=self.config.logging_steps)
                
                # Create trainer với callback
                trainer = SFTTrainer(
                    args=training_args,
                    model=model,
                    processing_class=tokenizer,
                    train_dataset=train_dataset,
                    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
                    eval_dataset=eval_dataset,
                    callbacks=[mlflow_callback],  # THÊM CALLBACK
                )
                
                # Train model (bây giờ sẽ log real-time)
                logger.info("🚀 Bắt đầu training với real-time MLflow logging...")
                trainer.train()
                
                # Calculate training time
                training_time = time.time() - start_time
                
                # Get final metrics (vẫn giữ để đảm bảo)
                final_train_loss = 0.0
                final_eval_loss = 0.0
                if trainer.state.log_history:
                    for log_entry in reversed(trainer.state.log_history):
                        if 'train_loss' in log_entry and final_train_loss == 0.0:
                            final_train_loss = log_entry['train_loss']
                        if 'eval_loss' in log_entry and final_eval_loss == 0.0:
                            final_eval_loss = log_entry['eval_loss']
                        if final_train_loss > 0 and final_eval_loss > 0:
                            break
                
                # Save model locally
                model_save_path = f"{self.config.model_save_dir}/{model_name}"
                logger.info(f"💾 Lưu model tại: {model_save_path}")
                model.save_pretrained(model_save_path)
                tokenizer.save_pretrained(model_save_path)
                
                # Training results summary
                training_results = {
                    'train_loss': final_train_loss,
                    'eval_loss': final_eval_loss,
                    'steps': self.config.max_steps,
                    'training_time': f"{training_time/60:.2f} minutes"
                }
                
                # Push to Hugging Face if configured
                hf_model_name = None
                if self.config.push_to_hub and self.model_registry:
                    logger.info("🚀 Pushing to Hugging Face...")
                    hf_model_name = f"{self.config.hf_username}/{model_name}"
                    success = self.model_registry.push_model(
                        model, tokenizer, model_name, self.config, training_results
                    )
                    if success:
                        mlflow.log_param("pushed_to_hf", True)
                
                # Log model info to MLflow
                self.mlflow_tracker.log_model_info(model_save_path, hf_model_name)
                
                # Log final metrics
                mlflow.log_metrics({
                    "final_train_loss": final_train_loss,
                    "final_eval_loss": final_eval_loss,
                    "training_time_minutes": training_time/60
                })
                
                # Evaluate model
                logger.info("🔍 Đánh giá model...")
                inference_engine = InferenceEngine(model, tokenizer)
                evaluation_results = inference_engine.evaluate_samples(test_data)
                
                # Save evaluation results
                eval_file = f"{model_save_path}/evaluation_results.json"
                with open(eval_file, 'w', encoding='utf-8') as f:
                    json.dump(evaluation_results, f, ensure_ascii=False, indent=2)
                
                # Clean up
                del trainer
                gc.collect()
                torch.cuda.empty_cache()
                
                logger.info("✅ Training hoàn thành!")
                return model, tokenizer
                
            except Exception as e:
                logger.error(f"❌ Lỗi trong quá trình training: {e}")
                raise
    
    def load_model_for_inference(self, model_name: str, source: str = "huggingface"):
        """Load model for inference
        
        Args:
            model_name: Name of the model
            source: 'local' or 'huggingface'
        """
        if source == "local":
            model_path = f"{self.config.model_save_dir}/{model_name}"
            logger.info(f"📥 Loading model từ local: {model_path}")
            
            model, tokenizer = FastLanguageModel.from_pretrained(
                model_path,
                max_seq_length=self.config.max_seq_length,
                dtype=None,
                load_in_4bit=True,
            )
            FastLanguageModel.for_inference(model)
            return InferenceEngine(model, tokenizer)
            
        elif source == "huggingface" and self.model_registry:
            model, tokenizer = self.model_registry.load_model(model_name)
            if model and tokenizer:
                return InferenceEngine(model, tokenizer)
            
        return None

 # Model settings - default
    model_name = "unsloth/Qwen2.5-1.5B-Instruct-bnb-4bit"
    max_seq_length = 1024
    load_in_4bit = True

    # LoRA settings
    lora_r = 16
    lora_alpha = 16
    lora_dropout = 0.2
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]

    # Training settings
    per_device_train_batch_size = 1
    gradient_accumulation_steps = 8
    warmup_steps = 50
    max_steps = 500
    learning_rate = 5e-6
    fp16 = not torch.cuda.is_bf16_supported()
    bf16 = torch.cuda.is_bf16_supported()
    logging_steps = 25
    optim = "adamw_8bit"
    weight_decay = 0.01
    lr_scheduler_type = "linear"
    seed = 42

    # MLflow settings
    mlflow_tracking_uri = "http://localhost:5000"  # Thay đổi theo server MLflow của bạn
    experiment_name = "test"
    
    # Hugging Face settings
    hf_username = None
    hf_model_name = None
    hf_token = None
    push_to_hub = True
    
    # Paths
    output_dir = "outputs"
    model_save_dir = "saved_models"

In [None]:
def main():
    """Example of how to use the training pipeline"""
    
    # Setup configuration
    config = TrainingConfig()
    config.mlflow_tracking_uri = "https://mlflow-server-aiteamabc.onrender.com"  # Thay đổi URL MLflow server
    config.max_steps = 100
    config.experiment_name='text'
    config.max_sequence_length = 2048
    #config.model_name = 'unsloth/Qwen3-1.7B'
    config.logging_steps = 5
    
    # Initialize pipeline
    pipeline = TrainingPipeline(config)
    
    # Setup Hugging Face (optional)
    pipeline.setup_huggingface(
        username="vinhthuan",
        token=""
    )
    
    # Load your data
    train_data = json.load(open("/kaggle/input/data-abc/train_v1.json"))
    test_data = json.load(open("/kaggle/input/data-abc/test_v1.json"))
    
    # Run training
    model, tokenizer = pipeline.run_training(
         train_data=train_data,
         test_data=test_data,
         model_name="vietnamese-news-summarizer-v3",
         run_name="experiment_001"
     )
    
    # Load model for inference
    inference_engine = pipeline.load_model_for_inference(
         model_name="vietnamese-news-summarizer-v3",
         source="local"  # or "huggingface"
     )
    
    # Generate summary
    if inference_engine:
         summary = inference_engine.generate_summary(
             title="Tiêu đề bài báo",
             text="Nội dung bài báo..."
         )
         print(f"Tóm tắt: {summary}")

if __name__ == "__main__":
    main()

==((====))==  Unsloth 2025.5.8: Fast Qwen3 patching. Transformers: 4.51.3.
   \\   /|    Tesla P100-PCIE-16GB. Num GPUs = 1. Max memory: 15.888 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 6.0. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/1.41G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/237 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/4.67k [00:00<?, ?B/s]

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.2.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.5.8 patched 28 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/1564 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/381 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,564 | Num Epochs = 1 | Total steps = 100
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 8 x 1) = 8
 "-____-"     Trainable parameters = 17,432,576/7,000,000,000 (0.25% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss
50,2.571,2.469408
100,2.4115,2.372159


Unsloth: Not an error, but Qwen3ForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


README.md:   0%|          | 0.00/598 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/69.8M [00:00<?, ?B/s]

Saved model to https://huggingface.co/vinhthuan/vietnamese-news-summarizer-v3


  0%|          | 0/1 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

🏃 View run experiment_001 at: https://mlflow-server-aiteamabc.onrender.com/#/experiments/3/runs/fd25c077e761496aace6c452155a0033
🧪 View experiment at: https://mlflow-server-aiteamabc.onrender.com/#/experiments/3
==((====))==  Unsloth 2025.5.8: Fast Qwen3 patching. Transformers: 4.51.3.
   \\   /|    Tesla P100-PCIE-16GB. Num GPUs = 1. Max memory: 15.888 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 6.0. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Tóm tắt: <|im_end|>

</think>

**Tóm tắt bài báo:**

[Tiêu đề bài báo]

[Thông tin ngắn gọn về nội dung bài báo, bao gồm các điểm chính như sự kiện, người liên quan, kết quả, ý nghĩa, v.v.]

(Được thay thế bằng nội dung cụ thể từ bài báo.)<|im_end|>
