# 🚀 Kaggle Training Instructions

## Before running this notebook on Kaggle:

1. **Create your dataset** on Kaggle with your images and text data
2. **Update the dataset name** in the dataset configuration cell (section 2, "Data Setup and Verification"): change `DATASET_NAME = 'auto-vivqa'` to your actual dataset name
3. **Make sure your dataset structure is**:
   ```
   /kaggle/input/your-dataset-name/
   ├── images/images/          # Image files (.jpg, .png)
   └── text/text/             # CSV file
       └── evaluate_60k_data_balanced_preprocessed.csv
   ```
4. **Enable GPU** in Kaggle notebook settings (recommended: P100 or T4)
5. **Run all cells** sequentially

The notebook will automatically:
- Clone this repository to `/kaggle/working`
- Use the project's configuration system (`get_improved_config()`)
- Auto-detect Kaggle environment and apply optimized settings
- Update dataset paths automatically
- Train the Vietnamese VQA model with the exact same logic as `main.py`
- Save results and checkpoints

**Note**: The configuration system automatically handles:
- Environment detection (Kaggle vs local)
- GPU optimization and batch sizing
- Learning rates and training parameters
- Paths and output directories

No manual configuration needed - just update your dataset name!

---

# Enhanced Vietnamese VQA Training on Kaggle

This notebook provides an optimized training pipeline for Vietnamese Visual Question Answering using CLIP, XLM-RoBERTa, and mT5 models.

## Features:
- Multiple correct answers support
- Mixed precision training (AMP)
- Gradient accumulation
- Memory optimization for Kaggle GPUs
- Early stopping and checkpointing
- Comprehensive evaluation metrics

## 1. Environment Setup and System Check

In [None]:
# Check if running on Kaggle and setup environment
import os
import sys

# The Kaggle runtime is detected by the presence of its /kaggle root directory.
is_kaggle = os.path.exists('/kaggle')
print(f"Running on Kaggle: {is_kaggle}")

if is_kaggle:
    # Change to kaggle working directory
    os.chdir('/kaggle/working')
    print("Working directory:", os.getcwd())

    # Clone the repository if not already present
    if not os.path.exists('Finetune-VQAbaseline'):
        print("Cloning repository...")
        import subprocess
        try:
            # Output is captured so that a successful clone stays quiet.
            subprocess.run([
                'git', 'clone',
                'https://github.com/nguyennn263/Finetune-VQAbaseline.git'
            ], capture_output=True, text=True, check=True)
            print("✓ Repository cloned successfully")
        except subprocess.CalledProcessError as e:
            print(f"❌ Error cloning repository: {e}")
            # git's own diagnostics were captured above; show them, otherwise
            # the user only sees the non-zero exit status.
            if e.stderr:
                print(e.stderr.strip())
            print("Please manually clone the repo or check the repository URL")
            raise
    else:
        print("✓ Repository already exists")

    # Change to project directory and add to Python path
    os.chdir('Finetune-VQAbaseline')
    sys.path.insert(0, '/kaggle/working/Finetune-VQAbaseline')
    print("Changed to project directory:", os.getcwd())

else:
    print("Running locally - using current directory")
    # Add current directory to Python path for local testing
    current_dir = os.getcwd()
    if current_dir not in sys.path:
        sys.path.insert(0, current_dir)

In [None]:
# Install required packages (if not already installed)
import subprocess
import sys

def install_package(package):
    """Install *package* with pip unless its module can already be imported.

    Args:
        package: A pip requirement string such as ``'torch>=2.0.0'`` or
            ``'nltk==3.8.1'``. The part before the first version specifier,
            extras bracket, marker or whitespace is used as the module name
            (with ``-`` replaced by ``_``).

    Raises:
        subprocess.CalledProcessError: If the pip invocation fails.

    Note:
        Only importability is checked; an already installed module is not
        compared against the requested version.
    """
    import re

    # Cut the requirement at the first specifier character so that
    # 'torch>=2.0.0' is probed as 'torch' rather than as the whole string,
    # which could never be imported and forced a reinstall every run.
    dist_name = re.split(r'[<>=!~\[;\s]', package.strip(), maxsplit=1)[0]
    module_name = dist_name.replace('-', '_')

    try:
        __import__(module_name)
        print(f"✓ {package} already installed")
    except ImportError:
        print(f"Installing {package}...")
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', package, '--quiet'])

# Requirement strings that are handed to install_package one at a time
required_packages = [
    'torch>=2.0.0',
    'torchvision>=0.15.0',
    'transformers>=4.30.0',
    'sentencepiece>=0.1.99',
    'rouge_score>=0.1.2',
    'nltk>=3.8.0',
]

# Process the requirements in the order listed above
for requirement in required_packages:
    install_package(requirement)

In [None]:
# System information check
import torch
import os

def print_system_info():
    """Print PyTorch/CUDA details, per-GPU properties, CPU count and RAM size.

    The memory line reports total physical memory (page size multiplied by
    the number of physical pages). On platforms where ``os.sysconf`` or
    these names are unavailable, a placeholder line is printed instead of
    raising.
    """
    banner = "=" * 60
    print(banner)
    print("SYSTEM INFORMATION")
    print(banner)

    print(f"PyTorch version: {torch.__version__}")
    cuda_available = torch.cuda.is_available()
    print(f"CUDA available: {cuda_available}")

    if cuda_available:
        print(f"CUDA version: {torch.version.cuda}")
        print(f"GPU count: {torch.cuda.device_count()}")

        for i in range(torch.cuda.device_count()):
            gpu_props = torch.cuda.get_device_properties(i)
            memory_gb = gpu_props.total_memory / 1e9
            print(f"GPU {i}: {gpu_props.name}")
            print(f"  Memory: {memory_gb:.1f} GB")
            print(f"  Compute capability: {gpu_props.major}.{gpu_props.minor}")

    print(f"CPU count: {os.cpu_count()}")

    # os.sysconf is missing on non-POSIX platforms (AttributeError) and
    # rejects unknown or unsupported names with ValueError / OSError.
    try:
        total_bytes = os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES')
    except (AttributeError, ValueError, OSError):
        print("Total memory: unavailable on this platform")
    else:
        print(f"Total memory: {total_bytes / 1e9:.1f} GB")
    print(banner)


print_system_info()

## 2. Data Setup and Verification

In [None]:
# Dataset location check for Kaggle (runs before the project modules are imported)
if not is_kaggle:
    print("Running locally - will use local paths from config")
else:
    # Define your dataset name here
    DATASET_NAME = 'auto-vivqa'  # ⚠️ CHANGE THIS to your actual Kaggle dataset name
    dataset_path = f'/kaggle/input/{DATASET_NAME}'

    print("Dataset configuration:")
    print(f"  Dataset name: {DATASET_NAME}")
    print(f"  Expected path: /kaggle/input/{DATASET_NAME}")

    # Report whether the dataset is mounted; if not, list what is available
    if not os.path.exists(dataset_path):
        print(f"❌ Dataset not found at {dataset_path}")
        print("Available datasets:")
        if os.path.exists('/kaggle/input'):
            for entry in os.listdir('/kaggle/input'):
                print(f"  - {entry}")
        print("\n💡 Update DATASET_NAME above to match your dataset")
    else:
        print(f"✓ Dataset found at {dataset_path}")

## 3. Import Libraries and Project Modules

In [None]:
# Basic imports and system check (like main.py beginning)
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
from torch.utils.data import DataLoader
import torch
import numpy as np

# Additional imports for notebook functionality
from collections import Counter
import gc
import time
from tqdm.auto import tqdm

# Show the PyTorch version, its CUDA version string and whether CUDA is usable
for detail in (torch.__version__, torch.version.cuda, torch.cuda.is_available()):
    print(detail)

print("✓ Basic libraries imported successfully")

In [None]:
# Import modules exactly like main.py
print("Importing modules...")

try:
    # Import exactly as in main.py
    from cxmt5.config import get_improved_config
    from cxmt5.model import ImprovedVietnameseVQAModel, normalize_vietnamese_answer
    from cxmt5.cxmt5 import VietnameseVQADataset, VietnameseVQAModel, VQATrainer, prepare_data_from_dataframe
    from transformers import (
        CLIPProcessor, CLIPModel,
        XLMRobertaTokenizer, XLMRobertaModel,
        T5ForConditionalGeneration, T5Tokenizer,
        AutoTokenizer, AutoModel
    )

except ImportError as e:
    print(f"❌ Failed to import modules: {e}")
    print("Current working directory:", os.getcwd())
    print("Python path:", sys.path[:3])

    # Debug information: show what is actually on disk relative to the cwd
    if os.path.exists('cxmt5'):
        print("Files in cxmt5 directory:")
        for file in os.listdir('cxmt5'):
            print(f"  - {file}")
    else:
        print("❌ cxmt5 directory not found!")
        print("Available directories:")
        for item in os.listdir('.'):
            if os.path.isdir(item):
                print(f"  - {item}/")

    # Chain explicitly so the traceback presents the original failure as the
    # cause rather than as an error raised while handling another error.
    raise ImportError("Cannot import required modules. Please check the repository structure.") from e

else:
    # Only reached when every import above succeeded
    print("✓ Successfully imported all modules")
    print("✓ cxmt5.config imported")
    print("✓ cxmt5.model imported")
    print("✓ cxmt5.cxmt5 imported")
    print("✓ transformers imported")

## 4. Load Configuration

In [None]:
# Load configuration (exactly like main.py)
print("Loading configuration...")

# Configuration object supplied by the project's cxmt5.config module.
# NOTE(review): none of the visible cells pass DATASET_NAME into this config -
# confirm that get_improved_config() resolves the dataset paths on its own.
config = get_improved_config()

print("✓ Configuration loaded successfully")
print("✓ Environment detected and config applied")

print("\nConfiguration Summary:")

# Required entries, printed in a fixed order
for label, key in (
    ("Device", 'device'),
    ("Batch size", 'batch_size'),
    ("Epochs", 'num_epochs'),
    ("Vision model", 'vision_model'),
    ("Text model", 'text_model'),
    ("Decoder model", 'decoder_model'),
    ("Image directory", 'image_dir'),
    ("Text directory", 'text_dir'),
):
    print(f"  {label}: {config[key]}")

# Learning rates are shown in scientific notation
print("  Learning rates:")
for label, key in (
    ("Decoder", 'decoder_lr'),
    ("Encoder", 'encoder_lr'),
    ("Vision", 'vision_lr'),
):
    print(f"    {label}: {config[key]:.2e}")

# Optional switches fall back to False when absent
for label, key in (
    ("Data augmentation", 'use_data_augmentation'),
    ("Wandb logging", 'use_wandb'),
):
    print(f"  {label}: {config.get(key, False)}")

for label, key in (
    ("Label smoothing", 'label_smoothing'),
    ("Dropout rate", 'dropout_rate'),
):
    print(f"  {label}: {config[key]}")

## 5. Data Loading and Verification

In [None]:
# Verify the configured data locations, then load the CSV and build the question list
print("Data verification using config paths:")
print(f"  Images: {config['image_dir']}")
print(f"  Text: {config['text_dir']}")

# Image directory: count files with a recognised image extension
if not os.path.exists(config['image_dir']):
    print(f"❌ Image directory not found: {config['image_dir']}")
else:
    image_files = [
        name for name in os.listdir(config['image_dir'])
        if name.lower().endswith(('.jpg', '.jpeg', '.png'))
    ]
    print(f"✓ Found {len(image_files):,} images")

# Text directory: the CSV is mandatory, so a missing file aborts the cell
csv_file = f'{config["text_dir"]}/evaluate_60k_data_balanced_preprocessed.csv'
if not os.path.exists(csv_file):
    print(f"❌ CSV file not found: {csv_file}")

    # List available files for debugging
    if os.path.exists(config["text_dir"]):
        print(f"Available files in {config['text_dir']}:")
        for file in os.listdir(config["text_dir"]):
            print(f"  - {file}")
    raise FileNotFoundError(f"Required CSV file not found: {csv_file}")

print(f"✓ Found CSV file: {csv_file}")

# Load and prepare data (same as main.py)
print("Loading data...")
df = pd.read_csv(csv_file)
print(f"✓ Loaded CSV with {len(df):,} rows")
print(f"✓ Columns: {list(df.columns)}")

# Show the first two rows
print("\nSample data:")
print(df.head(2))

# Prepare questions using the same function as main.py
print("Preparing questions...")
questions = prepare_data_from_dataframe(df)
print(f"✓ Prepared {len(questions):,} questions")

# An empty result is treated as a hard error
if not questions:
    raise ValueError("No questions were prepared from the dataframe")

# Show sample (same as main.py)
sample = questions[0]
print(f"\nSample question: {sample['question']}")
print(f"Sample answer: {sample['ground_truth']}")
if 'all_correct_answers' in sample:
    print(f"All correct answers: {sample['all_correct_answers']}")

In [None]:
# Data balance analysis (same as main.py)
def analyze_data_balance(questions):
    """Print answer-distribution statistics and return the answer counts.

    Each question contributes every entry of its ``'all_correct_answers'``
    value when that key is present and non-empty, otherwise its
    ``'ground_truth'`` value. Answers are passed through
    ``normalize_vietnamese_answer`` before counting.

    Args:
        questions: Sequence of question records supporting ``in`` and item
            access for the keys named above.

    Returns:
        A ``collections.Counter`` mapping each normalized answer to its
        number of occurrences. It is empty when *questions* is empty.
    """
    from collections import Counter

    # Collect every answer instance across all questions
    all_answers = []
    for q in questions:
        if 'all_correct_answers' in q and q['all_correct_answers']:
            all_answers.extend(normalize_vietnamese_answer(ans) for ans in q['all_correct_answers'])
        else:
            # Fallback to ground_truth
            all_answers.append(normalize_vietnamese_answer(q['ground_truth']))

    answer_counts = Counter(all_answers)

    print("\nData Balance Analysis (Multiple Answers):")
    print(f"  Total questions: {len(questions):,}")
    print(f"  Total answer instances: {len(all_answers):,}")

    # Every question adds at least one answer, so an empty answer list means
    # there were no questions; stop before the averages divide by zero.
    if not all_answers:
        print("  No answers to analyze")
        return answer_counts

    print(f"  Average answers per question: {len(all_answers) / len(questions):.2f}")
    print(f"  Unique answers: {len(answer_counts):,}")
    print("  Top 10 most common answers:")

    top_answers = answer_counts.most_common(10)
    for answer, count in top_answers:
        percentage = (count / len(all_answers)) * 100
        print(f"    '{answer}': {count} ({percentage:.2f}%)")

    # Check for severe imbalance: share of all instances held by the top answer
    most_common_count = top_answers[0][1]
    imbalance_ratio = most_common_count / len(all_answers)

    if imbalance_ratio > 0.2:  # Lower threshold for multiple answers
        print(f"Severe imbalance detected: {imbalance_ratio:.2f} of answers are the same")
    else:
        print(f"Data balance looks good: {imbalance_ratio:.2f}")

    return answer_counts

# Run the analysis on the questions prepared above and keep the returned answer counts
answer_counts = analyze_data_balance(questions)

In [None]:
## 6. Train/Validation Split

# The following cells read `train_questions` and `val_questions`, but no other
# visible cell defines them, so they would fail with NameError without this split.
# NOTE(review): the 80/20 ratio and the seed below are defaults chosen here -
# align them with the split performed in main.py.
import random

VAL_RATIO = 0.2
SPLIT_SEED = 42

if len(questions) < 2:
    raise ValueError("At least two questions are required to build train and validation splits")

# Shuffle a copy with a fixed seed so the split is reproducible and
# `questions` itself keeps its original order.
shuffled_questions = list(questions)
random.Random(SPLIT_SEED).shuffle(shuffled_questions)

# Keep at least one example on each side of the split
val_size = min(len(shuffled_questions) - 1, max(1, int(len(shuffled_questions) * VAL_RATIO)))
val_questions = shuffled_questions[:val_size]
train_questions = shuffled_questions[val_size:]

print(f"✓ Split data: {len(train_questions):,} train, {len(val_questions):,} val")

## 7. Model Initialization

In [None]:
# Load the question/answer tokenizers and the image processor named in the config
print("Loading tokenizers and processors...")

question_tokenizer = XLMRobertaTokenizer.from_pretrained(config['text_model'])
answer_tokenizer = T5Tokenizer.from_pretrained(config['decoder_model'], legacy=False)
clip_processor = CLIPProcessor.from_pretrained(config['vision_model'])

print("✓ Tokenizers and processors loaded")

# Smoke-test tokenization on the first training example
first_example = train_questions[0]
sample_question = first_example['question']
sample_answer = first_example['ground_truth']

print("\nTesting tokenization:")
print(f"  Question: '{sample_question}'")
print(f"  Answer: '{sample_answer}'")

# Both tokenizers are called with the same padding/truncation settings
tokenize_kwargs = dict(
    max_length=config['max_length'],
    truncation=True,
    padding='max_length',
    return_tensors='pt',
)
q_tokens = question_tokenizer(sample_question, **tokenize_kwargs)
a_tokens = answer_tokenizer(sample_answer, **tokenize_kwargs)

print(f"  Question tokens shape: {q_tokens['input_ids'].shape}")
print(f"  Answer tokens shape: {a_tokens['input_ids'].shape}")

In [None]:
# Build the train and validation datasets from the question splits
print("Creating datasets...")

try:
    # Positional arguments after the question list are the same for both splits
    train_dataset = VietnameseVQADataset(
        train_questions,
        *(config['image_dir'], question_tokenizer, answer_tokenizer,
          clip_processor, config['max_length'])
    )
    val_dataset = VietnameseVQADataset(
        val_questions,
        *(config['image_dir'], question_tokenizer, answer_tokenizer,
          clip_processor, config['max_length'])
    )

    print("✓ Datasets created successfully")
    print(f"  Train dataset size: {len(train_dataset)}")
    print(f"  Val dataset size: {len(val_dataset)}")

except Exception as e:
    # Report, then let the failure stop the notebook
    print(f"❌ Error creating datasets: {e}")
    print("This likely means the cxmt5 module is not available.")
    print("Please ensure your cxmt5 module is included in the Kaggle dataset.")
    raise

In [None]:
# Wrap both datasets in DataLoaders
print("Creating data loaders...")

# Options common to both loaders; worker-related settings default to "off"
worker_count = config.get('num_workers', 0)
loader_options = dict(
    batch_size=config['batch_size'],
    num_workers=worker_count,
    pin_memory=config.get('pin_memory', False),
    # persistent workers are only requested when there are worker processes
    persistent_workers=config.get('persistent_workers', False) and worker_count > 0,
)

# Training: shuffled, and by default the last incomplete batch is dropped
train_loader = DataLoader(
    train_dataset,
    shuffle=True,
    drop_last=config.get('dataloader_drop_last', True),
    **loader_options,
)

# Validation: fixed order, every sample is kept
val_loader = DataLoader(
    val_dataset,
    shuffle=False,
    drop_last=False,
    **loader_options,
)

print("✓ Data loaders created")
print(f"  Train batches: {len(train_loader)}")
print(f"  Val batches: {len(val_loader)}")
print(f"  Effective batch size: {config['batch_size'] * config.get('accumulation_steps', 1)}")

In [None]:
# Build the model, move it to the configured device and report its size
print("Initializing model...")

try:
    model = ImprovedVietnameseVQAModel(config)
    model = model.to(config['device'])
except Exception as e:
    print(f"❌ Error initializing model: {e}")
    print("This likely means the cxmt5 module is not available.")
    raise
else:
    print("✓ Model initialized successfully")

# Model statistics: one pass over the parameters, then aggregate
param_info = [(p.numel(), p.requires_grad) for p in model.parameters()]
total_params = sum(size for size, _ in param_info)
trainable_params = sum(size for size, trainable in param_info if trainable)

print("\nModel Statistics:")
print(f"  Total parameters: {total_params:,}")
print(f"  Trainable parameters: {trainable_params:,}")
print(f"  Frozen parameters: {total_params - trainable_params:,}")
print(f"  Trainable ratio: {trainable_params/total_params:.2%}")

# Memory usage (only reported when the configured device is exactly 'cuda')
if config['device'] == 'cuda':
    torch.cuda.empty_cache()
    memory_allocated = torch.cuda.memory_allocated() / 1e9
    memory_reserved = torch.cuda.memory_reserved() / 1e9
    print("\nGPU Memory:")
    print(f"  Allocated: {memory_allocated:.2f} GB")
    print(f"  Reserved: {memory_reserved:.2f} GB")

## 8. Model Testing

In [None]:
# Run one forward pass with answers supplied, as a smoke test
print("Testing model forward pass...")

try:
    # Get a test batch
    test_batch = next(iter(train_loader))

    # Move tensor entries to the device in place; other entries are left as they are
    for key in list(test_batch):
        if isinstance(test_batch[key], torch.Tensor):
            test_batch[key] = test_batch[key].to(config['device'])

    # Use at most two samples for memory efficiency
    batch_size = min(2, test_batch['pixel_values'].size(0))
    forward_inputs = (
        'pixel_values',
        'question_input_ids',
        'question_attention_mask',
        'answer_input_ids',
        'answer_attention_mask',
    )

    with torch.no_grad():
        outputs = model(**{name: test_batch[name][:batch_size] for name in forward_inputs})

        print("✓ Forward pass successful")
        print(f"  Loss: {outputs.loss.item():.4f}")
        print(f"  Logits shape: {outputs.logits.shape}")

except Exception as e:
    print(f"❌ Error in forward pass: {e}")
    import traceback
    traceback.print_exc()

    # Clean up memory
    if config['device'] == 'cuda':
        torch.cuda.empty_cache()
    raise

In [None]:
# Test inference mode: call the model without answer inputs and decode the result
print("Testing inference mode...")

try:
    with torch.no_grad():
        generated_ids = model(
            pixel_values=test_batch['pixel_values'][:1],
            question_input_ids=test_batch['question_input_ids'][:1],
            question_attention_mask=test_batch['question_attention_mask'][:1]
        )

        pred_text = model.decoder_tokenizer.decode(generated_ids[0], skip_special_tokens=True)

        # Cleaning is best-effort: fall back to a plain strip if it fails.
        # Catch Exception rather than everything so that KeyboardInterrupt
        # and SystemExit still stop the cell.
        try:
            clean_pred_text = model.clean_generated_text(pred_text)
        except Exception:
            clean_pred_text = pred_text.strip()

        print(f"✓ Inference successful")
        print(f"  Sample prediction (raw): '{pred_text}'")
        print(f"  Sample prediction (clean): '{clean_pred_text}'")
        print(f"  Sample ground truth: '{test_batch['answer_text'][0]}'")

except Exception as e:
    print(f"❌ Error in inference: {e}")
    import traceback
    traceback.print_exc()

    # Clean up memory
    if config['device'] == 'cuda':
        torch.cuda.empty_cache()
    raise

# Clean up test batch
del test_batch
gc.collect()
if config['device'] == 'cuda':
    torch.cuda.empty_cache()

## 9. Training Setup and Execution

In [None]:
# Create the trainer (same as main.py) and print the settings it will run with
print("Initializing VQA trainer...")

try:
    trainer = VQATrainer(
        model,
        train_loader,
        val_loader,
        torch.device(config['device']),
        config,
    )
except Exception as e:
    print(f"❌ Error initializing trainer: {e}")
    import traceback
    traceback.print_exc()
    raise
else:
    print("✓ Trainer initialized successfully")

# Display training configuration (same as main.py)
rule = '=' * 80
print(f"\n{rule}")
print("ENHANCED TRAINING CONFIGURATION")
print(rule)
print(f"Training for {config['num_epochs']} epochs with:")
print(f"  Decoder LR: {config['decoder_lr']:.2e}")
print(f"  Encoder LR: {config['encoder_lr']:.2e}")
print(f"  Vision LR: {config['vision_lr']:.2e}")
print(f"  Label smoothing: {config['label_smoothing']}")
print(f"  Dropout rate: {config['dropout_rate']}")
# Optional settings show their fallback value when absent from the config
print(f"  Warmup ratio: {config.get('warmup_ratio', 0.1)}")
print(f"  Data augmentation: {config.get('use_data_augmentation', False)}")
print(f"  Wandb logging: {config.get('use_wandb', False)}")
print(f"Dataset: {len(train_questions):,} train, {len(val_questions):,} val")
print(f"Batch size: {config['batch_size']}")
print(f"Device: {config['device']}")
print(rule)

In [None]:
# Start training (same style as main.py)
print(f"\n{'='*80}")
print(f"STARTING ENHANCED TRAINING")
print(f"{'='*80}")

start_time = time.time()

try:
    best_accuracy = trainer.train(config['num_epochs'])

    training_time = (time.time() - start_time) / 3600  # hours

    print(f"\n{'='*80}")
    print(f"TRAINING COMPLETED SUCCESSFULLY!")
    print(f"{'='*80}")
    print(f"Training time: {training_time:.2f} hours")
    print(f"Best fuzzy accuracy achieved: {best_accuracy:.4f}")
    print(f"Model and checkpoints saved in current directory")
    print(f"Predictions saved for analysis")

except KeyboardInterrupt:
    # Manual interruption: try to persist the trainer state before stopping
    print(f"\nTraining interrupted by user")
    print(f"Saving current state...")
    try:
        # The first argument is the global step divided by batches per epoch
        trainer.save_checkpoint(trainer.global_step // len(train_loader), {}, is_best=False)
        print(f"✓ Checkpoint saved")
    except Exception as save_error:
        print(f"❌ Error saving checkpoint: {save_error}")

except Exception as e:
    # Any other failure is reported but not re-raised, so later cells still run
    print(f"\nError during training: {e}")
    import traceback
    traceback.print_exc()

    # Try to save emergency checkpoint
    try:
        import os
        # torch.save does not create missing directories; make sure the
        # target exists so the rescue attempt cannot fail on a missing folder.
        os.makedirs(config['output_dir'], exist_ok=True)
        emergency_path = f"{config['output_dir']}/emergency_checkpoint.pt"
        torch.save({
            'model_state_dict': model.state_dict(),
            'config': config,
            'error': str(e)
        }, emergency_path)
        print(f"Emergency checkpoint saved to: {emergency_path}")
    except Exception as emergency_error:
        print(f"Failed to save emergency checkpoint: {emergency_error}")

finally:
    # Clean up memory
    if config['device'] == 'cuda':
        torch.cuda.empty_cache()
    gc.collect()

    total_time = (time.time() - start_time) / 3600
    print(f"\nTotal execution time: {total_time:.2f} hours")

## 10. Results and Analysis

In [None]:
# List what training left on disk and preview the predictions file
print("Checking saved files...")


def _iter_sizes_mb(directory):
    """Yield (entry name, size in MB) for every entry in *directory*."""
    for name in os.listdir(directory):
        yield name, os.path.getsize(os.path.join(directory, name)) / 1e6


# Output directory: record and print each entry
output_files = []
if os.path.exists(config['output_dir']):
    for name, size_mb in _iter_sizes_mb(config['output_dir']):
        output_files.append((name, size_mb))
        print(f"  {name}: {size_mb:.1f} MB")

# Checkpoint directory: print only
if os.path.exists(config['checkpoint_dir']):
    print(f"\nCheckpoints in {config['checkpoint_dir']}:")
    for name, size_mb in _iter_sizes_mb(config['checkpoint_dir']):
        print(f"  {name}: {size_mb:.1f} MB")

# Check for predictions file
predictions_file = f"{config['output_dir']}/predictions.json"
if not os.path.exists(predictions_file):
    print("\n⚠️  No predictions file found")
else:
    print(f"\n✓ Predictions file found: {predictions_file}")

    # Load and show up to three sample predictions; problems are reported, not raised
    try:
        import json
        with open(predictions_file, 'r', encoding='utf-8') as f:
            predictions = json.load(f)

        print(f"  Total predictions: {len(predictions)}")

        if predictions:
            print("\nSample predictions:")
            for number, pred in enumerate(predictions[:3], start=1):
                print(f"  {number}. Question: {pred.get('question', 'N/A')[:100]}...")
                print(f"     Prediction: {pred.get('prediction', 'N/A')}")
                print(f"     Ground Truth: {pred.get('ground_truth', 'N/A')}")
                print(f"     Correct: {pred.get('correct', 'N/A')}")
                print()
    except Exception as e:
        print(f"  Error loading predictions: {e}")

In [None]:
# Final memory cleanup and summary
print("\nCleaning up memory...")

# Drop the large objects one by one. Removing each name independently means a
# name that was never created (for example after an earlier cell failed) does
# not prevent the remaining objects from being released.
for _name in ('model', 'trainer', 'train_loader', 'val_loader',
              'train_dataset', 'val_dataset'):
    globals().pop(_name, None)

gc.collect()

if config['device'] == 'cuda':
    torch.cuda.empty_cache()
    final_memory = torch.cuda.memory_allocated() / 1e9
    print(f"Final GPU memory: {final_memory:.2f} GB")

print("\n🏁 Notebook execution completed!")
print(f"Results saved to: {config['output_dir']}")
print("You can now download the model and checkpoints from Kaggle.")