In [24]:
import logging
import os
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    default_data_collator,
)
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

# Configure process-wide logging (timestamp - level - message at INFO and
# above) and create the logger shared by every function in this notebook.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

def compute_metrics(eval_pred):
    """Compute accuracy and F1 from a ``(logits, labels)`` pair.

    Args:
        eval_pred: Two-item iterable ``(logits, labels)``. The predicted
            class is the arg-max of ``logits`` over the last axis.

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)

    accuracy = accuracy_score(gold, predicted)
    f1_value = f1_score(gold, predicted)

    logger.info(f"Evaluation metrics - Accuracy: {accuracy:.4f}, F1: {f1_value:.4f}")
    return {"accuracy": accuracy, "f1": f1_value}

def tokenize_function(examples, tokenizer, max_length=256):
    """Tokenize a batch of question/passage pairs and attach integer labels.

    Args:
        examples: Batch mapping with the columns ``"question"``,
            ``"passage"`` and ``"answer"`` (each a sequence of equal length).
        tokenizer: Callable invoked as
            ``tokenizer(questions, passages, truncation=..., padding=...,
            max_length=...)`` that returns a mutable mapping.
        max_length: Length every encoded pair is truncated and padded to.
            Defaults to 256, the value that used to be hard-coded.

    Returns:
        The mapping returned by ``tokenizer`` with an added ``"labels"``
        entry holding ``int(answer)`` for each example.
    """
    tokenized = tokenizer(
        examples["question"],
        examples["passage"],
        truncation=True,
        # Fixed-length padding keeps every example the same size, so a
        # non-padding collator can stack them directly.
        padding="max_length",
        max_length=max_length,
    )
    # presumably answers are booleans (BoolQ); int() maps them to 0/1
    tokenized["labels"] = [int(answer) for answer in examples["answer"]]
    return tokenized

class CustomTrainer(Trainer):
    """Trainer that logs a one-line summary after every evaluation."""

    def log(self, logs, start_time=None, **kwargs):
        """Log an evaluation summary, then delegate to ``Trainer.log``.

        Args:
            logs: Dict of metric names to values for this logging event.
            start_time: Optional value forwarded to ``Trainer.log`` when given.
            **kwargs: Forwarded unchanged to ``Trainer.log``.

        Returns:
            Whatever ``Trainer.log`` returns.
        """
        # Evaluation log dicts do not carry the training loss, so remember the
        # latest one from the training-step logs to report it alongside the
        # validation metrics.
        if "loss" in logs:
            self._last_train_loss = logs["loss"]

        if "eval_accuracy" in logs:
            # NOTE(review): the base Trainer.log appears to be what adds
            # "epoch" to `logs`, so it is usually absent at this point; fall
            # back to the trainer state instead of always reporting 0.
            epoch = logs.get("epoch")
            if epoch is None:
                epoch = getattr(getattr(self, "state", None), "epoch", None)
            epoch_text = f"{epoch:.2f}" if epoch is not None else "n/a"

            train_loss = logs.get("loss", getattr(self, "_last_train_loss", None))
            loss_text = f"{train_loss:.4f}" if train_loss is not None else "n/a"

            eval_loss = logs.get("eval_loss", 0.0)
            eval_accuracy = logs.get("eval_accuracy", 0.0)
            eval_f1 = logs.get("eval_f1", 0.0)

            logger.info(
                f"Epoch {epoch_text} - "
                f"Training Loss: {loss_text} - "
                f"Validation Loss: {eval_loss:.4f} - "
                f"Validation Accuracy: {eval_accuracy*100:.2f}% - "
                f"Validation F1: {eval_f1:.4f}"
            )

        # Only forward start_time when the caller supplied it, so this
        # override also works with Trainer.log signatures that accept `logs`
        # alone.
        if start_time is None:
            return super().log(logs, **kwargs)
        return super().log(logs, start_time, **kwargs)

def main():
    """Fine-tune TinyBERT on BoolQ and save the resulting model.

    Loads the ``boolq`` dataset, tokenizes it with ``tokenize_function``,
    trains a two-label sequence classifier for five epochs with evaluation
    after each epoch, runs one final evaluation, and writes the model and
    tokenizer to ``./best_checkpoint``.

    Returns:
        Tuple ``(trainer, final_metrics, train_result)``: the trainer
        instance, the result of the final ``trainer.evaluate()`` call and the
        result of ``trainer.train()``.
    """
    logger.info("Starting training script")

    # Raw BoolQ splits.
    raw_datasets = load_dataset("boolq")

    # Pretrained TinyBERT weights with a two-way classification head.
    checkpoint_name = "huawei-noah/TinyBERT_General_4L_312D"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        checkpoint_name,
        num_labels=2,
    )
    model.config.problem_type = "single_label_classification"

    def _encode_batch(batch):
        # Bind the tokenizer so dataset.map only has to pass the batch.
        return tokenize_function(batch, tokenizer)

    logger.info("Tokenizing datasets...")
    encoded = raw_datasets.map(
        _encode_batch,
        batched=True,
        remove_columns=["question", "passage", "answer"],
    )

    train_dataset, eval_dataset = encoded["train"], encoded["validation"]
    logger.info(f"Training samples: {len(train_dataset)}")
    logger.info(f"Validation samples: {len(eval_dataset)}")

    # Hyperparameters: evaluate and checkpoint once per epoch, keep a single
    # checkpoint on disk, and reload the best one (by accuracy) at the end.
    hyperparameters = {
        "output_dir": "./results",
        "evaluation_strategy": "epoch",
        "save_strategy": "epoch",
        "logging_strategy": "steps",
        "logging_steps": 50,
        "learning_rate": 2e-5,
        "per_device_train_batch_size": 16,
        "per_device_eval_batch_size": 16,
        "num_train_epochs": 5,
        "weight_decay": 0.01,
        "push_to_hub": False,
        "load_best_model_at_end": True,
        "metric_for_best_model": "accuracy",
        "save_total_limit": 1,
    }
    training_args = TrainingArguments(**hyperparameters)

    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        data_collator=default_data_collator,
        compute_metrics=compute_metrics,
    )

    logger.info("Starting training...")
    train_result = trainer.train()

    logger.info("Performing final evaluation...")
    final_metrics = trainer.evaluate()
    logger.info(f"Final evaluation metrics: {final_metrics}")

    # Persist model and tokenizer side by side so the directory can be
    # reloaded on its own.
    logger.info("Saving final model...")
    export_dir = "./best_checkpoint"
    trainer.save_model(export_dir)
    tokenizer.save_pretrained(export_dir)

    logger.info("Training completed!")

    return trainer, final_metrics, train_result

In [None]:
# Run the full fine-tuning pipeline and keep the trainer, the final evaluation
# metrics and the training result available for the following cells.
trainer, final_metrics, train_result = main()

In [None]:
def evaluate_with_trainer(checkpoint_dir):
    """Evaluate a saved checkpoint on the BoolQ validation split via ``CustomTrainer``.

    The tokenizer is loaded from the base TinyBERT model name, the model
    weights from ``checkpoint_dir``; tokenization reuses ``tokenize_function``.

    Args:
        checkpoint_dir: Path passed to
            ``AutoModelForSequenceClassification.from_pretrained``.

    Returns:
        The metrics returned by ``trainer.evaluate()``.
    """
    logger.info(f"Evaluating checkpoint from {checkpoint_dir}")

    # Tokenizer comes from the base model name, not from the checkpoint.
    base_model_name = 'huawei-noah/TinyBERT_General_4L_312D'
    tokenizer = AutoTokenizer.from_pretrained(base_model_name)

    logger.info("Loading model...")
    model = AutoModelForSequenceClassification.from_pretrained(
        checkpoint_dir,
        num_labels=2,
        problem_type="single_label_classification",
    )

    # Record what was actually loaded.
    logger.info(f"Model config: {model.config}")
    parameter_count = sum(param.numel() for param in model.parameters())
    logger.info(f"Total parameters: {parameter_count:,}")

    # Validation split only.
    validation_split = load_dataset("boolq")["validation"]

    def _encode_batch(batch):
        # Bind the tokenizer so dataset.map only has to pass the batch.
        return tokenize_function(batch, tokenizer)

    logger.info("Tokenizing dataset...")
    tokenized_eval = validation_split.map(
        _encode_batch,
        batched=True,
        remove_columns=["question", "passage", "answer"],
    )

    # Evaluation-only arguments; batch size of 16 for evaluation.
    eval_args = TrainingArguments(
        output_dir="./eval_results",
        per_device_eval_batch_size=16,
        remove_unused_columns=True,
    )

    evaluator = CustomTrainer(
        model=model,
        args=eval_args,
        eval_dataset=tokenized_eval,
        tokenizer=tokenizer,
        data_collator=default_data_collator,
        compute_metrics=compute_metrics,
    )

    logger.info("Starting evaluation...")
    metrics = evaluator.evaluate()

    logger.info(f"Evaluation metrics: {metrics}")
    return metrics

# Evaluate the model stored in ./best_checkpoint through the Trainer-based
# evaluation path and keep the resulting metrics.
metrics = evaluate_with_trainer('./best_checkpoint')

In [29]:
import os

import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# Use the same data collator as training
from transformers import default_data_collator, AutoConfig


def evaluate(checkpoint_dir):
    """Evaluate a saved checkpoint on the BoolQ validation split with a plain PyTorch loop.

    Prints the accuracy (as a percentage) and F1 score, and writes the
    accuracy to ``accuracy.txt`` in the current working directory.

    Relies on ``logger``, ``tokenize_function``, ``accuracy_score`` and
    ``f1_score`` defined or imported earlier in this file.

    Args:
        checkpoint_dir: Directory holding the saved model; it must contain a
            ``config.json``.

    Raises:
        FileNotFoundError: If ``checkpoint_dir`` is not an existing directory
            or has no ``config.json``.
        ValueError: If the loaded model is not a ``bert`` model with hidden
            size 312.
    """
    # Validate the checkpoint first so a bad path fails immediately rather
    # than after the dataset has been loaded and tokenized.
    if not os.path.isdir(checkpoint_dir):
        raise FileNotFoundError(
            f'Checkpoint directory not found: {checkpoint_dir}'
        )

    if not os.path.exists(os.path.join(checkpoint_dir, 'config.json')):
        raise FileNotFoundError(
            f'Invalid checkpoint: config.json not found in {checkpoint_dir}'
        )

    # Evaluation always runs on CPU.
    device = torch.device('cpu')

    # The tokenizer comes from the base model name, not from the checkpoint.
    model_name = 'huawei-noah/TinyBERT_General_4L_312D'
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    dataset = load_dataset("boolq")
    eval_dataset = dataset["validation"]

    # Reuse the shared tokenization function.
    logger.info("Tokenizing dataset...")
    val_dataset = eval_dataset.map(
        lambda examples: tokenize_function(examples, tokenizer),
        batched=True,
        remove_columns=["question", "passage", "answer"]
    )

    # Fixed order so predictions and labels line up across runs.
    val_loader = DataLoader(
        val_dataset,
        batch_size=16,
        shuffle=False,
        collate_fn=default_data_collator
    )

    # Load the config first so the label count and problem type are pinned.
    config = AutoConfig.from_pretrained(
        checkpoint_dir,
        num_labels=2,
        problem_type="single_label_classification"
    )

    model = AutoModelForSequenceClassification.from_pretrained(
        checkpoint_dir,
        config=config
    )

    # Guard against evaluating a checkpoint of a different architecture.
    if model.config.model_type != 'bert' or model.config.hidden_size != 312:
        err_msg = (
            'Model architecture mismatch: '
            'Expected TinyBERT with hidden size 312'
        )
        raise ValueError(err_msg)

    model = model.to(device)
    model.eval()

    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for batch in val_loader:
            # Keep inputs on the same device as the model.
            batch = {key: value.to(device) for key, value in batch.items()}
            outputs = model(**batch)
            predictions = torch.argmax(outputs.logits, dim=-1)

            all_predictions.extend(predictions.tolist())
            all_labels.extend(batch['labels'].tolist())

    # Same sklearn metrics as compute_metrics; accuracy is scaled to percent.
    accuracy = accuracy_score(all_labels, all_predictions) * 100
    f1 = f1_score(all_labels, all_predictions)

    print(f'Accuracy: {accuracy:.2f}')
    print(f'F1 Score: {f1:.4f}')
    # NOTE(review): accuracy is printed a second time; kept in case something
    # parses the final output line - confirm whether this is intentional.
    print(f'Accuracy: {accuracy:.2f}')

    # Write result to file
    with open('accuracy.txt', 'w', encoding='utf-8') as f:
        f.write(f'Accuracy: {accuracy:.2f}')

In [None]:
# Run the manual PyTorch evaluation loop on the saved checkpoint directory;
# it prints the metrics and writes accuracy.txt.
evaluate('best_checkpoint')