In [1]:
%%capture
import os, re
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    import torch; v = re.match(r"[0-9\.]{3,}", str(torch.__version__)).group(0)
    xformers = "xformers==" + ("0.0.32.post2" if v == "2.8.0" else "0.0.29.post3")
    !pip install --no-deps bitsandbytes accelerate {xformers} peft triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets==3.5.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth==2025.9.9
!pip install transformers==4.55.4
!pip install --no-deps trl==0.22.2 #major upgrade in trl (0.23.0) broke the trl patching

In [2]:
# !pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

In [3]:
from unsloth import FastLanguageModel, FastModel
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import os
import torch
from torch import tensor
import torch.nn.functional as F
from transformers import TrainingArguments, Trainer, ModernBertModel, AutoModelForSequenceClassification, training_args
from datasets import load_dataset, Dataset
from tqdm import tqdm

model_name = 'answerdotai/ModernBERT-base'

NUM_CLASSES = 3
DATA_DIR = "data/"

%env UNSLOTH_DISABLE_FAST_GENERATION = 1

model, tokenizer = FastModel.from_pretrained(
    model_name = model_name,
    load_in_4bit = False,
    max_seq_length = 2048,
    dtype = None,
    auto_model = AutoModelForSequenceClassification,
    num_labels = NUM_CLASSES,
    full_finetuning=True,
)
print("model parameters:" + str(sum(p.numel() for p in model.parameters())))

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
env: UNSLOTH_DISABLE_FAST_GENERATION=1
==((====))==  Unsloth 2025.9.9: Fast Modernbert patching. Transformers: 4.55.4.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Float16 full finetuning uses more memory since we upcast weights to float32.


model.safetensors:   0%|          | 0.00/599M [00:00<?, ?B/s]

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

model parameters:149607171


# Experiment time!!

In [4]:
import numpy as np
from sklearn.model_selection import KFold
from datasets import load_dataset
import torch
from torch.utils.data import Subset

# Load the dataset
fpb = load_dataset("financial_phrasebank", "sentences_50agree", trust_remote_code=True)['train']
fpb = fpb.map(lambda ex: {
    "text":   ex["sentence"],
    "labels":  np.eye(NUM_CLASSES)[ex["label"]],
},remove_columns=["sentence","label"])

def create_cross_validation_splits(dataset, n_splits=10, random_state=42):
    """
    Create 10 random 90/10 train/test splits as done in the FinBERT paper
    """
    # Get indices
    indices = np.arange(len(dataset))

    # Store splits
    cv_splits = []

    # Create 10 different random splits
    for i in range(n_splits):
        # Set seed for reproducibility
        np.random.seed(random_state + i)

        # Shuffle indices
        shuffled_indices = np.random.permutation(indices)

        # Calculate split point (90% train, 10% test)
        split_point = int(0.9 * len(indices))

        train_indices = shuffled_indices[:split_point]
        test_indices = shuffled_indices[split_point:]

        cv_splits.append({
            'train': train_indices.tolist(),
            'test': test_indices.tolist()
        })

    return cv_splits

# Create the 10 splits
cv_splits = create_cross_validation_splits(fpb, n_splits=10)


README.md: 0.00B [00:00, ?B/s]

financial_phrasebank.py: 0.00B [00:00, ?B/s]

data/FinancialPhraseBank-v1.0.zip:   0%|          | 0.00/682k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4846 [00:00<?, ? examples/s]

Map:   0%|          | 0/4846 [00:00<?, ? examples/s]

In [5]:
def tokenize_function(examples): return tokenizer(examples['text'])

In [6]:
# Function to run cross-validation with your trainer
def run_cross_validation(dataset, cv_splits, your_trainer_function):
    """
    Run cross-validation and return accuracies

    Args:
        dataset: The full dataset
        cv_splits: List of train/test index splits
        your_trainer_function: Your custom trainer function that takes
                              train_dataset and test_dataset and returns accuracy
    """
    accuracies = []

    for fold_idx, split in enumerate(cv_splits):
        print(f"\n=== Fold {fold_idx + 1}/10 ===")

        # Create train and test datasets for this fold
        train_dataset = dataset.select(split['train'])
        test_dataset = dataset.select(split['test'])

        train_dataset = train_dataset.map(tokenize_function, batched=True)
        test_dataset = test_dataset.map(tokenize_function, batched=True)

        print(f"Train size: {len(train_dataset)}, Test size: {len(test_dataset)}")

        # Train and evaluate using your trainer function
        # Replace this with your actual trainer function call
        accuracy = your_trainer_function(
            train_dataset=train_dataset,
            test_dataset=test_dataset
        )

        accuracies.append(accuracy)
        print(f"Fold {fold_idx + 1} Accuracy: {accuracy:.4f}")

    # Calculate mean and std
    mean_accuracy = np.mean(accuracies)
    std_accuracy = np.std(accuracies)

    print(f"\n=== Cross-Validation Results ===")
    print(f"Mean Accuracy: {mean_accuracy:.4f} (+/- {std_accuracy:.4f})")
    print(f"Individual fold accuracies: {accuracies}")

    return mean_accuracy, std_accuracy, accuracies

In [None]:
N_EPOCHS = 3

In [8]:
def custom_trainer(train_dataset, test_dataset, N_EPOCHS = N_EPOCHS):

  trainer = Trainer(
      model=model,
      tokenizer=tokenizer,
      train_dataset=train_dataset,
      eval_dataset=test_dataset,
      args=TrainingArguments(
          per_device_train_batch_size=32,
          gradient_accumulation_steps=1, #default was 1 but lets make it more stable with 4
          warmup_steps=10,
          fp16=not torch.cuda.is_bf16_supported(),
          bf16=torch.cuda.is_bf16_supported(),
          optim=training_args.OptimizerNames.ADAMW_TORCH,
          learning_rate=5e-5,
          weight_decay=0.001,
          lr_scheduler_type="cosine",
          seed=3407,

          num_train_epochs=N_EPOCHS, #started at 10 but eval accuracy seemed to peak halfway, so going to 5.

          save_strategy="epoch",
          group_by_length=False,
          eval_strategy="epoch",
          logging_strategy="epoch",
      ),
      compute_metrics=lambda eval_pred: { "accuracy": accuracy_score(eval_pred[1].argmax(axis=-1), eval_pred[0].argmax(axis=-1)) }
  )

  trainer_stats = trainer.train()

  from transformers.trainer_utils import get_last_checkpoint
  output_dir = "trainer_output"
  last_checkpoint = get_last_checkpoint(output_dir)
  # print("Last checkpoint:", last_checkpoint)

  import json

  # Option 1: Use json.load() with an open file
  with open(f"/content/{last_checkpoint}/trainer_state.json", 'r') as f:
      trainer_state = json.load(f)

  log_history = trainer_state['log_history']
  hist = [x['eval_accuracy'] for x in log_history if 'eval_accuracy' in x]
  # print(hist)
  best_acc = max(hist)

  return best_acc

In [9]:
from datasets import Dataset, concatenate_datasets

In [10]:
# Run the cross-validation
mean_acc, std_acc, all_accs = run_cross_validation(
    dataset=fpb,
    cv_splits=cv_splits,
    your_trainer_function=custom_trainer
)


=== Fold 1/10 ===


Map:   0%|          | 0/4361 [00:00<?, ? examples/s]

Map:   0%|          | 0/485 [00:00<?, ? examples/s]

Train size: 4361, Test size: 485


  trainer = Trainer(
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 4,361 | Num Epochs = 3 | Total steps = 411
O^O/ \_/ \    Batch size per device = 32 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (32 x 1 x 1) = 32
 "-____-"     Trainable parameters = 149,607,171 of 149,607,171 (100.00% trained)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3605,0.252847,0.85567
2,0.1772,0.212875,0.876289
3,0.0668,0.238132,0.868041


Unsloth: Will smartly offload gradients to save VRAM!
Fold 1 Accuracy: 0.8763

=== Fold 2/10 ===


Map:   0%|          | 0/4361 [00:00<?, ? examples/s]

Map:   0%|          | 0/485 [00:00<?, ? examples/s]

Train size: 4361, Test size: 485


  trainer = Trainer(
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 4,361 | Num Epochs = 3 | Total steps = 411
O^O/ \_/ \    Batch size per device = 32 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (32 x 1 x 1) = 32
 "-____-"     Trainable parameters = 149,607,171 of 149,607,171 (100.00% trained)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1169,0.123962,0.927835
2,0.0376,0.168989,0.938144
3,0.0089,0.19684,0.942268


Fold 2 Accuracy: 0.9423

=== Fold 3/10 ===


Map:   0%|          | 0/4361 [00:00<?, ? examples/s]

Map:   0%|          | 0/485 [00:00<?, ? examples/s]

Train size: 4361, Test size: 485


  trainer = Trainer(
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 4,361 | Num Epochs = 3 | Total steps = 411
O^O/ \_/ \    Batch size per device = 32 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (32 x 1 x 1) = 32
 "-____-"     Trainable parameters = 149,607,171 of 149,607,171 (100.00% trained)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.0577,0.106717,0.960825
2,0.0215,0.060595,0.981443
3,0.0099,0.057293,0.979381


Fold 3 Accuracy: 0.9814

=== Fold 4/10 ===


Map:   0%|          | 0/4361 [00:00<?, ? examples/s]

Map:   0%|          | 0/485 [00:00<?, ? examples/s]

Train size: 4361, Test size: 485


  trainer = Trainer(
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 4,361 | Num Epochs = 3 | Total steps = 411
O^O/ \_/ \    Batch size per device = 32 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (32 x 1 x 1) = 32
 "-____-"     Trainable parameters = 149,607,171 of 149,607,171 (100.00% trained)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.028,0.036794,0.991753
2,0.0143,0.037555,0.991753
3,0.0031,0.039304,0.991753


Fold 4 Accuracy: 0.9918

=== Fold 5/10 ===


Map:   0%|          | 0/4361 [00:00<?, ? examples/s]

Map:   0%|          | 0/485 [00:00<?, ? examples/s]

Train size: 4361, Test size: 485


  trainer = Trainer(
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 4,361 | Num Epochs = 3 | Total steps = 411
O^O/ \_/ \    Batch size per device = 32 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (32 x 1 x 1) = 32
 "-____-"     Trainable parameters = 149,607,171 of 149,607,171 (100.00% trained)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.0451,0.068341,0.981443
2,0.0185,0.026824,0.995876
3,0.003,0.015997,0.997938


Fold 5 Accuracy: 0.9979

=== Fold 6/10 ===


Map:   0%|          | 0/4361 [00:00<?, ? examples/s]

Map:   0%|          | 0/485 [00:00<?, ? examples/s]

Train size: 4361, Test size: 485


  trainer = Trainer(
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 4,361 | Num Epochs = 3 | Total steps = 411
O^O/ \_/ \    Batch size per device = 32 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (32 x 1 x 1) = 32
 "-____-"     Trainable parameters = 149,607,171 of 149,607,171 (100.00% trained)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.0235,0.026426,0.987629
2,0.011,0.017574,0.991753
3,0.0042,0.02367,0.993814


Fold 6 Accuracy: 0.9938

=== Fold 7/10 ===


Map:   0%|          | 0/4361 [00:00<?, ? examples/s]

Map:   0%|          | 0/485 [00:00<?, ? examples/s]

Train size: 4361, Test size: 485


  trainer = Trainer(
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 4,361 | Num Epochs = 3 | Total steps = 411
O^O/ \_/ \    Batch size per device = 32 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (32 x 1 x 1) = 32
 "-____-"     Trainable parameters = 149,607,171 of 149,607,171 (100.00% trained)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.0272,0.035083,0.987629
2,0.0087,0.010099,0.997938
3,0.0032,0.010362,0.995876


Fold 7 Accuracy: 0.9979

=== Fold 8/10 ===


Map:   0%|          | 0/4361 [00:00<?, ? examples/s]

Map:   0%|          | 0/485 [00:00<?, ? examples/s]

Train size: 4361, Test size: 485


  trainer = Trainer(
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 4,361 | Num Epochs = 3 | Total steps = 411
O^O/ \_/ \    Batch size per device = 32 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (32 x 1 x 1) = 32
 "-____-"     Trainable parameters = 149,607,171 of 149,607,171 (100.00% trained)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.0239,0.020048,0.987629
2,0.0108,0.034603,0.987629
3,0.0026,0.022145,0.989691


Epoch,Training Loss,Validation Loss,Accuracy
1,0.0239,0.020048,0.987629
2,0.0108,0.034603,0.987629
3,0.0026,0.022145,0.989691


Fold 8 Accuracy: 0.9897

=== Fold 9/10 ===


Map:   0%|          | 0/4361 [00:00<?, ? examples/s]

Map:   0%|          | 0/485 [00:00<?, ? examples/s]

Train size: 4361, Test size: 485


  trainer = Trainer(
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 4,361 | Num Epochs = 3 | Total steps = 411
O^O/ \_/ \    Batch size per device = 32 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (32 x 1 x 1) = 32
 "-____-"     Trainable parameters = 149,607,171 of 149,607,171 (100.00% trained)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.0206,0.025595,0.989691
2,0.0056,0.009986,0.995876
3,0.0014,0.010179,0.995876


Fold 9 Accuracy: 0.9959

=== Fold 10/10 ===


Map:   0%|          | 0/4361 [00:00<?, ? examples/s]

Map:   0%|          | 0/485 [00:00<?, ? examples/s]

Train size: 4361, Test size: 485


  trainer = Trainer(
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 4,361 | Num Epochs = 3 | Total steps = 411
O^O/ \_/ \    Batch size per device = 32 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (32 x 1 x 1) = 32
 "-____-"     Trainable parameters = 149,607,171 of 149,607,171 (100.00% trained)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.0151,0.010213,0.993814
2,0.0064,0.008269,0.995876
3,0.0014,0.0072,0.995876


Fold 10 Accuracy: 0.9959

=== Cross-Validation Results ===
Mean Accuracy: 0.9763 (+/- 0.0369)
Individual fold accuracies: [0.8762886597938144, 0.9422680412371134, 0.9814432989690721, 0.9917525773195877, 0.9979381443298969, 0.9938144329896907, 0.9979381443298969, 0.9896907216494846, 0.9958762886597938, 0.9958762886597938]


In [None]:
 
=== Fold 1/10 ===
Map: 100% 4361/4361 [00:01<00:00, 3901.07 examples/s]Map: 100% 485/485 [00:00<00:00, 2741.66 examples/s]Train size: 4361, Test size: 485
/tmp/ipython-input-861077650.py:3: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Trainer.__init__`. Use `processing_class` instead.
  trainer = Trainer(
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 4,361 | Num Epochs = 3 | Total steps = 411
O^O/ \_/ \    Batch size per device = 32 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (32 x 1 x 1) = 32
 "-____-"     Trainable parameters = 149,607,171 of 149,607,171 (100.00% trained)

    
      
      
      [411/411 03:15, Epoch 3/3]
    
    
  
 
      Epoch
      Training Loss
      Validation Loss
      Accuracy
    
  
  
    
      1
      0.360500
      0.252847
      0.855670
    
    
      2
      0.177200
      0.212875
      0.876289
    
    
      3
      0.066800
      0.238132
      0.868041
    
  
Unsloth: Will smartly offload gradients to save VRAM!
Fold 1 Accuracy: 0.8763

=== Fold 2/10 ===
Map: 100% 4361/4361 [00:01<00:00, 2941.07 examples/s]Map: 100% 485/485 [00:00<00:00, 2336.60 examples/s]Train size: 4361, Test size: 485
/tmp/ipython-input-861077650.py:3: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Trainer.__init__`. Use `processing_class` instead.
  trainer = Trainer(
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 4,361 | Num Epochs = 3 | Total steps = 411
O^O/ \_/ \    Batch size per device = 32 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (32 x 1 x 1) = 32
 "-____-"     Trainable parameters = 149,607,171 of 149,607,171 (100.00% trained)

    
      
      
      [411/411 03:14, Epoch 3/3]
    
    
  
 
      Epoch
      Training Loss
      Validation Loss
      Accuracy
    
  
  
    
      1
      0.116900
      0.123962
      0.927835
    
    
      2
      0.037600
      0.168989
      0.938144
    
    
      3
      0.008900
      0.196840
      0.942268
    
  
Fold 2 Accuracy: 0.9423

=== Fold 3/10 ===
Map: 100% 4361/4361 [00:00<00:00, 5218.39 examples/s]Map: 100% 485/485 [00:00<00:00, 4322.60 examples/s]Train size: 4361, Test size: 485
/tmp/ipython-input-861077650.py:3: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Trainer.__init__`. Use `processing_class` instead.
  trainer = Trainer(
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 4,361 | Num Epochs = 3 | Total steps = 411
O^O/ \_/ \    Batch size per device = 32 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (32 x 1 x 1) = 32
 "-____-"     Trainable parameters = 149,607,171 of 149,607,171 (100.00% trained)

    
      
      
      [411/411 05:48, Epoch 3/3]
    
    
  
 
      Epoch
      Training Loss
      Validation Loss
      Accuracy
    
  
  
    
      1
      0.057700
      0.106717
      0.960825
    
    
      2
      0.021500
      0.060595
      0.981443
    
    
      3
      0.009900
      0.057293
      0.979381
    
  
Fold 3 Accuracy: 0.9814

=== Fold 4/10 ===
Map: 100% 4361/4361 [00:00<00:00, 9182.45 examples/s]Map: 100% 485/485 [00:00<00:00, 6778.24 examples/s]Train size: 4361, Test size: 485
/tmp/ipython-input-861077650.py:3: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Trainer.__init__`. Use `processing_class` instead.
  trainer = Trainer(
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 4,361 | Num Epochs = 3 | Total steps = 411
O^O/ \_/ \    Batch size per device = 32 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (32 x 1 x 1) = 32
 "-____-"     Trainable parameters = 149,607,171 of 149,607,171 (100.00% trained)

    
      
      
      [411/411 06:56, Epoch 3/3]
    
    
  
 
      Epoch
      Training Loss
      Validation Loss
      Accuracy
    
  
  
    
      1
      0.028000
      0.036794
      0.991753
    
    
      2
      0.014300
      0.037555
      0.991753
    
    
      3
      0.003100
      0.039304
      0.991753
    
  
Fold 4 Accuracy: 0.9918

=== Fold 5/10 ===
Map: 100% 4361/4361 [00:00<00:00, 5942.70 examples/s]Map: 100% 485/485 [00:00<00:00, 4241.45 examples/s]Train size: 4361, Test size: 485
/tmp/ipython-input-861077650.py:3: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Trainer.__init__`. Use `processing_class` instead.
  trainer = Trainer(
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 4,361 | Num Epochs = 3 | Total steps = 411
O^O/ \_/ \    Batch size per device = 32 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (32 x 1 x 1) = 32
 "-____-"     Trainable parameters = 149,607,171 of 149,607,171 (100.00% trained)

    
      
      
      [411/411 05:38, Epoch 3/3]
    
    
  
 
      Epoch
      Training Loss
      Validation Loss
      Accuracy
    
  
  
    
      1
      0.045100
      0.068341
      0.981443
    
    
      2
      0.018500
      0.026824
      0.995876
    
    
      3
      0.003000
      0.015997
      0.997938
    
  
Fold 5 Accuracy: 0.9979

=== Fold 6/10 ===
Map: 100% 4361/4361 [00:00<00:00, 8338.85 examples/s]Map: 100% 485/485 [00:00<00:00, 6366.48 examples/s]Train size: 4361, Test size: 485
/tmp/ipython-input-861077650.py:3: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Trainer.__init__`. Use `processing_class` instead.
  trainer = Trainer(
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 4,361 | Num Epochs = 3 | Total steps = 411
O^O/ \_/ \    Batch size per device = 32 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (32 x 1 x 1) = 32
 "-____-"     Trainable parameters = 149,607,171 of 149,607,171 (100.00% trained)

    
      
      
      [411/411 06:02, Epoch 3/3]
    
    
  
 
      Epoch
      Training Loss
      Validation Loss
      Accuracy
    
  
  
    
      1
      0.023500
      0.026426
      0.987629
    
    
      2
      0.011000
      0.017574
      0.991753
    
    
      3
      0.004200
      0.023670
      0.993814
    
  
Fold 6 Accuracy: 0.9938

=== Fold 7/10 ===
Map: 100% 4361/4361 [00:00<00:00, 8957.90 examples/s]Map: 100% 485/485 [00:00<00:00, 6343.04 examples/s]Train size: 4361, Test size: 485
/tmp/ipython-input-861077650.py:3: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Trainer.__init__`. Use `processing_class` instead.
  trainer = Trainer(
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 4,361 | Num Epochs = 3 | Total steps = 411
O^O/ \_/ \    Batch size per device = 32 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (32 x 1 x 1) = 32
 "-____-"     Trainable parameters = 149,607,171 of 149,607,171 (100.00% trained)

    
      
      
      [411/411 05:38, Epoch 3/3]
    
    
  
 
      Epoch
      Training Loss
      Validation Loss
      Accuracy
    
  
  
    
      1
      0.027200
      0.035083
      0.987629
    
    
      2
      0.008700
      0.010099
      0.997938
    
    
      3
      0.003200
      0.010362
      0.995876
    
  
Fold 7 Accuracy: 0.9979

=== Fold 8/10 ===
Map:   0%|          | 0/4361 [00:00<?, ? examples/s]Map:   0%|          | 0/485 [00:00<?, ? examples/s]Train size: 4361, Test size: 485
/tmp/ipython-input-861077650.py:3: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Trainer.__init__`. Use `processing_class` instead.
  trainer = Trainer(
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 4,361 | Num Epochs = 3 | Total steps = 411
O^O/ \_/ \    Batch size per device = 32 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (32 x 1 x 1) = 32
 "-____-"     Trainable parameters = 149,607,171 of 149,607,171 (100.00% trained)

    
      
      
      [411/411 03:59, Epoch 3/3]
    
    
  
 
      Epoch
      Training Loss
      Validation Loss
      Accuracy
    
  
  
    
      1
      0.023900
      0.020048
      0.987629
    
    
      2
      0.010800
      0.034603
      0.987629
    
    
      3
      0.002600
      0.022145
      0.989691
    
  

    
      
      
      [411/411 04:31, Epoch 3/3]
    
    
  
 
      Epoch
      Training Loss
      Validation Loss
      Accuracy
    
  
  
    
      1
      0.023900
      0.020048
      0.987629
    
    
      2
      0.010800
      0.034603
      0.987629
    
    
      3
      0.002600
      0.022145
      0.989691
    
  
Fold 8 Accuracy: 0.9897

=== Fold 9/10 ===
Map: 100% 4361/4361 [00:00<00:00, 8666.71 examples/s]Map: 100% 485/485 [00:00<00:00, 5862.46 examples/s]Train size: 4361, Test size: 485
/tmp/ipython-input-861077650.py:3: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Trainer.__init__`. Use `processing_class` instead.
  trainer = Trainer(
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 4,361 | Num Epochs = 3 | Total steps = 411
O^O/ \_/ \    Batch size per device = 32 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (32 x 1 x 1) = 32
 "-____-"     Trainable parameters = 149,607,171 of 149,607,171 (100.00% trained)

    
      
      
      [411/411 04:53, Epoch 3/3]
    
    
  
 
      Epoch
      Training Loss
      Validation Loss
      Accuracy
    
  
  
    
      1
      0.020600
      0.025595
      0.989691
    
    
      2
      0.005600
      0.009986
      0.995876
    
    
      3
      0.001400
      0.010179
      0.995876
    
  
Fold 9 Accuracy: 0.9959

=== Fold 10/10 ===
Map: 100% 4361/4361 [00:00<00:00, 8803.92 examples/s]Map: 100% 485/485 [00:00<00:00, 4831.62 examples/s]Train size: 4361, Test size: 485
/tmp/ipython-input-861077650.py:3: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Trainer.__init__`. Use `processing_class` instead.
  trainer = Trainer(
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 4,361 | Num Epochs = 3 | Total steps = 411
O^O/ \_/ \    Batch size per device = 32 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (32 x 1 x 1) = 32
 "-____-"     Trainable parameters = 149,607,171 of 149,607,171 (100.00% trained)

    
      
      
      [411/411 05:01, Epoch 3/3]
    
    
  
 
      Epoch
      Training Loss
      Validation Loss
      Accuracy
    
  
  
    
      1
      0.015100
      0.010213
      0.993814
    
    
      2
      0.006400
      0.008269
      0.995876
    
    
      3
      0.001400
      0.007200
      0.995876
    
  
Fold 10 Accuracy: 0.9959

=== Cross-Validation Results ===
Mean Accuracy: 0.9763 (+/- 0.0369)
Individual fold accuracies: [0.8762886597938144, 0.9422680412371134, 0.9814432989690721, 0.9917525773195877, 0.9979381443298969, 0.9938144329896907, 0.9979381443298969, 0.9896907216494846, 0.9958762886597938, 0.9958762886597938]


=== Cross-Validation Results ===
- Mean Accuracy: 0.9763 (+/- 0.0369)
- Individual fold accuracies: [0.8762886597938144, 0.9422680412371134, 0.9814432989690721, 0.9917525773195877, 0.9979381443298969, 0.9938144329896907, 0.9979381443298969, 0.9896907216494846, 0.9958762886597938, 0.9958762886597938]
