In [None]:
import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, WeightedRandomSampler
import os
import timm
from PIL import Image
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.metrics import classification_report
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Hyper-parameters: square input side length, loader batch size, schedule limits
img_size, batch_size = 64, 64
epochs, patience = 30, 3
learning_rate, min_lr = 1e-4, 1e-6
weight_decay = 1e-4

# Dataset root and checkpoint destination
data_dir = '/kaggle/input/emotion'
save_path = '/kaggle/working/emotion_mv_improved.pth'

# Run on the GPU when one is visible, otherwise on the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)

# Compute dataset mean and std
def compute_mean_std(dataset):
    """Return the pixel-level (mean, std) of a single-channel image dataset.

    The statistics are accumulated over every pixel of every image (sum and
    sum of squares), so the result is the true dataset standard deviation.
    Averaging per-image standard deviations, as was done previously, ignores
    the brightness variation between images and under-estimates the spread.

    Args:
        dataset: map-style dataset yielding ``(image_tensor, label)`` pairs
            whose images all share one shape with a single channel.

    Returns:
        ``(mean, std)`` as Python floats; ``std`` is the population std.

    Raises:
        ValueError: if the dataset yields no samples.
    """
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=0)
    pixel_sum = 0.0
    pixel_sq_sum = 0.0
    pixel_count = 0

    for data, _ in loader:
        # Flatten to (batch, channels, pixels); float64 keeps the running sums accurate
        data = data.view(data.size(0), data.size(1), -1).double()
        pixel_sum = pixel_sum + data.sum(dim=(0, 2))
        pixel_sq_sum = pixel_sq_sum + (data ** 2).sum(dim=(0, 2))
        pixel_count += data.size(0) * data.size(2)

    if pixel_count == 0:
        raise ValueError("Cannot compute mean/std of an empty dataset")

    mean = pixel_sum / pixel_count
    # Var[x] = E[x^2] - E[x]^2; clamp guards against tiny negative rounding error
    variance = (pixel_sq_sum / pixel_count - mean ** 2).clamp(min=0.0)
    std = variance.sqrt()
    # .item() requires exactly one channel, matching the grayscale pipeline
    return mean.item(), std.item()

train_dir = os.path.join(data_dir, 'train')
test_dir = os.path.join(data_dir, 'test')

# Deterministic preprocessing without normalisation; used to measure the
# training-set statistics and reused as the front of the evaluation pipeline
plain_steps = [
    transforms.Grayscale(num_output_channels=1),
    transforms.Resize((img_size, img_size)),
    transforms.ToTensor(),
]
tmp_tf = transforms.Compose(plain_steps)

tmp_ds = datasets.ImageFolder(train_dir, transform=tmp_tf)
gray_mean, gray_std = compute_mean_std(tmp_ds)
print(f'Computed gray mean: {gray_mean:.4f}, std: {gray_std:.4f}')

# Normalisation with the statistics measured above
normalize = transforms.Normalize([gray_mean], [gray_std])

# Data Augmentation (random, training only)
augment_steps = [
    transforms.Grayscale(num_output_channels=1),
    transforms.RandomResizedCrop(img_size, scale=(0.8, 1.0)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
    transforms.GaussianBlur(kernel_size=3, sigma=(0.1, 0.5)),
]
train_tf = transforms.Compose(augment_steps + [transforms.ToTensor(), normalize])

# Evaluation pipeline: plain preprocessing followed by the same normalisation
test_tf = transforms.Compose(plain_steps + [normalize])

# Datasets
train_dataset = datasets.ImageFolder(train_dir, transform=train_tf)
test_dataset = datasets.ImageFolder(test_dir, transform=test_tf)

# Class names come from the training folder layout
classes = train_dataset.classes
num_classes = len(classes)
print('Classes:', classes, "Number of classes:", num_classes)

# Handle class imbalance
# ImageFolder already stores every sample's label in `.targets`. Reading it
# avoids iterating the dataset, which would decode and augment every training
# image (twice) only to throw the pixels away.
train_targets = torch.tensor(train_dataset.targets, dtype=torch.long)
class_counts = np.bincount(train_targets.numpy(), minlength=num_classes)
print("Class counts:", class_counts)
# Inverse-frequency weight per class, then one weight per training sample
class_weights = 1. / torch.tensor(class_counts, dtype=torch.float)
sample_weights = class_weights[train_targets]
# Draw len(dataset) samples per epoch with replacement so rare classes are
# seen about as often as common ones
sampler = WeightedRandomSampler(
    weights=sample_weights,
    num_samples=len(sample_weights),
    replacement=True
)

# Data Loaders
# Options common to the training and evaluation loaders
loader_kwargs = dict(batch_size=batch_size, num_workers=2, pin_memory=True)

# Training batches are drawn through the class-balancing sampler
train_loader = DataLoader(train_dataset, sampler=sampler, **loader_kwargs)

# Evaluation keeps the on-disk order
test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

# Model creation: pretrained MobileViT-S adapted to 1-channel input and to the
# number of classes found in the training folder
model_kwargs = {
    'pretrained': True,
    'num_classes': num_classes,  # Set to your actual number of classes
    'img_size': (img_size, img_size),
    'in_chans': 1,
    'drop_rate': 0.2,
    'drop_path_rate': 0.1,
}
model = timm.create_model('mobilevit_s', **model_kwargs)
model = model.to(device)

# Print model to verify output size
print(model)
print(f"Model output size: {model.num_classes} (should match {num_classes})")

# Loss function with class weights (inverse class frequency, moved to the
# training device).
# NOTE(review): the train loader already rebalances classes through
# WeightedRandomSampler; confirm that weighting the loss as well is intended.
class_weights = class_weights.to(device)
criterion = nn.CrossEntropyLoss(weight=class_weights)

# Optimizer
optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

# Scheduler: halve the LR when the monitored metric (mode='max') stops
# improving for `patience` epochs, never going below `min_lr`
scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=patience, min_lr=min_lr)

# Training
# Runs up to `epochs` epochs. Each epoch trains on train_loader, evaluates on
# test_loader, steps the plateau scheduler on the evaluation accuracy,
# checkpoints the best model and stops early after patience * 2 epochs
# without improvement.
# NOTE(review): model selection and LR scheduling both use test_loader, so the
# reported best accuracy is not an estimate on held-out data.
best_acc = 0.0
best_epoch = 0
early_stop_counter = 0

for epoch in range(1, epochs + 1):
    model.train()
    running_loss = 0.0

    for imgs, labels in train_loader:
        # Move data to device
        imgs, labels = imgs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(imgs)
        loss = criterion(outputs, labels)
        loss.backward()

        # Clip the global gradient norm to 1.0 before the update
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        # Weight the batch loss by batch size so the epoch figure is per-sample
        running_loss += loss.item() * imgs.size(0)

    epoch_loss = running_loss / len(train_loader.dataset)

    # Validation
    model.eval()
    correct = 0
    total = 0
    val_loss = 0.0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for imgs, labels in test_loader:
            # Move data to device
            imgs, labels = imgs.to(device), labels.to(device)

            outputs = model(imgs)
            loss = criterion(outputs, labels)
            val_loss += loss.item() * imgs.size(0)

            # Predicted class = index of the largest logit
            _, preds = torch.max(outputs, 1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

            # Keep predictions/labels for the classification report
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    val_loss /= len(test_loader.dataset)
    test_acc = correct / total * 100  # accuracy in percent

    # Update learning rate
    scheduler.step(test_acc)

    print(f"Epoch {epoch:02d}:")
    print(f"  Train Loss: {epoch_loss:.4f}")
    print(f"  Val Loss: {val_loss:.4f}")
    print(f"  Val Acc: {test_acc:.2f}%")
    print(f"  Current LR: {optimizer.param_groups[0]['lr']:.2e}")

    # Detailed per-class report on the final epoch or whenever accuracy improves
    if epoch == epochs or test_acc > best_acc:
        print("\nClassification Report:")
        print(classification_report(all_labels, all_preds, target_names=classes))

    if test_acc > best_acc:
        # New best: record it, reset the early-stop counter and checkpoint
        best_acc = test_acc
        best_epoch = epoch
        early_stop_counter = 0
        torch.save(model.state_dict(), save_path)
        print(f"  New best model saved with acc={best_acc:.2f}%")
    else:
        early_stop_counter += 1
        # Early stopping waits twice as long as the scheduler's patience
        if early_stop_counter >= patience * 2:
            print(f"Early stopping at epoch {epoch}")
            break

    print("-" * 50)

print(f"\nBest Test Accuracy: {best_acc:.2f}% at epoch {best_epoch}")

# Load best model
# Restores the checkpoint written above and switches to eval mode for inference
model.load_state_dict(torch.load(save_path, map_location=device))
model.eval()

# Prediction function
def predict(img_path, return_confidence=False):
    """Classify a single image file with the trained model.

    Args:
        img_path: path of the image to classify.
        return_confidence: when True, also return the softmax probability of
            the predicted class.

    Returns:
        The predicted class name, or ``(class_name, confidence)`` when
        ``return_confidence`` is True.
    """
    # The context manager closes the underlying file; convert() returns an
    # independent in-memory image, so it stays usable after the file is closed.
    with Image.open(img_path) as raw:
        img = raw.convert('L')
    x = test_tf(img).unsqueeze(0).to(device)  # add the batch dimension
    with torch.no_grad():
        logits = model(x)
    idx = logits.argmax(dim=1).item()
    if return_confidence:
        # Softmax is only needed when the caller asks for the confidence
        probs = torch.softmax(logits, dim=1)
        return classes[idx], probs[0, idx].item()
    return classes[idx]

In [None]:
import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, WeightedRandomSampler
import os
import timm
from PIL import Image
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.metrics import classification_report
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, ReduceLROnPlateau
import torch.nn.functional as F
from tqdm import tqdm
import matplotlib.pyplot as plt

# Hyper-parameters: square input side length, loader batch size, schedule limits
img_size, batch_size = 96, 128
epochs, patience = 100, 10
learning_rate, min_lr = 5e-4, 1e-6
weight_decay = 1e-4
warmup_epochs = 5
num_workers = 4

# Dataset root, checkpoint destination and training-curve image
data_dir = '/kaggle/input/emotion'
save_path = '/kaggle/working/mobilevit_emotion_best.pth'
history_path = '/kaggle/working/mobilevit_training_history.png'

# Run on the GPU when one is visible, otherwise on the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)

# Compute dataset mean and std
def compute_mean_std(dataset):
    """Return the pixel-level (mean, std) of a single-channel image dataset.

    The statistics are accumulated over every pixel of every image (sum and
    sum of squares), so the result is the true dataset standard deviation.
    Averaging per-image standard deviations, as was done previously, ignores
    the brightness variation between images and under-estimates the spread.

    Args:
        dataset: map-style dataset yielding ``(image_tensor, label)`` pairs
            whose images all share one shape with a single channel.

    Returns:
        ``(mean, std)`` as Python floats; ``std`` is the population std.

    Raises:
        ValueError: if the dataset yields no samples.
    """
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)
    pixel_sum = 0.0
    pixel_sq_sum = 0.0
    pixel_count = 0

    for data, _ in tqdm(loader, desc="Computing mean/std"):
        # Flatten to (batch, channels, pixels); float64 keeps the running sums accurate
        data = data.view(data.size(0), data.size(1), -1).double()
        pixel_sum = pixel_sum + data.sum(dim=(0, 2))
        pixel_sq_sum = pixel_sq_sum + (data ** 2).sum(dim=(0, 2))
        pixel_count += data.size(0) * data.size(2)

    if pixel_count == 0:
        raise ValueError("Cannot compute mean/std of an empty dataset")

    mean = pixel_sum / pixel_count
    # Var[x] = E[x^2] - E[x]^2; clamp guards against tiny negative rounding error
    variance = (pixel_sq_sum / pixel_count - mean ** 2).clamp(min=0.0)
    std = variance.sqrt()
    # .item() requires exactly one channel, matching the grayscale pipeline
    return mean.item(), std.item()

train_dir = os.path.join(data_dir, 'train')
test_dir = os.path.join(data_dir, 'test')

# Deterministic preprocessing without normalisation; used to measure the
# training-set statistics and reused as the front of the evaluation pipeline
plain_steps = [
    transforms.Grayscale(num_output_channels=1),
    transforms.Resize((img_size, img_size)),
    transforms.ToTensor(),
]
tmp_tf = transforms.Compose(plain_steps)

print("Loading dataset for mean/std calculation...")
tmp_ds = datasets.ImageFolder(train_dir, transform=tmp_tf)
gray_mean, gray_std = compute_mean_std(tmp_ds)
print(f'Computed gray mean: {gray_mean:.4f}, std: {gray_std:.4f}')

# Normalisation with the statistics measured above
normalize = transforms.Normalize([gray_mean], [gray_std])

# Enhanced Data Augmentation (random, training only)
augment_steps = [
    transforms.Grayscale(num_output_channels=1),
    transforms.RandomResizedCrop(img_size, scale=(0.7, 1.0), ratio=(0.8, 1.2)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ColorJitter(brightness=0.3, contrast=0.3),
    transforms.GaussianBlur(kernel_size=5, sigma=(0.1, 1.0)),
    transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
]
train_tf = transforms.Compose(augment_steps + [transforms.ToTensor(), normalize])

# Evaluation pipeline: plain preprocessing followed by the same normalisation
test_tf = transforms.Compose(plain_steps + [normalize])

# Datasets
print("Creating datasets...")
train_dataset = datasets.ImageFolder(train_dir, transform=train_tf)
test_dataset = datasets.ImageFolder(test_dir, transform=test_tf)

# Class names come from the training folder layout
classes = train_dataset.classes
num_classes = len(classes)
print('Classes:', classes, "Number of classes:", num_classes)

# Handle class imbalance
# ImageFolder already stores every sample's label in `.targets`. Reading it
# avoids iterating the dataset, which would decode and augment every training
# image (twice) only to throw the pixels away.
train_targets = torch.tensor(train_dataset.targets, dtype=torch.long)
class_counts = np.bincount(train_targets.numpy(), minlength=num_classes)
print("Class counts:", class_counts)
# Inverse-frequency weight per class, then one weight per training sample
class_weights = 1. / torch.tensor(class_counts, dtype=torch.float)
sample_weights = class_weights[train_targets]
# Draw len(dataset) samples per epoch with replacement so rare classes are
# seen about as often as common ones
sampler = WeightedRandomSampler(
    weights=sample_weights,
    num_samples=len(sample_weights),
    replacement=True
)

# Data Loaders
# Options common to the training and evaluation loaders; worker processes are
# kept alive between epochs (persistent_workers)
loader_kwargs = dict(
    batch_size=batch_size,
    num_workers=num_workers,
    pin_memory=True,
    persistent_workers=True,
)

# Training batches are drawn through the class-balancing sampler
train_loader = DataLoader(train_dataset, sampler=sampler, **loader_kwargs)

# Evaluation keeps the on-disk order
test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

# Enhanced MobileViT Model with automatic channel detection
class MobileViTEmotion(nn.Module):
    """MobileViT-S feature extractor with a convolutional refinement head.

    The deepest backbone feature map is reduced by three convolutions, gated
    by a single-channel sigmoid attention map, globally average-pooled and
    classified by a small MLP.

    Args:
        num_classes: number of output logits.
        img_size: side length of the square dummy input used to probe the
            backbone's output shape.
    """

    def __init__(self, num_classes, img_size=96):
        super().__init__()
        # Use MobileViT as backbone
        self.backbone = timm.create_model(
            'mobilevit_s',
            pretrained=True,
            features_only=True,
            in_chans=1
        )

        # Get actual channel dimensions from backbone.
        # The probe runs in eval mode: in train mode BatchNorm layers update
        # their running statistics on every forward pass (no_grad does not
        # prevent that), so an all-zero dummy batch would skew the pretrained
        # statistics.
        was_training = self.backbone.training
        self.backbone.eval()
        try:
            with torch.no_grad():
                dummy_input = torch.zeros(1, 1, img_size, img_size)
                features = self.backbone(dummy_input)
        finally:
            self.backbone.train(was_training)
        self.last_channel = features[-1].shape[1]
        self.feature_size = features[-1].shape[2]

        print(f"MobileViT feature dimensions: {self.last_channel} channels, {self.feature_size}x{self.feature_size}")

        # Feature refinement: 1x1 channel reduction followed by two 3x3 convs
        self.conv1 = nn.Conv2d(self.last_channel, 256, kernel_size=1)
        self.conv2 = nn.Conv2d(256, 128, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(128)
        self.conv3 = nn.Conv2d(128, 64, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(64)

        # Attention mechanism: one sigmoid weight per spatial location
        self.attention = nn.Sequential(
            nn.Conv2d(64, 64, kernel_size=1),
            nn.BatchNorm2d(64),
            nn.SiLU(),
            nn.Conv2d(64, 1, kernel_size=1),
            nn.Sigmoid()
        )

        # Classifier
        self.classifier = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Flatten(),
            nn.Linear(64, 256),
            nn.BatchNorm1d(256),
            nn.SiLU(),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes)
        )

    def forward(self, x):
        """Return class logits for a batch of single-channel images."""
        # Get features from MobileViT
        features = self.backbone(x)
        x = features[-1]  # Use the deepest feature map

        # Feature refinement
        x = F.silu(self.conv1(x))
        x = F.silu(self.bn1(self.conv2(x)))
        x = F.silu(self.bn2(self.conv3(x)))

        # Attention mechanism: the 1-channel map broadcasts over all channels
        att = self.attention(x)
        x = x * att

        return self.classifier(x)

model = MobileViTEmotion(num_classes=num_classes, img_size=img_size).to(device)
print(model)

# Advanced Loss Function
class FocalLoss(nn.Module):
    """Focal loss on top of cross-entropy: ``(1 - p_t) ** gamma * CE``.

    Args:
        alpha: optional 1-D tensor of per-class weights, indexed by target.
        gamma: focusing exponent; 0 reduces the loss to plain cross-entropy.
        reduction: ``'mean'`` or ``'sum'`` to reduce over the batch; any other
            value (e.g. ``'none'``) returns the per-sample losses.
    """

    def __init__(self, alpha=None, gamma=2.0, reduction='mean'):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        """Compute the loss for logits ``inputs`` and class indices ``targets``."""
        ce_loss = F.cross_entropy(inputs, targets, reduction='none')
        # exp(-CE) is the probability the model assigns to the true class
        pt = torch.exp(-ce_loss)
        focal_loss = (1 - pt) ** self.gamma * ce_loss

        if self.alpha is not None:
            # Move the weights next to the logits so a CPU alpha works on GPU
            alpha_t = self.alpha.to(inputs.device)[targets]
            focal_loss = alpha_t * focal_loss

        if self.reduction == 'mean':
            return focal_loss.mean()
        if self.reduction == 'sum':
            # Previously 'sum' fell through and returned the unreduced tensor
            return focal_loss.sum()
        return focal_loss

# Apply class weights: the per-class weights become the focal-loss alpha
class_weights = class_weights.to(device)
criterion = FocalLoss(alpha=class_weights, gamma=1.5)

# Optimizer with differential learning rates: parameters whose name contains
# 'backbone' train at a tenth of the head's learning rate
backbone_params = [p for n, p in model.named_parameters() if 'backbone' in n]
head_params = [p for n, p in model.named_parameters() if 'backbone' not in n]

param_groups = [
    {'params': backbone_params, 'lr': learning_rate/10},
    {'params': head_params, 'lr': learning_rate},
]
optimizer = optim.AdamW(param_groups, weight_decay=weight_decay)

# Combined learning rate schedulers, both attached to the same optimizer
scheduler_cosine = CosineAnnealingWarmRestarts(optimizer, T_0=10, T_mult=2, eta_min=min_lr)
scheduler_plateau = ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=3)

# Training history: one list per tracked quantity, appended to once per epoch
history = {key: [] for key in ('train_loss', 'val_loss', 'val_acc', 'lr')}

# Training function
def train_epoch(model, loader, optimizer, criterion, device):
    """Run one optimisation pass over ``loader``.

    Each batch is moved to ``device``, the loss is back-propagated, the global
    gradient norm is clipped to 1.0 and the optimizer is stepped.

    Returns:
        The sum of batch losses weighted by batch size, divided by
        ``len(loader.dataset)``.
    """
    model.train()
    loss_sum = 0.0
    bar = tqdm(loader, desc="Training")

    for batch_imgs, batch_labels in bar:
        batch_imgs = batch_imgs.to(device)
        batch_labels = batch_labels.to(device)

        optimizer.zero_grad()
        batch_loss = criterion(model(batch_imgs), batch_labels)
        batch_loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        loss_value = batch_loss.item()
        loss_sum += loss_value * batch_imgs.size(0)
        bar.set_postfix(loss=loss_value)

    return loss_sum / len(loader.dataset)

# Validation function
def validate(model, loader, criterion, device):
    """Evaluate ``model`` on ``loader`` without gradient tracking.

    Returns:
        ``(mean_loss, accuracy_percent, predictions, labels)`` where the mean
        loss is normalised by ``len(loader.dataset)`` and the last two items
        are flat lists collected over all batches.
    """
    model.eval()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0
    all_preds, all_labels = [], []

    with torch.no_grad():
        for imgs, labels in tqdm(loader, desc="Validation"):
            imgs = imgs.to(device)
            labels = labels.to(device)

            logits = model(imgs)
            loss_sum += criterion(logits, labels).item() * imgs.size(0)

            # Index of the largest logit is the predicted class
            preds = torch.max(logits, 1)[1]
            n_correct += (preds == labels).sum().item()
            n_seen += labels.size(0)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = n_correct / n_seen * 100
    mean_loss = loss_sum / len(loader.dataset)
    return mean_loss, accuracy, all_preds, all_labels

# Training loop
# Runs up to `epochs` epochs: manual LR warmup for the first `warmup_epochs`,
# then both schedulers are stepped once per epoch. The best model (by
# validation accuracy) is checkpointed, and training stops after `patience`
# epochs without improvement.
# NOTE(review): validation uses test_loader, so model selection is done on the
# test split.
best_acc = 0.0
early_stop_counter = 0

print("Starting training...")
for epoch in range(1, epochs + 1):
    print(f"\nEpoch {epoch}/{epochs}")

    # Learning rate warmup
    # Scales each group's LR linearly from 1/warmup_epochs up to its
    # 'initial_lr' (falling back to learning_rate when the key is absent).
    # NOTE(review): 'initial_lr' is presumably added to the param groups when
    # the schedulers are constructed; confirm.
    if epoch <= warmup_epochs:
        lr_scale = min(1., float(epoch) / warmup_epochs)
        for pg in optimizer.param_groups:
            pg['lr'] = lr_scale * pg.get('initial_lr', learning_rate)

    # Train
    train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
    history['train_loss'].append(train_loss)

    # Validate
    val_loss, val_acc, val_preds, val_labels = validate(model, test_loader, criterion, device)
    history['val_loss'].append(val_loss)
    history['val_acc'].append(val_acc)
    # Only the first param group's LR is recorded
    history['lr'].append(optimizer.param_groups[0]['lr'])

    print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.2f}%")

    # Update learning rate
    # NOTE(review): both schedulers write the same param-group 'lr'; confirm
    # that a plateau reduction is not simply overwritten by the next cosine step.
    if epoch > warmup_epochs:
        scheduler_cosine.step()
        scheduler_plateau.step(val_acc)

    # Save best model
    if val_acc > best_acc:
        best_acc = val_acc
        early_stop_counter = 0
        torch.save(model.state_dict(), save_path)
        print(f"New best model saved with accuracy: {val_acc:.2f}%")

        # Print classification report for best model
        print("\nClassification Report:")
        print(classification_report(val_labels, val_preds, target_names=classes))
    else:
        early_stop_counter += 1
        print(f"No improvement for {early_stop_counter}/{patience} epochs")

    # Early stopping
    if early_stop_counter >= patience:
        print(f"Early stopping at epoch {epoch}")
        break

print(f"\nBest Validation Accuracy: {best_acc:.2f}%")

# Plot training history: loss curves on top, validation accuracy below
plt.figure(figsize=(12, 8))

plt.subplot(2, 1, 1)
for series_key, series_label in (('train_loss', 'Train Loss'), ('val_loss', 'Val Loss')):
    plt.plot(history[series_key], label=series_label)
plt.title('Loss History')
plt.legend()

plt.subplot(2, 1, 2)
plt.plot(history['val_acc'], 'g-', label='Val Accuracy')
plt.title('Accuracy History')
plt.xlabel('Epochs')
plt.legend()

# Write the figure to disk before showing it
plt.tight_layout()
plt.savefig(history_path)
plt.show()

# Load best model: restore the checkpoint saved during training and switch
# to eval mode for inference
best_state = torch.load(save_path, map_location=device)
model.load_state_dict(best_state)
model.eval()

# Prediction function
def predict(img_path, return_confidence=False):
    """Classify a single image file with the trained model.

    Args:
        img_path: path of the image to classify.
        return_confidence: when True, also return the softmax probability of
            the predicted class.

    Returns:
        The predicted class name, or ``(class_name, confidence)`` when
        ``return_confidence`` is True.
    """
    # The context manager closes the underlying file; convert() returns an
    # independent in-memory image, so it stays usable after the file is closed.
    with Image.open(img_path) as raw:
        img = raw.convert('L')
    x = test_tf(img).unsqueeze(0).to(device)  # add the batch dimension
    with torch.no_grad():
        logits = model(x)
    idx = logits.argmax(dim=1).item()

    if return_confidence:
        # Softmax is only needed when the caller asks for the confidence
        confidence = F.softmax(logits, dim=1)[0, idx].item()
        return classes[idx], confidence
    return classes[idx]
    

In [None]:
import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, WeightedRandomSampler
import os
import timm
from PIL import Image
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.metrics import classification_report
from torch.optim.lr_scheduler import CosineAnnealingLR, ReduceLROnPlateau
import torch.nn.functional as F
from tqdm import tqdm
import matplotlib.pyplot as plt
from collections import Counter

# Enhanced Parameters
# Increased resolution for facial details; batch size reduced for better
# gradient updates
img_size, batch_size = 112, 64
# More training time
epochs, patience = 100, 15
# Higher initial learning rate
learning_rate, min_lr = 1e-3, 1e-6
# Reduced regularization
weight_decay = 1e-5
warmup_epochs = 10
num_workers = 4
# Beta-distribution parameter for mixup data augmentation
mixup_alpha = 0.4

# Dataset root, checkpoint destination and training-curve image
data_dir = '/kaggle/input/emotion'
save_path = '/kaggle/working/mobilevit_emotion_best.pth'
history_path = '/kaggle/working/training_history.png'

# Run on the GPU when one is visible, otherwise on the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)

# Compute dataset mean and std
def compute_mean_std(dataset):
    """Return the pixel-level (mean, std) of a single-channel image dataset.

    The statistics are accumulated over every pixel of every image (sum and
    sum of squares), so the result is the true dataset standard deviation.
    Averaging per-image standard deviations, as was done previously, ignores
    the brightness variation between images and under-estimates the spread.

    Args:
        dataset: map-style dataset yielding ``(image_tensor, label)`` pairs
            whose images all share one shape with a single channel.

    Returns:
        ``(mean, std)`` as Python floats; ``std`` is the population std.

    Raises:
        ValueError: if the dataset yields no samples.
    """
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)
    pixel_sum = 0.0
    pixel_sq_sum = 0.0
    pixel_count = 0

    for data, _ in tqdm(loader, desc="Computing mean/std"):
        # Flatten to (batch, channels, pixels); float64 keeps the running sums accurate
        data = data.view(data.size(0), data.size(1), -1).double()
        pixel_sum = pixel_sum + data.sum(dim=(0, 2))
        pixel_sq_sum = pixel_sq_sum + (data ** 2).sum(dim=(0, 2))
        pixel_count += data.size(0) * data.size(2)

    if pixel_count == 0:
        raise ValueError("Cannot compute mean/std of an empty dataset")

    mean = pixel_sum / pixel_count
    # Var[x] = E[x^2] - E[x]^2; clamp guards against tiny negative rounding error
    variance = (pixel_sq_sum / pixel_count - mean ** 2).clamp(min=0.0)
    std = variance.sqrt()
    # .item() requires exactly one channel, matching the grayscale pipeline
    return mean.item(), std.item()

train_dir = os.path.join(data_dir, 'train')
test_dir = os.path.join(data_dir, 'test')

# Deterministic preprocessing without normalisation; used to measure the
# training-set statistics and reused as the front of the evaluation pipeline
plain_steps = [
    transforms.Grayscale(num_output_channels=1),
    transforms.Resize((img_size, img_size)),
    transforms.ToTensor(),
]
tmp_tf = transforms.Compose(plain_steps)

print("Loading dataset for mean/std calculation...")
tmp_ds = datasets.ImageFolder(train_dir, transform=tmp_tf)
gray_mean, gray_std = compute_mean_std(tmp_ds)
print(f'Computed gray mean: {gray_mean:.4f}, std: {gray_std:.4f}')

# Normalisation with the statistics measured above
normalize = transforms.Normalize([gray_mean], [gray_std])

# Enhanced Data Augmentation (random, training only), applied before
# conversion to a tensor
augment_steps = [
    transforms.Grayscale(num_output_channels=1),
    transforms.RandomResizedCrop(img_size, scale=(0.7, 1.0), ratio=(0.75, 1.33)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(20),
    transforms.ColorJitter(brightness=0.4, contrast=0.4),
    transforms.GaussianBlur(kernel_size=5, sigma=(0.1, 2.0)),
    transforms.RandomAffine(degrees=0, translate=(0.15, 0.15)),
    transforms.RandomPerspective(distortion_scale=0.2, p=0.7),
]
# Random erasing comes last, after ToTensor and Normalize
tensor_steps = [
    transforms.ToTensor(),
    normalize,
    transforms.RandomErasing(p=0.5, scale=(0.02, 0.15), ratio=(0.3, 3.3)),
]
train_tf = transforms.Compose(augment_steps + tensor_steps)

# Evaluation pipeline: plain preprocessing followed by the same normalisation
test_tf = transforms.Compose(plain_steps + [normalize])

# Datasets
print("Creating datasets...")
train_dataset = datasets.ImageFolder(train_dir, transform=train_tf)
test_dataset = datasets.ImageFolder(test_dir, transform=test_tf)

# Class names come from the training folder layout
classes = train_dataset.classes
num_classes = len(classes)
print('Classes:', classes, "Number of classes:", num_classes)

# Handle class imbalance
# ImageFolder already stores every sample's label in `.targets`. Reading it
# avoids iterating the dataset, which would decode and augment every training
# image (twice) only to throw the pixels away.
train_targets = torch.tensor(train_dataset.targets, dtype=torch.long)
class_counts = np.bincount(train_targets.numpy(), minlength=num_classes)
print("Class counts:", class_counts)
max_count = max(class_counts)
# Weight of a class = size of the largest class / size of this class, so the
# most frequent class gets weight 1 and rarer classes get larger weights
class_weights = torch.tensor([max_count / count for count in class_counts], dtype=torch.float)
sample_weights = class_weights[train_targets]
# Draw len(dataset) samples per epoch with replacement so rare classes are
# seen about as often as common ones
sampler = WeightedRandomSampler(
    weights=sample_weights,
    num_samples=len(sample_weights),
    replacement=True
)

# Data Loaders
# Options common to the training and evaluation loaders; worker processes are
# kept alive between epochs (persistent_workers)
loader_kwargs = dict(
    batch_size=batch_size,
    num_workers=num_workers,
    pin_memory=True,
    persistent_workers=True,
)

# Training batches are drawn through the class-balancing sampler
train_loader = DataLoader(train_dataset, sampler=sampler, **loader_kwargs)

# Evaluation keeps the on-disk order
test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

# Enhanced MobileViT Model with Residual Connections
class MobileViTEmotion(nn.Module):
    """MobileViT-S feature extractor with a residual refinement head.

    The deepest backbone feature map passes through a 1x1 -> 3x3 -> 3x3
    convolution stack with a projected shortcut, is gated by a single-channel
    sigmoid attention map, globally average-pooled and classified by an MLP.

    Args:
        num_classes: number of output logits.

    Note:
        The shape probe in ``__init__`` reads the module-level ``img_size``.
    """

    def __init__(self, num_classes):
        super().__init__()
        # Use MobileViT as backbone
        self.backbone = timm.create_model(
            'mobilevit_s',
            pretrained=True,
            features_only=True,
            in_chans=1
        )

        # Get feature dimensions.
        # The probe runs in eval mode and without gradient tracking: in train
        # mode BatchNorm layers update their running statistics on every
        # forward pass, so an all-zero dummy batch would skew the pretrained
        # statistics, and tracking gradients would build an unused graph.
        was_training = self.backbone.training
        self.backbone.eval()
        try:
            with torch.no_grad():
                dummy = torch.zeros(1, 1, img_size, img_size)
                features = self.backbone(dummy)
        finally:
            self.backbone.train(was_training)
        self.feature_channels = [f.shape[1] for f in features]
        print("Feature channels:", self.feature_channels)

        # Feature refinement with residual connections
        self.conv1 = nn.Conv2d(self.feature_channels[-1], 256, kernel_size=1)
        self.bn1 = nn.BatchNorm2d(256)

        self.conv2 = nn.Conv2d(256, 128, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(128)

        self.conv3 = nn.Conv2d(128, 128, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm2d(128)

        # 1x1 projection so the backbone features can be added to the
        # 128-channel output of the refinement stack
        self.shortcut = nn.Sequential(
            nn.Conv2d(self.feature_channels[-1], 128, kernel_size=1),
            nn.BatchNorm2d(128)
        )

        # Attention mechanism: one sigmoid weight per spatial location
        self.attention = nn.Sequential(
            nn.Conv2d(128, 64, kernel_size=1),
            nn.BatchNorm2d(64),
            nn.SiLU(),
            nn.Conv2d(64, 1, kernel_size=1),
            nn.Sigmoid()
        )

        # Classifier with more capacity
        self.classifier = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Flatten(),
            nn.Linear(128, 512),
            nn.BatchNorm1d(512),
            nn.SiLU(),
            nn.Dropout(0.4),
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.SiLU(),
            nn.Dropout(0.3),
            nn.Linear(256, num_classes)
        )

    def forward(self, x):
        """Return class logits for a batch of single-channel images."""
        # Get features from MobileViT
        features = self.backbone(x)
        x = features[-1]  # Use the deepest feature map

        # Initial transformation
        x1 = F.silu(self.bn1(self.conv1(x)))

        # Residual block
        identity = self.shortcut(x)
        x2 = F.silu(self.bn2(self.conv2(x1)))
        x3 = self.bn3(self.conv3(x2))
        x3 += identity  # Residual connection
        x = F.silu(x3)

        # Attention mechanism: the 1-channel map broadcasts over all channels
        att = self.attention(x)
        x = x * att

        return self.classifier(x)

model = MobileViTEmotion(num_classes=num_classes).to(device)
print(model)
print(f"Total parameters: {sum(p.numel() for p in model.parameters())/1e6:.2f}M")

# Advanced Loss Function with Label Smoothing
class FocalLoss(nn.Module):
    """Focal loss applied to a label-smoothed negative log-likelihood.

    Args:
        alpha: optional 1-D tensor of per-class weights, indexed by target.
        gamma: focusing exponent; 0 disables the focal modulation.
        smoothing: weight of the uniform-target term mixed into the NLL.
        reduction: ``'mean'`` or ``'sum'`` to reduce over the batch; any other
            value (e.g. ``'none'``) returns the per-sample losses.
    """

    def __init__(self, alpha=None, gamma=2.0, smoothing=0.1, reduction='mean'):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.smoothing = smoothing
        self.reduction = reduction

    def forward(self, inputs, targets):
        """Compute the loss for logits ``inputs`` and class indices ``targets``."""
        log_probs = F.log_softmax(inputs, dim=-1)
        nll_loss = -log_probs.gather(dim=-1, index=targets.unsqueeze(1)).squeeze(1)

        # Label smoothing: blend the true-class NLL with the mean NLL over all classes
        smooth_loss = -log_probs.mean(dim=-1)
        nll_loss = (1 - self.smoothing) * nll_loss + self.smoothing * smooth_loss

        # Focal loss: pt is derived from the smoothed loss, so it equals the
        # true-class probability only when smoothing is 0
        pt = torch.exp(-nll_loss)
        focal_loss = ((1 - pt) ** self.gamma) * nll_loss

        if self.alpha is not None:
            # Move the weights next to the logits so a CPU alpha works on GPU
            alpha_t = self.alpha.to(inputs.device)[targets]
            focal_loss = alpha_t * focal_loss

        if self.reduction == 'mean':
            return focal_loss.mean()
        if self.reduction == 'sum':
            # Previously 'sum' fell through and returned the unreduced tensor
            return focal_loss.sum()
        return focal_loss

# Apply class weights: the per-class weights become the focal-loss alpha
class_weights = class_weights.to(device)
criterion = FocalLoss(alpha=class_weights, gamma=2.0, smoothing=0.1)

# Optimizer: a single learning rate for all parameters
optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

# Learning rate schedulers, both attached to the same optimizer; the cosine
# period covers the epochs that remain after warmup
scheduler_cosine = CosineAnnealingLR(optimizer, T_max=epochs - warmup_epochs, eta_min=min_lr)
scheduler_plateau = ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=5)

# Training history: one list per tracked quantity, appended to once per epoch
history = {key: [] for key in ('train_loss', 'val_loss', 'val_acc', 'lr')}

# Mixup augmentation
def mixup_data(x, y, alpha=1.0):
    """Blend each sample of a batch with another sample of the same batch.

    Args:
        x: input batch; samples are indexed along dimension 0.
        y: targets aligned with ``x``.
        alpha: Beta(alpha, alpha) parameter for the mixing coefficient; a
            non-positive value disables mixing (coefficient 1).

    Returns:
        ``(mixed_x, y_a, y_b, lam)``: the blended inputs, the original
        targets, the targets of the partner samples, and the coefficient.
    """
    # Mixing coefficient is drawn first, then the partner permutation
    lam = np.random.beta(alpha, alpha) if alpha > 0 else 1

    perm = torch.randperm(x.size(0)).to(x.device)
    partner_x = x[perm, :]

    mixed_x = lam * x + (1 - lam) * partner_x
    return mixed_x, y, y[perm], lam

# Training function with mixup
def train_epoch(model, loader, optimizer, criterion, device):
    """Run one optimisation pass over ``loader`` with mixup.

    Each batch is moved to ``device`` and mixed with a shuffled copy of
    itself; the loss is the ``lam``-weighted combination of the criterion on
    both target sets. The global gradient norm is clipped to 1.0 before each
    optimizer step.

    Returns:
        The sum of batch losses weighted by batch size, divided by
        ``len(loader.dataset)``.
    """
    model.train()
    loss_sum = 0.0
    bar = tqdm(loader, desc="Training")

    for batch_imgs, batch_labels in bar:
        batch_imgs = batch_imgs.to(device)
        batch_labels = batch_labels.to(device)

        # Apply mixup
        mixed, labels_a, labels_b, lam = mixup_data(batch_imgs, batch_labels, alpha=mixup_alpha)

        optimizer.zero_grad()
        logits = model(mixed)
        batch_loss = lam * criterion(logits, labels_a) + (1 - lam) * criterion(logits, labels_b)
        batch_loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        loss_value = batch_loss.item()
        loss_sum += loss_value * batch_imgs.size(0)
        bar.set_postfix(loss=loss_value)

    return loss_sum / len(loader.dataset)

# Validation function
def validate(model, loader, criterion, device):
    """Evaluate ``model`` on ``loader`` without gradient tracking.

    Returns:
        ``(mean_loss, accuracy_percent, predictions, labels)`` where the mean
        loss is normalised by ``len(loader.dataset)`` and the last two items
        are flat lists collected over all batches.
    """
    model.eval()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0
    all_preds, all_labels = [], []

    with torch.no_grad():
        for imgs, labels in tqdm(loader, desc="Validation"):
            imgs = imgs.to(device)
            labels = labels.to(device)

            logits = model(imgs)
            loss_sum += criterion(logits, labels).item() * imgs.size(0)

            # Index of the largest logit is the predicted class
            preds = torch.max(logits, 1)[1]
            n_correct += (preds == labels).sum().item()
            n_seen += labels.size(0)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = n_correct / n_seen * 100
    mean_loss = loss_sum / len(loader.dataset)
    return mean_loss, accuracy, all_preds, all_labels

# Training loop
# Runs up to `epochs` epochs: linear LR warmup for the first `warmup_epochs`,
# then both schedulers are stepped once per epoch. The best model (by
# validation accuracy) is checkpointed, and training stops after `patience`
# epochs without improvement.
# NOTE(review): validation uses test_loader, so model selection is done on the
# test split.
best_acc = 0.0
early_stop_counter = 0

print("Starting training...")
for epoch in range(1, epochs + 1):
    print(f"\nEpoch {epoch}/{epochs}")

    # Learning rate warmup
    # Ramps every param group linearly up to learning_rate, which is reached
    # at epoch == warmup_epochs
    if epoch <= warmup_epochs:
        lr = learning_rate * (epoch / warmup_epochs)
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

    # Train
    train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
    history['train_loss'].append(train_loss)

    # Validate
    val_loss, val_acc, val_preds, val_labels = validate(model, test_loader, criterion, device)
    history['val_loss'].append(val_loss)
    history['val_acc'].append(val_acc)
    # Only the first param group's LR is recorded
    history['lr'].append(optimizer.param_groups[0]['lr'])

    print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.2f}%")

    # Update learning rate
    # NOTE(review): both schedulers write the same param-group 'lr'; confirm
    # that the cosine schedule and the plateau reductions interact as intended.
    if epoch > warmup_epochs:
        scheduler_cosine.step()
        scheduler_plateau.step(val_acc)

    # Save best model
    if val_acc > best_acc:
        best_acc = val_acc
        early_stop_counter = 0
        torch.save(model.state_dict(), save_path)
        print(f"New best model saved with accuracy: {val_acc:.2f}%")

        # Print classification report for best model
        print("\nClassification Report:")
        print(classification_report(val_labels, val_preds, target_names=classes))
    else:
        early_stop_counter += 1
        print(f"No improvement for {early_stop_counter}/{patience} epochs")

    # Early stopping
    if early_stop_counter >= patience:
        print(f"Early stopping at epoch {epoch}")
        break

print(f"\nBest Validation Accuracy: {best_acc:.2f}%")

# Plot training history: loss curves on top, validation accuracy below
plt.figure(figsize=(12, 8))

plt.subplot(2, 1, 1)
for series_key, series_label in (('train_loss', 'Train Loss'), ('val_loss', 'Val Loss')):
    plt.plot(history[series_key], label=series_label)
plt.title('Loss History')
plt.legend()

plt.subplot(2, 1, 2)
plt.plot(history['val_acc'], 'g-', label='Val Accuracy')
plt.title('Accuracy History')
plt.xlabel('Epochs')
plt.legend()

# Write the figure to disk before showing it
plt.tight_layout()
plt.savefig(history_path)
plt.show()

# Load best model: restore the checkpoint saved during training and switch
# to eval mode for inference
best_state = torch.load(save_path, map_location=device)
model.load_state_dict(best_state)
model.eval()

# Enhanced prediction function
def predict(img_path, return_confidence=False):
    """Classify a single image file with the module-level ``model``.

    The image is opened, converted to single-channel grayscale ('L'),
    passed through ``test_tf``, given a leading batch dimension and moved
    to ``device``. The forward pass runs with gradient tracking disabled.

    This function does not change the model's train/eval mode; the caller
    is expected to have put ``model`` in eval mode beforehand.

    Args:
        img_path: Location of the image, as accepted by ``Image.open``.
        return_confidence: When true, also return the softmax probability
            assigned to the predicted class.

    Returns:
        The predicted entry of ``classes``, or a ``(class_name, confidence)``
        tuple when ``return_confidence`` is true.
    """
    # Image.open is lazy and keeps the file handle open; the context manager
    # guarantees it is released even for multi-frame files or when the
    # conversion raises. convert() returns an independent in-memory copy,
    # so the result is safe to use after the file is closed.
    with Image.open(img_path) as raw_img:
        img = raw_img.convert('L')

    # unsqueeze(0) turns the single transformed image into a batch of one.
    x = test_tf(img).unsqueeze(0).to(device)
    with torch.no_grad():
        logits = model(x)
        probs = F.softmax(logits, dim=1)
    idx = logits.argmax(dim=1).item()
    confidence = probs[0, idx].item()

    if return_confidence:
        return classes[idx], confidence
    return classes[idx]

Using device: cuda
Loading dataset for mean/std calculation...


Computing mean/std: 100%|██████████| 449/449 [00:31<00:00, 14.12it/s]


Computed gray mean: 0.5077, std: 0.2063
Creating datasets...
Classes: ['angry', 'disgusted', 'fearful', 'happy', 'neutral', 'sad', 'surprised'] Number of classes: 7
Class counts: [3995  436 4097 7215 4965 4830 3171]


model.safetensors:   0%|          | 0.00/22.4M [00:00<?, ?B/s]

Feature channels: [32, 64, 96, 128, 640]
MobileViTEmotion(
  (backbone): FeatureListNet(
    (stem): ConvNormAct(
      (conv): Conv2d(1, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn): BatchNormAct2d(
        16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True
        (drop): Identity()
        (act): SiLU(inplace=True)
      )
    )
    (stages_0): Sequential(
      (0): BottleneckBlock(
        (conv1_1x1): ConvNormAct(
          (conv): Conv2d(16, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNormAct2d(
            64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True
            (drop): Identity()
            (act): SiLU(inplace=True)
          )
        )
        (conv2_kxk): ConvNormAct(
          (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=64, bias=False)
          (bn): BatchNormAct2d(
            64, eps=1e-05, momentum=0.1, affine=True, track_running_stat

Training: 100%|██████████| 449/449 [01:04<00:00,  6.95it/s, loss=3.07]
Validation: 100%|██████████| 113/113 [00:08<00:00, 12.94it/s]


Train Loss: 3.7766 | Val Loss: 2.4903 | Val Acc: 10.55%
New best model saved with accuracy: 10.55%

Classification Report:
              precision    recall  f1-score   support

       angry       0.13      0.00      0.01       958
   disgusted       0.02      0.96      0.04       111
     fearful       0.00      0.00      0.00      1024
       happy       0.53      0.01      0.03      1774
     neutral       0.31      0.03      0.06      1233
         sad       0.38      0.01      0.01      1247
   surprised       0.35      0.69      0.47       831

    accuracy                           0.11      7178
   macro avg       0.25      0.24      0.09      7178
weighted avg       0.31      0.11      0.07      7178


Epoch 2/100


Training: 100%|██████████| 449/449 [01:07<00:00,  6.64it/s, loss=2.35]
Validation: 100%|██████████| 113/113 [00:05<00:00, 20.83it/s]


Train Loss: 3.0506 | Val Loss: 2.0094 | Val Acc: 31.07%
New best model saved with accuracy: 31.07%

Classification Report:
              precision    recall  f1-score   support

       angry       0.33      0.04      0.07       958
   disgusted       0.04      0.90      0.07       111
     fearful       0.10      0.00      0.01      1024
       happy       0.67      0.51      0.58      1774
     neutral       0.48      0.31      0.38      1233
         sad       0.39      0.07      0.12      1247
   surprised       0.39      0.86      0.53       831

    accuracy                           0.31      7178
   macro avg       0.34      0.38      0.25      7178
weighted avg       0.42      0.31      0.30      7178


Epoch 3/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.34it/s, loss=3.15]
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.08it/s]


Train Loss: 2.8124 | Val Loss: 1.7206 | Val Acc: 45.32%
New best model saved with accuracy: 45.32%

Classification Report:
              precision    recall  f1-score   support

       angry       0.34      0.41      0.37       958
   disgusted       0.06      0.81      0.11       111
     fearful       0.37      0.10      0.16      1024
       happy       0.88      0.61      0.72      1774
     neutral       0.48      0.57      0.52      1233
         sad       0.43      0.21      0.28      1247
   surprised       0.65      0.73      0.69       831

    accuracy                           0.45      7178
   macro avg       0.46      0.49      0.41      7178
weighted avg       0.55      0.45      0.47      7178


Epoch 4/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.35it/s, loss=2.69]
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.16it/s]


Train Loss: 2.6443 | Val Loss: 1.8311 | Val Acc: 39.34%
No improvement for 1/15 epochs

Epoch 5/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.34it/s, loss=2.68]
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.09it/s]


Train Loss: 2.5863 | Val Loss: 1.6298 | Val Acc: 46.99%
New best model saved with accuracy: 46.99%

Classification Report:
              precision    recall  f1-score   support

       angry       0.46      0.34      0.39       958
   disgusted       0.06      0.90      0.12       111
     fearful       0.35      0.29      0.31      1024
       happy       0.91      0.64      0.75      1774
     neutral       0.58      0.45      0.51      1233
         sad       0.50      0.18      0.26      1247
   surprised       0.52      0.87      0.65       831

    accuracy                           0.47      7178
   macro avg       0.48      0.53      0.43      7178
weighted avg       0.58      0.47      0.49      7178


Epoch 6/100


Training: 100%|██████████| 449/449 [01:11<00:00,  6.32it/s, loss=2.01]
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.06it/s]


Train Loss: 2.5286 | Val Loss: 1.6000 | Val Acc: 49.00%
New best model saved with accuracy: 49.00%

Classification Report:
              precision    recall  f1-score   support

       angry       0.37      0.55      0.44       958
   disgusted       0.11      0.80      0.19       111
     fearful       0.36      0.37      0.36      1024
       happy       0.95      0.56      0.70      1774
     neutral       0.54      0.46      0.50      1233
         sad       0.52      0.22      0.31      1247
   surprised       0.55      0.84      0.66       831

    accuracy                           0.49      7178
   macro avg       0.49      0.54      0.45      7178
weighted avg       0.58      0.49      0.50      7178


Epoch 7/100


Training: 100%|██████████| 449/449 [01:11<00:00,  6.32it/s, loss=2.94]
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.13it/s]


Train Loss: 2.4879 | Val Loss: 1.6090 | Val Acc: 50.57%
New best model saved with accuracy: 50.57%

Classification Report:
              precision    recall  f1-score   support

       angry       0.46      0.45      0.46       958
   disgusted       0.07      0.90      0.13       111
     fearful       0.43      0.29      0.35      1024
       happy       0.91      0.68      0.78      1774
     neutral       0.58      0.45      0.51      1233
         sad       0.50      0.30      0.37      1247
   surprised       0.63      0.80      0.71       831

    accuracy                           0.51      7178
   macro avg       0.51      0.55      0.47      7178
weighted avg       0.61      0.51      0.54      7178


Epoch 8/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.35it/s, loss=2.15]
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.12it/s]


Train Loss: 2.4505 | Val Loss: 1.5243 | Val Acc: 54.75%
New best model saved with accuracy: 54.75%

Classification Report:
              precision    recall  f1-score   support

       angry       0.48      0.51      0.49       958
   disgusted       0.14      0.87      0.24       111
     fearful       0.36      0.30      0.33      1024
       happy       0.93      0.69      0.79      1774
     neutral       0.52      0.62      0.57      1233
         sad       0.55      0.28      0.37      1247
   surprised       0.58      0.85      0.69       831

    accuracy                           0.55      7178
   macro avg       0.51      0.59      0.50      7178
weighted avg       0.60      0.55      0.55      7178


Epoch 9/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.35it/s, loss=2.67]
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.19it/s]


Train Loss: 2.4636 | Val Loss: 1.8072 | Val Acc: 45.07%
No improvement for 1/15 epochs

Epoch 10/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.34it/s, loss=2.59]
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.17it/s]


Train Loss: 2.4080 | Val Loss: 1.5505 | Val Acc: 53.48%
No improvement for 2/15 epochs

Epoch 11/100


Training: 100%|██████████| 449/449 [01:11<00:00,  6.32it/s, loss=2.36]
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.09it/s]


Train Loss: 2.3681 | Val Loss: 1.5079 | Val Acc: 54.47%
No improvement for 3/15 epochs

Epoch 12/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.36it/s, loss=1.63]
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.18it/s]


Train Loss: 2.3511 | Val Loss: 1.5109 | Val Acc: 52.28%
No improvement for 4/15 epochs

Epoch 13/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.36it/s, loss=3.58]
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.18it/s]


Train Loss: 2.3172 | Val Loss: 1.5618 | Val Acc: 44.83%
No improvement for 5/15 epochs

Epoch 14/100


Training: 100%|██████████| 449/449 [01:11<00:00,  6.32it/s, loss=3.75]
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.14it/s]


Train Loss: 2.2857 | Val Loss: 1.4175 | Val Acc: 57.29%
New best model saved with accuracy: 57.29%

Classification Report:
              precision    recall  f1-score   support

       angry       0.52      0.53      0.52       958
   disgusted       0.21      0.77      0.32       111
     fearful       0.47      0.25      0.33      1024
       happy       0.90      0.72      0.80      1774
     neutral       0.53      0.63      0.57      1233
         sad       0.54      0.37      0.44      1247
   surprised       0.50      0.90      0.64       831

    accuracy                           0.57      7178
   macro avg       0.52      0.60      0.52      7178
weighted avg       0.60      0.57      0.57      7178


Epoch 15/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.36it/s, loss=2.67]
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.21it/s]


Train Loss: 2.2404 | Val Loss: 1.3876 | Val Acc: 59.56%
New best model saved with accuracy: 59.56%

Classification Report:
              precision    recall  f1-score   support

       angry       0.56      0.51      0.53       958
   disgusted       0.19      0.86      0.31       111
     fearful       0.57      0.23      0.33      1024
       happy       0.90      0.77      0.83      1774
     neutral       0.55      0.60      0.57      1233
         sad       0.48      0.51      0.49      1247
   surprised       0.60      0.85      0.70       831

    accuracy                           0.60      7178
   macro avg       0.55      0.62      0.54      7178
weighted avg       0.63      0.60      0.59      7178


Epoch 16/100


Training: 100%|██████████| 449/449 [01:11<00:00,  6.32it/s, loss=2.49]
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.19it/s]


Train Loss: 2.2479 | Val Loss: 1.4211 | Val Acc: 59.03%
No improvement for 1/15 epochs

Epoch 17/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.35it/s, loss=2.15]
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.15it/s]


Train Loss: 2.1797 | Val Loss: 1.5062 | Val Acc: 53.23%
No improvement for 2/15 epochs

Epoch 18/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.34it/s, loss=2.3] 
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.12it/s]


Train Loss: 2.1917 | Val Loss: 1.3976 | Val Acc: 58.76%
No improvement for 3/15 epochs

Epoch 19/100


Training: 100%|██████████| 449/449 [01:11<00:00,  6.32it/s, loss=2.81]
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.09it/s]


Train Loss: 2.1798 | Val Loss: 1.4014 | Val Acc: 59.47%
No improvement for 4/15 epochs

Epoch 20/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.33it/s, loss=2.63]
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.15it/s]


Train Loss: 2.1151 | Val Loss: 1.4645 | Val Acc: 57.31%
No improvement for 5/15 epochs

Epoch 21/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.33it/s, loss=3.48] 
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.26it/s]


Train Loss: 2.1208 | Val Loss: 1.3649 | Val Acc: 60.02%
New best model saved with accuracy: 60.02%

Classification Report:
              precision    recall  f1-score   support

       angry       0.56      0.50      0.53       958
   disgusted       0.18      0.86      0.30       111
     fearful       0.47      0.40      0.43      1024
       happy       0.93      0.75      0.83      1774
     neutral       0.52      0.70      0.60      1233
         sad       0.60      0.32      0.42      1247
   surprised       0.62      0.86      0.72       831

    accuracy                           0.60      7178
   macro avg       0.55      0.63      0.55      7178
weighted avg       0.64      0.60      0.60      7178


Epoch 22/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.35it/s, loss=2.34] 
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.16it/s]


Train Loss: 2.1548 | Val Loss: 1.4184 | Val Acc: 61.62%
New best model saved with accuracy: 61.62%

Classification Report:
              precision    recall  f1-score   support

       angry       0.54      0.59      0.56       958
   disgusted       0.18      0.85      0.29       111
     fearful       0.58      0.28      0.38      1024
       happy       0.92      0.77      0.84      1774
     neutral       0.60      0.59      0.59      1233
         sad       0.49      0.59      0.54      1247
   surprised       0.72      0.78      0.75       831

    accuracy                           0.62      7178
   macro avg       0.58      0.64      0.56      7178
weighted avg       0.66      0.62      0.62      7178


Epoch 23/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.33it/s, loss=2.5]  
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.19it/s]


Train Loss: 2.1225 | Val Loss: 1.4609 | Val Acc: 56.94%
No improvement for 1/15 epochs

Epoch 24/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.37it/s, loss=1.71]
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.23it/s]


Train Loss: 2.0840 | Val Loss: 1.4023 | Val Acc: 60.25%
No improvement for 2/15 epochs

Epoch 25/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.35it/s, loss=1.39] 
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.14it/s]


Train Loss: 2.0982 | Val Loss: 1.3743 | Val Acc: 60.28%
No improvement for 3/15 epochs

Epoch 26/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.35it/s, loss=1.46]
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.08it/s]


Train Loss: 2.1047 | Val Loss: 1.3910 | Val Acc: 60.88%
No improvement for 4/15 epochs

Epoch 27/100


Training: 100%|██████████| 449/449 [01:11<00:00,  6.32it/s, loss=1.19] 
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.15it/s]


Train Loss: 1.9966 | Val Loss: 1.3562 | Val Acc: 62.40%
New best model saved with accuracy: 62.40%

Classification Report:
              precision    recall  f1-score   support

       angry       0.62      0.48      0.54       958
   disgusted       0.26      0.76      0.39       111
     fearful       0.55      0.33      0.41      1024
       happy       0.90      0.80      0.85      1774
     neutral       0.54      0.71      0.61      1233
         sad       0.50      0.51      0.51      1247
   surprised       0.66      0.79      0.72       831

    accuracy                           0.62      7178
   macro avg       0.57      0.63      0.58      7178
weighted avg       0.64      0.62      0.62      7178


Epoch 28/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.34it/s, loss=3.05] 
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.18it/s]


Train Loss: 2.1405 | Val Loss: 1.4239 | Val Acc: 61.12%
No improvement for 1/15 epochs

Epoch 29/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.35it/s, loss=2.7]  
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.15it/s]


Train Loss: 2.0457 | Val Loss: 1.3745 | Val Acc: 59.84%
No improvement for 2/15 epochs

Epoch 30/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.34it/s, loss=2.87]
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.19it/s]


Train Loss: 1.9786 | Val Loss: 1.3602 | Val Acc: 62.86%
New best model saved with accuracy: 62.86%

Classification Report:
              precision    recall  f1-score   support

       angry       0.49      0.65      0.56       958
   disgusted       0.22      0.75      0.34       111
     fearful       0.51      0.36      0.42      1024
       happy       0.92      0.79      0.85      1774
     neutral       0.60      0.64      0.62      1233
         sad       0.58      0.45      0.51      1247
   surprised       0.67      0.83      0.74       831

    accuracy                           0.63      7178
   macro avg       0.57      0.64      0.58      7178
weighted avg       0.65      0.63      0.63      7178


Epoch 31/100


Training: 100%|██████████| 449/449 [01:11<00:00,  6.32it/s, loss=1.27] 
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.15it/s]


Train Loss: 2.0317 | Val Loss: 1.3340 | Val Acc: 64.53%
New best model saved with accuracy: 64.53%

Classification Report:
              precision    recall  f1-score   support

       angry       0.58      0.59      0.58       958
   disgusted       0.26      0.86      0.40       111
     fearful       0.53      0.36      0.43      1024
       happy       0.86      0.85      0.86      1774
     neutral       0.62      0.63      0.63      1233
         sad       0.53      0.52      0.53      1247
   surprised       0.71      0.81      0.76       831

    accuracy                           0.65      7178
   macro avg       0.59      0.66      0.60      7178
weighted avg       0.65      0.65      0.64      7178


Epoch 32/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.36it/s, loss=1.59] 
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.16it/s]


Train Loss: 1.9889 | Val Loss: 1.3321 | Val Acc: 61.76%
No improvement for 1/15 epochs

Epoch 33/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.35it/s, loss=3.28] 
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.18it/s]


Train Loss: 1.9611 | Val Loss: 1.4061 | Val Acc: 59.06%
No improvement for 2/15 epochs

Epoch 34/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.33it/s, loss=1.96] 
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.21it/s]


Train Loss: 1.9874 | Val Loss: 1.3708 | Val Acc: 62.32%
No improvement for 3/15 epochs

Epoch 35/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.37it/s, loss=2.77] 
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.25it/s]


Train Loss: 1.9842 | Val Loss: 1.3190 | Val Acc: 64.50%
No improvement for 4/15 epochs

Epoch 36/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.35it/s, loss=2.32] 
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.25it/s]


Train Loss: 2.0046 | Val Loss: 1.3033 | Val Acc: 64.27%
No improvement for 5/15 epochs

Epoch 37/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.34it/s, loss=1.64] 
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.19it/s]


Train Loss: 2.0030 | Val Loss: 1.3347 | Val Acc: 63.47%
No improvement for 6/15 epochs

Epoch 38/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.36it/s, loss=1.89] 
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.08it/s]


Train Loss: 1.9007 | Val Loss: 1.2434 | Val Acc: 65.55%
New best model saved with accuracy: 65.55%

Classification Report:
              precision    recall  f1-score   support

       angry       0.55      0.66      0.60       958
   disgusted       0.38      0.81      0.52       111
     fearful       0.55      0.37      0.44      1024
       happy       0.93      0.80      0.86      1774
     neutral       0.56      0.73      0.63      1233
         sad       0.59      0.47      0.52      1247
   surprised       0.71      0.84      0.77       831

    accuracy                           0.66      7178
   macro avg       0.61      0.67      0.62      7178
weighted avg       0.67      0.66      0.65      7178


Epoch 39/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.36it/s, loss=2.06] 
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.22it/s]


Train Loss: 1.8922 | Val Loss: 1.2715 | Val Acc: 65.62%
New best model saved with accuracy: 65.62%

Classification Report:
              precision    recall  f1-score   support

       angry       0.59      0.61      0.60       958
   disgusted       0.35      0.82      0.49       111
     fearful       0.55      0.40      0.46      1024
       happy       0.94      0.79      0.86      1774
     neutral       0.55      0.75      0.64      1233
         sad       0.59      0.48      0.53      1247
   surprised       0.70      0.85      0.77       831

    accuracy                           0.66      7178
   macro avg       0.61      0.67      0.62      7178
weighted avg       0.67      0.66      0.66      7178


Epoch 40/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.36it/s, loss=1.43] 
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.10it/s]


Train Loss: 1.9084 | Val Loss: 1.2834 | Val Acc: 65.98%
New best model saved with accuracy: 65.98%

Classification Report:
              precision    recall  f1-score   support

       angry       0.53      0.68      0.60       958
   disgusted       0.34      0.70      0.46       111
     fearful       0.58      0.39      0.47      1024
       happy       0.94      0.79      0.86      1774
     neutral       0.56      0.75      0.64      1233
         sad       0.62      0.45      0.52      1247
   surprised       0.72      0.85      0.78       831

    accuracy                           0.66      7178
   macro avg       0.61      0.66      0.62      7178
weighted avg       0.68      0.66      0.66      7178


Epoch 41/100


Training: 100%|██████████| 449/449 [01:11<00:00,  6.31it/s, loss=2.16] 
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.14it/s]


Train Loss: 1.8103 | Val Loss: 1.2906 | Val Acc: 65.41%
No improvement for 1/15 epochs

Epoch 42/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.34it/s, loss=0.797]
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.15it/s]


Train Loss: 1.8094 | Val Loss: 1.2794 | Val Acc: 66.06%
New best model saved with accuracy: 66.06%

Classification Report:
              precision    recall  f1-score   support

       angry       0.54      0.67      0.60       958
   disgusted       0.44      0.74      0.55       111
     fearful       0.55      0.41      0.47      1024
       happy       0.93      0.81      0.87      1774
     neutral       0.56      0.75      0.64      1233
         sad       0.60      0.44      0.51      1247
   surprised       0.72      0.84      0.77       831

    accuracy                           0.66      7178
   macro avg       0.62      0.66      0.63      7178
weighted avg       0.67      0.66      0.66      7178


Epoch 43/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.36it/s, loss=3.16] 
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.20it/s]


Train Loss: 1.8235 | Val Loss: 1.2507 | Val Acc: 66.68%
New best model saved with accuracy: 66.68%

Classification Report:
              precision    recall  f1-score   support

       angry       0.56      0.63      0.59       958
   disgusted       0.44      0.77      0.56       111
     fearful       0.55      0.46      0.50      1024
       happy       0.92      0.82      0.87      1774
     neutral       0.59      0.72      0.65      1233
         sad       0.60      0.46      0.52      1247
   surprised       0.70      0.85      0.77       831

    accuracy                           0.67      7178
   macro avg       0.62      0.67      0.64      7178
weighted avg       0.67      0.67      0.66      7178


Epoch 44/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.36it/s, loss=1.33] 
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.05it/s]


Train Loss: 1.7982 | Val Loss: 1.2635 | Val Acc: 67.15%
New best model saved with accuracy: 67.15%

Classification Report:
              precision    recall  f1-score   support

       angry       0.56      0.67      0.61       958
   disgusted       0.42      0.77      0.55       111
     fearful       0.56      0.43      0.48      1024
       happy       0.92      0.82      0.87      1774
     neutral       0.60      0.71      0.65      1233
         sad       0.59      0.51      0.55      1247
   surprised       0.74      0.82      0.78       831

    accuracy                           0.67      7178
   macro avg       0.63      0.68      0.64      7178
weighted avg       0.68      0.67      0.67      7178


Epoch 45/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.33it/s, loss=2.33] 
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.15it/s]


Train Loss: 1.8780 | Val Loss: 1.2743 | Val Acc: 66.43%
No improvement for 1/15 epochs

Epoch 46/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.34it/s, loss=2.85] 
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.20it/s]


Train Loss: 1.8351 | Val Loss: 1.2978 | Val Acc: 65.94%
No improvement for 2/15 epochs

Epoch 47/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.34it/s, loss=0.747]
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.15it/s]


Train Loss: 1.8222 | Val Loss: 1.2303 | Val Acc: 66.94%
No improvement for 3/15 epochs

Epoch 48/100


Training: 100%|██████████| 449/449 [01:11<00:00,  6.32it/s, loss=1.51] 
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.16it/s]


Train Loss: 1.7711 | Val Loss: 1.2683 | Val Acc: 65.87%
No improvement for 4/15 epochs

Epoch 49/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.34it/s, loss=2.18] 
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.17it/s]


Train Loss: 1.7910 | Val Loss: 1.2986 | Val Acc: 65.37%
No improvement for 5/15 epochs

Epoch 50/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.32it/s, loss=1.11] 
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.23it/s]


Train Loss: 1.7950 | Val Loss: 1.2119 | Val Acc: 66.84%
No improvement for 6/15 epochs

Epoch 51/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.35it/s, loss=1.12] 
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.09it/s]


Train Loss: 1.7498 | Val Loss: 1.2581 | Val Acc: 67.00%
No improvement for 7/15 epochs

Epoch 52/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.35it/s, loss=2.73] 
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.10it/s]


Train Loss: 1.7379 | Val Loss: 1.2464 | Val Acc: 67.08%
No improvement for 8/15 epochs

Epoch 53/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.34it/s, loss=1.94] 
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.14it/s]


Train Loss: 1.6981 | Val Loss: 1.2375 | Val Acc: 67.68%
New best model saved with accuracy: 67.68%

Classification Report:
              precision    recall  f1-score   support

       angry       0.57      0.67      0.61       958
   disgusted       0.48      0.74      0.58       111
     fearful       0.55      0.51      0.53      1024
       happy       0.93      0.81      0.87      1774
     neutral       0.59      0.73      0.65      1233
         sad       0.64      0.45      0.52      1247
   surprised       0.73      0.86      0.79       831

    accuracy                           0.68      7178
   macro avg       0.64      0.68      0.65      7178
weighted avg       0.69      0.68      0.68      7178


Epoch 54/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.36it/s, loss=1.26] 
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.11it/s]


Train Loss: 1.7338 | Val Loss: 1.1838 | Val Acc: 68.12%
New best model saved with accuracy: 68.12%

Classification Report:
              precision    recall  f1-score   support

       angry       0.56      0.68      0.61       958
   disgusted       0.73      0.72      0.73       111
     fearful       0.56      0.48      0.52      1024
       happy       0.92      0.84      0.88      1774
     neutral       0.59      0.73      0.65      1233
         sad       0.62      0.46      0.53      1247
   surprised       0.73      0.85      0.79       831

    accuracy                           0.68      7178
   macro avg       0.67      0.68      0.67      7178
weighted avg       0.69      0.68      0.68      7178


Epoch 55/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.35it/s, loss=2.83] 
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.14it/s]


Train Loss: 1.7149 | Val Loss: 1.2827 | Val Acc: 66.90%
No improvement for 1/15 epochs

Epoch 56/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.36it/s, loss=0.69] 
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.23it/s]


Train Loss: 1.6781 | Val Loss: 1.2484 | Val Acc: 67.99%
No improvement for 2/15 epochs

Epoch 57/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.35it/s, loss=1.52] 
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.20it/s]


Train Loss: 1.7300 | Val Loss: 1.2567 | Val Acc: 67.72%
No improvement for 3/15 epochs

Epoch 58/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.33it/s, loss=2.73] 
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.15it/s]


Train Loss: 1.7244 | Val Loss: 1.2296 | Val Acc: 68.14%
New best model saved with accuracy: 68.14%

Classification Report:
              precision    recall  f1-score   support

       angry       0.57      0.68      0.62       958
   disgusted       0.49      0.73      0.59       111
     fearful       0.59      0.45      0.51      1024
       happy       0.92      0.83      0.87      1774
     neutral       0.60      0.73      0.66      1233
         sad       0.61      0.50      0.55      1247
   surprised       0.72      0.86      0.78       831

    accuracy                           0.68      7178
   macro avg       0.64      0.68      0.65      7178
weighted avg       0.69      0.68      0.68      7178


Epoch 59/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.35it/s, loss=2.53] 
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.15it/s]


Train Loss: 1.6791 | Val Loss: 1.2237 | Val Acc: 67.69%
No improvement for 1/15 epochs

Epoch 60/100


Training: 100%|██████████| 449/449 [01:11<00:00,  6.31it/s, loss=2.53] 
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.07it/s]


Train Loss: 1.8016 | Val Loss: 1.2346 | Val Acc: 67.75%
No improvement for 2/15 epochs

Epoch 61/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.36it/s, loss=1.44] 
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.18it/s]


Train Loss: 1.6960 | Val Loss: 1.2128 | Val Acc: 68.14%
No improvement for 3/15 epochs

Epoch 62/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.36it/s, loss=1.11] 
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.16it/s]


Train Loss: 1.6374 | Val Loss: 1.1984 | Val Acc: 68.10%
No improvement for 4/15 epochs

Epoch 63/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.35it/s, loss=2.21] 
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.17it/s]


Train Loss: 1.6972 | Val Loss: 1.2256 | Val Acc: 68.50%
New best model saved with accuracy: 68.50%

Classification Report:
              precision    recall  f1-score   support

       angry       0.57      0.66      0.61       958
   disgusted       0.44      0.75      0.55       111
     fearful       0.56      0.51      0.54      1024
       happy       0.93      0.83      0.88      1774
     neutral       0.61      0.72      0.66      1233
         sad       0.62      0.51      0.56      1247
   surprised       0.77      0.83      0.79       831

    accuracy                           0.69      7178
   macro avg       0.64      0.69      0.66      7178
weighted avg       0.69      0.69      0.69      7178


Epoch 64/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.35it/s, loss=2.47] 
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.20it/s]


Train Loss: 1.6313 | Val Loss: 1.2453 | Val Acc: 66.84%
No improvement for 1/15 epochs

Epoch 65/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.35it/s, loss=1.1]  
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.17it/s]


Train Loss: 1.6515 | Val Loss: 1.1908 | Val Acc: 68.61%
New best model saved with accuracy: 68.61%

Classification Report:
              precision    recall  f1-score   support

       angry       0.58      0.67      0.62       958
   disgusted       0.67      0.72      0.70       111
     fearful       0.56      0.48      0.52      1024
       happy       0.92      0.84      0.88      1774
     neutral       0.59      0.75      0.66      1233
         sad       0.62      0.48      0.54      1247
   surprised       0.77      0.85      0.80       831

    accuracy                           0.69      7178
   macro avg       0.67      0.68      0.67      7178
weighted avg       0.69      0.69      0.68      7178


Epoch 66/100


Training: 100%|██████████| 449/449 [01:11<00:00,  6.32it/s, loss=2.78] 
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.10it/s]


Train Loss: 1.6303 | Val Loss: 1.2387 | Val Acc: 67.64%
No improvement for 1/15 epochs

Epoch 67/100


Training: 100%|██████████| 449/449 [01:11<00:00,  6.32it/s, loss=1.01] 
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.18it/s]


Train Loss: 1.7695 | Val Loss: 1.2335 | Val Acc: 68.18%
No improvement for 2/15 epochs

Epoch 68/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.36it/s, loss=0.795]
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.31it/s]


Train Loss: 1.7240 | Val Loss: 1.2097 | Val Acc: 68.64%
New best model saved with accuracy: 68.64%

Classification Report:
              precision    recall  f1-score   support

       angry       0.60      0.64      0.62       958
   disgusted       0.57      0.72      0.63       111
     fearful       0.54      0.54      0.54      1024
       happy       0.94      0.82      0.87      1774
     neutral       0.58      0.75      0.66      1233
         sad       0.63      0.49      0.55      1247
   surprised       0.76      0.84      0.80       831

    accuracy                           0.69      7178
   macro avg       0.66      0.68      0.67      7178
weighted avg       0.70      0.69      0.69      7178


Epoch 69/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.36it/s, loss=1.04] 
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.25it/s]


Train Loss: 1.6844 | Val Loss: 1.2648 | Val Acc: 67.34%
No improvement for 1/15 epochs

Epoch 70/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.37it/s, loss=2.69] 
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.17it/s]


Train Loss: 1.7005 | Val Loss: 1.2264 | Val Acc: 68.32%
No improvement for 2/15 epochs

Epoch 71/100


Training: 100%|██████████| 449/449 [01:11<00:00,  6.32it/s, loss=0.854]
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.05it/s]


Train Loss: 1.6338 | Val Loss: 1.2283 | Val Acc: 68.47%
No improvement for 3/15 epochs

Epoch 72/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.34it/s, loss=0.781]
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.15it/s]


Train Loss: 1.6445 | Val Loss: 1.2463 | Val Acc: 67.99%
No improvement for 4/15 epochs

Epoch 73/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.36it/s, loss=0.855]
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.24it/s]


Train Loss: 1.6532 | Val Loss: 1.2140 | Val Acc: 68.46%
No improvement for 5/15 epochs

Epoch 74/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.32it/s, loss=0.797]
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.08it/s]


Train Loss: 1.6921 | Val Loss: 1.2206 | Val Acc: 68.67%
New best model saved with accuracy: 68.67%

Classification Report:
              precision    recall  f1-score   support

       angry       0.57      0.68      0.62       958
   disgusted       0.58      0.72      0.65       111
     fearful       0.55      0.50      0.53      1024
       happy       0.93      0.83      0.88      1774
     neutral       0.61      0.72      0.66      1233
         sad       0.63      0.49      0.55      1247
   surprised       0.76      0.85      0.80       831

    accuracy                           0.69      7178
   macro avg       0.66      0.69      0.67      7178
weighted avg       0.69      0.69      0.69      7178


Epoch 75/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.34it/s, loss=2.03] 
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.01it/s]


Train Loss: 1.6165 | Val Loss: 1.2730 | Val Acc: 67.47%
No improvement for 1/15 epochs

Epoch 76/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.34it/s, loss=2.21] 
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.18it/s]


Train Loss: 1.6617 | Val Loss: 1.2498 | Val Acc: 68.47%
No improvement for 2/15 epochs

Epoch 77/100


Training: 100%|██████████| 449/449 [01:11<00:00,  6.32it/s, loss=1.09] 
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.10it/s]


Train Loss: 1.7063 | Val Loss: 1.2243 | Val Acc: 68.82%
New best model saved with accuracy: 68.82%

Classification Report:
              precision    recall  f1-score   support

       angry       0.59      0.66      0.62       958
   disgusted       0.56      0.73      0.64       111
     fearful       0.58      0.49      0.53      1024
       happy       0.93      0.84      0.88      1774
     neutral       0.58      0.75      0.66      1233
         sad       0.61      0.50      0.55      1247
   surprised       0.78      0.84      0.81       831

    accuracy                           0.69      7178
   macro avg       0.66      0.69      0.67      7178
weighted avg       0.70      0.69      0.69      7178


Epoch 78/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.36it/s, loss=2.29] 
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.16it/s]


Train Loss: 1.6763 | Val Loss: 1.2099 | Val Acc: 68.81%
No improvement for 1/15 epochs

Epoch 79/100


Training: 100%|██████████| 449/449 [01:11<00:00,  6.32it/s, loss=1.36] 
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.12it/s]


Train Loss: 1.6135 | Val Loss: 1.1983 | Val Acc: 68.99%
New best model saved with accuracy: 68.99%

Classification Report:
              precision    recall  f1-score   support

       angry       0.59      0.66      0.62       958
   disgusted       0.64      0.71      0.67       111
     fearful       0.56      0.50      0.53      1024
       happy       0.93      0.85      0.88      1774
     neutral       0.59      0.74      0.66      1233
         sad       0.62      0.49      0.55      1247
   surprised       0.76      0.85      0.80       831

    accuracy                           0.69      7178
   macro avg       0.67      0.69      0.67      7178
weighted avg       0.70      0.69      0.69      7178


Epoch 80/100


Training: 100%|██████████| 449/449 [01:11<00:00,  6.32it/s, loss=1.97] 
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.19it/s]


Train Loss: 1.6612 | Val Loss: 1.2259 | Val Acc: 68.28%
No improvement for 1/15 epochs

Epoch 81/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.35it/s, loss=1.91] 
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.19it/s]


Train Loss: 1.6818 | Val Loss: 1.2063 | Val Acc: 69.13%
New best model saved with accuracy: 69.13%

Classification Report:
              precision    recall  f1-score   support

       angry       0.58      0.67      0.62       958
   disgusted       0.60      0.73      0.66       111
     fearful       0.55      0.52      0.54      1024
       happy       0.93      0.84      0.88      1774
     neutral       0.62      0.73      0.67      1233
         sad       0.63      0.49      0.55      1247
   surprised       0.75      0.85      0.79       831

    accuracy                           0.69      7178
   macro avg       0.67      0.69      0.67      7178
weighted avg       0.70      0.69      0.69      7178


Epoch 82/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.35it/s, loss=3.14] 
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.19it/s]


Train Loss: 1.6674 | Val Loss: 1.2423 | Val Acc: 68.45%
No improvement for 1/15 epochs

Epoch 83/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.35it/s, loss=2.73] 
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.09it/s]


Train Loss: 1.6057 | Val Loss: 1.2663 | Val Acc: 67.90%
No improvement for 2/15 epochs

Epoch 84/100


Training: 100%|██████████| 449/449 [01:11<00:00,  6.32it/s, loss=2.39] 
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.06it/s]


Train Loss: 1.6088 | Val Loss: 1.2626 | Val Acc: 68.33%
No improvement for 3/15 epochs

Epoch 85/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.33it/s, loss=0.742]
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.13it/s]


Train Loss: 1.6246 | Val Loss: 1.2221 | Val Acc: 68.96%
No improvement for 4/15 epochs

Epoch 86/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.37it/s, loss=2.26] 
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.11it/s]


Train Loss: 1.6618 | Val Loss: 1.2637 | Val Acc: 68.18%
No improvement for 5/15 epochs

Epoch 87/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.36it/s, loss=0.875]
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.21it/s]


Train Loss: 1.6762 | Val Loss: 1.2621 | Val Acc: 68.06%
No improvement for 6/15 epochs

Epoch 88/100


Training: 100%|██████████| 449/449 [01:11<00:00,  6.32it/s, loss=2.26] 
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.10it/s]


Train Loss: 1.6591 | Val Loss: 1.3142 | Val Acc: 65.12%
No improvement for 7/15 epochs

Epoch 89/100


Training: 100%|██████████| 449/449 [01:10<00:00,  6.35it/s, loss=2.22] 
Validation: 100%|██████████| 113/113 [00:05<00:00, 21.23it/s]


Train Loss: 1.6843 | Val Loss: 1.2592 | Val Acc: 68.18%
No improvement for 8/15 epochs

Epoch 90/100


Training:  58%|█████▊    | 262/449 [00:41<00:29,  6.38it/s, loss=2.37] 