In [None]:
# Mount Google Drive (Colab only) so that the dataset and checkpoint paths
# under /content/drive used by the cells below are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:

# Hyperparameters and paths shared by the cells below.
img_size = 48  # side length (pixels) of the square model input
batch_size = 32  # samples per DataLoader batch
epochs = 20  # number of passes over the training set
learning_rate = 3e-4  # initial learning rate handed to AdamW
step_size = 5  # StepLR: number of epochs between learning-rate decays
gamma = 0.1  # StepLR: multiplicative decay factor applied at each decay
data_dir = '/content/drive/MyDrive/archive'  # dataset root; 'train' and 'test' subfolders are read from it
save_path = '/content/drive/MyDrive/archive/emotion_mv.pth'  # where the best model state_dict is written

# Emotion Recognition with MobileViT

This notebook fine-tunes MobileViT on a 7-class emotion dataset (angry, disgusted, fearful, happy, neutral, sad, surprised) and evaluates test accuracy.

**Modify the values in the hyperparameters cell (`img_size`, `batch_size`, `epochs`, `learning_rate`, `step_size`, `gamma`) to experiment with different configurations.**

In [None]:

import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import os

# Compute the mean and standard deviation over every pixel of the training
# images; they are used later to normalise inputs so that the pixel
# distribution is more consistent across the dataset.
tmp_tf = transforms.Compose([
    transforms.Grayscale(num_output_channels=1),
    transforms.Resize((img_size, img_size)),
    transforms.ToTensor(),  # maps pixel values from 0~255 to 0~1
])

tmp_ds     = datasets.ImageFolder(os.path.join(data_dir, 'train'), transform=tmp_tf)  # images under <data_dir>/train
tmp_loader = DataLoader(tmp_ds, batch_size=batch_size, shuffle=False, num_workers=4)

# Accumulate in float64: the running sums cover every pixel of the training
# set, and float32 accumulators lose low-order digits at that magnitude.
sum_c    = torch.zeros(1, dtype=torch.float64)  # running sum of pixel values (for the mean)
sum2_c   = torch.zeros(1, dtype=torch.float64)  # running sum of squared pixel values (for the variance)
n_pixels = 0  # total number of pixels seen

for imgs, _ in tmp_loader:
    B, C, H, W = imgs.shape  # batch size, channels, image height and width
    pixels = imgs.view(B, C, -1).double()  # flatten each image's spatial dimensions
    sum_c    += pixels.sum(dim=[0, 2])  # add this batch's per-channel pixel sum
    sum2_c   += (pixels ** 2).sum(dim=[0, 2])
    n_pixels += B * H * W  # pixels per channel in this batch

mean_c = sum_c / n_pixels
# Var[x] = E[x^2] - E[x]^2; clamp at zero so that rounding error can never
# hand a tiny negative value to sqrt() and produce NaN.
var_c = (sum2_c / n_pixels - mean_c ** 2).clamp_min(0.0)

gray_mean = mean_c.item()
gray_std  = var_c.sqrt().item()
print(f'Computed gray mean: {gray_mean:.4f}, std: {gray_std:.4f}')



Computed gray mean: 0.5077, std: 0.2550


In [None]:

import os
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import timm
from PIL import Image

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)

Using device: cuda


In [None]:
# Training-time preprocessing with augmentation.
train_tf = transforms.Compose([
    transforms.Grayscale(num_output_channels=1),
    transforms.RandomResizedCrop(img_size),  # random crop resized to img_size, for robustness to framing
    transforms.RandomHorizontalFlip(),  # random left/right flip, for robustness to mirroring
    transforms.ToTensor(),  # maps 0~255 to 0~1
    transforms.Normalize([gray_mean], [gray_std]),  # standardise with the training-set statistics
])
# Evaluation / inference preprocessing (no augmentation).
# Resize the shorter side to img_size before the centre crop so that images
# of any resolution are scaled down to the model input. A bare CenterCrop
# would keep only the central img_size x img_size pixels of a larger image.
# For inputs that are already img_size x img_size both steps are no-ops.
test_tf = transforms.Compose([
    transforms.Grayscale(num_output_channels=1),
    transforms.Resize(img_size),
    transforms.CenterCrop(img_size),
    transforms.ToTensor(),
    transforms.Normalize([gray_mean], [gray_std]),
])
# ImageFolder uses each emotion's sub-folder name as the class label.
train_dataset = datasets.ImageFolder(os.path.join(data_dir, 'train'), transform=train_tf)
test_dataset  = datasets.ImageFolder(os.path.join(data_dir, 'test'),  transform=test_tf)
# shuffle=True randomises the sample order for training; evaluation keeps a fixed order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,  num_workers=4)
test_loader  = DataLoader(test_dataset,  batch_size=batch_size, shuffle=False, num_workers=4)

classes = train_dataset.classes
num_classes = len(classes)  # number of emotion categories
print('Classes:', classes)

Classes: ['angry', 'disgusted', 'fearful', 'happy', 'neutral', 'sad', 'surprised']




In [None]:

# Build MobileViT-S through timm with a classification head sized for the
# emotion classes and a single-channel (grayscale) input.
model = timm.create_model(
    'mobilevit_s',
    pretrained=True,
    num_classes=num_classes,  # number of output classes
    img_size=img_size,  # input image size
    in_chans=1,
)
model = model.to(device)  # move the parameters to the selected device
print(model)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors:   0%|          | 0.00/22.4M [00:00<?, ?B/s]

ByobNet(
  (stem): ConvNormAct(
    (conv): Conv2d(1, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (bn): BatchNormAct2d(
      16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True
      (drop): Identity()
      (act): SiLU(inplace=True)
    )
  )
  (stages): Sequential(
    (0): Sequential(
      (0): BottleneckBlock(
        (conv1_1x1): ConvNormAct(
          (conv): Conv2d(16, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNormAct2d(
            64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True
            (drop): Identity()
            (act): SiLU(inplace=True)
          )
        )
        (conv2_kxk): ConvNormAct(
          (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=64, bias=False)
          (bn): BatchNormAct2d(
            64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True
            (drop): Identity()
            (act): SiLU(inplace=True)
   

In [None]:

# Loss, optimiser and learning-rate schedule.
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=1e-2)
# StepLR multiplies the learning rate by `gamma` once every `step_size` epochs.
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma)

# NOTE(review): the checkpoint is selected on test-set accuracy, so the
# reported "best" figure is optimistic; a held-out validation split would
# give an unbiased estimate.
best_acc = 0.0
for epoch in range(1, epochs + 1):
    # ---- training pass ----
    model.train()
    running_loss = 0.0
    for imgs, labels in train_loader:
        imgs = imgs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(imgs)
        loss = criterion(outputs, labels)  # loss for this batch
        loss.backward()
        optimizer.step()

        # Weight the batch loss by the batch size so that dividing by the
        # dataset length below yields a per-sample average.
        running_loss += loss.item() * imgs.size(0)
    scheduler.step()  # advance the LR schedule once per epoch
    epoch_loss = running_loss / len(train_dataset)

    # ---- evaluation pass ----
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for imgs, labels in test_loader:
            imgs = imgs.to(device)
            labels = labels.to(device)
            outputs = model(imgs)
            preds = outputs.argmax(dim=1)  # predicted class index per sample
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    test_acc = correct / total * 100
    print(f"Epoch {epoch:02d}: Train Loss={epoch_loss:.4f}, Test Acc={test_acc:.2f}%")

    # Keep only the weights of the best-scoring epoch so far.
    if test_acc > best_acc:
        best_acc = test_acc
        torch.save(model.state_dict(), save_path)
        print(f" New best model saved with acc={best_acc:.2f}%")

print(f"Best Test Accuracy: {best_acc:.2f}%")

In [None]:

# Restore the best checkpoint written during training and switch to inference mode.
model.load_state_dict(torch.load(save_path, map_location=device))
model.eval()

def predict(img_path):
    """Classify the emotion shown in a single image file.

    Args:
        img_path: Path to an image readable by PIL.

    Returns:
        The predicted class name, taken from ``classes``.
    """
    # Use a context manager so the underlying file handle is released as
    # soon as the pixel data has been converted to grayscale.
    with Image.open(img_path) as src:
        img = src.convert('L')
    x = test_tf(img).unsqueeze(0).to(device)  # add a batch dimension
    # Do not rely on whichever mode an earlier cell left the model in.
    model.eval()
    with torch.no_grad():
        logits = model(x)
    idx = logits.argmax(dim=1).item()
    return classes[idx]
