In [41]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import os

import torch
import torch.nn as nn
from PIL import Image
from torchvision.transforms import v2
from torchvision import transforms
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [42]:
!pip install -q transformers

In [43]:
# Load the image array and the label table from the Kaggle input directory.
# NOTE(review): assumes row i of labels.csv describes np_images[i] — confirm.
np_images = np.load("/kaggle/input/face-aligned/np_images.npy")
data = pd.read_csv("/kaggle/input/face-aligned/labels.csv")

In [44]:
# Use the first CUDA device when one is available, otherwise run on the CPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [45]:
# Inspect the dimensions of the loaded image array.
np_images.shape

(15310, 224, 224, 3)

In [46]:
# Display the label table.
data

Unnamed: 0.1,Unnamed: 0,file_name,height,width,bbox,age,race,masked,skintone,emotion,gender
0,0,100013282.jpg,1333,2000,"[934.0000000000097, 144.82228672769534, 238.24...",20-30s,Caucasian,unmasked,mid-light,Neutral,Male
1,1,100016175.jpg,1333,2000,"[1094.0513571635438, 422.91772295627203, 55.45...",20-30s,Caucasian,unmasked,light,Neutral,Male
2,2,10004189.jpg,2000,1333,"[419.93871061403877, 269.1250391680045, 377.19...",20-30s,Mongoloid,unmasked,light,Happiness,Female
3,3,100104575.jpg,1333,2000,"[1490.6909678848915, 676.0000000000097, 37.553...",20-30s,Caucasian,unmasked,mid-light,Neutral,Male
4,4,100104600.jpg,2000,1333,"[549.169724453414, 92.52040334013152, 306.8821...",20-30s,Caucasian,unmasked,mid-light,Happiness,Female
...,...,...,...,...,...,...,...,...,...,...,...
15305,15305,image_2713.jpg,1024,1024,"[244.71717171717162, 176.66666666666654, 587.2...",40-50s,Caucasian,unmasked,mid-light,Happiness,Male
15306,15306,image_3332.jpg,1024,1024,"[265.6338028169031, 174.2441314553992, 584.727...",Senior,Caucasian,unmasked,light,Happiness,Female
15307,15307,image_6905.jpg,1024,1024,"[251.3434343434343, 184.7171717171712, 500.535...",20-30s,Caucasian,unmasked,light,Neutral,Male
15308,15308,image_4080.jpg,1024,1024,"[222.72727272727235, 179.8181818181809, 586.90...",Kid,Caucasian,unmasked,light,Happiness,Female


In [47]:
# Collect the per-class sample counts of every attribute column into `weights`
# (one value_counts Series per column, in column order) and print them.
weights = []
cols = data.columns
# NOTE(review): cols[5:] picks the attribute columns by position; it breaks
# silently if the CSV layout changes — consider listing the names explicitly.
for col in cols[5:]:
    print(col)
    class_counts = data[col].value_counts()
    print(class_counts)
    weights.append(class_counts)

age
age
20-30s      11236
40-50s       1602
Kid           954
Senior        637
Teenager      536
Baby          345
Name: count, dtype: int64
race
race
Mongoloid    7487
Caucasian    7106
Negroid       717
Name: count, dtype: int64
masked
masked
unmasked    14806
masked        504
Name: count, dtype: int64
skintone
skintone
light        10485
mid-light     3688
mid-dark       798
dark           339
Name: count, dtype: int64
emotion
emotion
Happiness    9218
Neutral      4844
Sadness       380
Anger         319
Surprise      303
Disgust       132
Fear          114
Name: count, dtype: int64
gender
gender
Female    10522
Male       4788
Name: count, dtype: int64


In [48]:
# Distinct values found in each attribute column, keyed by column name.
labels_set = {col: data[col].unique() for col in cols[5:]}

# The age entry is replaced by a hand-written list so the groups appear in
# this fixed order instead of order of first appearance.
labels_set['age'] = ['Baby', 'Kid', 'Teenager', '20-30s', '40-50s', 'Senior']
print(labels_set)

{'age': ['Baby', 'Kid', 'Teenager', '20-30s', '40-50s', 'Senior'], 'race': array(['Caucasian', 'Mongoloid', 'Negroid'], dtype=object), 'masked': array(['unmasked', 'masked'], dtype=object), 'skintone': array(['mid-light', 'light', 'mid-dark', 'dark'], dtype=object), 'emotion': array(['Neutral', 'Happiness', 'Anger', 'Surprise', 'Fear', 'Sadness',
       'Disgust'], dtype=object), 'gender': array(['Male', 'Female'], dtype=object)}


In [49]:
from sklearn import preprocessing

# labels[col]     -> integer code of every row for attribute `col`
# labels_dec[col] -> the class name each of those codes stands for, row by row
labels = {}
labels_dec = {}
for col in cols[5:]:
    # A fresh encoder per column; it is not kept after the loop iteration.
    encoder = preprocessing.LabelEncoder()
    codes = encoder.fit_transform(data[col])
    print(codes)
    labels[col] = codes
    # Index the learned class names with the codes to decode them again.
    labels_dec[col] = encoder.classes_[codes]

[0 0 0 ... 0 3 2]
[0 0 1 ... 0 0 0]
[1 1 1 ... 1 1 1]
[3 1 1 ... 1 1 1]
[4 4 3 ... 4 3 4]
[1 1 0 ... 1 0 0]


**Chọn thuộc tính để huấn luyện** (select the attribute to train on)

In [50]:
# Attribute column to train on; used below as the key into `labels`,
# `labels_dec` and `labels_set`, and in the saved model file names.
feature = 'emotion'

In [51]:
# Show the decoded (class-name) labels for every attribute column.
labels_dec

{'age': array(['20-30s', '20-30s', '20-30s', ..., '20-30s', 'Kid', 'Baby'],
       dtype=object),
 'race': array(['Caucasian', 'Caucasian', 'Mongoloid', ..., 'Caucasian',
        'Caucasian', 'Caucasian'], dtype=object),
 'masked': array(['unmasked', 'unmasked', 'unmasked', ..., 'unmasked', 'unmasked',
        'unmasked'], dtype=object),
 'skintone': array(['mid-light', 'light', 'light', ..., 'light', 'light', 'light'],
       dtype=object),
 'emotion': array(['Neutral', 'Neutral', 'Happiness', ..., 'Neutral', 'Happiness',
        'Neutral'], dtype=object),
 'gender': array(['Male', 'Male', 'Female', ..., 'Male', 'Female', 'Female'],
       dtype=object)}

In [52]:
# Show the distinct class names recorded per attribute column.
labels_set

{'age': ['Baby', 'Kid', 'Teenager', '20-30s', '40-50s', 'Senior'],
 'race': array(['Caucasian', 'Mongoloid', 'Negroid'], dtype=object),
 'masked': array(['unmasked', 'masked'], dtype=object),
 'skintone': array(['mid-light', 'light', 'mid-dark', 'dark'], dtype=object),
 'emotion': array(['Neutral', 'Happiness', 'Anger', 'Surprise', 'Fear', 'Sadness',
        'Disgust'], dtype=object),
 'gender': array(['Male', 'Female'], dtype=object)}

In [53]:
# Show the integer-encoded labels for every attribute column.
labels

{'age': array([0, 0, 0, ..., 0, 3, 2]),
 'race': array([0, 0, 1, ..., 0, 0, 0]),
 'masked': array([1, 1, 1, ..., 1, 1, 1]),
 'skintone': array([3, 1, 1, ..., 1, 1, 1]),
 'emotion': array([4, 4, 3, ..., 4, 3, 4]),
 'gender': array([1, 1, 0, ..., 1, 0, 0])}

In [54]:
# Show the encoded labels of the attribute selected for training.
labels[feature]

array([4, 4, 3, ..., 4, 3, 4])

In [55]:
# Preprocessing applied to every image before it reaches the model:
# resize to 224x224, then convert the PIL image into a tensor.
# No random augmentation is enabled.
mytransform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

In [56]:
class CustomImageDataset(Dataset):
    """Dataset that pairs image arrays with their labels.

    Despite its name, ``image_paths`` is indexed to obtain pixel data that is
    handed to ``Image.fromarray``; no files are opened by this class.

    Args:
        image_paths: Indexable collection of image arrays.
        labels: Indexable collection of labels, aligned with ``image_paths``.
        transform: Optional callable applied to the PIL image of each sample.
    """

    def __init__(self, image_paths, labels, transform=None):
        self.image_paths, self.labels, self.transform = image_paths, labels, transform

    def __len__(self):
        """Return the number of samples, i.e. the length of ``image_paths``."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for sample ``idx``; the label is a tensor."""
        # Cast to uint8 before building the PIL image, then force 3-channel RGB.
        pixels = self.image_paths[idx].astype(np.uint8)
        image = Image.fromarray(pixels).convert('RGB')
        raw_label = self.labels[idx]

        if self.transform:
            image = self.transform(image)

        return image, torch.tensor(raw_label)

In [57]:
# Build the dataset for the selected attribute. `image_paths` receives the
# in-memory image array, not file paths.
dataset = CustomImageDataset(
    image_paths=np_images,
    labels=labels[feature],
    transform=mytransform,
)

In [58]:
# Fetch the first sample to check the (image, label) pair the dataset returns.
dataset[0]

(tensor([[[0.0627, 0.0627, 0.0627,  ..., 0.3569, 0.3647, 0.3647],
          [0.0627, 0.0627, 0.0627,  ..., 0.3647, 0.3569, 0.3686],
          [0.0627, 0.0627, 0.0627,  ..., 0.3647, 0.3529, 0.3647],
          ...,
          [0.2392, 0.2706, 0.2902,  ..., 0.8118, 0.8118, 0.8118],
          [0.2353, 0.2745, 0.2941,  ..., 0.8157, 0.8118, 0.8118],
          [0.2431, 0.2627, 0.2863,  ..., 0.8118, 0.8196, 0.8157]],
 
         [[0.0627, 0.0627, 0.0627,  ..., 0.3059, 0.3020, 0.3020],
          [0.0627, 0.0627, 0.0627,  ..., 0.3137, 0.3020, 0.3020],
          [0.0627, 0.0627, 0.0627,  ..., 0.3137, 0.3020, 0.3020],
          ...,
          [0.1490, 0.1686, 0.1843,  ..., 0.7020, 0.6980, 0.6980],
          [0.1451, 0.1686, 0.1882,  ..., 0.6980, 0.6980, 0.6941],
          [0.1451, 0.1608, 0.1765,  ..., 0.6941, 0.6980, 0.6980]],
 
         [[0.0627, 0.0627, 0.0627,  ..., 0.2392, 0.2392, 0.2392],
          [0.0627, 0.0627, 0.0627,  ..., 0.2471, 0.2353, 0.2392],
          [0.0627, 0.0627, 0.0627,  ...,

In [59]:
# Map each encoded class id to its class name. Zipping the per-row arrays
# visits every sample, but duplicate ids collapse to one entry per class.
id2label = dict(zip(labels[feature], labels_dec[feature]))
# Inverse mapping: class name -> encoded id. Plain loop variables are used on
# purpose: a loop target written as `labels[feature]` is a subscript
# assignment, so it would overwrite the encoded label array stored in
# `labels` with a class-name string on every iteration.
label2id = {name: idx for idx, name in id2label.items()}

In [60]:
# Show the encoded-id -> class-name mapping.
id2label

{4: 'Neutral',
 3: 'Happiness',
 0: 'Anger',
 6: 'Surprise',
 2: 'Fear',
 5: 'Sadness',
 1: 'Disgust'}

In [61]:
# Show the class-name -> encoded-id mapping.
label2id

{'Neutral': 4,
 'Happiness': 3,
 'Anger': 0,
 'Surprise': 6,
 'Fear': 2,
 'Sadness': 5,
 'Disgust': 1}

In [62]:
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
from transformers import ViTForImageClassification, ViTFeatureExtractor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tqdm import tqdm

def train(model, dataloader, optimizer, criterion, device):
    """Run one training pass over ``dataloader`` and return the mean batch loss.

    Args:
        model: Model whose forward output exposes a ``logits`` attribute.
        dataloader: Iterable of ``(images, labels)`` batches.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss callable invoked as ``criterion(logits, labels)``.
        device: Device the batches are moved to before the forward pass.

    Returns:
        Sum of the per-batch loss values divided by ``len(dataloader)``.
    """
    model.train()
    loss_total = 0.0

    for images, labels in tqdm(dataloader, desc="Training"):
        images = images.to(device)
        labels = labels.to(device)

        # Clear gradients left over from the previous batch before backprop.
        optimizer.zero_grad()

        logits = model(images).logits
        batch_loss = criterion(logits, labels)
        batch_loss.backward()
        optimizer.step()

        loss_total += batch_loss.item()

    return loss_total / len(dataloader)

def evaluate(model, dataloader, device):
    """Compute the classification accuracy of ``model`` over ``dataloader``.

    Args:
        model: Model whose forward output exposes a ``logits`` attribute.
        dataloader: Iterable of ``(images, labels)`` batches.
        device: Device the batches are moved to before the forward pass.

    Returns:
        The value of ``accuracy_score`` over all collected labels and
        predictions.
    """
    model.eval()
    all_preds, all_labels = [], []

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for images, labels in tqdm(dataloader, desc="Evaluating"):
            images = images.to(device)
            labels = labels.to(device)

            logits = model(images).logits
            # Predicted class = index of the largest logit in each row.
            batch_preds = torch.argmax(logits, dim=1)

            all_preds.extend(batch_preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    return accuracy_score(all_labels, all_preds)

In [63]:
# Split the data into a training set and a test set.
train_size = 0.8
# Split sample *indices* rather than the Dataset object. Given the Dataset
# itself, train_test_split indexes every sample up front and returns plain
# lists of already-transformed tensors, which keeps the whole decoded dataset
# in memory and freezes the transform output at split time. Index-based
# Subsets keep loading lazy while using the same stratification and seed.
sample_indices = np.arange(len(dataset))
train_indices, test_indices = train_test_split(
    sample_indices,
    train_size=train_size,
    stratify=labels_dec[feature],
    random_state=42,
)
train_dataset = torch.utils.data.Subset(dataset, train_indices.tolist())
test_dataset = torch.utils.data.Subset(dataset, test_indices.tolist())

In [64]:
# Number of distinct classes of the selected attribute (used as num_labels below).
len(set(labels_set[feature]))

7

In [65]:
# Data loaders: only the training loader shuffles.
batch_size = 16
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=4,
)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=4,
)

# Pretrained ViT checkpoint with a classification head sized to the number of
# distinct classes of the selected attribute.
model_name = 'google/vit-base-patch16-224-in21k'
model = ViTForImageClassification.from_pretrained(
    model_name,
    num_labels=len(set(labels_set[feature])),
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
criterion = torch.nn.CrossEntropyLoss()

# Train for a fixed number of epochs; after each one, score the model on the
# held-out split and write a checkpoint named after the attribute and epoch.
epochs = 10
for epoch in range(epochs):
    train_loss = train(model, train_dataloader, optimizer, criterion, device)
    accuracy = evaluate(model, test_dataloader, device)

    print(f"Epoch {epoch + 1}/{epochs} - Train Loss: {train_loss:.4f} - Test Accuracy: {accuracy:.4f}")
    path = f'/kaggle/working/model_{feature}_{epoch+1}.pth'
    # The whole model object is saved, not just its state_dict.
    torch.save(model, path)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training: 100%|██████████| 766/766 [03:54<00:00,  3.26it/s]
Evaluating: 100%|██████████| 192/192 [00:21<00:00,  8.98it/s]


Epoch 1/10 - Train Loss: 0.5775 - Test Accuracy: 0.8367


Training: 100%|██████████| 766/766 [03:55<00:00,  3.25it/s]
Evaluating: 100%|██████████| 192/192 [00:21<00:00,  9.00it/s]


Epoch 2/10 - Train Loss: 0.4129 - Test Accuracy: 0.8338


Training: 100%|██████████| 766/766 [03:55<00:00,  3.25it/s]
Evaluating: 100%|██████████| 192/192 [00:21<00:00,  8.98it/s]


Epoch 3/10 - Train Loss: 0.3005 - Test Accuracy: 0.8387


Training: 100%|██████████| 766/766 [03:55<00:00,  3.25it/s]
Evaluating: 100%|██████████| 192/192 [00:21<00:00,  9.00it/s]


Epoch 4/10 - Train Loss: 0.1972 - Test Accuracy: 0.8272


Training: 100%|██████████| 766/766 [03:55<00:00,  3.25it/s]
Evaluating: 100%|██████████| 192/192 [00:21<00:00,  8.99it/s]


Epoch 5/10 - Train Loss: 0.1454 - Test Accuracy: 0.8383


Training: 100%|██████████| 766/766 [03:55<00:00,  3.25it/s]
Evaluating: 100%|██████████| 192/192 [00:21<00:00,  8.98it/s]


Epoch 6/10 - Train Loss: 0.1036 - Test Accuracy: 0.8390


Training: 100%|██████████| 766/766 [03:55<00:00,  3.25it/s]
Evaluating: 100%|██████████| 192/192 [00:21<00:00,  8.98it/s]


Epoch 7/10 - Train Loss: 0.0774 - Test Accuracy: 0.8357


Training: 100%|██████████| 766/766 [03:55<00:00,  3.25it/s]
Evaluating: 100%|██████████| 192/192 [00:21<00:00,  8.99it/s]


Epoch 8/10 - Train Loss: 0.0597 - Test Accuracy: 0.8194


Training: 100%|██████████| 766/766 [03:55<00:00,  3.25it/s]
Evaluating: 100%|██████████| 192/192 [00:21<00:00,  9.09it/s]


Epoch 9/10 - Train Loss: 0.0578 - Test Accuracy: 0.8374


Training: 100%|██████████| 766/766 [03:54<00:00,  3.26it/s]
Evaluating: 100%|██████████| 192/192 [00:21<00:00,  9.06it/s]


Epoch 10/10 - Train Loss: 0.0525 - Test Accuracy: 0.8367


In [66]:
# Save the model once more under a file name without the epoch suffix.
# torch.save receives the model object itself, not a state_dict.
path = f'/kaggle/working/model_{feature}.pth'
torch.save(model, path)

In [67]:
# img_test = Image.open('/kaggle/input/fac-data-crop/output_images/img_10226.jpg')

In [68]:
# img_test

In [69]:
# img_test = mytransform(img_test)

In [70]:
# pred = model(img_test.unsqueeze(0).to(device))

In [71]:
# pred.logits[0]

In [72]:
# pred[0].argmax().item()

In [73]:
# torch.nn.functional.softmax(pred.logits[0], dim=0)

In [74]:
# id2label[pred[0].argmax().item()]