In [1]:
#step 1 import image
%matplotlib inline
import torchvision.datasets
import math
import torchvision.transforms as tvt
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import wget
import zipfile
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.datasets as datasets
import torchvision.models as models
import torchvision.transforms as tfms
from torch.utils.data import DataLoader, Subset, Dataset, random_split
from torchvision.utils import make_grid
from PIL import Image
from time import time
from tqdm import tqdm
from transformers import ViTConfig, ViTModel
from super_con import SupConLoss

# Single source of truth for the GPU index: it selects the torch device here and
# is also the row read from the nvidia-smi output by the utilization tracker.
device_id = 1
if torch.cuda.is_available() and torch.cuda.device_count() > device_id:
    device = torch.device(f'cuda:{device_id}')
else:
    # Fall back explicitly instead of failing later with an opaque CUDA error.
    print(f'cuda:{device_id} is not available; falling back to CPU')
    device = torch.device('cpu')

data_root = "../celeba/datasets"

# One-off CelebA download. Disabled by default (it was previously parked inside a
# bare string literal); flip the flag to fetch and unpack the dataset files.
DOWNLOAD_CELEBA = False
if DOWNLOAD_CELEBA:
    base_url = "https://graal.ift.ulaval.ca/public/celeba/"

    file_list = [
        "img_align_celeba.zip",
        "list_attr_celeba.txt",
        "identity_CelebA.txt",
        "list_bbox_celeba.txt",
        "list_landmarks_align_celeba.txt",
        "list_eval_partition.txt",
    ]

    # Path to folder with the dataset
    dataset_folder = f"{data_root}/celeba"
    os.makedirs(dataset_folder, exist_ok=True)

    for file in file_list:
        url = f"{base_url}/{file}"
        if not os.path.exists(f"{dataset_folder}/{file}"):
            wget.download(url, f"{dataset_folder}/{file}")

    with zipfile.ZipFile(f"{dataset_folder}/img_align_celeba.zip", "r") as ziphandler:
        ziphandler.extractall(dataset_folder)

image_size = 64
batch_size = 256

# The train and test splits share one preprocessing pipeline:
# resize -> tensor -> per-channel (x - 0.5) / 0.5.
celeba_transform = tvt.Compose([
    tvt.Resize((image_size, image_size)),
    tvt.ToTensor(),
    tvt.Normalize(mean=[0.5, 0.5, 0.5],
                  std=[0.5, 0.5, 0.5])
])

dataset = torchvision.datasets.CelebA(data_root, split='train', transform=celeba_transform)
test_dataset = torchvision.datasets.CelebA(data_root, split='test', transform=celeba_transform)


dataset_size = len(dataset)
train_size = int(0.85 * dataset_size)
val_size = dataset_size - train_size

# This cell runs before seed_everything() is called, so the split gets its own
# seeded generator; otherwise the train/validation partition changes on every run.
split_generator = torch.Generator().manual_seed(0)
train_dataset, val_dataset = random_split(dataset, [train_size, val_size], generator=split_generator)
print(len(train_dataset), len(val_dataset))
training_data_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
valid_data_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False, drop_last=False)
test_data_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False, drop_last=False)
print('Done')

138354 24416
Done


In [2]:
from transformers import ViTConfig, ViTModel

# Sanity check: push one test batch through a small, randomly initialised ViT and
# display the shape of the token embeddings it returns.
configuration = ViTConfig(num_hidden_layers = 8, num_attention_heads = 8,
                          intermediate_size = 768, image_size= 64, patch_size = 16)
model = ViTModel(configuration)
configuration = model.config
t = iter(test_data_loader)
img, label = next(t)
# Shape inspection only, so skip building an autograd graph for the whole batch.
with torch.no_grad():
    y = model(img)
y.last_hidden_state.shape

torch.Size([256, 17, 768])

In [3]:
def choose_value_patch(attention, value, k):
    """
    Pick the value vectors of the k tokens with the highest head-averaged attention.

    Parameters:
    - attention (tensor): Shape [Batch, Head, token]
    - value (tensor): Shape [Batch, token, dim]
    - k (int): How many tokens to keep per sample

    Returns:
    - tensor of shape [Batch, k, dim], ordered by decreasing averaged attention
    """
    # Collapse the head axis so each token gets a single score per sample.
    head_mean = attention.mean(dim=1)

    # Positions of the k best-scoring tokens, per sample.
    best_tokens = head_mean.topk(k, dim=-1).indices

    # Broadcast the token positions over the feature axis and gather along tokens.
    gather_index = best_tokens.unsqueeze(-1).expand(-1, -1, value.shape[-1])
    return torch.gather(value, 1, gather_index)

    
class Last_Attention(nn.Module):
    """
    Multi-head self-attention layer with optional batch-statistic normalisation of
    the queries and keys, plus a projection head over the most-attended values.

    Parameters:
    - normalize: when truthy, q and k are standardised (batch statistics while
      training, running statistics otherwise) and replaced by their absolute
      value before the attention scores are computed.

    The running statistics are registered buffers of shape (1, 8, 17, 96), so the
    layer expects 17 tokens of width 768 split over 8 heads.
    """

    def __init__(self, normalize):
        super(Last_Attention, self).__init__()
        self.normalize = normalize
        self.p_dim = 2            # number of top-attended tokens fed to the projector
        self.emb_size = 768
        self.head = 8
        self.temperature = 1
        self.head_dim = self.emb_size // self.head
        self.Q = nn.Linear(768, 768)
        self.K = nn.Linear(768, 768)
        self.V = nn.Linear(768, 768)
        self.projection = nn.Linear(768, 768)

        self.soft_max = nn.Softmax(dim=-1)
        self.projector = nn.Sequential(
            nn.Linear(self.p_dim * 768, 512, bias=False),
            nn.ReLU(),
            nn.Linear(512, 128, bias=False),
        )
        self.momentum = 0.1
        # Real nn.Module buffers: they follow .to()/.cuda() and appear in state_dict().
        # (The class used to override register_buffer with a plain setattr, which
        # silently turned these into ordinary attributes.)
        self.register_buffer('running_mean_q', torch.zeros(1, 8, 17, 96))
        self.register_buffer('running_std_q', torch.ones(1, 8, 17, 96))
        self.register_buffer('running_mean_k', torch.zeros(1, 8, 17, 96))
        self.register_buffer('running_std_k', torch.ones(1, 8, 17, 96))

    def forward(self, x, training):
        """
        Parameters:
        - x (tensor): Shape [Batch, token, 768]
        - training (bool): True to normalise with the current batch statistics and
          update the running statistics; False to use the running statistics.

        Returns:
        - out (tensor): attention output after the output projection, [Batch, token, 768]
        - z (tensor): L2-normalised projection of the top `p_dim` value vectors, [Batch, 128]
        - atten (tensor): softmax attention weights, [Batch, Head, token, token]
        """
        B, N, C = x.shape
        origin_k = self.K(x)
        origin_q = self.Q(x)
        origin_v = self.V(x)

        # [B, N, C] -> [B, head, N, C // head]
        q = origin_q.reshape(B, N, self.head, C // self.head).permute(0, 2, 1, 3)
        k = origin_k.reshape(B, N, self.head, C // self.head).permute(0, 2, 1, 3)
        v = origin_v.reshape(B, N, self.head, C // self.head).permute(0, 2, 1, 3)

        # Previously the normalisation below ran unconditionally and raised a
        # NameError when `normalize` was False; it is now skipped in that case.
        # NOTE(review): the abs() is treated as part of the normalisation path -
        # confirm that plain attention is the intended un-normalised baseline.
        if self.normalize:
            if training:
                q_mean, q_std = torch.mean(q, 0, keepdim=True), torch.std(q, 0, keepdim=True)
                k_mean, k_std = torch.mean(k, 0, keepdim=True), torch.std(k, 0, keepdim=True)

                # Exponential moving average of the batch statistics. Computed
                # without grad so the buffers never keep the batch graph alive.
                # .to(q.device) keeps checkpoints pickled with the old
                # plain-attribute statistics usable.
                with torch.no_grad():
                    keep = 1 - self.momentum
                    self.running_mean_q = keep * self.running_mean_q.to(q.device) + self.momentum * q_mean
                    self.running_std_q = keep * self.running_std_q.to(q.device) + self.momentum * q_std
                    self.running_mean_k = keep * self.running_mean_k.to(q.device) + self.momentum * k_mean
                    self.running_std_k = keep * self.running_std_k.to(q.device) + self.momentum * k_std
            else:
                q_mean = self.running_mean_q.to(q.device)
                q_std = self.running_std_q.to(q.device)
                k_mean = self.running_mean_k.to(q.device)
                k_std = self.running_std_k.to(q.device)

            q = torch.abs((q - q_mean) / q_std)
            k = torch.abs((k - k_mean) / k_std)

        attention = (q @ k.transpose(-2, -1)) * (self.head_dim ** (-0.5))
        atten = self.soft_max(attention / self.temperature)
        out = (atten @ v).transpose(1, 2).reshape(B, N, C)
        out = self.projection(out)

        # Attention paid by token 0 to every token, per head: [B, head, N].
        attentions = atten[:, :, 0, :]
        # Back to [B, N, C] so whole-token value vectors can be selected.
        v = v.transpose(1, 2).reshape(B, N, C)
        mst_val = choose_value_patch(attentions, v, self.p_dim)
        mst_val = mst_val.reshape(B, -1)
        mst_val = self.projector(mst_val)
        z = F.normalize(mst_val, dim=1)
        return out, z, atten


    
class Last_ATBlock(nn.Module):
    """
    Pre-norm transformer block: LayerNorm -> Last_Attention -> residual add,
    then LayerNorm -> two-layer GELU feed-forward -> residual add.
    """

    def __init__(self, normalize):
        super().__init__()
        width = 768
        self.norm = nn.LayerNorm(width)
        self.attention = Last_Attention(normalize)
        self.norm2 = nn.LayerNorm(width)
        self.feedforward = nn.Sequential(
            nn.Linear(768, 768),
            nn.GELU(),
            nn.Linear(768, 768),
        )

    def forward(self, x, training):
        """Return (block output, projected value embedding, attention weights)."""
        # Attention sub-layer with a skip connection around it.
        attended, vz, att = self.attention(self.norm(x), training)
        hidden = attended + x

        # Feed-forward sub-layer with its own skip connection.
        output = self.feedforward(self.norm2(hidden)) + hidden
        return output, vz, att

In [4]:
class VisionTransformer(nn.Module):
    """
    Binary classifier: a ViT backbone, one extra Last_ATBlock, and a sigmoid MLP
    head applied to token 0 of the block output.
    """

    def __init__(self, vit, normalize):
        super(VisionTransformer, self).__init__()
        self.vit = vit
        self.last_encoder = Last_ATBlock(normalize)
        classifier_layers = [
            nn.Linear(768, 768),
            nn.ReLU(),
            nn.Linear(768, 1),
            nn.Sigmoid(),
        ]
        self.seq = nn.Sequential(*classifier_layers)

    def forward(self, x, training):
        """Return (sigmoid prediction of shape [Batch, 1], value embedding, attention weights)."""
        tokens = self.vit(x).last_hidden_state
        tokens, vz, att = self.last_encoder(tokens, training)
        # The head only sees the first token of each sequence.
        prediction = self.seq(tokens[:, 0])
        return prediction, vz, att

In [5]:
import random
import numpy as np
import torch
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import confusion_matrix
from collections import OrderedDict
import seaborn as sns

def seed_everything(seed):
    """
    Seed the Python, NumPy and PyTorch random generators and put cuDNN into
    deterministic, non-benchmarking mode so that runs are repeatable.
    """
    # cuDNN: always pick deterministic kernels and skip the autotuner.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    # Same seed for every generator in use.
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed):
        apply_seed(seed)


def test_epoch(model, p=False):
    model.eval()
    test_pred = []
    test_gt = []
    sense_gt = []
    a0_predic = []
    a0_gt = []
    a1_predic = []
    a1_gt = []
    criterion = nn.BCELoss()
    testing_loss = 0.0
    for step, (test_input, attributes) in enumerate(test_data_loader):
        sensitive, test_target = attributes[:,20], attributes[:,9]
        test_input = test_input.to(device)
        test_target = test_target.to(device)
        gt = test_target.detach().cpu().numpy()
        sen = sensitive.detach().cpu().numpy()
        test_gt.extend(gt)
        sense_gt.extend(sen)

        with torch.no_grad():
            test_pred_, _ , att= model(test_input, False)
            test_pred.extend(torch.round(test_pred_.squeeze(1)).detach().cpu().numpy())
            test_target = test_target.float().to(device)
            test_loss = criterion(test_pred_, test_target.unsqueeze(1))
            testing_loss += test_loss.item()
    t_loss = testing_loss/len(test_data_loader)

    for i in range(len(sense_gt)):
        if sense_gt[i] == 0:
            a0_predic.append(test_pred[i])
            a0_gt.append(test_gt[i])
        else:
            a1_predic.append(test_pred[i])
            a1_gt.append(test_gt[i])
    a0_CM = confusion_matrix(a0_gt, a0_predic)    
    a1_CM = confusion_matrix(a1_gt, a1_predic) 
    a0_dp = (a0_CM[1][1]+a0_CM[0][1])/(a0_CM[0][0]+a0_CM[0][1]+a0_CM[1][0]+a0_CM[1][1])
    a1_dp = (a1_CM[1][1]+a1_CM[0][1])/(a1_CM[0][0]+a1_CM[0][1]+a1_CM[1][0]+a1_CM[1][1])
    a0_TPR = a0_CM[1][1]/(a0_CM[1][1]+a0_CM[1][0])
    a1_TPR = a1_CM[1][1]/(a1_CM[1][1]+a1_CM[1][0])
    a0_FPR = a0_CM[0][1]/(a0_CM[0][1]+a0_CM[0][0])
    a1_FPR = a1_CM[0][1]/(a1_CM[0][1]+a1_CM[0][0])
    EOd = 0.5*(abs(a0_FPR-a1_FPR)+ abs(a0_TPR-a1_TPR))
    test_acc = accuracy_score(test_gt, test_pred)
    if p == True:
        print('valid loss:',t_loss)
        print('DP', abs(a0_dp - a1_dp))
        print('EOP', abs(a0_TPR - a1_TPR))
        print('EoD', EOd)
        print('acc', test_acc)
    return EOd


def train_model(normalize, epochs=50, checkpoint_path='best_celebA.pth'):
    """
    Train a 10-layer ViT followed by the custom attention block on
    `training_data_loader`, combining a BCE loss with a supervised-contrastive loss.

    Parameters:
    - normalize: forwarded to VisionTransformer (toggles q/k normalisation).
    - epochs (int): number of passes over the training loader.
    - checkpoint_path (str): where the whole model is pickled each time the
      equalized-odds gap returned by test_epoch improves.

    Returns:
    - (train_loss, train_acc, valid_loss, valid_acc): per-epoch lists. The two
      validation lists are returned for interface compatibility but are never
      filled, so they are always empty.
    """
    configuration = ViTConfig(num_hidden_layers = 10, num_attention_heads = 8,
                          intermediate_size = 768, image_size= 64, patch_size = 16)
    vit = ViTModel(configuration)
    model = VisionTransformer(vit, normalize).to(device)

    criterion = nn.BCELoss()
    fair_criterion = SupConLoss()

    fair_optimizer = optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-4, weight_decay = 0.1)
    train_loss = []
    train_acc = []
    valid_loss = []
    valid_acc = []
    fair_metric = 1   # best (lowest) equalized-odds gap seen so far

    for epoches in range(epochs):
        model.train()
        running_loss = 0.0
        all_preds = []
        all_labels = []

        with tqdm(training_data_loader, unit="batch") as tepoch:
            # The old "%2f" format rendered the integer epoch as "0.000000".
            tepoch.set_description(f"epoch {epoches + 1}/{epochs}")

            for train_input, attributes in tepoch:
                # Transfer data to GPU if possible.
                train_input = train_input.to(device)
                # Column 9 of the attribute matrix is the prediction target.
                train_target = attributes[:, 9].float().to(device)
                fair_optimizer.zero_grad()

                outputs, value, _ = model(train_input, True)
                # One embedding per sample is passed to SupConLoss as [Batch, 1, dim].
                fair_loss = fair_criterion(value.unsqueeze(1), train_target)
                ut_loss = criterion(outputs, train_target.unsqueeze(1))
                loss = ut_loss + fair_loss
                tepoch.set_postfix(ul = ut_loss.item(),fl = fair_loss.item())
                loss.backward()
                running_loss += loss.item()
                fair_optimizer.step()

                all_preds.extend(torch.round(outputs.squeeze(1)).detach().cpu().numpy())
                all_labels.extend(train_target.cpu().numpy())

        train_accuracy = accuracy_score(all_labels, all_preds)
        run_loss = running_loss/len(training_data_loader)
        print(f'Epoch {epoches+1}/{epochs}, Loss: {run_loss:.4f}, Accuracy: {train_accuracy * 100:.2f}%')
        train_loss.append(run_loss)
        train_acc.append(train_accuracy)

        # NOTE(review): test_epoch evaluates on the test split by default, so the
        # "best" checkpoint is selected on test data while valid_data_loader goes
        # unused - consider selecting on the validation split instead.
        eod = test_epoch(model)
        if fair_metric > eod:
            fair_metric = eod
            # The whole module is pickled (not just its state_dict); loading it
            # requires the model classes to be defined first.
            torch.save(model, checkpoint_path)
            print('saved')
    return train_loss, train_acc, valid_loss, valid_acc

In [6]:
import subprocess  
import threading  
import time
import pickle

def compute_average_hourly_utilization(minute_utilizations):
    """
    Average per-minute utilization samples into per-hour values.

    The list is zero-padded up to a multiple of 60, so a trailing partial hour is
    still divided by 60 (missing minutes count as 0 utilization).
    """
    samples = minute_utilizations + [0] * (-len(minute_utilizations) % 60)

    hourly_averages = []
    for hour_start in range(0, len(samples), 60):
        hour_total = sum(samples[hour_start:hour_start + 60])
        hourly_averages.append(hour_total / 60.0)

    return hourly_averages

def get_gpu_utilization(device_id):
    """
    Query nvidia-smi and return the utilization (integer percent) reported on
    output line `device_id`; print the error and return None if anything fails.
    """
    query = ["nvidia-smi", "--query-gpu=utilization.gpu", "--format=csv,noheader,nounits"]
    try:
        raw_output = subprocess.check_output(query)
        # One line per GPU, in nvidia-smi order.
        per_gpu = raw_output.decode("utf-8").strip().split('\n')
        reading = int(per_gpu[device_id])  # please change the device_id accordingly
    except Exception as e:
        print(f"Error querying GPU utilization: {e}")
        return None
    return reading


def track_gpu_utilization_periodically(device_id):
    """
    Record one GPU utilization sample and re-schedule itself to run again in 60 s.

    Reads the module-level `stop_tracking` flag and appends to the module-level
    `utilization_list`; the chain ends the first time it runs with
    `stop_tracking` set. Failed queries (None) are skipped.
    """
    if stop_tracking:
        return

    utilization = get_gpu_utilization(device_id)
    if utilization is not None:
        utilization_list.append(utilization)

    timer = threading.Timer(60, track_gpu_utilization_periodically, args=[device_id])
    # Daemon thread: a pending sample must never keep the interpreter/kernel
    # alive if the flag is not set before shutdown.
    timer.daemon = True
    timer.start()


utilization_list = []
stop_tracking = False

# Start the periodic GPU tracking
track_gpu_utilization_periodically(device_id)
start_time = time.time()

normalize = True
seed_everything(0)
try:
    tl,ta,vl,va = train_model(normalize)
finally:
    # Always end the sampler chain, even when training raises or is interrupted;
    # otherwise it keeps re-scheduling itself every minute.
    stop_tracking = True
end_time = time.time()
total_time = end_time - start_time

# Print GPU utilizations
print("***********************************************")
print("energy results")
print("\nGPU Utilizations recorded every minute:", utilization_list)

# Keep the raw per-minute samples so the figures can be recomputed later.
with open('utilization_data.pkl', 'wb') as f:
    pickle.dump(utilization_list, f)

average_hourly_utilizations = compute_average_hourly_utilization(utilization_list)
print("Average hourly GPU Utilizations:", average_hourly_utilizations)
print(f"\nTotal training time: {total_time:.2f} seconds.")


## test
print('testing')

epoch 0.000000 : 100%|██████████████████████████████████████████| 540/540 [02:04<00:00,  4.34batch/s, fl=5.45, ul=0.182]


Epoch 1/50, Loss: 5.6699, Accuracy: 91.35%
saved


epoch 1.000000 : 100%|██████████████████████████████████████████| 540/540 [02:04<00:00,  4.32batch/s, fl=5.35, ul=0.162]


Epoch 2/50, Loss: 5.5423, Accuracy: 93.83%


epoch 2.000000 : 100%|██████████████████████████████████████████| 540/540 [02:04<00:00,  4.33batch/s, fl=5.43, ul=0.151]


Epoch 3/50, Loss: 5.5168, Accuracy: 94.41%


epoch 3.000000 : 100%|█████████████████████████████████████████| 540/540 [02:03<00:00,  4.37batch/s, fl=5.31, ul=0.0893]


Epoch 4/50, Loss: 5.4986, Accuracy: 94.71%


epoch 4.000000 : 100%|██████████████████████████████████████████| 540/540 [02:04<00:00,  4.33batch/s, fl=5.32, ul=0.131]


Epoch 5/50, Loss: 5.4873, Accuracy: 94.84%


epoch 5.000000 : 100%|██████████████████████████████████████████| 540/540 [02:04<00:00,  4.34batch/s, fl=5.34, ul=0.121]


Epoch 6/50, Loss: 5.4728, Accuracy: 95.14%


epoch 6.000000 : 100%|██████████████████████████████████████████| 540/540 [02:04<00:00,  4.33batch/s, fl=5.38, ul=0.124]


Epoch 7/50, Loss: 5.4630, Accuracy: 95.34%


epoch 7.000000 : 100%|██████████████████████████████████████████| 540/540 [02:04<00:00,  4.33batch/s, fl=5.41, ul=0.121]


Epoch 8/50, Loss: 5.4504, Accuracy: 95.51%


epoch 8.000000 : 100%|███████████████████████████████████████████| 540/540 [02:04<00:00,  4.35batch/s, fl=5.3, ul=0.103]


Epoch 9/50, Loss: 5.4422, Accuracy: 95.70%


epoch 9.000000 : 100%|██████████████████████████████████████████| 540/540 [02:04<00:00,  4.34batch/s, fl=5.28, ul=0.135]


Epoch 10/50, Loss: 5.4272, Accuracy: 95.93%
saved


epoch 10.000000 : 100%|█████████████████████████████████████████| 540/540 [02:04<00:00,  4.34batch/s, fl=5.34, ul=0.136]


Epoch 11/50, Loss: 5.4163, Accuracy: 96.10%


epoch 11.000000 : 100%|████████████████████████████████████████| 540/540 [02:04<00:00,  4.34batch/s, fl=5.31, ul=0.0882]


Epoch 12/50, Loss: 5.4031, Accuracy: 96.35%
saved


epoch 12.000000 : 100%|████████████████████████████████████████| 540/540 [02:04<00:00,  4.33batch/s, fl=5.32, ul=0.0805]


Epoch 13/50, Loss: 5.3864, Accuracy: 96.63%


epoch 13.000000 : 100%|██████████████████████████████████████████| 540/540 [02:04<00:00,  4.34batch/s, fl=5.35, ul=0.12]


Epoch 14/50, Loss: 5.3710, Accuracy: 96.89%


epoch 14.000000 : 100%|████████████████████████████████████████| 540/540 [02:04<00:00,  4.34batch/s, fl=5.25, ul=0.0912]


Epoch 15/50, Loss: 5.3564, Accuracy: 97.13%


epoch 15.000000 : 100%|████████████████████████████████████████| 540/540 [02:04<00:00,  4.33batch/s, fl=5.26, ul=0.0559]


Epoch 16/50, Loss: 5.3386, Accuracy: 97.36%


epoch 16.000000 : 100%|████████████████████████████████████████| 540/540 [02:04<00:00,  4.32batch/s, fl=5.25, ul=0.0886]


Epoch 17/50, Loss: 5.3225, Accuracy: 97.60%


epoch 17.000000 : 100%|████████████████████████████████████████| 540/540 [02:04<00:00,  4.34batch/s, fl=5.19, ul=0.0522]


Epoch 18/50, Loss: 5.3096, Accuracy: 97.78%


epoch 18.000000 : 100%|████████████████████████████████████████| 540/540 [02:04<00:00,  4.34batch/s, fl=5.26, ul=0.0512]


Epoch 19/50, Loss: 5.2929, Accuracy: 97.98%


epoch 19.000000 : 100%|████████████████████████████████████████| 540/540 [02:04<00:00,  4.34batch/s, fl=5.25, ul=0.0748]


Epoch 20/50, Loss: 5.2858, Accuracy: 98.11%


epoch 20.000000 : 100%|████████████████████████████████████████| 540/540 [02:04<00:00,  4.33batch/s, fl=5.16, ul=0.0582]


Epoch 21/50, Loss: 5.2684, Accuracy: 98.35%


epoch 21.000000 : 100%|████████████████████████████████████████| 540/540 [02:05<00:00,  4.31batch/s, fl=5.21, ul=0.0863]


Epoch 22/50, Loss: 5.2591, Accuracy: 98.47%


epoch 22.000000 : 100%|████████████████████████████████████████| 540/540 [02:04<00:00,  4.33batch/s, fl=5.22, ul=0.0729]


Epoch 23/50, Loss: 5.2506, Accuracy: 98.57%


epoch 23.000000 : 100%|████████████████████████████████████████| 540/540 [02:04<00:00,  4.34batch/s, fl=5.15, ul=0.0123]


Epoch 24/50, Loss: 5.2391, Accuracy: 98.73%


epoch 24.000000 : 100%|█████████████████████████████████████████| 540/540 [02:04<00:00,  4.34batch/s, fl=5.2, ul=0.0692]


Epoch 25/50, Loss: 5.2321, Accuracy: 98.82%


epoch 25.000000 : 100%|████████████████████████████████████████| 540/540 [02:04<00:00,  4.34batch/s, fl=5.16, ul=0.0373]


Epoch 26/50, Loss: 5.2294, Accuracy: 98.83%
saved


epoch 26.000000 : 100%|████████████████████████████████████████| 540/540 [02:04<00:00,  4.32batch/s, fl=5.19, ul=0.0499]


Epoch 27/50, Loss: 5.2213, Accuracy: 98.90%


epoch 27.000000 : 100%|████████████████████████████████████████| 540/540 [02:04<00:00,  4.35batch/s, fl=5.15, ul=0.0327]


Epoch 28/50, Loss: 5.2124, Accuracy: 99.05%


epoch 28.000000 : 100%|████████████████████████████████████████| 540/540 [02:04<00:00,  4.34batch/s, fl=5.16, ul=0.0221]


Epoch 29/50, Loss: 5.2160, Accuracy: 99.01%


epoch 29.000000 : 100%|████████████████████████████████████████| 540/540 [02:04<00:00,  4.34batch/s, fl=5.15, ul=0.0212]


Epoch 30/50, Loss: 5.2098, Accuracy: 99.08%


epoch 30.000000 : 100%|████████████████████████████████████████| 540/540 [02:04<00:00,  4.34batch/s, fl=5.28, ul=0.0539]


Epoch 31/50, Loss: 5.2035, Accuracy: 99.16%


epoch 31.000000 : 100%|████████████████████████████████████████| 540/540 [02:04<00:00,  4.34batch/s, fl=5.11, ul=0.0201]


Epoch 32/50, Loss: 5.2029, Accuracy: 99.14%


epoch 32.000000 : 100%|█████████████████████████████████████████| 540/540 [02:04<00:00,  4.34batch/s, fl=5.2, ul=0.0286]


Epoch 33/50, Loss: 5.1975, Accuracy: 99.22%


epoch 33.000000 : 100%|████████████████████████████████████████| 540/540 [02:04<00:00,  4.32batch/s, fl=5.16, ul=0.0217]


Epoch 34/50, Loss: 5.1954, Accuracy: 99.22%


epoch 34.000000 : 100%|███████████████████████████████████████| 540/540 [02:04<00:00,  4.34batch/s, fl=5.15, ul=0.00744]


Epoch 35/50, Loss: 5.1922, Accuracy: 99.26%


epoch 35.000000 : 100%|████████████████████████████████████████| 540/540 [02:04<00:00,  4.33batch/s, fl=5.12, ul=0.0177]


Epoch 36/50, Loss: 5.1904, Accuracy: 99.28%


epoch 36.000000 : 100%|████████████████████████████████████████| 540/540 [02:04<00:00,  4.33batch/s, fl=5.19, ul=0.0319]


Epoch 37/50, Loss: 5.1893, Accuracy: 99.31%


epoch 37.000000 : 100%|████████████████████████████████████████| 540/540 [02:05<00:00,  4.32batch/s, fl=5.25, ul=0.0416]


Epoch 38/50, Loss: 5.1859, Accuracy: 99.34%


epoch 38.000000 : 100%|████████████████████████████████████████| 540/540 [02:04<00:00,  4.33batch/s, fl=5.18, ul=0.0151]


Epoch 39/50, Loss: 5.1833, Accuracy: 99.36%


epoch 39.000000 : 100%|█████████████████████████████████████████| 540/540 [02:04<00:00,  4.35batch/s, fl=5.2, ul=0.0333]


Epoch 40/50, Loss: 5.1833, Accuracy: 99.35%


epoch 40.000000 : 100%|████████████████████████████████████████| 540/540 [02:04<00:00,  4.35batch/s, fl=5.22, ul=0.0314]


Epoch 41/50, Loss: 5.1816, Accuracy: 99.40%


epoch 41.000000 : 100%|█████████████████████████████████████████| 540/540 [02:04<00:00,  4.34batch/s, fl=5.2, ul=0.0267]


Epoch 42/50, Loss: 5.1813, Accuracy: 99.40%


epoch 42.000000 : 100%|████████████████████████████████████████| 540/540 [02:03<00:00,  4.38batch/s, fl=5.19, ul=0.0253]


Epoch 43/50, Loss: 5.1815, Accuracy: 99.41%


epoch 43.000000 : 100%|████████████████████████████████████████| 540/540 [02:03<00:00,  4.36batch/s, fl=5.19, ul=0.0273]


Epoch 44/50, Loss: 5.1764, Accuracy: 99.45%


epoch 44.000000 : 100%|████████████████████████████████████████| 540/540 [02:04<00:00,  4.34batch/s, fl=5.17, ul=0.0102]


Epoch 45/50, Loss: 5.1788, Accuracy: 99.42%


epoch 45.000000 : 100%|██████████████████████████████████████| 540/540 [02:04<00:00,  4.33batch/s, fl=5.17, ul=0.000937]


Epoch 46/50, Loss: 5.1790, Accuracy: 99.42%


epoch 46.000000 : 100%|████████████████████████████████████████| 540/540 [02:04<00:00,  4.35batch/s, fl=5.17, ul=0.0214]


Epoch 47/50, Loss: 5.1726, Accuracy: 99.50%


epoch 47.000000 : 100%|███████████████████████████████████████| 540/540 [02:04<00:00,  4.33batch/s, fl=5.13, ul=0.00797]


Epoch 48/50, Loss: 5.1741, Accuracy: 99.47%


epoch 48.000000 : 100%|███████████████████████████████████████| 540/540 [02:04<00:00,  4.34batch/s, fl=5.13, ul=0.00846]


Epoch 49/50, Loss: 5.1721, Accuracy: 99.48%


epoch 49.000000 : 100%|████████████████████████████████████████| 540/540 [02:04<00:00,  4.33batch/s, fl=5.13, ul=0.0028]


Epoch 50/50, Loss: 5.1713, Accuracy: 99.50%
***********************************************
energy results

GPU Utilizations recorded every minute: [0, 27, 40, 38, 37, 21, 31, 39, 42, 11, 14, 16, 39, 5, 27, 15, 11, 39, 20, 26, 27, 25, 28, 19, 15, 11, 44, 41, 39, 40, 27, 1, 11, 40, 25, 36, 26, 16, 39, 21, 2, 24, 32, 34, 30, 6, 40, 29, 20, 39, 32, 38, 37, 37, 0, 26, 40, 15, 28, 10, 40, 19, 0, 40, 11, 25, 27, 24, 39, 40, 19, 22, 39, 11, 41, 1, 39, 39, 40, 24, 11, 33, 25, 38, 40, 32, 14, 27, 24, 11, 39, 39, 18, 38, 0, 35, 11, 39, 39, 40, 40, 23, 40, 28, 16, 11, 37, 20, 40, 5, 12, 8, 19, 39, 19]
Average hourly GPU Utilizations: [25.8, 24.166666666666668]

Total training time: 6875.14 seconds.
testing


In [9]:
print('Testing result')
# The checkpoint was written with torch.save(model, ...), i.e. the whole module is
# pickled, so no model has to be constructed first; only the class definitions
# from the cells above must be in scope. Recent PyTorch releases default
# torch.load to weights_only=True, which rejects pickled modules, hence the
# explicit flag.
# NOTE: this unpickles arbitrary objects - only load checkpoints you created yourself.
model = torch.load('best_celebA.pth', map_location=device, weights_only=False)
model = model.to(device)
model.eval()
_ = test_epoch(model, True)

Testing result
valid loss: 0.20273513585711136
DP 0.18146682767179192
EOP 0.3697132616487456
EoD 0.20552938088417697
acc 0.9389339745516482
