In [1]:
import tqdm
import pandas as pd
import os
import sys

In [2]:
# Put the parent of the current working directory on the import path
# (presumably so the `predict` package imported below resolves — confirm
# against the repository layout).
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
# Guarded so that re-running this cell does not pile up duplicate entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

from predict.prediction_models import (
    DensePredictor,
    LayerNormPredictor,
    AttentionPredictor,
    EmbeddingPredictor
)

In [3]:
# Predictors configured with with_features=True; later they are fed the frames
# built with include_additional_features=True.
# NOTE(review): LayerNormPredictor is constructed without a `with_features`
# argument, unlike the other three — confirm whether it accepts that flag.
dense_predictor = DensePredictor(with_features=True)
layer_norm_predictor = LayerNormPredictor()
attention_predictor = AttentionPredictor(with_features=True)
embedding_predictor = EmbeddingPredictor(with_features=True)

In [4]:
# Predictors configured with with_features=False; later they are fed the frames
# built with include_additional_features=False.
# NOTE(review): the layer-norm predictor is constructed exactly like
# `layer_norm_predictor` (no arguments) — confirm that a "without features"
# variant is not needed here.
# NOTE(review): "feaures" in three of these names is a typo for "features"; the
# misspelled names are referenced by a later cell, so rename them together.
dense_wo_features_predictor = DensePredictor(with_features=False)
layer_norm_wo_feaures_predictor = LayerNormPredictor()
attention_wo_feaures_predictor = AttentionPredictor(with_features=False)
embedding_wo_feaures_predictor = EmbeddingPredictor(with_features=False)

In [5]:
# GPU to predict for; matched against the 'GPU' column of the GPU spec table.
gpu_name = 'V100'

In [6]:
# BERT-base shape hyperparameters, shared by the layer specs below and by the
# benchmark model defined later in the notebook.
batch_size, seq_len = 64, 128
vocab_size = 30522
embed_dim, num_heads, ffn_dim = 768, 12, 3072
num_layers = 12

# Token table and positional table. The positional table has one row per
# position, so its 'vocab_size' entry is the sequence length.
embedding_layers = {
    table_name: {
        'batchsize': batch_size,
        'seq_len': seq_len,
        'vocab_size': table_rows,
        'embed_dim': embed_dim,
        'optimizer': 'Adam',
    }
    for table_name, table_rows in (('embedding', vocab_size), ('positional_embedding', seq_len))
}

# One self-attention spec per transformer block.
attention_layers = {
    f'attention_{block}': {
        'batchsize': batch_size,
        'seq_len': seq_len,
        'num_heads': num_heads,
        'embed_dim': embed_dim,
        'optimizer': 'Adam',
    }
    for block in range(num_layers)
}

# Two LayerNorm specs per block (suffix _1 and _2).
layernorm_layers = {
    f'layernorm_{block}_{slot}': {
        'batchsize': batch_size,
        'seq_len': seq_len,
        'embed_dim': embed_dim,
        'optimizer': 'Adam',
    }
    for block in range(num_layers)
    for slot in (1, 2)
}

# Feed-forward pair per block: expand to ffn_dim with ReLU, then project back
# without an activation.
# NOTE(review): these specs use `batch_size` as 'batchsize', while the benchmark
# model applies its linear layers to every token of a (batch, seq, dim) tensor —
# confirm which row count the dense predictor expects.
ffn_shapes = (
    (1, embed_dim, ffn_dim, 'ReLU'),
    (2, ffn_dim, embed_dim, 'None'),
)
dense_layers = {
    f'dense_{block}_{slot}': {
        'batchsize': batch_size,
        'dim_input': width_in,
        'dim_output': width_out,
        'activation_fct': activation,
        'optimizer': 'Adam',
    }
    for block in range(num_layers)
    for slot, width_in, width_out, activation in ffn_shapes
}
# Output projection from the hidden size to vocabulary logits.
dense_layers['dense_final'] = {
    'batchsize': batch_size,
    'dim_input': embed_dim,
    'dim_output': vocab_size,
    'activation_fct': 'None',
    'optimizer': 'Adam',
}

In [7]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import time

# Number of synthetic samples the dataset reports per epoch
dataset_size = 1024

# Prefer CUDA when it is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Define a simulated dataset for BERT-style training
class RandomTextDataset(Dataset):
    """Synthetic token dataset that fabricates a random sample on every access.

    Args:
        vocab_size: Exclusive upper bound for the generated token ids.
        sequence_length: Number of token ids per sample.
        dataset_size: Value reported by ``len()``.
    """

    def __init__(self, vocab_size, sequence_length, dataset_size):
        self.vocab_size, self.sequence_length, self.dataset_size = (
            vocab_size,
            sequence_length,
            dataset_size,
        )

    def __len__(self):
        return self.dataset_size

    def _random_tokens(self):
        """Draw one 1-D tensor of ``sequence_length`` ids in ``[0, vocab_size)``."""
        return torch.randint(0, self.vocab_size, (self.sequence_length,))

    def __getitem__(self, idx):
        # `idx` is ignored: inputs first, then labels, each drawn independently.
        input_ids = self._random_tokens()
        labels = self._random_tokens()
        return input_ids, labels

# Initialize the dataset and DataLoader
# The loader yields batches of `batch_size` samples. shuffle=True only permutes
# the indices; RandomTextDataset ignores the index and draws fresh random
# tokens on every access.

train_dataset = RandomTextDataset(vocab_size, seq_len, dataset_size)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Define BERT-base model with layer-by-layer transformer blocks
class BERTBaseLayerByLayer(nn.Module):
    """Transformer encoder stack assembled from individually registered layers.

    Each of the ``num_layers`` blocks applies multi-head self-attention, a
    residual connection followed by LayerNorm, a two-layer ReLU feed-forward
    network, and a second residual connection followed by LayerNorm. A final
    linear layer projects every position to ``vocab_size`` logits.

    Args:
        vocab_size: Number of rows in the token embedding and width of the output logits.
        embed_dim: Hidden size used throughout the stack.
        num_heads: Attention heads per block.
        ffn_dim: Inner width of each feed-forward network.
        num_layers: Number of transformer blocks.
        sequence_length: Number of rows in the position embedding, i.e. the
            longest input sequence the model can embed.
    """

    def __init__(self, vocab_size, embed_dim, num_heads, ffn_dim, num_layers, sequence_length):
        super().__init__()

        # Embedding layers
        self.token_embedding = nn.Embedding(vocab_size, embed_dim)
        self.position_embedding = nn.Embedding(sequence_length, embed_dim)

        # Transformer layers, kept in parallel lists indexed by block
        self.attention_layers = nn.ModuleList()
        self.ffn_layers = nn.ModuleList()
        self.norm_layers_1 = nn.ModuleList()
        self.norm_layers_2 = nn.ModuleList()

        for _ in range(num_layers):
            self.attention_layers.append(nn.MultiheadAttention(embed_dim=embed_dim, num_heads=num_heads, batch_first=True))
            self.ffn_layers.append(nn.Sequential(
                nn.Linear(embed_dim, ffn_dim),
                nn.ReLU(),
                nn.Linear(ffn_dim, embed_dim)
            ))
            self.norm_layers_1.append(nn.LayerNorm(embed_dim))
            self.norm_layers_2.append(nn.LayerNorm(embed_dim))

        # Final output layer
        self.output_layer = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        """Map token ids of shape (batch, tokens) to logits of shape (batch, tokens, vocab_size)."""
        # Take the sequence length from the input and the depth from the module
        # itself, so the model does not depend on notebook-level globals.
        batch, tokens = x.size(0), x.size(1)
        position_ids = torch.arange(tokens, device=x.device).unsqueeze(0).expand(batch, -1)
        x = self.token_embedding(x) + self.position_embedding(position_ids)

        blocks = zip(self.attention_layers, self.norm_layers_1, self.ffn_layers, self.norm_layers_2)
        for attention, norm_1, ffn, norm_2 in blocks:
            # Self-attention: query, key and value are all the current hidden state.
            attn_output, _ = attention(x, x, x)
            x = norm_1(x + attn_output)

            ffn_output = ffn(x)
            x = norm_2(x + ffn_output)

        return self.output_layer(x)

# Initialize the model and move it to the selected device (GPU when available)
model = BERTBaseLayerByLayer(vocab_size, embed_dim, num_heads, ffn_dim, num_layers, seq_len).to(device)

epochs = 100

# Define a loss function and optimizer
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

# Training loop, timed end to end.
# CUDA kernels are launched asynchronously, so drain the queue before reading
# the clock on both sides; perf_counter is monotonic, unlike time.time().
# The labels are drawn independently of the inputs, so the loss is only
# expected to settle near ln(vocab_size); this run measures speed, not quality.
if device.type == "cuda":
    torch.cuda.synchronize()
start_time = time.perf_counter()
for epoch in tqdm.tqdm(range(epochs)):

    epoch_loss = 0.0

    for input_data, labels in train_loader:
        input_data, labels = input_data.to(device), labels.to(device)

        # Forward pass
        optimizer.zero_grad()
        outputs = model(input_data)

        # Flatten (batch, tokens, vocab) logits and (batch, tokens) labels so
        # the loss is computed per token
        outputs = outputs.view(-1, vocab_size)
        labels = labels.view(-1)

        # Compute loss
        loss = loss_fn(outputs, labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    print(f"Epoch [{epoch+1}/{epochs}] - Loss: {epoch_loss / len(train_loader):.4f}")
if device.type == "cuda":
    torch.cuda.synchronize()
end_time = time.perf_counter()
print("Training complete.")


  from .autonotebook import tqdm as notebook_tqdm
  1%|          | 1/100 [00:08<13:55,  8.44s/it]

Epoch [1/100] - Loss: 10.4700


  2%|▏         | 2/100 [00:16<13:31,  8.29s/it]

Epoch [2/100] - Loss: 10.4197


  3%|▎         | 3/100 [00:24<13:19,  8.24s/it]

Epoch [3/100] - Loss: 10.3852


  4%|▍         | 4/100 [00:33<13:09,  8.23s/it]

Epoch [4/100] - Loss: 10.3689


  5%|▌         | 5/100 [00:41<13:01,  8.22s/it]

Epoch [5/100] - Loss: 10.3635


  6%|▌         | 6/100 [00:49<12:54,  8.24s/it]

Epoch [6/100] - Loss: 10.3597


  7%|▋         | 7/100 [00:57<12:46,  8.24s/it]

Epoch [7/100] - Loss: 10.3577


  8%|▊         | 8/100 [01:06<12:38,  8.25s/it]

Epoch [8/100] - Loss: 10.3571


  9%|▉         | 9/100 [01:14<12:30,  8.24s/it]

Epoch [9/100] - Loss: 10.3552


 10%|█         | 10/100 [01:22<12:22,  8.25s/it]

Epoch [10/100] - Loss: 10.3552


 11%|█         | 11/100 [01:30<12:14,  8.25s/it]

Epoch [11/100] - Loss: 10.3540


 12%|█▏        | 12/100 [01:38<12:05,  8.24s/it]

Epoch [12/100] - Loss: 10.3531


 13%|█▎        | 13/100 [01:47<11:56,  8.24s/it]

Epoch [13/100] - Loss: 10.3532


 14%|█▍        | 14/100 [01:55<11:48,  8.24s/it]

Epoch [14/100] - Loss: 10.3536


 15%|█▌        | 15/100 [02:03<11:40,  8.24s/it]

Epoch [15/100] - Loss: 10.3521


 16%|█▌        | 16/100 [02:11<11:32,  8.24s/it]

Epoch [16/100] - Loss: 10.3514


 17%|█▋        | 17/100 [02:20<11:23,  8.24s/it]

Epoch [17/100] - Loss: 10.3521


 18%|█▊        | 18/100 [02:28<11:15,  8.24s/it]

Epoch [18/100] - Loss: 10.3499


 19%|█▉        | 19/100 [02:36<11:07,  8.24s/it]

Epoch [19/100] - Loss: 10.3504


 20%|██        | 20/100 [02:44<10:59,  8.24s/it]

Epoch [20/100] - Loss: 10.3498


 21%|██        | 21/100 [02:53<10:50,  8.23s/it]

Epoch [21/100] - Loss: 10.3489


 22%|██▏       | 22/100 [03:01<10:41,  8.22s/it]

Epoch [22/100] - Loss: 10.3487


 23%|██▎       | 23/100 [03:09<10:33,  8.23s/it]

Epoch [23/100] - Loss: 10.3488


 24%|██▍       | 24/100 [03:17<10:25,  8.23s/it]

Epoch [24/100] - Loss: 10.3481


 25%|██▌       | 25/100 [03:26<10:17,  8.24s/it]

Epoch [25/100] - Loss: 10.3478


 26%|██▌       | 26/100 [03:34<10:08,  8.23s/it]

Epoch [26/100] - Loss: 10.3465


 27%|██▋       | 27/100 [03:42<10:00,  8.22s/it]

Epoch [27/100] - Loss: 10.3478


 28%|██▊       | 28/100 [03:50<09:51,  8.22s/it]

Epoch [28/100] - Loss: 10.3474


 29%|██▉       | 29/100 [03:58<09:44,  8.23s/it]

Epoch [29/100] - Loss: 10.3464


 30%|███       | 30/100 [04:07<09:35,  8.22s/it]

Epoch [30/100] - Loss: 10.3462


 31%|███       | 31/100 [04:15<09:27,  8.22s/it]

Epoch [31/100] - Loss: 10.3462


 32%|███▏      | 32/100 [04:23<09:19,  8.22s/it]

Epoch [32/100] - Loss: 10.3451


 33%|███▎      | 33/100 [04:31<09:11,  8.23s/it]

Epoch [33/100] - Loss: 10.3452


 34%|███▍      | 34/100 [04:39<09:02,  8.22s/it]

Epoch [34/100] - Loss: 10.3450


 35%|███▌      | 35/100 [04:48<08:53,  8.21s/it]

Epoch [35/100] - Loss: 10.3460


 36%|███▌      | 36/100 [04:56<08:45,  8.21s/it]

Epoch [36/100] - Loss: 10.3448


 37%|███▋      | 37/100 [05:04<08:37,  8.21s/it]

Epoch [37/100] - Loss: 10.3450


 38%|███▊      | 38/100 [05:12<08:29,  8.21s/it]

Epoch [38/100] - Loss: 10.3440


 39%|███▉      | 39/100 [05:21<08:21,  8.22s/it]

Epoch [39/100] - Loss: 10.3442


 40%|████      | 40/100 [05:29<08:12,  8.21s/it]

Epoch [40/100] - Loss: 10.3438


 41%|████      | 41/100 [05:37<08:04,  8.22s/it]

Epoch [41/100] - Loss: 10.3438


 42%|████▏     | 42/100 [05:45<07:56,  8.21s/it]

Epoch [42/100] - Loss: 10.3432


 43%|████▎     | 43/100 [05:53<07:48,  8.21s/it]

Epoch [43/100] - Loss: 10.3429


 44%|████▍     | 44/100 [06:02<07:40,  8.21s/it]

Epoch [44/100] - Loss: 10.3439


 45%|████▌     | 45/100 [06:10<07:31,  8.21s/it]

Epoch [45/100] - Loss: 10.3434


 46%|████▌     | 46/100 [06:18<07:23,  8.22s/it]

Epoch [46/100] - Loss: 10.3428


 47%|████▋     | 47/100 [06:26<07:15,  8.21s/it]

Epoch [47/100] - Loss: 10.3427


 48%|████▊     | 48/100 [06:34<07:07,  8.22s/it]

Epoch [48/100] - Loss: 10.3431


 49%|████▉     | 49/100 [06:43<06:58,  8.21s/it]

Epoch [49/100] - Loss: 10.3417


 50%|█████     | 50/100 [06:51<06:50,  8.21s/it]

Epoch [50/100] - Loss: 10.3419


 51%|█████     | 51/100 [06:59<06:42,  8.21s/it]

Epoch [51/100] - Loss: 10.3417


 52%|█████▏    | 52/100 [07:07<06:33,  8.21s/it]

Epoch [52/100] - Loss: 10.3420


 53%|█████▎    | 53/100 [07:16<06:25,  8.21s/it]

Epoch [53/100] - Loss: 10.3419


 54%|█████▍    | 54/100 [07:24<06:17,  8.21s/it]

Epoch [54/100] - Loss: 10.3418


 55%|█████▌    | 55/100 [07:32<06:09,  8.21s/it]

Epoch [55/100] - Loss: 10.3405


 56%|█████▌    | 56/100 [07:40<06:01,  8.21s/it]

Epoch [56/100] - Loss: 10.3417


 57%|█████▋    | 57/100 [07:48<05:52,  8.20s/it]

Epoch [57/100] - Loss: 10.3407


 58%|█████▊    | 58/100 [07:57<05:44,  8.20s/it]

Epoch [58/100] - Loss: 10.3410


 59%|█████▉    | 59/100 [08:05<05:36,  8.21s/it]

Epoch [59/100] - Loss: 10.3412


 60%|██████    | 60/100 [08:13<05:28,  8.21s/it]

Epoch [60/100] - Loss: 10.3406


 61%|██████    | 61/100 [08:21<05:20,  8.21s/it]

Epoch [61/100] - Loss: 10.3405


 62%|██████▏   | 62/100 [08:29<05:11,  8.21s/it]

Epoch [62/100] - Loss: 10.3405


 63%|██████▎   | 63/100 [08:38<05:03,  8.21s/it]

Epoch [63/100] - Loss: 10.3412


 64%|██████▍   | 64/100 [08:46<04:55,  8.20s/it]

Epoch [64/100] - Loss: 10.3401


 65%|██████▌   | 65/100 [08:54<04:47,  8.21s/it]

Epoch [65/100] - Loss: 10.3396


 66%|██████▌   | 66/100 [09:02<04:39,  8.21s/it]

Epoch [66/100] - Loss: 10.3397


 67%|██████▋   | 67/100 [09:10<04:30,  8.21s/it]

Epoch [67/100] - Loss: 10.3401


 68%|██████▊   | 68/100 [09:19<04:22,  8.20s/it]

Epoch [68/100] - Loss: 10.3402


 69%|██████▉   | 69/100 [09:27<04:14,  8.21s/it]

Epoch [69/100] - Loss: 10.3394


 70%|███████   | 70/100 [09:35<04:06,  8.20s/it]

Epoch [70/100] - Loss: 10.3391


 71%|███████   | 71/100 [09:43<03:57,  8.20s/it]

Epoch [71/100] - Loss: 10.3397


 72%|███████▏  | 72/100 [09:51<03:49,  8.20s/it]

Epoch [72/100] - Loss: 10.3390


 73%|███████▎  | 73/100 [10:00<03:41,  8.20s/it]

Epoch [73/100] - Loss: 10.3400


 74%|███████▍  | 74/100 [10:08<03:33,  8.21s/it]

Epoch [74/100] - Loss: 10.3391


 75%|███████▌  | 75/100 [10:16<03:24,  8.19s/it]

Epoch [75/100] - Loss: 10.3390


 76%|███████▌  | 76/100 [10:24<03:16,  8.19s/it]

Epoch [76/100] - Loss: 10.3391


 77%|███████▋  | 77/100 [10:32<03:08,  8.20s/it]

Epoch [77/100] - Loss: 10.3384


 78%|███████▊  | 78/100 [10:41<03:00,  8.21s/it]

Epoch [78/100] - Loss: 10.3387


 79%|███████▉  | 79/100 [10:49<02:52,  8.21s/it]

Epoch [79/100] - Loss: 10.3385


 80%|████████  | 80/100 [10:57<02:44,  8.21s/it]

Epoch [80/100] - Loss: 10.3383


 81%|████████  | 81/100 [11:05<02:35,  8.21s/it]

Epoch [81/100] - Loss: 10.3383


 82%|████████▏ | 82/100 [11:13<02:27,  8.21s/it]

Epoch [82/100] - Loss: 10.3381


 83%|████████▎ | 83/100 [11:22<02:19,  8.21s/it]

Epoch [83/100] - Loss: 10.3379


 84%|████████▍ | 84/100 [11:30<02:11,  8.21s/it]

Epoch [84/100] - Loss: 10.3380


 85%|████████▌ | 85/100 [11:38<02:02,  8.20s/it]

Epoch [85/100] - Loss: 10.3382


 86%|████████▌ | 86/100 [11:46<01:54,  8.20s/it]

Epoch [86/100] - Loss: 10.3379


 87%|████████▋ | 87/100 [11:54<01:46,  8.21s/it]

Epoch [87/100] - Loss: 10.3377


 88%|████████▊ | 88/100 [12:03<01:38,  8.20s/it]

Epoch [88/100] - Loss: 10.3377


 89%|████████▉ | 89/100 [12:11<01:30,  8.21s/it]

Epoch [89/100] - Loss: 10.3384


 90%|█████████ | 90/100 [12:19<01:22,  8.21s/it]

Epoch [90/100] - Loss: 10.3371


 91%|█████████ | 91/100 [12:27<01:13,  8.20s/it]

Epoch [91/100] - Loss: 10.3374


 92%|█████████▏| 92/100 [12:35<01:05,  8.20s/it]

Epoch [92/100] - Loss: 10.3375


 93%|█████████▎| 93/100 [12:44<00:57,  8.20s/it]

Epoch [93/100] - Loss: 10.3371


 94%|█████████▍| 94/100 [12:52<00:49,  8.19s/it]

Epoch [94/100] - Loss: 10.3378


 95%|█████████▌| 95/100 [13:00<00:40,  8.19s/it]

Epoch [95/100] - Loss: 10.3373


 96%|█████████▌| 96/100 [13:08<00:32,  8.20s/it]

Epoch [96/100] - Loss: 10.3369


 97%|█████████▋| 97/100 [13:16<00:24,  8.20s/it]

Epoch [97/100] - Loss: 10.3374


 98%|█████████▊| 98/100 [13:25<00:16,  8.20s/it]

Epoch [98/100] - Loss: 10.3367


 99%|█████████▉| 99/100 [13:33<00:08,  8.20s/it]

Epoch [99/100] - Loss: 10.3369


100%|██████████| 100/100 [13:41<00:00,  8.22s/it]

Epoch [100/100] - Loss: 10.3373
Training complete.





In [8]:
# Duration of the whole training loop (all epochs), measured between the two
# timestamps taken around it.
time_elapsed = end_time - start_time # Calculate the total time taken for training in seconds
print(f"Total time taken for training BERT: {time_elapsed:.2f} seconds")

Total time taken for training BERT: 821.55 seconds


In [9]:
# Mean wall-clock time per epoch; this is the measured value the predictions
# are compared against.
time_for_one_epoch = time_elapsed / epochs
print(f"Average time taken for one epoch of BERT: {time_for_one_epoch:.2f} seconds")

Average time taken for one epoch of BERT: 8.22 seconds


In [10]:
# Load the GPU specification table; rows are later selected by the 'GPU' column.
# The path is relative to the notebook's working directory.
gpu_specs = pd.read_csv('../gpus/GPUs.csv')
gpu_specs

Unnamed: 0,GPU,Provisioning,Base Clock (MHz),Boost Clock (MHz),Memory Clock (MHz),Memory (GB),Memory Type,Memory Bus (bit),GPU Memory Bandwidth (GB/s),Bus,...,TMUs,ROPs,SM,TC,RT,PR,TR,FP16,FP32,FP64
0,L4,Cloud,795,2040,1563,24,GDDR6,192,300,PCIe 4.0,...,240,80,60,240,60,163,490,30290,30290,473
1,P4,Cloud,886,1114,1502,8,GDDR5,256,192,PCIe 3.0,...,160,64,20,0,0,71,178,89,5700,178
2,P100,Cloud,1190,1329,715,16,HBM2,4096,732,PCIe 3.0,...,224,96,56,0,0,127,297,19050,9526,4763
3,RTX4090,Local,2235,2520,1313,24,GDDR6X,384,1001,PCIe 4.0,...,512,176,128,512,128,443,1290,82580,82580,1290
4,RTXA4000,Local,735,1560,1750,16,GDDR6,256,448,PCIe 4.0,...,192,96,48,192,48,150,300,19170,19170,300
5,T4,Cloud,585,1590,1250,16,GDDR6,256,320,PCIe 3.0,...,160,64,40,320,40,101,254,65130,8141,254
6,V100,Cloud,1245,1380,876,16,HBM2,4096,897,PCIe 3.0,...,320,128,80,640,0,176,441,28260,14130,7066


In [11]:
# Aliases, not copies: each name refers to the same dict object as its source.
# A later cell rebinds these names to DataFrames.
dense_layers_features = dense_layers
embedding_layers_features = embedding_layers
attention_layers_features = attention_layers
layernorm_layers_features = layernorm_layers

In [12]:
def _with_gpu_specs(layer_specs, gpu_row):
    """Return one row per layer spec with the GPU's columns appended.

    Args:
        layer_specs: Mapping of layer name to a dict of layer parameters.
        gpu_row: Series holding the selected GPU's specification.

    Returns:
        DataFrame with a fresh 0..n-1 index: the layer parameter columns
        followed by the GPU columns, the GPU row repeated for every layer.
    """
    layer_frame = pd.DataFrame.from_dict(layer_specs, orient='index').reset_index(drop=True)
    gpu_frame = pd.DataFrame([gpu_row] * len(layer_specs)).reset_index(drop=True)
    return pd.concat([layer_frame, gpu_frame], axis=1)


# Select the single spec row for the target GPU and fail loudly otherwise; a
# missing or duplicated name would otherwise produce malformed feature frames.
gpu_matches = gpu_specs[gpu_specs['GPU'] == gpu_name]
if len(gpu_matches) != 1:
    raise ValueError(
        f"Expected exactly one row for GPU {gpu_name!r} in gpu_specs, found {len(gpu_matches)}"
    )
gpu = gpu_matches.squeeze()

# Build from the source spec dicts rather than from the *_features names, so
# that re-running this cell does not feed its own output back in as input.
dense_layers_features = _with_gpu_specs(dense_layers, gpu)
embedding_layers_features = _with_gpu_specs(embedding_layers, gpu)
attention_layers_features = _with_gpu_specs(attention_layers, gpu)
layernorm_layers_features = _with_gpu_specs(layernorm_layers, gpu)

In [13]:
# Replace the string labels with integer codes and add a precision column.
# NOTE(review): 4 presumably is the code that predict.features maps back to
# 'Adam', and 32 presumably means 32-bit floats — confirm against that module.
dense_layers_features['optimizer'] = 4
dense_layers_features['precision'] = 32
# An explicit element-wise lookup avoids the FutureWarning that
# Series.replace emits when a str -> int replacement downcasts the column.
# Values without a code pass through unchanged, so re-running the cell is a no-op.
activation_codes = {'ReLU': 1, 'None': 0}
dense_layers_features['activation_fct'] = dense_layers_features['activation_fct'].map(
    lambda value: activation_codes.get(value, value)
)

  dense_layers_features['activation_fct'] = dense_layers_features['activation_fct'].replace({'ReLU': 1, 'None': 0})


In [14]:
dense_layers_features

Unnamed: 0,batchsize,dim_input,dim_output,activation_fct,optimizer,GPU,Provisioning,Base Clock (MHz),Boost Clock (MHz),Memory Clock (MHz),...,ROPs,SM,TC,RT,PR,TR,FP16,FP32,FP64,precision
0,64,768,3072,1,4,V100,Cloud,1245,1380,876,...,128,80,640,0,176,441,28260,14130,7066,32
1,64,3072,768,0,4,V100,Cloud,1245,1380,876,...,128,80,640,0,176,441,28260,14130,7066,32
2,64,768,3072,1,4,V100,Cloud,1245,1380,876,...,128,80,640,0,176,441,28260,14130,7066,32
3,64,3072,768,0,4,V100,Cloud,1245,1380,876,...,128,80,640,0,176,441,28260,14130,7066,32
4,64,768,3072,1,4,V100,Cloud,1245,1380,876,...,128,80,640,0,176,441,28260,14130,7066,32
5,64,3072,768,0,4,V100,Cloud,1245,1380,876,...,128,80,640,0,176,441,28260,14130,7066,32
6,64,768,3072,1,4,V100,Cloud,1245,1380,876,...,128,80,640,0,176,441,28260,14130,7066,32
7,64,3072,768,0,4,V100,Cloud,1245,1380,876,...,128,80,640,0,176,441,28260,14130,7066,32
8,64,768,3072,1,4,V100,Cloud,1245,1380,876,...,128,80,640,0,176,441,28260,14130,7066,32
9,64,3072,768,0,4,V100,Cloud,1245,1380,876,...,128,80,640,0,176,441,28260,14130,7066,32


In [15]:
# Overwrite the string optimizer label with an integer code and add a precision column.
# NOTE(review): 4 presumably encodes 'Adam' and 32 presumably means 32-bit
# floats — confirm against predict.features.
embedding_layers_features['optimizer'] = 4
embedding_layers_features['precision'] = 32

In [16]:
embedding_layers_features

Unnamed: 0,batchsize,seq_len,vocab_size,embed_dim,optimizer,GPU,Provisioning,Base Clock (MHz),Boost Clock (MHz),Memory Clock (MHz),...,ROPs,SM,TC,RT,PR,TR,FP16,FP32,FP64,precision
0,64,128,30522,768,4,V100,Cloud,1245,1380,876,...,128,80,640,0,176,441,28260,14130,7066,32
1,64,128,128,768,4,V100,Cloud,1245,1380,876,...,128,80,640,0,176,441,28260,14130,7066,32


In [17]:
# Overwrite the string optimizer label with an integer code and add a precision column.
# NOTE(review): 4 presumably encodes 'Adam' and 32 presumably means 32-bit
# floats — confirm against predict.features.
attention_layers_features['optimizer'] = 4
attention_layers_features['precision'] = 32

In [18]:
attention_layers_features

Unnamed: 0,batchsize,seq_len,num_heads,embed_dim,optimizer,GPU,Provisioning,Base Clock (MHz),Boost Clock (MHz),Memory Clock (MHz),...,ROPs,SM,TC,RT,PR,TR,FP16,FP32,FP64,precision
0,64,128,12,768,4,V100,Cloud,1245,1380,876,...,128,80,640,0,176,441,28260,14130,7066,32
1,64,128,12,768,4,V100,Cloud,1245,1380,876,...,128,80,640,0,176,441,28260,14130,7066,32
2,64,128,12,768,4,V100,Cloud,1245,1380,876,...,128,80,640,0,176,441,28260,14130,7066,32
3,64,128,12,768,4,V100,Cloud,1245,1380,876,...,128,80,640,0,176,441,28260,14130,7066,32
4,64,128,12,768,4,V100,Cloud,1245,1380,876,...,128,80,640,0,176,441,28260,14130,7066,32
5,64,128,12,768,4,V100,Cloud,1245,1380,876,...,128,80,640,0,176,441,28260,14130,7066,32
6,64,128,12,768,4,V100,Cloud,1245,1380,876,...,128,80,640,0,176,441,28260,14130,7066,32
7,64,128,12,768,4,V100,Cloud,1245,1380,876,...,128,80,640,0,176,441,28260,14130,7066,32
8,64,128,12,768,4,V100,Cloud,1245,1380,876,...,128,80,640,0,176,441,28260,14130,7066,32
9,64,128,12,768,4,V100,Cloud,1245,1380,876,...,128,80,640,0,176,441,28260,14130,7066,32


In [19]:
# Add a precision column and overwrite the string optimizer label with an integer code.
# NOTE(review): 4 presumably encodes 'Adam' and 32 presumably means 32-bit
# floats — confirm against predict.features.
layernorm_layers_features['precision'] = 32
layernorm_layers_features['optimizer'] = 4

In [20]:
layernorm_layers_features

Unnamed: 0,batchsize,seq_len,embed_dim,optimizer,GPU,Provisioning,Base Clock (MHz),Boost Clock (MHz),Memory Clock (MHz),Memory (GB),...,ROPs,SM,TC,RT,PR,TR,FP16,FP32,FP64,precision
0,64,128,768,4,V100,Cloud,1245,1380,876,16,...,128,80,640,0,176,441,28260,14130,7066,32
1,64,128,768,4,V100,Cloud,1245,1380,876,16,...,128,80,640,0,176,441,28260,14130,7066,32
2,64,128,768,4,V100,Cloud,1245,1380,876,16,...,128,80,640,0,176,441,28260,14130,7066,32
3,64,128,768,4,V100,Cloud,1245,1380,876,16,...,128,80,640,0,176,441,28260,14130,7066,32
4,64,128,768,4,V100,Cloud,1245,1380,876,16,...,128,80,640,0,176,441,28260,14130,7066,32
5,64,128,768,4,V100,Cloud,1245,1380,876,16,...,128,80,640,0,176,441,28260,14130,7066,32
6,64,128,768,4,V100,Cloud,1245,1380,876,16,...,128,80,640,0,176,441,28260,14130,7066,32
7,64,128,768,4,V100,Cloud,1245,1380,876,16,...,128,80,640,0,176,441,28260,14130,7066,32
8,64,128,768,4,V100,Cloud,1245,1380,876,16,...,128,80,640,0,176,441,28260,14130,7066,32
9,64,128,768,4,V100,Cloud,1245,1380,876,16,...,128,80,640,0,176,441,28260,14130,7066,32


In [21]:
from predict.features import (
    PreprocessDenseFeatures,
    PreprocessAttentionFeatures,
    PreprocessEmbeddingFeatures,
    PreprocessLayerNormFeatures,
)

# Feature frames built with include_additional_features=True; these feed the
# predictors created with with_features=True.
dense_features = PreprocessDenseFeatures(dense_layers_features, include_additional_features=True).features
attention_features = PreprocessAttentionFeatures(attention_layers_features, include_additional_features=True).features
embedding_features = PreprocessEmbeddingFeatures(embedding_layers_features, include_additional_features=True).features
layernorm_features = PreprocessLayerNormFeatures(layernorm_layers_features, include_additional_features=True).features

# Same inputs with include_additional_features=False; these feed the
# predictors created with with_features=False.
# NOTE(review): both passes receive the same input frames — confirm the
# preprocessors do not modify their input, otherwise this second pass sees
# data already altered by the first.
dense_features_less = PreprocessDenseFeatures(dense_layers_features, include_additional_features=False).features
attention_features_less = PreprocessAttentionFeatures(attention_layers_features, include_additional_features=False).features
embedding_features_less = PreprocessEmbeddingFeatures(embedding_layers_features, include_additional_features=False).features
layernorm_features_less = PreprocessLayerNormFeatures(layernorm_layers_features, include_additional_features=False).features

 'Adam' 'Adam' 'Adam' 'Adam' 'Adam' 'Adam' 'Adam' 'Adam' 'Adam' 'Adam'
 'Adam' 'Adam' 'Adam' 'Adam' 'Adam']' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  self.features.loc[:, 'optimizer'] = self.features['optimizer'].map({0:'None',
 'relu' 'None' 'relu' 'None' 'relu' 'None' 'relu' 'None' 'relu' 'None'
 'relu' 'None' 'relu' 'None' 'None']' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  self.features.loc[:, 'activation_fct'] = self.features['activation_fct'].map({0:'None',
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.features['flops'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the

In [22]:
# Remove the 'GPU' name column from every feature frame before prediction.
# errors='ignore' keeps this self-rebinding cell safe to re-run: on a second
# execution the column is already gone and a plain drop would raise KeyError.
dense_features = dense_features.drop(columns=['GPU'], errors='ignore')
attention_features = attention_features.drop(columns=['GPU'], errors='ignore')
embedding_features = embedding_features.drop(columns=['GPU'], errors='ignore')
layernorm_features = layernorm_features.drop(columns=['GPU'], errors='ignore')

dense_features_less = dense_features_less.drop(columns=['GPU'], errors='ignore')
attention_features_less = attention_features_less.drop(columns=['GPU'], errors='ignore')
embedding_features_less = embedding_features_less.drop(columns=['GPU'], errors='ignore')
layernorm_features_less = layernorm_features_less.drop(columns=['GPU'], errors='ignore')

In [23]:
dense_features

Unnamed: 0,Base Clock (MHz),Boost Clock (MHz),Bus_PCIe 3.0,Bus_PCIe 4.0,Cores,FP32,GPU Memory Bandwidth (GB/s),Memory (GB),Memory Bus (bit),Memory Clock (MHz),...,memory_out,memory_total,memory_weights,optimizer_Adadelta,optimizer_Adagrad,optimizer_Adam,optimizer_None,optimizer_RMSProp,optimizer_SGD,precision
0,1245,1380,1.0,0.0,5120,14130,897,16,4096,876,...,196608,5210112,2359296,0.0,0.0,1.0,0.0,0.0,0.0,32
1,1245,1380,1.0,0.0,5120,14130,897,16,4096,876,...,49152,5210112,2359296,0.0,0.0,1.0,0.0,0.0,0.0,32
2,1245,1380,1.0,0.0,5120,14130,897,16,4096,876,...,196608,5210112,2359296,0.0,0.0,1.0,0.0,0.0,0.0,32
3,1245,1380,1.0,0.0,5120,14130,897,16,4096,876,...,49152,5210112,2359296,0.0,0.0,1.0,0.0,0.0,0.0,32
4,1245,1380,1.0,0.0,5120,14130,897,16,4096,876,...,196608,5210112,2359296,0.0,0.0,1.0,0.0,0.0,0.0,32
5,1245,1380,1.0,0.0,5120,14130,897,16,4096,876,...,49152,5210112,2359296,0.0,0.0,1.0,0.0,0.0,0.0,32
6,1245,1380,1.0,0.0,5120,14130,897,16,4096,876,...,196608,5210112,2359296,0.0,0.0,1.0,0.0,0.0,0.0,32
7,1245,1380,1.0,0.0,5120,14130,897,16,4096,876,...,49152,5210112,2359296,0.0,0.0,1.0,0.0,0.0,0.0,32
8,1245,1380,1.0,0.0,5120,14130,897,16,4096,876,...,196608,5210112,2359296,0.0,0.0,1.0,0.0,0.0,0.0,32
9,1245,1380,1.0,0.0,5120,14130,897,16,4096,876,...,49152,5210112,2359296,0.0,0.0,1.0,0.0,0.0,0.0,32


In [24]:
attention_features

Unnamed: 0,Base Clock (MHz),Boost Clock (MHz),Bus_PCIe 3.0,Bus_PCIe 4.0,Cores,FP32,GPU Memory Bandwidth (GB/s),Memory (GB),Memory Bus (bit),Memory Clock (MHz),...,memory_total,num_heads,optimizer_Adadelta,optimizer_Adagrad,optimizer_Adam,optimizer_None,optimizer_RMSProp,optimizer_SGD,precision,seq_len
0,1245,1380,1.0,0.0,5120,14130,897,16,4096,876,...,78643200,12,0.0,0.0,1.0,0.0,0.0,0.0,32,128
1,1245,1380,1.0,0.0,5120,14130,897,16,4096,876,...,78643200,12,0.0,0.0,1.0,0.0,0.0,0.0,32,128
2,1245,1380,1.0,0.0,5120,14130,897,16,4096,876,...,78643200,12,0.0,0.0,1.0,0.0,0.0,0.0,32,128
3,1245,1380,1.0,0.0,5120,14130,897,16,4096,876,...,78643200,12,0.0,0.0,1.0,0.0,0.0,0.0,32,128
4,1245,1380,1.0,0.0,5120,14130,897,16,4096,876,...,78643200,12,0.0,0.0,1.0,0.0,0.0,0.0,32,128
5,1245,1380,1.0,0.0,5120,14130,897,16,4096,876,...,78643200,12,0.0,0.0,1.0,0.0,0.0,0.0,32,128
6,1245,1380,1.0,0.0,5120,14130,897,16,4096,876,...,78643200,12,0.0,0.0,1.0,0.0,0.0,0.0,32,128
7,1245,1380,1.0,0.0,5120,14130,897,16,4096,876,...,78643200,12,0.0,0.0,1.0,0.0,0.0,0.0,32,128
8,1245,1380,1.0,0.0,5120,14130,897,16,4096,876,...,78643200,12,0.0,0.0,1.0,0.0,0.0,0.0,32,128
9,1245,1380,1.0,0.0,5120,14130,897,16,4096,876,...,78643200,12,0.0,0.0,1.0,0.0,0.0,0.0,32,128


In [25]:
embedding_features

Unnamed: 0,Base Clock (MHz),Boost Clock (MHz),Bus_PCIe 3.0,Bus_PCIe 4.0,Cores,FP32,GPU Memory Bandwidth (GB/s),Memory (GB),Memory Bus (bit),Memory Clock (MHz),...,memory_total,optimizer_Adadelta,optimizer_Adagrad,optimizer_Adam,optimizer_None,optimizer_RMSProp,optimizer_SGD,precision,seq_len,vocab_size
0,1245,1380,1.0,0.0,5120,14130,897,16,4096,876,...,59464704,0.0,0.0,1.0,0.0,0.0,0.0,32,128,30522
1,1245,1380,1.0,0.0,5120,14130,897,16,4096,876,...,12779520,0.0,0.0,1.0,0.0,0.0,0.0,32,128,128


In [26]:
layernorm_features

Unnamed: 0,Base Clock (MHz),Boost Clock (MHz),Bus_PCIe 3.0,Bus_PCIe 4.0,Cores,FP32,GPU Memory Bandwidth (GB/s),Memory (GB),Memory Bus (bit),Memory Clock (MHz),...,batchsize,embed_dim,optimizer_Adadelta,optimizer_Adagrad,optimizer_Adam,optimizer_None,optimizer_RMSProp,optimizer_SGD,precision,seq_len
0,1245,1380,1.0,0.0,5120,14130,897,16,4096,876,...,64,768,0.0,0.0,1.0,0.0,0.0,0.0,32,128
1,1245,1380,1.0,0.0,5120,14130,897,16,4096,876,...,64,768,0.0,0.0,1.0,0.0,0.0,0.0,32,128
2,1245,1380,1.0,0.0,5120,14130,897,16,4096,876,...,64,768,0.0,0.0,1.0,0.0,0.0,0.0,32,128
3,1245,1380,1.0,0.0,5120,14130,897,16,4096,876,...,64,768,0.0,0.0,1.0,0.0,0.0,0.0,32,128
4,1245,1380,1.0,0.0,5120,14130,897,16,4096,876,...,64,768,0.0,0.0,1.0,0.0,0.0,0.0,32,128
5,1245,1380,1.0,0.0,5120,14130,897,16,4096,876,...,64,768,0.0,0.0,1.0,0.0,0.0,0.0,32,128
6,1245,1380,1.0,0.0,5120,14130,897,16,4096,876,...,64,768,0.0,0.0,1.0,0.0,0.0,0.0,32,128
7,1245,1380,1.0,0.0,5120,14130,897,16,4096,876,...,64,768,0.0,0.0,1.0,0.0,0.0,0.0,32,128
8,1245,1380,1.0,0.0,5120,14130,897,16,4096,876,...,64,768,0.0,0.0,1.0,0.0,0.0,0.0,32,128
9,1245,1380,1.0,0.0,5120,14130,897,16,4096,876,...,64,768,0.0,0.0,1.0,0.0,0.0,0.0,32,128


In [27]:
# Number of batches in one epoch; scales each prediction up to a full epoch.
batches_per_epoch = dataset_size / batch_size

# Run each "with features" predictor on its matching feature frame.
# NOTE(review): the outputs are presumably per-layer, per-batch times in
# milliseconds (hence the division by 1000 below) — confirm with the predictor.
raw_dense = dense_predictor.predict(dense_features)
raw_attention = attention_predictor.predict(attention_features)
raw_layernorm = layer_norm_predictor.predict(layernorm_features)
raw_embedding = embedding_predictor.predict(embedding_features)

# Sum over the layers of each family, scaled to one epoch and divided by 1000.
predicted_dense = sum(raw_dense * batches_per_epoch) / 1000
predicted_attention = sum(raw_attention * batches_per_epoch) / 1000
predicted_embedding = sum(raw_embedding * batches_per_epoch) / 1000
predicted_layernorm = sum(raw_layernorm * batches_per_epoch) / 1000

# End-to-end estimate for one epoch, unwrapped to a plain Python number.
predicted_e2e = (
    predicted_dense + predicted_attention + predicted_embedding + predicted_layernorm
).item()

In [28]:
# Number of batches in one epoch; scales each prediction up to a full epoch.
epoch_batches = dataset_size / batch_size

# Run each "without features" predictor on its matching raw feature frame.
# NOTE(review): the outputs are presumably per-layer, per-batch times in
# milliseconds (hence the division by 1000 below) — confirm with the predictor.
raw_dense_less = dense_wo_features_predictor.predict(dense_features_less)
raw_attention_less = attention_wo_feaures_predictor.predict(attention_features_less)
raw_layernorm_less = layer_norm_wo_feaures_predictor.predict(layernorm_features_less)
raw_embedding_less = embedding_wo_feaures_predictor.predict(embedding_features_less)

# Sum over the layers of each family, scaled to one epoch and divided by 1000.
predicted_dense_less = sum(raw_dense_less * epoch_batches) / 1000
predicted_attention_less = sum(raw_attention_less * epoch_batches) / 1000
predicted_embedding_less = sum(raw_embedding_less * epoch_batches) / 1000
predicted_layernorm_less = sum(raw_layernorm_less * epoch_batches) / 1000

# End-to-end estimate for one epoch, unwrapped to a plain Python number.
predicted_e2e_less = (
    predicted_dense_less + predicted_attention_less + predicted_embedding_less + predicted_layernorm_less
).item()

In [29]:
# Absolute relative error of the all-features estimate against the measured
# time per epoch, in percent.
error_pct_all_features = abs(predicted_e2e - time_for_one_epoch) / time_for_one_epoch * 100
print(f"Error in prediction with all features for BERT: {error_pct_all_features:.2f}%")
# Message wording fixed ("will" -> "with") to match the raw-features report.
print(f'Predicted time for one epoch with all features for BERT: {predicted_e2e:.2f} seconds')
print(f'Actual time for one epoch: {time_for_one_epoch:.2f} seconds')

Error in prediction with all features for BERT: 62.97%
Predicted time for one epoch will all features for BERT: 3.04 seconds
Actual time for one epoch: 8.22 seconds


In [30]:
# Absolute relative error of the raw-features estimate against the measured
# time per epoch, in percent.
error_pct_raw_features = abs(predicted_e2e_less - time_for_one_epoch) / time_for_one_epoch * 100
report_lines = (
    f"Error in prediction with raw features for BERT: {error_pct_raw_features:.2f}%",
    f'Predicted time for one epoch with raw features for BERT: {predicted_e2e_less:.2f} seconds',
    f'Actual time for one epoch: {time_for_one_epoch:.2f} seconds',
)
for report_line in report_lines:
    print(report_line)

Error in prediction with raw features for BERT: 64.07%
Predicted time for one epoch with raw features for BERT: 2.95 seconds
Actual time for one epoch: 8.22 seconds
