
Ref:

*   [TPU Graphs](https://arxiv.org/pdf/2308.13490.pdf)
*   [GraphSAGE](https://arxiv.org/pdf/1706.02216.pdf)
*   [Ranked List Loss for Deep Metric Learning](https://arxiv.org/pdf/1903.03238.pdf)


TODO:

*   split dataset to make upload faster
*   test validation


Notes:

*   Training simple model with MSE loss:
    *   needs hyperparameter search
    *   why does the loss spike at the beginning of each epoch? (batches are randomized)
    *   observations: it seems that the smaller models just learn some average absolute value, but not really a ranking
    *   the model would probably have to be huge to rank correctly

*   Training simple model with ranking loss:
    *   Ranked List Loss
    *   Extract the k smallest runtimes

*   Abandoning simple model:
    *   Replicate TPU paper:
        *   GraphSAGE
        *   ResGCN

## Dependencies


In [None]:
!pip install torch-geometric
!pip install einops

Collecting torch-geometric
  Downloading torch_geometric-2.3.1.tar.gz (661 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m661.6/661.6 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: torch-geometric
  Building wheel for torch-geometric (pyproject.toml) ... [?25l[?25hdone
  Created wheel for torch-geometric: filename=torch_geometric-2.3.1-py3-none-any.whl size=910454 sha256=904ea7c403cfc362d9d53766c4a10f2b1487ff61c40a6e912780f7a2b46cdc1f
  Stored in directory: /root/.cache/pip/wheels/ac/dc/30/e2874821ff308ee67dcd7a66dbde912411e19e35a1addda028
Successfully built torch-geometric
Installing collected packages: torch-geometric
Successfully installed torch-geometric-2.3.1
Collecting einops
  Downloading einops-0.6.1-py3-none-any.whl (42 kB)
[2K     [90m━━━━━

In [None]:
import os
from tqdm import tqdm

import numpy as np
import pandas as pd

import torch
from torch import nn
from torch.utils.data import Dataset
from torch_geometric import nn as gnn

from torch.nn import Linear, ReLU, Dropout
from torch_geometric.nn import GCNConv
from torch_geometric.nn import global_mean_pool
from einops import reduce, repeat, rearrange

from torch.utils.data import Dataset, DataLoader

In [None]:
# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
# The training and validation functions in this notebook read this global.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


## Load Data

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import shutil
shutil.unpack_archive("/content/drive/MyDrive/google-tpu/predict-ai-model-runtime.zip", "/content/data")

In [None]:
# Names of the per-split sub-directories; one of these is passed as `split`
# to load_data_to_df.
splits = ["train", "valid", "test"]

# Root directories of the layout collections inside the unpacked archive.
nlp_default = '/content/data/npz_all/npz/layout/nlp/default'
nlp_random = '/content/data/npz_all/npz/layout/nlp/random'
xla_default = '/content/data/npz_all/npz/layout/xla/default'
xla_random = '/content/data/npz_all/npz/layout/xla/random'

# Root directory of the tile collection (the one loaded in the Testing section).
xla_tile = '/content/data/npz_all/npz/tile/xla'

In [None]:
def load_data_to_df(directory, split):
    '''
    Load every archive of one dataset split into a DataFrame.

    directory: root directory of a collection (e.g. `xla_tile`)
    split: name of the sub-directory to read (e.g. "train")

    Returns a pandas DataFrame with one row per file and one column per
    array stored in the files.
    '''
    path = os.path.join(directory, split)

    # os.listdir yields entries in arbitrary order; sorting keeps the row
    # index of every graph stable between runs, so seeded shuffling is
    # actually reproducible.
    files = [os.path.join(path, name) for name in sorted(os.listdir(path))]

    data_list = []
    for file in tqdm(files):
        # np.load keeps the archive open until it is closed explicitly;
        # copy the arrays out and release the file handle right away.
        with np.load(file) as archive:
            data_list.append(dict(archive))

    return pd.DataFrame(data_list)

### geometric dataloader

In [None]:
from torch_geometric.data import Dataset, Data
class RuntimeDataset(Dataset):
    '''
    PyTorch Geometric dataset backed by a DataFrame with one graph per row.

    Each row is expected to provide the columns read in `get`:
    `node_feat`, `edge_index`, `node_opcode`, `config_feat`,
    `config_runtime` and `config_runtime_normalizers`.
    '''

    def __init__(self, dataset):
        super().__init__()
        self.dataset = dataset

    def len(self):
        # The PyG Dataset base class implements __len__/__getitem__ itself and
        # delegates to len()/get(), so these are the methods to override.
        return len(self.dataset)

    def get(self, index):
        '''
        Build the `Data` object for the graph at position `index`.
        '''
        # Positional lookup: the index handed in by the base class is a
        # position in range(len), which only matches .loc labels while the
        # DataFrame still has its default RangeIndex.
        data_row = self.dataset.iloc[index]
        normalized_runtime = torch.tensor(
            data_row['config_runtime'] / data_row['config_runtime_normalizers'],
            dtype=torch.float32,
        )
        return Data(
            node_feat=torch.tensor(data_row['node_feat'], dtype=torch.float32),
            # stored row-wise in the file; transposed here before being handed to Data
            edge_index=torch.tensor(data_row['edge_index'], dtype=torch.long).t().contiguous(),
            node_opcode=torch.tensor(data_row['node_opcode'], dtype=torch.int32),
            config_feat=torch.tensor(data_row['config_feat'], dtype=torch.float32),
            y=normalized_runtime,
            # needed to match config_feat to the corresponding graph in the batch
            number_configs=torch.tensor([len(data_row['config_feat'])]),
        )

In [None]:
from torch_geometric.loader import DataLoader

def runtime_data_loader(dataset: RuntimeDataset, batch_size=32, shuffle=True):
    '''
    Wrap a RuntimeDataset in a torch_geometric DataLoader.

    dataset: RuntimeDataset, containing data
    batch_size: number of graphs fused into one batch
    shuffle: if True the graphs are drawn in random order

    Returns the DataLoader.
    '''
    # `shuffle` used to be ignored (always True), which made it impossible to
    # build a deterministic loader, e.g. for validation.
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

## Model

In [None]:
from torch_geometric.data import Batch
class GNN(nn.Module):
    '''
    GCN encoder followed by an MLP head that predicts one value per configuration.

    len_opcode_embedd: size of the learned opcode embedding
    hidden_dim: width of the intermediate GCN layers
    output_dim: width of the last GCN layer (the graph embedding size)
    num_layers: number of GCN layers; at least two layers are always built
    num_opcodes: size of the opcode vocabulary (default 120)
    node_feat_dim: number of features per node (default 140)
    config_feat_dim: number of features per configuration (default 24)

    `forward` accepts either a batched `Batch` (as produced by the geometric
    DataLoader) or a single graph object and returns a 1-D tensor with one
    prediction per configuration, in the order of `config_feat`.
    '''

    def __init__(self, len_opcode_embedd, hidden_dim, output_dim, num_layers,
                 num_opcodes=120, node_feat_dim=140, config_feat_dim=24):
        super(GNN, self).__init__()

        self.len_opcode = len_opcode_embedd
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.num_layers = num_layers

        # Embeddings
        self.embedding_layer = nn.Embedding(num_embeddings=num_opcodes, embedding_dim=len_opcode_embedd)

        # GNN
        input_dim = len_opcode_embedd + node_feat_dim
        layers = [GCNConv(input_dim, hidden_dim)]
        for _ in range(num_layers - 2):
            layers.append(GCNConv(hidden_dim, hidden_dim))
        layers.append(GCNConv(hidden_dim, output_dim))
        # Only used as a registered container: forward iterates over it because
        # every layer also needs edge_index.
        self.conv = nn.Sequential(*layers)

        # Linear
        self.linear = nn.Sequential(
            nn.Linear(output_dim + config_feat_dim, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )

    def forward(self, data):
        opcode_embedd = self.embedding_layer(data['node_opcode'])  # (n,) -> (n, len_opcode_embedd)

        # [(n, len_opcode_embedd), (n, node_feat_dim)] -> (n, len_opcode_embedd + node_feat_dim)
        x = torch.cat((opcode_embedd, data['node_feat']), dim=1)

        for layer in self.conv:
            x = torch.relu(layer(x, data['edge_index']))

        # differ two cases:
        # 1) batched data used for training
        # 2) single graph Data object used for inference
        if isinstance(data, Batch):
            return self._predict_batch(x, data)

        # only working with one graph object
        graph_embedding = reduce(x, 'n f -> f', 'mean')  # (n, output_dim) -> (output_dim,)
        per_config = repeat(graph_embedding, 'f -> r f', r=len(data['config_feat']))  # -> (c, output_dim)
        per_config = torch.cat((per_config, data['config_feat']), dim=1)  # -> (c, output_dim + config_feat_dim)
        return rearrange(self.linear(per_config), 'f 1 -> f')

    def _predict_batch(self, x, data):
        '''
        Apply the MLP head graph by graph on a batch.

        The geometric data loader fuses the nodes of all graphs of a batch
        into one big graph, so the convolutions run on all nodes at once.
        For the head the graphs have to be separated again: every graph gets
        its own mean-pooled embedding, which is paired with that graph's
        slice of `config_feat`.
        '''
        # Read all per-graph config counts in one go instead of converting a
        # tensor scalar on every loop iteration.
        counts = data.number_configs.tolist()

        predictions = []
        offset = 0  # start of the current graph's rows in config_feat
        for graph_ind in range(data.num_graphs):
            c = counts[graph_ind]

            # nodes that belong to the graph_ind-th graph: (n_graph, output_dim)
            graph_nodes = x[data.batch == graph_ind]

            # reduce node embeddings to get a graph embedding
            graph_embedding = reduce(graph_nodes, 'n f -> f', 'mean')

            # config_feat rows of the graph_ind-th graph
            graph_config_feat = data.config_feat[offset:offset + c]
            offset += c

            # concatenate the graph embedding with every configuration: (c, output_dim + config_feat_dim)
            per_config = repeat(graph_embedding, 'f -> r f', r=c)
            per_config = torch.cat((per_config, graph_config_feat), dim=1)

            predictions.append(self.linear(per_config))

        if not predictions:
            # keep the result on the same device/dtype as the activations
            # instead of relying on a global `device`
            return x.new_empty(0)

        # single concatenation instead of growing a tensor inside the loop
        return rearrange(torch.cat(predictions, dim=0), 'f 1 -> f')

In [None]:
class GNN_bugged(nn.Module):
    '''
    Earlier variant of the GCN + MLP model, kept for comparison.

    `forward` averages over all nodes it is given and pairs that single
    embedding with every row of `config_feat`; it never looks at a batch
    assignment vector.
    NOTE(review): presumably this is why the class is called "bugged" --
    on a fused batch all graphs share one embedding. Confirm with the author.
    '''

    def __init__(self, len_opcode_embedd, hidden_dim, output_dim, num_layers):
        super().__init__()

        # opcode id -> dense vector
        self.embedding_layer = nn.Embedding(embedding_dim=len_opcode_embedd, num_embeddings=120)

        # graph convolutions; consecutive entries of `widths` are the
        # (in, out) sizes of one layer, built in first-to-last order
        widths = [len_opcode_embedd + 140, hidden_dim]
        widths += [hidden_dim] * (num_layers - 2)
        widths.append(output_dim)
        self.conv = nn.Sequential(
            *[GCNConv(w_in, w_out) for w_in, w_out in zip(widths[:-1], widths[1:])]
        )

        # MLP head applied to [graph embedding, config features]
        head = [nn.Linear(output_dim + 24, 32), nn.ReLU(), nn.Linear(32, 1)]
        self.linear = nn.Sequential(*head)

    def forward(self, data):
        # (n, len_opcode_embedd) and (n, 140) side by side -> (n, len_opcode_embedd + 140)
        node_repr = torch.cat(
            (self.embedding_layer(data['node_opcode']), data['node_feat']),
            dim=1,
        )

        edges = data['edge_index']
        for conv in self.conv:
            node_repr = torch.relu(conv(node_repr, edges))

        # (n, output_dim) -> (output_dim,): one embedding for everything passed in
        pooled = reduce(node_repr, 'n f -> f', 'mean')

        # copy the embedding once per configuration: (c, output_dim)
        tiled = repeat(pooled, 'f -> r f', r=len(data['config_feat']))

        # (c, output_dim + 24) -> (c, 1) -> (c,)
        scores = self.linear(torch.cat((tiled, data['config_feat']), dim=1))
        return rearrange(scores, 'b 1 -> b')

In [None]:
class dummy_model(nn.Module):
    '''
    Graph-free baseline: a three-layer MLP on the 24 configuration features.

    `forward` takes a (c, 24) tensor and returns a (c,) tensor of predictions.
    '''

    def __init__(self):
        super().__init__()
        hidden = 32
        self.linear = nn.Sequential(
            nn.Linear(24, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, 1),
        )

    def forward(self, x):
        # (c, 24) -> (c, 1) -> (c,)
        return rearrange(self.linear(x), 'f 1 -> f')

In [None]:
class testing(nn.Module):
    '''
    Stand-in "model" used to check the validation code: it answers with the
    targets stored on the input.

    prob: stored on the instance; `forward` does not read it.
    '''

    def __init__(self, prob: float):
        super().__init__()
        self.prob = prob

    def forward(self, data):
        targets = data.y
        return targets


## Train Loop

In [None]:
def MSE_training(model: GNN, dataloader: DataLoader, epochs: int, lr=0.01):
    '''
    Train a graph model with Adam and an MSE loss on the normalized runtimes.

    model: module called as `model(batch)`; must return one prediction per entry of `batch['y']`
    dataloader: yields batches that support `.to(device)` and `['y']`
    epochs: number of passes over the dataloader
    lr: Adam learning rate

    The model is moved to the global `device` and updated in place; nothing is returned.
    Progress is printed every 200 batches.
    '''

    print(type(model))

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss_fn = torch.nn.MSELoss()

    model.to(device)
    model.train()

    for epoch in range(epochs):
        for batch, data in enumerate(dataloader):
            data = data.to(device)
            optimizer.zero_grad()
            x_pred = model(data)
            loss = loss_fn(x_pred, data['y'])
            loss.backward()
            # Apply the gradients. Without this call the weights never change
            # and the printed loss only reflects the random initialisation.
            optimizer.step()
            if batch % 200 == 0:
                print('Epoch [{}/{}], Batch [{}/{}], Loss: {:.4f}'.format(epoch+1, epochs, batch+1, len(dataloader), loss.item()))

In [None]:
def train_dummy(model, dataloader, epochs, lr=0.01):
    '''
    Train a config-feature-only model with Adam and an MSE loss.

    model: module called as `model(batch.config_feat)`; must return one prediction per entry of `batch['y']`
    dataloader: yields batches that support `.to(device)`, `.config_feat` and `['y']`
    epochs: number of passes over the dataloader
    lr: Adam learning rate

    The model is moved to the global `device` and updated in place; nothing is returned.
    Progress is printed every 200 batches.
    '''

    print(type(model))

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss_fn = torch.nn.MSELoss()

    model.to(device)
    model.train()

    for epoch in range(epochs):
        for batch, data in enumerate(dataloader):
            data = data.to(device)
            optimizer.zero_grad()
            x_pred = model(data.config_feat)
            loss = loss_fn(x_pred, data['y'])
            loss.backward()
            # Apply the gradients. Without this call the weights never change
            # and the printed loss only reflects the random initialisation.
            optimizer.step()
            if batch % 200 == 0:
                print('Epoch [{}/{}], Batch [{}/{}], Loss: {:.4f}'.format(epoch+1, epochs, batch+1, len(dataloader), loss.item()))

## Validation

In [None]:
import torch

# Sanity check of the ranking primitive used by validate_graph_ranking:
# ascending argsort lists the indices of the elements from smallest to largest.
x = torch.tensor([3.0, 4.0, 5.0, 1.0, 2.0])
x_ranked = torch.argsort(x.squeeze(), descending=False)

print(x_ranked)
# Output: tensor([3, 4, 0, 1, 2])

tensor([3, 4, 0, 1, 2])


In [None]:
def validate_graph_ranking(x: torch.Tensor, y: torch.Tensor, k: int):
    '''
    evaluation metric for tiles

    x: predicted scores, one per configuration (lower = predicted faster); shape (c,) or (c, 1)
    y: measured runtimes, one per configuration, same order as x
    k: number of best-ranked predictions to consider

    Returns a 0-dim tensor `2 - best_runtime_among_top_k / best_runtime`.
    The value is exactly 1 when the k best-ranked configurations contain the
    truly fastest one and drops below 1 otherwise.

    Raises ValueError if k < 1 or if there are no configurations.
    '''
    if k < 1:
        raise ValueError("k must be at least 1")

    # Flatten explicitly. squeeze() would turn a single-configuration input
    # into a 0-dim tensor, which cannot be ranked or sliced.
    predictions = x.reshape(-1)
    runtimes = y.reshape(-1)
    if predictions.numel() == 0:
        raise ValueError("cannot rank an empty set of configurations")

    # indices of the (at most) k configurations with the lowest predictions
    top_k = torch.argsort(predictions, descending=False)[:k]

    best_runtime = runtimes.min()
    k_top_best_runtime = runtimes[top_k].min()

    return 2 - k_top_best_runtime / best_runtime

In [None]:
def validate_model(model: GNN, dataloader: DataLoader, k: int):
    '''
    counts graphs for which the top k predictions contained the actual best runtime

    model: module called as `model(batch)`, returning one prediction per configuration
    dataloader: yields batches exposing `num_graphs`, `number_configs` and `y`
    k: number of best-ranked predictions considered per graph

    Prints the count and percentage; nothing is returned.
    '''
    correct, total = 0, 0
    model.to(device)

    # Evaluate in eval mode and put the model back into whatever mode it was
    # in, so a later training call is not affected.
    was_training = model.training
    model.eval()
    try:
        # no gradients are needed for validation; skip building autograd graphs
        with torch.no_grad():
            for data in tqdm(dataloader):
                data = data.to(device)
                total += data.num_graphs
                x_pred = model(data)

                # predictions and targets of a batch are laid out graph after
                # graph; number_configs gives the length of each graph's slice
                counts = data.number_configs.tolist()
                start = 0
                for i in range(data.num_graphs):
                    stop = start + counts[i]
                    if validate_graph_ranking(x_pred[start:stop], data.y[start:stop], k) == 1:
                        correct += 1
                    start = stop
    finally:
        model.train(was_training)

    # an empty dataloader would otherwise raise ZeroDivisionError
    percentage = 100*correct/total if total else 0.0
    print(f'correct k-tops predictions: [{correct}/{total}], {percentage}%')

In [None]:
def validate_dummy_model(model, dataloader: DataLoader, k: int):
    '''
    counts graphs for which the top k predictions contained the actual best runtime

    model: module called as `model(batch.config_feat)`, returning one prediction per configuration
    dataloader: yields batches exposing `num_graphs`, `number_configs`, `config_feat` and `['y']`
    k: number of best-ranked predictions considered per graph

    Prints the count and percentage; nothing is returned.
    '''
    correct, total = 0, 0
    model.to(device)

    # Evaluate in eval mode and put the model back into whatever mode it was
    # in, so a later training call is not affected.
    was_training = model.training
    model.eval()
    try:
        # no gradients are needed for validation; skip building autograd graphs
        with torch.no_grad():
            for data in tqdm(dataloader):
                data = data.to(device)
                total += data.num_graphs
                x_pred = model(data.config_feat)

                # predictions and targets of a batch are laid out graph after
                # graph; number_configs gives the length of each graph's slice
                counts = data.number_configs.tolist()
                start = 0
                for i in range(data.num_graphs):
                    stop = start + counts[i]
                    if validate_graph_ranking(x_pred[start:stop], data['y'][start:stop], k) == 1:
                        correct += 1
                    start = stop
    finally:
        model.train(was_training)

    # an empty dataloader would otherwise raise ZeroDivisionError
    percentage = 100*correct/total if total else 0.0
    print(f'correct k-tops predictions: [{correct}/{total}], {percentage}%')

## Testing

### data_loader


In [None]:
df = load_data_to_df(xla_tile, "train")

100%|██████████| 5709/5709 [00:15<00:00, 357.14it/s]


In [None]:
torch.manual_seed(42)
dataset = RuntimeDataset(df)
data_loader = runtime_data_loader(dataset, batch_size=4)
len(data_loader)

1428

### validation

In [None]:
test_val = testing(0.5)
validate_model(test_val, data_loader, 5)

100%|██████████| 1428/1428 [00:06<00:00, 206.06it/s]


correct k-tops predictions: [5709/5709], 100.0%


### dummy model

In [None]:
torch.manual_seed(42)
dummy = dummy_model()
train_dummy(dummy, data_loader, 3)


<class '__main__.dummy_model'>
Epoch [1/3], Batch [1/1428], Loss: 33.4778
Epoch [1/3], Batch [201/1428], Loss: 36.1460
Epoch [1/3], Batch [401/1428], Loss: 153.7309
Epoch [1/3], Batch [601/1428], Loss: 44.9505
Epoch [1/3], Batch [801/1428], Loss: 76.1029
Epoch [1/3], Batch [1001/1428], Loss: 97.7942
Epoch [1/3], Batch [1201/1428], Loss: 18.3323
Epoch [1/3], Batch [1401/1428], Loss: 24.0022
Epoch [2/3], Batch [1/1428], Loss: 41.0310
Epoch [2/3], Batch [201/1428], Loss: 41.1890
Epoch [2/3], Batch [401/1428], Loss: 55.5533
Epoch [2/3], Batch [601/1428], Loss: 54.1709
Epoch [2/3], Batch [801/1428], Loss: 44.5520
Epoch [2/3], Batch [1001/1428], Loss: 13.8332
Epoch [2/3], Batch [1201/1428], Loss: 28.6817
Epoch [2/3], Batch [1401/1428], Loss: 28.2588
Epoch [3/3], Batch [1/1428], Loss: 516.8983
Epoch [3/3], Batch [201/1428], Loss: 109.7388
Epoch [3/3], Batch [401/1428], Loss: 210.7040
Epoch [3/3], Batch [601/1428], Loss: 17.1954
Epoch [3/3], Batch [801/1428], Loss: 28.5319
Epoch [3/3], Batch [

In [None]:
validate_dummy_model(dummy, data_loader, 5)

100%|██████████| 1428/1428 [00:10<00:00, 136.55it/s]

correct k-tops predictions: [470/5709], 8.232615169031353%





### gnn

In [None]:
torch.manual_seed(42)

model_bugged = GNN_bugged(len_opcode_embedd=8, hidden_dim=32, output_dim=16, num_layers=3)
MSE_training(model_bugged, data_loader, 3, lr=0.0001)

<class '__main__.GNN_bugged'>
Epoch [1/3], Batch [1/1428], Loss: 110787552.0000
Epoch [1/3], Batch [201/1428], Loss: 712377024.0000
Epoch [1/3], Batch [401/1428], Loss: 178438800.0000
Epoch [1/3], Batch [601/1428], Loss: 33185614.0000
Epoch [1/3], Batch [801/1428], Loss: 251439200.0000
Epoch [1/3], Batch [1001/1428], Loss: 11946582.0000
Epoch [1/3], Batch [1201/1428], Loss: 228379216.0000
Epoch [1/3], Batch [1401/1428], Loss: 235065065472.0000
Epoch [2/3], Batch [1/1428], Loss: 5062701056.0000
Epoch [2/3], Batch [201/1428], Loss: 7467978752.0000
Epoch [2/3], Batch [401/1428], Loss: 1007073344.0000
Epoch [2/3], Batch [601/1428], Loss: 52635556.0000
Epoch [2/3], Batch [801/1428], Loss: 283247456.0000
Epoch [2/3], Batch [1001/1428], Loss: 12246102016.0000
Epoch [2/3], Batch [1201/1428], Loss: 162581728.0000
Epoch [2/3], Batch [1401/1428], Loss: 108249000.0000
Epoch [3/3], Batch [1/1428], Loss: 259787024.0000
Epoch [3/3], Batch [201/1428], Loss: 1033679872.0000
Epoch [3/3], Batch [401/1428

In [None]:
validate_model(model_bugged, data_loader, 5)

100%|██████████| 1428/1428 [00:15<00:00, 95.16it/s] 


correct k-tops predictions: [329/5709], 5.7628306183219475%


In [None]:
torch.manual_seed(42)
model = GNN(len_opcode_embedd=32, hidden_dim=512, output_dim=256, num_layers=8)
MSE_training(model, data_loader, 15, lr=0.1)

<class '__main__.GNN'>
Epoch [1/15], Batch [1/1428], Loss: 2142616.7500
Epoch [1/15], Batch [201/1428], Loss: 48851048.0000
Epoch [1/15], Batch [401/1428], Loss: 18449040.0000
Epoch [1/15], Batch [601/1428], Loss: 29798194.0000
Epoch [1/15], Batch [801/1428], Loss: 1977245.0000
Epoch [1/15], Batch [1001/1428], Loss: 48957076.0000
Epoch [1/15], Batch [1201/1428], Loss: 468172.1250
Epoch [1/15], Batch [1401/1428], Loss: 1801847936.0000
Epoch [2/15], Batch [1/1428], Loss: 3641160.7500
Epoch [2/15], Batch [201/1428], Loss: 9909286.0000
Epoch [2/15], Batch [401/1428], Loss: 236798544.0000
Epoch [2/15], Batch [601/1428], Loss: 2537408.2500
Epoch [2/15], Batch [801/1428], Loss: 17903664.0000
Epoch [2/15], Batch [1001/1428], Loss: 543794752.0000
Epoch [2/15], Batch [1201/1428], Loss: 26592594.0000
Epoch [2/15], Batch [1401/1428], Loss: 6845207.0000
Epoch [3/15], Batch [1/1428], Loss: 42193964.0000
Epoch [3/15], Batch [201/1428], Loss: 64478348.0000
Epoch [3/15], Batch [401/1428], Loss: 4497660

In [None]:
validate_model(model, data_loader, 5)

100%|██████████| 1428/1428 [00:25<00:00, 55.64it/s]

correct k-tops predictions: [156/5709], 2.7325275880189177%





In [None]:
# Control run: score freshly initialised models (no training call) so the
# trained results above can be compared against an untrained baseline.
torch.manual_seed(42)
model_control = GNN(len_opcode_embedd=8, hidden_dim=32, output_dim=16, num_layers=3)
validate_model(model_control, data_loader, 5)

# Same seed and hyperparameters for the GNN_bugged variant.
torch.manual_seed(42)
model_control_bugged = GNN_bugged(len_opcode_embedd=8, hidden_dim=32, output_dim=16, num_layers=3)
validate_model(model_control_bugged, data_loader, 5)

NameError: ignored

In [None]:
torch.manual_seed(42)
dataset = RuntimeDataset(df)
data_loader = runtime_data_loader(dataset)
len(data_loader)