<a href="https://colab.research.google.com/github/max-seeli/ai-model-runtime-prediction/blob/main/google_tpu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


Ref:

*   [TPU Graphs](https://arxiv.org/pdf/2308.13490.pdf)
*   [GraphSAGE](https://arxiv.org/pdf/1706.02216.pdf)
*   [Ranked List Loss for Deep Metric Learning](https://arxiv.org/pdf/1903.03238.pdf)


TODO:

*   split dataset to make upload faster
*   test validation


Notes:

*   Training simple model with MSE loss:
    *   need hyperparamter search
    *   why does the loss spice at the beginning of each epoch (batches are randomized)
    *   oberservations: seems that the smaller models just learn some average absolute value, but not really a ranking
    * probably model would have to be huge to rank correctly

*   Training simple model with ranking loss:
    *   Ranked List Loss
    *   Extract the smallest k times

*   Abandoning simple model:
    *   Replicate TPU paper:
        *   SageGraphs
        *   ResGCN

## Dependencies


In [2]:
!pip install torch-geometric
!pip install einops



In [3]:
import os
from tqdm import tqdm

import numpy as np
import pandas as pd

import torch
from torch import nn

from torch_geometric import nn as gnn

from torch.nn import Linear, ReLU, Dropout
from torch_geometric.nn import GCNConv
from torch_geometric.nn import global_mean_pool
from einops import reduce, repeat, rearrange

from torch.utils.data import Dataset, DataLoader

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


## Load Data

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import shutil
shutil.unpack_archive("/content/drive/MyDrive/google-tpu/predict-ai-model-runtime.zip", "/content/data")

In [5]:
splits = ["train", "valid", "test"]

nlp_default = '/content/data/npz_all/npz/layout/nlp/default'
nlp_random = '/content/data/npz_all/npz/layout/nlp/random'
xla_default = '/content/data/npz_all/npz/layout/xla/default'
xla_random = '/content/data/npz_all/npz/layout/xla/random'
xla_tile = '/content/data/npz_all/npz/tile/xla'

In [6]:
def load_data_to_df(directory, split):

    path = os.path.join(directory, split)
    files = [os.path.join(path, file) for file in os.listdir(path)]
    data_list = []
    for file in tqdm(files):
        data = dict(np.load(file))
        data_list.append(data)

    return pd.DataFrame(data_list)

### geometric dataloader

In [7]:
from torch_geometric.data import Dataset, Data
from typing import Literal

class RuntimeDataset(Dataset):

    def __init__(self, data, mode: Literal["tile", "layout"]):
            super().__init__()
            if mode == "tile":
              self.dataset = data
              self.mode = mode

            elif mode == "layout":

              '''
              Approach:
              For every graph add c number of graphs to the dataset where every one of these graphs has node features that are the concat node features.
              (nodes that don't have additional features are padded with zeros)
              This has implications for training: we are now predicting a scalar value for each graph configuration (can not really train with ranking loss anymore)
              '''

              self.dataset = pd.DataFrame(columns=['graph_id', 'node_feat', 'node_opcode', 'edge_index', 'config_runtime'])

              for index, row in data.iterrows():

                c = row['config_runtime'].shape[0]

                node_feat = torch.tensor(row['node_feat'], dtype=torch.float32) # (n, 140)
                node_config_feat = torch.tensor(row['node_config_feat'], dtype=torch.float32) # (c, nc, 18)

                node_feat = repeat(node_feat, 'n f -> r n f', r = c) # repeats every node c times (c, n, 140)
                node = torch.zeros((c, node_feat.shape[1], 18)) # (c, n, 18)

                node = torch.cat((node_feat, node), dim=2)

                for j in range(node_config_feat.shape[1]):
                  node[:,j,18:] = node_config_feat[:,j,:]

                for i in range(c):

                  graph = {'graph_id': index,
                           'node_feat': node[i],
                           'node_opcode': row['node_opcode'],
                           'edge_index': row['edge_index'],
                           'config_runime': row['config_runtime'][i]
                           }
                  self.dataset.append(graph, ignore_index=True)


    def len(self):
        return len(self.dataset)

    def get(self, index):
        data_row = self.dataset.loc[index]
        if self.mode == "tile":
          normalized_runtime = torch.tensor(data_row['config_runtime'] / data_row['config_runtime_normalizers'], dtype=torch.float32)
          return Data(
            node_feat=torch.tensor(data_row['node_feat'], dtype=torch.float32),
            edge_index=torch.tensor(data_row['edge_index'], dtype=torch.long).t().contiguous(),
            node_opcode=torch.tensor(data_row['node_opcode'], dtype=torch.int32),
            config_feat=torch.tensor(data_row['config_feat'], dtype=torch.float32),
            y=normalized_runtime, # TODO: rename
            number_configs=torch.tensor([len(data_row['config_runtime'])]) # needed to match config_feat to the corresponding graph in the batch
          )

        else:
          data_row = self.datset.loc[index]
          return Data(
            graph_id = torch.tensor(data_row['graph_id'], dtype=torch.long),
            node_feat=data_row['node_feat'],
            edge_index=torch.tensor(data_row['edge_index'], dtype=torch.long).t().contiguous(),
            node_opcode=torch.tensor(data_row['node_opcode'], dtype=torch.int32),
            config_runtime=torch.tensor(data_row['config_runtime'], dtype=torch.long)
          )


In [8]:
from torch_geometric.loader import DataLoader

def runtime_data_loader(dataset: RuntimeDataset, batch_size=32, shuffle=True):
    '''
       dataset: RuntimeDataset, containing data
    '''

    data_loader = DataLoader(dataset, batch_size, shuffle=True)
    return data_loader

## Model

In [None]:
from torch_geometric.data import Batch
class Tile_GNN(nn.Module):
    def __init__(self, len_opcode_embedd, hidden_dim, output_dim, num_layers):
        super(Tile_GNN, self).__init__()

        self.len_opcode = len_opcode_embedd
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.num_layers = num_layers

        # Embeddings
        self.embedding_layer = nn.Embedding(num_embeddings = 120, embedding_dim=len_opcode_embedd)

        #GNN
        input_dim = len_opcode_embedd + 140
        layers = nn.ModuleList()
        layers.append(GCNConv(input_dim, hidden_dim))
        for _ in range(num_layers - 2): # TODO list comp, ReLU
            layers.append(GCNConv(hidden_dim, hidden_dim))
        layers.append(GCNConv(hidden_dim, output_dim))

        self.conv = layers

        # Linear
        self.linear = nn.Sequential(
            nn.Linear(output_dim + 24, 48),
            nn.ReLU(),
            nn.Linear(48, 48),
            nn.ReLU(),
            nn.Linear(48, 1)
        )

    def forward(self, data):

        opcode_embedd = self.embedding_layer(data['node_opcode']) # (n,) -> (n,len_opcode_embedd)

        x = torch.cat((opcode_embedd, data['node_feat']), dim=1) # [(n, len_opcode_embedd), (n,140)] -> (n, len_opcode_embedd + 140)

        for layer in self.conv:
          x = layer(x, data['edge_index'])
          x = torch.relu(x)

        # differ two cases:
        # 1) batched data used for training
        # 2) single graph Data object used for inference

        if isinstance(data, Batch):

          '''
          The geometric data loader will take batch_size number of graphs. Then it will take all nodes in all these graphs and fuse them together into one graph.
          On this fuesed graph it will perform the convolution to calculate all the node embeddings at once.
          To apply the linear layer we have to seperate out all the graphs out of the batch again.
          '''

          # tensor used to store config predictions for each graph
          configs = torch.empty(0,1).to(device)

          # used to retrieve the config_feat tensors for each graph
          total = 0

          for graph_ind in range(data.num_graphs):

            # using a mask to gather all nodes that belong to the graph_ind-th graph
            node_indices = (data.batch == graph_ind).nonzero(as_tuple=True)[0]
            # first dimension is the number of nodes in the graph with index graph_ind, second dimensions is the feature dimension of the convolution (n, output_dim)
            graph_nodes = x[node_indices]

            # reduce node embeedings to get a graph embedding
            temp = reduce(graph_nodes, 'n f -> f', 'mean')

            # number of configurations for the graph_ind-th graph
            c = data.number_configs[graph_ind]

            # config_feat for the graph_ind-th graph
            graph_config_feat = data.config_feat[total:total+c]

            total += c

            # concatinating graph embedding with config_feat
            temp = repeat(temp, 'f -> r f', r=c)
            temp = torch.cat((temp, graph_config_feat), dim=1)


            # apply linear layer to tensor with shape (c, output_dim+24)
            temp = self.linear(temp)

            # add calculated runtimes to configs
            configs = torch.cat((configs, temp), dim=0)

          configs = rearrange(configs, 'f 1 -> f')

          return configs

        # only working with one graph object

        else:

          x = reduce(x, 'n f -> f', 'mean') # (n, output_dim) -> (output_dim, )

          x = repeat(x, 'f -> r f', r=len(data['config_feat'])) # (output_dim,) -> (c, output_dim)

          x = torch.cat((x, data['config_feat']), dim=1) # [(x, output_dim), (c,24)] -> (c, output_dim + 24)

          x = self.linear(x)

          x = rearrange(x, 'f 1 -> f')

        return x

In [None]:
from torch_geometric.data import Batch
class Layout_GNN(nn.Module):
    def __init__(self, len_opcode_embedd, hidden_dim, output_dim, num_layers):
        super(Tile_GNN, self).__init__()

        self.len_opcode = len_opcode_embedd
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.num_layers = num_layers

        # Embeddings
        self.embedding_layer = nn.Embedding(num_embeddings = 120, embedding_dim=len_opcode_embedd)

        #GNN
        input_dim = len_opcode_embedd + 140
        layers = nn.ModuleList()
        layers.append(GCNConv(input_dim, hidden_dim))
        for _ in range(num_layers - 2): # TODO list comp, ReLU
            layers.append(GCNConv(hidden_dim, hidden_dim))
        layers.append(GCNConv(hidden_dim, output_dim))

        self.conv = layers

        # Linear
        self.linear = nn.Sequential(
            nn.Linear(output_dim + 24, 48),
            nn.ReLU(),
            nn.Linear(48, 48),
            nn.ReLU(),
            nn.Linear(48, 1)
        )

    def forward(self, data):

        opcode_embedd = self.embedding_layer(data['node_opcode']) # (n,) -> (n,len_opcode_embedd)

        x = torch.cat((opcode_embedd, data['node_feat']), dim=1) # [(n, len_opcode_embedd), (n,140)] -> (n, len_opcode_embedd + 140)

        for layer in self.conv:
          x = layer(x, data['edge_index'])
          x = torch.relu(x)

        # differ two cases:
        # 1) batched data used for training
        # 2) single graph Data object used for inference

        if isinstance(data, Batch):

          '''
          The geometric data loader will take batch_size number of graphs. Then it will take all nodes in all these graphs and fuse them together into one graph.
          On this fuesed graph it will perform the convolution to calculate all the node embeddings at once.
          To apply the linear layer we have to seperate out all the graphs out of the batch again.
          '''

          # tensor used to store config predictions for each graph
          configs = torch.empty(0,1).to(device)

          # used to retrieve the config_feat tensors for each graph
          total = 0

          for graph_ind in range(data.num_graphs):

            # using a mask to gather all nodes that belong to the graph_ind-th graph
            node_indices = (data.batch == graph_ind).nonzero(as_tuple=True)[0]
            # first dimension is the number of nodes in the graph with index graph_ind, second dimensions is the feature dimension of the convolution (n, output_dim)
            graph_nodes = x[node_indices]

            # reduce node embeedings to get a graph embedding
            temp = reduce(graph_nodes, 'n f -> f', 'mean')

            # number of configurations for the graph_ind-th graph
            c = data.number_configs[graph_ind]

            # config_feat for the graph_ind-th graph
            graph_config_feat = data.config_feat[total:total+c]

            total += c

            # concatinating graph embedding with config_feat
            temp = repeat(temp, 'f -> r f', r=c)
            temp = torch.cat((temp, graph_config_feat), dim=1)


            # apply linear layer to tensor with shape (c, output_dim+24)
            temp = self.linear(temp)

            # add calculated runtimes to configs
            configs = torch.cat((configs, temp), dim=0)

          configs = rearrange(configs, 'f 1 -> f')

          return configs

        # only working with one graph object

        else:

          x = reduce(x, 'n f -> f', 'mean') # (n, output_dim) -> (output_dim, )

          x = repeat(x, 'f -> r f', r=len(data['config_feat'])) # (output_dim,) -> (c, output_dim)

          x = torch.cat((x, data['config_feat']), dim=1) # [(x, output_dim), (c,24)] -> (c, output_dim + 24)

          x = self.linear(x)

          x = rearrange(x, 'f 1 -> f')

        return x

## Train Loop

In [None]:
def MSE_training(model: Tile_GNN, dataloader: DataLoader, epochs: int, lr=0.01):
    '''
    Training model using MSE
    '''

    print(type(model))

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    #optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
    #optimizer = torch.optim.Adagrad(model.parameters(), lr=0.01)


    loss_fn = torch.nn.MSELoss()

    model.to(device)
    model.train()

    for epoch in tqdm(range(epochs)):
        for batch, data in enumerate(dataloader):
            data = data.to(device)
            optimizer.zero_grad()
            x_pred = model(data)
            #print(f"shapes: {x_pred.shape}, {data['y'].shape}")
            loss = loss_fn(x_pred, data['y'])/len(data['y'])
            loss.backward()
            optimizer.step()
            #if batch % 200 == 0:
            #    print('Epoch [{}/{}], Batch [{}/{}], Loss: {:.4f}'.format(epoch+1, epochs, batch+1, len(dataloader), loss.item()))

In [None]:
def no_batch_training(model, dataset, epochs, lr=0.01):
  '''
  Training model using MSE
  '''

  print(type(model))
  optimizer = torch.optim.Adam(model.parameters(), lr=lr)
  #optimizer = torch.optim.SGD(model.parameters(), lr=lr)
  #optimizer = torch.optim.Adagrad(model.parameters(), lr=lr)


  loss_fn = torch.nn.MSELoss()

  model.to(device)
  model.train()
  for epoch in tqdm(range(epochs)):
      for num, graph in enumerate(dataset):
          graph = graph.to(device)
          optimizer.zero_grad()
          x_pred = model(graph)
          #print(f"shapes: {x_pred.shape}, {data['y'].shape}")
          loss = loss_fn(x_pred, graph['y'])/len(graph['y'])
          loss.backward()
          optimizer.step()
          #if num % 500 == 0:
          #    print('Epoch [{}/{}], Batch [{}/{}], Loss: {:.4f}'.format(epoch+1, epochs, num+1, len(dataset), loss.item()))

## Validation

In [None]:
from torch_geometric.data import Data

def validate_model(model, dataset):


    # Make predictions
    predictions = []
    model.to(device)
    model.eval()

    for tile in tqdm(dataset):
        tile.to(device)
        out = model(tile)
        predictions.append(torch.sort(out).indices)

    # Calculate score
    score = 0.0

    for i, tile in tqdm(enumerate(dataset), total=len(dataset)):
        best_prediction = min([dataset[i]["y"][pred_ind] for pred_ind in predictions[i][:5]])
        best_total = min(dataset[i]["y"])
        score += 2.0 - best_prediction / best_total

    avg_score = score / len(dataset)
    print("Score:", avg_score)
    return avg_score

## Testing

### Tiles

In [None]:
df_train = load_data_to_df(xla_tile, "train")
df_valid = load_data_to_df(xla_tile, "valid")

 61%|██████▏   | 3497/5709 [00:15<00:10, 220.73it/s]


KeyboardInterrupt: ignored

In [None]:
torch.manual_seed(42)
train_dataset = RuntimeDataset(df_train, mode="tile")
valid_dataset = RuntimeDataset(df_valid, mode="tile")
data_loader = runtime_data_loader(train_dataset, batch_size=64)
len(data_loader)

NameError: ignored

In [None]:
torch.manual_seed(42)
model = Tile_GNN(len_opcode_embedd=12, hidden_dim=128, output_dim=64, num_layers=8)
MSE_training(model, data_loader, 20, lr=0.01)

<class '__main__.Tile_GNN'>


  5%|▌         | 1/20 [00:26<08:14, 26.03s/it]


KeyboardInterrupt: ignored

In [None]:
validate_model(model, train_dataset)

100%|██████████| 5709/5709 [00:42<00:00, 134.09it/s]
100%|██████████| 5709/5709 [00:59<00:00, 95.22it/s] 

Score: tensor(0.0900)





tensor(0.0900)

In [None]:
validate_model(model, valid_dataset)

100%|██████████| 676/676 [00:05<00:00, 130.62it/s]
100%|██████████| 676/676 [00:06<00:00, 106.80it/s]

Score: tensor(0.9728)





tensor(0.9728)

In [None]:
sum(p.numel() for p in model.parameters())

15089

### Layout

In [None]:
print("tile")
print("xla")
! cd data/npz_all/npz/tile/xla && du -sh test && du -sh train && du -sh valid
print("layout")
print("nlp")
print("default")
! cd data/npz_all/npz/layout/nlp/default && du -sh test && du -sh train && du -sh valid
print("random")
! cd data/npz_all/npz/layout/nlp/random && du -sh test && du -sh train && du -sh valid
print("xla")
print("default")
! cd data/npz_all/npz/layout/xla/default && du -sh test && du -sh train && du -sh valid
print("random")
! cd data/npz_all/npz/layout/xla/random && du -sh test && du -sh train && du -sh valid

tile
xla
17M	test
159M	train
17M	valid
layout
nlp
default
4.4M	test
2.2G	train
249M	valid
random
4.6M	test
2.3G	train
251M	valid
xla
default
9.7M	test
375M	train
45M	valid
random
11M	test
358M	train
44M	valid


In [9]:
df_xla_default_train = load_data_to_df(xla_default, "valid")

100%|██████████| 7/7 [00:09<00:00,  1.39s/it]


In [None]:
print(type(df_xla_default_train))
print(len(df_xla_default_train))

<class 'pandas.core.frame.DataFrame'>
61


In [None]:
print(df_xla_default_train.loc[9])

node_feat           [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0,...
node_opcode         [63, 63, 20, 20, 20, 63, 63, 20, 83, 63, 63, 2...
edge_index          [[2, 0], [2, 1], [3, 0], [3, 1], [4, 2], [4, 3...
node_config_feat    [[[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -...
node_config_ids     [2338, 2341, 2343, 2347, 2357, 2360, 2363, 236...
config_runtime      [36918487, 36927251, 36918845, 36923070, 36908...
node_splits                                              [[0, 22385]]
Name: 9, dtype: object


In [10]:
print(type(df_xla_default_train.head(1)))

<class 'pandas.core.frame.DataFrame'>


In [None]:
torch.manual_seed(42)
train_dataset = RuntimeDataset(df_xla_default_train.head(1), mode="layout")
data_loader = runtime_data_loader(train_dataset, batch_size=9) # work only for batch size=1, (node_config_feat)
len(data_loader)

In [41]:
test = next(iter(data_loader))
print(type(test))


<class 'torch_geometric.data.batch.DataBatch'>
