# Model Exporting Notebook

This notebook takes a model and exports:

1. Test predictions
2. MC-dropout test predictions (if applicable)
3. Train/validation predictions (for downstream stacking)

In [1]:
# Identifier of the training run to export. It is used further down to build
# the W&B run path and to locate the checkpoint directory `models/<RUN_ID>`.
RUN_ID = "6aeax7xs"

import os
# Switch the working directory to the repository checkout.
# NOTE(review): presumably needed so the project-local `ml` package imports
# below and the relative `data/...` / `models/...` paths resolve — keep this
# call ahead of the `ml` imports.
os.chdir('/root/kaggle-fast-or-slow')

from ml.layout_v1.model import GraphMLP
from ml.layout_v1.dataset import LayoutDataset
from ml.layout_v1.job.spec import PreprocessorSpec, PostprocessorSpec, JobSpec
from ml.layout_v1.job.builder import build_processors, fit_node_processor
from ml.layout_v1.job.constants import GLOBAL_POOLINGS
from ml.layout_v1.preprocessors import GlobalFeatureGenerator
import torch_geometric
import torch
from copy import deepcopy

import wandb
# Select PyTorch's 'high' float32 matmul precision setting (per the PyTorch
# docs this permits faster, reduced-precision internal matmul paths).
torch.set_float32_matmul_precision('high')

In [2]:
# Source directories: the train split is what the node processors are fitted
# on; the test split lists the matching test directories.
TRAIN_DATA_DIRS = [
    "data/layout/nlp/default/train",
    "data/layout/nlp/random/train",
]
TEST_DATA_DIRS = [
    "data/layout/nlp/default/test",
    "data/layout/nlp/random/test",
]

# Path of the run inside the W&B project.
WANDB_RUN_ID = f"kaggle-fast-or-slow/{RUN_ID}"

# Fetch the run through the W&B public API and keep its stored config.
api = wandb.Api()
run = api.run(WANDB_RUN_ID)
config = run.config

In [3]:
# Rebuild the job spec and the pre/post-processor specs from the W&B config.
job_spec = JobSpec(**config)

preprocessor_spec = PreprocessorSpec(**config["preprocessors"])
postprocessor_spec = PostprocessorSpec(**config["postprocessors"])

preprocessors = build_processors(preprocessor_spec)
postprocessors = build_processors(postprocessor_spec)

# A node transform that exposes a `fit` method is fitted on the training
# directories before use; transforms without `fit` are left untouched.
if preprocessors.node_transform and hasattr(preprocessors.node_transform, "fit"):
    preprocessors.node_transform = fit_node_processor(
        TRAIN_DATA_DIRS, preprocessors.node_transform
    )

if postprocessors.node_transform and hasattr(postprocessors.node_transform, "fit"):
    postprocessors.node_transform = fit_node_processor(
        TRAIN_DATA_DIRS, postprocessors.node_transform
    )

# The global feature generators are wired in by hand (one per variant) rather
# than being built from the spec.
global_random_preprocessor = GlobalFeatureGenerator("nlp", "random", True)
global_default_preprocessor = GlobalFeatureGenerator("nlp", "default", True)

# Each variant gets its own independent copy of the fitted preprocessors,
# differing only in the global transform attached to it.
random_preprocessors = deepcopy(preprocessors)
random_preprocessors.global_transform = global_random_preprocessor

default_preprocessors = deepcopy(preprocessors)
default_preprocessors.global_transform = global_default_preprocessor

Fitting node processor


In [7]:
# Test data: one dataset for the default test directory, one for the random one.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Keyword arguments that are identical for both test datasets.
# NOTE(review): both datasets point at the same `processed_dir` with
# `force_reload=False` — confirm LayoutDataset keeps their processed files apart.
_shared_test_kwargs = dict(
    processed_dir="data/processed_test",
    posttransforms=postprocessors,
    multiprocess=False,
    force_reload=False,
)

default_dataset = LayoutDataset(
    directories=["data/layout/nlp/default/test"],
    pretransforms=default_preprocessors,
    **_shared_test_kwargs,
)

random_dataset = LayoutDataset(
    directories=["data/layout/nlp/random/test"],
    pretransforms=random_preprocessors,
    **_shared_test_kwargs,
)


  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

In [8]:
# Read the input widths off the first processed sample (fetched once and
# reused for both the node-feature and global-feature dimensions).
first_sample = default_dataset.get(0)
num_features = first_sample.x.shape[1]
num_global_features = first_sample.global_features.shape[1]

# Resolve the pooling function named in the job spec.
pooling = GLOBAL_POOLINGS[job_spec.pooling]


# Recreate the architecture with the hyperparameters stored in the job spec so
# the saved checkpoint can be loaded into it.
model = GraphMLP(
    graph_input_dim=num_features,
    global_features_dim=num_global_features,
    graph_channels=job_spec.graph_channels,
    graph_layers=job_spec.graph_layers,
    linear_channels=job_spec.linear_channels,
    linear_layers=job_spec.linear_layers,
    dropout=job_spec.dropout,
    pooling_fn=pooling,
    pooling_feature_multiplier=job_spec.pooling_feature_multiplier,
    graph_conv=job_spec.graph_convolution_type,
    graph_conv_kwargs=job_spec.graph_convolution_kwargs,
    graph_norm=job_spec.graph_norm,
    linear_norm=job_spec.linear_norm,
    use_multi_edge=job_spec.use_multi_edge,
    main_block=job_spec.main_block,
    alt_block=job_spec.alt_block,
)

# Use the `device` selected earlier (CUDA when available, otherwise CPU)
# instead of a hard-coded "cuda", so the CPU fallback is actually honoured.
model = model.to(device)
model = torch_geometric.compile(model)

In [9]:
# Load the most recent checkpoint saved for this run.
import os
from pathlib import Path

# `sorted` is ascending, so entries are ordered oldest -> newest by
# modification time and the most recent checkpoint is the LAST element.
paths = sorted(Path(f"models/{RUN_ID}").iterdir(), key=os.path.getmtime)
if not paths:
    raise FileNotFoundError(f"No checkpoints found in models/{RUN_ID}")
most_recent = str(paths[-1].absolute())

# `map_location` remaps the stored tensors onto the device selected earlier, so
# loading also works when the checkpoint was written on a different device.
# NOTE: torch.load unpickles the file — only load checkpoints you trust.
state_dict = torch.load(most_recent, map_location=device)
model.load_state_dict(state_dict["model_state_dict"])
model = model.to(device)

In [10]:
def make_id_from_file(filepath: str):
    """Turn a data file path into a colon-separated identifier.

    The leading ``data/`` prefix and trailing ``.npz`` suffix are removed, every
    directory component named exactly ``test`` is dropped, and the remaining
    components are joined with ``:``.

    Only whole directory components are dropped, so a file whose own name
    starts with ``test`` (e.g. ``test_model.npz``) keeps its full name.

    Args:
        filepath: Path such as ``"data/layout/nlp/default/test/abc.npz"``.

    Returns:
        The identifier string, e.g. ``"layout:nlp:default:abc"``.
    """
    file_id = filepath.removeprefix("data/").removesuffix(".npz")
    # Separate the directory components from the final (file name) component so
    # that the "test" filter can never touch the file name itself.
    *directories, file_name = file_id.split("/")
    kept_directories = [part for part in directories if part != "test"]

    return ":".join([*kept_directories, file_name])

In [None]:
from collections import defaultdict
from tqdm.auto import tqdm
from torch_geometric.data import Batch

# results[file_id][config_idx] -> model output (a Python float) for that
# (file, config) pair.
results = defaultdict(dict)

BATCH_SIZE = 8
# NOTE(review): not referenced in this cell — presumably consumed by a separate
# MC-dropout pass; confirm.
MC_DROPOUT_ITERS = 20

# Accumulates (data, file_id, config_idx) tuples until a full batch is ready.
next_batch = []

DATASETS = [random_dataset, default_dataset]

# Put the model in evaluation mode for prediction.
model.eval()
for dataset in DATASETS:
    last_index = len(dataset) - 1
    for i in tqdm(range(len(dataset))):
        file_path, config_idx = dataset.idx_to_source_file_and_config[i]
        file_id = make_id_from_file(file_path)
        data = dataset.get(i)
        next_batch.append((data, file_id, config_idx))

        # Flush when the batch is full, or on the dataset's final sample so a
        # partial batch is never carried over into the next dataset.
        if len(next_batch) == BATCH_SIZE or i == last_index:
            batch_data = [d[0] for d in next_batch]
            file_ids = [d[1] for d in next_batch]
            config_ids = [d[2] for d in next_batch]

            with torch.no_grad():
                batch = Batch.from_data_list(batch_data)
                # Use the device selected earlier rather than a hard-coded
                # 'cuda', keeping the batch on the same device choice as the
                # rest of the notebook.
                batch = batch.to(device)
                output = model(batch).flatten()

            # One output per graph in the batch, in the order they were added.
            for o, f, c in zip(output.tolist(), file_ids, config_ids):
                results[f][c] = o

            next_batch = []


  0%|          | 0/17000 [00:00<?, ?it/s]

  torch.has_cuda,
  torch.has_cudnn,
  torch.has_mps,
  torch.has_mkldnn,
