In [49]:
models = ['LiftMP', 'GIN', 'GAT', 'GCNN', 'GatedGCNN']
# note, no twitter.
dataset_names = ['RANDOM', 'ENZYMES', 'PROTEINS', 'IMDB-BINARY', 'MUTAG', 'COLLAB']
folder_path = Path('/home/penlu/code/bespoke-gnn4do')
training_path = folder_path / "training_runs"

train_df = pd.DataFrame(columns=dataset_names)
valid_df = pd.DataFrame(columns=dataset_names)

In [50]:
from torch_geometric.datasets import TUDataset

# Count edges in each dataset
for dataset in ['PROTEINS', 'ENZYMES', 'COLLAB', 'IMDB-BINARY', 'MUTAG']:
    print(dataset)
    loader = TUDataset(root=f'/tmp/{dataset}', name=dataset)
    edges = 0
    count = 0
    for example in loader:
        edges += example.edge_index.shape[1]
        count += 1

    # theoretical objective after getting all edges
    train_df.at['bound', dataset] = -edges / 2. / count

    # theoretical score after getting all edges
    valid_df.at['bound', dataset] = edges / count

PROTEINS
ENZYMES
COLLAB
IMDB-BINARY
MUTAG


In [58]:
# Collect training outputs
import os
import json
import pandas as pd
import numpy as np
import sys
from pathlib import Path

train_prefix = "230822_test3"
model_list = [training_path / x for x in os.listdir(training_path) if x.startswith(train_prefix)]

# load in params
for model_folder in model_list:
    try:
        if not os.path.isfile(os.path.join(model_folder, 'done.txt')):
            continue
        with open(os.path.join(model_folder, 'params.txt'), 'r') as f:
            model_args = json.load(f)
        train_losses = np.load(os.path.join(model_folder, 'train_losses.npy'))
        valid_scores = np.load(os.path.join(model_folder, 'valid_scores.npy'))
        if model_args['dataset'] == 'TU':
            train_df.at[model_args['model_type'], model_args['TUdataset_name']] = train_losses[-1]
            valid_df.at[model_args['model_type'], model_args['TUdataset_name']] = valid_scores[-1] + (valid_df.at['bound', dataset] - 2) / 2.
        else:
            train_df.at[model_args['model_type'], 'RANDOM'] = train_losses[-1]
            valid_df.at[model_args['model_type'], 'RANDOM'] = valid_scores[-1] + (valid_df.at['bound', dataset] - 2) / 2.
    except:
        print(f'something is wrong w/ {model_folder}')
        print(sys.exc_info())

In [62]:
# Collect SDP outputs
sdp_folder = "230821_TU"
runs_folder = folder_path / "baseline_runs" / sdp_folder
for dataset in ['PROTEINS', 'ENZYMES', 'COLLAB', 'IMDB-BINARY', 'MUTAG']:
    results = []
    total_sdp_score = 0.
    total_hyperplane_score = 0.
    total_gurobi_score = 0.
    with open(runs_folder / dataset / 'results.jsonl', 'r') as f:
        for line in f:
            res = json.loads(line)
            if res['method'] == 'sdp':
                total_sdp_score += res['score']
            elif res['method'] == 'sdp|random_hyperplane':
                total_hyperplane_score += res['score']
            elif res['method'] == 'gurobi':
                total_gurobi_score += res['score']
            results.append(res)
    length = len(results) / 4

    # convert SDP score back to obj and store
    # score == (2 - obj) / 2, so obj == 2 - score * 2
    avg_edges = valid_df.at['bound', dataset]
    train_df.at['SDP lift', dataset] = 2 - (total_sdp_score / length) * 2
    valid_df.at['SDP proj', dataset] = total_hyperplane_score / length + (valid_df.at['bound', dataset] - 2) / 2.
    print(f"{dataset} edges: {avg_edges}")

    # store gurobi score and equivalent objective
    train_df.at['gurobi', dataset] = 2 - (total_gurobi_score / length) * 2
    valid_df.at['gurobi', dataset] = total_gurobi_score / length + (valid_df.at['bound', dataset] - 2) / 2.

    print(f"{dataset} length: {length}")
    print(f"{dataset} SDP: {total_sdp_score / length}")
    print(f"{dataset} rounded: {total_hyperplane_score / length}")
    print(f"{dataset} gurobi: {total_gurobi_score / length}")

PROTEINS edges: 145.6316262353998
PROTEINS length: 1113.0
PROTEINS SDP: 24.638019062866526
PROTEINS rounded: 23.172506738544474
PROTEINS gurobi: 23.260557053009883
ENZYMES edges: 124.27333333333333
ENZYMES length: 600.0
ENZYMES SDP: 20.660313673814137
ENZYMES rounded: 19.56
ENZYMES gurobi: 19.573333333333334
COLLAB edges: 4914.4316
COLLAB length: 5000.0
COLLAB SDP: 70.9906649263382
COLLAB rounded: 67.7261
COLLAB gurobi: 68.0949
IMDB-BINARY edges: 193.062
IMDB-BINARY length: 1000.0
IMDB-BINARY SDP: 8.02489567041397
IMDB-BINARY rounded: 7.6635
IMDB-BINARY gurobi: 7.6645
MUTAG edges: 39.58510638297872
MUTAG length: 188.0
MUTAG SDP: 10.710323310912923
MUTAG rounded: 10.534574468085106
MUTAG gurobi: 10.534574468085106


In [56]:
train_df

Unnamed: 0,RANDOM,ENZYMES,PROTEINS,IMDB-BINARY,MUTAG,COLLAB
bound,,-62.136667,-72.815813,-96.531,-19.792553,-2457.2158
GAT,,-37.824329,-34.64743,30.37344,-19.32514,1055.83374
GIN,,-30.858206,-30.001453,55.632381,-17.12985,1217.959717
GatedGCNN,,-42.976559,-49.370506,60.165707,-19.480838,2899.2771
GCNN,,-26.581135,-17.708513,4.981516,-16.71162,1287.315918
SDP lift,,-39.320627,-47.276038,-14.049791,-19.420647,-139.98133
gurobi,,-37.146667,-44.521114,-13.329,-19.069149,-134.1898


In [63]:
valid_df

Unnamed: 0,RANDOM,ENZYMES,PROTEINS,IMDB-BINARY,MUTAG,COLLAB
bound,,124.273333,145.631626,193.062,39.585106,4914.4316
GAT,,36.007266,38.972903,0.308621,28.331111,-306.762025
GIN,,30.391764,33.958572,-8.229167,25.933693,-488.683728
GatedGCNN,,37.528737,43.425917,3.527243,28.568614,-1115.27167
GCNN,,32.363245,35.549974,0.913256,26.180151,-381.328141
SDP proj,,80.696667,94.98832,103.1945,29.327128,2523.9419
gurobi,,80.71,95.07637,103.1955,29.327128,2524.3107
