In [2]:
import os
import json
import pandas as pd
import numpy as np
import sys
from pathlib import Path

# Model names and dataset columns used by this notebook.
# NOTE(review): `models` is not referenced by the cells visible here.
models = ['LiftMP', 'GIN', 'GAT', 'GCNN', 'GatedGCNN']
# note, no twitter.
dataset_names = ['RANDOM', 'ENZYMES', 'PROTEINS', 'IMDB-BINARY', 'MUTAG', 'COLLAB']

# Repository root. Set the GNN4DO_ROOT environment variable to point at a
# different checkout; when it is unset the original location is used.
folder_path = Path(os.environ.get('GNN4DO_ROOT', '/home/penlu/code/bespoke-gnn4do'))
training_path = folder_path / "training_runs"

# Result tables: one column per dataset, rows are added later (one per method).
train_df = pd.DataFrame(columns=dataset_names)
valid_df = pd.DataFrame(columns=dataset_names)

In [3]:
from torch_geometric.datasets import TUDataset

# Edge statistics per TU dataset, written to the 'bound' row of both tables.
for dataset in ('PROTEINS', 'ENZYMES', 'COLLAB', 'IMDB-BINARY', 'MUTAG'):
    print(dataset)
    graphs = TUDataset(root=f'/tmp/{dataset}', name=dataset)

    # shape[1] of each graph's edge_index is taken as that graph's edge count
    per_graph_edges = [graph.edge_index.shape[1] for graph in graphs]
    total_edges = sum(per_graph_edges)
    num_graphs = len(per_graph_edges)

    # theoretical objective after getting all edges
    train_df.at['bound', dataset] = -total_edges / 2. / num_graphs

    # theoretical score after getting all edges
    valid_df.at['bound', dataset] = total_edges / num_graphs

PROTEINS


Downloading https://www.chrsmrrs.com/graphkerneldatasets/PROTEINS.zip
Extracting /tmp/PROTEINS/PROTEINS/PROTEINS.zip
Processing...
Done!
Downloading https://www.chrsmrrs.com/graphkerneldatasets/ENZYMES.zip


ENZYMES


Extracting /tmp/ENZYMES/ENZYMES/ENZYMES.zip
Processing...
Done!
Downloading https://www.chrsmrrs.com/graphkerneldatasets/COLLAB.zip


COLLAB


Extracting /tmp/COLLAB/COLLAB/COLLAB.zip
Processing...
Done!


IMDB-BINARY


Downloading https://www.chrsmrrs.com/graphkerneldatasets/IMDB-BINARY.zip
Extracting /tmp/IMDB-BINARY/IMDB-BINARY/IMDB-BINARY.zip
Processing...
Done!
Downloading https://www.chrsmrrs.com/graphkerneldatasets/MUTAG.zip


MUTAG


Extracting /tmp/MUTAG/MUTAG/MUTAG.zip
Processing...
Done!


In [4]:
# Collect training outputs: for every run folder whose name starts with
# `train_prefix`, store the last training loss and the last validation score
# in train_df / valid_df (row = model type, column = dataset).

train_prefix = "230823_test"
model_list = [training_path / x for x in os.listdir(training_path) if x.startswith(train_prefix)]

# load in params
for model_folder in model_list:
    try:
        with open(model_folder / 'params.txt', 'r') as f:
            model_args = json.load(f)
        train_losses = np.load(model_folder / 'train_losses.npy')
        valid_scores = np.load(model_folder / 'valid_scores.npy')
        dataset = model_args['TUdataset_name']
        # Runs on a TU dataset go in that dataset's column; everything else
        # is filed under 'RANDOM'.
        column = dataset if model_args['dataset'] == 'TU' else 'RANDOM'
        train_df.at[model_args['model_type'], column] = train_losses[-1]
        valid_df.at[model_args['model_type'], column] = valid_scores[-1]
        print(f"got {model_args['model_type']}, {dataset}")
    except Exception:
        # Best effort: report the broken run and keep going. `Exception`
        # (not a bare except) so that interrupting the kernel still works.
        print(f'something is wrong w/ {model_folder}')
        print(sys.exc_info())

got GatedGCNN, PROTEINS
got GAT, MUTAG
got GCNN, RANDOM
got GCNN, ENZYMES
got GCNN, IMDB-BINARY
got GatedGCNN, RANDOM
got GAT, IMDB-BINARY
got LiftMP, COLLAB
got GCNN, COLLAB
got GIN, IMDB-BINARY
got GIN, PROTEINS
got GatedGCNN, IMDB-BINARY
got GAT, PROTEINS
got LiftMP, RANDOM
got LiftMP, ENZYMES
got GatedGCNN, MUTAG
got GIN, COLLAB
got GAT, ENZYMES
got LiftMP, IMDB-BINARY
got GIN, MUTAG
got GCNN, PROTEINS
got GIN, ENZYMES
got LiftMP, MUTAG
got LiftMP, PROTEINS
got GAT, COLLAB
got GIN, RANDOM
got GatedGCNN, COLLAB
got GCNN, MUTAG
got GAT, RANDOM
got GatedGCNN, ENZYMES


In [10]:
# Collect SDP outputs: for every baseline folder containing "_sdp_", average
# the 'sdp' and 'sdp|random_hyperplane' scores from results.jsonl and store
# them as the 'SDP lift' row of train_df and the 'SDP proj' row of valid_df.
baselines_path = folder_path / "baseline_runs"
sdp_list = [baselines_path / x for x in os.listdir(baselines_path) if "_sdp_" in x]

# load in params
for sdp_folder in sdp_list:
    try:
        with open(sdp_folder / 'params.txt', 'r') as f:
            sdp_args = json.load(f)
        # Decide the column from this run's own params, not from whatever
        # `model_args` another cell happened to leave in the kernel.
        if sdp_args['dataset'] == 'TU':
            dataset = sdp_args['TUdataset_name']
        else:
            dataset = "RANDOM"

        total_sdp_score = 0.
        total_hyperplane_score = 0.
        # Count each method's lines separately instead of dividing the total
        # line count by a fixed number of methods per instance.
        sdp_count = 0
        hyperplane_count = 0
        with open(sdp_folder / 'results.jsonl', 'r') as f:
            for line in f:
                res = json.loads(line)
                if res['method'] == 'sdp':
                    total_sdp_score += res['score']
                    sdp_count += 1
                elif res['method'] == 'sdp|random_hyperplane':
                    total_hyperplane_score += res['score']
                    hyperplane_count += 1

        # convert SDP score back to obj and store
        # score == (E - obj) / 2, so obj == E - score * 2
        # NOTE: the 'bound' row is only filled for the TU datasets, so this is
        # NaN (and so is 'SDP lift') for RANDOM.
        avg_edges = valid_df.at['bound', dataset]
        train_df.at['SDP lift', dataset] = avg_edges - (total_sdp_score / sdp_count) * 2
        valid_df.at['SDP proj', dataset] = total_hyperplane_score / hyperplane_count
        print(f"{dataset} edges: {avg_edges}")

        print(f"{dataset} length: {sdp_count}")
        print(f"{dataset} SDP lift: {train_df.at['SDP lift', dataset]}")
        print(f"{dataset} SDP proj: {valid_df.at['SDP proj', dataset]}")
    except Exception:
        # Best effort: report the broken run and keep going. `Exception`
        # (not a bare except) so that interrupting the kernel still works.
        print(f'something is wrong w/ {sdp_folder}')
        print(sys.exc_info())

COLLAB edges: 4914.4316
COLLAB length: 5000.0
COLLAB SDP lift: -139.98133271789538
COLLAB SDP proj: 2523.9533
RANDOM edges: nan
RANDOM length: 1000.0
RANDOM SDP lift: nan
RANDOM SDP proj: 872.0625
ENZYMES edges: 124.27333333333333
ENZYMES length: 600.0
ENZYMES SDP lift: -39.32062726656598
ENZYMES SDP proj: 80.69666666666667
IMDB-BINARY edges: 193.062
IMDB-BINARY length: 1000.0
IMDB-BINARY SDP lift: -14.04979141998291
IMDB-BINARY SDP proj: 103.1935
PROTEINS edges: 145.6316262353998
PROTEINS length: 1113.0
PROTEINS SDP lift: -47.27603828382195
PROTEINS SDP proj: 94.99101527403414
MUTAG edges: 39.58510638297872
MUTAG length: 188.0
MUTAG SDP lift: -19.420646890680842
MUTAG SDP proj: 29.32712765957447


In [6]:
# Collect gurobi outputs
baselines_path = folder_path / "baseline_runs"
gurobi_1s_list = [baselines_path / x for x in os.listdir(baselines_path) if "gurobi_1s" in x]
gurobi_5s_list = [baselines_path / x for x in os.listdir(baselines_path) if "gurobi_5s" in x]


def _collect_gurobi_runs(run_folders, row_label):
    """Average the gurobi scores of each run folder into the result tables.

    For every folder, reads params.txt to pick the dataset column and
    results.jsonl for the per-instance scores, then writes the mean score to
    ``valid_df.at[row_label, dataset]`` and the equivalent objective
    (``bound - 2 * mean score``) to ``train_df.at[row_label, dataset]``.
    Folders that cannot be processed are reported and skipped.

    NOTE: if several folders map to the same dataset, the one processed last
    overwrites the earlier ones, and os.listdir returns entries in arbitrary
    order.
    """
    for gurobi_folder in run_folders:
        try:
            with open(gurobi_folder / 'params.txt', 'r') as f:
                gurobi_args = json.load(f)
            # Decide the column from this run's own params, not from whatever
            # `model_args` another cell happened to leave in the kernel.
            if gurobi_args['dataset'] == 'TU':
                dataset = gurobi_args['TUdataset_name']
            else:
                dataset = "RANDOM"

            with open(gurobi_folder / 'results.jsonl', 'r') as f:
                total_score = 0.
                total_count = 0
                for line in f:
                    res = json.loads(line)
                    assert res['method'] == 'gurobi'
                    total_score += res['score']
                    total_count += 1

            mean_score = total_score / total_count

            # store gurobi score and equivalent objective
            train_df.at[row_label, dataset] = valid_df.at['bound', dataset] - mean_score * 2
            valid_df.at[row_label, dataset] = mean_score

            print(f"{dataset} gurobi: {mean_score}")
        except Exception:
            # Best effort: report the broken run and keep going. `Exception`
            # (not a bare except) so that interrupting the kernel still works.
            print(f'something is wrong w/ {gurobi_folder}')
            print(sys.exc_info())


_collect_gurobi_runs(gurobi_1s_list, 'gurobi 1s')
_collect_gurobi_runs(gurobi_5s_list, 'gurobi 5s')

IMDB-BINARY gurobi: 103.1955
MUTAG gurobi: 29.32712765957447
ENZYMES gurobi: 80.71
PROTEINS gurobi: 95.0763701707098
COLLAB gurobi: 944.3480751604033
COLLAB gurobi: 2524.2921
RANDOM gurobi: 874.1265
PROTEINS gurobi: 95.0763701707098
RANDOM gurobi: 874.2815
MUTAG gurobi: 29.32712765957447
IMDB-BINARY gurobi: 103.1965
COLLAB gurobi: 2524.3399
ENZYMES gurobi: 80.71


In [11]:
# Show the training-loss / objective table: one row per method, one column per dataset.
train_df

Unnamed: 0,RANDOM,ENZYMES,PROTEINS,IMDB-BINARY,MUTAG,COLLAB
bound,,-62.136667,-72.815813,-96.531,-19.792553,-2457.2158
GatedGCNN,-284.610168,-34.637833,-32.155373,-6.494029,-18.802249,2779.852539
GAT,-290.322083,-30.625538,-25.318279,60.571194,-19.139918,1296.400635
GCNN,-179.6716,-23.810829,-29.719679,43.749893,-14.655859,432.388062
LiftMP,-304.903564,-35.0023,-32.377655,-14.07967,-18.827975,-113.910095
GIN,-241.642822,-22.141846,-16.147175,36.690887,-15.512301,411.069489
SDP lift,,-39.320627,-47.276038,-14.049791,-19.420647,-139.981333
gurobi 1s,,-37.146667,-44.521114,-13.329,-19.069149,-134.1526
gurobi 5s,,-37.146667,-44.521114,-13.331,-19.069149,-134.2482


In [12]:
# Show the validation-score table: one row per method, one column per dataset.
valid_df

Unnamed: 0,RANDOM,ENZYMES,PROTEINS,IMDB-BINARY,MUTAG,COLLAB
bound,,124.273333,145.631626,193.062,39.585106,4914.4316
GatedGCNN,867.6525,78.879167,92.90583,116.0275,28.855263,1950.527
GAT,870.2025,76.241667,93.887892,73.245,29.605263,2504.0655
GCNN,824.1825,45.641667,72.484305,63.69,20.0,2002.211
LiftMP,871.0525,79.179167,93.049327,116.3825,28.855263,2704.1985
GIN,800.5025,65.5,88.040359,68.495,21.368421,2240.236
SDP proj,872.0625,80.696667,94.991015,103.1935,29.327128,2523.9533
gurobi 1s,874.1265,80.71,95.07637,103.1955,29.327128,2524.2921
gurobi 5s,874.2815,80.71,95.07637,103.1965,29.327128,2524.3399
