In [2]:
import os
import json
import pandas as pd
import numpy as np
import sys
from pathlib import Path

# Model architectures of interest.
models = ['LiftMP', 'GIN', 'GAT', 'GCNN', 'GatedGCNN']

# Columns of the result tables.  Note: no twitter dataset here.
dataset_names = ['RANDOM', 'ENZYMES', 'PROTEINS', 'IMDB-BINARY', 'MUTAG', 'COLLAB']

# Repository root and the sub-folder that holds one directory per training run.
folder_path = Path('/home/penlu/code/bespoke-gnn4do')
training_path = folder_path.joinpath("training_runs")

# Two independent, initially empty tables with one column per dataset;
# rows are added later, one per method.
train_df, valid_df = (pd.DataFrame(columns=dataset_names) for _ in range(2))

In [3]:
from torch_geometric.datasets import TUDataset

# For each TU dataset, total up the edge_index columns over all graphs and
# store per-graph averages as the 'bound' row of both result tables.
for dataset in ['PROTEINS', 'ENZYMES', 'COLLAB', 'IMDB-BINARY', 'MUTAG']:
    print(dataset)
    graphs = TUDataset(root=f'/tmp/{dataset}', name=dataset)

    # One entry per graph: the number of columns in its edge_index.
    edge_counts = [graph.edge_index.shape[1] for graph in graphs]
    edges = sum(edge_counts)
    count = len(edge_counts)

    # theoretical objective after getting all edges
    avg_objective = -edges / 2. / count
    # theoretical score after getting all edges
    avg_score = edges / count

    train_df.at['bound', dataset] = avg_objective
    valid_df.at['bound', dataset] = avg_score

PROTEINS
ENZYMES
COLLAB
IMDB-BINARY
MUTAG


In [4]:
# Collect training outputs
#
# Every folder under training_path whose name starts with train_prefix is
# expected to contain:
#   params.txt        - JSON with at least 'dataset' and 'model_type'
#                       (plus 'TUdataset_name' for TU runs)
#   train_losses.npy  - array of training losses; the last entry is recorded
#   valid_scores.npy  - array of validation scores; the last entry is recorded

#train_prefix = "230823_test" # for max cut
train_prefix = "230901_VC" # for vertex cover
model_list = [training_path / x for x in os.listdir(training_path) if x.startswith(train_prefix)]

# load in params
for model_folder in model_list:
    try:
        with open(model_folder / 'params.txt', 'r') as f:
            model_args = json.load(f)
        train_losses = np.load(model_folder / 'train_losses.npy')
        valid_scores = np.load(model_folder / 'valid_scores.npy')

        # Only TU runs need 'TUdataset_name'; reading it unconditionally would
        # drop any non-TU run whose params lack that key with a KeyError.
        if model_args['dataset'] == 'TU':
            dataset = model_args['TUdataset_name']
        else:
            dataset = 'RANDOM'

        train_df.at[model_args['model_type'], dataset] = train_losses[-1]
        valid_df.at[model_args['model_type'], dataset] = valid_scores[-1]
        print(f"got {model_args['model_type']}, {dataset}")
    except Exception:
        # Best effort: report the broken run and keep going.  Catching
        # Exception (not a bare except) lets Ctrl-C / kernel interrupt through.
        print(f'something is wrong w/ {model_folder}')
        print(sys.exc_info())

got GAT, PROTEINS
got GCNN, PROTEINS
got GCNN, RANDOM
got NegationGAT, COLLAB
got GIN, COLLAB
got GatedGCNN, PROTEINS
got GatedGCNN, COLLAB
got GCNN, COLLAB
got GIN, RANDOM
got GCNN, MUTAG
got GIN, PROTEINS
got NegationGAT, MUTAG
got NegationGAT, RANDOM
got GatedGCNN, RANDOM
got LiftMP, ENZYMES
got GIN, MUTAG
got GAT, RANDOM
got GIN, ENZYMES
got NegationGAT, PROTEINS
got GatedGCNN, MUTAG
got LiftMP, MUTAG
got NegationGAT, IMDB-BINARY
got GCNN, ENZYMES
got LiftMP, PROTEINS
got GAT, IMDB-BINARY
got LiftMP, RANDOM
got GIN, IMDB-BINARY
got GAT, COLLAB
got GAT, MUTAG
got GAT, ENZYMES
got GatedGCNN, IMDB-BINARY
got NegationGAT, ENZYMES
got LiftMP, IMDB-BINARY
got GatedGCNN, ENZYMES
got GCNN, IMDB-BINARY
got LiftMP, COLLAB


In [5]:
# Collect SDP outputs
baselines_path = folder_path / "baseline_runs"
#sdp_list = [baselines_path / x for x in os.listdir(baselines_path) if "_sdp_" in x]
sdp_list = [baselines_path / x for x in os.listdir(baselines_path) if "_sdp_" in x and "VC" in x]

# For every SDP baseline run, average the per-graph scores in results.jsonl
# and store them in the 'SDP lift' (train_df) and 'SDP proj' (valid_df) rows.
for sdp_folder in sdp_list:
    try:
        with open(sdp_folder / 'params.txt', 'r') as f:
            sdp_args = json.load(f)
        # Pick the column from this run's own params, not from `model_args`,
        # which is just whatever the training-run cell loaded last.
        # NOTE(review): assumes baseline params.txt carries a 'dataset' key
        # like the training params do - confirm.
        if sdp_args['dataset'] == 'TU':
            dataset = sdp_args['TUdataset_name']
        else:
            dataset = "RANDOM"

        total_sdp_score = 0.
        total_hyperplane_score = 0.
        sdp_count = 0
        hyperplane_count = 0
        with open(sdp_folder / 'results.jsonl', 'r') as f:
            for line in f:
                res = json.loads(line)
                # Count each method's lines separately instead of assuming
                # that every graph contributes exactly three lines.
                if res['method'] == 'sdp':
                    total_sdp_score += res['score']
                    sdp_count += 1
                elif res['method'] == 'sdp|random_hyperplane':
                    total_hyperplane_score += res['score']
                    hyperplane_count += 1

        # convert SDP score back to obj and store
        # score == (E - obj) / 2, so obj == E - score * 2
        # (an empty results file raises ZeroDivisionError, reported below)
        avg_edges = valid_df.at['bound', dataset]
        train_df.at['SDP lift', dataset] = avg_edges - (total_sdp_score / sdp_count) * 2
        valid_df.at['SDP proj', dataset] = total_hyperplane_score / hyperplane_count
        print(f"{dataset} edges: {avg_edges}")

        print(f"{dataset} length: {sdp_count}")
        print(f"{dataset} SDP lift: {train_df.at['SDP lift', dataset]}")
        print(f"{dataset} SDP proj: {valid_df.at['SDP proj', dataset]}")
    except Exception:
        # Best effort: report the broken run and continue, but do not swallow
        # KeyboardInterrupt / SystemExit the way a bare except would.
        print(f'something is wrong w/ {sdp_folder}')
        print(sys.exc_info())

MUTAG edges: 39.58510638297872
MUTAG length: 188.0
MUTAG SDP lift: 55.88366250788911
MUTAG SDP proj: -8.170212765957446
COLLAB edges: 4914.4316
COLLAB length: 1047.0
COLLAB SDP lift: 5023.174151874636
COLLAB SDP proj: -54.54536771728749
ENZYMES edges: 124.27333333333333
ENZYMES length: 600.0
ENZYMES SDP lift: 163.93550774772962
ENZYMES SDP proj: -19.863333333333333
PROTEINS edges: 145.6316262353998
PROTEINS length: 1113.0
PROTEINS SDP lift: 192.82226147313216
PROTEINS SDP proj: -23.690925426774484
RANDOM edges: nan
RANDOM length: 1000.0
RANDOM SDP lift: nan
RANDOM SDP proj: -81.3
IMDB-BINARY edges: 193.062
IMDB-BINARY length: 1000.0
IMDB-BINARY SDP lift: 226.28193710041046
IMDB-BINARY SDP proj: -16.63


In [7]:
# Collect gurobi outputs
baselines_path = folder_path / "baseline_runs"
#gurobi_1s_list = [baselines_path / x for x in os.listdir(baselines_path) if "gurobi_1s" in x]
#gurobi_5s_list = [baselines_path / x for x in os.listdir(baselines_path) if "gurobi_5s" in x]
gurobi_1s_list = [baselines_path / x for x in os.listdir(baselines_path) if "gurobi_1s" in x and "230902" in x]
gurobi_5s_list = [baselines_path / x for x in os.listdir(baselines_path) if "gurobi_5s" in x and "230902" in x]


def _collect_gurobi(gurobi_folders, row_label):
    """Average the Gurobi scores of each run and store them under `row_label`.

    Args:
        gurobi_folders: iterable of run directories (Path objects), each
            holding a JSON `params.txt` and a `results.jsonl` with one JSON
            record per line.
        row_label: index label of the row to fill in `train_df` (objective
            derived from the score) and `valid_df` (mean score).

    A run that cannot be processed is reported on stdout and skipped.
    """
    for gurobi_folder in gurobi_folders:
        try:
            with open(gurobi_folder / 'params.txt', 'r') as f:
                gurobi_args = json.load(f)
            # Use this run's own params, not `model_args`, which is just
            # whatever the training-run cell loaded last.
            # NOTE(review): assumes baseline params.txt carries a 'dataset'
            # key like the training params do - confirm.
            if gurobi_args['dataset'] == 'TU':
                dataset = gurobi_args['TUdataset_name']
            else:
                dataset = "RANDOM"

            with open(gurobi_folder / 'results.jsonl', 'r') as f:
                total_score = 0.
                total_count = 0
                for line in f:
                    res = json.loads(line)
                    assert res['method'] == 'gurobi'
                    total_score += res['score']
                    total_count += 1

            # store gurobi score and equivalent objective
            # (an empty results file raises ZeroDivisionError, reported below)
            mean_score = total_score / total_count
            train_df.at[row_label, dataset] = valid_df.at['bound', dataset] - mean_score * 2
            valid_df.at[row_label, dataset] = mean_score

            print(f"{dataset} gurobi: {mean_score}")
        except Exception:
            # Best effort: report and continue, without swallowing
            # KeyboardInterrupt / SystemExit like a bare except would.
            print(f'something is wrong w/ {gurobi_folder}')
            print(sys.exc_info())


# Same procedure for both time limits; only the destination row differs.
_collect_gurobi(gurobi_1s_list, 'gurobi 1s')
_collect_gurobi(gurobi_5s_list, 'gurobi 5s')

COLLAB gurobi: -65.3958
PROTEINS gurobi: -23.628032345013477
RANDOM gurobi: -75.926
MUTAG gurobi: -8.170212765957446
IMDB-BINARY gurobi: -16.613
COLLAB gurobi: -65.3958
RANDOM gurobi: -75.926
PROTEINS gurobi: -23.628032345013477
MUTAG gurobi: -8.170212765957446
IMDB-BINARY gurobi: -16.613


In [8]:
# Display the objective table: one row per method (plus 'bound'), one column per dataset.
train_df

Unnamed: 0,RANDOM,ENZYMES,PROTEINS,IMDB-BINARY,MUTAG,COLLAB
bound,,-62.136667,-72.815813,-96.531,-19.792553,-2457.2158
GAT,83.522652,22.087946,26.388264,19.498142,9.645013,78.114738
GCNN,83.861099,21.317291,26.471281,20.126495,8.208278,64.68264
NegationGAT,99.791466,27.849762,31.206074,20.128101,15.991971,82.459038
GIN,80.715721,21.085382,17.319815,19.051352,8.954197,68.322067
GatedGCNN,95.674881,20.188658,23.927334,17.872929,7.071137,7684.489258
LiftMP,99.767433,18.466349,21.852633,17.211369,7.014311,73.875618
SDP lift,,163.935508,192.822261,226.281937,55.883663,5023.174152
gurobi 1s,,,192.887691,226.288,55.925532,5045.2232
gurobi 5s,,,192.887691,226.288,55.925532,5045.2232


In [10]:
# Display the score table without the 'bound' row; drop() returns a new
# frame here, so valid_df itself is left unchanged.
valid_df.drop("bound")

Unnamed: 0,RANDOM,ENZYMES,PROTEINS,IMDB-BINARY,MUTAG,COLLAB
GAT,-81.305,-22.683333,-30.587444,-18.28,-8.710526,-69.898
GCNN,-94.525,-32.8,-52.950673,-18.625,-17.184211,-72.944
NegationGAT,-97.6,-28.241667,-34.64574,-19.055,-13.921053,-73.959
GIN,-151.84,-37.941667,-60.941704,-18.4,-10.131579,-72.371
GatedGCNN,-85.985,-20.291667,-26.336323,-18.48,-8.236842,-74.237
LiftMP,-95.225,-19.458333,-25.690583,-18.455,-8.236842,-69.621
SDP proj,-81.3,-19.863333,-23.690925,-16.63,-8.170213,-54.545368
gurobi 1s,-75.926,,-23.628032,-16.613,-8.170213,-65.3958
gurobi 5s,-75.926,,-23.628032,-16.613,-8.170213,-65.3958
