In [1]:
%load_ext autoreload
%autoreload 2

import os
import json
import pandas as pd
import numpy as np
import sys
from pathlib import Path

# Repository root. It is placed at the front of sys.path so the project-local
# `utils` package imported right below can be resolved.
root_folder = Path('/home/bcjexu/maxcut-80/bespoke-gnn4do/')
# Remove any earlier occurrence before inserting, so re-running this cell keeps
# exactly one entry for the repo root (still in first position) instead of
# stacking a duplicate on every execution.
_root_entry = str(root_folder)
sys.path[:] = [entry for entry in sys.path if entry != _root_entry]
sys.path.insert(0, _root_entry)

from utils.tabulate import load_datasets, load_train_outputs, load_baseline_outputs

In [2]:
# Folder (relative to root_folder) whose sub-directories are scanned for
# baseline runs; each sub-directory is expected to contain a params.txt.
baseline_folder = 'baseline_runs/230927_snapshot'


In [3]:
# Load every dataset and record which examples fall into its held-out *test*
# split: 80% goes to train and the remainder is halved into validation and test.
# NOTE(review): presumably this mirrors the split used when the runs were
# produced (same seed, same sizes) -- confirm against the training code.
import torch

datasets = load_datasets()
indices = {}
for name, dataset in datasets.items():
    n_total = len(dataset)
    print(f"{name} dataset size: {n_total}")
    n_train = int(0.8 * n_total)
    n_val = (n_total - n_train) // 2
    n_test = n_total - n_train - n_val
    # Re-seed for every dataset so each split does not depend on how many
    # datasets were split before it.
    torch.manual_seed(0)
    train_part, val_part, test_part = torch.utils.data.random_split(
        dataset, [n_train, n_val, n_test]
    )
    indices[name] = test_part.indices

# Column order for the result tables: the loaded datasets plus 'ForcedRB'.
# ('RANDOM' and 'RANDOM 500' are currently disabled.)
dataset_names = [*datasets.keys(), 'ForcedRB']

loading PROTEINS
loading ENZYMES
loading COLLAB
loading IMDB-BINARY
loading MUTAG
PROTEINS dataset size: 1113
ENZYMES dataset size: 600
COLLAB dataset size: 5000
IMDB-BINARY dataset size: 1000
MUTAG dataset size: 188


In [4]:
# Load the max-cut results from disk: trained-model outputs first, then the
# SDP baseline read twice from the same run (once under the 'sdp' key and once
# under 'sdp|random_hyperplane'), both restricted via `indices`.
baseline_root = root_folder / 'baseline_runs'
sdp_run = '230824_sdp'

print("loading model losses")
maxcut_models = load_train_outputs(root_folder / 'training_runs', '230823_test')

print("loading sdp lift losses")
maxcut_sdp_lift = load_baseline_outputs(baseline_root, sdp_run, 'sdp', indices)

print("loading sdp proj losses")
maxcut_sdp_proj = load_baseline_outputs(baseline_root, sdp_run, 'sdp|random_hyperplane', indices)

loading model losses
loading sdp lift losses
load_baseline_outputs: IMDB-BINARY length: 100
load_baseline_outputs: IMDB-BINARY sdp: 97.83850940704346
load_baseline_outputs: RANDOM length: 1000
load_baseline_outputs: RANDOM sdp: 896.5637869873046
load_baseline_outputs: COLLAB length: 500
load_baseline_outputs: COLLAB sdp: 2627.7631793823243
load_baseline_outputs: PROTEINS length: 112
load_baseline_outputs: PROTEINS sdp: 103.8618523819106
load_baseline_outputs: MUTAG length: 19
load_baseline_outputs: MUTAG sdp: 28.101939753482217
load_baseline_outputs: ENZYMES length: 60
load_baseline_outputs: ENZYMES sdp: 82.51778809229533
loading sdp proj losses
load_baseline_outputs: IMDB-BINARY length: 100
load_baseline_outputs: IMDB-BINARY sdp|random_hyperplane: 97.495
load_baseline_outputs: RANDOM length: 1000
load_baseline_outputs: RANDOM sdp|random_hyperplane: 872.0625
load_baseline_outputs: COLLAB length: 500
load_baseline_outputs: COLLAB sdp|random_hyperplane: 2624.261
load_baseline_outputs: PR

In [5]:
# Build the score table from the baseline snapshot: one row per Gurobi timeout
# ("gurobi_<timeout>"), one column per dataset (suffixed with "@@<gen_n[0]>"
# when the run's gen_n is a list).

vc_scores = pd.DataFrame()
snapshot_dir = Path(os.path.join(root_folder, baseline_folder))

for model_folder in os.listdir(snapshot_dir):
    # Every run folder carries its arguments as JSON in params.txt.
    with open(snapshot_dir / model_folder / 'params.txt', 'r') as params_file:
        model_args = json.load(params_file)

    # Only max-cut runs belong in this table.
    if model_args['problem_type'] == 'max_cut':
        row = f'gurobi_{model_args["gurobi_timeout"]}'
        col = (
            f"{model_args['dataset']}@@{model_args['gen_n'][0]}"
            if isinstance(model_args['gen_n'], list)
            else f"{model_args['dataset']}"
        )
        print(row,col)

        # No `indices` argument here, unlike the SDP loads.
        gurobi_outputs = load_baseline_outputs(snapshot_dir, model_folder, 'gurobi')

        # The first value of the returned mapping is the score stored for this run.
        run_scores = list(gurobi_outputs.values())
        vc_scores.at[row, col] = run_scores[0]

gurobi_8.0 ENZYMES
load_baseline_outputs: ENZYMES length: 60
load_baseline_outputs: ENZYMES gurobi: 81.45
gurobi_4.0 ErdosRenyi@@50
load_baseline_outputs: ErdosRenyi length: 1000
load_baseline_outputs: ErdosRenyi gurobi: 530.173
gurobi_2.0 WattsStrogatz@@100
load_baseline_outputs: WattsStrogatz length: 1000
load_baseline_outputs: WattsStrogatz gurobi: 392.068
gurobi_8.0 COLLAB
load_baseline_outputs: COLLAB length: 500
load_baseline_outputs: COLLAB gurobi: 2624.599
gurobi_4.0 BarabasiAlbert@@100
load_baseline_outputs: BarabasiAlbert length: 1000
load_baseline_outputs: BarabasiAlbert gurobi: 719.995
gurobi_4.0 COLLAB
load_baseline_outputs: COLLAB length: 500
load_baseline_outputs: COLLAB gurobi: 2624.585
gurobi_8.0 WattsStrogatz@@50
load_baseline_outputs: WattsStrogatz length: 1000
load_baseline_outputs: WattsStrogatz gurobi: 198.745
gurobi_8.0 WattsStrogatz@@100
load_baseline_outputs: WattsStrogatz length: 1000
load_baseline_outputs: WattsStrogatz gurobi: 392.068
gurobi_8.0 PROTEINS
loa

In [6]:
# Add the remaining rows to the score table: trained models, the two SDP
# baselines, and an average edge statistic per dataset.
maxcut_scores = pd.DataFrame(columns=dataset_names)

# Best validation score of every trained (model, dataset) pair.
for run_key, run_outputs in maxcut_models.items():
    model_name, dataset_name = run_key
    train_losses, valid_scores = run_outputs
    vc_scores.at[model_name, dataset_name] = np.max(valid_scores)

# Report the final training loss of each run.
for run_key, run_outputs in maxcut_models.items():
    model_name, dataset_name = run_key
    train_losses, valid_scores = run_outputs
    print(f"{model_name} {dataset_name} loss: {train_losses[-1]}")

# One row per SDP variant, filled dataset by dataset.
sdp_rows = (('SDP lift', maxcut_sdp_lift), ('SDP proj', maxcut_sdp_proj))
for row_label, sdp_scores in sdp_rows:
    for dataset_name, score in sdp_scores.items():
        vc_scores.at[row_label, dataset_name] = score

# Mean number of edge_index columns per example, for the datasets that were loaded.
# NOTE(review): if edge_index stores both directions of an undirected edge,
# this is twice the undirected edge count -- confirm.
for dataset_name in dataset_names:
    if dataset_name in datasets:
        edge_counts = [example.edge_index.shape[1] for example in datasets[dataset_name]]
        vc_scores.at['edge count', dataset_name] = float(sum(edge_counts)) / len(edge_counts)

#maxcut_scores.style.apply(lambda col: ['font-weight:bold' if x==col.max() else '' for x in col])

In [7]:
# Show the assembled table: gurobi_<timeout> rows, trained-model rows,
# 'SDP lift' / 'SDP proj' rows and the 'edge count' row.
vc_scores

Unnamed: 0,ENZYMES,ErdosRenyi@@50,WattsStrogatz@@100,COLLAB,BarabasiAlbert@@100,WattsStrogatz@@50,PROTEINS,ErdosRenyi@@100,BarabasiAlbert@@400,WattsStrogatz@@400,...,PowerlawCluster@@50,MUTAG,REDDIT-BINARY,IMDB-BINARY,REDDIT-MULTI-12K,ErdosRenyi@@400,PowerlawCluster@@400,REDDIT-MULTI-5K,BarabasiAlbert@@50,RANDOM
gurobi_8.0,81.45,530.193,392.068,2624.599,720.218,198.745,102.361607,2002.9755,2213.085,1179.808,...,346.424,27.947368,633.7875,97.495,649.019279,16495.495,2173.946,847.778,352.121,
gurobi_4.0,81.45,530.173,392.068,2624.585,719.995,198.745,102.361607,2002.9175,2213.004,1178.464,...,346.424,27.947368,633.7875,97.495,649.005868,16495.244,2173.816,847.724,352.121,
gurobi_2.0,81.45,530.124,392.068,2624.577,719.93,198.745,102.361607,2002.8885,2208.788,1178.008,...,346.424,27.947368,633.7875,97.495,648.998324,16493.979,2169.781,847.564,352.121,
SDP lift,82.517788,,,2627.763179,,,103.861852,,,,...,,28.10194,,97.838509,,,,,,896.563787
SDP proj,81.45,,,2624.261,,,102.299107,,,,...,,27.947368,,97.495,,,,,,872.0625
edge count,124.273333,,,4914.4316,,,145.631626,,,,...,,39.585106,,193.062,,,,,,


In [8]:
# Write the score table to the repo's analysis_ipynb folder, then display it.
# The destination is derived from root_folder rather than repeating the
# absolute path, so it cannot drift if the repository location changes; with
# the current root_folder it resolves to the same file as before.
vc_scores.to_csv(root_folder / 'analysis_ipynb' / 'mc_baseline_scores.csv')
vc_scores

Unnamed: 0,ENZYMES,ErdosRenyi@@50,WattsStrogatz@@100,COLLAB,BarabasiAlbert@@100,WattsStrogatz@@50,PROTEINS,ErdosRenyi@@100,BarabasiAlbert@@400,WattsStrogatz@@400,...,PowerlawCluster@@50,MUTAG,REDDIT-BINARY,IMDB-BINARY,REDDIT-MULTI-12K,ErdosRenyi@@400,PowerlawCluster@@400,REDDIT-MULTI-5K,BarabasiAlbert@@50,RANDOM
gurobi_8.0,81.45,530.193,392.068,2624.599,720.218,198.745,102.361607,2002.9755,2213.085,1179.808,...,346.424,27.947368,633.7875,97.495,649.019279,16495.495,2173.946,847.778,352.121,
gurobi_4.0,81.45,530.173,392.068,2624.585,719.995,198.745,102.361607,2002.9175,2213.004,1178.464,...,346.424,27.947368,633.7875,97.495,649.005868,16495.244,2173.816,847.724,352.121,
gurobi_2.0,81.45,530.124,392.068,2624.577,719.93,198.745,102.361607,2002.8885,2208.788,1178.008,...,346.424,27.947368,633.7875,97.495,648.998324,16493.979,2169.781,847.564,352.121,
SDP lift,82.517788,,,2627.763179,,,103.861852,,,,...,,28.10194,,97.838509,,,,,,896.563787
SDP proj,81.45,,,2624.261,,,102.299107,,,,...,,27.947368,,97.495,,,,,,872.0625
edge count,124.273333,,,4914.4316,,,145.631626,,,,...,,39.585106,,193.062,,,,,,
