# Library

In [None]:
# My library
from molgraph.dataset import *
from molgraph.graphmodel import *
from molgraph.hyperparameter import *
from molgraph.testing import *
from molgraph.visualize import *
from molgraph.experiment import *
# General library
import os
import argparse
import numpy as np
# pytorch
import torch
import pytorch_lightning as pl
# optuna
import optuna
from optuna.trial import TrialState
from optuna.visualization import plot_param_importances

# Ensure that all operations are deterministic on GPU (if used) for reproducibility.
# The flag is spelled `deterministic`; assigning to a misspelled name only creates
# an unused attribute and leaves the real setting at its default.
torch.backends.cudnn.deterministic = True
# Keep cuDNN benchmarking off so algorithm selection does not vary between runs.
torch.backends.cudnn.benchmark = False

# Argument

In [None]:
# Options for this run, written the way they would appear on a command line.
# Only the whitespace-separated tokens matter; the indentation is cosmetic.
cli_options = '''--file bbbp
                             --model GIN
                             --schema AR_0
                             --reduced functional
                             --mol_embedding 256
                             --fold 5
                             --seed 42'''

# Build the project's parser and hand it the option tokens.
parser = ArgumentParser()
args = parser.getArgument(cli_options.split())

# Bare expression: in a notebook this displays the parsed arguments.
args

# Dataset

In [None]:
# Dataset-related settings taken from the parsed arguments.
file, smiles, task, splitting = args.file, args.smiles, args.task, args.splitting
splitting_fold, splitting_seed = args.fold, args.splitting_seed

# Load the validated dataset.
datasets = getDataset(file, smiles, task, splitting)

# Classification only: compute the positive weight and keep it on `args`.
if args.graphtask == 'classification':
    args.pos_weight = getPosWeight(datasets)
    print('pos_weight:', args.pos_weight)

# Generate the dataset splitting, then the graph dataset.
datasets_splitted = generateDatasetSplitting(file, splitting, splitting_fold, splitting_seed)
datasets_graph = generateGraphDataset(file)

# Generate one reduced-graph dictionary per requested reduction.
# NOTE(review): assumes args.reduced is an iterable of reduction names — confirm
# against the parser (a plain string would be iterated character by character).
dict_reducedgraph = {}
for g in args.reduced:
    if g != 'substructure':
        dict_reducedgraph[g] = generateReducedGraphDict(file, g)
        continue

    # 'substructure' relies on a per-fold vocabulary file under vocab/.
    for i in range(splitting_fold):
        vocab_file = file+'_'+str(i)
        vocab_path = 'vocab/'+vocab_file+'.txt'
        if not os.path.exists(vocab_path):
            generateVocabTrain(file, splitting_seed, splitting_fold, vocab_len=args.vocab_len)
        # NOTE(review): every fold writes to the same key, so only the result
        # for the last fold stays in dict_reducedgraph — confirm this is intended.
        dict_reducedgraph[g] = generateReducedGraphDict(file, g, vocab_file=vocab_file)

# Training

In [None]:
# `time` is used for the wall-clock measurement below; import it here explicitly
# instead of relying on one of the star-imports to leak it into the namespace.
import time

# Search driver built from the parsed arguments; it provides the Optuna objective.
hyper = Hyper(args)

# storage_string = "sqlite:///./test.db"
# Optimisation direction per task type: regression is minimised,
# classification is maximised.
study_directions = {'regression': 'minimize', 'classification': 'maximize'}
if args.graphtask not in study_directions:
    # Fail here with a clear message rather than with a NameError on `study` later.
    raise ValueError(
        "Unsupported graphtask {!r}; expected one of {}".format(
            args.graphtask, sorted(study_directions)))
study = optuna.create_study(direction=study_directions[args.graphtask])

# Run the search: at most 5 trials, and no new trial starts after 3600 seconds.
t_start = time.time()
study.optimize(hyper.objective, n_trials=5, timeout=3600)
print("Time: {:.3f}s".format(time.time() - t_start))

In [None]:
# Split the study's trials by final state for the summary below.
pruned_trials = study.get_trials(deepcopy=False, states=[TrialState.PRUNED])
complete_trials = study.get_trials(deepcopy=False, states=[TrialState.COMPLETE])

print("Study statistics: ")
print("  Number of finished trials: ", len(study.trials))
print("  Number of pruned trials: ", len(pruned_trials))
print("  Number of complete trials: ", len(complete_trials))

print("Best trial:")
trial = study.best_trial

print("  Value: ", trial.value)

print("  Params: ")
# Render every tuned parameter as a command-line flag. The single 'channels'
# value is expanded into the in/hidden/out channel flags.
param_lines = []
for key, value in trial.params.items():
    if key == 'channels':
        param_lines.extend([
            "--in_channels {}".format(value),
            "--hidden_channels {}".format(value),
            "--out_channels {}".format(value),
        ])
    else:
        param_lines.append("--{} {}".format(key, value))

for line in param_lines:
    print(line)

# Save the same flags, one per line, in the run's log folder.
with open('dataset/{}/hyperparams.txt'.format(hyper.log_folder_name), 'w') as f:
    f.writelines(line + '\n' for line in param_lines)

print(optuna.importance.get_param_importances(study))

# Testing

In [None]:
# Test-time settings passed to the Tester together with `args`.
# To load a model from an earlier run, replace 'log_folder_name' with that
# run's folder, e.g.:
# ts = "2022-Oct-06-23:57:53"
# os.path.join(*[args.file, args.model+'_'+args.reduced+'_'+args.schema, f"{ts}"])
args_test = {
    'log_folder_name': hyper.log_folder_name,
    'exp_name': args.experiment_number,
    'fold_number': 0,
    'seed': args.seed,
}

# Data loader (and the underlying datasets) used for testing.
test_loader, datasets_test = generateDataLoaderTesting(args.file, args.batch_size)

In [None]:
# Build the tester from the run arguments and the test-time settings,
# then run it on the test loader.
tester = Tester(args, args_test)
# Last expression of the cell: in a notebook, whatever test() returns is displayed.
tester.test(test_loader)

In [None]:
# Embeddings and targets collected by the tester, plus the output location.
x_embed = tester.getXEmbed()
y_test = tester.getYTest()
path = 'results/'+hyper.log_folder_name
legend = None

if args.graphtask == 'regression':
    # Continuous targets are replaced by the index of the equal-width bin they
    # fall into, and `legend` gets one text label per bin.
    min_value = np.min(y_test)
    max_value = np.max(y_test)

    interval_num = 10
    interval = (max_value-min_value)/interval_num
    # (lower, upper) bounds of each bin; bins are half-open except the last one.
    ranging = [(min_value+(interval*i), min_value+(interval*(i+1))) for i in range(interval_num)]

    # Interior edges only: np.digitize then returns k for lower_k <= y < upper_k
    # and puts everything at or above the last interior edge (including the
    # maximum) into the final bin. Every sample gets exactly one label, so the
    # labels cannot end up shorter than x_embed.
    inner_edges = [upper for _, upper in ranging[:-1]]
    y_test = np.digitize(np.asarray(y_test).ravel(), inner_edges)

    # Interval notation: '[a,b)' for all bins but the last, which is closed: '[a,b]'.
    legend = [
        '[{:.2f},{:.2f}{}'.format(lower, upper, ']' if i == interval_num-1 else ')')
        for i, (lower, upper) in enumerate(ranging)
    ]