## VIDA Seminar Experiments

### Imports

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from imports import *

In [3]:
from models import *
from data import * 
from env import *
from harmonize import *
from sim import *
from sim import plot

In [4]:
# overall imports
import importlib
import data

# Import the project modules as module objects first, then reload them so
# that on-disk edits made since the kernel started are picked up.
import sim.sim
import sim.sim_utils
import sim.plot

importlib.reload(sim.sim)
importlib.reload(sim.sim_utils)
importlib.reload(sim.plot)

# Bind the individual names only AFTER reloading: importlib.reload() re-executes
# the module, but names previously copied out with `from ... import ...` keep
# referring to the pre-reload objects, so importing them first would leave
# stale classes/functions in the notebook namespace.
from sim.sim_utils import bytes2human, print_system_usage
from sim.sim import Simulation
from sim.sim_run import single_sim_run, open_pickled_results
from sim.plot import (
    plot_single_model_predictions_with_metrics,
    plot_fold_performance,
    plot_summary_measure_comparison
)


<module 'sim.plot' from '/scratch/asr655/neuroinformatics/GeneEx2Conn/sim/plot.py'>

#### Check job specs

In [5]:
# Report the resources of the current job: CPU/RAM usage, root-disk size,
# GPU availability, and the versions of the GPU-backed libraries.
print_system_usage()

total = psutil.disk_usage('/').total
print(bytes2human(total))

print(f"Number of available GPUs: {torch.cuda.device_count()}")

# GPUtil.getFirstAvailable() raises RuntimeError (it does not return None)
# when no GPU meets its availability thresholds, so catch that case instead
# of comparing the result against None.
try:
    DEVICE_ID_LIST = GPUtil.getFirstAvailable()
except RuntimeError:
    DEVICE_ID_LIST = []

# grab first element from list; None when no GPU is available
DEVICE_ID = DEVICE_ID_LIST[0] if DEVICE_ID_LIST else None

# Always define use_gpu so later cells never hit a NameError on CPU-only nodes.
# `is not None` is required here because device id 0 is a valid (falsy) id.
use_gpu = DEVICE_ID is not None
if use_gpu:
    print('GPU found', DEVICE_ID)
else:
    print('No available GPU found')

print("XGBoost version:", xgboost.__version__)
print("cupy version:", cp.__version__)

GPUtil.showUtilization()

# Release cached, unused GPU memory held by PyTorch's allocator.
torch.cuda.empty_cache()


CPU Usage: 6.5%
RAM Usage: 9.1%
Available RAM: 342.8G
Total RAM: 377.1G
52.4G
Number of available GPUs: 1
GPU found 0
XGBoost version: 2.0.3
cupy version: 13.1.0
| ID | GPU | MEM |
------------------
|  0 |  0% |  0% |


### To run:
- Genetics MLP runs for community splits
- SpectralA MLP runs for community splits
- Genetics + SpectralA MLP runs for community splits
- SpectralA linear model runs for community splits

In [6]:
feature_types = ['transcriptome', 'transcriptomePCA', 'structural_spectralA', 'euclidean']

Genetics MLP runs for community splits

In [None]:
# Sweep: model_type='mlp' on the 'transcriptome' features, cv_type='community'.
resolutions = [1.01]
# Only seed 42 is run here; the other sweeps use [1, 2, 4, 5, 42].
seeds = [42]

# Walk the (resolution, seed) grid in a single loop; resolution varies slowest.
for r, s in [(res, sd) for res in resolutions for sd in seeds]:
    print('resolution', r)
    print('seed', s)
    # Arguments are rebuilt on every iteration so each run gets fresh objects.
    run_kwargs = {
        'cv_type': 'community',
        'random_seed': s,
        'resolution': r,
        'model_type': 'mlp',
        'feature_type': ['transcriptome'],
        'use_gpu': True,
        'use_shared_regions': False,
        'test_shared_regions': False,
        'save_sim': True,
        'search_method': ('grid', 'mse'),
    }
    single_sim_run(**run_kwargs)

In [None]:
# Genetics xgboost runs for community splits

In [13]:
# Sweep: model_type='xgboost' on the 'transcriptome' features, cv_type='community'.
resolutions = [1.01]
# Full seed list; swap in [42] for a quick single-seed run.
seeds = [1, 2, 4, 5, 42]

# Walk the (resolution, seed) grid in a single loop; resolution varies slowest.
for r, s in [(res, sd) for res in resolutions for sd in seeds]:
    print('resolution', r)
    print('seed', s)
    # Arguments are rebuilt on every iteration so each run gets fresh objects.
    run_kwargs = {
        'cv_type': 'community',
        'random_seed': s,
        'resolution': r,
        'model_type': 'xgboost',
        'feature_type': ['transcriptome'],
        'use_gpu': True,
        'use_shared_regions': False,
        'test_shared_regions': False,
        'save_sim': True,
        'search_method': ('bayes', 'mse'),
    }
    single_sim_run(**run_kwargs)

resolution 1.01
seed 1
computing eig of laplacian
computing eig of adjacency
Number of components explaining 95.0% of the variance: 34

 Test fold num: 1
(6320, 22106) (6320,) (1122, 22106) (1122,)
SEARCH METHOD ('bayes', 'mse')
2
3
ACCELERATING
Fitting 2 folds for each of 10 candidates, totalling 20 fits
[CV 1/2] END colsample_bytree=0.8, device=cuda, learning_rate=0.3, max_depth=3, n_estimators=250, n_gpus=-1, random_state=42, reg_alpha=0.1, reg_lambda=0.0001, subsample=0.8, tree_method=gpu_hist, verbosity=0;, score=(train=-0.000, test=-0.040) total time=   1.7s
[CV 2/2] END colsample_bytree=0.8, device=cuda, learning_rate=0.3, max_depth=3, n_estimators=250, n_gpus=-1, random_state=42, reg_alpha=0.1, reg_lambda=0.0001, subsample=0.8, tree_method=gpu_hist, verbosity=0;, score=(train=-0.000, test=-0.088) total time=   1.1s
[CV 1/2] END colsample_bytree=0.6, device=cuda, learning_rate=0.001, max_depth=6, n_estimators=250, n_gpus=-1, random_state=42, reg_alpha=1, reg_lambda=0.01, subsamp

SpectralA MLP runs for community splits

In [None]:
# Sweep: model_type='mlp' on the 'structural_spectralA' features, cv_type='community'.
resolutions = [1.01]
seeds = [1, 2, 4, 5, 42]

# Walk the (resolution, seed) grid in a single loop; resolution varies slowest.
for r, s in [(res, sd) for res in resolutions for sd in seeds]:
    print('resolution', r)
    print('seed', s)
    # Arguments are rebuilt on every iteration so each run gets fresh objects.
    run_kwargs = {
        'cv_type': 'community',
        'random_seed': s,
        'resolution': r,
        'model_type': 'mlp',
        'feature_type': ['structural_spectralA'],
        # 10 or 20 are the candidates here (start with 10)
        'summary_measure': '10',
        'use_gpu': True,
        'use_shared_regions': False,
        'test_shared_regions': False,
        'save_sim': True,
        'search_method': ('grid', 'mse'),
    }
    single_sim_run(**run_kwargs)

resolution 1.01
seed 1
computing eig of laplacian
computing eig of adjacency
Number of components explaining 95.0% of the variance: 34

 Test fold num: 1
(6320, 20) (6320,) (1122, 20) (1122,)
SEARCH METHOD ('grid', 'mse')
2
3
GPU model input size 20
Fitting 2 folds for each of 4 candidates, totalling 8 fits
Epoch [10/200], Loss: 0.0161
Epoch [20/200], Loss: 0.0129
Epoch [30/200], Loss: 0.0121
Epoch [40/200], Loss: 0.0102
Epoch [50/200], Loss: 0.0087
Epoch [60/200], Loss: 0.0081
Epoch [70/200], Loss: 0.0074
Epoch [80/200], Loss: 0.0071
Epoch [90/200], Loss: 0.0064
Epoch [100/200], Loss: 0.0061
Epoch [110/200], Loss: 0.0066
Epoch [120/200], Loss: 0.0063
Epoch [130/200], Loss: 0.0058
Epoch [140/200], Loss: 0.0060
Epoch [150/200], Loss: 0.0054
Epoch [160/200], Loss: 0.0055
Epoch [170/200], Loss: 0.0061
Epoch [180/200], Loss: 0.0057
Epoch [190/200], Loss: 0.0050
Epoch [200/200], Loss: 0.0050
[CV 1/2] END batch_size=32, epochs=200, l2_reg=0.001, lr=0.001;, score=(train=-204.837, test=-82.730

SpectralA xgboost runs for community splits

In [6]:
# Sweep: model_type='xgboost' on the 'structural_spectralA' features, cv_type='community'.
resolutions = [1.01]
seeds = [1, 2, 4, 5, 42]

# Walk the (resolution, seed) grid in a single loop; resolution varies slowest.
for r, s in [(res, sd) for res in resolutions for sd in seeds]:
    print('resolution', r)
    print('seed', s)
    # Arguments are rebuilt on every iteration so each run gets fresh objects.
    run_kwargs = {
        'cv_type': 'community',
        'random_seed': s,
        'resolution': r,
        'model_type': 'xgboost',
        'feature_type': ['structural_spectralA'],
        # 10 or 20 are the candidates here (start with 10)
        'summary_measure': '10',
        'use_gpu': True,
        'use_shared_regions': False,
        'test_shared_regions': False,
        'save_sim': True,
        'search_method': ('bayes', 'mse'),
    }
    single_sim_run(**run_kwargs)

resolution 1.01
seed 1
computing eig of laplacian
computing eig of adjacency
Number of components explaining 95.0% of the variance: 34

 Test fold num: 1
(6320, 20) (6320,) (1122, 20) (1122,)
SEARCH METHOD ('bayes', 'mse')
2
3
ACCELERATING
Fitting 2 folds for each of 10 candidates, totalling 20 fits
[CV 1/2] END colsample_bytree=0.8, device=cuda, learning_rate=0.3, max_depth=3, n_estimators=250, n_gpus=-1, random_state=42, reg_alpha=0.1, reg_lambda=0.0001, subsample=0.8, tree_method=gpu_hist, verbosity=0;, score=(train=-0.003, test=-0.043) total time=   2.9s
[CV 2/2] END colsample_bytree=0.8, device=cuda, learning_rate=0.3, max_depth=3, n_estimators=250, n_gpus=-1, random_state=42, reg_alpha=0.1, reg_lambda=0.0001, subsample=0.8, tree_method=gpu_hist, verbosity=0;, score=(train=-0.002, test=-0.109) total time=   0.2s
[CV 1/2] END colsample_bytree=0.6, device=cuda, learning_rate=0.001, max_depth=6, n_estimators=250, n_gpus=-1, random_state=42, reg_alpha=1, reg_lambda=0.01, subsample=0.6

Genetics + SpectralA MLP runs for community splits

In [None]:
# Sweep: model_type='mlp' on 'transcriptome' + 'structural_spectralA' features,
# cv_type='community'.
resolutions = [1.01]
seeds = [1, 2, 4, 5, 42]

# Walk the (resolution, seed) grid in a single loop; resolution varies slowest.
for r, s in [(res, sd) for res in resolutions for sd in seeds]:
    print('resolution', r)
    print('seed', s)
    # Arguments are rebuilt on every iteration so each run gets fresh objects.
    run_kwargs = {
        'cv_type': 'community',
        'random_seed': s,
        'resolution': r,
        'model_type': 'mlp',
        'feature_type': ['transcriptome', 'structural_spectralA'],
        # 10 or 20 are the candidates here (start with 10)
        'summary_measure': '10',
        'use_gpu': True,
        'use_shared_regions': False,
        'test_shared_regions': False,
        'save_sim': True,
        'search_method': ('grid', 'mse'),
    }
    single_sim_run(**run_kwargs)

Genetics + SpectralA xgboost runs for community splits

In [8]:
# Sweep: model_type='xgboost' on 'transcriptome' + 'structural_spectralA' features,
# cv_type='community'.
resolutions = [1.01]
seeds = [1, 2, 4, 5, 42]

# Walk the (resolution, seed) grid in a single loop; resolution varies slowest.
for r, s in [(res, sd) for res in resolutions for sd in seeds]:
    print('resolution', r)
    print('seed', s)
    # Arguments are rebuilt on every iteration so each run gets fresh objects.
    run_kwargs = {
        'cv_type': 'community',
        'random_seed': s,
        'resolution': r,
        'model_type': 'xgboost',
        'feature_type': ['transcriptome', 'structural_spectralA'],
        # 10 or 20 are the candidates here (start with 10)
        'summary_measure': '10',
        'use_gpu': True,
        'use_shared_regions': False,
        'test_shared_regions': False,
        'save_sim': True,
        'search_method': ('bayes', 'mse'),
    }
    single_sim_run(**run_kwargs)

resolution 1.01
seed 1
computing eig of laplacian
computing eig of adjacency
Number of components explaining 95.0% of the variance: 34

 Test fold num: 1
(6320, 22126) (6320,) (1122, 22126) (1122,)
SEARCH METHOD ('bayes', 'mse')
2
3
ACCELERATING
Fitting 2 folds for each of 10 candidates, totalling 20 fits
[CV 1/2] END colsample_bytree=0.8, device=cuda, learning_rate=0.3, max_depth=3, n_estimators=250, n_gpus=-1, random_state=42, reg_alpha=0.1, reg_lambda=0.0001, subsample=0.8, tree_method=gpu_hist, verbosity=0;, score=(train=-0.000, test=-0.036) total time=   1.7s
[CV 2/2] END colsample_bytree=0.8, device=cuda, learning_rate=0.3, max_depth=3, n_estimators=250, n_gpus=-1, random_state=42, reg_alpha=0.1, reg_lambda=0.0001, subsample=0.8, tree_method=gpu_hist, verbosity=0;, score=(train=-0.000, test=-0.082) total time=   1.1s
[CV 1/2] END colsample_bytree=0.6, device=cuda, learning_rate=0.001, max_depth=6, n_estimators=250, n_gpus=-1, random_state=42, reg_alpha=1, reg_lambda=0.01, subsamp

Genetics + SpectralA ridge runs for community splits

In [9]:
# Sweep: model_type='ridge' on 'transcriptome' + 'structural_spectralA' features,
# cv_type='community', run with use_gpu=False.
resolutions = [1.01]
seeds = [1, 2, 4, 5, 42]

# Walk the (resolution, seed) grid in a single loop; resolution varies slowest.
for r, s in [(res, sd) for res in resolutions for sd in seeds]:
    print('resolution', r)
    print('seed', s)
    # Arguments are rebuilt on every iteration so each run gets fresh objects.
    run_kwargs = {
        'cv_type': 'community',
        'random_seed': s,
        'resolution': r,
        'model_type': 'ridge',
        'feature_type': ['transcriptome', 'structural_spectralA'],
        # 10 or 20 are the candidates here (start with 10)
        'summary_measure': '10',
        'use_gpu': False,
        'use_shared_regions': False,
        'test_shared_regions': False,
        'save_sim': True,
        'search_method': ('grid', 'mse'),
    }
    single_sim_run(**run_kwargs)

resolution 1.01
seed 1
computing eig of laplacian
computing eig of adjacency
Number of components explaining 95.0% of the variance: 34

 Test fold num: 1
(6320, 22126) (6320,) (1122, 22126) (1122,)
SEARCH METHOD ('grid', 'mse')
2
3
Fitting 2 folds for each of 8 candidates, totalling 16 fits
[CV 1/2] END .............alpha=0, solver=auto;, score=-0.033 total time=   1.5s
[CV 2/2] END .............alpha=0, solver=auto;, score=-0.150 total time=   0.5s
[CV 1/2] END .........alpha=0.001, solver=auto;, score=-0.033 total time=   0.8s
[CV 2/2] END .........alpha=0.001, solver=auto;, score=-0.147 total time=   0.4s
[CV 1/2] END ..........alpha=0.01, solver=auto;, score=-0.033 total time=   0.8s
[CV 2/2] END ..........alpha=0.01, solver=auto;, score=-0.147 total time=   0.4s
[CV 1/2] END ...........alpha=0.1, solver=auto;, score=-0.033 total time=   0.8s
[CV 2/2] END ...........alpha=0.1, solver=auto;, score=-0.147 total time=   0.4s
[CV 1/2] END ...........alpha=1.0, solver=auto;, score=-0.03

Genetics + SpectralA pls runs for community splits

In [10]:
# Sweep: model_type='pls' on 'transcriptome' + 'structural_spectralA' features,
# cv_type='community', run with use_gpu=False.
resolutions = [1.01]
seeds = [1, 2, 4, 5, 42]

# Walk the (resolution, seed) grid in a single loop; resolution varies slowest.
for r, s in [(res, sd) for res in resolutions for sd in seeds]:
    print('resolution', r)
    print('seed', s)
    # Arguments are rebuilt on every iteration so each run gets fresh objects.
    run_kwargs = {
        'cv_type': 'community',
        'random_seed': s,
        'resolution': r,
        'model_type': 'pls',
        'feature_type': ['transcriptome', 'structural_spectralA'],
        # 10 or 20 are the candidates here (start with 10)
        'summary_measure': '10',
        'use_gpu': False,
        'use_shared_regions': False,
        'test_shared_regions': False,
        'save_sim': True,
        'search_method': ('grid', 'mse'),
    }
    single_sim_run(**run_kwargs)

resolution 1.01
seed 1
computing eig of laplacian
computing eig of adjacency
Number of components explaining 95.0% of the variance: 34

 Test fold num: 1
(6320, 22126) (6320,) (1122, 22126) (1122,)
SEARCH METHOD ('grid', 'mse')
2
3
Fitting 2 folds for each of 8 candidates, totalling 16 fits
[CV 1/2] END max_iter=1000, n_components=1, scale=True, tol=1e-07;, score=-0.040 total time=   0.9s
[CV 2/2] END max_iter=1000, n_components=1, scale=True, tol=1e-07;, score=-0.064 total time=   0.7s
[CV 1/2] END max_iter=1000, n_components=1, scale=False, tol=1e-07;, score=-0.041 total time=   0.7s
[CV 2/2] END max_iter=1000, n_components=1, scale=False, tol=1e-07;, score=-0.065 total time=   0.6s
[CV 1/2] END max_iter=1000, n_components=2, scale=True, tol=1e-07;, score=-0.034 total time=   1.1s
[CV 2/2] END max_iter=1000, n_components=2, scale=True, tol=1e-07;, score=-0.100 total time=   0.7s
[CV 1/2] END max_iter=1000, n_components=2, scale=False, tol=1e-07;, score=-0.035 total time=   0.8s
[CV 2

Linear model (ridge and PLS) runs for community splits, on SpectralA and genetics features


In [6]:
# RIDGE
# Sweep: model_type='ridge' on the 'structural_spectralA' features,
# cv_type='community', run with use_gpu=False.

resolutions = [1.01]
seeds = [1, 2, 4, 5, 42]

# Walk the (resolution, seed) grid in a single loop; resolution varies slowest.
for r, s in [(res, sd) for res in resolutions for sd in seeds]:
    print('resolution', r)
    print('seed', s)
    # Arguments are rebuilt on every iteration so each run gets fresh objects.
    run_kwargs = {
        'cv_type': 'community',
        'random_seed': s,
        'resolution': r,
        'model_type': 'ridge',
        'feature_type': ['structural_spectralA'],
        # 10 or 20 are the candidates here (start with 10)
        'summary_measure': '10',
        'use_gpu': False,
        'use_shared_regions': False,
        'test_shared_regions': False,
        'save_sim': True,
        'search_method': ('grid', 'mse'),
    }
    single_sim_run(**run_kwargs)

resolution 1.01
seed 1
computing eig of laplacian
computing eig of adjacency
Number of components explaining 95.0% of the variance: 34

 Test fold num: 1
(6320, 20) (6320,) (1122, 20) (1122,)
SEARCH METHOD ('grid', 'mse')
2
3
Fitting 2 folds for each of 8 candidates, totalling 16 fits
[CV 1/2] END .............alpha=0, solver=auto;, score=-0.078 total time=   0.0s
[CV 2/2] END .............alpha=0, solver=auto;, score=-0.063 total time=   0.0s
[CV 1/2] END .........alpha=0.001, solver=auto;, score=-0.078 total time=   0.0s
[CV 2/2] END .........alpha=0.001, solver=auto;, score=-0.063 total time=   0.0s
[CV 1/2] END ..........alpha=0.01, solver=auto;, score=-0.078 total time=   0.0s
[CV 2/2] END ..........alpha=0.01, solver=auto;, score=-0.063 total time=   0.0s
[CV 1/2] END ...........alpha=0.1, solver=auto;, score=-0.077 total time=   0.0s
[CV 2/2] END ...........alpha=0.1, solver=auto;, score=-0.064 total time=   0.0s
[CV 1/2] END ...........alpha=1.0, solver=auto;, score=-0.070 tota

In [11]:
# RIDGE
# Sweep: model_type='ridge' on the 'transcriptome' features,
# cv_type='community', run with use_gpu=False.

resolutions = [1.01]
seeds = [1, 2, 4, 5, 42]

# Walk the (resolution, seed) grid in a single loop; resolution varies slowest.
for r, s in [(res, sd) for res in resolutions for sd in seeds]:
    print('resolution', r)
    print('seed', s)
    # Arguments are rebuilt on every iteration so each run gets fresh objects.
    # summary_measure is deliberately not passed in this sweep.
    run_kwargs = {
        'cv_type': 'community',
        'random_seed': s,
        'resolution': r,
        'model_type': 'ridge',
        'feature_type': ['transcriptome'],
        'use_gpu': False,
        'use_shared_regions': False,
        'test_shared_regions': False,
        'save_sim': True,
        'search_method': ('grid', 'mse'),
    }
    single_sim_run(**run_kwargs)

resolution 1.01
seed 1
computing eig of laplacian
computing eig of adjacency
Number of components explaining 95.0% of the variance: 34

 Test fold num: 1
(6320, 22106) (6320,) (1122, 22106) (1122,)
SEARCH METHOD ('grid', 'mse')
2
3
Fitting 2 folds for each of 8 candidates, totalling 16 fits
[CV 1/2] END .............alpha=0, solver=auto;, score=-0.033 total time=   1.4s
[CV 2/2] END .............alpha=0, solver=auto;, score=-0.149 total time=   0.5s
[CV 1/2] END .........alpha=0.001, solver=auto;, score=-0.033 total time=   0.8s
[CV 2/2] END .........alpha=0.001, solver=auto;, score=-0.147 total time=   0.4s
[CV 1/2] END ..........alpha=0.01, solver=auto;, score=-0.033 total time=   0.8s
[CV 2/2] END ..........alpha=0.01, solver=auto;, score=-0.147 total time=   0.4s
[CV 1/2] END ...........alpha=0.1, solver=auto;, score=-0.033 total time=   0.8s
[CV 2/2] END ...........alpha=0.1, solver=auto;, score=-0.147 total time=   0.4s
[CV 1/2] END ...........alpha=1.0, solver=auto;, score=-0.03

In [8]:
# PLS
# Sweep: model_type='pls' on the 'structural_spectralA' features,
# cv_type='community', run with use_gpu=False.
resolutions = [1.01]
seeds = [1, 2, 4, 5, 42]

# Walk the (resolution, seed) grid in a single loop; resolution varies slowest.
for r, s in [(res, sd) for res in resolutions for sd in seeds]:
    print('resolution', r)
    print('seed', s)
    # Arguments are rebuilt on every iteration so each run gets fresh objects.
    run_kwargs = {
        'cv_type': 'community',
        'random_seed': s,
        'resolution': r,
        'model_type': 'pls',
        'feature_type': ['structural_spectralA'],
        # 10 or 20 are the candidates here (start with 10)
        'summary_measure': '10',
        'use_gpu': False,
        'use_shared_regions': False,
        'test_shared_regions': False,
        'save_sim': True,
        'search_method': ('grid', 'mse'),
    }
    single_sim_run(**run_kwargs)

resolution 1.01
seed 1
computing eig of laplacian
computing eig of adjacency
Number of components explaining 95.0% of the variance: 34

 Test fold num: 1
(6320, 20) (6320,) (1122, 20) (1122,)
SEARCH METHOD ('grid', 'mse')
2
3
Fitting 2 folds for each of 8 candidates, totalling 16 fits
[CV 1/2] END max_iter=1000, n_components=1, scale=True, tol=1e-07;, score=-0.044 total time=   0.0s
[CV 2/2] END max_iter=1000, n_components=1, scale=True, tol=1e-07;, score=-0.076 total time=   0.0s
[CV 1/2] END max_iter=1000, n_components=1, scale=False, tol=1e-07;, score=-0.044 total time=   0.0s
[CV 2/2] END max_iter=1000, n_components=1, scale=False, tol=1e-07;, score=-0.075 total time=   0.0s
[CV 1/2] END max_iter=1000, n_components=2, scale=True, tol=1e-07;, score=-0.052 total time=   0.0s
[CV 2/2] END max_iter=1000, n_components=2, scale=True, tol=1e-07;, score=-0.069 total time=   0.0s
[CV 1/2] END max_iter=1000, n_components=2, scale=False, tol=1e-07;, score=-0.053 total time=   0.0s
[CV 2/2] EN

In [12]:
# PLS
# Sweep: model_type='pls' on the 'transcriptome' features,
# cv_type='community', run with use_gpu=False.
resolutions = [1.01]
seeds = [1, 2, 4, 5, 42]

# Walk the (resolution, seed) grid in a single loop; resolution varies slowest.
for r, s in [(res, sd) for res in resolutions for sd in seeds]:
    print('resolution', r)
    print('seed', s)
    # Arguments are rebuilt on every iteration so each run gets fresh objects.
    # summary_measure is deliberately not passed in this sweep.
    run_kwargs = {
        'cv_type': 'community',
        'random_seed': s,
        'resolution': r,
        'model_type': 'pls',
        'feature_type': ['transcriptome'],
        'use_gpu': False,
        'use_shared_regions': False,
        'test_shared_regions': False,
        'save_sim': True,
        'search_method': ('grid', 'mse'),
    }
    single_sim_run(**run_kwargs)

resolution 1.01
seed 1
computing eig of laplacian
computing eig of adjacency
Number of components explaining 95.0% of the variance: 34

 Test fold num: 1
(6320, 22106) (6320,) (1122, 22106) (1122,)
SEARCH METHOD ('grid', 'mse')
2
3
Fitting 2 folds for each of 8 candidates, totalling 16 fits
[CV 1/2] END max_iter=1000, n_components=1, scale=True, tol=1e-07;, score=-0.040 total time=   0.9s
[CV 2/2] END max_iter=1000, n_components=1, scale=True, tol=1e-07;, score=-0.064 total time=   0.7s
[CV 1/2] END max_iter=1000, n_components=1, scale=False, tol=1e-07;, score=-0.041 total time=   0.7s
[CV 2/2] END max_iter=1000, n_components=1, scale=False, tol=1e-07;, score=-0.065 total time=   0.6s
[CV 1/2] END max_iter=1000, n_components=2, scale=True, tol=1e-07;, score=-0.034 total time=   1.1s
[CV 2/2] END max_iter=1000, n_components=2, scale=True, tol=1e-07;, score=-0.100 total time=   0.8s
[CV 1/2] END max_iter=1000, n_components=2, scale=False, tol=1e-07;, score=-0.035 total time=   0.8s
[CV 2