In [42]:
import os
import utils.args_parser  as argtools
import pytorch_lightning as pl
import numpy as np
from utils.constants import Cte
from data_modules.my_toy_scm import MyToySCMDataModule
from utils.distributions import *
from data_modules.het_scm import HeterogeneousSCMDataModule
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers.tensorboard import TensorBoardLogger
from models._evaluator import MyEvaluator



def create_data(n,
                seed,
                structural_equations,
                noise_distributions,
                graph,
                name,
                equations_type,
                cfg = None,
                new_data = False):
    if cfg = None:
        model_file =   os.path.join('_params', 'model_vaca.yaml')
        trainer_file =   os.path.join('_params', 'trainer.yaml')


        cfg = argtools.parse_args(model_file)
        cfg.update(argtools.parse_args(trainer_file))
        # Config for new dataset
        cfg['dataset'] = {
            'params1': {},
            'params2': {}
        }

        cfg['dataset']['params1'] = {
            'data_dir': '../Data',
            'batch_size': 1000,
            'num_workers': 0
        }

        cfg['dataset']['params2'] = {
            'num_samples_tr': None,
            'equations_type': 'linear',
            'normalize': 'lik',
            'likelihood_names': 'd',
            'lambda_': 0.05,
            'normalize_A': None,
        }

        cfg['root_dir'] = 'results'
        cfg['seed'] = None
        pl.seed_everything(cfg['seed'])

        cfg['dataset']['params'] = cfg['dataset']['params1'].copy()
        cfg['dataset']['params'].update(cfg['dataset']['params2'])

        cfg['dataset']['params']['data_dir'] = ''

        cfg['trainer']['limit_train_batches'] = 1.0
        cfg['trainer']['limit_val_batches'] = 1.0
        cfg['trainer']['limit_test_batches'] = 1.0
        cfg['trainer']['check_val_every_n_epoch'] = 10
        cfg['dataset']['params'] = equations_type 
        cfg['dataset']['name'] = name
        cfg['dataset']['params']['num_samples_tr'] = n
        cfg['seed'] = seed

    intervene_nodes = []
    adj_edges = {}
    
    for node in g.nodes:
        if g.out_degree[node] > 0:
            intervene_nodes.append(node)
        adj_edges[node] = (list(g.neighbors(node)))
    
    dataset_params = cfg['dataset']['params'].copy()
    dataset_params['dataset_name'] = cfg['dataset']['name']

    dataset_params['nodes_to_intervene'] = intervene_nodes
    dataset_params['structural_eq'] = structural_equations
    dataset_params['noises_distr'] = noise_distributions
    
    dataset_params['adj_edges'] = adj_edges

    data_module = MyToySCMDataModule(**dataset_params)
    data_module.prepare_data(new_data = new_data)
    return data_module,cfg

def get_train_data(data_module):
    orig_bs = data_module.batch_size
    data_module.batch_size = 1
    train = data_module.train_dataloader()
    n_points = len(data_module.train_dataloader())
    data_module.batch_size = n_points
    train = data_module.train_dataloader()
    batch = next(iter(train))
    data_module.batch_size = orig_bs
    return batch.x.view(n,-1)


def create_vaca_model(cfg):
 
    model = None
    model_params = cfg['model']['params'].copy()

    #print(f"\nUsing model: {cfg['model']['name']}")


    # VACA
    if cfg['model']['name'] == Cte.VACA:
        from models.vaca.vaca import VACA

        model_params['is_heterogeneous'] = data_module.is_heterogeneous
        model_params['likelihood_x'] = data_module.likelihood_list

        model_params['deg'] = data_module.get_deg(indegree=True)
        model_params['num_nodes'] = data_module.num_nodes
        model_params['edge_dim'] = data_module.edge_dimension
        model_params['scaler'] = data_module.scaler

        model = VACA(**model_params)
        model.set_random_train_sampler(data_module.get_random_train_sampler())
    # VACA with PIWAE
    elif cfg['model']['name'] == Cte.VACA_PIWAE:
        from models.vaca.vaca_piwae import VACA_PIWAE

        model_params['is_heterogeneous'] = data_module.is_heterogeneous

        model_params['likelihood_x'] = data_module.likelihood_list

        model_params['deg'] = data_module.get_deg(indegree=True)
        model_params['num_nodes'] = data_module.num_nodes
        model_params['edge_dim'] = data_module.edge_dimension
        model_params['scaler'] = data_module.scaler

        model = VACA_PIWAE(**model_params)
        model.set_random_train_sampler(data_module.get_random_train_sampler())


    # MultiCVAE
    elif cfg['model']['name'] == Cte.MCVAE:
        from models.multicvae.multicvae import MCVAE

        model_params['likelihood_x'] = data_module.likelihood_list

        model_params['topological_node_dims'] = data_module.train_dataset.get_node_columns_in_X()
        model_params['topological_parents'] = data_module.topological_parents
        model_params['scaler'] = data_module.scaler
        model_params['num_epochs_per_nodes'] = int(
            np.floor((cfg['trainer']['max_epochs'] / len(data_module.topological_nodes))))
        model = MCVAE(**model_params)
        model.set_random_train_sampler(data_module.get_random_train_sampler())
        cfg['early_stopping'] = False

    # CAREFL
    elif cfg['model']['name'] == Cte.CAREFL:
        from models.carefl.carefl import CAREFL

        model_params['node_per_dimension_list'] = data_module.train_dataset.node_per_dimension_list
        model_params['scaler'] = data_module.scaler
        model = CAREFL(**model_params)

    model.set_optim_params(optim_params=cfg['optimizer'],
                           sched_params=cfg['scheduler'])
    return model

def fit_vaca(model,cfg, data_module):
    

    is_training = True
    load = True

    print(f'Is training activated? {is_training}')
    print(f'Is loading activated? {load}')
    if yaml_file == '':
        if (cfg['dataset']['name'] in [Cte.GERMAN]) and (cfg['dataset']['params3']['train_kfold'] == True):
            save_dir = argtools.mkdir(os.path.join(cfg['root_dir'],
                                                   argtools.get_experiment_folder(cfg),
                                                   str(cfg['seed']), str(cfg['dataset']['params3']['kfold_idx'])))
        else:
            save_dir = argtools.mkdir(os.path.join(cfg['root_dir'],
                                                   argtools.get_experiment_folder(cfg),
                                                   str(cfg['seed'])))
    else:
        save_dir = os.path.join(*yaml_file.split('/')[:-1])

    logger = TensorBoardLogger(save_dir=save_dir, name='logs', default_hp_metric=False)

    out = logger.log_hyperparams(argtools.flatten_cfg(cfg))

    save_dir_ckpt = argtools.mkdir(os.path.join(save_dir, 'ckpt'))
    if load:
        ckpt_file = argtools.newest(save_dir_ckpt)
    else:
        ckpt_file = None
    callbacks = []



    evaluator = MyEvaluator(model=model,
                            intervention_list=data_module.train_dataset.get_intervention_list(),
                            scaler=data_module.scaler
                            )
    model.set_my_evaluator(evaluator=evaluator)

    if is_training:
        checkpoint = ModelCheckpoint(period=1,
                                     monitor=model.monitor(),
                                     mode=model.monitor_mode(),
                                     save_top_k=1,
                                     save_last=True,
                                     filename='checkpoint-{epoch:02d}',
                                     dirpath=save_dir_ckpt)
        callbacks = [checkpoint]


        if cfg['early_stopping']:
            early_stopping = EarlyStopping(model.monitor(), mode=model.monitor_mode(), min_delta=0.0, patience=50)
            callbacks.append(early_stopping)
        trainer = pl.Trainer(logger=logger, callbacks=callbacks, **cfg['trainer'])

    if load:
        if ckpt_file is None:
            print(f'No ckpt files in {save_dir_ckpt}')
        else:
            print(f'\nLoading from: {ckpt_file}')
            if is_training:
                trainer = pl.Trainer(logger=logger, callbacks=callbacks, resume_from_checkpoint=ckpt_file,
                                 **cfg['trainer'])
            else:

                model = model.load_from_checkpoint(ckpt_file, **model_params)
                evaluator.set_model(model)
                model.set_my_evaluator(evaluator=evaluator)

                if cfg['model']['name'] in [Cte.VACA_PIWAE, Cte.VACA, Cte.MCVAE]:
                    model.set_random_train_sampler(data_module.get_random_train_sampler())
    path = os.path.join(save_dir,"logs")

    if not os.path.exists(path):
        os.makedirs(path)

    if is_training:
        trainer.fit(model, data_module)
        # save_yaml(model.get_arguments(), file_path=os.path.join(save_dir, 'hparams_model.yaml'))
        argtools.save_yaml(cfg, file_path=os.path.join(save_dir, 'hparams_full.yaml'))
        # %% Testing

SyntaxError: invalid syntax (2034102531.py, line 24)

In [None]:
def vaca_cf(model, cfg, intervention, n_samples = 100):
    n = cfg['dataset']['params']['num_samples_tr']
    cfg['dataset']['params']['num_samples_tr'] = n_samples
    data_module = create_data(cfg,new_data=True)

    data_module.batch_size = 1
    x_I = intervention  # Intervention before normalizing
    data_loader = data_module.test_dataloader()
    data_loader.dataset.set_intervention(x_I,is_noise=False)
    data_loader = data_module.test_dataloader()

    batch = next(iter(data_loader))
    cfg['dataset']['params']['num_samples_tr'] = n
    vaca_pred, gt_cf, factual = model.get_counterfactual_distr(data_loader,
                                            x_I=x_I,
                                            is_noise = False,
                                            num_batches= 1,
                                            normalize=False,
                                            )

In [18]:
cfg['dataset']['params']['num_samples_tr']=10
data_module = create_data(cfg,new_data=True)
bs = data_module.batch_size
data_module.batch_size = 1
x_I = {'x1': 10.0}  # Intervention before normalizing
data_loader = data_module.test_dataloader()
data_loader.dataset.set_intervention(x_I,is_noise=False)
data_loader = data_module.test_dataloader()
data_module.batch_size = bs

batch = next(iter(data_loader))

vaca_pred, gt_cf, factual = model.get_counterfactual_distr(data_loader,
                                        x_I=x_I,
                                        is_noise = False,
                                        num_batches= 1,
                                        normalize=False,
                                        )

gt_cf

{'intervened': tensor([[10.],
         [10.]]),
 'children': tensor([[9.2288],
         [8.3016]]),
 'all': tensor([[10.0000,  9.2288],
         [10.0000,  8.3016]])}

In [6]:
gt_cf

{'intervened': tensor([[10.],
         [10.]]),
 'children': tensor([[10.5870],
         [10.4335]]),
 'all': tensor([[10.0000, 10.5870],
         [10.0000, 10.4335]])}

In [7]:
factual

{'all': tensor([[ 2.4700,  3.0570],
         [-0.3974,  0.0362]])}

In [29]:
batch = next(iter(train))

In [33]:
temp = batch.x.view(-1,2)

In [37]:
batch.x

tensor([[-0.5491],
        [-1.5926],
        [-0.6784],
        ...,
        [-0.0165],
        [ 0.7259],
        [ 0.4806]])