In [7]:
import numpy as np
import pandas as pd

from scipy.special import comb, loggamma, lambertw
from scipy.stats import multinomial, expon

from silence_tensorflow import silence_tensorflow
silence_tensorflow()
import tensorflow as tf

from matplotlib import pyplot as plt

from net_model import *
from custom_model import *
from mps_models import *
import mps
import pwexp

# Load dataset

In [2]:
fashion_mnist = tf.keras.datasets.fashion_mnist
(train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data()

i_valid_train = pd.Series(train_labels).isin([0,1,2,3,4]).to_numpy()
i_valid_test = pd.Series(test_labels).isin([0,1,2,3,4]).to_numpy()

# Filters to take only the images with labels in [0, 1, 2, 3, 4]
train_images = train_images[i_valid_train]
test_images = test_images[i_valid_test]
train_labels = train_labels[i_valid_train]
test_labels = test_labels[i_valid_test]

#### Function to read a specific simulated dataset

In [5]:
def load_file(data_dir, file_index, distribution, train_images, test_images):
    '''
        Example:
            data_dir = "SimulationDataset/Scenario1/n500"
            file_index = 20
            distribution = "poisson"
    '''
    index_path = "{}/indices_{}.csv".format(data_dir, file_index, distribution)
    data_path = "{}/{}/data_{}.csv".format(data_dir, distribution, file_index)
    df_index = pd.read_csv(index_path)
    df_data = pd.read_csv(data_path)

    index_train = df_index.loc[df_index.set == "train","index"].to_numpy()
    index_val = df_index.loc[df_index.set == "val","index"].to_numpy()
    index_test = df_index.loc[df_index.set == "test","index"].to_numpy()

    # Values for the thetas
    theta_train = df_data.loc[df_data.set == "train", "theta"]
    theta_val = df_data.loc[df_data.set == "val", "theta"]
    theta_test = df_data.loc[df_data.set == "test", "theta"]
    # Values for the latent variable
    m_train = df_data.loc[df_data.set == "train", "m"]
    m_val = df_data.loc[df_data.set == "val", "m"]
    m_test = df_data.loc[df_data.set == "test", "m"]
    # Values for the time variable
    t_train = df_data.loc[df_data.set == "train", "t"]
    t_val = df_data.loc[df_data.set == "val", "t"]
    t_test = df_data.loc[df_data.set == "test", "t"]
    # Values for the censorship indicators
    delta_train = df_data.loc[df_data.set == "train", "delta"]
    delta_val = df_data.loc[df_data.set == "val", "delta"]
    delta_test = df_data.loc[df_data.set == "test", "delta"]

    img_train = train_images[index_train,:,:]
    img_val = train_images[index_val,:,:]
    img_test = test_images[index_test,:,:]

    result = {
        "theta_train": theta_train, "theta_val": theta_val, "theta_test": theta_test,
        "m_train": m_train, "m_val": m_val, "m_test": m_test,
        "t_train": t_train, "t_val": t_val, "t_test": t_test,
        "delta_train": delta_train, "delta_val": delta_val, "delta_test": delta_test,
        "img_train": img_train, "img_val": img_val, "img_test": img_test
    }
    
    return result

In [None]:
alpha0, s_t = initialize_alpha_s(t_train, n_cuts = 5)
s_t = np.array([0.0, 2.062, 3.36, 4.495, 6.32, 12.2])

print("alpha0: {}".format(alpha0))
print("cuts: {}".format(s_t))

In [None]:
def fit_simulation(log_a_str
                    theta_train, theta_val
                   m_train, m_val,
                   t_train, t_val,
                   delta_train, delta_val,
                   img_train, img_val,
                   distribution, seed = 1):
    set_all_seeds(seed)
    
    alpha0, s_t = initialize_alpha_s(t_train, n_cuts = 5)

    if(distribution == "poisson"):
        model = MPScrModel(a_poisson, phi_poisson, C_poisson, C_inv_poisson, sup_poisson)
        model.define_structure(shape_input = x_train[0].shape, seed = 42)
    
    results = call_EM("EM.py",
                      a_poisson_str, phi_poisson_str, C_poisson_str, C_inv_poisson_str, B_poisson_str, sup_poisson_str,
                      poisson_model, alpha0, s_t,
                      x_train, t_train, delta_train, delta_train,
                      max_iterations = 60,
                      early_stopping_em = True, early_stopping_em_warmup = 5, early_stopping_em_eps = 1.0e-6,
                      epochs = 100, batch_size = 64, shuffle = True,
                      learning_rate = 0.001, run_eagerly = False,
                      early_stopping_nn = True, early_stopping_min_delta_nn = 0.0, early_stopping_patience_nn = 5,
                      reduce_lr = True, reduce_lr_steps = 10, reduce_lr_factor = 0.1,
                      validation = True,
                      x_val = x_test, t_val = t_test, delta_val = delta_test, m_val = delta_test,
                      verbose = 3, alpha_known = False)

In [52]:
# load_file("SimulationDataset/Scenario1/n500", 1, "bernoulli", train_images, test_images)

#### 1

In [20]:
load_file("SimulationDataset/Scenario1/n500", 1, "bernoulli")

array([27554, 20611,  9781, 22729, 28783, 20657, 20557,  5412,  2964,
       20330,  9120, 25574, 29502, 23313, 24980, 19122, 28657, 25458,
       12747,  9515, 15814, 28472, 25117, 13016,  2545, 16713, 29172,
       23178, 29868, 21933, 18044, 15748,  8345,   451, 21680,  9091,
       21351, 10261, 27171, 25670, 13532,   834, 10912, 19098,  6653,
       20043,  6052, 12421, 25685, 11967, 12229,  9524, 26588, 22616,
        9897, 12354, 26131,  1757, 18501,  6840, 29865, 23657, 28265,
        9150, 11506, 21491,  2798, 23006, 29199,   225,  2838, 15680,
       17875, 29019,  7356, 16019, 13670, 12501,  7721, 27724, 25094,
       10464,  3946, 20153, 23772, 28781,  6362, 20124, 25939, 29900,
       15311, 11188,  4352, 23837, 17852, 10516, 18408,  5098,   337,
       13914, 13394, 27472, 20384, 16831, 27251,  7862,  5439,  4738])