In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import random

from scipy.special import comb, loggamma, lambertw
from scipy.stats import multinomial, expon

from silence_tensorflow import silence_tensorflow
silence_tensorflow()
import tensorflow as tf
import tensorflow_probability as tfp

# Build a TF1-compatibility session configuration.
# `allow_growth = True` is the GPUOptions switch documented by TensorFlow as allocating
# GPU memory on demand instead of reserving it all when the session starts.
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
# Create the session with that configuration; `sess` is kept as a module-level handle.
sess = tf.compat.v1.Session(config = config)

import os, shutil
import json
import subprocess

from net_model import *
from custom_model import *
from mps_models import *

import mps
import pwexp

E0000 00:00:1741042659.514411   22272 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741042659.518082   22272 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
I0000 00:00:1741042661.268977   22272 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 4203 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3050 6GB Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from sample_mnist_rgp10 import *

In [5]:
# Load the Fashion-MNIST arrays through the Keras dataset helper
fashion_mnist = tf.keras.datasets.fashion_mnist
(train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data()

# Boolean masks flagging the observations whose label is one of 0, 1, 2, 3, 4
i_valid_train = np.isin(train_labels, [0,1,2,3,4])
i_valid_test = np.isin(test_labels, [0,1,2,3,4])

# Keep only the labels of the selected classes
# (note: the image arrays are NOT filtered in this cell, only the label vectors)
train_labels = train_labels[i_valid_train]
test_labels = test_labels[i_valid_test]

# Positional indices 0..n-1 into each filtered label vector
i_train = np.arange(len(train_labels))
i_test = np.arange(len(test_labels))

In [12]:
def get_theta(log_a, log_phi, C, C_inv, sup, p_0, theta_min = None, theta_max = None):
    '''
        Given the specifications for the latent causes distribution and a vector with cure probabilities,
        inverts the cure probability function and returns the theta parameters for each individual.

        The inversion computed here is theta = C_inv( exp(log_a(0) - log(p_0)) ).

        Parameters
        ----------
        log_a : callable
            Only log_a(0.0) is evaluated by this function.
        log_phi, C, sup :
            Not used by this function; kept so every model helper shares the same signature.
        C_inv : callable
            Applied to the array exp(log_a(0) - log(p_0)); its result must support boolean-mask
            assignment whenever a bound is given (e.g. a numpy array).
        p_0 : array-like
            Cure probabilities, one per individual.
        theta_min : float, optional
            Lower bound for theta. Values at or below it are moved to theta_min + 1e-5.
        theta_max : float, optional
            Upper bound for theta. Values at or above it are moved to theta_max - 1e-5.

        Returns
        -------
        The theta values, clipped strictly inside (theta_min, theta_max) for whichever bounds were given.
    '''
    theta = C_inv( np.exp(log_a(0.0) - np.log(p_0)) )

    # If theta is bounded below by theta_min, values at or below the bound are pushed just inside it
    if(theta_min is not None):
        theta[theta <= theta_min] = theta_min + 1.0e-5
    # If theta is bounded above by theta_max, values at or above the bound are pushed just inside it.
    # This guard must test theta_max (not theta_min): each bound is optional on its own.
    if(theta_max is not None):
        theta[theta >= theta_max] = theta_max - 1.0e-5

    return theta


def generate_data(log_a, log_phi, theta, sup, low_c, high_c):
    '''
        Given the model specification and a vector with the individual parameters, simulates the
        lifetime and censoring indicator of every individual.
        low_c and high_c define the interval of the censoring times, drawn from U[low_c, high_c].

        Returns the tuple (m, t, delta, cured):
            m     - latent number of causes drawn by mps.rvs (unknown in practice)
            t     - observed time (event time, or censoring time when censored)
            delta - 1 when the event was observed before the censoring time, 0 otherwise
            cured - 1 for individuals with m == 0, 0 otherwise
    '''
    n_obs = len(theta)
    # NOTE(review): the meaning of size = 10 is defined by mps.rvs, which is not visible here - confirm
    m = mps.rvs(log_a, log_phi, theta, sup, size = 10)

    # Censoring times, one per individual
    censor_time = np.random.uniform(low = low_c, high = high_c, size = n_obs)

    t = np.zeros(n_obs)
    cured = np.zeros(n_obs)
    for idx in range(n_obs):
        n_causes = m[idx]
        if(n_causes == 0):
            # No latent cause: the individual is cured and can only be seen at the censoring time
            cured[idx] = 1
            t[idx] = censor_time[idx]
            continue
        # The baseline follows an Exp(1) distribution; the lifetime is the smallest of the latent times
        latent_times = expon.rvs(loc = 0.0, scale = 1.0, size = int(n_causes))
        t[idx] = np.min(latent_times)

    # Both masks are evaluated before t is overwritten
    is_observed = t < censor_time
    is_censored = t >= censor_time

    # Uncensored positions receive delta = 1
    delta = np.zeros(n_obs)
    delta[is_observed] = 1
    # Censored positions take the censoring time as their observed time
    t[is_censored] = censor_time[is_censored]

    return m, t, delta, cured

def join_datasets(n_train, n_val, n_test, theta_train, theta_val, theta_test, m_train, m_val, m_test, t_train, t_val, t_test, delta_train, delta_val, delta_test):
    '''
        Stacks the train, validation and test simulations (in that order) into a single DataFrame
        with the columns "theta", "m", "t", "delta" and "set", where "set" tags each row with
        "train", "val" or "test" according to the given split sizes.
    '''
    # One tag per row: n_train times "train", then n_val times "val", then n_test times "test"
    split_tag = np.repeat(["train", "val", "test"], [n_train, n_val, n_test])

    stacked = {
        "theta": np.concatenate((theta_train, theta_val, theta_test)),
        "m": np.concatenate((m_train, m_val, m_test)),
        "t": np.concatenate((t_train, t_val, t_test)),
        "delta": np.concatenate((delta_train, delta_val, delta_test)),
    }
    stacked["set"] = split_tag
    return pd.DataFrame(stacked)

In [13]:
def sample_single_bootstrap_rgp10(cure_probs_dict_vec, directory, file_index):
    '''
        Get a single bootstrap sample from the Fashion-MNIST dataset, simulated under the RGP(-1/10) distribution.

        Reads "<directory>/indices_<file_index>.csv", which must have the columns "index" and "set"
        ("train", "val" or "test"). Train and validation indices are looked up in the module-level
        train_labels, test indices in the module-level test_labels.

        Parameters
        ----------
        cure_probs_dict_vec : callable
            Maps an array of labels to an array of cure probabilities.
        directory : str
            Folder holding the indices file.
        file_index :
            Identifier of the bootstrap replicate, used to build the indices file name.

        Returns
        -------
        pandas.DataFrame with the columns "theta", "m", "t", "delta" and "set", with the train rows
        first, then the validation rows, then the test rows.
    '''
    # ---------------------------- Load the sampled indices of this replicate ----------------------------

    df_indices = pd.read_csv("{}/indices_{}.csv".format(directory, file_index))
    indices = df_indices["index"].to_numpy()
    sets = df_indices["set"].to_numpy()

    is_train = sets == "train"
    is_val = sets == "val"
    is_test = sets == "test"

    n_train = int(np.sum(is_train))
    n_val = int(np.sum(is_val))
    n_test = int(np.sum(is_test))

    # Select each split through its own mask, so the result does not depend on every "train" row
    # appearing before every "val" row in the indices file
    label_train = train_labels[indices[is_train]]
    label_val = train_labels[indices[is_val]]
    label_test = test_labels[indices[is_test]]

    p_train = cure_probs_dict_vec(label_train)
    p_val = cure_probs_dict_vec(label_val)
    p_test = cure_probs_dict_vec(label_test)

    # The censored times follow a U(low_c, high_c) distribution - To control the censored and cured observations properly,
    # a different censoring distribution should be used for each distribution chosen for M
    low_c = 0
    high_c = 6

    # ---------------------------- RGP(-1/10) ----------------------------
    q = -1.0/10.0
    log_a = log_a_rgp(q)
    log_phi = log_phi_rgp(q)
    C = C_rgp(q)
    C_inv = C_inv_rgp(q)
    sup = sup_rgp(q)
    # theta_min_rgp is passed as it is, while the upper bound is obtained by calling theta_max_rgp with q
    theta_max = theta_max_rgp(q)

    def _simulate_split(p_split):
        # Recovers theta from the cure probabilities and simulates (m, t, delta) for one split
        theta = get_theta(log_a, log_phi, C, C_inv, sup, p_split, theta_min = theta_min_rgp, theta_max = theta_max)
        m, t, delta, _cured = generate_data(log_a, log_phi, theta, sup, low_c, high_c)
        return theta, m, t, delta

    # Simulate in the order train -> validation -> test so the random stream is consumed in a fixed order
    theta_train_rgp10, m_train_rgp10, t_train_rgp10, delta_train_rgp10 = _simulate_split(p_train)
    theta_val_rgp10, m_val_rgp10, t_val_rgp10, delta_val_rgp10 = _simulate_split(p_val)
    theta_test_rgp10, m_test_rgp10, t_test_rgp10, delta_test_rgp10 = _simulate_split(p_test)

    # DataFrame with the simulated values for the RGP(-1/10)
    rgp10_data = join_datasets(
        n_train, n_val, n_test,
        theta_train_rgp10, theta_val_rgp10, theta_test_rgp10,
        m_train_rgp10, m_val_rgp10, m_test_rgp10,
        t_train_rgp10, t_val_rgp10, t_test_rgp10,
        delta_train_rgp10, delta_val_rgp10, delta_test_rgp10
    )
    # NOTE(review): saving to disk is disabled. The previously commented-out call wrote to
    # "<directory>/poisson/data_<file_index>.csv", which looks like a leftover from another
    # distribution - confirm the intended output folder before re-enabling it.
    return rgp10_data

In [30]:
# Vectorized lookup label -> cure probability (labels 0 to 4), applied element-wise to label arrays
cure_probs_dict1 = np.vectorize(
    {0: 0.9, 1:0.45, 2:0.22, 3:0.14, 4: 0.08}.get
)

# Replicate to simulate: Scenario 1, sample size 500, first indices file
file_index = 1
directory = "SimulationDataset/Scenario1/n{}".format(500)

# Fix the global numpy seed so the simulated sample is reproducible
np.random.seed(333)

df = sample_single_bootstrap_rgp10(cure_probs_dict1, directory, file_index)
df.head(4)

Unnamed: 0,theta,m,t,delta,set
0,0.105361,0,0.145632,0.0,train
1,1.514128,2,0.246477,1.0,train
2,0.105361,0,4.987943,0.0,train
3,1.966113,1,0.588371,1.0,train
