# Generating Biomarker Distributions

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as sc

# Called without an argument, so NumPy's legacy global RNG (the one behind
# np.random.uniform etc.) is re-seeded from fresh entropy at this point
# rather than from a fixed value.
np.random.seed()

## Parameters

In [2]:
# Number of biomarker values drawn per class for every simulated pair.
Ns = [50]
# Number of parameter settings (and therefore per-iteration seeds) simulated
# for each distribution family.
Nparams = [100, 500, 1000]
# Seed for NumPy's legacy global RNG, which draws the parameter grids via
# np.random.uniform. Applying it here makes those grids, and hence the
# generated data sets, identical from run to run.
seed = 0
np.random.seed(seed)

## Plotting function

In [3]:
def show_plots(rv1,rv2):
    """Open a new figure, draw a histogram of each sample on it, and show it.

    rv1 is drawn first and rv2 second, both on the same axes with
    matplotlib's default histogram settings.
    """
    plt.figure()
    for sample in (rv1, rv2):
        plt.hist(sample)
    plt.show()

## Generating distributions

In [4]:
import os  # only needed by this cell, to create the output folder


def _paired_frame(N, distribution, seed_i, rv1, rv2, params, index):
    """Build the table for one class-1 / class-0 pair of samples.

    Parameters
    ----------
    N : int
        Number of values per class; the first N rows are class 1 and the
        last N rows are class 0.
    distribution : str
        Label stored in the 'distribution' column of every row.
    seed_i : int
        Seed of the generator that produced both samples ('seed' column).
    rv1, rv2 : array-like of length N
        Biomarker values of class 1 and class 0 respectively.
    params : sequence of (column_name, class1_value, class0_value)
        Extra per-class columns, appended in the given order.
    index : list of str
        Row labels; must have length 2 * N.

    Returns
    -------
    pandas.DataFrame
        Columns 'class', 'distribution', 'biomarker', 'seed' followed by
        one column per entry of ``params``.
    """
    columns = {
        'class': np.repeat([1, 0], N),
        'distribution': np.repeat(distribution, 2 * N),
        'biomarker': np.concatenate([rv1, rv2]),
        'seed': np.repeat(seed_i, 2 * N),
    }
    for name, class1_value, class0_value in params:
        columns[name] = np.repeat([class1_value, class0_value], N)
    return pd.DataFrame(columns, index=index)


def _write_frames(frames, N, Nparam, distribution):
    """Stack the per-seed tables and write them to a single CSV in data/."""
    pd.concat(frames).to_csv('data/mccv_' + str(N) +
                             'subjects_' + str(Nparam) +
                             'parameters_' + distribution + '_data.csv')


# to_csv does not create missing folders, so make sure the target exists.
os.makedirs('data', exist_ok=True)

for N in Ns:
    # Row labels subject0 .. subject(2N-1); identical for every table of this N.
    subject_index = ['subject' + str(k) for k in range(N * 2)]

    for Nparam in Nparams:

        ## Normally distributed biomarker
        # Class-1 parameters come from NumPy's global RNG (np.random).
        mu_space = np.random.uniform(-5, 5, Nparam)
        sigma_space = np.random.uniform(0, 5, Nparam)
        # Class-0 parameters are redrawn from the range spanned by the
        # class-1 draws; the bounds do not change inside the loop.
        mu_lo, mu_hi = mu_space.min(), mu_space.max()
        sigma_lo, sigma_hi = sigma_space.min(), sigma_space.max()

        vals = []
        for i in range(Nparam):
            # One generator per parameter setting, seeded with its index.
            rng = np.random.default_rng(i)
            rv1 = rng.normal(mu_space[i], sigma_space[i], N)
            mu_new = rng.uniform(mu_lo, mu_hi, 1)[0]
            sigma_new = rng.uniform(sigma_lo, sigma_hi, 1)[0]
            rv2 = rng.normal(mu_new, sigma_new, N)
            tmp = _paired_frame(
                N, 'normal', i, rv1, rv2,
                [('mu', mu_space[i], mu_new),
                 ('sigma', sigma_space[i], sigma_new)],
                subject_index)
            vals.append(tmp)
        _write_frames(vals, N, Nparam, 'normal')

        ## T distributed biomarker
        mu_space = np.random.uniform(-5, 5, Nparam)
        df_space = np.random.uniform(1, N - 1, Nparam)
        mu_lo, mu_hi = mu_space.min(), mu_space.max()
        df_lo, df_hi = df_space.min(), df_space.max()

        vals = []
        for i in range(Nparam):
            rng = np.random.default_rng(i)
            # Standard t sample shifted by the location mu.
            rv1 = rng.standard_t(df_space[i], N)
            rv1 = rv1 + mu_space[i]
            mu_new = rng.uniform(mu_lo, mu_hi, 1)[0]
            df_new = rng.uniform(df_lo, df_hi, 1)[0]
            rv2 = rng.standard_t(df_new, N)
            rv2 = rv2 + mu_new
            # 'rv_mean' stores the empirical mean of each sample, not the
            # location shift that was applied to it.
            tmp = _paired_frame(
                N, 't', i, rv1, rv2,
                [('rv_mean', np.mean(rv1), np.mean(rv2)),
                 ('df', df_space[i], df_new)],
                subject_index)
            vals.append(tmp)
        _write_frames(vals, N, Nparam, 't')

        ## Beta distributed biomarker
        a_space = np.random.uniform(0, 5, Nparam)
        b_space = np.random.uniform(0, 5, Nparam)
        a_lo, a_hi = a_space.min(), a_space.max()
        b_lo, b_hi = b_space.min(), b_space.max()

        vals = []
        for i in range(Nparam):
            rng = np.random.default_rng(i)
            rv1 = rng.beta(a_space[i], b_space[i], N)
            a_new = rng.uniform(a_lo, a_hi, 1)[0]
            b_new = rng.uniform(b_lo, b_hi, 1)[0]
            rv2 = rng.beta(a_new, b_new, N)
            tmp = _paired_frame(
                N, 'beta', i, rv1, rv2,
                [('a', a_space[i], a_new),
                 ('b', b_space[i], b_new)],
                subject_index)
            vals.append(tmp)
        _write_frames(vals, N, Nparam, 'beta')