In [1]:
import numpy as np
import pandas as pd
import scipy   

In [None]:
def generate_clinic(num_patients):
    lh_fsh_ratio = np.random.normal(1.7,0.7,num_patients)
    estradiol = np.random.normal(87.6,21.1,num_patients)
    testosterone = np.random.normal(71.4,27.9,num_patients)
    fai = np.random.normal(11.8,8.7,num_patients)
    shbg = np.random.normal(25.9,13.4,num_patients)
    progesterone_17oh = np.random.normal(1.6,1,num_patients)
    dheas = np.random.normal(777.5,1135.8,num_patients)
    androstenedione = np.random.normal(5.2,4.3,num_patients)
    inhibin_b = np.random.normal(88.6,47.5,num_patients)
    
    combined = np.column_stack((lh_fsh_ratio, estradiol, testosterone, fai, shbg, progesterone_17oh, dheas, androstenedione, inhibin_b))
    combined_df.columns = ["lh_fsh_ratio", "estradiol", "testosterone", "fai", "shbg", "progesterone_17oh", "dheas", "androstenedione", "inhibin_b"]
    return combined_df

In [2]:
hormones = {
    "lh_fsh_ratio" : (1.7,0.7),
    "estradiol" : (87.6,21.1),
    "testosterone" : (71.4,27.9),
    "progesterone_17oh" : (1.6,1),
    "dheas" : (777.5,1135.8),
    "androstenedione" : (5.2,4.3),
    "amh" : (76.0, 36.3)
}

In [3]:
# 1st random seed for generating the ranges
np.random.seed(5)
hormone_ranges_dic = {}

# Generate the ranges for each hormone
for hormone, dist in hormones.items():
    mean, sd = dist
    # Get the number of ranges to create
    hormone_num_ranges = np.random.randint(5,11)
    print("Number ranges "+str(hormone_num_ranges))
    hormone_ranges = []
    
    # Get the boundaries of each range
    for i in range(1,hormone_num_ranges + 1):
        # Get the current percentiles
        lower_percentile = ((i-1)*100/hormone_num_ranges)
        upper_percentile = (i*100/hormone_num_ranges)
        mid_percentile = (lower_percentile+upper_percentile)/2
        
        # Get the values of those percentiles
        lower_val = max(scipy.stats.norm.ppf(lower_percentile / 100, loc=mean, scale=sd),0)
        upper_val = min(scipy.stats.norm.ppf(upper_percentile / 100, loc=mean, scale=sd), mean+4*sd)
        middle_val = scipy.stats.norm.ppf(mid_percentile / 100, loc=mean, scale=sd)
        hormone_ranges.append((lower_val, upper_val, middle_val))
    hormone_ranges_dic[hormone] = hormone_ranges
    
for hormone, val_range in hormone_ranges_dic.items():
    print("*************")
    print(hormone)
    print(val_range)
    print("*************")

Number ranges 8
Number ranges 10
Number ranges 5
Number ranges 6
Number ranges 5
Number ranges 9
Number ranges 8
*************
lh_fsh_ratio
[(0, 0.8947554337367944, 0.6261156189532175), (0.8947554337367944, 1.2278571748627427, 1.0789974086867868), (1.2278571748627427, 1.4769524452249374, 1.3578565122197312), (1.4769524452249374, 1.7, 1.5898825207728804), (1.7, 1.9230475547750625, 1.8101174792271195), (1.9230475547750625, 2.172142825137257, 2.0421434877802684), (2.172142825137257, 2.5052445662632055, 2.321002591313213), (2.5052445662632055, 4.5, 2.7738843810467824)]
*************
*************
estradiol
[(0, 60.55926196700892, 52.893588471323916), (60.55926196700892, 69.8417919716115, 65.73125548168103), (69.8417919716115, 76.53514918186033, 73.36826627086268), (76.53514918186033, 82.25437612383462, 79.46973815880031), (82.25437612383462, 87.6, 84.94854558135793), (87.6, 92.94562387616537, 90.25145441864206), (92.94562387616537, 98.66485081813966, 95.73026184119968), (98.66485081813966,

In [4]:
np.random.seed(10)
ocps = np.arange(1,11)
ocp1={}
ocp2={}
ocp3={}
ocp4={}
ocp5={}
ocp6={}
ocp7={}
ocp8={}
ocp9={}
ocp10={}
ocp_dic = [ocp1,ocp2,ocp3,ocp4,ocp5,ocp6,ocp7,ocp8,ocp9,ocp10]
for hormone, val_ranges in hormone_ranges_dic.items():
    np.random.shuffle(ocps)
    for i in range(len(val_ranges)):
        (ocp_dic[i-1])[hormone] = val_ranges[i]
    for i in range(len(val_ranges),10):
        (ocp_dic[i-1])[hormone] = "NC"

In [5]:
ocp_dic

[{'lh_fsh_ratio': (0.8947554337367944, 1.2278571748627427, 1.0789974086867868),
  'estradiol': (60.55926196700892, 69.8417919716115, 65.73125548168103),
  'testosterone': (47.9187675833157, 64.3316158225112, 56.76922569544567),
  'progesterone_17oh': (0.632578433898299,
   1.1692727007045427,
   0.9255102498039184),
  'dheas': (0, 489.7483602583587, 181.88589766620714),
  'androstenedione': (0, 1.9117484027185356, 1.0400872657626863),
  'amh': (34.24231749235092, 51.51602206788223, 43.796579907614806)},
 {'lh_fsh_ratio': (1.2278571748627427, 1.4769524452249374, 1.3578565122197312),
  'estradiol': (69.8417919716115, 76.53514918186033, 73.36826627086268),
  'testosterone': (64.3316158225112, 78.46838417748882, 71.4),
  'progesterone_17oh': (1.1692727007045427, 1.6, 1.3895716057520755),
  'dheas': (489.7483602583587, 1065.2516397416414, 777.5),
  'androstenedione': (1.9117484027185356,
   3.3478726130295335,
   2.6653400692459535),
  'amh': (51.51602206788223, 64.43339108809319, 58.257416

In [None]:
# "lh_fsh_ratio" : (1.7,0.7),
# "estradiol" : (87.6,21.1),
# "testosterone" : (71.4,27.9),
# "progesterone_17oh" : (1.6,1),
# "dheas" : (777.5,1135.8),
# "androstenedione" : (5.2,4.3),
# "amh" : (76.0, 36.3)
np.random.seed(15)

true_variances = [0.7**2, 21.1**2, 27.9**2, 1**2, 1135.8**2, 4.3**2, 36.3**2]
#covariance_matrix = np.zeros((7,7))
np.fill_diagonal(covariance_matrix, true_variances)
#mean_variance = np.mean(true_variances)

#for i in range(7):
    #for j in range(7):
        #if(i != j and covariance_matrix[i,j] == 0):
            #new_val = np.random.randint(0,1000)
            #covariance_matrix[i,j] = new_val
            #covariance_matrix[j,i] = new_val
    
rounds = 0
while(True):
    i = np.random.randint(0,7)
    j = np.random.randint(0,7)
    if(i != j):
        add = np.random.randint(0,2)
        if(add == 0 and covariance_matrix[i,j] > 0 ):
            covariance_matrix[i,j] -= 1
            covariance_matrix[j,i] -= 1
            rounds += 1
        else:
            covariance_matrix[i,j] += 1
            covariance_matrix[j,i] += 1
            rounds += 1
    
    try:
        inverse_matrix = np.linalg.inv(covariance_matrix)
        #print("The matrix is invertible.")
        w, v = np.linalg.eig(covariance_matrix)
        if(np.any(w < 0)):
            continue
            #print("There are negative eigens")
        else:
            print("No negative eigens")
            if(rounds >= 2):
                break
        #print("Inverse matrix:\n", inverse_matrix)
    except np.linalg.LinAlgError:
        print("The matrix is not invertible.")
            
    

#covariance_matrix = np.random.randint(0, high = 100, size=(7,7))
#np.fill_diagonal(covariance_matrix, true_variances)

#symmetric_cov = covariance_matrix + np.transpose(covariance_matrix) - np.diag(covariance_matrix.diagonal())


In [30]:
covariance_matrix

array([[4.90000000e-01, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 4.45210000e+02, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 1.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 7.78410000e+02, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        1.29004164e+06, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 1.84900000e+01, 0.00000000e+00],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 1.31769000e+03]])

In [23]:
try:
    inverse_matrix = np.linalg.inv(symmetric_cov)
    print("The matrix is invertible.")
    print("Inverse matrix:\n", inverse_matrix)
except np.linalg.LinAlgError:
    print("The matrix is not invertible.")

The matrix is invertible.
Inverse matrix:
 [[-4.87826380e-02  4.01124433e-03 -5.00624539e-04  1.60501860e-02
   1.88644552e-06  1.38320105e-03  2.66721515e-03]
 [ 4.01124433e-03  3.61847956e-03  2.47042710e-04 -4.02601164e-03
   1.65610865e-07 -4.12241866e-03  3.80909206e-05]
 [-5.00624539e-04  2.47042710e-04  1.49751735e-03 -8.13254536e-04
   9.52701834e-08 -1.17183378e-03  9.54746311e-05]
 [ 1.60501860e-02 -4.02601164e-03 -8.13254536e-04 -3.52253131e-03
  -1.57901292e-06  1.00590237e-02 -1.58272653e-03]
 [ 1.88644552e-06  1.65610865e-07  9.52701834e-08 -1.57901292e-06
   7.75191364e-07 -7.37912426e-07 -7.65267919e-08]
 [ 1.38320105e-03 -4.12241866e-03 -1.17183378e-03  1.00590237e-02
  -7.37912426e-07  4.85966658e-03 -8.14198611e-04]
 [ 2.66721515e-03  3.80909206e-05  9.54746311e-05 -1.58272653e-03
  -7.65267919e-08 -8.14198611e-04  6.94265488e-04]]


In [15]:
symmetric_cov[symmetric_cov < 0] = 0

In [24]:
N = 1000
all_patients = []
for curr_ocp in ocp_dic:
    mu = []
    for hormone, range_set in curr_ocp.items():
        print(hormone)
        if(range_set != "NC"):
            mu.append(range_set[2])
        else:
            mu.append((hormones[hormone])[0])
    
    # get the mean vector
    mu_vec =np.array([mu]).T

    # get "square root" of covariance matrix via eigenfactorization
    w, v = np.linalg.eig(symmetric_cov)
    sigma = np.sqrt(w) * v

    # generate data
    patients = sigma @ np.random.randn(7, N) + mu_vec
    all_patients.append(patients)

lh_fsh_ratio
estradiol
testosterone
progesterone_17oh
dheas
androstenedione
amh
lh_fsh_ratio
estradiol
testosterone
progesterone_17oh
dheas
androstenedione
amh
lh_fsh_ratio
estradiol
testosterone
progesterone_17oh
dheas
androstenedione
amh
lh_fsh_ratio
estradiol
testosterone
progesterone_17oh
dheas
androstenedione
amh
lh_fsh_ratio
estradiol
testosterone
progesterone_17oh
dheas
androstenedione
amh
lh_fsh_ratio
estradiol
testosterone
progesterone_17oh
dheas
androstenedione
amh
lh_fsh_ratio
estradiol
testosterone
progesterone_17oh
dheas
androstenedione
amh
lh_fsh_ratio
estradiol
testosterone
progesterone_17oh
dheas
androstenedione
amh
lh_fsh_ratio
estradiol
testosterone
progesterone_17oh
dheas
androstenedione
amh
lh_fsh_ratio
estradiol
testosterone
progesterone_17oh
dheas
androstenedione
amh


  sigma = np.sqrt(w) * v


In [25]:
w

array([ 1.29004104e+06,  1.36190845e+03,  8.35816705e+02,  4.59979389e+02,
       -1.43927017e+02, -1.83368870e+01,  6.35206963e+01])

In [9]:
all_patients[0]

array([[nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       ...,
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan]])