In [None]:
#Notebook authored by Bowen Jiang

In [None]:
#Basics 
import numpy as np
import pandas as pd 
import math, random
import sys, os 
import sklearn
from scipy import stats 
import scipy
import matplotlib 
import matplotlib.pyplot as plt 
%matplotlib inline

#Google Drive setup 
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
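#Change into the shared-drive project folder so that the relative paths used by later cells (cov_matrix.npy and the saved .npy/.csv outputs) resolve there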
%cd /content/drive/Shareddrives/CS\ 272\ PCOS\ FL/Synthetic\ Data\ Generation\ /

/content/drive/Shareddrives/CS 272 PCOS FL/Synthetic Data Generation 


In [None]:
#Keep knobs
# Tunable difficulty settings for the synthetic data. Each note says where the
# knob is consumed in the cells visible in this notebook.
# QUANTILE_WIDTH = 0.3 #Values in range [0, 0.5) -> lower number = easier learning
# Hormone samples are drawn with standard deviation scale 1/RESAMPLING_CONSTANT
# before being shaped by the covariance factor.
RESAMPLING_CONSTANT = 5 #Values in range [1, +inf] -> higher number = easier learning
# Multiplier applied to the off-diagonal entries of the covariance matrix.
COVARIANCE_STRENGTH = 1 #Values in range (0, +inf) but best to keep around/below 1 -> lower number = easier learning
# Multiplier applied to the diagonal entries of the covariance matrix.
SD_STRENGTH = 1 #lower number = easier learning
# Probability that a patient's `treatment` label is their own response type;
# otherwise a treatment is drawn uniformly at random.
PROPORTION_TRUE_TREATMENT = 0.9 #Values in range [0, 1] -> higher number = easier learning
# NOTE(review): not referenced by any cell visible here - confirm whether still needed.
PER_CLINIC_NOISE_SPREAD = 1.5 #Values in range [0, +inf) -> lower number = easier learning
# NOTE(review): not referenced by any cell visible here - confirm whether still needed.
HORMONE_MEASUREMENT_NOISE = 1 #Values in range [0, +inf) -> lower number = easier learning
# Multiplier on each clinic's drawn symptom-masking percentage.
SYMPTOM_MASK_CONST = 3 #lower number = easier learning

In [None]:
#Set one seed to control whole document
# Every later cell draws from NumPy's global (legacy) random state via
# np.random.*, so results are only reproducible when the cells are run in
# order from a fresh kernel.
np.random.seed(15)

In [None]:
# Overall hormone distributions, keyed by hormone name.
# Each value is a (mean, SD) pair; the key order is the hormone column order
# used throughout the notebook.
hormones = dict(
    lh_fsh_ratio=(1.7, 0.7),
    estradiol=(87.6, 21.1),
    testosterone=(71.4, 27.9),
    progesterone_17oh=(1.6, 1),
    dheas=(777.5, 1135.8),
    androstenedione=(5.2, 4.3),
    amh=(76.0, 36.3),
)

In [None]:
#USE EXISTING MEAN HORMONE DICTIONARY!

In [None]:
# Per-response-type hormone profiles: a list of 10 dicts, one per patient
# response type (list position k is stored as response type k+1 by the
# generation loop below).
# Each dict maps a hormone name to either
#   * a 3-tuple whose third element is used as that hormone's mean for the
#     response type, or
#   * the string 'NC', in which case the overall mean from `hormones` is used.
# NOTE(review): the first two tuple elements look like the lower/upper bounds
# of a value range (adjacent entries share boundaries) - they are not read by
# any cell visible here; confirm their meaning against the generating code.
ocp_dic = [{'lh_fsh_ratio': (0.8699313140385306, 1.1784567174934693, 1.0402891953161857),
  'estradiol': (61.84612290323542, 71.46553534780895, 67.18856637762423),
  'testosterone': (39.91965455227872, 52.92578733471443, 47.08742381850135),
  'progesterone_17oh': (0.9059946705884688,
   1.430952335524231,
   1.1830555069765727),
  'dheas': (279.4275747960521, 531.2850906850412, 407.6361390809437),
  'androstenedione': (2.4476670727842946,
   4.208819828665868,
   3.367032642907436),
  'amh': (32.67477443504756, 47.29217083031238, 40.70150395944589)},
 {'lh_fsh_ratio': (1.1784567174934693, 1.4081898465663012, 1.2987538299163248),
  'estradiol': (71.46553534780895, 78.51229246338411, 75.16323268009846),
  'testosterone': (52.92578733471443, 62.75089965001584, 58.0467002398079),
  'progesterone_17oh': (1.430952335524231,
   1.9105265579415995,
   1.6687348113097689),
  'dheas': (531.2850906850412, 771.4590855442336, 652.075073672365),
  'androstenedione': (4.208819828665868, 5.812523728359134, 5.013935116112919),
  'amh': (47.29217083031238, 58.278071757426034, 53.04898704760583)},
 {'lh_fsh_ratio': (1.4081898465663012, 1.6096580629931667, 1.5108917343317565),
  'estradiol': (78.51229246338411, 84.65260238129726, 81.64579439207088),
  'testosterone': (62.75089965001584, 71.58346081481864, 67.21987239443068),
  'progesterone_17oh': (1.9105265579415995,
   2.4814389507627457,
   2.172297526805999),
  'dheas': (771.4590855442336, 1011.3602356303993, 890.7762095518796),
  'androstenedione': (5.812523728359134, 7.509825382300042, 6.633013231463977),
  'amh': (58.278071757426034, 67.82291932559981, 63.16188780397973)},
 {'lh_fsh_ratio': (1.6096580629931667, 1.8037693838738167, 1.7066495000138242),
  'estradiol': (84.65260238129726, 90.5482790577141, 87.60043643937291),
  'testosterone': (71.58346081481864, 80.43491872290703, 75.95156918330734),
  'progesterone_17oh': (2.4814389507627457, 5.6, 2.913423920031703),
  'dheas': (1011.3602356303993, 1262.3050400572652, 1134.6471556590145),
  'androstenedione': (7.509825382300042, 9.697664820080757, 8.49606228441225),
  'amh': (67.82291932559981, 76.82558184815825, 72.35240230038578)},
 {'lh_fsh_ratio': (1.8037693838738167, 2.0063802635692856, 1.9029390694870993),
  'estradiol': (90.5482790577141, 96.68866525751392, 93.55511394749459),
  'testosterone': (80.43491872290703, 90.3335840905479, 85.16621911521005),
  'progesterone_17oh': 'NC',
  'dheas': (1262.3050400572652, 1539.7977971230525, 1396.4197207284756),
  'androstenedione': (9.697664820080757, 22.4, 11.423776253370793),
  'amh': (76.82558184815825, 85.88011469740675, 81.31138134987009)},
 {'lh_fsh_ratio': (2.0063802635692856, 2.2392643606287534, 2.1170222471114526),
  'estradiol': (96.68866525751392, 103.73563396945636, 100.03780580601811),
  'testosterone': (90.3335840905479, 103.58379684931408, 96.25352494672937),
  'progesterone_17oh': 'NC',
  'dheas': (1539.7977971230525, 1873.086601537927, 1696.5207600704855),
  'androstenedione': 'NC',
  'amh': (85.88011469740675, 95.60642220773076, 90.61161649611185)},
 {'lh_fsh_ratio': (2.2392643606287534, 2.5575692455319476, 2.380742874031894),
  'estradiol': (103.73563396945636, 113.35571565453242, 108.01282733664154),
  'testosterone': (103.58379684931408, 183.0, 114.27650536948542),
  'progesterone_17oh': 'NC',
  'dheas': (1873.086601537927, 2345.6890195865217, 2081.1443567852903),
  'androstenedione': 'NC',
  'amh': (95.60642220773076, 107.02398670042349, 101.00468684938917)},
 {'lh_fsh_ratio': (2.5575692455319476, 4.5, 2.817889736356375),
  'estradiol': (113.35571565453242, 172.0, 121.21708959604464),
  'testosterone': 'NC',
  'progesterone_17oh': 'NC',
  'dheas': (2345.6890195865217, 5320.7, 2742.0736807175563),
  'androstenedione': 'NC',
  'amh': (107.02398670042349, 122.89814698341254, 114.04886881456265)},
 {'lh_fsh_ratio': 'NC',
  'estradiol': 'NC',
  'testosterone': 'NC',
  'progesterone_17oh': 'NC',
  'dheas': 'NC',
  'androstenedione': 'NC',
  'amh': (122.89814698341254, 221.2, 136.02984198887162)},
 {'lh_fsh_ratio': (0, 0.8699313140385306, 0.6273303799911605),
  'estradiol': (0, 61.84612290323542, 53.9860157037583),
  'testosterone': (0.0, 39.91965455227872, 29.68125477740076),
  'progesterone_17oh': (0, 0.9059946705884688, 0.5606429797007548),
  'dheas': (0, 279.4275747960521, 144.51704743109633),
  'androstenedione': (8.881784197001252e-16,
   2.4476670727842946,
   1.3799508975978974),
  'amh': (0, 32.67477443504756, 21.670361307634224)}]

In [None]:
#Load in covariance matrix and prepare:
# The loaded matrix is rescaled element-wise: off-diagonal entries
# (covariances) by COVARIANCE_STRENGTH and diagonal entries by SD_STRENGTH.
# The scaling mask is sized from the loaded matrix instead of assuming 7x7.
symmetric_cov = np.load("cov_matrix.npy")
n_hormones = symmetric_cov.shape[0]
strength_mask = np.full((n_hormones, n_hormones), COVARIANCE_STRENGTH, dtype=float)
np.fill_diagonal(strength_mask, SD_STRENGTH)
symmetric_cov = symmetric_cov * strength_mask

# Eigen-factorization: sigmas = V * sqrt(w) (columns of V scaled by sqrt of the
# eigenvalues), so sigmas @ sigmas.T reproduces symmetric_cov when the
# eigenvectors are orthonormal.
# NOTE(review): a negative eigenvalue (possible once the off-diagonals are
# rescaled) makes np.sqrt return NaN here - confirm the scaled matrix stays
# positive semi-definite for the chosen knob values.
w, v = np.linalg.eig(symmetric_cov)
sigmas = np.sqrt(w) * v #get covariance-corrected SDs
print(f"Number of negative covariances: {np.count_nonzero(symmetric_cov < 0)} (of {symmetric_cov.size} total covariances)")

Number of negative covariances: 22 (of 49 total covariances)


In [None]:
# Standard deviation of each hormone, in the key order of `hormones`
# (lh_fsh_ratio, estradiol, testosterone, progesterone_17oh, dheas,
# androstenedione, amh). Derived from the (mean, SD) table rather than retyped
# so the two can never drift apart.
pcos_hormone_sds = np.array([sd for _mean, sd in hormones.values()])

In [None]:
# Generate N synthetic patients for every response type in `ocp_dic`.
# Result: all_patients[k] (k = 1..len(ocp_dic)) is a DataFrame with the hormone
# columns, the binary symptom columns, then 'treatment' and 'response_type'.
# The order of np.random calls per response type is: normal (hormones),
# uniform (symptom draws), uniform (true-vs-random treatment), randint
# (random treatment) - keep it that way to reproduce results for a given seed.
N = 10000  # patients generated per response type
hormone_names = ["lh_fsh_ratio","estradiol","testosterone","progesterone_17oh","dheas","androstenedione","amh"]
symptom_names = ['irreg_menstr', 'cysts', 'hirsutism', 'acne', 'anxiety', 'depression']
n_response_types = len(ocp_dic)

# get "square root" of covariance matrix via eigenfactorization.
# symmetric_cov does not change inside the loop, so this is computed once.
w, v = np.linalg.eig(symmetric_cov)
sigma = np.sqrt(w) * v

all_patients = {}
for i, curr_ocp in enumerate(ocp_dic):
    response_type = i + 1

    # Mean per hormone: third tuple element of the response-type entry, or the
    # overall mean from `hormones` when the entry is marked "NC".
    mu = [range_set[2] if range_set != "NC" else hormones[hormone][0]
          for hormone, range_set in curr_ocp.items()]

    if np.any(np.array(mu) < 0):
        print("Some means are negative")
        print(mu)

    # Column vector of means, shape (n_hormones, 1)
    mu_vec = np.array([mu]).T

    # generate hormone data: correlated Gaussian noise around the means,
    # one row per patient after the transpose
    patients = (sigma @ np.random.normal(loc=0, scale=(1/RESAMPLING_CONSTANT), size=(len(mu), N)) + mu_vec).T

    #Generate symptoms
    # Deviation of each hormone value from its mean, divided by (SD/5) and
    # multiplied by 10 (broadcast over patients).
    # NOTE(review): the constants 5 and 10 are unexplained scaling factors.
    hormone_zscores = (patients - mu_vec.T) / (pcos_hormone_sds / 5) * 10

    # One probability per symptom, shared by every patient of this response
    # type (each is a base rate plus a multiple of a column-average z-score).
    # Values outside [0, 1] are not clipped; the uniform comparison below then
    # simply yields always-1 or always-0.
    symptom_level_probs = [
        0.8 + 0.2*np.average(hormone_zscores[:, 0]),    #incr lh-fsh ratio -> irreg mens
        0.75 + 0.2*np.average(hormone_zscores[:, 1]),   #incr estrogen -> cysts
        0.75 + 0.5*np.average(hormone_zscores[:, 2]),   #high testosterone -> hirsutism
        0.25 + np.average(np.average(hormone_zscores[:, 2:4], axis=1)),  #high testosterone + progesterone -> acne
        # NOTE(review): labelled "high progesterone -> anxiety" but this reads
        # column 1 (estradiol), not column 3 (progesterone_17oh). Left as-is to
        # keep the generated data unchanged - confirm which is intended.
        0.42 + 2*np.average(hormone_zscores[:, 1]),
        0.42 + 1.5*np.average(np.average(-1*np.column_stack((hormone_zscores[:, 1], hormone_zscores[:, 3])), axis=1)),  #low estrogen + progesterone -> depression
    ]
    symptom_probs = np.tile(symptom_level_probs, (N, 1))
    print(np.average(symptom_probs, axis=0))

    #Symptom binary variables
    symptom_assignments = np.where(np.random.uniform(size=symptom_probs.shape) < symptom_probs, 1, 0)

    patients = pd.DataFrame(np.concatenate((patients, symptom_assignments), axis=1),
                            columns=hormone_names + symptom_names)

    # Treatment label: with probability PROPORTION_TRUE_TREATMENT the patient's
    # own response type, otherwise a uniformly random treatment in
    # 1..n_response_types (which may coincide with the true one).
    use_true_treatment = np.random.uniform(size=(N, 1)) < PROPORTION_TRUE_TREATMENT
    random_treatment = np.random.randint(1, n_response_types + 1, size=(N, 1))
    patients['treatment'] = np.where(use_true_treatment, response_type, random_treatment).ravel()
    patients['response_type'] = response_type

    # Stored column order is hormones, symptoms, treatment, response_type.
    # (A former re-ordering step that put response_type first built a copy
    # that was never stored, so it had no effect and has been dropped.)
    all_patients[response_type] = patients

[0.77971266 0.73889559 0.68029606 0.12209036 0.30895591 0.54895007]
[0.76398046 0.75262171 0.77984006 0.22144567 0.4462171  0.49776017]
[0.8045699  0.75528774 0.71001552 0.25101414 0.47287739 0.33867305]
[0.80606845 0.73987711 0.70260988 0.16503114 0.31877109 0.51432894]
[0.77784586 0.74314297 0.77172178 0.21420145 0.35142971 0.53199434]
[0.81411119 0.72656392 0.69667143 0.21576871 0.18563916 0.47923939]
[0.79962165 0.76678851 0.77892325 0.32003382 0.58788509 0.29537724]
[0.80763578 0.76117533 0.7982882  0.25593633 0.53175332 0.44162032]
[0.82437819 0.7467923  0.78767334 0.29624877 0.38792305 0.4191657 ]
[0.81221852 0.73358482 0.67204842 0.25768958 0.25584818 0.3530952 ]


In [None]:
### Generate 10 clinics - start with each clinic's mix of patient response types.
# Gamma draws (instead of uniform ones) give highly unequal response-type
# proportions between clinics; every row (clinic) is normalised to sum to 1.
clinic_response_type_proportions = np.random.gamma(shape=1.5, scale=1.5, size=(10,10))
clinic_response_type_proportions /= clinic_response_type_proportions.sum(axis=1, keepdims=True)

# Clinic sizes: one rounded Normal(1500, 300) draw per clinic, copied across
# the 10 response-type columns so it lines up with the proportions matrix.
clinic_sizes = np.tile(np.round(np.random.normal(loc=1500, scale=300, size=(10,1)), 0), (1, 10))

# Patients to sample per (clinic, response type): proportion * clinic size,
# rounded to an integer, reduced by 10 and floored at 0.
num_responsetypes_per_clinic = np.round(clinic_response_type_proportions * clinic_sizes, 0).astype('int') - 10
num_responsetypes_per_clinic = np.maximum(num_responsetypes_per_clinic, 0)

In [None]:
# Persist the total number of sampled patients per response type: column sums
# over (at most) the first 15 clinic rows of the count matrix. The matrix built
# above has 10 rows, so the slice keeps all of them.
np.save('num_training_examples_per_response_types.npy', num_responsetypes_per_clinic[:15, :].sum(axis=0).squeeze())

# Display the per-clinic (rows) x per-response-type (columns) counts
num_responsetypes_per_clinic

array([[ 23, 358, 101,  89,  33, 205, 429,   9, 456,  13],
       [  1, 313,  43, 277,  65, 120, 199,  97,  95,  46],
       [ 74,   5, 176, 315,  50, 131,  75,  55, 120, 164],
       [296, 274,  52,  57, 180, 134,  54, 100, 159, 164],
       [ 63,   0, 363, 235, 272, 234,  46, 139, 199, 153],
       [ 18, 233, 241,  56, 399, 282,  63,  94,  21,  28],
       [ 54, 177, 104, 582, 111,   6, 194, 157,  50,  18],
       [ 45,  22,  43,  90,  18, 233, 137, 370, 226, 194],
       [108, 119,  25,  79, 142,  27,  11, 135, 190, 408],
       [133,  63,  89,  83,  71,   0,  13,  99, 104, 128]])

In [None]:
# Build one DataFrame per clinic by taking consecutive, non-overlapping row
# ranges from each response type's patient frame.
#Because the response types are randomly generated, no need to reshuffle the data
clinics = {}
# prev_idxs[k] = how many rows of response type k+1 earlier clinics already used
prev_idxs = np.zeros(num_responsetypes_per_clinic.shape[1], dtype='int')
for clinic, counts in enumerate(num_responsetypes_per_clinic):
  #`counts` is this clinic's row of the count matrix: how many more rows of
  #each response-type dataframe to take
  next_idxs = prev_idxs + counts

  #Fetch the row ranges from the response type-specific dataframes and
  #concatenate them once (rather than growing the frame pair by pair)
  # NOTE: .iloc silently truncates if a range runs past the available rows.
  clinic_slices = [all_patients[k + 1].iloc[start:stop]
                   for k, (start, stop) in enumerate(zip(prev_idxs, next_idxs))]
  clinics[clinic] = pd.concat(clinic_slices, ignore_index=True)

  #Set new index positions for the next round of row slicing
  prev_idxs = next_idxs

In [None]:
#Choose noise levels per clinic
# Hormone noise: SD as a percentage of the hormone mean, evenly spaced from
# 0% to 36% across the 10 clinics.
# Symptom masking: one gamma(shape=9, scale=0.5) draw per clinic; the masking
# percentage actually applied (and saved) is SYMPTOM_MASK_CONST times the draw.

clinic_hormone_percent_noise = np.linspace(0, 36, num=10)
clinic_symptom_mask_prob = np.random.gamma(shape=9, scale=0.5, size=(10))
clinic_symptom_mask_percent = SYMPTOM_MASK_CONST*clinic_symptom_mask_prob

np.save('clinic_hormone_noise_levels_noise_challenge.npy', clinic_hormone_percent_noise)
np.save('clinic_symptom_mask_levels_noise_challenge.npy', clinic_symptom_mask_percent)


# Report the effective masking percentage (including SYMPTOM_MASK_CONST) so the
# message matches what is saved above and applied when the noise is added.
for i, (noise_percent, mask_percent) in enumerate(zip(clinic_hormone_percent_noise, clinic_symptom_mask_percent)):
  print(f"Clinic {i+1} will have mean 0, SD ({noise_percent}% hormone mean) noise added to hormone metrics, and {mask_percent}% of symptoms randomly masked")

Clinic 1 will have mean 0, SD (0.0% hormone mean) noise added to hormone metrics, and 4.617629684546998% of symptoms randomly masked
Clinic 2 will have mean 0, SD (4.0% hormone mean) noise added to hormone metrics, and 3.579106007127958% of symptoms randomly masked
Clinic 3 will have mean 0, SD (8.0% hormone mean) noise added to hormone metrics, and 2.8076880542710065% of symptoms randomly masked
Clinic 4 will have mean 0, SD (12.0% hormone mean) noise added to hormone metrics, and 5.228885372553222% of symptoms randomly masked
Clinic 5 will have mean 0, SD (16.0% hormone mean) noise added to hormone metrics, and 3.2032824648972293% of symptoms randomly masked
Clinic 6 will have mean 0, SD (20.0% hormone mean) noise added to hormone metrics, and 4.445415893047917% of symptoms randomly masked
Clinic 7 will have mean 0, SD (24.0% hormone mean) noise added to hormone metrics, and 1.7863999752433426% of symptoms randomly masked
Clinic 8 will have mean 0, SD (28.0% hormone mean) noise added

In [None]:
#Add noise per clinic and write one CSV per clinic
# NOTE: this mutates the frames stored in `clinics` in place, so running the
# cell twice adds noise / masks symptoms a second time. Re-run the clinic
# assembly cell first if this cell has to be repeated.
output_dir = 'clinic_datasets_noise_challenge_testing'
os.makedirs(output_dir, exist_ok=True)  # to_csv does not create missing folders

for clinic, patients in clinics.items():
  #For hormone measurements
  for hormone in ['lh_fsh_ratio', 'estradiol', 'testosterone', 'progesterone_17oh', 'dheas', 'androstenedione', 'amh']:
    measurements = patients[hormone].values
    #Create noise vector for each hormone independently: zero-mean Gaussian
    #whose SD is the clinic's noise percentage of this hormone's mean value
    noise = np.random.normal(loc=0, scale=0.01*clinic_hormone_percent_noise[clinic]*np.mean(measurements), size=measurements.shape)
    noisy_measurements = measurements + noise
    #Negative results are replaced by 0
    patients[hormone] = np.where(noisy_measurements < 0, 0, noisy_measurements).astype('float')
  #For symptoms: randomly mask entries
  for symptom in ['irreg_menstr', 'cysts', 'hirsutism', 'acne', 'anxiety', 'depression']:
    measurements = patients[symptom].values

    #Mask out symptoms at the clinic-specified rate; a masked entry is stored
    #as 2 (unmasked entries keep their 0/1 value)
    mask_prob = np.random.uniform(size=measurements.shape)
    patients[symptom] = np.where(mask_prob < 0.01*SYMPTOM_MASK_CONST*clinic_symptom_mask_prob[clinic], 2, measurements).astype('int')
  ##Set patient response type label (DO NOT USE AS A FEATURE), treatment label as ints
  patients['treatment'] = patients.treatment.values.astype('int')
  patients['response_type'] = patients.response_type.values.astype('int')
  patients.to_csv(f'{output_dir}/clinic_{clinic}.csv', index=False)