In [5]:
import numpy as np
import pandas as pd
import os 
import matplotlib.pyplot as plt
#scikit-learn related imports
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# pytorch relates imports
import torch
import torch.nn as nn
import torch.optim as optim
import openpyxl

# imports from captum library
from captum.attr import IntegratedGradients,  GradientShap,FeatureAblation
from sklearn.utils import resample

import scipy.stats as stats

seed = 42
np.random.seed(seed)

BD_path = '../data/reducedata_bd1011.xlsx'
HC_path = '../data/reducedata_hc1011.xlsx'
HC = pd.read_excel(HC_path, index_col = False)
HC = HC.iloc[:, 2:]
HC.columns = HC.iloc[0]
HC = HC.iloc[1:].reset_index(drop=True)

BD = pd.read_excel(BD_path, index_col = False)
BD = BD.iloc[:, 2:]
BD.columns = BD.iloc[0]
BD = BD.iloc[1:].reset_index(drop=True)

BD_mean = BD.iloc[:, 2:].mean()
BD_sd = BD.iloc[:, 2:].std()

HC_mean = HC.iloc[:, 2:].mean()
HC_sd = HC.iloc[:, 2:].std()

HC_synthetic = np.random.normal(loc=HC_mean.T, scale=HC_sd.T, size=(120, 88))
HC_synthetic = pd.DataFrame(HC_synthetic)
HC_synthetic.columns = HC_mean.index

BD_synthetic = np.random.normal(loc=BD_mean.T, scale=BD_sd.T, size=(60, 88))
BD_synthetic = pd.DataFrame(BD_synthetic)
BD_synthetic.columns = BD_mean.index

In [6]:
HC = HC.iloc[:, 2:].apply(pd.to_numeric, errors='coerce')
BD = BD.iloc[:, 2:].apply(pd.to_numeric, errors='coerce')


In [7]:
HC_mean = HC.mean()
HC_cov = HC.cov()

BD_mean = BD.mean()
BD_cov = BD.cov()

# Generate synthetic data preserving correlations
# Specify the number of synthetic samples you want to generate
num_samples_HC = 120
num_samples_BD = 60

HC_synthetic = np.random.multivariate_normal(HC_mean.values, HC.cov().values, num_samples_HC)
HC_synthetic = pd.DataFrame(HC_synthetic, columns=HC_mean.index)

BD_synthetic = np.random.multivariate_normal(BD_mean, BD_cov, num_samples_BD)
BD_synthetic = pd.DataFrame(BD_synthetic, columns=BD_mean.index)

# Output the first few rows to verify
print(HC_synthetic.head())
print(BD_synthetic.head())

0  Age at Visit  Left Deep White Matter Hyperintensity volume (mm3) FLAIR  \
0     57.332108                                         -57.455109          
1     60.067185                                         586.488485          
2     71.074105                                         317.847099          
3     35.221284                                         151.530080          
4     55.984465                                         671.230972          

0  Right Deep White Matter Hyperintensity volume (mm3) FLAIR  \
0                                        -179.465602           
1                                         454.866428           
2                                         267.451808           
3                                          51.584416           
4                                         380.715971           

0  Left Periventricular Hyperintensity volume (mm3) FLAIR  \
0                                        -523.610417        
1                             

  HC_synthetic = np.random.multivariate_normal(HC_mean.values, HC.cov().values, num_samples_HC)
  BD_synthetic = np.random.multivariate_normal(BD_mean, BD_cov, num_samples_BD)


In [8]:
BD_synthetic.to_csv('../data/BD_synthetic.csv',  index=False) #index=False to prevent inserting a new first column that contains indices
HC_synthetic.to_csv('../data/HC_synthetic.csv',  index=False)