In [41]:
import numpy as np
import pandas as pd
import os 
import matplotlib.pyplot as plt
#scikit-learn related imports
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# pytorch relates imports
import torch
import torch.nn as nn
import torch.optim as optim
import openpyxl

# imports from captum library
from captum.attr import IntegratedGradients,  GradientShap,FeatureAblation
from sklearn.utils import resample

import scipy.stats as stats

BD_path = '../data/reducedata_bd1011.xlsx'
HC_path = '../data/reducedata_hc1011.xlsx'
HC = pd.read_excel(HC_path, index_col = False)
HC = HC.iloc[:, 2:]
HC.columns = HC.iloc[0]
HC = HC.iloc[1:].reset_index(drop=True)

BD = pd.read_excel(BD_path, index_col = False)
BD = BD.iloc[:, 2:]
BD.columns = BD.iloc[0]
BD = BD.iloc[1:].reset_index(drop=True)

BD_mean = BD.iloc[:, 2:].mean()
BD_sd = BD.iloc[:, 2:].std()

HC_mean = HC.iloc[:, 2:].mean()
HC_sd = HC.iloc[:, 2:].std()

HC_synthetic = np.random.normal(loc=HC_mean.T, scale=HC_sd.T, size=(120, 88))
HC_synthetic = pd.DataFrame(HC_synthetic)
HC_synthetic.columns = HC_mean.index

BD_synthetic = np.random.normal(loc=BD_mean.T, scale=BD_sd.T, size=(60, 88))
BD_synthetic = pd.DataFrame(BD_synthetic)
BD_synthetic.columns = BD_mean.index

In [58]:
HC = HC.iloc[:, 2:].apply(pd.to_numeric, errors='coerce')
BD = BD.iloc[:, 2:].apply(pd.to_numeric, errors='coerce')


In [None]:
ty

In [61]:
HC_mean = HC.mean()
HC_cov = HC.cov()

BD_mean = BD.mean()
BD_cov = BD.cov()

# Generate synthetic data preserving correlations
# Specify the number of synthetic samples you want to generate
num_samples_HC = 120
num_samples_BD = 60

HC_synthetic = np.random.multivariate_normal(HC_mean.values, HC.cov().values, num_samples_HC)
HC_synthetic = pd.DataFrame(HC_synthetic, columns=HC_mean.index)

BD_synthetic = np.random.multivariate_normal(BD_mean, BD_cov, num_samples_BD)
BD_synthetic = pd.DataFrame(BD_synthetic, columns=BD_mean.index)

# Output the first few rows to verify
print(HC_synthetic.head())
print(BD_synthetic.head())

0  Right Deep White Matter Hyperintensity volume (mm3) FLAIR  \
0                                         130.807286           
1                                         263.237796           
2                                         511.626097           
3                                         127.981398           
4                                        -365.846755           

0  Left Periventricular Hyperintensity volume (mm3) FLAIR  \
0                                         813.035998        
1                                          22.350361        
2                                         255.012934        
3                                          22.301196        
4                                         271.601220        

0  Right periventricular Hyperintensity volume (mm3) FLAIR  \
0                                         459.247739         
1                                         155.129585         
2                                         168.476226         


  HC_synthetic = np.random.multivariate_normal(HC_mean.values, HC.cov().values, num_samples_HC)
  BD_synthetic = np.random.multivariate_normal(BD_mean, BD_cov, num_samples_BD)


In [62]:
BD_synthetic.to_csv('../data/BD_synthetic.csv',  index=False) #index=False to prevent inserting a new first column that contains indices
HC_synthetic.to_csv('../data/HC_synthetic.csv',  index=False)

In [64]:
HC_synthetic.head()

Unnamed: 0,Right Deep White Matter Hyperintensity volume (mm3) FLAIR,Left Periventricular Hyperintensity volume (mm3) FLAIR,Right periventricular Hyperintensity volume (mm3) FLAIR,Right cingulum hippocampus FA non zero mean -JHU,Left cingulum hippocampus FA non zero mean -JHU,Corpus Callosum body FA mean TBSS,Corpus Callosum Genu FA mean TBSS,Corpus Callosum Splenium FA mean TBSS,Left superior longitudinal fasciculus FA mean TBSS,Right superior longitudinal fasciculus FA mean TBSS,...,DMN resting state Z correlation,Task positive network Z correlation,2v1 right anterior cingulate deactivation,2v1 right parietal lobule activation,2v1 left middle frontal gyrus activation,2v1 right middle frontal gyrus activation,Left middle frontal gyrus DMS activation,Right middle frontal gyrus DMS activation,Left supramarginal (parietal) gyrus DMS activation,Right supramarginal (parietal) gyrus DMS activation
0,130.807286,813.035998,459.247739,0.269633,0.285973,0.501799,0.597509,0.678414,0.393608,0.422531,...,0.591981,0.39819,-0.227911,0.304223,0.160674,0.206133,-0.120168,-0.085781,0.064789,0.034916
1,263.237796,22.350361,155.129585,0.273971,0.254203,0.510997,0.551954,0.65879,0.426886,0.440729,...,1.017378,1.310264,-0.08418,-0.031344,0.033223,0.153303,0.499022,0.268936,0.180785,0.365769
2,511.626097,255.012934,168.476226,0.280982,0.257612,0.55007,0.598728,0.670345,0.420045,0.442168,...,0.286559,0.584271,-0.011069,0.210292,0.062926,0.15429,-0.123413,-0.123225,-0.050245,-0.071722
3,127.981398,22.301196,208.56168,0.292459,0.315326,0.548028,0.516883,0.654408,0.442434,0.385397,...,0.802976,0.901628,-0.135749,-0.056463,0.101801,-0.072536,0.223148,0.160943,0.202475,0.168307
4,-365.846755,271.60122,-139.062577,0.302352,0.286631,0.62112,0.629356,0.715898,0.488249,0.471175,...,0.944251,0.804766,-0.027156,0.392719,0.186455,0.600588,-0.000238,-0.185327,-0.074945,-0.157422
