In [None]:
import pandas as pd
import os
import numpy as np
import pickle
import matplotlib.pyplot as plt
from scipy.stats import invgamma
import sys
import matplotlib.pyplot as plt

### structural MRI

In [None]:
# Raw structural-MRI grey-matter ROI tables: one workbook for cortical ROIs, one for subcortical ROIs.
smri_cort_path = "./aida_data/smri/smri_/roi_gm_cort_values.xlsx"
smri_sub_path = "./aida_data/smri/smri_/roi_gm_sub_values.xlsx"

# Read both workbooks with default settings (first sheet, first row as header).
smri_cort, smri_sub = (pd.read_excel(path) for path in (smri_cort_path, smri_sub_path))

In [None]:
# Give the first (identifier) column of both tables the common name 'ppid'.
smri_cort = smri_cort.rename(columns={smri_cort.columns[0]: 'ppid'})
smri_sub = smri_sub.rename(columns={smri_sub.columns[0]: 'ppid'})

In [None]:
# Strip the identifier column from the subcortical table and place its ROI columns to the right of the
# cortical ones. The join is on the row index.
# NOTE(review): assumes both workbooks list participants in the same row order — confirm.
smri_sub_dropped = smri_sub.drop(columns=smri_sub.columns[0])
whole_df = pd.concat((smri_cort, smri_sub_dropped), axis=1)

In [None]:
# Drop rows whose identifier contains '_T2'. Missing identifiers count as "no match" and are kept.
# .copy() makes the filtered result an independent frame, so the column assignment below does not
# raise pandas' SettingWithCopyWarning.
whole_df = whole_df[~whole_df.iloc[:, 0].str.contains('_T2', na=False)].copy()

# Characters [-11:-7] of the identifier string are parsed as the integer participant id.
# NOTE(review): this fixed offset depends on the identifier/file-name layout — confirm it holds for all rows.
whole_df['ppid'] = whole_df['ppid'].str.slice(-11, -7).astype('int64')

# Keep only participants whose id starts with '1' or '3'.
whole_df = whole_df[whole_df['ppid'].astype(str).str.startswith(('1', '3'))]

In [None]:
# Positional relabelling: keep the first column name, then 48 cortical labels ('1c'..'48c') followed by
# 21 subcortical labels ('1s'..'21s'). Assigning the list raises if the frame does not have 70 columns.
new_columns = (
    list(whole_df.columns[:1])
    + [f'{i}c' for i in range(1, 49)]
    + [f'{i}s' for i in range(1, 22)]
)
whole_df.columns = new_columns

In [None]:
# Write the merged structural table (identifier + ROI columns) to CSV without the row index.
# NOTE(review): the file lands in the working directory, but the load cell further down reads
# "./aida_data/smri/roi_gm_WHOLE_T1.csv" — presumably the file is moved by hand; confirm.
whole_df.to_csv('roi_gm_WHOLE_T1.csv', index=False)

### fMRI - positive

In [None]:
# Source workbooks for the positive-condition contrast values: cortical and subcortical ROI tables.
fmri_pos_cor_path = "./aida_data/fmri/positive/positive_/roi_contrast_cort_pat_pos_values.xlsx"
fmri_pos_sub_path = "./aida_data/fmri/positive/positive_/roi_contrast_sub_pat_pos_values.xlsx"

In [None]:
# Load each table and discard rows whose first-column identifier contains '_T2'
# (rows with a missing identifier are kept because na=False).
fmri_pos_cor = pd.read_excel(fmri_pos_cor_path)
fmri_pos_cor = fmri_pos_cor.loc[~fmri_pos_cor.iloc[:, 0].str.contains('_T2', na=False)]

fmri_pos_sub = pd.read_excel(fmri_pos_sub_path)
fmri_pos_sub = fmri_pos_sub.loc[~fmri_pos_sub.iloc[:, 0].str.contains('_T2', na=False)]

In [None]:
# Name the identifier (first) column 'ppid' in both tables.
fmri_pos_cor = fmri_pos_cor.rename(columns={fmri_pos_cor.columns[0]: 'ppid'})
fmri_pos_sub = fmri_pos_sub.rename(columns={fmri_pos_sub.columns[0]: 'ppid'})

In [None]:
# Remove the identifier column from the subcortical table, then append its columns to the cortical table.
# The join aligns on the row index that survived the '_T2' filtering of each table.
fmri_pos_sub_dropped = fmri_pos_sub.drop(columns=fmri_pos_sub.columns[0])
whole_df = pd.concat((fmri_pos_cor, fmri_pos_sub_dropped), axis=1)

In [None]:
# Parse the first four characters of each identifier string as the integer participant id.
whole_df['ppid'] = whole_df['ppid'].map(lambda raw_id: int(raw_id[:4]))

In [None]:
# Positional relabelling: identifier column, 48 cortical labels, then 21 subcortical labels.
new_columns = (
    list(whole_df.columns[:1])
    + [f'{i}_pos_cor' for i in range(1, 49)]
    + [f'{i}_pos_sub' for i in range(1, 22)]
)
whole_df.columns = new_columns

In [None]:
# Write the merged positive-contrast table to CSV (no row index).
# NOTE(review): written to the working directory; the load cell below reads it from
# "./aida_data/fmri/positive/" — confirm the file is relocated before that cell runs.
whole_df.to_csv('roi_contrast_WHOLE_pat_pos_values.csv', index=False)

### fMRI - positive - beta1

In [None]:
# Source workbooks for the positive-condition beta1 values: cortical and subcortical ROI tables.
fmri_pos_cor_beta1_path = "./aida_data/fmri/positive/positive_/roi_beta1_cort_pos_values.xlsx"
fmri_pos_sub_beta1_path = "./aida_data/fmri/positive/positive_/roi_beta1_sub_pos_values.xlsx"

In [None]:
# Load each table and discard rows whose first-column identifier contains '-t2'
# (rows with a missing identifier are kept because na=False).
fmri_pos_cor_beta1 = pd.read_excel(fmri_pos_cor_beta1_path)
fmri_pos_cor_beta1 = fmri_pos_cor_beta1.loc[~fmri_pos_cor_beta1.iloc[:, 0].str.contains('-t2', na=False)]

fmri_pos_sub_beta1 = pd.read_excel(fmri_pos_sub_beta1_path)
fmri_pos_sub_beta1 = fmri_pos_sub_beta1.loc[~fmri_pos_sub_beta1.iloc[:, 0].str.contains('-t2', na=False)]

In [None]:
# Name the identifier (first) column 'ppid' in both tables.
fmri_pos_cor_beta1 = fmri_pos_cor_beta1.rename(columns={fmri_pos_cor_beta1.columns[0]: 'ppid'})
fmri_pos_sub_beta1 = fmri_pos_sub_beta1.rename(columns={fmri_pos_sub_beta1.columns[0]: 'ppid'})

In [None]:
# Both frames are results of boolean filtering; take independent copies before writing into them so the
# assignments below do not raise pandas' SettingWithCopyWarning.
fmri_pos_cor_beta1 = fmri_pos_cor_beta1.copy()
fmri_pos_sub_beta1 = fmri_pos_sub_beta1.copy()

# Keep characters 3..6 of every identifier string (presumably the 4-digit participant number — confirm
# against the raw identifier format).
fmri_pos_cor_beta1.iloc[:, 0] = fmri_pos_cor_beta1.iloc[:, 0].str.slice(3, 7)
fmri_pos_sub_beta1.iloc[:, 0] = fmri_pos_sub_beta1.iloc[:, 0].str.slice(3, 7)

In [None]:
# Remove the identifier column from the subcortical table, then append its columns to the cortical table
# (row-index aligned).
fmri_pos_sub_dropped = fmri_pos_sub_beta1.drop(columns=fmri_pos_sub_beta1.columns[0])
whole_df = pd.concat((fmri_pos_cor_beta1, fmri_pos_sub_dropped), axis=1)

In [None]:
# Parse the first four characters of each identifier string as the integer participant id.
whole_df['ppid'] = whole_df['ppid'].map(lambda raw_id: int(raw_id[:4]))

In [None]:
# Positional relabelling: identifier column, 48 cortical labels, then 21 subcortical labels.
new_columns = (
    list(whole_df.columns[:1])
    + [f'{i}_pos_cor_beta1' for i in range(1, 49)]
    + [f'{i}_pos_sub_beta1' for i in range(1, 22)]
)
whole_df.columns = new_columns

In [None]:
# Write the merged positive beta1 table to CSV in the working directory (no row index).
whole_df.to_csv('roi_beta1_WHOLE_pat_pos.csv', index=False)

### fMRI - positive - beta3

In [None]:
# Source workbooks for the positive-condition beta3 values: cortical and subcortical ROI tables.
fmri_pos_cor_beta3_path = "./aida_data/fmri/positive/positive_/roi_beta3_cort_pos_values.xlsx"
fmri_pos_sub_beta3_path = "./aida_data/fmri/positive/positive_/roi_beta3_sub_pos_values.xlsx"

In [None]:
# Load each table and discard rows whose first-column identifier contains '-t2'
# (rows with a missing identifier are kept because na=False).
fmri_pos_cor_beta3 = pd.read_excel(fmri_pos_cor_beta3_path)
fmri_pos_cor_beta3 = fmri_pos_cor_beta3.loc[~fmri_pos_cor_beta3.iloc[:, 0].str.contains('-t2', na=False)]

fmri_pos_sub_beta3 = pd.read_excel(fmri_pos_sub_beta3_path)
fmri_pos_sub_beta3 = fmri_pos_sub_beta3.loc[~fmri_pos_sub_beta3.iloc[:, 0].str.contains('-t2', na=False)]

In [None]:
# Name the identifier (first) column 'ppid' in both tables.
fmri_pos_cor_beta3 = fmri_pos_cor_beta3.rename(columns={fmri_pos_cor_beta3.columns[0]: 'ppid'})
fmri_pos_sub_beta3 = fmri_pos_sub_beta3.rename(columns={fmri_pos_sub_beta3.columns[0]: 'ppid'})

In [None]:
# Both frames are results of boolean filtering; take independent copies before writing into them so the
# assignments below do not raise pandas' SettingWithCopyWarning.
fmri_pos_cor_beta3 = fmri_pos_cor_beta3.copy()
fmri_pos_sub_beta3 = fmri_pos_sub_beta3.copy()

# Keep characters 3..6 of every identifier string (presumably the 4-digit participant number — confirm
# against the raw identifier format).
fmri_pos_cor_beta3.iloc[:, 0] = fmri_pos_cor_beta3.iloc[:, 0].str.slice(3, 7)
fmri_pos_sub_beta3.iloc[:, 0] = fmri_pos_sub_beta3.iloc[:, 0].str.slice(3, 7)

In [None]:
# Remove the identifier column from the subcortical table, then append its columns to the cortical table
# (row-index aligned).
fmri_pos_sub_dropped = fmri_pos_sub_beta3.drop(columns=fmri_pos_sub_beta3.columns[0])
whole_df = pd.concat((fmri_pos_cor_beta3, fmri_pos_sub_dropped), axis=1)

In [None]:
# Parse the first four characters of each identifier string as the integer participant id.
whole_df['ppid'] = whole_df['ppid'].map(lambda raw_id: int(raw_id[:4]))

In [None]:
# Positional relabelling: identifier column, 48 cortical labels, then 21 subcortical labels.
new_columns = (
    list(whole_df.columns[:1])
    + [f'{i}_pos_cor_beta3' for i in range(1, 49)]
    + [f'{i}_pos_sub_beta3' for i in range(1, 22)]
)
whole_df.columns = new_columns

In [None]:
# Write the merged positive beta3 table to CSV in the working directory (no row index).
whole_df.to_csv('roi_beta3_WHOLE_pat_pos.csv', index=False)

### fMRI - negative

In [None]:
# Source workbooks for the negative-condition contrast values: cortical and subcortical ROI tables.
fmri_neg_cor_path = "./aida_data/fmri/negative/negative_/roi_contrast_cort_pat_neg_values.xlsx"
fmri_neg_sub_path = "./aida_data/fmri/negative/negative_/roi_contrast_sub_pat_neg_values.xlsx"

In [None]:
# Load each table and discard rows whose first-column identifier contains '_T2'
# (rows with a missing identifier are kept because na=False).
fmri_neg_cor = pd.read_excel(fmri_neg_cor_path)
fmri_neg_cor = fmri_neg_cor.loc[~fmri_neg_cor.iloc[:, 0].str.contains('_T2', na=False)]

fmri_neg_sub = pd.read_excel(fmri_neg_sub_path)
fmri_neg_sub = fmri_neg_sub.loc[~fmri_neg_sub.iloc[:, 0].str.contains('_T2', na=False)]

In [None]:
# Name the identifier (first) column 'ppid' in both tables.
fmri_neg_cor = fmri_neg_cor.rename(columns={fmri_neg_cor.columns[0]: 'ppid'})
fmri_neg_sub = fmri_neg_sub.rename(columns={fmri_neg_sub.columns[0]: 'ppid'})

In [None]:
# Remove the identifier column from the subcortical table, then append its columns to the cortical table
# (row-index aligned).
fmri_neg_sub_dropped = fmri_neg_sub.drop(columns=fmri_neg_sub.columns[0])
whole_df = pd.concat((fmri_neg_cor, fmri_neg_sub_dropped), axis=1)

In [None]:
# Parse the first four characters of each identifier string as the integer participant id.
whole_df['ppid'] = whole_df['ppid'].map(lambda raw_id: int(raw_id[:4]))

In [None]:
# Positional relabelling: identifier column, 48 cortical labels, then 21 subcortical labels.
new_columns = (
    list(whole_df.columns[:1])
    + [f'{i}_neg_cor' for i in range(1, 49)]
    + [f'{i}_neg_sub' for i in range(1, 22)]
)
whole_df.columns = new_columns

In [None]:
# Write the merged negative-contrast table to CSV (no row index).
# NOTE(review): written to the working directory; the load cell below reads it from
# "./aida_data/fmri/negative/" — confirm the file is relocated before that cell runs.
whole_df.to_csv('roi_contrast_WHOLE_pat_neg_values.csv', index=False)

### fMRI - negative - beta1

In [None]:
# Source workbooks for the negative-condition beta1 values: cortical and subcortical ROI tables.
fmri_neg_cor_beta1_path = "./aida_data/fmri/negative/negative_/roi_beta1_cort_neg_values.xlsx"
fmri_neg_sub_beta1_path = "./aida_data/fmri/negative/negative_/roi_beta1_sub_neg_values.xlsx"

In [None]:
# Load each table and discard rows whose first-column identifier contains '-t2'
# (rows with a missing identifier are kept because na=False).
fmri_neg_cor_beta1 = pd.read_excel(fmri_neg_cor_beta1_path)
fmri_neg_cor_beta1 = fmri_neg_cor_beta1.loc[~fmri_neg_cor_beta1.iloc[:, 0].str.contains('-t2', na=False)]

fmri_neg_sub_beta1 = pd.read_excel(fmri_neg_sub_beta1_path)
fmri_neg_sub_beta1 = fmri_neg_sub_beta1.loc[~fmri_neg_sub_beta1.iloc[:, 0].str.contains('-t2', na=False)]

In [None]:
# Both frames are results of boolean filtering; take independent copies before writing into them so the
# assignments below do not raise pandas' SettingWithCopyWarning.
fmri_neg_cor_beta1 = fmri_neg_cor_beta1.copy()
fmri_neg_sub_beta1 = fmri_neg_sub_beta1.copy()

# Name the identifier (first) column 'ppid' in both tables.
fmri_neg_cor_beta1.rename(columns={fmri_neg_cor_beta1.columns[0]: 'ppid'}, inplace=True)
fmri_neg_sub_beta1.rename(columns={fmri_neg_sub_beta1.columns[0]: 'ppid'}, inplace=True)

# Keep characters 3..6 of every identifier string (presumably the 4-digit participant number — confirm
# against the raw identifier format).
fmri_neg_cor_beta1.iloc[:, 0] = fmri_neg_cor_beta1.iloc[:, 0].str.slice(3, 7)
fmri_neg_sub_beta1.iloc[:, 0] = fmri_neg_sub_beta1.iloc[:, 0].str.slice(3, 7)

In [None]:
# Remove the identifier column from the subcortical table, then append its columns to the cortical table
# (row-index aligned).
fmri_neg_sub_dropped = fmri_neg_sub_beta1.drop(columns=fmri_neg_sub_beta1.columns[0])
whole_df = pd.concat((fmri_neg_cor_beta1, fmri_neg_sub_dropped), axis=1)

In [None]:
# Parse the first four characters of each identifier string as the integer participant id.
whole_df['ppid'] = whole_df['ppid'].map(lambda raw_id: int(raw_id[:4]))

In [None]:
# Positional relabelling: identifier column, 48 cortical labels, then 21 subcortical labels.
new_columns = (
    list(whole_df.columns[:1])
    + [f'{i}_neg_cor_beta1' for i in range(1, 49)]
    + [f'{i}_neg_sub_beta1' for i in range(1, 22)]
)
whole_df.columns = new_columns

In [None]:
# Write the merged negative beta1 table to CSV in the working directory (no row index).
whole_df.to_csv('roi_beta1_WHOLE_pat_neg.csv', index=False)

### fMRI - negative - beta3

In [None]:
# Source workbooks for the negative-condition beta3 values: cortical and subcortical ROI tables.
fmri_neg_cor_beta3_path = "./aida_data/fmri/negative/negative_/roi_beta3_cort_neg_values.xlsx"
fmri_neg_sub_beta3_path = "./aida_data/fmri/negative/negative_/roi_beta3_sub_neg_values.xlsx"

In [None]:
# Load each table and discard rows whose first-column identifier contains '-t2'
# (rows with a missing identifier are kept because na=False).
fmri_neg_cor_beta3 = pd.read_excel(fmri_neg_cor_beta3_path)
fmri_neg_cor_beta3 = fmri_neg_cor_beta3.loc[~fmri_neg_cor_beta3.iloc[:, 0].str.contains('-t2', na=False)]

fmri_neg_sub_beta3 = pd.read_excel(fmri_neg_sub_beta3_path)
fmri_neg_sub_beta3 = fmri_neg_sub_beta3.loc[~fmri_neg_sub_beta3.iloc[:, 0].str.contains('-t2', na=False)]

In [None]:
# Both frames are results of boolean filtering; take independent copies before writing into them so the
# assignments below do not raise pandas' SettingWithCopyWarning.
fmri_neg_cor_beta3 = fmri_neg_cor_beta3.copy()
fmri_neg_sub_beta3 = fmri_neg_sub_beta3.copy()

# Name the identifier (first) column 'ppid' in both tables.
fmri_neg_cor_beta3.rename(columns={fmri_neg_cor_beta3.columns[0]: 'ppid'}, inplace=True)
fmri_neg_sub_beta3.rename(columns={fmri_neg_sub_beta3.columns[0]: 'ppid'}, inplace=True)

# Keep characters 3..6 of every identifier string (presumably the 4-digit participant number — confirm
# against the raw identifier format).
fmri_neg_cor_beta3.iloc[:, 0] = fmri_neg_cor_beta3.iloc[:, 0].str.slice(3, 7)
fmri_neg_sub_beta3.iloc[:, 0] = fmri_neg_sub_beta3.iloc[:, 0].str.slice(3, 7)

In [None]:
# Remove the identifier column from the subcortical table, then append its columns to the cortical table
# (row-index aligned).
fmri_neg_sub_dropped = fmri_neg_sub_beta3.drop(columns=fmri_neg_sub_beta3.columns[0])
whole_df = pd.concat((fmri_neg_cor_beta3, fmri_neg_sub_dropped), axis=1)

In [None]:
# Parse the first four characters of each identifier string as the integer participant id.
whole_df['ppid'] = whole_df['ppid'].map(lambda raw_id: int(raw_id[:4]))

In [None]:
# Positional relabelling: identifier column, 48 cortical labels, then 21 subcortical labels.
new_columns = (
    list(whole_df.columns[:1])
    + [f'{i}_neg_cor_beta3' for i in range(1, 49)]
    + [f'{i}_neg_sub_beta3' for i in range(1, 22)]
)
whole_df.columns = new_columns

In [None]:
# Write the merged negative beta3 table to CSV in the working directory (no row index).
whole_df.to_csv('roi_beta3_WHOLE_pat_neg.csv', index=False)

### EEG - neutral

In [None]:
# One workbook per frequency band for the neutral-condition EEG data.
eeg_neutral_alpha_path = "./aida_data/eeg/neutral/neutral_/neutral_alpha.xlsx"
eeg_neutral_beta_path = "./aida_data/eeg/neutral/neutral_/neutral_beta.xlsx"
eeg_neutral_delta_path = "./aida_data/eeg/neutral/neutral_/neutral_delta.xlsx"
eeg_neutral_gamma_path = "./aida_data/eeg/neutral/neutral_/neutral_gamma.xlsx"
eeg_neutral_theta_path = "./aida_data/eeg/neutral/neutral_/neutral_theta.xlsx"

In [None]:
# Read the five neutral-condition band workbooks (alpha, beta, delta, gamma, theta) in one pass.
(
    eeg_neutral_alpha,
    eeg_neutral_beta,
    eeg_neutral_delta,
    eeg_neutral_gamma,
    eeg_neutral_theta,
) = (
    pd.read_excel(band_path)
    for band_path in (
        eeg_neutral_alpha_path,
        eeg_neutral_beta_path,
        eeg_neutral_delta_path,
        eeg_neutral_gamma_path,
        eeg_neutral_theta_path,
    )
)

In [None]:
# Tag every data column with condition ('N') and band so names stay unique after the bands are merged.
# The first (identifier) column keeps its name. Note the theta tag is 'Q', not 't'.
eeg_neutral_alpha.columns = [eeg_neutral_alpha.columns[0], *(f'{col}_N_a' for col in eeg_neutral_alpha.columns[1:])]
eeg_neutral_beta.columns = [eeg_neutral_beta.columns[0], *(f'{col}_N_b' for col in eeg_neutral_beta.columns[1:])]
eeg_neutral_delta.columns = [eeg_neutral_delta.columns[0], *(f'{col}_N_d' for col in eeg_neutral_delta.columns[1:])]
eeg_neutral_gamma.columns = [eeg_neutral_gamma.columns[0], *(f'{col}_N_g' for col in eeg_neutral_gamma.columns[1:])]
eeg_neutral_theta.columns = [eeg_neutral_theta.columns[0], *(f'{col}_N_Q' for col in eeg_neutral_theta.columns[1:])]

In [None]:
# Every band except alpha loses its identifier column; alpha supplies the identifier for the merged table.
(
    eeg_neutral_beta_dropped,
    eeg_neutral_delta_dropped,
    eeg_neutral_gamma_dropped,
    eeg_neutral_theta_dropped,
) = (
    band.iloc[:, 1:]
    for band in (eeg_neutral_beta, eeg_neutral_delta, eeg_neutral_gamma, eeg_neutral_theta)
)

# Side-by-side join on the row index.
# NOTE(review): assumes all five workbooks list participants in the same row order — confirm.
eeg_neutral_all_bands = pd.concat(
    [
        eeg_neutral_alpha,
        eeg_neutral_beta_dropped,
        eeg_neutral_delta_dropped,
        eeg_neutral_gamma_dropped,
        eeg_neutral_theta_dropped,
    ],
    axis=1,
)

In [None]:
# Name the identifier column 'ppid', then remove every participant whose id starts with '4'.
eeg_neutral_all_bands = eeg_neutral_all_bands.rename(columns={eeg_neutral_all_bands.columns[0]: 'ppid'})
eeg_neutral_all_bands = eeg_neutral_all_bands.loc[
    ~eeg_neutral_all_bands['ppid'].astype(str).str.startswith('4')
]

In [None]:
# Write the merged neutral-condition table (all five bands) to CSV in the working directory (no row index).
eeg_neutral_all_bands.to_csv('neutral_all_bands.csv', index=False)

### EEG - neutral - grouped_by2

In [None]:
# Convert the pre-averaged neutral-condition workbook to CSV unchanged (no filtering or renaming here).
# NOTE(review): the CSV is written to the working directory, while the "upload EEG" cell reads it from
# "./aida_data/eeg/neutral/" — confirm the file is relocated in between.
eeg_neutral_path = "./aida_data/eeg/neutral/neutral_/eeg_avg_neutral.xlsx"
eeg_neutral = pd.read_excel(eeg_neutral_path)
eeg_neutral.to_csv('eeg_neutral_all_bands_grouped_by2.csv', index=False)

### EEG - sad

In [None]:
# One workbook per frequency band for the sad-condition EEG data.
eeg_sad_alpha_path = "./aida_data/eeg/sad/sad_/sad_alpha.xlsx"
eeg_sad_beta_path = "./aida_data/eeg/sad/sad_/sad_beta.xlsx"
eeg_sad_delta_path = "./aida_data/eeg/sad/sad_/sad_delta.xlsx"
eeg_sad_gamma_path = "./aida_data/eeg/sad/sad_/sad_gamma.xlsx"
eeg_sad_theta_path = "./aida_data/eeg/sad/sad_/sad_theta.xlsx"

In [None]:
# Read the five sad-condition band workbooks (alpha, beta, delta, gamma, theta) in one pass.
(
    eeg_sad_alpha,
    eeg_sad_beta,
    eeg_sad_delta,
    eeg_sad_gamma,
    eeg_sad_theta,
) = (
    pd.read_excel(band_path)
    for band_path in (
        eeg_sad_alpha_path,
        eeg_sad_beta_path,
        eeg_sad_delta_path,
        eeg_sad_gamma_path,
        eeg_sad_theta_path,
    )
)

In [None]:
# Tag every data column with condition ('S') and band so names stay unique after the bands are merged.
# The first (identifier) column keeps its name. Note the theta tag is 'Q', not 't'.
eeg_sad_alpha.columns = [eeg_sad_alpha.columns[0], *(f'{col}_S_a' for col in eeg_sad_alpha.columns[1:])]
eeg_sad_beta.columns = [eeg_sad_beta.columns[0], *(f'{col}_S_b' for col in eeg_sad_beta.columns[1:])]
eeg_sad_delta.columns = [eeg_sad_delta.columns[0], *(f'{col}_S_d' for col in eeg_sad_delta.columns[1:])]
eeg_sad_gamma.columns = [eeg_sad_gamma.columns[0], *(f'{col}_S_g' for col in eeg_sad_gamma.columns[1:])]
eeg_sad_theta.columns = [eeg_sad_theta.columns[0], *(f'{col}_S_Q' for col in eeg_sad_theta.columns[1:])]

In [None]:
# Every band except alpha loses its identifier column; alpha supplies the identifier for the merged table.
(
    eeg_sad_beta_dropped,
    eeg_sad_delta_dropped,
    eeg_sad_gamma_dropped,
    eeg_sad_theta_dropped,
) = (
    band.iloc[:, 1:]
    for band in (eeg_sad_beta, eeg_sad_delta, eeg_sad_gamma, eeg_sad_theta)
)

# Side-by-side join on the row index.
# NOTE(review): assumes all five workbooks list participants in the same row order — confirm.
eeg_sad_all_bands = pd.concat(
    [
        eeg_sad_alpha,
        eeg_sad_beta_dropped,
        eeg_sad_delta_dropped,
        eeg_sad_gamma_dropped,
        eeg_sad_theta_dropped,
    ],
    axis=1,
)

In [None]:
# Name the identifier column 'ppid', then remove every participant whose id starts with '4'.
eeg_sad_all_bands = eeg_sad_all_bands.rename(columns={eeg_sad_all_bands.columns[0]: 'ppid'})
eeg_sad_all_bands = eeg_sad_all_bands.loc[
    ~eeg_sad_all_bands['ppid'].astype(str).str.startswith('4')
]

In [None]:
# Write the merged sad-condition table (all five bands) to CSV in the working directory (no row index).
eeg_sad_all_bands.to_csv('sad_all_bands.csv', index=False)

### EEG - sad - grouped_by2

In [None]:
# Convert the pre-averaged sad-condition workbook to CSV unchanged (no filtering or renaming here).
# NOTE(review): the CSV is written to the working directory, while the "upload EEG" cell reads it from
# "./aida_data/eeg/sad/" — confirm the file is relocated in between.
eeg_sad_path = "./aida_data/eeg/sad/sad_/eeg_avg_sad.xlsx"
eeg_sad = pd.read_excel(eeg_sad_path)
eeg_sad.to_csv('eeg_sad_all_bands_grouped_by2.csv', index=False)

### Behaviour - fert

In [None]:
# Raw behavioural workbook for the 'fert' task.
fert_path = "./aida_data/behavioral/behavioral_/fert.xlsx"

fert = pd.read_excel(fert_path)

In [None]:
# Keep rows whose second column equals 1, then discard that column.
# NOTE(review): the second column is presumably a session/time-point flag — confirm.
fert = fert.loc[fert.iloc[:, 1] == 1].drop(columns=fert.columns[1])

# Relabel positionally: identifier -> 'ppid', remaining columns -> integers 1..n.
new_column_names = ['ppid', *range(1, fert.shape[1])]
fert.columns = new_column_names

# Keep only participants whose id starts with '1' or '3'.
fert = fert.loc[fert['ppid'].astype(str).str.startswith(('1', '3'))]

In [None]:
# Write the cleaned table to CSV (no row index).
# NOTE(review): written to the working directory; the behavioural load cell reads
# "./aida_data/behavioral/fert.csv" — confirm the file is relocated in between.
fert.to_csv('fert.csv', index=False)

### Behaviour - effort 

In [None]:
# Raw behavioural workbook for the 'effort' task.
effort_path = "./aida_data/behavioral/behavioral_/effort.xlsx"

effort = pd.read_excel(effort_path)

In [None]:
# Keep rows whose second column equals 1, then discard that column.
# NOTE(review): the second column is presumably a session/time-point flag — confirm.
effort = effort.loc[effort.iloc[:, 1] == 1].drop(columns=effort.columns[1])

# Relabel positionally: identifier -> 'ppid', remaining columns -> integers 1..n.
new_column_names = ['ppid', *range(1, effort.shape[1])]
effort.columns = new_column_names

# Keep only participants whose id starts with '1' or '3'.
effort = effort.loc[effort['ppid'].astype(str).str.startswith(('1', '3'))]

In [None]:
# Write the cleaned table to CSV (no row index).
# NOTE(review): written to the working directory; the behavioural load cell reads
# "./aida_data/behavioral/effort.csv" — confirm the file is relocated in between.
effort.to_csv('effort.csv', index=False)

### Load MRI

In [None]:
# Prepared CSV tables produced by the cells above (structural + positive/negative contrast fMRI).
smri_path = "./aida_data/smri/roi_gm_WHOLE_T1.csv"

pos_fmri_path = "./aida_data/fmri/positive/roi_contrast_WHOLE_pat_pos_values.csv"
neg_fmri_path = "./aida_data/fmri/negative/roi_contrast_WHOLE_pat_neg_values.csv"

# Alternative inputs (disabled): beta1/beta3 tables instead of the contrast tables.
#pos_fmri_beta1_path = "./aida_data/fmri/positive/roi_beta1_WHOLE_pat_pos.csv"
#pos_fmri_beta3_path = "./aida_data/fmri/positive/roi_beta3_WHOLE_pat_pos.csv"
#neg_fmri_beta1_path = "./aida_data/fmri/negative/roi_beta1_WHOLE_pat_neg.csv"
#neg_fmri_beta3_path = "./aida_data/fmri/negative/roi_beta3_WHOLE_pat_neg.csv"

In [None]:
# Load the prepared structural table.
smri_df = pd.read_csv(smri_path)

# Load the prepared contrast tables for the positive and negative conditions.
pos_fmri_df = pd.read_csv(pos_fmri_path)
neg_fmri_df = pd.read_csv(neg_fmri_path)

# Disabled alternative: feed the beta1 tables through the same variable names.
#pos_fmri_df = pd.read_csv(pos_fmri_beta1_path)
#neg_fmri_df = pd.read_csv(neg_fmri_beta1_path)

# Disabled alternative: keep beta1/beta3 as separate frames.
#pos_fmri_beta1_df = pd.read_csv(pos_fmri_beta1_path)
#pos_fmri_beta3_df = pd.read_csv(pos_fmri_beta3_path)

#neg_fmri_beta1_df = pd.read_csv(neg_fmri_beta1_path)
#neg_fmri_beta3_df = pd.read_csv(neg_fmri_beta3_path)

### Experiments with beta

In [None]:
# Remove a trailing '_beta<digit>' tag from the column names so a beta table can flow through the same
# '<n>_pos_cor' / '<n>_neg_sub' selection code as the contrast tables.
# Previously the last six characters were cut unconditionally, which is only right for beta tables; on
# the contrast tables (the ones loaded by default) it mangled e.g. '1_pos_cor' into '1_p' and broke the
# column selection further down. Names without the tag, including 'ppid', are now left untouched.
pos_fmri_df.columns = pos_fmri_df.columns.str.replace(r'_beta\d$', '', regex=True)
neg_fmri_df.columns = neg_fmri_df.columns.str.replace(r'_beta\d$', '', regex=True)

### Reduce the Dimensions of structural MRI data 

In [None]:
# ROI subset kept for the structural analysis: 19 cortical ('<n>c') and 14 subcortical ('<n>s') columns.
columns_to_keep_smri = (
    ['ppid']
    + [f'{n}c' for n in (2, 3, 4, 5, 6, 9, 10, 11, 12, 19, 20, 25, 28, 29, 30, 31, 33, 34, 35)]
    + [f'{n}s' for n in (4, 5, 6, 7, 9, 10, 11, 15, 16, 17, 18, 19, 20, 21)]
)

smri_df = smri_df[columns_to_keep_smri]

# Readable region names, applied positionally: entry k names column k of columns_to_keep_smri.
# NOTE(review): the index-to-region mapping presumably follows the atlas used upstream — confirm.
features_smri = [
    'ppid',
    # cortical
    'InsC', 'SFG', 'MFG', 'IFGtri', 'IFGop', 'aSTG', 'pSTG', 'aMTG', 'pMTG', 'aSMG', 'pSMG',
    'FMC', 'PCG', 'aCG', 'pCG', 'PCun', 'FOC', 'aPHG', 'pPHG',
    # subcortical, left then right
    'L_Tha', 'L_Cau', 'L_Put', 'L_Pal', 'L_Hipp', 'L_Amy', 'L_Acc',
    'R_Tha', 'R_Cau', 'R_Put', 'R_Pal', 'R_Hipp', 'R_Amy', 'R_Acc',
]

smri_df.columns = features_smri

### Reduce the Dimensions of positive fMRI  

In [None]:
"""
columns_to_keep_pos_fmri =  ['ppid', '1_pos_cor', '2_pos_cor', '3_pos_cor', '4_pos_cor', '13_pos_cor', '14_pos_cor', '15_pos_cor', '22_pos_cor', '23_pos_cor', 
                             '24_pos_cor', '25_pos_cor', '29_pos_cor', '32_pos_cor', '36_pos_cor', '37_pos_cor', '38_pos_cor', '41_pos_cor', '47_pos_cor',
                             '48_pos_cor', '10_pos_sub', '20_pos_sub']"""


# ROI subset kept for the positive condition: 23 cortical and 6 subcortical columns.
columns_to_keep_pos_fmri = (
    ['ppid']
    + [f'{n}_pos_cor' for n in (1, 2, 3, 4, 5, 6, 9, 10, 13, 14, 15, 20, 25, 28, 29, 30,
                                31, 33, 34, 35, 36, 37, 38)]
    + [f'{n}_pos_sub' for n in (4, 9, 10, 15, 19, 20)]
)

pos_fmri_df = pos_fmri_df[columns_to_keep_pos_fmri]

# Readable region names, applied positionally. The trailing underscore distinguishes these from the
# negative-condition names, which use the same labels without it.
features_pos_fmri = [
    'ppid',
    # cortical
    'FP_', 'INS_', 'SFG_', 'MFG_', 'IFGtri_', 'IFGop_', 'aSTG_', 'pSTG_', 'MTGto_', 'aITG_', 'pITG_',
    'pSMG_', 'FMC_', 'PCG_', 'aCG_', 'pCG_', 'PCun_', 'FOC_', 'aPHG_', 'pPHG_', 'LG_', 'aTFC_', 'pTFC_',
    # subcortical, left then right
    'L_Tha_', 'L_Hipp_', 'L_Amy_', 'R_Tha_', 'R_Hipp_', 'R_Amy_',
]
pos_fmri_df.columns = features_pos_fmri

### Reduce the Dimensions of negative fMRI 

In [None]:
"""
columns_to_keep_neg_fmri = ['ppid', '1_neg_cor', '2_neg_cor', '3_neg_cor', '4_neg_cor', '13_neg_cor', '14_neg_cor', '15_neg_cor',  '22_neg_cor', '23_neg_cor',
                            '24_neg_cor', '25_neg_cor', '29_neg_cor', '32_neg_cor', '36_neg_cor',  '37_neg_cor', '38_neg_cor', '41_neg_cor', '47_neg_cor',
                            '48_neg_cor', '10_neg_sub', '20_neg_sub']"""

# ROI subset kept for the negative condition: 23 cortical and 6 subcortical columns.
columns_to_keep_neg_fmri = (
    ['ppid']
    + [f'{n}_neg_cor' for n in (1, 2, 3, 4, 5, 6, 9, 10, 13, 14, 15, 20, 25, 28, 29, 30,
                                31, 33, 34, 35, 36, 37, 38)]
    + [f'{n}_neg_sub' for n in (4, 9, 10, 15, 19, 20)]
)

neg_fmri_df = neg_fmri_df[columns_to_keep_neg_fmri]

# Readable region names, applied positionally (same regions as the positive condition, no trailing '_').
neg_fmri_features = [
    'ppid',
    # cortical
    'FP', 'INS', 'SFG', 'MFG', 'IFGtri', 'IFGop', 'aSTG', 'pSTG', 'MTGto', 'aITG', 'pITG',
    'pSMG', 'FMC', 'PCG', 'aCG', 'pCG', 'PCun', 'FOC', 'aPHG', 'pPHG', 'LG', 'aTFC', 'pTFC',
    # subcortical, left then right
    'L_Tha', 'L_Hipp', 'L_Amy', 'R_Tha', 'R_Hipp', 'R_Amy',
]

neg_fmri_df.columns = neg_fmri_features

### Load EEG

In [None]:
# Pre-averaged ("grouped_by2") EEG tables, one per condition.
sad_eeg_path = "./aida_data/eeg/sad/eeg_sad_all_bands_grouped_by2.csv"
neutral_eeg_path = "./aida_data/eeg/neutral/eeg_neutral_all_bands_grouped_by2.csv"

sad_eeg_df, neutral_eeg_df = (pd.read_csv(csv_path) for csv_path in (sad_eeg_path, neutral_eeg_path))

### Load EEG channels

In [None]:
# Per-band, channel-level workbooks for the sad condition (the same files the EEG build section reads).
sad_alpha_path = "./aida_data/eeg/sad/sad_/sad_alpha.xlsx"
sad_beta_path = "./aida_data/eeg/sad/sad_/sad_beta.xlsx"
sad_delta_path = "./aida_data/eeg/sad/sad_/sad_delta.xlsx"
sad_gamma_path = "./aida_data/eeg/sad/sad_/sad_gamma.xlsx"
sad_theta_path = "./aida_data/eeg/sad/sad_/sad_theta.xlsx"

In [None]:
# Per-band, channel-level workbooks for the neutral condition.
# NOTE(review): within the visible part of this notebook only the sad-condition files are read back;
# these paths appear to be unused — confirm before removing.
neutral_alpha_path = "./aida_data/eeg/neutral/neutral_/neutral_alpha.xlsx"
neutral_beta_path = "./aida_data/eeg/neutral/neutral_/neutral_beta.xlsx"
neutral_delta_path = "./aida_data/eeg/neutral/neutral_/neutral_delta.xlsx"
neutral_gamma_path = "./aida_data/eeg/neutral/neutral_/neutral_gamma.xlsx"
neutral_theta_path = "./aida_data/eeg/neutral/neutral_/neutral_theta.xlsx"

In [None]:
# Read the five sad-condition band workbooks in one pass.
sad_alpha_df, sad_beta_df, sad_delta_df, sad_gamma_df, sad_theta_df = (
    pd.read_excel(band_path)
    for band_path in (sad_alpha_path, sad_beta_path, sad_delta_path, sad_gamma_path, sad_theta_path)
)

In [None]:
# First column becomes 'ppid'; every other column gets the band name appended so names stay unique
# once the bands are combined.
sad_alpha_df.columns = ["ppid", *(f"{col}_alpha" for col in sad_alpha_df.columns[1:])]
sad_beta_df.columns = ["ppid", *(f"{col}_beta" for col in sad_beta_df.columns[1:])]
sad_delta_df.columns = ["ppid", *(f"{col}_delta" for col in sad_delta_df.columns[1:])]
sad_gamma_df.columns = ["ppid", *(f"{col}_gamma" for col in sad_gamma_df.columns[1:])]
sad_theta_df.columns = ["ppid", *(f"{col}_theta" for col in sad_theta_df.columns[1:])]

In [None]:
"""
sad_eeg_df = pd.concat([sad_alpha_df] + [df.drop(df.columns[0], axis=1) for df in [sad_beta_df, sad_delta_df, sad_gamma_df, sad_theta_df]], axis=1)
sad_eeg_df.columns = ['ppid'] + list(sad_eeg_df.columns[1:])
"""

### selecting bands from eeg

In [None]:
# Band selection by column-name substring; 'ppid' is always kept.
# Sad condition: alpha columns only. A second, wider alpha/beta filter used to follow here, but it ran on
# the already alpha-only frame and therefore changed nothing.
sad_eeg_df = sad_eeg_df.loc[
    :, [name for name in sad_eeg_df.columns if name == 'ppid' or '_alpha' in name]
]

# Neutral condition: alpha and beta columns. The former follow-up alpha/beta/theta filter likewise could
# not bring theta back once it had been dropped.
# NOTE(review): sad keeps alpha only while neutral keeps alpha+beta — confirm this asymmetry is intended.
neutral_eeg_df = neutral_eeg_df.loc[
    :, [name for name in neutral_eeg_df.columns if name == 'ppid' or '_alpha' in name or '_beta' in name]
]

### preprocess behavioral

In [None]:
# Prepared behavioural tables (written by the 'Behaviour' sections above).
fert_path = "./aida_data/behavioral/fert.csv"
effort_path = "./aida_data/behavioral/effort.csv"

fert_df, effort_df = (pd.read_csv(csv_path) for csv_path in (fert_path, effort_path))

In [None]:
#fert_df.columns = [fert_df.columns[0]] + ["fert_" + col for col in fert_df.columns[1:]]
# Prefix every effort column except the identifier with 'effort_'. The column names are strings here
# because the table was read back from CSV.
effort_df.columns = [effort_df.columns[0], *("effort_" + col for col in effort_df.columns[1:])]

In [None]:
# Readable names for the six fert columns, applied positionally after the identifier column.
# NOTE(review): this rebinds `fert`, which held the fert DataFrame in the 'Behaviour - fert' section, to a
# plain list; re-running that section's cleaning cell afterwards would fail.
# NOTE(review): 'Anger_' carries a trailing underscore, presumably to avoid clashing with the
# questionnaire column 'Anger' — confirm.
fert = ['Anger_', 'Fearful', 'Happy', 'Sad', 'Neutral', 'Disgusted']

fert_df.columns = [fert_df.columns[0]] + fert

### preprocess questionnaire and clinical data

In [None]:
# Workbooks holding questionnaire scores, clinical/blood measures and demographic/confounding variables.
questionnaire_path = "./aida_data/questionnaire/questionnaire.xlsx"
clinical_path = "./aida_data/questionnaire/clinical_blood.xlsx"
demography_path = "./aida_data/questionnaire/demographic_confounding.xlsx"

In [None]:
# Read the three workbooks with default settings.
questionnaire_df, clinical_df, demography_df = (
    pd.read_excel(workbook) for workbook in (questionnaire_path, clinical_path, demography_path)
)

### RELAPSE STUDIES - remove CONTROLS 

In [None]:
# For each table: keep rows with isControl == 0 whose ppid starts with '1' or '3', then discard the
# 'isControl' column, which is constant after the filter.
questionnaire_df = (
    questionnaire_df
    .loc[lambda d: (d['isControl'] == 0) & (d['ppid'].astype(str).str.startswith(('1', '3')))]
    .drop(columns=['isControl'])
)
clinical_df = (
    clinical_df
    .loc[lambda d: (d['isControl'] == 0) & (d['ppid'].astype(str).str.startswith(('1', '3')))]
    .drop(columns=['isControl'])
)
demography_df = (
    demography_df
    .loc[lambda d: (d['isControl'] == 0) & (d['ppid'].astype(str).str.startswith(('1', '3')))]
    .drop(columns=['isControl'])
)

### modify questionnaire data - Quentin's suggestions

In [None]:
# Remove six questionnaire columns from the analysis (per the section heading, on Quentin's suggestion).
# Raises KeyError if any of them is absent.
questionnaire_df = questionnaire_df.drop(
    [
        "Baseline_ERQ_UERQ09_",
        "IDS_GAF_Baseline_hamd_total",
        "IDS_GAF_Baseline_idsc_total",
        "baselinecomplete_SeverelyImpairedActivity",
        "baselinecomplete_SeverelyImpairedSocialFunc",
        "baselinecomplete_SeverelyImpairedWorkFunc",
    ],
    axis="columns",
)

### Questionnaire reduce dimensions - Midway

In [None]:
# "Midway" reduction of the questionnaire table.
#
# The first six columns are the same ones the preceding "Quentin's suggestions" cell removes. When both
# cells run in sequence a strict drop raised KeyError here, so these six are dropped leniently
# (errors="ignore"): they are removed if still present and skipped otherwise.
questionnaire_df = questionnaire_df.drop(
    columns=[
        "Baseline_ERQ_UERQ09_",
        "IDS_GAF_Baseline_hamd_total",
        "IDS_GAF_Baseline_idsc_total",
        "baselinecomplete_SeverelyImpairedActivity",
        "baselinecomplete_SeverelyImpairedSocialFunc",
        "baselinecomplete_SeverelyImpairedWorkFunc",
    ],
    errors="ignore",
)

# Columns specific to this reduction stay strict, so a misspelt or already-missing name still raises.
questionnaire_df = questionnaire_df.drop(columns=[
    "Main1_scl_sumscore",
    "Main1_scl_otheritemssum",
    "Baseline_cerq_Akzep",
    "Baseline_cerq_AnBes",
    "Baseline_cerq_Detach",
    "Baseline_cerq_Katast",
    "Baseline_cerq_Neube",
    "Baseline_cerq_Plan",
    "Baseline_cerq_PoRef",
    "Baseline_cerq_Relat",
    "Baseline_cerq_Rumi",
    "Baseline_cerq_SeBes",
    "Baseline_csq_total",
    "Baseline_erq_reappraisal",
    "Baseline_es_total",
    "Baseline_ctq_emotionalAbuse",
    "Baseline_ctq_emotionalNeglect",
    "Baseline_ctq_physicalAbuse",
    "Baseline_ctq_physicalNeglect",
    "Baseline_ctq_sexualAbuse",
    "Baseline_ctq_trivilization",
    "IDS_GAF_Main1_idsc_total",
    "Main1_panass_panassneg",
    "Main1_panass_panasspos",
    "Baseline_panast_panastneg",
    "Baseline_panast_panastpos",
    "Main1_sek_total",
    "IDS_GAF_Main1_GAF",
    "numberDiagSkidAll",
])

In [None]:
# RELAPSE 
questionnaire_features = ['ppid','HAMD', 'GAD', 'TEPS', 'Brooding', 'Reflection', 'Aggression', 'Anger', 'Depression', 'Paranoia', 'Phobia',
                          'Psychoticism','Somatization', 'Unspecific', 'Compulsion', 'Acceptance', 'Attention', 'Perception', 'Clarity', 'Confront', 'Regulation',
                          'Resilience', 'Understand', 'ACE', 'BRS', 'OSLO', 'SWLS', 'Optimism', 'Pessimism', 'Self_Efficacy', 'STAI-T', 'BSCS', 'Agreeable',
                          'Conscientious', 'Extroversion', 'Neuroticism', 'Openness', 'MWT-B', 'Digit_Span_Back', 'TMT-A', 'TMT-B']

questionnaire_df.columns = questionnaire_features

In [None]:
# DISEASE (has 'isControl')

questionnaire_df = questionnaire_df.drop(columns=['isControl'])

questionnaire_features = ['ppid', 'HAMD', 'GAD', 'TEPS', 'Brooding', 'Reflection', 'Aggression', 'Anger', 'Depression', 'Paranoia', 'Phobia',
                          'Psychoticism','Somatization', 'Unspecific', 'Compulsion', 'Acceptance', 'Attention', 'Perception', 'Clarity', 'Confront', 'Regulation',
                          'Resilience', 'Understand', 'ACE', 'BRS', 'OSLO', 'SWLS', 'Optimism', 'Pessimism', 'Self_Efficacy', 'STAI-T', 'BSCS', 'Agreeable',
                          'Conscientious', 'Extroversion', 'Neuroticism', 'Openness', 'MWT-B', 'Digit_Span_Back', 'TMT-A', 'TMT-B']

questionnaire_df.columns = questionnaire_features

### RELAPSE STUDIES - do not use these in any study at all 

In [None]:
# Remove clinical columns that must not enter any relapse analysis (per the section heading).
clinical_df = clinical_df.drop(
    [
        # blood / cortisol measures
        'overviewSheet_CRP_Main1',
        'overviewSheet_Hb_Main1',
        'overviewSheet_Leuko_Main1',
        'overviewSheet_MCV_Main1',
        'overviewSheet_TSH_Main1',
        'cortisolData_Average',
        # medication and last-episode severity
        'medicationClass',
        'MedicationList_medicationDosage',
        'medicationLoad',
        'highestDosage',
        'severityLastEpisode',
    ],
    axis="columns",
)

### Experimenting with blood samples

In [None]:
# Experimental variant: keep only the identifier plus the blood/cortisol measures.
# NOTE(review): this selects exactly the columns the preceding "do not use these" cell drops, so the two
# cells are mutually exclusive — running both in order raises KeyError here.
clinical_df = clinical_df[[
    'ppid',
    'overviewSheet_CRP_Main1',
    'overviewSheet_Hb_Main1',
    'overviewSheet_Leuko_Main1',
    'overviewSheet_MCV_Main1',
    'overviewSheet_TSH_Main1',
    'cortisolData_Average'
]]

### CONTROL STUDIES - adjust clinical data  

In [None]:
# Control-study variant: remove medication and illness-course columns from the clinical table.
# NOTE(review): overlaps with the relapse-study drop above (medication columns, severityLastEpisode);
# the two cells are alternatives and cannot both run on the same frame without a KeyError.
clinical_df = clinical_df.drop(
    [
        # medication
        'medicationClass',
        'MedicationList_medicationDosage',
        'medicationLoad',
        'highestDosage',
        # illness course
        'lengthADMIntake',
        'ageOnset',
        'monthDepressed',
        'monthDepressedLastFiveYears',
        'timeInRemission',
        'timeSinceIllnessOnset',
        'nEpisodes',
        'severityLastEpisode',
    ],
    axis="columns",
)

### CONTROL STUDIES - remove duplicate columns 

In [None]:
# Drop the 'isControl' column from both tables if it is still present (demography_df is left untouched
# here). The relapse-study filter cell already removes it, in which case a strict drop raised KeyError;
# errors='ignore' lets this cell run in either workflow.
clinical_df = clinical_df.drop(columns=['isControl'], errors='ignore')
questionnaire_df = questionnaire_df.drop(columns=['isControl'], errors='ignore')

### ALTERNATIVE SUBGROUPS!

In [None]:
# Reference list only: these bare string expressions have no effect (the cell merely echoes the last one).
# They are the questionnaire columns used to form the alternative subgroups below.
'baselinecomplete_MWT_B_TotalScore'
'baselinecomplete_DigitspanScore'
'baselinecomplete_TMT_A_Time_sec_'
'baselinecomplete_TMT_B_time_sec_'

# mwt_b 

In [None]:
# MWT-B score for the shared cohort. An explicit .copy() is taken because a later cell overwrites the
# score column of this frame; writing into a filtered slice raises pandas' SettingWithCopyWarning.
# NOTE(review): `shared_ppid_list` is defined further down ("identify common patients"), and the original
# column name only exists while the questionnaire columns have not been renamed — this cell depends on
# out-of-order execution.
mwt_b_df = questionnaire_df.loc[
    questionnaire_df['ppid'].isin(shared_ppid_list),
    ['ppid', 'baselinecomplete_MWT_B_TotalScore'],
].copy()

# Summary statistics and histogram of the raw score.
distribution_stats = mwt_b_df['baselinecomplete_MWT_B_TotalScore'].describe()
plt.hist(mwt_b_df['baselinecomplete_MWT_B_TotalScore'], bins='auto', edgecolor='black')
plt.title('Distribution of baselinecomplete_MWT_B_TotalScore')
plt.show()

In [None]:
# Median of the raw MWT-B score; used as the split point for the binary subgroup.
median_value = mwt_b_df['baselinecomplete_MWT_B_TotalScore'].median()

# Show the summary statistics. A bare expression is only rendered when it is the last line of the cell.
distribution_stats

In [None]:
# Median split: 0 when the score is below the median, 1 otherwise. Vectorised, and written via .assign so
# no value is set on a filtered slice (avoids SettingWithCopyWarning).
# `~(x < median)` rather than `x >= median` keeps the earlier behaviour for missing scores, which end
# up in group 1. NOTE(review): confirm that missing scores should be counted as "high".
mwt_b_df = mwt_b_df.assign(
    baselinecomplete_MWT_B_TotalScore=(
        ~(mwt_b_df['baselinecomplete_MWT_B_TotalScore'] < median_value)
    ).astype('int64')
)

# Group sizes after the split.
counts = mwt_b_df['baselinecomplete_MWT_B_TotalScore'].value_counts()

In [None]:
# Size of each MWT-B subgroup (0 when a group is empty). The trailing numbers are the values the author
# recorded from a previous run.
zero_count = counts.get(0, 0)  # 20
one_count = counts.get(1, 0)  # 25

# digit_span_score

In [None]:
# Digit-span score for the shared cohort. An explicit .copy() is taken because a later cell overwrites
# the score column of this frame; writing into a filtered slice raises pandas' SettingWithCopyWarning.
# NOTE(review): `shared_ppid_list` is defined further down — this cell depends on out-of-order execution.
digit_span_score_df = questionnaire_df.loc[
    questionnaire_df['ppid'].isin(shared_ppid_list),
    ['ppid', 'baselinecomplete_DigitspanScore'],
].copy()

In [None]:
# Summary statistics and histogram of the raw digit-span score.
distribution_stats = digit_span_score_df['baselinecomplete_DigitspanScore'].describe()
plt.hist(digit_span_score_df['baselinecomplete_DigitspanScore'], bins='auto', edgecolor='black')
plt.title('Distribution of baselinecomplete_DigitspanScore')
plt.show()

# Last expression of the cell, so the statistics are rendered below the plot.
distribution_stats

In [None]:
# Median of the raw digit-span score; used as the split point.
median_value = digit_span_score_df['baselinecomplete_DigitspanScore'].median()

# Median split: 0 when the score is below the median, 1 otherwise. Vectorised, and written via .assign so
# no value is set on a filtered slice (avoids SettingWithCopyWarning).
# `~(x < median)` rather than `x >= median` keeps the earlier behaviour for missing scores, which end
# up in group 1. NOTE(review): confirm that missing scores should be counted as "high".
digit_span_score_df = digit_span_score_df.assign(
    baselinecomplete_DigitspanScore=(
        ~(digit_span_score_df['baselinecomplete_DigitspanScore'] < median_value)
    ).astype('int64')
)

In [None]:
# Size of each digit-span subgroup (0 when a group is empty). The trailing numbers are the values the
# author recorded from a previous run.
counts = digit_span_score_df['baselinecomplete_DigitspanScore'].value_counts()
zero_count = counts.get(0, 0)  # 19
one_count = counts.get(1, 0)  # 26

# tmt_A 

In [None]:
# TMT-A completion time for the shared cohort. An explicit .copy() is taken because a later cell
# overwrites the time column of this frame; writing into a filtered slice raises pandas'
# SettingWithCopyWarning.
# NOTE(review): `shared_ppid_list` is defined further down — this cell depends on out-of-order execution.
tmt_A_df = questionnaire_df.loc[
    questionnaire_df['ppid'].isin(shared_ppid_list),
    ['ppid', 'baselinecomplete_TMT_A_Time_sec_'],
].copy()

In [None]:
# Summary statistics and histogram of the raw TMT-A time.
distribution_stats = tmt_A_df['baselinecomplete_TMT_A_Time_sec_'].describe()
plt.hist(tmt_A_df['baselinecomplete_TMT_A_Time_sec_'], bins='auto', edgecolor='black')
plt.title('Distribution of baselinecomplete_TMT_A_Time_sec_')
plt.show()

# Last expression of the cell, so the statistics are rendered below the plot.
distribution_stats

In [None]:
# Median of the raw TMT-A time; used as the split point.
median_value = tmt_A_df['baselinecomplete_TMT_A_Time_sec_'].median()

# Median split: 0 when the time is below the median, 1 otherwise. Vectorised, and written via .assign so
# no value is set on a filtered slice (avoids SettingWithCopyWarning).
# `~(x < median)` rather than `x >= median` keeps the earlier behaviour for missing values, which end
# up in group 1. NOTE(review): confirm that missing values should be counted in group 1.
tmt_A_df = tmt_A_df.assign(
    baselinecomplete_TMT_A_Time_sec_=(
        ~(tmt_A_df['baselinecomplete_TMT_A_Time_sec_'] < median_value)
    ).astype('int64')
)

In [None]:
# Size of each TMT-A subgroup (0 when a group is empty). The trailing numbers are the values the author
# recorded from a previous run.
counts = tmt_A_df['baselinecomplete_TMT_A_Time_sec_'].value_counts()
zero_count = counts.get(0, 0)  # 22
one_count = counts.get(1, 0)  # 23

# tmt_B 

In [None]:
# TMT-B completion time for the shared cohort. An explicit .copy() is taken because a later cell
# overwrites the time column of this frame; writing into a filtered slice raises pandas'
# SettingWithCopyWarning.
# NOTE(review): `shared_ppid_list` is defined further down — this cell depends on out-of-order execution.
tmt_B_df = questionnaire_df.loc[
    questionnaire_df['ppid'].isin(shared_ppid_list),
    ['ppid', 'baselinecomplete_TMT_B_time_sec_'],
].copy()

In [None]:
# Summary statistics and histogram of the raw TMT-B time.
distribution_stats = tmt_B_df['baselinecomplete_TMT_B_time_sec_'].describe()
plt.hist(tmt_B_df['baselinecomplete_TMT_B_time_sec_'], bins='auto', edgecolor='black')
plt.title('Distribution of baselinecomplete_TMT_B_time_sec_')
plt.show()

# Last expression of the cell, so the statistics are rendered below the plot.
distribution_stats

In [None]:
# Median of the raw TMT-B time; used as the split point.
median_value = tmt_B_df['baselinecomplete_TMT_B_time_sec_'].median()

# Median split: 0 when the time is below the median, 1 otherwise. Vectorised, and written via .assign so
# no value is set on a filtered slice (avoids SettingWithCopyWarning).
# `~(x < median)` rather than `x >= median` keeps the earlier behaviour for missing values, which end
# up in group 1. NOTE(review): confirm that missing values should be counted in group 1.
tmt_B_df = tmt_B_df.assign(
    baselinecomplete_TMT_B_time_sec_=(
        ~(tmt_B_df['baselinecomplete_TMT_B_time_sec_'] < median_value)
    ).astype('int64')
)

In [None]:
# Size of each TMT-B subgroup (0 when a group is empty). The trailing numbers are the values the author
# recorded from a previous run.
counts = tmt_B_df['baselinecomplete_TMT_B_time_sec_'].value_counts()
zero_count = counts.get(0, 0)  # 22
one_count = counts.get(1, 0)  # 23

### identify common patients

smri_df, pos_fmri_df, neg_fmri_df, neutral_eeg_df, fert_df, effort_df, questionnaire_df, clinical_df, demography_df

In [None]:
# Participants present in every one of the nine tables (sad-condition EEG is not part of this set).
ppid_sets = [
    set(table['ppid'])
    for table in (
        smri_df, pos_fmri_df, neg_fmri_df, neutral_eeg_df, fert_df, effort_df,
        questionnaire_df, clinical_df, demography_df,
    )
]
shared_ppid = ppid_sets[0].intersection(*ppid_sets[1:])

In [None]:
# Cohort size and an ascending list of the shared participant ids. The trailing number is the value the
# author recorded from a previous run.
num_shared_ppid = len(shared_ppid)  # 45
shared_ppid_list = sorted(shared_ppid)

smri_df, pos_fmri_df, fert_df, questionnaire_df, demography_df

In [None]:
# Alternative cohort: participants present in the five listed tables only.
# NOTE(review): this overwrites the nine-table intersection computed above when both cells run in
# order — confirm which cohort downstream cells are meant to use.
ppid_sets = [
    set(table['ppid'])
    for table in (smri_df, pos_fmri_df, fert_df, questionnaire_df, demography_df)
]
shared_ppid = ppid_sets[0].intersection(*ppid_sets[1:])

In [None]:
# Cohort size and an ascending list of the shared participant ids. The trailing number is the value the
# author recorded from a previous run.
num_shared_ppid = len(shared_ppid)  # 83
shared_ppid_list = sorted(shared_ppid)

### RELAPSE STUDIES - identify RELAPSE information for the chosen "ppid" 

In [None]:
# Demographic rows belonging to the shared cohort.
filtered_demography_df = demography_df.loc[demography_df['ppid'].isin(shared_ppid_list)]

# 'relapse' labels of the cohort, in the row order of demography_df.
# NOTE(review): that order equals the sorted shared_ppid_list only if demography_df is itself sorted by
# ppid — confirm.
relapse_values = filtered_demography_df['relapse'].tolist()

# Number of cohort members labelled 1.
num_ones = sum(1 for label in relapse_values if label == 1)

### filter data according to "ppid" for the chosen modalities

In [None]:
# Restrict every modality table to the shared cohort. Each table keeps its own row order; nothing here
# sorts by ppid.
# NOTE(review): later row-wise alignment across tables presumably relies on every source being ordered
# by ppid — confirm.
smri_df = smri_df[smri_df['ppid'].isin(shared_ppid_list)]
pos_fmri_df = pos_fmri_df[pos_fmri_df['ppid'].isin(shared_ppid_list)]
neg_fmri_df = neg_fmri_df[neg_fmri_df['ppid'].isin(shared_ppid_list)]
# Disabled: the beta1/beta3 frames are only defined when the corresponding load lines are enabled.
#pos_fmri_beta1_df = pos_fmri_beta1_df[pos_fmri_beta1_df['ppid'].isin(shared_ppid_list)]
#pos_fmri_beta3_df = pos_fmri_beta3_df[pos_fmri_beta3_df['ppid'].isin(shared_ppid_list)]
#neg_fmri_beta1_df = neg_fmri_beta1_df[neg_fmri_beta1_df['ppid'].isin(shared_ppid_list)]
#neg_fmri_beta3_df = neg_fmri_beta3_df[neg_fmri_beta3_df['ppid'].isin(shared_ppid_list)]
sad_eeg_df = sad_eeg_df[sad_eeg_df['ppid'].isin(shared_ppid_list)]
neutral_eeg_df = neutral_eeg_df[neutral_eeg_df['ppid'].isin(shared_ppid_list)]
fert_df = fert_df[fert_df['ppid'].isin(shared_ppid_list)]
effort_df = effort_df[effort_df['ppid'].isin(shared_ppid_list)]
questionnaire_df = questionnaire_df[questionnaire_df['ppid'].isin(shared_ppid_list)]
clinical_df = clinical_df[clinical_df['ppid'].isin(shared_ppid_list)]
demography_df = demography_df[demography_df['ppid'].isin(shared_ppid_list)]

In [None]:
# Same participant filter for the per-band "sad" EEG frames.
sad_alpha_df = sad_alpha_df.loc[sad_alpha_df['ppid'].isin(shared_ppid_list)]
sad_beta_df = sad_beta_df.loc[sad_beta_df['ppid'].isin(shared_ppid_list)]
sad_delta_df = sad_delta_df.loc[sad_delta_df['ppid'].isin(shared_ppid_list)]
sad_gamma_df = sad_gamma_df.loc[sad_gamma_df['ppid'].isin(shared_ppid_list)]
sad_theta_df = sad_theta_df.loc[sad_theta_df['ppid'].isin(shared_ppid_list)]

questionnaire_df = questionnaire_df.loc[questionnaire_df['ppid'].isin(shared_ppid_list)]
demography_df = demography_df.loc[demography_df['ppid'].isin(shared_ppid_list)]

### STUDIES WITH SAD AND NEUTRAL EEG - add "sad" and "neutral" to column names

In [None]:
# Tag every column except the first one with its EEG condition.
sad_eeg_df.columns = [
    name if position == 0 else f"{name}_sad"
    for position, name in enumerate(sad_eeg_df.columns)
]
neutral_eeg_df.columns = [
    name if position == 0 else f"{name}_neutral"
    for position, name in enumerate(neutral_eeg_df.columns)
]

### reset index

In [None]:
# Replace each frame's index with a fresh 0..n-1 range (drop=True discards the
# old index), so the frames no longer carry the row labels left by filtering.
smri_df = smri_df.reset_index(drop=True)
pos_fmri_df = pos_fmri_df.reset_index(drop=True)
neg_fmri_df = neg_fmri_df.reset_index(drop=True)
#pos_fmri_beta1_df = pos_fmri_beta1_df.reset_index(drop=True)
#pos_fmri_beta3_df = pos_fmri_beta3_df.reset_index(drop=True)
#neg_fmri_beta1_df = neg_fmri_beta1_df.reset_index(drop=True)
#neg_fmri_beta3_df = neg_fmri_beta3_df.reset_index(drop=True)
sad_eeg_df = sad_eeg_df.reset_index(drop=True)
neutral_eeg_df = neutral_eeg_df.reset_index(drop=True)
fert_df = fert_df.reset_index(drop=True)
effort_df = effort_df.reset_index(drop=True)
questionnaire_df = questionnaire_df.reset_index(drop=True)
clinical_df = clinical_df.reset_index(drop=True)
demography_df = demography_df.reset_index(drop=True)

In [None]:
# Fresh 0..n-1 index for the per-band "sad" EEG frames (old index discarded).
sad_alpha_df = sad_alpha_df.reset_index(drop=True)
sad_beta_df = sad_beta_df.reset_index(drop=True)
sad_delta_df = sad_delta_df.reset_index(drop=True)
sad_gamma_df = sad_gamma_df.reset_index(drop=True)
sad_theta_df = sad_theta_df.reset_index(drop=True)

questionnaire_df = questionnaire_df.reset_index(drop=True)
demography_df = demography_df.reset_index(drop=True)

### checking for "nan" for all modalities

In [None]:
# Frames to scan for missing values, keyed by their variable name.
# Uncomment an entry to include that modality in the report.
dataframes = dict(
    # smri_df=smri_df,
    # pos_fmri_df=pos_fmri_df,
    # neg_fmri_df=neg_fmri_df,
    # pos_fmri_beta1_df=pos_fmri_beta1_df,
    # pos_fmri_beta3_df=pos_fmri_beta3_df,
    # neg_fmri_beta1_df=neg_fmri_beta1_df,
    # neg_fmri_beta3_df=neg_fmri_beta3_df,
    sad_eeg_df=sad_eeg_df,
    neutral_eeg_df=neutral_eeg_df,
    # fert_df=fert_df,
    # effort_df=effort_df,
    questionnaire_df=questionnaire_df,
    # clinical_df=clinical_df,
    demography_df=demography_df,
)

def print_nan_info(df_name, df):
    """Print the NaN count of every column of ``df`` that has at least one NaN.

    Args:
        df_name: Label used in the printed header line.
        df: DataFrame to inspect.

    Nothing is printed when ``df`` contains no NaN at all.
    """
    missing_per_column = df.isna().sum()
    columns_with_nan = missing_per_column.loc[missing_per_column > 0]
    if columns_with_nan.empty:
        return
    print(f'\nNaNs in {df_name}:')
    for column, count in columns_with_nan.items():
        print(f'  Column: {column}, NaNs: {count}')

# Report NaNs for every selected frame.
# NOTE: as a top-level loop, `df_name` and `df` stay bound at notebook scope
# after this cell; a later cell reads a global named `df`.
for df_name, df in dataframes.items():
    print_nan_info(df_name, df)

### drop "overviewSheet_CRP_Main1" if its NaN ratio is above 20% (the cell below drops the column unconditionally, so run it only in that case)

In [None]:
# Remove the CRP column from the clinical frame.
# Rebinding (instead of inplace=True) plus errors='ignore' keeps the cell
# re-runnable: a second execution previously raised KeyError because the
# column was already gone.
clinical_df = clinical_df.drop(columns=['overviewSheet_CRP_Main1'], errors='ignore')

### CONTROL STUDIES - drop additional columns 

In [None]:
# Remove the "relapse" column from the demography frame.
# Rebinding (instead of inplace=True) plus errors='ignore' keeps the cell
# re-runnable: a second execution previously raised KeyError because the
# column was already gone.
demography_df = demography_df.drop(columns=['relapse'], errors='ignore')

In [None]:
# Impute missing 'R_Amy' values in the negative-fMRI frame with that column's
# median (producing a new frame), then zero-fill every remaining NaN in both
# fMRI frames.
neg_fmri_df = neg_fmri_df.fillna({'R_Amy': neg_fmri_df['R_Amy'].median()})
neg_fmri_df.fillna(0, inplace=True)
pos_fmri_df.fillna(0, inplace=True)

In [None]:
#pos_fmri_beta1_df.fillna(0, inplace=True)
#pos_fmri_beta3_df.fillna(0, inplace=True)

#neg_fmri_beta1_df.fillna(0, inplace=True)
#neg_fmri_beta3_df.fillna(0, inplace=True)

### drop the "ppid" column for all modalities excluding "demography_df"

In [None]:
# Strip the participant id from every modality frame listed here
# (demography_df is not touched by this cell).
smri_df = smri_df.drop('ppid', axis=1)

pos_fmri_df = pos_fmri_df.drop('ppid', axis=1)
neg_fmri_df = neg_fmri_df.drop('ppid', axis=1)

#pos_fmri_beta1_df = pos_fmri_beta1_df.drop(columns=['ppid'])
#pos_fmri_beta3_df = pos_fmri_beta3_df.drop(columns=['ppid'])
#neg_fmri_beta1_df = neg_fmri_beta1_df.drop(columns=['ppid'])
#neg_fmri_beta3_df = neg_fmri_beta3_df.drop(columns=['ppid'])

sad_eeg_df = sad_eeg_df.drop('ppid', axis=1)
neutral_eeg_df = neutral_eeg_df.drop('ppid', axis=1)

fert_df = fert_df.drop('ppid', axis=1)
effort_df = effort_df.drop('ppid', axis=1)

questionnaire_df = questionnaire_df.drop('ppid', axis=1)
clinical_df = clinical_df.drop('ppid', axis=1)

In [None]:
# Strip the participant id from the per-band "sad" EEG frames.
sad_alpha_df = sad_alpha_df.drop(columns=['ppid'])
sad_beta_df = sad_beta_df.drop(columns=['ppid'])
sad_delta_df = sad_delta_df.drop(columns=['ppid'])
sad_gamma_df = sad_gamma_df.drop(columns=['ppid'])
sad_theta_df = sad_theta_df.drop(columns=['ppid'])

# The preceding cell already removes 'ppid' from questionnaire_df, so a plain
# drop raises KeyError here when the notebook is run top to bottom;
# errors='ignore' turns the repeated drop into a no-op.
questionnaire_df = questionnaire_df.drop(columns=['ppid'], errors='ignore')

### concatenate the chosen modalities into one dataframe

In [None]:
# Column-wise concatenation of the selected modalities (rows are aligned on
# the index). Uncomment an entry to include that modality.
all_modalities = pd.concat(
    [
        demography_df,
        # smri_df,
        # pos_fmri_df,
        # neg_fmri_df,
        # pos_fmri_beta1_df,
        # pos_fmri_beta3_df,
        # neg_fmri_beta1_df,
        # neg_fmri_beta3_df,
        sad_eeg_df,
        neutral_eeg_df,
        # fert_df,
        # effort_df,
        questionnaire_df,
        # clinical_df,
    ],
    axis='columns',
)

In [None]:
# Column-wise concatenation of demography, the five per-band "sad" EEG
# frames and the questionnaires (rows are aligned on the index).
all_modalities = pd.concat(
    [
        demography_df,
        sad_alpha_df,
        sad_beta_df,
        sad_delta_df,
        sad_gamma_df,
        sad_theta_df,
        questionnaire_df,
    ],
    axis='columns',
)

### remove rows with nan for fMRI

In [None]:
#all_modalities = all_modalities[~all_modalities['ppid'].isin(rows_w_nan)]

### CONTROL STUDIES - Change column names for EEG

In [None]:
# Abbreviate the EEG band/condition parts of the column names
# (literal, non-regex replacement), e.g. '_alpha_sad' -> '_a_s'.
for long_suffix, short_suffix in (
    ('_alpha_sad', '_a_s'),
    ('_beta_sad', '_b_s'),
    ('_gamma_sad', '_g_s'),
    ('_theta_sad', '_t_s'),
    ('_delta_sad', '_d_s'),
    ('_alpha_neutral', '_a_n'),
    ('_beta_neutral', '_b_n'),
    ('_theta_neutral', '_t_n'),
    ('_delta_neutral', '_d_n'),
    ('_gamma_neutral', '_g_n'),
):
    all_modalities.columns = all_modalities.columns.str.replace(long_suffix, short_suffix, regex=False)

### RELAPSE COHORT - for SUBGROUP EXPERIMENTS, replace the values of the "relapse" column with an alternative grouping variable, but keep the column name "relapse" so that the downstream code keeps working

In [None]:
# ppid -> TMT-A value lookup, used to overwrite "relapse" per participant;
# the TMT-A column itself is then removed from the combined frame.
score_mapping = tmt_A_df.set_index('ppid')['baselinecomplete_TMT_A_Time_sec_']
all_modalities = (
    all_modalities
    .assign(relapse=all_modalities['ppid'].map(score_mapping))
    .drop(columns=['baselinecomplete_TMT_A_Time_sec_'])
)

### RELAPSE STUDIES - adjust the all modalities dataframe according to subgroups: relapse patients should come first, and no relapse patients should come second.

In [None]:
# Descending sort on "relapse" puts the larger values first; the result is
# re-labelled 0..n-1.
all_modalities = all_modalities.sort_values(by='relapse', ascending=False, ignore_index=True)
#all_modalities = all_modalities.drop(columns=['isControl'])

### CONTROL STUDIES -  adjust the all modalities dataframe according to subgroups: controls should come first, and patients should come second.

In [None]:
# Descending sort on "isControl" puts the larger values first; the result is
# re-labelled 0..n-1.
all_modalities = all_modalities.sort_values(by='isControl', ascending=False, ignore_index=True)

### save all modalities dataframe

In [None]:
# Write the combined frame to CSV without the row index.
# NOTE(review): the relative output directory is not created here — confirm it exists before running.
all_modalities.to_csv("./aida_model/all_modalities/all_modalities(all_modalities).csv", index=False)

# scaling search

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, PowerTransformer
import matplotlib.pyplot as plt
import os

In [None]:
# Single StandardScaler instance shared by the cells below; each
# fit_transform call there refits it on the block being scaled.
scaler = StandardScaler()

In [None]:
# NOTE(review): `df` is not assigned in this cell — confirm which frame it
# refers to when the notebook is run on a fresh kernel.
# Feature matrix: every column from position 6 onward, with a fresh row index.
X = df.copy().iloc[:, 6:].reset_index(drop=True)
X.to_csv('output.csv', index=False)

# Positional column ranges of the eight feature groups inside X. The plot
# titles further down label X1 as structural MRI, X2/X3 as fMRI
# positive/negative, X4 as EEG and X7 as questionnaires.
X1_columns = slice(0, 69)
X2_columns = slice(69, 138)
X3_columns = slice(138, 207)
X4_columns = slice(207, 362)
X5_columns = slice(362, 368)
X6_columns = slice(368, 383)
X7_columns = slice(383, 455)
X8_columns = slice(455, 463)

X1, X2, X3, X4, X5, X6, X7, X8 = (
    X.iloc[:, column_range]
    for column_range in (
        X1_columns, X2_columns, X3_columns, X4_columns,
        X5_columns, X6_columns, X7_columns, X8_columns,
    )
)

"""
X[:, X1_columns] = scaler.fit_transform(X[:, X1_columns])
X[:, X2_columns] = scaler.fit_transform(X[:, X2_columns])
X[:, X3_columns] = scaler.fit_transform(X[:, X3_columns])
X[:, X4_columns] = scaler.fit_transform(X[:, X4_columns])
X[:, X5_columns] = scaler.fit_transform(X[:, X5_columns])
X[:, X6_columns] = scaler.fit_transform(X[:, X6_columns])

for col in range(X7_columns.start, X7_columns.stop):
    X[:, col] = scaler.fit_transform(X[:, col].reshape(-1, 1)).flatten()

for col in range(X8_columns.start, X8_columns.stop):
    X[:, col] = scaler.fit_transform(X[:, col].reshape(-1, 1)).flatten()
"""

### fMRI

# Feature-wise Standard Scaling

In [None]:
# Standardize X2 and X3 independently; the shared scaler is refitted per block.
X2_scaled, X3_scaled = (scaler.fit_transform(block) for block in (X2, X3))

"""
plt.figure(figsize=(14, 6))

plt.subplot(1, 2, 1)
plt.hist(X2_scaled.flatten(), bins=50, color='skyblue', edgecolor='black')
plt.title('Distribution of X2 (fMRI Positive) after Scaling')
plt.xlabel('Value')
plt.ylabel('Frequency')

plt.subplot(1, 2, 2)
plt.hist(X3_scaled.flatten(), bins=50, color='lightcoral', edgecolor='black')
plt.title('Distribution of X3 (fMRI Negative) after Scaling')
plt.xlabel('Value')
plt.ylabel('Frequency')

plt.tight_layout()
plt.show()
"""

os.makedirs('plots/X2_scaled', exist_ok=True)
os.makedirs('plots/X3_scaled', exist_ok=True)

# One histogram PNG per column of the standardized X2 block.
for i, feature_values in enumerate(X2_scaled.T):
    fig, ax = plt.subplots()
    ax.hist(feature_values, bins=50, color='skyblue', edgecolor='black')
    ax.set_title(f'Distribution of X2 Feature {i+1} after Scaling')
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')
    fig.savefig(f'plots/X2_scaled/X2_feature_{i+1}.png')
    plt.close(fig)  # release the figure so the loop does not accumulate open figures

# One histogram PNG per column of the standardized X3 block.
for i, feature_values in enumerate(X3_scaled.T):
    fig, ax = plt.subplots()
    ax.hist(feature_values, bins=50, color='lightcoral', edgecolor='black')
    ax.set_title(f'Distribution of X3 Feature {i+1} after Scaling')
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')
    fig.savefig(f'plots/X3_scaled/X3_feature_{i+1}.png')
    plt.close(fig)

# Feature-wise Standard Scaling + PowerTransformer w/ yeo-johnson

In [None]:
# Standardize each fMRI block, then apply a Yeo-Johnson power transform to
# the standardized values. Both estimators are refitted per block.
transformer = PowerTransformer(method='yeo-johnson')

X2_scaled = scaler.fit_transform(X2)
X3_scaled = scaler.fit_transform(X3)

X2_transformed, X3_transformed = (
    transformer.fit_transform(scaled_block) for scaled_block in (X2_scaled, X3_scaled)
)

"""
plt.figure(figsize=(14, 6))

plt.subplot(1, 2, 1)
plt.hist(X2_transformed.flatten(), bins=50, color='skyblue', edgecolor='black')
plt.title('Distribution of X2 (fMRI Positive) after Yeo-Johnson Transformation')
plt.xlabel('Value')
plt.ylabel('Frequency')

plt.subplot(1, 2, 2)
plt.hist(X3_transformed.flatten(), bins=50, color='lightcoral', edgecolor='black')
plt.title('Distribution of X3 (fMRI Negative) after Yeo-Johnson Transformation')
plt.xlabel('Value')
plt.ylabel('Frequency')

plt.tight_layout()
plt.show()
"""

os.makedirs('plots/X2_scaled_transformed', exist_ok=True)
os.makedirs('plots/X3_scaled_transformed', exist_ok=True)

# One histogram PNG per column of the Yeo-Johnson-transformed X2 block.
for i, feature_values in enumerate(X2_transformed.T):
    fig, ax = plt.subplots()
    ax.hist(feature_values, bins=50, color='skyblue', edgecolor='black')
    ax.set_title(f'Distribution of X2 Feature {i+1} after Yeo-Johnson Transformation')
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')
    fig.savefig(f'plots/X2_scaled_transformed/X2_feature_{i+1}.png')
    plt.close(fig)

# One histogram PNG per column of the Yeo-Johnson-transformed X3 block.
for i, feature_values in enumerate(X3_transformed.T):
    fig, ax = plt.subplots()
    ax.hist(feature_values, bins=50, color='lightcoral', edgecolor='black')
    ax.set_title(f'Distribution of X3 Feature {i+1} after Yeo-Johnson Transformation')
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')
    fig.savefig(f'plots/X3_scaled_transformed/X3_feature_{i+1}.png')
    plt.close(fig)

### structural MRI

# Feature-wise Standard Scaling

In [None]:
# Global extremes of the X1 block: per-column min/max first, then the
# min/max across those column results.
min_value = X1.min().min()
max_value = X1.max().max()

print("Minimum value and Maximum value", min_value, ",", max_value)

In [None]:
# Fit the shared scaler on X1 and standardize X1 with it.
X1_scaled = scaler.fit(X1).transform(X1)

"""
plt.figure(figsize=(14, 6))
plt.subplot(1, 2, 1)
plt.hist(X1_scaled.flatten(), bins=50, color='skyblue', edgecolor='black')
plt.title('Distribution of X1 (structural MRI) after Scaling')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()
"""

os.makedirs('plots/X1_scaled', exist_ok=True)

# One histogram PNG per column of the standardized X1 block.
for i, feature_values in enumerate(X1_scaled.T):
    fig, ax = plt.subplots()
    ax.hist(feature_values, bins=50, color='skyblue', edgecolor='black')
    ax.set_title(f'Distribution of X1 Feature {i+1} after Scaling')
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')
    fig.savefig(f'plots/X1_scaled/X1_feature_{i+1}.png')
    plt.close(fig)

# Feature-wise Standard Scaling + PowerTransformer w/ yeo-johnson

In [None]:
# Standardize X1, then apply a Yeo-Johnson power transform to the result.
transformer = PowerTransformer(method='yeo-johnson')

X1_scaled = scaler.fit(X1).transform(X1)
X1_transformed = transformer.fit_transform(X1_scaled)

"""
plt.figure(figsize=(14, 6))
plt.subplot(1, 2, 1)
plt.hist(X1_transformed.flatten(), bins=50, color='skyblue', edgecolor='black')
plt.title('Distribution of X1 (structural MRI) after Scaling and Yeo-Johnson Transformation')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()
"""

os.makedirs('plots/X1_scaled_transformed', exist_ok=True)

# One histogram PNG per column of the Yeo-Johnson-transformed X1 block.
for i, feature_values in enumerate(X1_transformed.T):
    fig, ax = plt.subplots()
    ax.hist(feature_values, bins=50, color='skyblue', edgecolor='black')
    ax.set_title(f'Distribution of X1 Feature {i+1} after Scaling and Yeo-Johnson Transformation')
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')
    fig.savefig(f'plots/X1_scaled_transformed/X1_feature_{i+1}.png')
    plt.close(fig)

### EEG

# Feature-wise Standard Scaling

In [None]:
# Fit the shared scaler on X4 and standardize X4 with it.
X4_scaled = scaler.fit(X4).transform(X4)

"""
plt.figure(figsize=(14, 6))
plt.subplot(1, 2, 1)
plt.hist(X4_scaled.flatten(), bins=50, color='skyblue', edgecolor='black')
plt.title('Distribution of X4 (EEG) after Scaling')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()
"""

os.makedirs('plots/X4_scaled', exist_ok=True)

# One histogram PNG per column of the standardized X4 block.
# The loop bound must come from X4_scaled itself: it previously used
# X1_scaled.shape[1] (X1 spans 69 columns, X4 spans 155), so only the first
# 69 X4 features were ever plotted.
for i in range(X4_scaled.shape[1]):
    plt.figure()
    plt.hist(X4_scaled[:, i], bins=50, color='skyblue', edgecolor='black')
    plt.title(f'Distribution of X4 Feature {i + 1} after Scaling')
    plt.xlabel('Value')
    plt.ylabel('Frequency')
    plt.savefig(f'plots/X4_scaled/X4_feature_{i + 1}.png')
    plt.close()  # release the figure so the loop does not accumulate open figures

# Feature-wise Standard Scaling + PowerTransformer w/ yeo-johnson

In [None]:
# Standardize X4, then apply a Yeo-Johnson power transform to the result.
transformer = PowerTransformer(method='yeo-johnson')

X4_scaled = scaler.fit(X4).transform(X4)
X4_transformed = transformer.fit_transform(X4_scaled)

"""
plt.figure(figsize=(14, 6))
plt.subplot(1, 2, 1)
plt.hist(X4_transformed.flatten(), bins=50, color='skyblue', edgecolor='black')
plt.title('Distribution of X4 (EEG) after Scaling and Yeo-Johnson Transformation')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()
"""

os.makedirs('plots/X4_scaled_transformed', exist_ok=True)

# One histogram PNG per column of the Yeo-Johnson-transformed X4 block.
# Both the loop bound and the plotted data must be X4_transformed: the loop
# previously iterated over and drew X1_transformed, so the files saved under
# the X4 titles and paths actually showed X1 features.
for i in range(X4_transformed.shape[1]):
    plt.figure()
    plt.hist(X4_transformed[:, i], bins=50, color='skyblue', edgecolor='black')
    plt.title(f'Distribution of X4 Feature {i+1} after Scaling and Yeo-Johnson Transformation')
    plt.xlabel('Value')
    plt.ylabel('Frequency')
    plt.savefig(f'plots/X4_scaled_transformed/X4_feature_{i+1}.png')
    plt.close()  # release the figure so the loop does not accumulate open figures

### Questionnaires

# Feature-wise Standard Scaling

In [None]:
# Fit the shared scaler on X7 and standardize X7 with it.
X7_scaled = scaler.fit(X7).transform(X7)

"""
plt.figure(figsize=(14, 6))
plt.subplot(1, 2, 1)
plt.hist(X7_scaled.flatten(), bins=50, color='skyblue', edgecolor='black')
plt.title('Distribution of X7 (Questionnaires) after Scaling')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()
"""

os.makedirs('plots/X7_scaled', exist_ok=True)

# One histogram PNG per column of the standardized X7 block.
for i, feature_values in enumerate(X7_scaled.T):
    fig, ax = plt.subplots()
    ax.hist(feature_values, bins=50, color='skyblue', edgecolor='black')
    ax.set_title(f'Distribution of X7 Feature {i+1} after Scaling')
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')
    fig.savefig(f'plots/X7_scaled/X7_feature_{i+1}.png')
    plt.close(fig)

# Feature-wise Standard Scaling + PowerTransformer w/ yeo-johnson

In [None]:
# Standardize X7, then apply a Yeo-Johnson power transform to the result.
transformer = PowerTransformer(method='yeo-johnson')

X7_scaled = scaler.fit(X7).transform(X7)
X7_transformed = transformer.fit_transform(X7_scaled)

"""
plt.figure(figsize=(14, 6))
plt.subplot(1, 2, 1)
plt.hist(X7_transformed.flatten(), bins=50, color='skyblue', edgecolor='black')
plt.title('Distribution of X7 (EEG) after Scaling and Yeo-Johnson Transformation')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()
"""

os.makedirs('plots/X7_scaled_transformed', exist_ok=True)

# One histogram PNG per column of the Yeo-Johnson-transformed X7 block.
for i, feature_values in enumerate(X7_transformed.T):
    fig, ax = plt.subplots()
    ax.hist(feature_values, bins=50, color='skyblue', edgecolor='black')
    ax.set_title(f'Distribution of X7 Feature {i+1} after Scaling and Yeo-Johnson Transformation')
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')
    fig.savefig(f'plots/X7_scaled_transformed/X7_feature_{i+1}.png')
    plt.close(fig)