In [1]:
import pandas as pd

from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
# Project utility classes for data handling

from src.data.sets_subsets import Sets
from src.data.data_cleaner import DataCleaner
from src.data.data_preparer import DataPreparer

In [3]:
# Load the raw EEG dataset from the project's data/raw folder.
df = pd.read_csv(filepath_or_buffer='../data/raw/EEG.machinelearing_data_BRMH.csv')
# Preview the label column as a one-column DataFrame.
df.loc[:, ['main.disorder']]

Unnamed: 0,main.disorder
0,Addictive disorder
1,Addictive disorder
2,Addictive disorder
3,Addictive disorder
4,Addictive disorder
...,...
940,Healthy control
941,Healthy control
942,Healthy control
943,Healthy control


In [4]:
# Wrap the raw frame in the project's DataCleaner helper (src.data.data_cleaner).
dc = DataCleaner(df)
# Report missing values. Per the recorded output this appears to return a tuple:
# (Series of NaN counts for the columns that have any, total NaN count).
dc.analyze_missing_values()

(education        15
 IQ               13
 Unnamed: 122    945
 dtype: int64,
 np.int64(973))

In [5]:
# Clean-up pass through DataCleaner; the actual strategies are implemented in
# src.data.data_cleaner and are not visible from this notebook.
# NOTE(review): presumably drops the mostly-empty 'Unnamed: 122' column — confirm.
dc.remove_missing_columns()
# NOTE(review): presumably fills or removes the remaining NaNs in
# 'education' / 'IQ' — confirm which strategy is used.
dc.handle_nans()
# Re-run the report to verify the result (recorded output: empty Series, total 0).
dc.analyze_missing_values()

(Series([], dtype: int64), np.int64(0))

In [6]:
# Fetch the cleaned frame from the DataCleaner and rebind `df` to it; from
# here on `df` refers to the cleaned data, not the raw CSV contents.
df = dc.get_dataframe()
# Display the cleaned frame as the cell output.
df

Unnamed: 0,no.,sex,age,eeg.date,education,IQ,main.disorder,specific.disorder,AB.A.delta.a.FP1,AB.A.delta.b.FP2,...,COH.F.gamma.o.Pz.p.P4,COH.F.gamma.o.Pz.q.T6,COH.F.gamma.o.Pz.r.O1,COH.F.gamma.o.Pz.s.O2,COH.F.gamma.p.P4.q.T6,COH.F.gamma.p.P4.r.O1,COH.F.gamma.p.P4.s.O2,COH.F.gamma.q.T6.r.O1,COH.F.gamma.q.T6.s.O2,COH.F.gamma.r.O1.s.O2
0,1,M,57.0,2012.8.30,13.43871,101.580472,Addictive disorder,Alcohol use disorder,35.998557,21.717375,...,55.989192,16.739679,23.452271,45.678820,30.167520,16.918761,48.850427,9.422630,34.507082,28.613029
1,2,M,37.0,2012.9.6,6.00000,120.000000,Addictive disorder,Alcohol use disorder,13.425118,11.002916,...,45.595619,17.510824,26.777368,28.201062,57.108861,32.375401,60.351749,13.900981,57.831848,43.463261
2,3,M,32.0,2012.9.10,16.00000,113.000000,Addictive disorder,Alcohol use disorder,29.941780,27.544684,...,99.475453,70.654171,39.131547,69.920996,71.063644,38.534505,69.908764,27.180532,64.803155,31.485799
3,4,M,35.0,2012.10.8,18.00000,126.000000,Addictive disorder,Alcohol use disorder,21.496226,21.846832,...,59.986561,63.822201,36.478254,47.117006,84.658376,24.724096,50.299349,35.319695,79.822944,41.141873
4,5,M,36.0,2012.10.18,16.00000,112.000000,Addictive disorder,Alcohol use disorder,37.775667,33.607679,...,61.462720,59.166097,51.465531,58.635415,80.685608,62.138436,75.888749,61.003944,87.455509,70.531662
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
940,941,M,22.0,2014.8.28,13.00000,116.000000,Healthy control,Healthy control,41.851823,36.771496,...,82.905657,34.850706,63.970519,63.982003,51.244725,62.203684,62.062237,31.013031,31.183413,98.325230
941,942,M,26.0,2014.9.19,13.00000,118.000000,Healthy control,Healthy control,18.986856,19.401387,...,65.917918,66.700117,44.756285,49.787513,98.905995,54.021304,93.902401,52.740396,92.807331,56.320868
942,943,M,26.0,2014.9.27,16.00000,113.000000,Healthy control,Healthy control,28.781317,32.369230,...,61.040959,27.632209,45.552852,33.638817,46.690983,19.382928,41.050717,7.045821,41.962451,19.092111
943,944,M,24.0,2014.9.20,13.00000,107.000000,Healthy control,Healthy control,19.929100,25.196375,...,99.113664,48.328934,41.248470,28.192238,48.665743,42.007147,28.735945,27.176500,27.529522,20.028446


In [7]:
# Drop the record-number and recording-date columns (presumably bookkeeping
# fields rather than model features).
# Reassigning instead of using inplace=True avoids mutating the object handed
# out by dc.get_dataframe(), and errors='ignore' makes the cell idempotent:
# re-running it no longer raises KeyError once the columns are already gone.
df = df.drop(columns=['no.', 'eeg.date'], errors='ignore')
# Sanity check: total number of NaNs left in the frame (expected to be 0).
df.isna().sum().sum()

np.int64(0)

In [9]:
# Persist the cleaned frame to data/processed, without writing the row index.
df.to_csv(path_or_buf='../data/processed/EEG.machinelearing_data_BRMH.csv', index=False)

In [12]:
# Build the Sets helper, which separates the cleaned frame into feature
# groups and target columns.
sets = Sets(
    dataframe=df,
    quantitative_features=['age', 'education', 'IQ'],
    qualitative_features=['sex'],
    target_main='main.disorder',
    target_specific='specific.disorder',
)

In [20]:
# Feature subsets exposed by Sets. Judging by the attribute and column names
# these are the AB.* (PSD) columns, the COH.* (FC) columns, and both combined
# — NOTE(review): confirm against src.data.sets_subsets.
df_psd_all_bands, df_fc_all_bands, df_psd_fc_all_bands = (
    sets.df_ab_psd,
    sets.df_coh_fc,
    sets.df_ab_psd_coh_fc,
)
quantitative_features, target_main = sets.quantitative_features, sets.target_main

# Split each subset per frequency band; the result is indexed by band name
# (e.g. 'delta') further down in this notebook.
dfs_bands_psd, dfs_bands_fc, dfs_bands_psd_fc = (
    sets.create_dfs_bands(df=all_bands_frame)
    for all_bands_frame in (df_psd_all_bands, df_fc_all_bands, df_psd_fc_all_bands)
)

In [21]:
# Squeeze both inputs so crosstab receives 1-D data
# (assumes each holds a single column — TODO confirm in Sets).
qualitative_features = sets.qualitative_features.squeeze()
target_main = target_main.squeeze()

# Contingency table: sex (rows) against main disorder (columns).
pd.crosstab(index=qualitative_features, columns=target_main)

main.disorder,Addictive disorder,Anxiety disorder,Healthy control,Mood disorder,Obsessive compulsive disorder,Schizophrenia,Trauma and stress related disorder
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
F,22,28,35,115,8,52,84
M,164,79,60,151,38,65,44


In [22]:
# Inspect the delta-band entry of the combined PSD + FC split; the recorded
# output shows AB.A.delta.* and COH.A.delta.* columns.
dfs_bands_psd_fc['delta']

Unnamed: 0,AB.A.delta.a.FP1,AB.A.delta.b.FP2,AB.A.delta.c.F7,AB.A.delta.d.F3,AB.A.delta.e.Fz,AB.A.delta.f.F4,AB.A.delta.g.F8,AB.A.delta.h.T3,AB.A.delta.i.C3,AB.A.delta.j.Cz,...,COH.A.delta.o.Pz.p.P4,COH.A.delta.o.Pz.q.T6,COH.A.delta.o.Pz.r.O1,COH.A.delta.o.Pz.s.O2,COH.A.delta.p.P4.q.T6,COH.A.delta.p.P4.r.O1,COH.A.delta.p.P4.s.O2,COH.A.delta.q.T6.r.O1,COH.A.delta.q.T6.s.O2,COH.A.delta.r.O1.s.O2
0,35.998557,21.717375,21.518280,26.825048,26.611516,25.732649,16.563408,29.891368,22.402246,22.582176,...,33.602703,5.222549,7.150781,26.452531,14.731017,6.143500,32.022094,2.149012,18.110294,15.125936
1,13.425118,11.002916,11.942516,15.272216,14.151570,12.456034,8.436832,9.975238,14.834740,10.950564,...,44.592397,15.714172,22.275446,23.371961,51.677774,36.421214,57.266856,17.109119,43.320665,46.174213
2,29.941780,27.544684,17.150159,23.608960,27.087811,13.541237,16.523963,12.775574,21.686306,18.367666,...,99.785662,54.796761,16.598523,70.106871,55.438293,15.846510,69.026347,7.923487,48.922446,26.971678
3,21.496226,21.846832,17.364316,13.833701,14.100954,13.100939,14.613650,8.063191,11.015078,11.639560,...,26.807758,33.249219,9.694118,18.168532,73.411256,6.777008,19.586110,9.035528,53.592150,5.215413
4,37.775667,33.607679,21.865556,21.771413,22.854536,21.456377,15.969042,9.434306,15.244523,17.041979,...,11.372665,14.403949,5.850498,13.503633,27.115082,8.677888,24.688328,15.129463,45.612721,18.292532
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
940,41.851823,36.771496,43.671792,36.860889,24.732236,23.607823,23.288260,7.520642,17.636528,20.220791,...,84.122265,41.407185,65.274650,65.775185,54.247137,68.479332,68.933347,42.768542,43.003647,99.563911
941,18.986856,19.401387,27.586436,20.194732,19.407491,20.216570,16.465027,13.178851,12.687296,20.257619,...,94.708097,94.967559,90.020317,93.954836,99.951555,91.751161,99.633608,91.469273,99.512077,93.819343
942,28.781317,32.369230,11.717778,23.134370,26.209302,25.484497,22.586688,11.368466,21.799254,36.083181,...,67.745883,19.133832,43.100275,39.942251,39.601824,37.123125,58.061482,22.351596,49.959779,51.267681
943,19.929100,25.196375,14.445391,16.453456,16.590649,16.007279,18.909188,13.438102,17.442777,18.859586,...,99.946811,27.110725,17.032675,28.323060,27.330957,16.981225,28.266015,10.096288,21.126765,6.927552


In [None]:
# Wrap the cleaned frame in the project's DataPreparer, using the main
# diagnosis as the target column.
dp = DataPreparer(df, target_column='main.disorder')

# NOTE(review): presumably rebalances the target classes (the crosstab earlier
# in this notebook shows they are uneven). The method is implemented in
# src.data.data_preparer — confirm what it does and whether it is seeded.
dp.minority_class_balancer()

In [None]:
# Shape (rows, columns) of the frame currently held by the DataPreparer.
dp.get_dataframe().shape

In [None]:
# Row count per main-disorder class in the DataPreparer's frame, to inspect
# the class distribution.
dp.get_dataframe()['main.disorder'].value_counts()