In [101]:
import pandas as pd
import numpy as np

In [102]:
# Load the per-segment metadata tables of both institutions; the CSV column
# named "index" is used as the DataFrame index.
csv_options = dict(sep=",", index_col="index")
df_mayo = pd.read_csv('/scratch/mcesped/Datasets/segments_mayo.csv', **csv_options)
df_fnusa = pd.read_csv('/scratch/mcesped/Datasets/segments_fnusa.csv', **csv_options)

In [103]:
# Per-subject segment counts, keyed by patient id. Each value lists the number
# of segments with category id 0, 1, 2 and 3, in that order (called pw, noise,
# path and phys in this notebook).
category_ids = (0, 1, 2, 3)
counter = {}
df_total = df_fnusa
for inst in np.unique(df_total.institution):
    inst_rows = df_total.loc[df_total.institution == inst]
    print(f'subjects from inst {inst}')
    for subj in np.unique(inst_rows.patient_id):
        subj_categories = inst_rows.loc[inst_rows.patient_id == subj, 'category_id']
        counter[subj] = [int((subj_categories == cat_id).sum()) for cat_id in category_ids]
print(counter)

subjects from inst fnusa
{0: [7132, 0, 0, 0], 1: [0, 0, 1912, 0], 2: [4315, 2892, 1657, 7809], 3: [0, 12, 8076, 0], 4: [0, 8463, 0, 0], 5: [0, 5059, 1527, 5452], 6: [0, 0, 1554, 962], 7: [2042, 5416, 7738, 2689], 8: [0, 18, 1896, 20860], 9: [0, 0, 6750, 0], 10: [0, 5876, 4260, 1545], 11: [0, 3339, 4072, 2890], 12: [0, 1343, 7710, 38217], 13: [0, 181, 5318, 14136]}


In [104]:
# Percentage of all segments in df_total that belong to subjects 7 and 10.
held_out_subjects = (7, 10)
held_out_segments = sum(np.sum(counter[subject]) for subject in held_out_subjects)
held_out_segments * 100 / len(df_total)

15.309810582131131

Only two subjects are chosen for validation: **number 7 and number 10 from fnusa**, as they are reasonably balanced between classes. Together they represent ~15% of the total fnusa data.

# Noise detection dataset
I'll balance the classes manually so that each class has a similar number of training examples.

In [105]:
# Drop the category-0 rows (called 'pw' in this notebook) and shift the
# remaining category ids down by one so that they start at 0.
# `.assign` returns a new frame instead of writing into the result of a `.loc`
# selection, which avoids pandas' SettingWithCopyWarning and guarantees that
# the raw df_fnusa / df_mayo tables are never modified.
df_fnusa_no_pw = (
    df_fnusa
    .loc[df_fnusa.category_id != 0]
    .assign(category_id=lambda frame: frame.category_id - 1)
)

df_mayo_no_pw = (
    df_mayo
    .loc[df_mayo.category_id != 0]
    .assign(category_id=lambda frame: frame.category_id - 1)
)

df_fnusa_no_pw.head()

Unnamed: 0_level_0,anatomy,category_id,channel,electrode_type,institution,patient_id,reviewer_id,segment_id,soz,category_name
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
7132,hippocampus anterior,1,B'1,depth,fnusa,1,2,y007132,1,pathology
7133,hippocampus anterior,1,B'1,depth,fnusa,1,2,y007133,1,pathology
7134,hippocampus anterior,1,B'1,depth,fnusa,1,2,y007134,1,pathology
7135,hippocampus anterior,1,B'1,depth,fnusa,1,2,y007135,1,pathology
7136,hippocampus anterior,1,B'1,depth,fnusa,1,2,y007136,1,pathology


In [9]:
# Training split: all mayo segments that remain after dropping category 0.
df_train = df_mayo_no_pw

In [106]:
# Validation split: the fnusa segments of subjects 7 and 10.
val_subject_ids = [7, 10]
is_val_subject = df_fnusa_no_pw.patient_id.isin(val_subject_ids)
df_val = df_fnusa_no_pw.loc[is_val_subject]
df_val.patient_id.unique()

array([ 7, 10])

In [107]:
# Test split: every fnusa subject except subjects 7 and 10.
excluded_subject_ids = [7, 10]
is_excluded = df_fnusa_no_pw.patient_id.isin(excluded_subject_ids)
df_test = df_fnusa_no_pw.loc[~is_excluded]
df_test.patient_id.unique()

array([ 1,  2,  3,  4,  5,  6,  8,  9, 11, 12, 13])

In [108]:
# Sorted patient ids present in the train, validation and test splits (in that order).
tuple(np.unique(split.patient_id) for split in (df_train, df_val, df_test))

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 14, 16, 17, 18, 19, 20, 21,
        23]),
 array([ 7, 10]),
 array([ 1,  2,  3,  4,  5,  6,  8,  9, 11, 12, 13]))

## Balance datasets

The test set is left untouched, as I'll probably use bootstrapping for it.

In [112]:
# Eval
# For every subject, count how many segments fall in each category; each list
# in `counter` holds one entry per subject.
categories = [(0, 'noise'), (1, 'path'), (2, 'phys')]
counter = {cat_name: [] for _, cat_name in categories}
df_evaluated = df_train
for inst in np.unique(df_evaluated.institution):
    inst_rows = df_evaluated.loc[df_evaluated.institution == inst]
    print(f'subjects from inst {inst}')
    for subj in np.unique(inst_rows.patient_id):
        subj_categories = inst_rows.loc[inst_rows.patient_id == subj, 'category_id']
        for cat_id, cat_name in categories:
            counter[cat_name].append(int((subj_categories == cat_id).sum()))
print(counter)

# Per-category summary; subjects with zero segments are left out of the
# subject count and of the median.
for cat, per_subject_counts in counter.items():
    counts = np.array(per_subject_counts)
    nonzero_counts = counts[counts != 0]
    total_events = np.sum(counts)
    print(
        '\n', cat,
        f'\nNumber of subj: {len(nonzero_counts)}',
        f'\nMedian events per subj {np.median(nonzero_counts)}',
        f'\nTotal events proportion {total_events*100/len(df_evaluated)}',
        f'\nTotal events: {total_events}\n',
    )

subjects from inst mayo
{'noise': [1100, 0, 466, 1100, 1100, 1002, 1100, 0, 0, 740, 0, 1100, 1100, 1100, 1100, 1100, 58, 761], 'path': [0, 883, 1500, 0, 0, 0, 0, 0, 1500, 0, 1500, 0, 0, 0, 0, 0, 1500, 1500], 'phys': [330, 1200, 399, 1200, 790, 1200, 0, 1200, 0, 0, 498, 177, 1200, 1200, 0, 1200, 0, 644]}

 noise 
Number of subj: 14 
Median events per subj 1100.0 
Total events proportion 39.71672606611774 
Total events: 12927


 path 
Number of subj: 6 
Median events per subj 1500.0 
Total events proportion 25.755806808406046 
Total events: 8383


 phys 
Number of subj: 13 
Median events per subj 1200.0 
Total events proportion 34.52746712547622 
Total events: 11238



In [117]:
# Manually picked per-subject caps: at most this many segments are kept for
# each (subject, category) pair of df_evaluated.
# NOTE(review): earlier notes on this cell listed 1500/2000/1200 for training
# and 4000/2000/2000 for validation, which do not match the caps below -
# confirm which values are intended.
max_img_subj_cat = {
    'noise': 1000,  # largest cap: noise gets the most examples per subject
    'path': 750,  # smaller cap, as path is not so hard to identify
    'phys': 500
}
# Seeded generator so that Restart & Run All always draws the same subsample.
sampling_rng = np.random.RandomState(0)
sampled_parts = []
for inst in np.unique(df_evaluated.institution):
    df_inst = df_evaluated.loc[df_evaluated.institution == inst]
    print(f'subjects from inst {inst}')
    for subj in np.unique(df_inst.patient_id):
        df_subj = df_inst.loc[df_inst.patient_id == subj]
        for cat_id, cat_name in [(0, 'noise'), (1, 'path'), (2, 'phys')]:
            df_cat = df_subj.loc[df_subj.category_id == cat_id]
            n_keep = min(len(df_cat), max_img_subj_cat[cat_name])
            if n_keep:  # skip (subject, category) pairs without any segment
                sampled_parts.append(df_cat.sample(n_keep, random_state=sampling_rng))
# Concatenate once instead of growing the frame inside the loop; an empty input
# yields an empty frame with the same columns.
if sampled_parts:
    df_curated = pd.concat(sampled_parts).reset_index(drop=True)
else:
    df_curated = df_evaluated.iloc[:0].reset_index(drop=True)

subjects from inst mayo


In [118]:
# Eval
# Same per-subject / per-category count as before, now on the curated frame.
categories = [(0, 'noise'), (1, 'path'), (2, 'phys')]
counter = {cat_name: [] for _, cat_name in categories}
df_evaluated = df_curated
for inst in np.unique(df_evaluated.institution):
    inst_rows = df_evaluated.loc[df_evaluated.institution == inst]
    print(f'subjects from inst {inst}')
    for subj in np.unique(inst_rows.patient_id):
        subj_categories = inst_rows.loc[inst_rows.patient_id == subj, 'category_id']
        for cat_id, cat_name in categories:
            counter[cat_name].append(int((subj_categories == cat_id).sum()))
print(counter)
# Per-category summary; subjects with zero segments are left out of the
# subject count and of the median.
for cat, per_subject_counts in counter.items():
    counts = np.array(per_subject_counts)
    nonzero_counts = counts[counts != 0]
    total_events = np.sum(counts)
    print(
        '\n', cat,
        f'\nNumber of subj: {len(nonzero_counts)}',
        f'\nMedian events per subj {np.median(nonzero_counts)}',
        f'\nTotal events proportion {total_events*100/len(df_evaluated)}',
        f'\nTotal events: {total_events}\n',
    )

subjects from inst mayo
{'noise': [1000, 0, 466, 1000, 1000, 1000, 1000, 0, 0, 740, 0, 1000, 1000, 1000, 1000, 1000, 58, 761], 'path': [0, 500, 500, 0, 0, 0, 0, 0, 500, 0, 500, 0, 0, 0, 0, 0, 500, 500], 'phys': [330, 500, 399, 500, 500, 500, 0, 500, 0, 0, 498, 177, 500, 500, 0, 500, 0, 500]}

 noise 
Number of subj: 14 
Median events per subj 1000.0 
Total events proportion 57.456161307276986 
Total events: 12025


 path 
Number of subj: 6 
Median events per subj 500.0 
Total events proportion 14.334177457116919 
Total events: 3000


 phys 
Number of subj: 13 
Median events per subj 500.0 
Total events proportion 28.209661235606095 
Total events: 5904



In [61]:
# In this section df_curated is sampled from df_train (the eval cell before the
# sampling sets `df_evaluated = df_train`), so the balanced subset replaces the
# training split. Assigning it to df_val would overwrite the fnusa validation
# subjects with a subsample of the mayo training data (train/validation leakage).
# NOTE(review): df_val is therefore left unbalanced here - confirm whether the
# validation split should be balanced in a separate pass.
df_train = df_curated

In [71]:
# Sorted patient ids present in the train, validation and test splits (in that order).
tuple(np.unique(split.patient_id) for split in (df_train, df_val, df_test))

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 14, 16, 17, 18, 19, 20, 21,
        23]),
 array([ 7, 10]),
 array([ 1,  2,  3,  4,  5,  6,  8,  9, 11, 12, 13]))

In [63]:
# Write csv files
# One CSV per split; the DataFrame index is written under the column name 'index'.
split_destinations = [
    ('/scratch/mcesped/Datasets/Noise_detection/df_train.csv', df_train),
    ('/scratch/mcesped/Datasets/Noise_detection/df_val.csv', df_val),
    ('/scratch/mcesped/Datasets/Noise_detection/df_test.csv', df_test),
]
for csv_path, split_frame in split_destinations:
    split_frame.to_csv(csv_path, sep=",", index_label='index')

# Multiclass (3 classes)

In [73]:
# Drop the category-0 rows (called 'pw' in this notebook) and shift the
# remaining category ids down by one so that they start at 0.
# `.assign` returns a new frame instead of writing into the result of a `.loc`
# selection, which avoids pandas' SettingWithCopyWarning and guarantees that
# the raw df_fnusa / df_mayo tables are never modified.
df_fnusa_no_pw = (
    df_fnusa
    .loc[df_fnusa.category_id != 0]
    .assign(category_id=lambda frame: frame.category_id - 1)
)

df_mayo_no_pw = (
    df_mayo
    .loc[df_mayo.category_id != 0]
    .assign(category_id=lambda frame: frame.category_id - 1)
)

df_fnusa_no_pw.head()

Unnamed: 0_level_0,anatomy,category_id,channel,electrode_type,institution,patient_id,reviewer_id,segment_id,soz,category_name
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
7132,hippocampus anterior,1,B'1,depth,fnusa,1,2,y007132,1,pathology
7133,hippocampus anterior,1,B'1,depth,fnusa,1,2,y007133,1,pathology
7134,hippocampus anterior,1,B'1,depth,fnusa,1,2,y007134,1,pathology
7135,hippocampus anterior,1,B'1,depth,fnusa,1,2,y007135,1,pathology
7136,hippocampus anterior,1,B'1,depth,fnusa,1,2,y007136,1,pathology


In [74]:
# Training split: all mayo segments that remain after dropping category 0.
df_train = df_mayo_no_pw

In [75]:
# Validation split: the fnusa segments of subjects 7 and 10.
val_subject_ids = [7, 10]
is_val_subject = df_fnusa_no_pw.patient_id.isin(val_subject_ids)
df_val = df_fnusa_no_pw.loc[is_val_subject]
df_val.patient_id.unique()

array([ 7, 10])

In [76]:
# Test split: every fnusa subject except subjects 7 and 10.
excluded_subject_ids = [7, 10]
is_excluded = df_fnusa_no_pw.patient_id.isin(excluded_subject_ids)
df_test = df_fnusa_no_pw.loc[~is_excluded]
df_test.patient_id.unique()

array([ 1,  2,  3,  4,  5,  6,  8,  9, 11, 12, 13])

In [77]:
# Sorted patient ids present in the train, validation and test splits (in that order).
tuple(np.unique(split.patient_id) for split in (df_train, df_val, df_test))

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 14, 16, 17, 18, 19, 20, 21,
        23]),
 array([ 7, 10]),
 array([ 1,  2,  3,  4,  5,  6,  8,  9, 11, 12, 13]))

## Balance datasets

In [95]:
# Eval
# For every validation subject, count how many segments fall in each category;
# each list in `counter` holds one entry per subject.
categories = [(0, 'noise'), (1, 'path'), (2, 'phys')]
counter = {cat_name: [] for _, cat_name in categories}
df_evaluated = df_val
for inst in np.unique(df_evaluated.institution):
    inst_rows = df_evaluated.loc[df_evaluated.institution == inst]
    print(f'subjects from inst {inst}')
    for subj in np.unique(inst_rows.patient_id):
        subj_categories = inst_rows.loc[inst_rows.patient_id == subj, 'category_id']
        for cat_id, cat_name in categories:
            counter[cat_name].append(int((subj_categories == cat_id).sum()))
print(counter)

# Per-category summary; subjects with zero segments are left out of the
# subject count and of the median.
for cat, per_subject_counts in counter.items():
    counts = np.array(per_subject_counts)
    nonzero_counts = counts[counts != 0]
    total_events = np.sum(counts)
    print(
        '\n', cat,
        f'\nNumber of subj: {len(nonzero_counts)}',
        f'\nMedian events per subj {np.median(nonzero_counts)}',
        f'\nTotal events proportion {total_events*100/len(df_evaluated)}',
        f'\nTotal events: {total_events}\n',
    )

subjects from inst fnusa
{'noise': [5416, 5876], 'path': [7738, 4260], 'phys': [2689, 1545]}

 noise 
Number of subj: 2 
Median events per subj 5646.0 
Total events proportion 41.02601366080511 
Total events: 11292


 path 
Number of subj: 2 
Median events per subj 5999.0 
Total events proportion 43.591047812817905 
Total events: 11998


 phys 
Number of subj: 2 
Median events per subj 2117.0 
Total events proportion 15.38293852637698 
Total events: 4234



In [96]:
# Manually picked per-subject caps: at most this many segments are kept for
# each (subject, category) pair of df_evaluated.
# For training: 1100, 1500, 1200. Chosen to balance the classes as well as
# possible without taking too many segments of one class (path) from only a
# few subjects; path therefore ends up with a smaller share.
# For validation (the values below): 2000 each, and 2200 for phys, because one
# subject has fewer than 2000 phys segments.
max_img_subj_cat = {
    'noise': 2000,
    'path': 2000,
    'phys': 2200  # higher cap to make up for the subject with fewer than 2000 phys segments
}
# Seeded generator so that Restart & Run All always draws the same subsample.
sampling_rng = np.random.RandomState(0)
sampled_parts = []
for inst in np.unique(df_evaluated.institution):
    df_inst = df_evaluated.loc[df_evaluated.institution == inst]
    print(f'subjects from inst {inst}')
    for subj in np.unique(df_inst.patient_id):
        df_subj = df_inst.loc[df_inst.patient_id == subj]
        for cat_id, cat_name in [(0, 'noise'), (1, 'path'), (2, 'phys')]:
            df_cat = df_subj.loc[df_subj.category_id == cat_id]
            n_keep = min(len(df_cat), max_img_subj_cat[cat_name])
            if n_keep:  # skip (subject, category) pairs without any segment
                sampled_parts.append(df_cat.sample(n_keep, random_state=sampling_rng))
# Concatenate once instead of growing the frame inside the loop; an empty input
# yields an empty frame with the same columns.
if sampled_parts:
    df_curated = pd.concat(sampled_parts).reset_index(drop=True)
else:
    df_curated = df_evaluated.iloc[:0].reset_index(drop=True)

subjects from inst fnusa


In [97]:
# Eval
# Same per-subject / per-category count as before, now on the curated frame.
categories = [(0, 'noise'), (1, 'path'), (2, 'phys')]
counter = {cat_name: [] for _, cat_name in categories}
df_evaluated = df_curated
for inst in np.unique(df_evaluated.institution):
    inst_rows = df_evaluated.loc[df_evaluated.institution == inst]
    print(f'subjects from inst {inst}')
    for subj in np.unique(inst_rows.patient_id):
        subj_categories = inst_rows.loc[inst_rows.patient_id == subj, 'category_id']
        for cat_id, cat_name in categories:
            counter[cat_name].append(int((subj_categories == cat_id).sum()))
print(counter)
# Per-category summary; subjects with zero segments are left out of the
# subject count and of the median.
for cat, per_subject_counts in counter.items():
    counts = np.array(per_subject_counts)
    nonzero_counts = counts[counts != 0]
    total_events = np.sum(counts)
    print(
        '\n', cat,
        f'\nNumber of subj: {len(nonzero_counts)}',
        f'\nMedian events per subj {np.median(nonzero_counts)}',
        f'\nTotal events proportion {total_events*100/len(df_evaluated)}',
        f'\nTotal events: {total_events}\n',
    )

subjects from inst fnusa
{'noise': [2000, 2000], 'path': [2000, 2000], 'phys': [2200, 1545]}

 noise 
Number of subj: 2 
Median events per subj 2000.0 
Total events proportion 34.05704555129842 
Total events: 4000


 path 
Number of subj: 2 
Median events per subj 2000.0 
Total events proportion 34.05704555129842 
Total events: 4000


 phys 
Number of subj: 2 
Median events per subj 1872.5 
Total events proportion 31.88590889740315 
Total events: 3745



In [98]:
# Replace the validation split with its class-balanced subsample: in this
# section df_curated is sampled from df_val (the eval cell before the sampling
# sets `df_evaluated = df_val`).
df_val = df_curated

In [99]:
# Sorted patient ids present in the train, validation and test splits (in that order).
tuple(np.unique(split.patient_id) for split in (df_train, df_val, df_test))

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 14, 16, 17, 18, 19, 20, 21,
        23]),
 array([ 7, 10]),
 array([ 1,  2,  3,  4,  5,  6,  8,  9, 11, 12, 13]))

In [100]:
# Write csv files
# One CSV per split; the DataFrame index is written under the column name 'index'.
split_destinations = [
    ('/scratch/mcesped/Datasets/Multiclass/df_train.csv', df_train),
    ('/scratch/mcesped/Datasets/Multiclass/df_val.csv', df_val),
    ('/scratch/mcesped/Datasets/Multiclass/df_test.csv', df_test),
]
for csv_path, split_frame in split_destinations:
    split_frame.to_csv(csv_path, sep=",", index_label='index')