In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the per-segment annotation tables of both institutions; the CSV column
# named "index" becomes the DataFrame index.
df_mayo, df_fnusa = (
    pd.read_csv(csv_path, sep=",", index_col="index")
    for csv_path in (
        '/scratch/mcesped/Datasets/segments_mayo.csv',
        '/scratch/mcesped/Datasets/segments_fnusa.csv',
    )
)

# Noise detection

## Fnusa

I'll use fnusa for training as it has more data after curation

In [3]:
# Drop the power-line class (category_id == 0) and shift the remaining labels
# down by one, so that 0 = noise, 1 = path, 2 = phys.
# `.assign` builds a new frame, which avoids writing into a filtered slice
# (the source of pandas' SettingWithCopyWarning).
df_evaluated = (
    df_fnusa
    .loc[df_fnusa.category_id != 0]
    .assign(category_id=lambda frame: frame.category_id - 1)
)

# For every class: one entry per subject with the number of segments of that
# class (0 when the subject has none).
counter = {
    'noise':[],
    'path':[],
    'phys':[]
}
for inst in np.unique(df_evaluated.institution):
    df_inst = df_evaluated.loc[df_evaluated.institution==inst]
    print(f'subjects from inst {inst}')
    for subj in np.unique(df_inst.patient_id):
        df_subj = df_inst.loc[df_inst.patient_id==subj]
        for cat_id, cat_name in [(0, 'noise'), (1, 'path'), (2,'phys')]:
            counter[cat_name].append(len(df_subj.loc[df_subj.category_id==cat_id]))
print(counter)

subjects from inst fnusa
{'noise': [0, 2892, 12, 8463, 5059, 0, 5416, 18, 0, 5876, 3339, 1343, 181], 'path': [1912, 1657, 8076, 0, 1527, 1554, 7738, 1896, 6750, 4260, 4072, 7710, 5318], 'phys': [0, 7809, 0, 0, 5452, 962, 2689, 20860, 0, 1545, 2890, 38217, 14136]}


In [4]:
# Per-class summary: subjects that have the class, median count among those
# subjects, share of all evaluated segments, and absolute total.
for label, per_subject in counter.items():
    counts = np.array(per_subject)
    present = counts[counts != 0]
    total = np.sum(counts)
    print(
        label,
        f'\nNumber of subj: {len(present)}',
        f'\nMedian events per subj {np.median(present)}',
        f'\nTotal events proportion {total*100/len(df_evaluated)}',
        f'\nTotal events: {total}\n',
    )

noise 
Number of subj: 10 
Median events per subj 3115.5 
Total events proportion 18.147960518624497 
Total events: 32599

path 
Number of subj: 12 
Median events per subj 4166.0 
Total events proportion 29.21020547907075 
Total events: 52470

phys 
Number of subj: 9 
Median events per subj 5452.0 
Total events proportion 52.64183400230475 
Total events: 94560



In [5]:
# Curate: cap the number of segments kept per subject and per class.
max_img_subj_cat = {
    'noise': 3000,  # largest cap: give more weight to noise
    'path': 800,    # smaller cap: pathology is not so hard to identify
    'phys': 1300
}
# Seeded generator so that the same rows are drawn on every run.
sampler = np.random.RandomState(0)
# Collect the sampled pieces and concatenate once, instead of growing a
# DataFrame with pd.concat inside the loop.
sampled_parts = []
for inst in np.unique(df_evaluated.institution):
    df_inst = df_evaluated.loc[df_evaluated.institution==inst]
    print(f'subjects from inst {inst}')
    for subj in np.unique(df_inst.patient_id):
        df_subj = df_inst.loc[df_inst.patient_id==subj]
        for cat_id, cat_name in [(0, 'noise'), (1, 'path'), (2,'phys')]:
            df_cat = df_subj.loc[df_subj.category_id==cat_id]
            if df_cat.empty:
                continue  # this subject has no segments of this class
            n_keep = min(len(df_cat), max_img_subj_cat[cat_name])
            sampled_parts.append(df_cat.sample(n_keep, random_state=sampler))
if sampled_parts:
    df_curated = pd.concat(sampled_parts).reset_index(drop=True)
else:
    # Nothing matched: keep an empty frame with the same columns.
    df_curated = df_evaluated.iloc[0:0].reset_index(drop=True)

subjects from inst fnusa


In [6]:
# Total number of segments kept after the per-subject, per-class capping.
len(df_curated)

40408

In [7]:
# Recount per subject and per class on the curated frame, then summarise.
class_labels = ((0, 'noise'), (1, 'path'), (2, 'phys'))
counter = {label: [] for _, label in class_labels}
df_evaluated = df_curated
for inst in np.unique(df_evaluated.institution):
    inst_rows = df_evaluated[df_evaluated.institution == inst]
    print(f'subjects from inst {inst}')
    for patient in np.unique(inst_rows.patient_id):
        patient_categories = inst_rows.category_id[inst_rows.patient_id == patient]
        for class_id, label in class_labels:
            counter[label].append(int((patient_categories == class_id).sum()))

# Per-class summary: subjects that have the class, median count among those
# subjects, share of all evaluated segments, and absolute total.
for label, per_subject in counter.items():
    counts = np.array(per_subject)
    present = counts[counts != 0]
    total = np.sum(counts)
    print(
        label,
        f'\nNumber of subj: {len(present)}',
        f'\nMedian events per subj {np.median(present)}',
        f'\nTotal events proportion {total*100/len(df_evaluated)}',
        f'\nTotal events: {total}\n',
    )

subjects from inst fnusa
noise 
Number of subj: 10 
Median events per subj 2946.0 
Total events proportion 48.124133834884184 
Total events: 19446

path 
Number of subj: 12 
Median events per subj 800.0 
Total events proportion 23.75767174816868 
Total events: 9600

phys 
Number of subj: 9 
Median events per subj 1300.0 
Total events proportion 28.11819441694714 
Total events: 11362



Therefore, for a 70/30 split, we would need in the val set per class:

In [8]:
# 30 % of each class total: the validation-set target for a 70/30 split.
for label, per_subject in counter.items():
    class_total = np.sum(np.array(per_subject))
    print(label, f'\nTotal events in val set: {class_total*0.3}\n')

noise 
Total events in val set: 5833.8

path 
Total events in val set: 2880.0

phys 
Total events in val set: 3408.6



In [9]:
# This would mean a dist of:
total = 5833.8+2880.0+3408.6
5833.8/total, 2880.0/total, 3408.6/total
# This is great!

(0.48124133834884186, 0.2375767174816868, 0.2811819441694714)

In [10]:
# One row per subject: (noise, path, phys) segment counts.
per_subject_rows = zip(counter['noise'], counter['path'], counter['phys'])
for row in per_subject_rows:
    print(row)

(0, 800, 0)
(2892, 800, 1300)
(12, 800, 0)
(3000, 0, 0)
(3000, 800, 1300)
(0, 800, 962)
(3000, 800, 1300)
(18, 800, 1300)
(0, 800, 0)
(3000, 800, 1300)
(3000, 800, 1300)
(1343, 800, 1300)
(181, 800, 1300)


In [11]:
# Patient ids present in the curated frame.
df_curated.patient_id.unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13])

Based on this distribution:
**Val subjects:** 2, 7, 6, 9

- Noise: 2892, 3000, 0, 0
- Path: 800, 800, 800, 800
- Phys: 1300, 1300, 962, 0

In [13]:
# Rounded 30 % targets (noise, path, phys) minus the non-zero counts of the
# chosen validation subjects; a negative value means the selection exceeds
# the target.
5834 - (2892 + 3000), 2880.0 - 4 * 800, 3409 - (1300 + 1300 + 962)

(-58, -320.0, -153)

## Mayo

In [56]:
# Drop the power-line class (category_id == 0) and shift the remaining labels
# down by one, so that 0 = noise, 1 = path, 2 = phys.
# `.assign` builds a new frame, which avoids writing into a filtered slice
# (the source of pandas' SettingWithCopyWarning).
df_evaluated = (
    df_mayo
    .loc[df_mayo.category_id != 0]
    .assign(category_id=lambda frame: frame.category_id - 1)
)

# For every class: one entry per subject with the number of segments of that
# class (0 when the subject has none).
counter = {
    'noise':[],
    'path':[],
    'phys':[]
}
for inst in np.unique(df_evaluated.institution):
    df_inst = df_evaluated.loc[df_evaluated.institution==inst]
    print(f'subjects from inst {inst}')
    for subj in np.unique(df_inst.patient_id):
        df_subj = df_inst.loc[df_inst.patient_id==subj]
        for cat_id, cat_name in [(0, 'noise'), (1, 'path'), (2,'phys')]:
            counter[cat_name].append(len(df_subj.loc[df_subj.category_id==cat_id]))
print(counter)

subjects from inst mayo
{'noise': [2318, 0, 466, 4636, 2063, 1002, 12873, 0, 0, 740, 0, 3699, 4096, 1700, 5613, 1278, 58, 761], 'path': [0, 883, 1923, 0, 0, 0, 0, 0, 2816, 0, 3426, 0, 0, 0, 0, 0, 3432, 2747], 'phys': [330, 8653, 399, 2057, 790, 6583, 0, 25951, 0, 0, 498, 177, 6098, 3126, 0, 1424, 0, 644]}


In [42]:
# Per-class summary: subjects that have the class, median count among those
# subjects, share of all evaluated segments, and absolute total.
for label, per_subject in counter.items():
    counts = np.array(per_subject)
    present = counts[counts != 0]
    total = np.sum(counts)
    print(
        label,
        f'\nNumber of subj: {len(present)}',
        f'\nMedian events per subj {np.median(present)}',
        f'\nTotal events proportion {total*100/len(df_evaluated)}',
        f'\nTotal events: {total}\n',
    )

noise 
Number of subj: 14 
Median events per subj 1881.5 
Total events proportion 36.46742009535582 
Total events: 41303

path 
Number of subj: 6 
Median events per subj 2781.5 
Total events proportion 13.444287480134204 
Total events: 15227

phys 
Number of subj: 13 
Median events per subj 1424.0 
Total events proportion 50.08829242450998 
Total events: 56730



In [57]:
# Curate: cap the number of segments kept per subject and per class.
max_img_subj_cat = {
    'noise': 1800,  # largest cap: give more weight to noise
    'path': 1000,   # smaller cap: pathology is not so hard to identify
    'phys': 1000
}
# Seeded generator so that the same rows are drawn on every run.
sampler = np.random.RandomState(0)
# Collect the sampled pieces and concatenate once, instead of growing a
# DataFrame with pd.concat inside the loop.
sampled_parts = []
for inst in np.unique(df_evaluated.institution):
    df_inst = df_evaluated.loc[df_evaluated.institution==inst]
    print(f'subjects from inst {inst}')
    for subj in np.unique(df_inst.patient_id):
        df_subj = df_inst.loc[df_inst.patient_id==subj]
        for cat_id, cat_name in [(0, 'noise'), (1, 'path'), (2,'phys')]:
            df_cat = df_subj.loc[df_subj.category_id==cat_id]
            if df_cat.empty:
                continue  # this subject has no segments of this class
            n_keep = min(len(df_cat), max_img_subj_cat[cat_name])
            sampled_parts.append(df_cat.sample(n_keep, random_state=sampler))
if sampled_parts:
    df_curated = pd.concat(sampled_parts).reset_index(drop=True)
else:
    # Nothing matched: keep an empty frame with the same columns.
    df_curated = df_evaluated.iloc[0:0].reset_index(drop=True)

subjects from inst mayo


In [59]:
# Total number of segments kept after the per-subject, per-class capping.
len(df_curated)

34326

In [58]:
# Recount per subject and per class on the curated frame, then summarise.
class_labels = ((0, 'noise'), (1, 'path'), (2, 'phys'))
counter = {label: [] for _, label in class_labels}
df_evaluated = df_curated
for inst in np.unique(df_evaluated.institution):
    inst_rows = df_evaluated[df_evaluated.institution == inst]
    print(f'subjects from inst {inst}')
    for patient in np.unique(inst_rows.patient_id):
        patient_categories = inst_rows.category_id[inst_rows.patient_id == patient]
        for class_id, label in class_labels:
            counter[label].append(int((patient_categories == class_id).sum()))

# Per-class summary: subjects that have the class, median count among those
# subjects, share of all evaluated segments, and absolute total.
for label, per_subject in counter.items():
    counts = np.array(per_subject)
    present = counts[counts != 0]
    total = np.sum(counts)
    print(
        label,
        f'\nNumber of subj: {len(present)}',
        f'\nMedian events per subj {np.median(present)}',
        f'\nTotal events proportion {total*100/len(df_evaluated)}',
        f'\nTotal events: {total}\n',
    )

subjects from inst mayo
noise 
Number of subj: 14 
Median events per subj 1750.0 
Total events proportion 54.20089727903047 
Total events: 18605

path 
Number of subj: 6 
Median events per subj 1000.0 
Total events proportion 17.138612130746374 
Total events: 5883

phys 
Number of subj: 13 
Median events per subj 1000.0 
Total events proportion 28.660490590223155 
Total events: 9838



Only two subjects are chosen for validation: **number 7 and number 10 from fnusa**, as they are balanced between classes. Together they represent ~15% of the total fnusa data.

# Noise detection dataset
I'll balance the classes manually to have a similar number of training examples per subject. The biggest dataset will be used for training.

In [3]:
# Drop the power-line class (category_id == 0) from each institution and shift
# the remaining labels down by one. `.assign` returns a new frame, so nothing
# is written into a filtered slice (avoids SettingWithCopyWarning).
df_fnusa_no_pw = (
    df_fnusa
    .loc[df_fnusa.category_id != 0]
    .assign(category_id=lambda frame: frame.category_id - 1)
)

df_mayo_no_pw = (
    df_mayo
    .loc[df_mayo.category_id != 0]
    .assign(category_id=lambda frame: frame.category_id - 1)
)

df_fnusa_no_pw.head()

Unnamed: 0_level_0,anatomy,category_id,channel,electrode_type,institution,patient_id,reviewer_id,segment_id,soz,category_name
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
7132,hippocampus anterior,1,B'1,depth,fnusa,1,2,y007132,1,pathology
7133,hippocampus anterior,1,B'1,depth,fnusa,1,2,y007133,1,pathology
7134,hippocampus anterior,1,B'1,depth,fnusa,1,2,y007134,1,pathology
7135,hippocampus anterior,1,B'1,depth,fnusa,1,2,y007135,1,pathology
7136,hippocampus anterior,1,B'1,depth,fnusa,1,2,y007136,1,pathology


## Balance Fnusa

Not touching test set as I'll probably do bootstrapping for that

In [4]:
# Eval: per-subject, per-class segment counts of the FNUSA frame without the
# power-line class, followed by a per-class summary.
class_labels = ((0, 'noise'), (1, 'path'), (2, 'phys'))
counter = {label: [] for _, label in class_labels}
df_evaluated = df_fnusa_no_pw
for inst in np.unique(df_evaluated.institution):
    inst_rows = df_evaluated[df_evaluated.institution == inst]
    print(f'subjects from inst {inst}')
    for patient in np.unique(inst_rows.patient_id):
        patient_categories = inst_rows.category_id[inst_rows.patient_id == patient]
        for class_id, label in class_labels:
            counter[label].append(int((patient_categories == class_id).sum()))
print(counter)

for label, per_subject in counter.items():
    counts = np.array(per_subject)
    present = counts[counts != 0]
    total = np.sum(counts)
    print(
        '\n',
        label,
        f'\nNumber of subj: {len(present)}',
        f'\nMedian events per subj {np.median(present)}',
        f'\nTotal events proportion {total*100/len(df_evaluated)}',
        f'\nTotal events: {total}\n',
    )

subjects from inst fnusa
{'noise': [0, 2892, 12, 8463, 5059, 0, 5416, 18, 0, 5876, 3339, 1343, 181], 'path': [1912, 1657, 8076, 0, 1527, 1554, 7738, 1896, 6750, 4260, 4072, 7710, 5318], 'phys': [0, 7809, 0, 0, 5452, 962, 2689, 20860, 0, 1545, 2890, 38217, 14136]}

 noise 
Number of subj: 10 
Median events per subj 3115.5 
Total events proportion 18.147960518624497 
Total events: 32599


 path 
Number of subj: 12 
Median events per subj 4166.0 
Total events proportion 29.21020547907075 
Total events: 52470


 phys 
Number of subj: 9 
Median events per subj 5452.0 
Total events proportion 52.64183400230475 
Total events: 94560



In [5]:
# Manually picked caps on the number of segments kept per subject and class.
# NOTE(review): the original notes here read "For training: 1500, 2000, 1200.
# For val 4000, 2000, 2000", which does not match the values below; confirm
# which figures are intended.
max_img_subj_cat = {
    'noise': 3000,  # largest cap: give more weight to noise
    'path': 800,    # smaller cap: pathology is not so hard to identify
    'phys': 1500
}
# Seeded generator so that the same rows are drawn on every run.
sampler = np.random.RandomState(0)
# Collect the sampled pieces and concatenate once, instead of growing a
# DataFrame with pd.concat inside the loop.
sampled_parts = []
for inst in np.unique(df_evaluated.institution):
    df_inst = df_evaluated.loc[df_evaluated.institution==inst]
    print(f'subjects from inst {inst}')
    for subj in np.unique(df_inst.patient_id):
        df_subj = df_inst.loc[df_inst.patient_id==subj]
        for cat_id, cat_name in [(0, 'noise'), (1, 'path'), (2,'phys')]:
            df_cat = df_subj.loc[df_subj.category_id==cat_id]
            if df_cat.empty:
                continue  # this subject has no segments of this class
            n_keep = min(len(df_cat), max_img_subj_cat[cat_name])
            sampled_parts.append(df_cat.sample(n_keep, random_state=sampler))
if sampled_parts:
    df_curated = pd.concat(sampled_parts).reset_index(drop=True)
else:
    # Nothing matched: keep an empty frame with the same columns.
    df_curated = df_evaluated.iloc[0:0].reset_index(drop=True)

subjects from inst fnusa


In [6]:
# Eval: per-subject, per-class segment counts of the curated frame, followed
# by a per-class summary.
class_labels = ((0, 'noise'), (1, 'path'), (2, 'phys'))
counter = {label: [] for _, label in class_labels}
df_evaluated = df_curated
for inst in np.unique(df_evaluated.institution):
    inst_rows = df_evaluated[df_evaluated.institution == inst]
    print(f'subjects from inst {inst}')
    for patient in np.unique(inst_rows.patient_id):
        patient_categories = inst_rows.category_id[inst_rows.patient_id == patient]
        for class_id, label in class_labels:
            counter[label].append(int((patient_categories == class_id).sum()))
print(counter)
for label, per_subject in counter.items():
    counts = np.array(per_subject)
    present = counts[counts != 0]
    total = np.sum(counts)
    print(
        '\n',
        label,
        f'\nNumber of subj: {len(present)}',
        f'\nMedian events per subj {np.median(present)}',
        f'\nTotal events proportion {total*100/len(df_evaluated)}',
        f'\nTotal events: {total}\n',
    )

subjects from inst fnusa
{'noise': [0, 2892, 12, 3000, 3000, 0, 3000, 18, 0, 3000, 3000, 1343, 181], 'path': [800, 800, 800, 0, 800, 800, 800, 800, 800, 800, 800, 800, 800], 'phys': [0, 1500, 0, 0, 1500, 962, 1500, 1500, 0, 1500, 1500, 1500, 1500]}

 noise 
Number of subj: 10 
Median events per subj 2946.0 
Total events proportion 46.29118263187964 
Total events: 19446


 path 
Number of subj: 12 
Median events per subj 800.0 
Total events proportion 22.852789944772425 
Total events: 9600


 phys 
Number of subj: 9 
Median events per subj 1500.0 
Total events proportion 30.856027423347935 
Total events: 12962



In [7]:
# Keep a handle on the curated FNUSA frame; df_curated is rebound later when
# the Mayo data is curated.
df_curated_fnusa = df_curated

In [18]:
# Number of segments in the curated FNUSA set.
len(df_curated_fnusa)

42008

## Balance Mayo

In [8]:
# Eval: per-subject, per-class segment counts of the Mayo frame without the
# power-line class, followed by a per-class summary.
class_labels = ((0, 'noise'), (1, 'path'), (2, 'phys'))
counter = {label: [] for _, label in class_labels}
df_evaluated = df_mayo_no_pw
for inst in np.unique(df_evaluated.institution):
    inst_rows = df_evaluated[df_evaluated.institution == inst]
    print(f'subjects from inst {inst}')
    for patient in np.unique(inst_rows.patient_id):
        patient_categories = inst_rows.category_id[inst_rows.patient_id == patient]
        for class_id, label in class_labels:
            counter[label].append(int((patient_categories == class_id).sum()))
print(counter)

for label, per_subject in counter.items():
    counts = np.array(per_subject)
    present = counts[counts != 0]
    total = np.sum(counts)
    print(
        '\n',
        label,
        f'\nNumber of subj: {len(present)}',
        f'\nMedian events per subj {np.median(present)}',
        f'\nTotal events proportion {total*100/len(df_evaluated)}',
        f'\nTotal events: {total}\n',
    )

subjects from inst mayo
{'noise': [2318, 0, 466, 4636, 2063, 1002, 12873, 0, 0, 740, 0, 3699, 4096, 1700, 5613, 1278, 58, 761], 'path': [0, 883, 1923, 0, 0, 0, 0, 0, 2816, 0, 3426, 0, 0, 0, 0, 0, 3432, 2747], 'phys': [330, 8653, 399, 2057, 790, 6583, 0, 25951, 0, 0, 498, 177, 6098, 3126, 0, 1424, 0, 644]}

 noise 
Number of subj: 14 
Median events per subj 1881.5 
Total events proportion 36.46742009535582 
Total events: 41303


 path 
Number of subj: 6 
Median events per subj 2781.5 
Total events proportion 13.444287480134204 
Total events: 15227


 phys 
Number of subj: 13 
Median events per subj 1424.0 
Total events proportion 50.08829242450998 
Total events: 56730



In [9]:
# Manually picked caps on the number of segments kept per subject and class.
# NOTE(review): the original notes here read "For training: 1500, 2000, 1200.
# For val 4000, 2000, 2000", which does not match the values below; confirm
# which figures are intended.
max_img_subj_cat = {
    'noise': 2000,  # largest cap: give more weight to noise
    'path': 1500,   # smaller cap: pathology is not so hard to identify
    'phys': 1300
}
# Seeded generator so that the same rows are drawn on every run.
sampler = np.random.RandomState(0)
# Collect the sampled pieces and concatenate once, instead of growing a
# DataFrame with pd.concat inside the loop.
sampled_parts = []
for inst in np.unique(df_evaluated.institution):
    df_inst = df_evaluated.loc[df_evaluated.institution==inst]
    print(f'subjects from inst {inst}')
    for subj in np.unique(df_inst.patient_id):
        df_subj = df_inst.loc[df_inst.patient_id==subj]
        for cat_id, cat_name in [(0, 'noise'), (1, 'path'), (2,'phys')]:
            df_cat = df_subj.loc[df_subj.category_id==cat_id]
            if df_cat.empty:
                continue  # this subject has no segments of this class
            n_keep = min(len(df_cat), max_img_subj_cat[cat_name])
            sampled_parts.append(df_cat.sample(n_keep, random_state=sampler))
if sampled_parts:
    df_curated = pd.concat(sampled_parts).reset_index(drop=True)
else:
    # Nothing matched: keep an empty frame with the same columns.
    df_curated = df_evaluated.iloc[0:0].reset_index(drop=True)

subjects from inst mayo


In [10]:
# Eval: per-subject, per-class segment counts of the curated frame, followed
# by a per-class summary.
class_labels = ((0, 'noise'), (1, 'path'), (2, 'phys'))
counter = {label: [] for _, label in class_labels}
df_evaluated = df_curated
for inst in np.unique(df_evaluated.institution):
    inst_rows = df_evaluated[df_evaluated.institution == inst]
    print(f'subjects from inst {inst}')
    for patient in np.unique(inst_rows.patient_id):
        patient_categories = inst_rows.category_id[inst_rows.patient_id == patient]
        for class_id, label in class_labels:
            counter[label].append(int((patient_categories == class_id).sum()))
print(counter)
for label, per_subject in counter.items():
    counts = np.array(per_subject)
    present = counts[counts != 0]
    total = np.sum(counts)
    print(
        '\n',
        label,
        f'\nNumber of subj: {len(present)}',
        f'\nMedian events per subj {np.median(present)}',
        f'\nTotal events proportion {total*100/len(df_evaluated)}',
        f'\nTotal events: {total}\n',
    )

subjects from inst mayo
{'noise': [2000, 0, 466, 2000, 2000, 1002, 2000, 0, 0, 740, 0, 2000, 2000, 1700, 2000, 1278, 58, 761], 'path': [0, 883, 1500, 0, 0, 0, 0, 0, 1500, 0, 1500, 0, 0, 0, 0, 0, 1500, 1500], 'phys': [330, 1300, 399, 1300, 790, 1300, 0, 1300, 0, 0, 498, 177, 1300, 1300, 0, 1300, 0, 644]}

 noise 
Number of subj: 14 
Median events per subj 1850.0 
Total events proportion 49.6081932252145 
Total events: 20005


 path 
Number of subj: 6 
Median events per subj 1500.0 
Total events proportion 20.788077171055896 
Total events: 8383


 phys 
Number of subj: 13 
Median events per subj 1300.0 
Total events proportion 29.603729603729604 
Total events: 11938



In [11]:
# Keep a handle on the curated Mayo frame under its own name.
df_curated_mayo = df_curated

In [12]:
# Sizes of the curated Mayo and FNUSA sets, in that order.
len(df_curated_mayo), len(df_curated_fnusa)

(40326, 42008)

In [13]:
# FNUSA is used as the training set and Mayo as the validation set.
df_train = df_curated_fnusa
df_val = df_curated_mayo

In [14]:
# Persist the curated splits (FNUSA = train, Mayo = val). Both files share the
# same CSV options; the frame index is written under the column name "index".
# No test split is written here (the df_test export was left disabled).
csv_options = dict(sep=",", index_label='index')
df_train.to_csv('/scratch/mcesped/Datasets/Noise_detection/df_train_curated.csv', **csv_options)
df_val.to_csv('/scratch/mcesped/Datasets/Noise_detection/df_val_curated.csv', **csv_options)

In [16]:
# Persist the full (uncapped) frames without the power-line class:
# FNUSA as train, Mayo as val.
csv_options = dict(sep=",", index_label='index')
df_fnusa_no_pw.to_csv('/scratch/mcesped/Datasets/Noise_detection/df_train_full.csv', **csv_options)
df_mayo_no_pw.to_csv('/scratch/mcesped/Datasets/Noise_detection/df_val_full.csv', **csv_options)

# Multiclass (3 classes)

In [3]:
# Drop the power-line class (category_id == 0) from each institution and shift
# the remaining labels down by one. `.assign` returns a new frame, so nothing
# is written into a filtered slice (avoids SettingWithCopyWarning).
df_fnusa_no_pw = (
    df_fnusa
    .loc[df_fnusa.category_id != 0]
    .assign(category_id=lambda frame: frame.category_id - 1)
)

df_mayo_no_pw = (
    df_mayo
    .loc[df_mayo.category_id != 0]
    .assign(category_id=lambda frame: frame.category_id - 1)
)

df_fnusa_no_pw.head()

Unnamed: 0_level_0,anatomy,category_id,channel,electrode_type,institution,patient_id,reviewer_id,segment_id,soz,category_name
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
7132,hippocampus anterior,1,B'1,depth,fnusa,1,2,y007132,1,pathology
7133,hippocampus anterior,1,B'1,depth,fnusa,1,2,y007133,1,pathology
7134,hippocampus anterior,1,B'1,depth,fnusa,1,2,y007134,1,pathology
7135,hippocampus anterior,1,B'1,depth,fnusa,1,2,y007135,1,pathology
7136,hippocampus anterior,1,B'1,depth,fnusa,1,2,y007136,1,pathology


## Balance Fnusa

Not touching test set as I'll probably do bootstrapping for that

In [7]:
# Eval: per-subject, per-class segment counts of the FNUSA frame without the
# power-line class, followed by a per-class summary.
class_labels = ((0, 'noise'), (1, 'path'), (2, 'phys'))
counter = {label: [] for _, label in class_labels}
df_evaluated = df_fnusa_no_pw
for inst in np.unique(df_evaluated.institution):
    inst_rows = df_evaluated[df_evaluated.institution == inst]
    print(f'subjects from inst {inst}')
    for patient in np.unique(inst_rows.patient_id):
        patient_categories = inst_rows.category_id[inst_rows.patient_id == patient]
        for class_id, label in class_labels:
            counter[label].append(int((patient_categories == class_id).sum()))
print(counter)

for label, per_subject in counter.items():
    counts = np.array(per_subject)
    present = counts[counts != 0]
    total = np.sum(counts)
    print(
        '\n',
        label,
        f'\nNumber of subj: {len(present)}',
        f'\nMedian events per subj {np.median(present)}',
        f'\nTotal events proportion {total*100/len(df_evaluated)}',
        f'\nTotal events: {total}\n',
    )

subjects from inst fnusa
{'noise': [0, 2892, 12, 8463, 5059, 0, 5416, 18, 0, 5876, 3339, 1343, 181], 'path': [1912, 1657, 8076, 0, 1527, 1554, 7738, 1896, 6750, 4260, 4072, 7710, 5318], 'phys': [0, 7809, 0, 0, 5452, 962, 2689, 20860, 0, 1545, 2890, 38217, 14136]}

 noise 
Number of subj: 10 
Median events per subj 3115.5 
Total events proportion 18.147960518624497 
Total events: 32599


 path 
Number of subj: 12 
Median events per subj 4166.0 
Total events proportion 29.21020547907075 
Total events: 52470


 phys 
Number of subj: 9 
Median events per subj 5452.0 
Total events proportion 52.64183400230475 
Total events: 94560



In [8]:

# Caps on the number of segments kept per subject and per class.
max_img_subj_cat = {
    'noise': 3100,  # give more weight to noise and physiology
    'path': 500,    # fewer pathology samples, as they are not so hard to identify (was 1000 before)
    'phys': 4000
}
# Seeded generator so that the same rows are drawn on every run.
sampler = np.random.RandomState(0)
# Collect the sampled pieces and concatenate once, instead of growing a
# DataFrame with pd.concat inside the loop.
sampled_parts = []
for inst in np.unique(df_evaluated.institution):
    df_inst = df_evaluated.loc[df_evaluated.institution==inst]
    print(f'subjects from inst {inst}')
    for subj in np.unique(df_inst.patient_id):
        df_subj = df_inst.loc[df_inst.patient_id==subj]
        for cat_id, cat_name in [(0, 'noise'), (1, 'path'), (2,'phys')]:
            df_cat = df_subj.loc[df_subj.category_id==cat_id]
            if df_cat.empty:
                continue  # this subject has no segments of this class
            n_keep = min(len(df_cat), max_img_subj_cat[cat_name])
            sampled_parts.append(df_cat.sample(n_keep, random_state=sampler))
if sampled_parts:
    df_curated = pd.concat(sampled_parts).reset_index(drop=True)
else:
    # Nothing matched: keep an empty frame with the same columns.
    df_curated = df_evaluated.iloc[0:0].reset_index(drop=True)

subjects from inst fnusa


In [9]:
# Eval: per-subject, per-class segment counts of the curated frame, followed
# by a per-class summary.
class_labels = ((0, 'noise'), (1, 'path'), (2, 'phys'))
counter = {label: [] for _, label in class_labels}
df_evaluated = df_curated
for inst in np.unique(df_evaluated.institution):
    inst_rows = df_evaluated[df_evaluated.institution == inst]
    print(f'subjects from inst {inst}')
    for patient in np.unique(inst_rows.patient_id):
        patient_categories = inst_rows.category_id[inst_rows.patient_id == patient]
        for class_id, label in class_labels:
            counter[label].append(int((patient_categories == class_id).sum()))
print(counter)
for label, per_subject in counter.items():
    counts = np.array(per_subject)
    present = counts[counts != 0]
    total = np.sum(counts)
    print(
        '\n',
        label,
        f'\nNumber of subj: {len(present)}',
        f'\nMedian events per subj {np.median(present)}',
        f'\nTotal events proportion {total*100/len(df_evaluated)}',
        f'\nTotal events: {total}\n',
    )

subjects from inst fnusa
{'noise': [0, 2892, 12, 3100, 3100, 0, 3100, 18, 0, 3100, 3100, 1343, 181], 'path': [500, 500, 500, 0, 500, 500, 500, 500, 500, 500, 500, 500, 500], 'phys': [0, 4000, 0, 0, 4000, 962, 2689, 4000, 0, 1545, 2890, 4000, 4000]}

 noise 
Number of subj: 10 
Median events per subj 2996.0 
Total events proportion 36.91516138584542 
Total events: 19946


 path 
Number of subj: 12 
Median events per subj 500.0 
Total events proportion 11.10453064850459 
Total events: 6000


 phys 
Number of subj: 9 
Median events per subj 4000.0 
Total events proportion 51.980307965649985 
Total events: 28086



In [10]:
# Keep a handle on the curated FNUSA frame; df_curated is rebound later when
# the Mayo data is curated.
df_curated_fnusa = df_curated

In [11]:
# Number of segments in the curated FNUSA set.
len(df_curated_fnusa)

54032

## Balance Mayo

In [12]:
# Eval: per-subject, per-class segment counts of the Mayo frame without the
# power-line class, followed by a per-class summary.
class_labels = ((0, 'noise'), (1, 'path'), (2, 'phys'))
counter = {label: [] for _, label in class_labels}
df_evaluated = df_mayo_no_pw
for inst in np.unique(df_evaluated.institution):
    inst_rows = df_evaluated[df_evaluated.institution == inst]
    print(f'subjects from inst {inst}')
    for patient in np.unique(inst_rows.patient_id):
        patient_categories = inst_rows.category_id[inst_rows.patient_id == patient]
        for class_id, label in class_labels:
            counter[label].append(int((patient_categories == class_id).sum()))
print(counter)

for label, per_subject in counter.items():
    counts = np.array(per_subject)
    present = counts[counts != 0]
    total = np.sum(counts)
    print(
        '\n',
        label,
        f'\nNumber of subj: {len(present)}',
        f'\nMedian events per subj {np.median(present)}',
        f'\nTotal events proportion {total*100/len(df_evaluated)}',
        f'\nTotal events: {total}\n',
    )

subjects from inst mayo
{'noise': [2318, 0, 466, 4636, 2063, 1002, 12873, 0, 0, 740, 0, 3699, 4096, 1700, 5613, 1278, 58, 761], 'path': [0, 883, 1923, 0, 0, 0, 0, 0, 2816, 0, 3426, 0, 0, 0, 0, 0, 3432, 2747], 'phys': [330, 8653, 399, 2057, 790, 6583, 0, 25951, 0, 0, 498, 177, 6098, 3126, 0, 1424, 0, 644]}

 noise 
Number of subj: 14 
Median events per subj 1881.5 
Total events proportion 36.46742009535582 
Total events: 41303


 path 
Number of subj: 6 
Median events per subj 2781.5 
Total events proportion 13.444287480134204 
Total events: 15227


 phys 
Number of subj: 13 
Median events per subj 1424.0 
Total events proportion 50.08829242450998 
Total events: 56730



In [13]:
# Manually picked caps on the number of segments kept per subject and class.
# NOTE(review): the original notes here read "For training: 1500, 2000, 1200.
# For val 4000, 2000, 2000", and the inline remarks said to favour noise and
# to keep fewer pathology samples. Neither matches the values below (path has
# the largest cap, noise the smallest); confirm the intended weighting.
max_img_subj_cat = {
    'noise': 1500,
    'path': 2800,
    'phys': 2000
}
# Seeded generator so that the same rows are drawn on every run.
sampler = np.random.RandomState(0)
# Collect the sampled pieces and concatenate once, instead of growing a
# DataFrame with pd.concat inside the loop.
sampled_parts = []
for inst in np.unique(df_evaluated.institution):
    df_inst = df_evaluated.loc[df_evaluated.institution==inst]
    print(f'subjects from inst {inst}')
    for subj in np.unique(df_inst.patient_id):
        df_subj = df_inst.loc[df_inst.patient_id==subj]
        for cat_id, cat_name in [(0, 'noise'), (1, 'path'), (2,'phys')]:
            df_cat = df_subj.loc[df_subj.category_id==cat_id]
            if df_cat.empty:
                continue  # this subject has no segments of this class
            n_keep = min(len(df_cat), max_img_subj_cat[cat_name])
            sampled_parts.append(df_cat.sample(n_keep, random_state=sampler))
if sampled_parts:
    df_curated = pd.concat(sampled_parts).reset_index(drop=True)
else:
    # Nothing matched: keep an empty frame with the same columns.
    df_curated = df_evaluated.iloc[0:0].reset_index(drop=True)

subjects from inst mayo


In [14]:
# Eval: per-subject, per-class segment counts of the curated frame, followed
# by a per-class summary.
class_labels = ((0, 'noise'), (1, 'path'), (2, 'phys'))
counter = {label: [] for _, label in class_labels}
df_evaluated = df_curated
for inst in np.unique(df_evaluated.institution):
    inst_rows = df_evaluated[df_evaluated.institution == inst]
    print(f'subjects from inst {inst}')
    for patient in np.unique(inst_rows.patient_id):
        patient_categories = inst_rows.category_id[inst_rows.patient_id == patient]
        for class_id, label in class_labels:
            counter[label].append(int((patient_categories == class_id).sum()))
print(counter)
for label, per_subject in counter.items():
    counts = np.array(per_subject)
    present = counts[counts != 0]
    total = np.sum(counts)
    print(
        '\n',
        label,
        f'\nNumber of subj: {len(present)}',
        f'\nMedian events per subj {np.median(present)}',
        f'\nTotal events proportion {total*100/len(df_evaluated)}',
        f'\nTotal events: {total}\n',
    )

subjects from inst mayo
{'noise': [1500, 0, 466, 1500, 1500, 1002, 1500, 0, 0, 740, 0, 1500, 1500, 1500, 1500, 1278, 58, 761], 'path': [0, 883, 1923, 0, 0, 0, 0, 0, 2800, 0, 2800, 0, 0, 0, 0, 0, 2800, 2747], 'phys': [330, 2000, 399, 2000, 790, 2000, 0, 2000, 0, 0, 498, 177, 2000, 2000, 0, 1424, 0, 644]}

 noise 
Number of subj: 14 
Median events per subj 1500.0 
Total events proportion 35.04944110060189 
Total events: 16305


 path 
Number of subj: 6 
Median events per subj 2773.5 
Total events proportion 29.993551160791057 
Total events: 13953


 phys 
Number of subj: 13 
Median events per subj 1424.0 
Total events proportion 34.957007738607054 
Total events: 16262



In [15]:
# Keep a handle on the curated Mayo frame under its own name.
df_curated_mayo = df_curated

In [16]:
# Sizes of the curated Mayo and FNUSA sets, in that order.
len(df_curated_mayo), len(df_curated_fnusa)

(46520, 54032)

In [17]:
# FNUSA is used as the training set and Mayo as the validation set.
df_train = df_curated_fnusa
df_val = df_curated_mayo

In [18]:
# Persist the curated splits (FNUSA = train, Mayo = val). Both files share the
# same CSV options; the frame index is written under the column name "index".
# No test split is written here (the df_test export was left disabled).
csv_options = dict(sep=",", index_label='index')
df_train.to_csv('/scratch/mcesped/Datasets/Multiclass/df_train_curated.csv', **csv_options)
df_val.to_csv('/scratch/mcesped/Datasets/Multiclass/df_val_curated.csv', **csv_options)

In [47]:
# Persist the full (uncapped) frames without the power-line class:
# FNUSA as train, Mayo as val.
csv_options = dict(sep=",", index_label='index')
df_fnusa_no_pw.to_csv('/scratch/mcesped/Datasets/Multiclass/df_train_full.csv', **csv_options)
df_mayo_no_pw.to_csv('/scratch/mcesped/Datasets/Multiclass/df_val_full.csv', **csv_options)