In [1]:
from torch import tensor, argmax, randn
from torch.nn import CrossEntropyLoss, BCELoss, NLLLoss, LogSoftmax
import pandas as pd

In [35]:
# Load the file listing of each split (train / val / test) and derive a
# 'diagnosis' column from every path: the label is the 5th '_'-separated
# token of the full path string (index 4).
train_list, val_list, test_list = (
    pd.read_csv(csv_path).assign(
        diagnosis=lambda listing: listing['file'].apply(lambda path: path.split('_')[4])
    )
    for csv_path in (
        'data.nosync/networks_multi/train_set_files.csv',
        'data.nosync/networks_multi/val_set_files.csv',
        'data.nosync/networks_multi/test_set_files.csv',
    )
)

train_list

Unnamed: 0,file,diagnosis
0,data.nosync/networks_multi/1320247_run-1_ADHD2...,TD
1,data.nosync/networks_multi/8415034_run-2_ADHD2...,TD
2,data.nosync/networks_multi/3011311_run-2_ADHD2...,TD
3,data.nosync/networks_multi/0010087_run-2_ADHD2...,ADHD
4,data.nosync/networks_multi/0010030_run-2_ADHD2...,ADHD
...,...,...
436,data.nosync/networks_multi/0010115_run-1_ADHD2...,ADHD
437,data.nosync/networks_multi/0010086_run-2_ADHD2...,ADHD
438,data.nosync/networks_multi/1127915_run-1_ADHD2...,TD
439,data.nosync/networks_multi/2136051_run-1_ADHD2...,TD


In [36]:
# Class distribution of the training split as a dict of parallel lists:
#   'file'  -> number of rows per diagnosis
#   'total' -> overall row count (repeated for every class)
#   'prob'  -> relative frequency of each diagnosis
train_dist = (
    train_list
    .groupby('diagnosis')
    .count()
    .reset_index()
    .assign(total=lambda counts: counts['file'].sum())
    .assign(prob=lambda counts: counts['file'] / counts['total'])
    .to_dict(orient='list')
)
train_dist

{'diagnosis': ['ADHD', 'ASD', 'ASD-ADHD', 'TD'],
 'file': [143, 58, 23, 217],
 'total': [441, 441, 441, 441],
 'prob': [0.3242630385487528,
  0.13151927437641722,
  0.05215419501133787,
  0.49206349206349204]}

In [37]:
# Class distribution of the validation split (per-diagnosis row count,
# overall total and relative frequency) as a dict of parallel lists.
val_dist = (
    val_list
    .groupby('diagnosis')
    .count()
    .reset_index()
    .assign(total=lambda counts: counts['file'].sum())
    .assign(prob=lambda counts: counts['file'] / counts['total'])
    .to_dict(orient='list')
)

In [38]:
# Class distribution of the test split (per-diagnosis row count,
# overall total and relative frequency) as a dict of parallel lists.
test_dist = (
    test_list
    .groupby('diagnosis')
    .count()
    .reset_index()
    .assign(total=lambda counts: counts['file'].sum())
    .assign(prob=lambda counts: counts['file'] / counts['total'])
    .to_dict(orient='list')
)

In [39]:
# Baseline negative log-likelihood of two constant predictors on every split:
#   * uniform: every class is predicted with probability 1 / n_classes
#   * prior:   every class is predicted with its relative frequency in the
#              TRAINING split (train_dist['prob'])
#
# Classes are matched by diagnosis name instead of by list position, so a
# split that lacks a class or lists its classes in a different order is still
# scored against the correct training prior.
class_names = list(train_dist['diagnosis'])
n_classes = len(class_names)

for name, s in [('train', train_dist), ('val', val_dist), ('test', test_dist)]:
    counts_by_class = dict(zip(s['diagnosis'], s['file']))

    # train_dist['prob'] has no entry for a class that never occurs in the
    # training split, so such a class cannot be scored against the prior.
    unseen = sorted(set(counts_by_class) - set(class_names))
    if unseen:
        raise ValueError(
            f"{name} split contains classes absent from the training split: {unseen}"
        )

    true_labels = []
    uniform_prob = []
    prior_prob = []
    # for each class
    for i, class_name in enumerate(class_names):
        # one-hot target for class i
        y = [0.0] * n_classes
        y[i] = 1.0
        # one row per sample of this class in the current split
        for _ in range(counts_by_class.get(class_name, 0)):
            true_labels.append(y)
            uniform_prob.append([1.0 / n_classes] * n_classes)
            prior_prob.append(train_dist['prob'])

    true_labels = tensor(true_labels)
    uniform_prob = tensor(uniform_prob)
    prior_prob = tensor(prior_prob)

    # NLLLoss expects log-probabilities as input and class indices as target.
    log_prior_prob = prior_prob.log()
    log_uniform_prob = uniform_prob.log()
    targets = argmax(true_labels, dim=-1)

    loss_func = NLLLoss()
    print(f"{name}, For uniform probabilities: {loss_func(log_uniform_prob, targets)}")
    print(f"{name}, For prior probabilities: {loss_func(log_prior_prob, targets)}")

train, For uniform probabilities: 1.3862946033477783
train, For prior probabilities: 1.1349709033966064
val, For uniform probabilities: 1.3862944841384888
val, For prior probabilities: 1.1967862844467163
test, For uniform probabilities: 1.3862944841384888
test, For prior probabilities: 1.200453281402588


# Binary

In [40]:
# Binary class distribution of the training split: the three diagnoses
# 'ADHD', 'ASD' and 'ASD-ADHD' are relabelled 'Non-TD'; all other labels are
# left as they are. train_list itself is not modified.
train_dist = (
    train_list
    .assign(
        diagnosis=lambda listing: listing['diagnosis'].replace(
            dict.fromkeys(('ADHD', 'ASD', 'ASD-ADHD'), 'Non-TD')
        )
    )
    .groupby('diagnosis')
    .count()
    .reset_index()
    .assign(total=lambda counts: counts['file'].sum())
    .assign(prob=lambda counts: counts['file'] / counts['total'])
    .to_dict(orient='list')
)
train_dist

{'diagnosis': ['Non-TD', 'TD'],
 'file': [224, 217],
 'total': [441, 441],
 'prob': [0.5079365079365079, 0.49206349206349204]}

In [41]:
# Binary class distribution of the validation split: 'ADHD', 'ASD' and
# 'ASD-ADHD' are relabelled 'Non-TD'; val_list itself is not modified.
val_dist = (
    val_list
    .assign(
        diagnosis=lambda listing: listing['diagnosis'].replace(
            dict.fromkeys(('ADHD', 'ASD', 'ASD-ADHD'), 'Non-TD')
        )
    )
    .groupby('diagnosis')
    .count()
    .reset_index()
    .assign(total=lambda counts: counts['file'].sum())
    .assign(prob=lambda counts: counts['file'] / counts['total'])
    .to_dict(orient='list')
)

In [42]:
# Binary class distribution of the test split: 'ADHD', 'ASD' and
# 'ASD-ADHD' are relabelled 'Non-TD'; test_list itself is not modified.
test_dist = (
    test_list
    .assign(
        diagnosis=lambda listing: listing['diagnosis'].replace(
            dict.fromkeys(('ADHD', 'ASD', 'ASD-ADHD'), 'Non-TD')
        )
    )
    .groupby('diagnosis')
    .count()
    .reset_index()
    .assign(total=lambda counts: counts['file'].sum())
    .assign(prob=lambda counts: counts['file'] / counts['total'])
    .to_dict(orient='list')
)

In [43]:
# Baseline binary cross-entropy of two constant predictors on every split:
#   * uniform: every class is predicted with probability 1 / n_classes
#   * prior:   every class is predicted with its relative frequency in the
#              TRAINING split (train_dist['prob'])
#
# Classes are matched by diagnosis name instead of by list position, so a
# split that lacks a class or lists its classes in a different order is still
# scored against the correct training prior.
class_names = list(train_dist['diagnosis'])
n_classes = len(class_names)

for name, s in [('train', train_dist), ('val', val_dist), ('test', test_dist)]:
    counts_by_class = dict(zip(s['diagnosis'], s['file']))

    # train_dist['prob'] has no entry for a class that never occurs in the
    # training split, so such a class cannot be scored against the prior.
    unseen = sorted(set(counts_by_class) - set(class_names))
    if unseen:
        raise ValueError(
            f"{name} split contains classes absent from the training split: {unseen}"
        )

    true_labels = []
    uniform_prob = []
    prior_prob = []
    for i, class_name in enumerate(class_names):
        # one-hot target for class i
        y = [0.0] * n_classes
        y[i] = 1.0
        # one row per sample of this class in the current split
        for _ in range(counts_by_class.get(class_name, 0)):
            true_labels.append(y)
            uniform_prob.append([1.0 / n_classes] * n_classes)
            prior_prob.append(train_dist['prob'])

    true_labels = tensor(true_labels)
    uniform_prob = tensor(uniform_prob)
    prior_prob = tensor(prior_prob)

    # BCELoss takes probabilities (not logits) and float targets of the same
    # shape as the input; here both are (n_samples, n_classes).
    loss_func = BCELoss()
    print(f"{name}, For uniform probabilities: {loss_func(uniform_prob, true_labels)}")
    print(f"{name}, For prior probabilities: {loss_func(prior_prob, true_labels)}")

train, For uniform probabilities: 0.6931471824645996
train, For prior probabilities: 0.6930211186408997
val, For uniform probabilities: 0.6931472420692444
val, For prior probabilities: 0.6899414658546448
test, For uniform probabilities: 0.6931472420692444
test, For prior probabilities: 0.6905930042266846


## Participant split

In [7]:
# Training split, per-participant view.
# 'participant' is built from the file name (3rd '/'-separated path component):
# its 1st and 3rd '_'-separated tokens joined with '-'.
# 'diagnosis' is the 5th '_'-separated token of the full path string.
train_list = (
    pd.read_csv('data.nosync/networks_multi/train_set_files.csv')
    .assign(participant=lambda listing: listing['file'].apply(lambda path: path.split('/')[2]))
    .assign(
        participant=lambda listing: listing['participant'].apply(
            lambda stem: stem.split('_')[0] + '-' + stem.split('_')[2]
        ),
        diagnosis=lambda listing: listing['file'].apply(lambda path: path.split('_')[4]),
    )
)

# Rows (scans) per diagnosis and their share of all rows, in percent.
train_list_scans = (
    train_list[['participant', 'diagnosis']]
    .groupby('diagnosis')
    .count()
    .reset_index()
    .assign(**{
        'percent (scan)': lambda counts: counts['participant'] / counts['participant'].sum() * 100,
    })
)

# Unique participants per diagnosis, with the scan-level figures attached
# (aligned on the row index shared by both grouped frames).
# NOTE: the column label 'percent (perticipant)' is misspelled in the
# original notebook and kept unchanged here.
train_list_participants = (
    train_list
    .drop_duplicates('participant')[['participant', 'diagnosis']]
    .groupby('diagnosis')
    .count()
    .reset_index()
    .assign(**{
        'percent (perticipant)': lambda counts: counts['participant'] / counts['participant'].sum() * 100,
        'percent (scan)': train_list_scans['percent (scan)'],
        'scans': train_list_scans['participant'],
    })
)
train_list_participants[['diagnosis', 'participant', 'scans', 'percent (perticipant)', 'percent (scan)']]

Unnamed: 0,diagnosis,participant,scans,percent (perticipant),percent (scan)
0,ADHD,87,143,27.358491,32.426304
1,ASD,58,58,18.238994,13.151927
2,ASD-ADHD,23,23,7.232704,5.21542
3,TD,150,217,47.169811,49.206349


In [8]:
# Validation split, per-participant view.
# 'participant' is built from the file name (3rd '/'-separated path component):
# its 1st and 3rd '_'-separated tokens joined with '-'.
# 'diagnosis' is the 5th '_'-separated token of the full path string.
val_list = (
    pd.read_csv('data.nosync/networks_multi/val_set_files.csv')
    .assign(participant=lambda listing: listing['file'].apply(lambda path: path.split('/')[2]))
    .assign(
        participant=lambda listing: listing['participant'].apply(
            lambda stem: stem.split('_')[0] + '-' + stem.split('_')[2]
        ),
        diagnosis=lambda listing: listing['file'].apply(lambda path: path.split('_')[4]),
    )
)

# Rows (scans) per diagnosis and their share of all rows, in percent.
val_list_scans = (
    val_list[['participant', 'diagnosis']]
    .groupby('diagnosis')
    .count()
    .reset_index()
    .assign(**{
        'percent (scan)': lambda counts: counts['participant'] / counts['participant'].sum() * 100,
    })
)

# Unique participants per diagnosis, with the scan-level figures attached
# (aligned on the row index shared by both grouped frames).
# NOTE: the column label 'percent (perticipant)' is misspelled in the
# original notebook and kept unchanged here.
val_list_participants = (
    val_list
    .drop_duplicates('participant')[['participant', 'diagnosis']]
    .groupby('diagnosis')
    .count()
    .reset_index()
    .assign(**{
        'percent (perticipant)': lambda counts: counts['participant'] / counts['participant'].sum() * 100,
        'percent (scan)': val_list_scans['percent (scan)'],
        'scans': val_list_scans['participant'],
    })
)
val_list_participants[['diagnosis', 'participant', 'scans', 'percent (perticipant)', 'percent (scan)']]

Unnamed: 0,diagnosis,participant,scans,percent (perticipant),percent (scan)
0,ADHD,20,33,29.411765,40.740741
1,ASD,11,11,16.176471,13.580247
2,ASD-ADHD,5,5,7.352941,6.17284
3,TD,32,32,47.058824,39.506173


In [9]:
# Test split, per-participant view.
# 'participant' is built from the file name (3rd '/'-separated path component):
# its 1st and 3rd '_'-separated tokens joined with '-'.
# 'diagnosis' is the 5th '_'-separated token of the full path string.
test_list = (
    pd.read_csv('data.nosync/networks_multi/test_set_files.csv')
    .assign(participant=lambda listing: listing['file'].apply(lambda path: path.split('/')[2]))
    .assign(
        participant=lambda listing: listing['participant'].apply(
            lambda stem: stem.split('_')[0] + '-' + stem.split('_')[2]
        ),
        diagnosis=lambda listing: listing['file'].apply(lambda path: path.split('_')[4]),
    )
)

# Rows (scans) per diagnosis and their share of all rows, in percent.
test_list_scans = (
    test_list[['participant', 'diagnosis']]
    .groupby('diagnosis')
    .count()
    .reset_index()
    .assign(**{
        'percent (scan)': lambda counts: counts['participant'] / counts['participant'].sum() * 100,
    })
)

# Unique participants per diagnosis, with the scan-level figures attached
# (aligned on the row index shared by both grouped frames).
# NOTE: the column label 'percent (perticipant)' is misspelled in the
# original notebook and kept unchanged here.
test_list_participants = (
    test_list
    .drop_duplicates('participant')[['participant', 'diagnosis']]
    .groupby('diagnosis')
    .count()
    .reset_index()
    .assign(**{
        'percent (perticipant)': lambda counts: counts['participant'] / counts['participant'].sum() * 100,
        'percent (scan)': test_list_scans['percent (scan)'],
        'scans': test_list_scans['participant'],
    })
)
test_list_participants[['diagnosis', 'participant', 'scans', 'percent (perticipant)', 'percent (scan)']]

Unnamed: 0,diagnosis,participant,scans,percent (perticipant),percent (scan)
0,ADHD,20,29,29.411765,37.662338
1,ASD,11,11,16.176471,14.285714
2,ASD-ADHD,5,5,7.352941,6.493506
3,TD,32,32,47.058824,41.558442
