In [1]:
from torch import tensor, argmax, randn
from torch.nn import CrossEntropyLoss, BCELoss, NLLLoss, LogSoftmax
import pandas as pd

In [37]:
def _diagnosis_from_path(path):
    """Return the fifth '_'-separated field of ``path``.

    The split runs over the whole path, not just the file name, so the
    underscore in the ``networks_multi`` directory name counts as a separator.
    """
    return path.split('_')[4]


# One row per network file of the training set; the label is parsed from the path.
train_list = pd.read_csv('data.nosync/networks_multi/train_set_files.csv')
train_list = train_list.assign(diagnosis=train_list['file'].map(_diagnosis_from_path))
train_list

Unnamed: 0,file,diagnosis
0,data.nosync/networks_multi/1320247_run-1_ADHD2...,TD
1,data.nosync/networks_multi/8415034_run-2_ADHD2...,TD
2,data.nosync/networks_multi/3011311_run-2_ADHD2...,TD
3,data.nosync/networks_multi/0010087_run-2_ADHD2...,ADHD
4,data.nosync/networks_multi/0010030_run-2_ADHD2...,ADHD
...,...,...
436,data.nosync/networks_multi/0010115_run-1_ADHD2...,ADHD
437,data.nosync/networks_multi/0010086_run-2_ADHD2...,ADHD
438,data.nosync/networks_multi/1127915_run-1_ADHD2...,TD
439,data.nosync/networks_multi/2136051_run-1_ADHD2...,TD


In [39]:
# Empirical class distribution of the training files: count per diagnosis,
# overall total, and the resulting prior probability of each class.
class_counts = train_list.groupby('diagnosis').count().reset_index()
class_counts = class_counts.assign(total=class_counts['file'].sum())
class_counts = class_counts.assign(prob=class_counts['file'] / class_counts['total'])

# Column-oriented dict ({column: [values...]}) so later cells can index by class position.
train_dist = class_counts.to_dict(orient='list')
train_dist

{'diagnosis': ['ADHD', 'ASD', 'ASD-ADHD', 'TD'],
 'file': [143, 58, 23, 217],
 'total': [441, 441, 441, 441],
 'prob': [0.3242630385487528,
  0.13151927437641722,
  0.05215419501133787,
  0.49206349206349204]}

In [40]:
# Baseline NLL losses for the multi-class task: the loss of a "model" that
# always predicts (a) the uniform distribution and (b) the training-set class
# priors. A trained classifier should score below both.
#
# The number of classes and the uniform probability are taken from
# `train_dist` rather than hard-coded, so the cell stays correct if the set of
# diagnosis groups changes.
class_counts = train_dist['file']
class_priors = train_dist['prob']
n_classes = len(class_counts)

true_labels = []
uniform_prob = []
prior_prob = []
for class_idx, n_samples in enumerate(class_counts):
    # One-hot target row for this class.
    one_hot = [float(k == class_idx) for k in range(n_classes)]
    # One (target, prediction) pair per training file of this class.
    for _ in range(n_samples):
        true_labels.append(one_hot)
        uniform_prob.append([1.0 / n_classes] * n_classes)
        prior_prob.append(class_priors)

true_labels = tensor(true_labels)
uniform_prob = tensor(uniform_prob)
prior_prob = tensor(prior_prob)

# NLLLoss takes log-probabilities as input and class indices as target.
log_prior_prob = prior_prob.log()
log_uniform_prob = uniform_prob.log()
target_idx = argmax(true_labels, dim=-1)

loss_func = NLLLoss()
print(f"For uniform probabilities: {loss_func(log_uniform_prob, target_idx)}")
print(f"For prior probabilities: {loss_func(log_prior_prob, target_idx)}")

For uniform probabilities: 1.3862946033477783
For prior probabilities: 1.1349709033966064


# Binary baseline (TD vs. Non-TD)

In [41]:
# Collapse the three clinical groups into a single 'Non-TD' class, then
# recompute counts, total and prior probability for the two-class problem.
NON_TD_GROUPS = ['ADHD', 'ASD', 'ASD-ADHD']
binary_list = train_list.assign(
    diagnosis=train_list['diagnosis'].replace(dict.fromkeys(NON_TD_GROUPS, 'Non-TD'))
)

binary_counts = binary_list.groupby('diagnosis').count().reset_index()
binary_counts['total'] = binary_counts['file'].sum()
binary_counts['prob'] = binary_counts['file'] / binary_counts['total']

# Column-oriented dict ({column: [values...]}), same layout as the multi-class version.
train_dist = binary_counts.to_dict(orient='list')
train_dist

{'diagnosis': ['Non-TD', 'TD'],
 'file': [224, 217],
 'total': [441, 441],
 'prob': [0.5079365079365079, 0.49206349206349204]}

In [42]:
# Baseline BCE losses for the two-class task: constant uniform prediction
# versus constant class-prior prediction, evaluated on one-hot targets built
# from the per-class file counts in `train_dist`.
true_labels, uniform_prob, prior_prob = [], [], []
for class_idx in range(2):
    one_hot = [0.0, 0.0]
    one_hot[class_idx] = 1.0
    n_files = train_dist['file'][class_idx]
    # One (target, prediction) pair per training file of this class.
    true_labels.extend(one_hot for _ in range(n_files))
    uniform_prob.extend([0.5, 0.5] for _ in range(n_files))
    prior_prob.extend(train_dist['prob'] for _ in range(n_files))

true_labels = tensor(true_labels)
uniform_prob = tensor(uniform_prob)
prior_prob = tensor(prior_prob)

# BCELoss takes probabilities (not logs) and averages over both columns.
loss_func = BCELoss()
print(f"For uniform probabilities: {loss_func(uniform_prob, true_labels)}")
print(f"For prior probabilities: {loss_func(prior_prob, true_labels)}")

For uniform probabilities: 0.6931471824645996
For prior probabilities: 0.6930211186408997


## Participant split by diagnosis (train / validation / test)

In [43]:
def _participant_key(path):
    """Build a participant key from the file name in ``path``.

    The file name is the third '/'-separated component of the path; the key
    joins its first and third '_'-separated fields with '-'.
    """
    fields = path.split('/')[2].split('_')
    return fields[0] + '-' + fields[2]


# Per-diagnosis participant counts for the training set. Paths in the file
# list contain run-1 / run-2 variants, so rows are de-duplicated on the
# participant key to count each participant once.
train_files = pd.read_csv('data.nosync/networks_multi/train_set_files.csv')
train_files = train_files.assign(
    participant=train_files['file'].map(_participant_key),
    diagnosis=train_files['file'].map(lambda path: path.split('_')[4]),
)
train_participants = train_files.drop_duplicates('participant')

train_list = (
    train_participants[['participant', 'diagnosis']]
    .groupby('diagnosis')
    .count()
    .reset_index()
)
# Share of participants per diagnosis, in percent.
train_list['percent'] = train_list['participant'] / train_list['participant'].sum() * 100
train_list

Unnamed: 0,diagnosis,participant,percent
0,ADHD,87,27.358491
1,ASD,58,18.238994
2,ASD-ADHD,23,7.232704
3,TD,150,47.169811


In [44]:
# Per-diagnosis participant counts for the validation set.
val_files = pd.read_csv('data.nosync/networks_multi/val_set_files.csv')

# File name = third '/'-separated component of the path; the participant key
# joins its first and third '_'-separated fields with '-'.
val_basenames = val_files['file'].map(lambda path: path.split('/')[2])
val_files['participant'] = val_basenames.map(
    lambda name: '-'.join([name.split('_')[0], name.split('_')[2]])
)
# Diagnosis = fifth '_'-separated field of the full path.
val_files['diagnosis'] = val_files['file'].map(lambda path: path.split('_')[4])

# Keep one row per participant key before counting.
val_participants = val_files.drop_duplicates('participant')
val_list = val_participants[['participant', 'diagnosis']].groupby('diagnosis').count().reset_index()

# Share of participants per diagnosis, in percent.
val_list['percent'] = val_list['participant'] / val_list['participant'].sum() * 100
val_list

Unnamed: 0,diagnosis,participant,percent
0,ADHD,20,29.411765
1,ASD,11,16.176471
2,ASD-ADHD,5,7.352941
3,TD,32,47.058824


In [45]:
# Per-diagnosis participant counts for the test set, written as one chain:
# derive participant key and diagnosis from the path, keep one row per
# participant key, then count participants per diagnosis.
test_list = (
    pd.read_csv('data.nosync/networks_multi/test_set_files.csv')
    .assign(
        # File name is the third '/'-separated path component; the key is
        # "<first field>-<third field>" of that name.
        participant=lambda files: files['file'].map(
            lambda path: path.split('/')[2].split('_')[0] + '-' + path.split('/')[2].split('_')[2]
        ),
        # Diagnosis is the fifth '_'-separated field of the full path.
        diagnosis=lambda files: files['file'].map(lambda path: path.split('_')[4]),
    )
    .drop_duplicates('participant')
    .loc[:, ['participant', 'diagnosis']]
    .groupby('diagnosis')
    .count()
    .reset_index()
)
# Share of participants per diagnosis, in percent.
test_list = test_list.assign(
    percent=lambda counts: counts['participant'] / counts['participant'].sum() * 100
)
test_list

Unnamed: 0,diagnosis,participant,percent
0,ADHD,20,29.411765
1,ASD,11,16.176471
2,ASD-ADHD,5,7.352941
3,TD,32,47.058824
