This notebook is a first-pass analysis of control vs depressed baseline functional connectivity using supervised classification to determine if there is a difference.

In [120]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, balanced_accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Get labels
labels = pd.read_csv("/cnl/abcd/data/labels/baseline_depr.csv", header=None, index_col=0)

In [3]:
# Get data
data = pd.read_csv("../data/processed/ddc-dmn.csv", index_col=0)
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,label
NDARINV08FUB58A,0.088259,0.009808,0.007736,0.003860,-0.004891,-0.275622,-0.054998,-0.015698,-0.018366,-0.000137,...,-0.016821,-0.028054,0.034787,-0.004491,-0.004873,0.032629,0.013581,0.003728,-0.011202,0
NDARINV08FUB58A,0.145287,-0.028347,0.030145,-0.175452,-0.050963,-0.118163,0.130920,0.035423,-0.014544,-0.052245,...,-0.027951,0.045783,-0.106946,-0.060583,-0.066972,0.138585,0.046847,-0.018121,-0.071378,0
NDARINV08FUB58A,-0.075931,0.004218,0.185626,0.265850,-0.004416,0.458572,0.083927,-0.239199,-0.009050,0.044333,...,-0.020634,0.073503,0.086219,0.003611,0.076337,0.036322,-0.062929,-0.009320,0.026146,0
NDARINV08FUB58A,-0.036843,-0.017967,-0.048861,0.173209,0.051875,0.098732,0.038865,0.170830,-0.042687,0.010919,...,0.007071,-0.133634,0.248850,0.097448,0.134514,0.021322,0.189131,-0.102037,0.019561,0
NDARINV04GAB2AA,0.020079,-0.016281,0.018131,0.005322,-0.002723,0.055809,0.013384,-0.022191,-0.012850,-0.008150,...,-0.061429,0.075310,0.040656,-0.097219,0.057467,-0.004817,-0.058127,-0.003521,0.023226,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NDARINVW2RNZLMD,0.015606,-0.056917,0.009711,-0.067322,0.023579,0.053430,0.003875,-0.060303,-0.009425,-0.026791,...,-0.119751,0.017228,-0.082968,-0.000850,0.048945,-0.006270,0.069382,0.019487,-0.001199,1
NDARINVW2RNZLMD,-0.347761,0.220354,0.061091,-0.385694,-0.053784,-0.091968,-0.092911,-0.234036,-0.031959,-0.015978,...,0.133702,0.040504,-0.200807,-0.048550,-0.033583,-0.077967,-0.117345,-0.032688,-0.006473,1
NDARINVYCRTDT3X,0.273888,0.240946,0.352112,-0.544630,0.106004,1.211838,-0.306533,-0.176396,-0.085251,0.082698,...,0.203905,0.301626,-0.407056,0.109447,0.462676,-0.372731,-0.139495,-0.112265,0.128837,1
NDARINVYCRTDT3X,-0.001444,0.009398,0.018054,0.030912,0.004985,0.006798,0.009832,0.025640,-0.016117,-0.023348,...,-0.026024,0.000037,-0.005400,0.025757,-0.011463,0.006709,0.002625,-0.007692,-0.003698,1


In [4]:
# Split subjects into train/test sets
train_subj, test_subj, _, _ = train_test_split(labels.index, labels.values, stratify=labels.values, test_size=0.2, random_state=42)

train_indices = np.nonzero(data.index.isin(train_subj))
test_indices = np.nonzero(data.index.isin(test_subj))

train = data.iloc[train_indices]
test = data.iloc[test_indices]

X_train, y_train = train.iloc[:, :-1], train.iloc[:, -1]
X_test, y_test = test.iloc[:, :-1], test.iloc[:, -1]

# Notes
- poly: guessing all 0's. bad even with increased degree.
- linear/rbf/sigmoid: not guessing all 0's but bad

In [121]:
#clf = SVC(kernel='linear', class_weight='balanced', random_state=10)
clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0, class_weight='balanced')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [122]:
classification_report(y_test, y_pred, output_dict=True)

{'0': {'precision': 0.8181818181818182,
  'recall': 0.5974710221285564,
  'f1-score': 0.6906211936662606,
  'support': 949},
 '1': {'precision': 0.17849462365591398,
  'recall': 0.39712918660287083,
  'f1-score': 0.2462908011869436,
  'support': 209},
 'accuracy': 0.5613126079447323,
 'macro avg': {'precision': 0.4983382209188661,
  'recall': 0.4973001043657136,
  'f1-score': 0.4684559974266021,
  'support': 1158},
 'weighted avg': {'precision': 0.7027287753010635,
  'recall': 0.5613126079447323,
  'f1-score': 0.6104268482187846,
  'support': 1158}}

In [123]:
confusion_matrix(y_test, y_pred)

array([[567, 382],
       [126,  83]])

In [124]:
roc_auc_score(y_test, y_pred)

0.4973001043657136

In [125]:
balanced_accuracy_score(y_test, y_pred)

0.4973001043657136

In [126]:
f1_score(y_test, y_pred)

0.2462908011869436