This notebook is a first-pass analysis of control vs depressed baseline functional connectivity using supervised classification to determine if there is a difference.

In [127]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, balanced_accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier

In [128]:
# Get labels
labels = pd.read_csv("/cnl/abcd/data/labels/baseline_depr.csv", header=None, index_col=0)

In [157]:
# Get data
data = pd.read_csv("../data/processed/ddc-cen.csv", index_col=0)
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,label
NDARINV08FUB58A,-0.023839,0.103451,-0.009615,0.000399,-0.010581,-0.014656,-0.246580,-0.145001,0.002667,-0.002508,...,0.062646,-0.011388,0.016601,0.017278,-0.004887,-0.083591,-0.074559,0.017756,-0.033086,0
NDARINV08FUB58A,0.045872,0.179502,0.191033,0.186047,0.056114,0.041019,-0.113858,0.058701,0.053120,-0.158251,...,0.155051,0.115531,0.181448,0.051199,0.032887,-0.046807,0.026803,0.042084,-0.116059,0
NDARINV08FUB58A,0.168559,-0.073980,-0.073049,0.082772,0.036330,-0.117608,0.270814,0.244847,-0.052392,0.070483,...,-0.047398,-0.192300,0.140593,0.077823,-0.176631,0.283544,0.228315,-0.012474,0.076119,0
NDARINV08FUB58A,-0.107575,-0.068836,-0.111463,-0.021093,-0.097544,0.144803,0.155760,-0.009650,0.089469,0.017582,...,-0.053035,-0.063657,-0.013359,-0.063339,0.112036,0.113514,-0.014991,0.071271,0.002102,0
NDARINV04GAB2AA,-0.015523,0.010193,-0.035077,0.033344,0.028600,-0.006935,0.036020,-0.009731,0.079665,-0.001582,...,0.015272,-0.012316,0.039204,0.033496,-0.003704,0.049696,-0.028097,0.059826,-0.001827,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NDARINVW2RNZLMD,0.002213,0.007045,0.075156,0.042193,0.046312,-0.029403,0.053861,-0.029366,-0.023747,0.006481,...,0.011007,0.026316,0.025602,0.032073,-0.011304,0.030364,-0.033201,0.015104,-0.001438,1
NDARINVW2RNZLMD,0.048890,-0.325826,-0.302150,0.095398,0.027032,-0.159989,-0.047972,0.067536,0.052095,-0.067952,...,-0.124811,-0.109064,0.065522,0.007651,-0.030336,0.019395,0.000825,0.026628,-0.016941,1
NDARINVYCRTDT3X,0.335081,0.243078,0.420549,0.079057,-0.047334,-0.112552,0.857535,0.115325,-0.382203,-0.196483,...,0.244532,0.326265,0.086937,-0.037719,-0.112236,0.613490,0.090680,-0.286774,-0.166868,1
NDARINVYCRTDT3X,0.019379,-0.004614,0.018799,0.005264,0.011753,0.025143,0.031757,-0.055849,0.024012,-0.034599,...,0.005314,0.014760,-0.001164,0.007024,0.036444,0.033476,-0.067694,0.025954,-0.048318,1


In [158]:
# Split subjects into train/test sets
train_subj, test_subj, _, _ = train_test_split(labels.index, labels.values, stratify=labels.values, test_size=0.2, random_state=42)

train_indices = np.nonzero(data.index.isin(train_subj))
test_indices = np.nonzero(data.index.isin(test_subj))

train = data.iloc[train_indices]
test = data.iloc[test_indices]

X_train, y_train = train.iloc[:, :-1], train.iloc[:, -1]
X_test, y_test = test.iloc[:, :-1], test.iloc[:, -1]

# Notes
- poly: guessing all 0's. bad even with increased degree.
- linear/rbf/sigmoid: not guessing all 0's but bad

In [159]:
clf = SVC(kernel='linear', class_weight='balanced', random_state=42)
#clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0, class_weight='balanced')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [160]:
confusion_matrix(y_test, y_pred)

array([[567, 382],
       [139,  70]])

In [161]:
roc_auc_score(y_test, y_pred)

0.4661996258968141

In [162]:
balanced_accuracy_score(y_test, y_pred)

0.4661996258968141

In [163]:
f1_score(y_test, y_pred)

0.2118003025718608