Read connectomes

In [None]:
from dataclasses import dataclass
from enum import Enum
import os
import pandas as pd
import numpy as np

CONNECTOMES_ROOT = "connectomes-csv"

# Utility functions and classes:
class ConnectomeKind(Enum):
    """Connectome variants that can be decoded from an input filename.

    Each member's integer value is also used by ``read_connectomes`` as the
    index of that variant along the last axis of a subject's array.
    """

    # Unpacking range(4) assigns FA=0, LN=1, NWS=2, WS=3 in declaration order.
    FA, LN, NWS, WS = range(4)
    
@dataclass(frozen=True)
class Measurement:
    """Identity of one connectome file: which subject it belongs to and which variant it holds.

    Instances are immutable (``frozen=True``).
    """

    subject_ID: str  # second dash-separated field of the filename
    kind: ConnectomeKind  # variant decoded from the fifth dash-separated field

    @classmethod
    def from_filename(cls, filename: str):
        """Build a measurement from a connectome filename.

        The name is split on ``-``. The second field is taken as the subject
        ID; the fifth field, up to its first ``.``, is the variant code and
        must be one of ``fa``, ``ln``, ``nws`` or ``ws``.

        Raises:
            ValueError: if the name has fewer than five dash-separated fields
                or the variant code is not recognised.
        """
        kind_from_str = {
            "fa": ConnectomeKind.FA,
            "ln": ConnectomeKind.LN,
            "nws": ConnectomeKind.NWS,
            "ws": ConnectomeKind.WS,
        }
        fields = filename.split("-")  # split once and reuse for both fields
        try:
            subject_ID = fields[1]
            kind = kind_from_str[fields[4].split(".")[0]]
        except (IndexError, KeyError) as err:
            # Name the offending file instead of surfacing a bare
            # "list index out of range" / KeyError.
            raise ValueError(f"Unrecognised connectome filename: {filename!r}") from err
        # Use cls so the alternate constructor also works for subclasses.
        return cls(subject_ID, kind)
    
def read_connectomes() -> dict[str, np.ndarray]: # subject_ID -> (87, 87, 4)
    """Load every connectome file under ``CONNECTOMES_ROOT`` into per-subject arrays.

    Each file is read as a header-less CSV and written into the channel of the
    subject's ``(87, 87, 4)`` array selected by the variant's enum value.
    Channels for which a subject has no file stay all-zero.

    Returns:
        Mapping from subject ID to its stacked array. Keys are inserted in
        sorted filename order.
    """
    connectomes = {}

    # os.listdir returns entries in arbitrary order. Sorting fixes the subject
    # order, and with it the row order of any array built from this dict and
    # the outcome of a seeded train/test split.
    for filename in sorted(os.listdir(CONNECTOMES_ROOT)):
        path = os.path.join(CONNECTOMES_ROOT, filename)

        # Hidden entries and directories (e.g. ".ipynb_checkpoints",
        # ".DS_Store") are not connectomes and would break filename parsing.
        if filename.startswith(".") or not os.path.isfile(path):
            continue

        m = Measurement.from_filename(filename)

        if m.subject_ID not in connectomes:
            connectomes[m.subject_ID] = np.zeros(shape=(87, 87, 4), dtype=float)

        connectomes[m.subject_ID][:, :, m.kind.value] = pd.read_csv(path, header=None).to_numpy()

    return connectomes
     
# Load all connectomes once; the following cells look subjects up in this dict by ID.
connectomes = read_connectomes()

Read labels

In [None]:
class Sex(Enum):
    """Participant sex; the integer value serves as the classification target."""

    # Unpacking range(2) assigns MALE=0, FEMALE=1.
    MALE, FEMALE = range(2)
    
@dataclass(frozen=True)
class Label:
    """Prediction targets for one participant, taken from one row of the participants table.

    Instances are immutable (``frozen=True``).
    """

    birth_age: float  # from the "birth_age" column
    scan_age: float  # from the "scan_age" column
    sex: Sex  # decoded from the "sex" column ("male" / "female")
    birth_weight: float  # from the "birth_weight" column

    @classmethod
    def from_row(cls, row: pd.Series):
        """Build a label from a participants-table row.

        Reads the ``birth_age``, ``scan_age``, ``birth_weight`` and ``sex``
        entries of *row*.

        Raises:
            KeyError: if a column is missing, or ``sex`` is neither
                ``"male"`` nor ``"female"``.
        """
        sex_dict = {
            "male": Sex.MALE,
            "female": Sex.FEMALE,
        }
        # Keyword arguments keep each column visibly tied to its field;
        # cls (not a hard-coded class name) keeps subclasses working.
        return cls(
            birth_age=row["birth_age"],
            scan_age=row["scan_age"],
            birth_weight=row["birth_weight"],
            sex=sex_dict[row["sex"]],
        )
    

def read_labels(connectomes: dict[str, np.array]) -> dict[str, Label]:
    """Read ``combined.tsv`` and build a label for every participant that has a connectome.

    Rows whose ``participant_id`` is not a key of *connectomes* are ignored.
    If the same ID occurs on several rows, the last such row determines the
    label. Subjects in *connectomes* with no matching row get no entry.
    """
    participants = pd.read_csv("combined.tsv", sep="\t")
    return {
        row["participant_id"]: Label.from_row(row)
        for _, row in participants.iterrows()
        if row["participant_id"] in connectomes
    }
            
# Build the labels for the loaded subjects; the following cells look them up by subject ID.
labels = read_labels(connectomes)

Convert to arrays and split

In [None]:
from sklearn.model_selection import train_test_split

# Pack the per-subject dicts into aligned arrays: row i of X and entry i of
# each target vector belong to the same subject (dict iteration order).
subject_ids = list(connectomes.keys())
N = len(connectomes)
flattened_features = 87*87*4  # one (87, 87, 4) connectome flattened into a single row

X = np.zeros(shape=(N, flattened_features), dtype=float)
for row, matrix in enumerate(connectomes.values()):
    X[row] = matrix.ravel()

# Every subject with a connectome must have a label; a missing one raises KeyError here.
y_age = np.array([labels[s].birth_age for s in subject_ids], dtype=float)
y_scan = np.array([labels[s].scan_age for s in subject_ids], dtype=float)
y_sex = np.array([labels[s].sex.value for s in subject_ids], dtype=int)

# One call splits features and all three targets with the same 70/30 partition.
(
    X_train, X_test,
    y_age_train, y_age_test,
    y_scan_train, y_scan_test,
    y_sex_train, y_sex_test,
) = train_test_split(X, y_age, y_scan, y_sex, test_size=0.3, random_state=42)

print(f"No. of train samples: {X_train.shape[0]}")
print(f"No. of test samples: {X_test.shape[0]}")

Regularized linear regression (Lasso and Ridge) baselines for age regression

In [None]:
# Birth age
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso, Ridge
from sklearn.metrics import  mean_squared_error, r2_score, mean_absolute_percentage_error, mean_absolute_error
from scipy.stats import pearsonr

# Birth-age baseline: fit Lasso and Ridge over a grid of regularisation
# strengths and report test-set MAE, Pearson correlation and R2 for each.
alphas = [0.1, 1, 10, 100, 1_000, 10_000]

for name, Model in (("Lasso", Lasso), ("Ridge", Ridge)):
    for alpha in alphas:
        model = Model(alpha=alpha, random_state=42).fit(X_train, y_age_train)
        y_age_pred = model.predict(X_test)

        mae = mean_absolute_error(y_age_test, y_age_pred)
        corr, _ = pearsonr(y_age_test, y_age_pred)  # second element (p-value) is discarded
        r2 = r2_score(y_age_test, y_age_pred)

        print(f"{name}, {alpha:>5} MAE: {mae:.4f}\tcorr: {corr:.4f}\tR2: {r2:.4f}")

In [None]:
# Scan age
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso, Ridge
from sklearn.metrics import  mean_squared_error, r2_score, mean_absolute_percentage_error, mean_absolute_error
from scipy.stats import pearsonr

# Scan-age baseline: fit Lasso and Ridge over a grid of regularisation
# strengths and report test-set MAE, Pearson correlation and R2 for each.
alphas = [0.1, 1, 10, 100, 1_000, 10_000]

for name, Model in (("Lasso", Lasso), ("Ridge", Ridge)):
    for alpha in alphas:
        model = Model(alpha=alpha, random_state=42).fit(X_train, y_scan_train)
        y_scan_pred = model.predict(X_test)

        mae = mean_absolute_error(y_scan_test, y_scan_pred)
        corr, _ = pearsonr(y_scan_test, y_scan_pred)  # second element (p-value) is discarded
        r2 = r2_score(y_scan_test, y_scan_pred)

        print(f"{name}, {alpha:>5} MAE: {mae:.4f}\tcorr: {corr:.4f}\tR2: {r2:.4f}")

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Birth-age random-forest baseline: sweep the number of trees and print one
# line of test-set metrics per forest size.
for n_estimators in [1, 2, 3, 5, 10, 20, 40, 80]:
    rf_regressor = RandomForestRegressor(n_estimators=n_estimators, random_state=42).fit(X_train, y_age_train)
    y_age_pred = rf_regressor.predict(X_test)

    # Per-sample absolute and relative errors, then their mean and spread.
    absolute_errors = np.abs(y_age_test - y_age_pred)
    percentage_errors = absolute_errors / y_age_test
    mae, stdae = np.mean(absolute_errors), np.std(absolute_errors)
    mape, stdape = np.mean(percentage_errors), np.std(percentage_errors)  # computed but not printed

    mse = mean_squared_error(y_age_test, y_age_pred)
    r2 = r2_score(y_age_test, y_age_pred)
    corr, _ = pearsonr(y_age_test, y_age_pred)

    print(f"\nMSE:{mse:>7.4f} MAE:{mae:>7.4f}+-{stdae:>6.4f} corr:{corr:>7.4f} R2:{r2:>6.4f}")

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Scan-age random-forest baseline: sweep the number of trees and print one
# line of test-set metrics per forest size.
for n_estimators in [1, 2, 3, 5, 10, 20, 40, 80]:
    rf_regressor = RandomForestRegressor(n_estimators=n_estimators, random_state=42)
    # Fit on the scan-age target. This previously fitted on y_age_train, so the
    # scan-age metrics below were computed for a model trained on birth age.
    rf_regressor = rf_regressor.fit(X_train, y_scan_train)
    y_scan_pred = rf_regressor.predict(X_test)

    mse = mean_squared_error(y_scan_test, y_scan_pred)
    # Per-sample absolute and relative errors, then their mean and spread.
    absolute_errors = np.abs(y_scan_test - y_scan_pred)
    mae = np.mean(absolute_errors)
    stdae = np.std(absolute_errors)
    percentage_errors = absolute_errors / y_scan_test
    mape = np.mean(percentage_errors)  # computed but not printed
    stdape = np.std(percentage_errors)  # computed but not printed
    r2 = r2_score(y_scan_test, y_scan_pred)
    corr, _ = pearsonr(y_scan_test, y_scan_pred)

    print(f"\nMSE:{mse:>7.4f} MAE:{mae:>7.4f}+-{stdae:>6.4f} corr:{corr:>7.4f} R2:{r2:>6.4f}")

In [None]:
# Variance of birth age over all subjects (train and test together).
# NOTE(review): presumably a reference scale for the MSE values printed above — confirm intent.
np.var(y_age)

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes baseline for sex classification, scored on the test split.
classifier = GaussianNB().fit(X_train, y_sex_train)
Acc = classifier.score(X_test, y_sex_test)

print(f"Sex classification: Mean Acc. = {Acc}")

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Random-forest baseline for sex classification, sweeping the number of trees.
# The targets are re-cast to int before fitting.
y_sex_train, y_sex_test = y_sex_train.astype(int), y_sex_test.astype(int)

for n_estimators in [1, 2, 3, 5, 10, 20, 40, 80]:
    rf_classifier = RandomForestClassifier(n_estimators=n_estimators, random_state=42).fit(X_train, y_sex_train)
    y_sex_pred = rf_classifier.predict(X_test)

    # Test-set accuracy, per-class report, and confusion matrix.
    accuracy = accuracy_score(y_sex_test, y_sex_pred)
    report = classification_report(y_sex_test, y_sex_pred)
    conf_matrix = confusion_matrix(y_sex_test, y_sex_pred)

    print(f"Accuracy: {accuracy:.2f}")
    print("Classification Report:\n", report)
    print("Confusion Matrix:\n", conf_matrix)

    # NOTE(review): this repeats the three values printed just above in a single message.
    print(f"\nAcc:{accuracy:>6.4f}\nAccuracy report: {report}\nConfusion matrix: {conf_matrix}")