Read connectomes

In [1]:
from dataclasses import dataclass
from enum import Enum
import os
import pandas as pd
import numpy as np

CONNECTOMES_ROOT = "connectomes-csv"

# Utility functions and classes:
class ConnectomeKind(Enum):
    FA = 0
    LN = 1
    NWS = 2
    WS = 3
    
@dataclass(frozen=True)
class Measurement:
    subject_ID: str
    kind: ConnectomeKind
    
    @classmethod
    def from_filename(cls, filename: str):
        subject_ID = filename.split("-")[1]
        kind_str = filename.split("-")[4].split(".")[0]
        kind_from_str = {
            "fa": ConnectomeKind.FA,
            "ln": ConnectomeKind.LN,
            "nws": ConnectomeKind.NWS,
            "ws": ConnectomeKind.WS
        }
        return Measurement(subject_ID, kind_from_str[kind_str])
    
def read_connectomes() -> dict[str, np.array]: # subject_ID -> (87, 87, 4)
    connectomes = {}
    
    for filename in os.listdir(CONNECTOMES_ROOT):
        m = Measurement.from_filename(filename)
        
        if m.subject_ID not in connectomes.keys():
            connectomes[m.subject_ID] = np.zeros(shape=(87, 87, 4), dtype=float)
            
        path = os.path.join(CONNECTOMES_ROOT, filename)
        connectomes[m.subject_ID][:, :, m.kind.value] = pd.read_csv(path, header=None).to_numpy()
        
    return connectomes
     
connectomes = read_connectomes()

Read labels

In [2]:
class Sex(Enum):
    MALE = 0
    FEMALE = 1
    
@dataclass(frozen=True)
class Label:
    birth_age: float
    sex: Sex
    birth_weight: float
    
    @classmethod
    def from_row(cls, row: pd.Series):
        age = row["birth_age"]
        weight = row["birth_weight"]
        sex_dict = {
            "male": Sex.MALE,
            "female": Sex.FEMALE
        }
        sex = sex_dict[row["sex"]]
        return Label(age, sex, weight)
    

def read_labels(connectomes: dict[str, np.array]) -> dict[str, Label]:
    participants = pd.read_csv("participants.tsv", sep="\t")
    labels = {}
    
    for _, row in participants.iterrows():
        subject_ID = row["participant_id"]
        if subject_ID in connectomes.keys():
            labels[subject_ID] = Label.from_row(row)
    
    return labels
            
labels = read_labels(connectomes)

Convert to arrays and split

In [3]:
from sklearn.model_selection import train_test_split

# Transform structured data into arrays
N = len(connectomes)
flattened_features = 87*87*4

X = np.zeros(shape=(N, flattened_features), dtype=float)
y_age = np.zeros(shape=(N,), dtype=float)
y_sex = np.zeros(shape=(N,), dtype=int)

for i, k in enumerate(connectomes.keys()):
    X[i] = connectomes[k].reshape(-1)
    y_age[i] = labels[k].birth_age
    y_sex[i] = labels[k].sex.value
    
X_train, X_test, y_age_train, y_age_test, y_sex_train, y_sex_test = train_test_split(
    X, y_age, y_sex, test_size=0.2, random_state=42
)

print(f"No. of train samples: {X_train.shape[0]}")
print(f"No. of test samples: {X_test.shape[0]}")

No. of train samples: 488
No. of test samples: 122


LR baseline for age regression

In [4]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso, Ridge
from sklearn.metrics import  mean_squared_error, r2_score, mean_absolute_percentage_error, mean_absolute_error

for Model, name in ((Lasso, "Lasso"), (Ridge, "Ridge")):
    for alpha in [0.1, 1, 10, 100, 1_000, 10_000]:
        model = Model(alpha=alpha, random_state=42)
        model = model.fit(X_train, y_age_train)
        y_age_pred = model.predict(X_test)
    
        mse = mean_squared_error(y_age_test, y_age_pred)
        mape = mean_absolute_percentage_error(y_age_test, y_age_pred)
        r2 = r2_score(y_age_test, y_age_pred)

        print(f"{name}, {alpha:>5} MSE: {mse:.4f}\tMAPE: {mape:.4f}\tR2: {r2:.4f}")

  model = cd_fast.enet_coordinate_descent(


Lasso,   0.1 MSE: 18.2917	MAPE: 0.0899	R2: -0.0401


  model = cd_fast.enet_coordinate_descent(


Lasso,     1 MSE: 20.2900	MAPE: 0.0917	R2: -0.1538
Lasso,    10 MSE: 9.5193	MAPE: 0.0644	R2: 0.4587
Lasso,   100 MSE: 9.4715	MAPE: 0.0602	R2: 0.4614
Lasso,  1000 MSE: 11.9234	MAPE: 0.0711	R2: 0.3220
Lasso, 10000 MSE: 16.1168	MAPE: 0.0955	R2: 0.0835
Ridge,   0.1 MSE: 19.2410	MAPE: 0.0886	R2: -0.0941
Ridge,     1 MSE: 19.2410	MAPE: 0.0886	R2: -0.0941
Ridge,    10 MSE: 19.2409	MAPE: 0.0886	R2: -0.0941
Ridge,   100 MSE: 19.2405	MAPE: 0.0886	R2: -0.0941
Ridge,  1000 MSE: 19.2366	MAPE: 0.0885	R2: -0.0939
Ridge, 10000 MSE: 19.1972	MAPE: 0.0885	R2: -0.0916


In [5]:
from sklearn.ensemble import RandomForestRegressor
for n_estimators in [1, 2, 3, 5, 10, 20, 40, 80]:
    rf_regressor = RandomForestRegressor(n_estimators=n_estimators, random_state=42)
    rf_regressor = rf_regressor.fit(X_train, y_age_train)
    y_age_pred = rf_regressor.predict(X_test)
    
    mse = mean_squared_error(y_age_test, y_age_pred)
    absolute_errors = np.abs(y_age_test - y_age_pred)
    mae = np.mean(absolute_errors)
    stdae = np.std(absolute_errors)
    percentage_errors = absolute_errors / y_age_test
    mape = np.mean(percentage_errors)
    stdape = np.std(percentage_errors)
    r2 = r2_score(y_age_test, y_age_pred)
    
    print(f"\nMSE:{mse:>7.4f} MAE:{mae:>7.4f}+-{stdae:>6.4f} MAPE:{mape:>7.4f}+-{stdape:>6.4f} R2:{r2:>6.4f}")


MSE:14.7516 MAE: 2.5726+-2.8519 MAPE: 0.0757+-0.0977 R2:0.1612

MSE:10.5442 MAE: 2.1850+-2.4021 MAPE: 0.0646+-0.0836 R2:0.4004

MSE: 8.6903 MAE: 1.9102+-2.2453 MAPE: 0.0567+-0.0807 R2:0.5058

MSE: 9.1800 MAE: 1.9084+-2.3533 MAPE: 0.0572+-0.0862 R2:0.4780

MSE: 8.2668 MAE: 1.7905+-2.2496 MAPE: 0.0538+-0.0834 R2:0.5299

MSE: 7.5152 MAE: 1.7768+-2.0877 MAPE: 0.0531+-0.0785 R2:0.5727

MSE: 7.6792 MAE: 1.7690+-2.1331 MAPE: 0.0532+-0.0807 R2:0.5633

MSE: 7.7819 MAE: 1.7562+-2.1674 MAPE: 0.0531+-0.0820 R2:0.5575


In [6]:
np.var(y_age)

19.22003087964745

In [7]:
from sklearn.naive_bayes import GaussianNB

classifier = GaussianNB()
classifier = classifier.fit(X_train, y_sex_train)
Acc = classifier.score(X_test, y_sex_test)
print(f"Sex classification: Mean Acc. = {Acc}")

Sex classification: Mean Acc. = 0.45901639344262296


In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

y_sex_train = y_sex_train.astype(int)
y_sex_test = y_sex_test.astype(int)

for n_estimators in [1, 2, 3, 5, 10, 20, 40, 80]:
    rf_classifier = RandomForestClassifier(n_estimators=n_estimators, random_state=42)
    rf_classifier = rf_classifier.fit(X_train, y_sex_train)
    y_sex_pred = rf_classifier.predict(X_test)
    
    accuracy = accuracy_score(y_sex_test, y_sex_pred)
    report = classification_report(y_sex_test, y_sex_pred)
    conf_matrix = confusion_matrix(y_sex_test, y_sex_pred)
    
    print(f"Accuracy: {accuracy:.2f}")
    print("Classification Report:\n", report)
    print("Confusion Matrix:\n", conf_matrix)
    
    print(f"\nAcc:{accuracy:>6.4f}\nAccuracy report: {report}\nConfusion matrix: {conf_matrix}")

Accuracy: 0.57
Classification Report:
               precision    recall  f1-score   support

           0       0.61      0.63      0.62        68
           1       0.51      0.48      0.50        54

    accuracy                           0.57       122
   macro avg       0.56      0.56      0.56       122
weighted avg       0.56      0.57      0.56       122

Confusion Matrix:
 [[43 25]
 [28 26]]

Acc:0.5656
Accuracy report:               precision    recall  f1-score   support

           0       0.61      0.63      0.62        68
           1       0.51      0.48      0.50        54

    accuracy                           0.57       122
   macro avg       0.56      0.56      0.56       122
weighted avg       0.56      0.57      0.56       122

Confusion matrix: [[43 25]
 [28 26]]
Accuracy: 0.59
Classification Report:
               precision    recall  f1-score   support

           0       0.59      0.84      0.70        68
           1       0.58      0.28      0.38        54

