In [1]:
from dataclasses import dataclass
from enum import Enum
import os
import pandas as pd
import numpy as np

CONNECTOMES_ROOT = "connectomes-csv"

# Utility functions and classes:
class ConnectomeKind(Enum):
    """The connectome variants stored per subject.

    The integer value of each member is used by ``read_connectomes`` as the
    index along the last axis of a subject's stacked array, so the values
    must stay zero-based and contiguous.
    """
    FA = 0
    LN = 1
    NWS = 2
    WS = 3


@dataclass(frozen=True)
class Measurement:
    """Identifies a single connectome file: its subject and its kind."""
    subject_ID: str
    kind: ConnectomeKind

    @classmethod
    def from_filename(cls, filename: str):
        """Build a Measurement by parsing a connectome file name.

        The name is split on "-": the second field is taken as the subject ID
        and the fifth field, up to its first ".", is the kind token
        ("fa", "ln", "nws" or "ws").

        Raises:
            IndexError: if the name has fewer than five dash-separated fields.
            KeyError: if the kind token is not one of the four known tokens.
        """
        fields = filename.split("-")
        subject_ID = fields[1]
        kind_str = fields[4].split(".")[0]
        kind_from_str = {
            "fa": ConnectomeKind.FA,
            "ln": ConnectomeKind.LN,
            "nws": ConnectomeKind.NWS,
            "ws": ConnectomeKind.WS
        }
        # Construct through cls so subclasses get instances of their own type.
        return cls(subject_ID, kind_from_str[kind_str])
    
def read_connectomes(root: "str | None" = None) -> dict[str, np.ndarray]: # subject_ID -> (87, 87, 4)
    """Load every connectome CSV in a directory into one array per subject.

    Args:
        root: directory to scan. Defaults to the module-level CONNECTOMES_ROOT.

    Returns:
        A dict mapping subject ID to a float array of shape (87, 87, 4). The
        last axis is indexed by ``ConnectomeKind`` value. A kind for which no
        file was found stays all zeros.

    Every entry in the directory is parsed with ``Measurement.from_filename``,
    so an unexpected file name raises rather than being skipped.
    """
    if root is None:
        root = CONNECTOMES_ROOT
    connectomes = {}

    # os.listdir returns names in arbitrary order; sorting makes the dict
    # order (and anything built by iterating it) reproducible across machines.
    for filename in sorted(os.listdir(root)):
        m = Measurement.from_filename(filename)

        if m.subject_ID not in connectomes:
            connectomes[m.subject_ID] = np.zeros(shape=(87, 87, 4), dtype=float)

        path = os.path.join(root, filename)
        # Each CSV is read without a header row and written into the slice for its kind.
        connectomes[m.subject_ID][:, :, m.kind.value] = pd.read_csv(path, header=None).to_numpy()

    return connectomes
     
# Load all connectomes once; later cells read this module-level dict (subject ID -> array).
connectomes = read_connectomes()

In [2]:
class Sex(Enum):
    """Subject sex; the integer values are used as class ids by the classifier cell."""
    MALE = 0
    FEMALE = 1


@dataclass(frozen=True)
class Label:
    """Prediction targets for one subject, taken from a participants-table row."""
    birth_age: float  # units are not visible in this notebook -- TODO confirm
    sex: Sex
    birth_weight: float  # units are not visible in this notebook -- TODO confirm

    @classmethod
    def from_row(cls, row: pd.Series):
        """Build a Label from a row with "birth_age", "sex" and "birth_weight" entries.

        Raises:
            KeyError: if a column is missing, or "sex" is neither "male" nor "female".
        """
        age = row["birth_age"]
        weight = row["birth_weight"]
        sex_dict = {
            "male": Sex.MALE,
            "female": Sex.FEMALE
        }
        sex = sex_dict[row["sex"]]
        # Construct through cls so subclasses get instances of their own type.
        return cls(age, sex, weight)
    

def read_labels(connectomes: dict[str, np.ndarray], participants_path: str = "participants.tsv") -> dict[str, Label]:
    """Read labels for the subjects that have a connectome.

    Args:
        connectomes: mapping keyed by subject ID; only the keys are used.
        participants_path: tab-separated file with a "participant_id" column
            plus the columns ``Label.from_row`` reads.

    Returns:
        A dict mapping subject ID to Label. Subjects present in ``connectomes``
        but absent from the table get no entry. If an ID occurs on several
        rows, the last row wins.
    """
    participants = pd.read_csv(participants_path, sep="\t")
    labels = {}

    for _, row in participants.iterrows():
        subject_ID = row["participant_id"]
        # Rows for subjects without a connectome are never parsed.
        if subject_ID in connectomes:
            labels[subject_ID] = Label.from_row(row)

    return labels
            
# Labels keyed by subject ID, restricted to the subjects present in `connectomes`.
labels = read_labels(connectomes)

In [28]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score

# Feature matrix: one row per subject, each (87, 87, 4) connectome flattened.
N = len(connectomes)
flattened_features = 87*87*4

X = np.zeros(shape=(N, flattened_features), dtype=float)
y_age = np.zeros(shape=(N,), dtype=float)

# Rows follow the iteration order of `connectomes`; later cells rely on the same order.
for i, k in enumerate(connectomes.keys()):
    X[i] = connectomes[k].reshape(-1)
    y_age[i] = labels[k].birth_age

# Held-out estimate: score each fold on subjects the model was not fitted on.
# The in-sample error below only measures how well the model reproduces data it has seen.
age_cv = KFold(n_splits=5, shuffle=True, random_state=0)
cv_mae_age = -cross_val_score(
    LinearRegression(), X, y_age, cv=age_cv, scoring="neg_mean_absolute_error"
)
print(f"birth_age 5-fold CV MAE: {cv_mae_age.mean():.3f} +/- {cv_mae_age.std():.3f}")

lr = LinearRegression()
lr = lr.fit(X, y_age)

# In-sample sanity check only: maximum absolute error on the training data.
# NOTE(review): a linear model with this many features can typically fit the
# training targets exactly, so a near-zero value here is not a performance estimate.
predictions = lr.predict(X)
np.max(np.absolute(predictions - y_age))

6.750155989720952e-14

In [83]:
from sklearn.model_selection import KFold, cross_val_score

# Depends on N and X from the birth-age cell, and on the same `connectomes` iteration order.
y_weight = np.zeros(shape=(N,), dtype=float)

for i, k in enumerate(connectomes.keys()):
    y_weight[i] = labels[k].birth_weight

# Keep subjects whose weight lies strictly between 0 and 5. NaN fails both
# comparisons, so missing weights are dropped as well.
# NOTE(review): the thresholds presumably assume a specific unit -- confirm.
no_outliers_idx = (y_weight > 0) & (y_weight < 5)
X_weight = X[no_outliers_idx]
y_weight_kept = y_weight[no_outliers_idx]

# Held-out estimate: score each fold on subjects the model was not fitted on.
weight_cv = KFold(n_splits=5, shuffle=True, random_state=0)
cv_mae_weight = -cross_val_score(
    LinearRegression(), X_weight, y_weight_kept, cv=weight_cv, scoring="neg_mean_absolute_error"
)
print(f"birth_weight 5-fold CV MAE: {cv_mae_weight.mean():.3f} +/- {cv_mae_weight.std():.3f}")

lr = LinearRegression()
lr = lr.fit(X_weight, y_weight_kept)

# In-sample sanity check only: maximum absolute error on the training data.
# `predictions` lines up with y_weight[no_outliers_idx], not with the full y_weight.
predictions = lr.predict(X_weight)
np.max(np.absolute(predictions - y_weight_kept))

1.7985612998927536e-14

In [85]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Depends on N and X from the birth-age cell, and on the same `connectomes` iteration order.
y_sex = np.zeros(shape=(N,), dtype=int)

# Class ids are the Sex enum values (MALE = 0, FEMALE = 1).
for i, k in enumerate(connectomes.keys()):
    y_sex[i] = labels[k].sex.value

# Held-out estimate: stratified folds keep the class balance similar in every split.
sex_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
cv_acc_sex = cross_val_score(GaussianNB(), X, y_sex, cv=sex_cv)
print(f"sex 5-fold CV accuracy: {cv_acc_sex.mean():.3f} +/- {cv_acc_sex.std():.3f}")

classifier = GaussianNB()
classifier = classifier.fit(X, y_sex)
# In-sample accuracy: scored on the same subjects the classifier was fitted on.
classifier.score(X, y_sex)

0.978494623655914