In [2]:
# Import the script function that downloads preprocessed fMRI data
from download_abide_preprocessed_dataset import collect_and_download

In [3]:
def download_data(desired_derivative, desired_strategy, desired_pipeline):
    """Download the preprocessed data for the ASD group and then for the controls.

    Each group is written to its own folder under ``abide_preprocessed_dataset``.

    Args:
        desired_derivative: Derivative to fetch, forwarded unchanged to
            ``collect_and_download``.
        desired_strategy: Preprocessing strategy, forwarded unchanged.
        desired_pipeline: Preprocessing pipeline, forwarded unchanged.
    """
    # (local output folder, diagnosis filter) per group. The diagnosis value is
    # one of 'asd', 'tdc' or 'both'; ASD is requested first, then controls.
    download_targets = (
        ('abide_preprocessed_dataset/ASD', 'asd'),
        ('abide_preprocessed_dataset/TDC', 'tdc'),
    )

    # Note the helper takes pipeline before strategy, unlike this function's own signature.
    for out_dir, diagnosis in download_targets:
        collect_and_download(desired_derivative, desired_pipeline, desired_strategy, out_dir, diagnosis)

In [4]:
# Organizing Features and Labels to input into ML algorithm

# Import the required modules
import os
import numpy as np

def gather_features(data_dir):
    """Load every ``.1D`` timeseries file found directly inside ``data_dir``.

    Args:
        data_dir: Path of the directory to scan (sub-directories are not searched).

    Returns:
        list of numpy.ndarray: One array per ``.1D`` file, ordered by file name.
        The list is empty when the directory holds no matching files.

    Raises:
        OSError: If ``data_dir`` cannot be listed (for example, it does not exist).
    """
    features = []

    # os.listdir yields entries in arbitrary order; sorting keeps the sample
    # order (and therefore any seeded split performed later) reproducible.
    for file_name in sorted(os.listdir(data_dir)):
        subject_path = os.path.join(data_dir, file_name)

        # Only regular files with the timeseries extension are data; a directory
        # whose name happens to end in '.1D' must not be handed to the loader.
        if not (file_name.endswith('.1D') and os.path.isfile(subject_path)):
            continue

        features.append(np.asarray(load_timeseries_data(subject_path)))

    return features

def load_timeseries_data(file_path):
    """Read a whitespace-delimited numeric ``.1D`` file into a NumPy array.

    Args:
        file_path: Path of the text file to read.

    Returns:
        numpy.ndarray: The values parsed by ``np.loadtxt``.
    """
    # The file is opened in text mode here and the handle is passed to NumPy,
    # so it is closed as soon as parsing finishes.
    with open(file_path, mode='r') as handle:
        return np.loadtxt(handle)

# Define a function for padding time series data
def pad_timeseries_to_length(timeseries, target_length):
    """Force a timeseries to exactly ``target_length`` entries along its first axis.

    Longer inputs are truncated; shorter inputs are extended with trailing zeros.
    The trailing dimensions are preserved, so 1-D series (such as a
    single-column file) are handled as well as 2-D ones.

    Args:
        timeseries: Array-like with at least one dimension; axis 0 is the one
            being truncated or padded.
        target_length: Required length of axis 0.

    Returns:
        numpy.ndarray: Array whose first dimension equals ``target_length``.
    """
    timeseries = np.asarray(timeseries)
    current_length = len(timeseries)

    if current_length >= target_length:
        return timeseries[:target_length]

    # Build the zero block from the trailing shape rather than shape[1], which
    # does not exist for 1-D input.
    padding = np.zeros((target_length - current_length,) + timeseries.shape[1:])
    return np.concatenate([timeseries, padding], axis=0)

def features_and_labels(pipeline, derivative, strategy):
    """Build the feature matrix and label vector from the downloaded timeseries.

    Timeseries are read from the ASD and TDC download folders, brought to a
    common length (zero-padded up to the longest series) and flattened to one
    row per subject.

    Args:
        pipeline: Pipeline name used as a path component.
        derivative: Derivative name used as a path component.
        strategy: Strategy name used as a path component.

    Returns:
        tuple: ``(all_features, all_labels)`` where ``all_features`` is a 2-D
        array with one flattened timeseries per row and ``all_labels`` holds
        1.0 for ASD and 0.0 for TDC. ASD rows come first.

    Raises:
        ValueError: If neither folder contains any timeseries file.
    """
    # Directories containing the downloaded preprocessed data for each group
    download_asd_dir = f'abide_preprocessed_dataset/ASD/Outputs/{pipeline}/{strategy}/{derivative}'
    download_tdc_dir = f'abide_preprocessed_dataset/TDC/Outputs/{pipeline}/{strategy}/{derivative}'

    features_asd = gather_features(download_asd_dir)
    features_tdc = gather_features(download_tdc_dir)

    # ASD subjects first, then TDC; the labels below rely on this ordering.
    all_features = list(features_asd) + list(features_tdc)
    if not all_features:
        # Without this guard the failure is an opaque "max() arg is an empty sequence".
        raise ValueError(
            f"No .1D timeseries files found in '{download_asd_dir}' or '{download_tdc_dir}'"
        )

    # Pad (or truncate) every timeseries to the longest one so they can be stacked.
    max_length = max(len(ts) for ts in all_features)
    all_features_padded = [pad_timeseries_to_length(ts, max_length) for ts in all_features]

    # Stack into a single array (one entry per subject; 3-D assuming each file
    # loads as a 2-D array), then flatten each subject to one row so the result
    # is the 2-D matrix scikit-learn estimators expect.
    all_features = np.array(all_features_padded)
    all_features = all_features.reshape((len(all_features), -1))

    # Label 1 for ASD, 0 for TDC, in the same order as the feature rows.
    all_labels = np.concatenate([np.ones(len(features_asd)), np.zeros(len(features_tdc))])

    print("Number of rows in all_features:", len(all_features))
    print("Number of rows in all_labels:", len(all_labels))
    print("Shape of all_features: ", all_features.shape)
    print("Shape of all_labels: ", all_labels.shape)

    return all_features, all_labels

In [5]:
# Part of scikit-learn library; Support Vector Classification (SVC), a type of SVM used for classification tasks
from sklearn.svm import SVC 

# Part of scikit-learn library; Random Forest Classifier, a type of Random Forest used for classification tasks
from sklearn.ensemble import RandomForestClassifier

# Part of scikit-learn library; Logistic Regression algorithm
from sklearn.linear_model import LogisticRegression

# Part of scikit-learn library; Decision Tree Classifier, a type of Decision Tree used for classification tasks
from sklearn.tree import DecisionTreeClassifier

# Part of scikit-learn library; Naive Bayes algorithm
from sklearn.naive_bayes import GaussianNB

# Part of scikit-learn library; K-Neighbors Classifier; a type of K-Nearest Neighbors used for classification tasks
from sklearn.neighbors import KNeighborsClassifier

# Function from scikit-learn; splits datasets into training and testing sets
from sklearn.model_selection import train_test_split 

# Function from scikit-learn; computes parameters relating to the model's performance
from sklearn.metrics import accuracy_score, confusion_matrix

# Functionalities from scikit-learn for performing k-fold cross validation of data
from sklearn.model_selection import KFold, cross_val_predict

import numpy as np 


# TRAINING AND TESTING ML MODEL (various algorithms)

# Function for training/testing ML model (for all algorithms)
def train_test_model(X_train, X_test, y_train, algorithm):
    """Fit the requested classifier on the training data and predict the test data.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train: Training labels.
        algorithm: One of 'SVM', 'RF', 'LR', 'DT', 'NB' or 'KNN'.

    Returns:
        The predictions returned by the fitted model's ``predict`` for ``X_test``.

    Raises:
        ValueError: If ``algorithm`` is not one of the supported keys.
    """
    # Factories instead of instances: only the requested estimator is built,
    # rather than constructing all six on every call.
    model_factories = {
        'SVM': lambda: SVC(),
        'RF': lambda: RandomForestClassifier(n_estimators=100, random_state=42),
        'LR': lambda: LogisticRegression(),
        'DT': lambda: DecisionTreeClassifier(random_state=42),
        'NB': lambda: GaussianNB(),
        'KNN': lambda: KNeighborsClassifier(n_neighbors=5),
    }

    # Fail early with a helpful message; a plain .get() would return None and
    # surface later as "'NoneType' object has no attribute 'fit'".
    if algorithm not in model_factories:
        raise ValueError(
            f"Unknown algorithm {algorithm!r}; expected one of {sorted(model_factories)}"
        )

    model = model_factories[algorithm]()
    model.fit(X_train, y_train)  # Training the model

    return model.predict(X_test)  # Testing the model on the held-out samples

# Function for calculating performance metrics
def calculate_metrics(y_test, y_predictions, labels=(0, 1)):
    """Compute binary classification metrics from true and predicted labels.

    Args:
        y_test: True labels.
        y_predictions: Predicted labels.
        labels: ``(negative, positive)`` label values. Defaults to ``(0, 1)``,
            matching the 0 = TDC / 1 = ASD labels built in this notebook.

    Returns:
        tuple: ``(accuracy, conf_matrix, sensitivity, specificity, precision,
        f1_score)``. ``conf_matrix`` is always 2x2, laid out as
        ``[[tn, fp], [fn, tp]]``. A ratio whose denominator is zero is
        reported as 0.0.
    """
    accuracy = accuracy_score(y_test, y_predictions)

    # Pinning the labels keeps the matrix 2x2 even when a fold contains a single
    # class; otherwise the 4-way unpack below fails on a 1x1 matrix.
    conf_matrix = confusion_matrix(y_test, y_predictions, labels=list(labels))
    tn, fp, fn, tp = conf_matrix.ravel()

    def _ratio(numerator, denominator):
        # Guard against 0/0 (e.g. no positive samples or no positive predictions),
        # which would otherwise yield NaN and a RuntimeWarning.
        return float(numerator) / float(denominator) if denominator else 0.0

    sensitivity = _ratio(tp, tp + fn)  # Recall / true positive rate
    specificity = _ratio(tn, tn + fp)  # True negative rate
    precision = _ratio(tp, tp + fp)
    f1_score = _ratio(2 * precision * sensitivity, precision + sensitivity)

    return accuracy, conf_matrix, sensitivity, specificity, precision, f1_score

# Function for training/testing fMRI data with ML model
def train_test_fMRI_data(fMRI_features, labels, algorithm, k):
    """Run shuffled k-fold cross validation and report the fold-averaged metrics.

    Args:
        fMRI_features: Array-like feature matrix with one row per sample.
        labels: Array-like of labels aligned with the rows of ``fMRI_features``.
        algorithm: Algorithm key passed on to ``train_test_model``.
        k: Number of folds.

    Returns:
        dict: The fold-averaged values that are also printed, under the keys
        'accuracy', 'confusion_matrix', 'sensitivity', 'specificity',
        'precision' and 'f1_score'.
    """
    # Index arrays from KFold require NumPy-style fancy indexing, so coerce
    # plain lists up front.
    fMRI_features = np.asarray(fMRI_features)
    labels = np.asarray(labels)

    # Fixed seed so the fold assignment is the same on every run.
    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    total_accuracy = 0
    total_conf_matrix = np.zeros((2, 2))
    total_sensitivity = 0
    total_specificity = 0
    total_precision = 0
    total_f1_score = 0

    for train_index, test_index in kf.split(fMRI_features):
        X_train, X_test = fMRI_features[train_index], fMRI_features[test_index]
        y_train, y_test = labels[train_index], labels[test_index]

        # Train on this fold's training split and predict its held-out split
        predictions = train_test_model(X_train, X_test, y_train, algorithm)

        accuracy, conf_matrix, sensitivity, specificity, precision, f1_score = calculate_metrics(y_test, predictions)

        total_accuracy += accuracy
        total_conf_matrix += conf_matrix
        total_sensitivity += sensitivity
        total_specificity += specificity
        total_precision += precision
        total_f1_score += f1_score

    averages = {
        'accuracy': total_accuracy / k,
        # Despite the printed label, this is the mean per-fold matrix (sum / k).
        'confusion_matrix': total_conf_matrix / k,
        'sensitivity': total_sensitivity / k,
        'specificity': total_specificity / k,
        'precision': total_precision / k,
        'f1_score': total_f1_score / k,
    }

    print("Average Accuracy: ", averages['accuracy'])
    print("Overall Confusion Matrix: \n", averages['confusion_matrix'])
    print("Average Sensitivity: ", averages['sensitivity'])
    print("Average Specificity: ", averages['specificity'])
    print("Average Precision: ", averages['precision'])
    print("Average F1 Score: ", averages['f1_score'])

    # Returned as well as printed so callers can compare runs programmatically.
    return averages

In [6]:
def test_diagnostic_model(derivative, strategy, pipeline, algorithm, k=5):
    """Download the data, build features and labels, then cross-validate a model.

    Args:
        derivative: Derivative name used for the download and the data path.
        strategy: Strategy name used for the download and the data path.
        pipeline: Pipeline name used for the download and the data path.
        algorithm: Algorithm key passed to the cross-validation routine.
        k: Number of cross-validation folds (default 5).
    """
    # Step 1: make sure the requested files are present locally.
    download_data(derivative, strategy, pipeline)

    # Step 2: turn the downloaded timeseries into a feature matrix and labels.
    # features_and_labels takes pipeline first, unlike this function.
    X, y = features_and_labels(pipeline, derivative, strategy)

    # Step 3: k-fold training and evaluation; the metrics are printed by the callee.
    train_test_fMRI_data(X, y, algorithm, k)

In [7]:
# Test run w/ k-fold cross validation (k is left at its default of 5)
test_diagnostic_model(
    derivative="rois_ho",
    strategy="nofilt_noglobal",
    pipeline="cpac",
    algorithm="LR",
)

File abide_preprocessed_dataset/ASD/Outputs/cpac/nofilt_noglobal/rois_ho/Pitt_0050004_rois_ho.1D already exists, skipping...
File abide_preprocessed_dataset/ASD/Outputs/cpac/nofilt_noglobal/rois_ho/Pitt_0050005_rois_ho.1D already exists, skipping...
File abide_preprocessed_dataset/ASD/Outputs/cpac/nofilt_noglobal/rois_ho/Pitt_0050006_rois_ho.1D already exists, skipping...
File abide_preprocessed_dataset/ASD/Outputs/cpac/nofilt_noglobal/rois_ho/Pitt_0050007_rois_ho.1D already exists, skipping...
File abide_preprocessed_dataset/ASD/Outputs/cpac/nofilt_noglobal/rois_ho/Pitt_0050008_rois_ho.1D already exists, skipping...
File abide_preprocessed_dataset/ASD/Outputs/cpac/nofilt_noglobal/rois_ho/Pitt_0050009_rois_ho.1D already exists, skipping...
File abide_preprocessed_dataset/ASD/Outputs/cpac/nofilt_noglobal/rois_ho/Pitt_0050010_rois_ho.1D already exists, skipping...
File abide_preprocessed_dataset/ASD/Outputs/cpac/nofilt_noglobal/rois_ho/Pitt_0050011_rois_ho.1D already exists, skipping...
