# Let's begin! 
### First we import some useful python libraries...

In [None]:
## Imports
from nilearn import datasets
from nilearn.connectome import ConnectivityMeasure
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Download data

In [None]:
# Download settings: how many subjects to fetch and which ROI time-series
# derivative to request. Files are stored under the current directory.
n_subjects = 100
parcel = "rois_ho"  # either "rois_ho" or "rois_aal"
data = datasets.fetch_abide_pcp(
    n_subjects=n_subjects,
    derivatives=[parcel],
    data_dir="./",
)

## Phenotypes and Demographics

In [None]:
# Build the phenotype table, then discard the two index-like columns.
pheno = pd.DataFrame(data["phenotypic"])
pheno = pheno.drop(columns=["i", "Unnamed: 0"])
pheno.head()

In [None]:
# Tally participants per diagnostic group and per scanning site.
dx_counts = pheno["DX_GROUP"].value_counts()
site_counts = pheno["SITE_ID"].value_counts()

print(f"Dx count:\n{dx_counts}\n\nScanning site_counts:\n{site_counts}")

## MRI features

### These are stored in a list, where each list element is a subject-specific feature matrix
### Subject specific feature matrix: timepoints x ROIs
### ROIs are defined based on [harvard_oxford atlas](https://nilearn.github.io/modules/generated/nilearn.datasets.fetch_atlas_harvard_oxford.html) or [AAL atlas](https://nilearn.github.io/modules/generated/nilearn.datasets.fetch_atlas_aal.html)

In [None]:
# Per-subject feature matrices for the selected parcellation (one list
# element per subject).
features = data[parcel]
print(f"Number of samples: {len(features)}")

# Use the first subject's matrix as a representative example of the shape.
first_subject = features[0]
subject_feature_shape = first_subject.shape
print(f"subject_feature_shape: {subject_feature_shape}")

## Let's see what the atlas looks like

In [None]:
from nilearn import plotting

# Fetch the atlas that matches the selected parcellation: Harvard-Oxford for
# "rois_ho", AAL for any other value.
atlas = (
    datasets.fetch_atlas_harvard_oxford("cort-maxprob-thr25-2mm")
    if parcel == "rois_ho"
    else datasets.fetch_atlas_aal()
)

plotting.plot_roi(atlas.maps, draw_cross=False, title=parcel)

## And the subject-specific feature matrix

In [None]:
# Heatmap of the first subject's time series, transposed so that ROIs run
# along the rows and timepoints along the columns (matching the axis labels).
f, ax = plt.subplots(figsize=(15, 10))
roi_by_time = features[0].T
g = sns.heatmap(roi_by_time, ax=ax)
g.set_xlabel("timepoint")
g.set_ylabel("ROI")
g.set_title("Functional data timeseries")

## Preprocessing / feature engineering

### Commonly, functional (timeseries) neuroimaging data is represented as a functional connectome, a.k.a. a network or graph.

In [None]:
# Estimate the ROI-by-ROI correlation matrix (connectome) of the first subject.
# The estimator gets its own name so that `connectome_matrix` only ever refers
# to the resulting matrix; this is the same name the later all-subjects cell
# uses for its estimator.
correlation_measure = ConnectivityMeasure(kind="correlation")
# fit_transform is given a one-element list here; take the single result back out.
connectome_matrix = correlation_measure.fit_transform([features[0]])[0]
print(f"Shape connectome: {connectome_matrix.shape}")

# Visualise the connectome as a ROI x ROI heatmap.
f, ax = plt.subplots(figsize=(15, 10))
g = sns.heatmap(connectome_matrix, ax=ax)
g.set_xlabel("ROI")
g.set_ylabel("ROI")
g.set_title("Connectome")

In [None]:
# Keep only the entries strictly below the diagonal (k=-1 skips the diagonal
# itself); the upper triangle could be used instead.
n_rois = connectome_matrix.shape[0]
tril_idx = np.tril_indices(n_rois, k=-1)
features_flat = connectome_matrix[tril_idx]
print(f"Number of features per subject: {len(features_flat)}")

## Now do this for all subjects!

In [None]:
# defs are definitely useful!


def extract_connectome_features(func_data, measure):
    """Return one subject's connectome as a flat vector of lower-triangle entries.

    ``measure`` must expose ``fit_transform``; it is called on a one-element
    list containing ``func_data`` and its first result is taken as the
    connectome matrix. Only the entries strictly below the diagonal are kept.
    """
    # The measure works on a list of subjects, so wrap the single subject
    # and take the only result back out.
    connectome = measure.fit_transform([func_data])[0]

    # k=-1 excludes the diagonal.
    n_rois = len(connectome)
    rows, cols = np.tril_indices(n_rois, k=-1)
    return connectome[rows, cols]

## Note

Here we are pre-processing each image independently, i.e. not using any group-level information for scaling / normalization / feature transformation (e.g. PCA). Therefore there is no "double dipping" or leakage of information from a test set. Because this pre-processing is independent for each image, we can apply it to the entire dataset up front, rather than first creating train-test splits and then defining the feature set on the training data only.

In [None]:
correlation_measure = ConnectivityMeasure(kind="correlation")

# One flattened connectome per subject, in the same order as `features`.
flat_features_list = [
    extract_connectome_features(subject_timeseries, correlation_measure)
    for subject_timeseries in features
]

print(f"Length of flat_features_list {len(flat_features_list)}")

## Input data matrix (n_samples, n_features)

In [None]:
# Collect the per-subject feature vectors into a single array; the heading
# above describes the intended layout as (n_samples, n_features).
X = np.array(flat_features_list)

print(f"Input data (X) shape: {X.shape}")

# Optional visualisation, left disabled because it is memory intensive.
# NOTE(review): `flat_features_array` is not defined in the visible notebook;
# `X` looks like the intended argument, and the axis labels appear swapped
# (rows of X would be subjects, columns connections) -- confirm before
# re-enabling.

# g = sns.heatmap(flat_features_array)
# g.set_xlabel('Connection strength')
# g.set_ylabel('ROI')
# g.set_title('Connectome')

## Output labels (y): Diagnosis


In [None]:
from sklearn import preprocessing

# The diagnosis column is the prediction target.
outcome = "DX_GROUP"
le = preprocessing.LabelEncoder()

y = pheno[outcome]
y_counts = y.value_counts()
print(f"Unique output clasess:\n{y_counts}")

# Replace the raw diagnosis codes with integer class indices.
y = le.fit_transform(y)

## Okay now we have our input data (X) and output data (y) in the following format
<img src="./QLS_ML_terminology.png" alt="terms" width="800"/>


### Save `ML-ready` data

In [None]:
# Destination of the ML-ready archive (np.savez adds the ".npz" suffix).
ds_name = "my_ML_dataset"
save_path = f"../../data/{ds_name}"

# Flip to True to actually write X and y to disk.
save_dataset = False

if save_dataset:
    print(f"Saving dataset to {save_path}")
    print(f"Dataset X_shape: {X.shape}, y_shape: {y.shape}")
    # Both arrays go into one archive under the keys "X" and "y".
    np.savez(save_path, X=X, y=y)

### Load the dataset (sanity check)

In [None]:
from pathlib import Path

# np.savez appends ".npz" to the path it is given, so the archive is read
# back under that name.
npz_path = Path(save_path + ".npz")

if npz_path.is_file():
    print(f"Loading dataset from: {save_path}")
    # The object returned by np.load keeps the archive open until it is
    # closed; the context manager releases the file handle once both arrays
    # have been read into memory.
    with np.load(npz_path) as my_ds:
        X = my_ds["X"]
        y = my_ds["y"]
    print(f"Loaded dataset X_shape: {X.shape}, y_shape: {y.shape}")
else:
    # With save_dataset left at False nothing is written, so there may be no
    # archive to check; keep the in-memory X and y instead of crashing.
    print(f"No saved dataset found at {npz_path}; skipping the sanity check.")