# Machine Learning pipeline

In this notebook, we go through the machine learning pipeline to reproduce Lydia Chougar's paper. The following sections will be covered:

1 - Convert CSV to DataFrame

2 - Normalize

3 - Train and predict models

4 - Cross Validation

5 - Results 

### Imports

In [1]:
import pandas as pd
import numpy as np
import glob, utils, sys

# Convert CSV to DataFrame

Reads the volumes CSV into a DataFrame, keeps the requested regions of interest, and optionally applies one of the following heuristics:
- "combine": sums all Left and Right regions into one column

In [2]:
def get_data(csvFileName: str, ROI: [], heuristic = None):
    '''
    Load a volumes CSV and split it into a feature matrix and a label vector.
    @csvFileName: path of the input volumes csv, read with pandas
    @ROI: regions of interest, forwarded to utils.remove_unwanted_columns
    @heuristic: function key; "combine" applies utils.combine_left_right_vol,
                any other value leaves the frame as it is
    @return: (X, y) where X holds every column except the last one and y is
             the last column passed through utils.convert_Y
    '''
    volumes = utils.remove_unwanted_columns(pd.read_csv(csvFileName), ROI)

    if heuristic == "combine":
        volumes = utils.combine_left_right_vol(volumes)

    # The last column is treated as the class label, everything before it as features.
    # NOTE(review): this relies on the utils helpers leaving the label column last - confirm.
    table = volumes.values
    features, raw_labels = table[:, :-1], table[:, -1]
    return features, utils.convert_Y(raw_labels)

Test *get_data()* function

In [3]:
# Columns requested from the volumes CSV, one per line; the order is unchanged.
ROI = [
    "class",  # the class column that get_data turns into y (NOTE(review): confirm in utils)
    "Left-Putamen",
    "Right-Putamen",
    "Right-Caudate",
    "Left-Caudate",
    "Right-Thalamus-Proper",
    "Left-Thalamus-Proper",
    "Left-Pallidum",
    "Right-Pallidum",
    "Left-Cerebellum-Cortex",
    "Right-Cerebellum-Cortex",
    "lhCortexVol",
    "rhCortexVol",
    "CortexVol",
    "Left-Cerebellum-White-Matter",
    "Right-Cerebellum-White-Matter",
    "CerebralWhiteMatterVol",
    "3rd-Ventricle",
    "4th-Ventricle",
]

# Smoke-test get_data() on the volumes file using the "combine" heuristic.
X, y = get_data("volumes.csv", ROI, heuristic="combine")

  df = df.drop(column, 1)


# 2. [Normalize](#normal)

In this section, the data are normalized using the "Normalization 1" technique; "Normalization 2" is not implemented yet (see the TODO below).

Normalization 1:

$$\dfrac{\text{Variable} - \text{mean of PD and NC in the training cohort}}{\sigma \text{ of PD and NC in the training cohort}}$$

Normalization 2:

$$\dfrac{\text{Variable} - \text{mean of controls scanned using the same scanner}}{\sigma \text{ of controls scanned using the same scanner}}$$

In [4]:
def normalize1(data):
    '''
    Z-score every column of ``data`` ("Normalization 1").

    Each value has the mean of its column subtracted and is divided by the
    column's standard deviation (population std, numpy's default ddof=0).
    The statistics are computed from ``data`` itself, never from a global.

    @data: 2-D array-like of numeric values, one row per sample
    @return: float numpy array with the same shape as ``data``
    '''
    # Cast once so the column statistics also work when the caller hands in an
    # object-dtype array (e.g. a slice of DataFrame.values with mixed columns).
    values = np.asarray(data, dtype=float)

    center = values.mean(axis=0)
    spread = values.std(axis=0)
    # A constant column has zero spread; divide by 1 instead so it maps to 0
    # rather than producing nan/inf.
    spread = np.where(spread == 0, 1.0, spread)

    return (values - center) / spread
            
# Sanity check: normalize the feature matrix X loaded by get_data() above and display it.
normalize1(X)

array([[ 1.55335454,  0.96866592,  1.40183806, ...,  1.23027958,
         1.28178675,  1.51350306],
       [ 0.01989099,  0.04470567, -0.07057706, ..., -0.168547  ,
        -0.20556361,  0.01279145],
       [ 0.78740875,  0.12263003, -0.45080533, ..., -0.22258116,
        -0.22465209,  0.14865698],
       ...,
       [-0.59219754, -0.19455276,  0.04771861, ...,  0.45704757,
         0.4538891 , -0.58305278],
       [ 0.69191115,  1.18404692, -0.56112723, ...,  0.8528606 ,
         0.88509841, -0.11419325],
       [ 0.32937396,  0.29808084, -0.01563457, ...,  1.07436628,
         0.99221888,  0.91377117]])

### TODO: Fetch metadata for every patient

In [44]:
def normalize2():
    '''
    Placeholder for "Normalization 2".

    Nothing is normalized yet: the function only prints a notice and
    returns None.
    '''
    notice = "TODO - Unimplemented"
    print(notice)

# 3. [Train and predict models](#predict)

In this section, we define four models: logistic regression, SVM with a linear kernel, SVM with a radial kernel, and a random forest.

### Imports

In [5]:
# Models
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Utils
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import PredefinedSplit

## Utilities


In [6]:
def split_data(X, y, training_split):
    '''
    Partition features and labels into a training part and a remaining part.

    @X: feature rows
    @y: labels, aligned with X
    @training_split: passed to train_test_split as train_size (the original
                     documentation describes it as a fraction in [0 - 1])
    @return: (X_train, X_test, y_train, y_test); random_state is fixed to 42
    '''
    pieces = train_test_split(X, y, train_size=training_split, random_state=42)
    # Hand back a tuple in the order X_train, X_test, y_train, y_test.
    return tuple(pieces)

### SVM

In [None]:
def svm(X, y, kernelType: str, dataSplit: float, normalize):
    '''
    Grid-search an SVM classifier and print its train/validation/test accuracy.

    The data is split three ways: ``dataSplit`` of it becomes the training set
    and the remainder is halved into a validation and a test set. A
    PredefinedSplit makes GridSearchCV select hyper-parameters on exactly that
    validation set instead of running k-fold cross validation.

    @X: feature matrix
    @y: labels aligned with X
    @kernelType: value used for the SVC ``kernel`` parameter
    @dataSplit: training fraction handed to split_data
    @normalize: callable taking a feature matrix and returning the normalized
                matrix; its __name__ is printed
    @return: None, the scores and best parameters are printed
    '''
    print(f"Running SVM with the following parameters: \nKernel type: {kernelType}\nNormalization method: {normalize.__name__}")
    # Use the notebook's own split_data helper: there is no `preprocess`
    # module in scope, so `preprocess.split_data` raised a NameError.
    X_train, X_test, y_train, y_test = split_data(X, y, dataSplit)
    X_val, X_test, y_val, y_test = split_data(X_test, y_test, 0.5)

    X_grid = np.concatenate((X_train, X_val))
    y_grid = np.concatenate((y_train, y_val))
    # -1 keeps a sample in the training part of every split, 0 assigns it to
    # the single validation fold.
    separation_boundary = [-1 for _ in y_train] + [0 for _ in y_val]
    ps = PredefinedSplit(separation_boundary)

    param_grid = {
        'C': [1.0, 10.0, 100.0, 1000.0],
        'gamma': [0.01, 0.10, 1.00, 10.00],
        'kernel': [kernelType]
    }

    print(f"param_grid: {param_grid}")

    clf = GridSearchCV(SVC(random_state=0), param_grid, cv=ps)

    # NOTE(review): normalize() is applied to each subset on its own, so every
    # subset is scaled with its own statistics rather than the training
    # cohort's - confirm this matches the intended "Normalization 1".
    model = clf.fit(normalize(X_grid), y_grid)
    train_acc = model.score(normalize(X_train), y_train)
    val_acc = model.score(normalize(X_val), y_val)
    test_acc = model.score(normalize(X_test), y_test)
    print(f'training score: {round(train_acc, 3)}')
    print(f'validation score: {round(val_acc, 3)}')
    print(f'testing score: {round(test_acc, 3)}')
    print(f'Best model params: {model.best_params_}')

### Logistic Regression

In [None]:
def logistic_regression(X, y, dataSplit: float, normalize):
    '''
    Grid-search a logistic regression and print its train/validation/test accuracy.

    The data is split three ways: ``dataSplit`` of it becomes the training set
    and the remainder is halved into a validation and a test set. A
    PredefinedSplit makes GridSearchCV select hyper-parameters on exactly that
    validation set instead of running k-fold cross validation.

    @X: feature matrix
    @y: labels aligned with X
    @dataSplit: training fraction handed to split_data
    @normalize: callable taking a feature matrix and returning the normalized
                matrix; its __name__ is printed
    @return: None, the scores and best parameters are printed
    '''
    print(f"Running Logistic Regression with the following parameters: \nNormalization method: {normalize.__name__}")
    X_train, X_test, y_train, y_test = split_data(X, y, dataSplit)
    X_val, X_test, y_val, y_test = split_data(X_test, y_test, 0.5)

    X_grid = np.concatenate((X_train, X_val))
    y_grid = np.concatenate((y_train, y_val))
    # -1 keeps a sample in the training part of every split, 0 assigns it to
    # the single validation fold.
    separation_boundary = [-1 for _ in y_train] + [0 for _ in y_val]
    ps = PredefinedSplit(separation_boundary)

    c_values = [1.0, 10.0, 100.0, 1000.0]
    # The elastic-net penalty needs an l1_ratio, so it gets its own sub-grid;
    # the pure l1/l2 penalties do not use that parameter.
    param_grid = [
        {'penalty': ["l1", "l2"], 'C': c_values},
        {'penalty': ["elasticnet"], 'l1_ratio': [0.5], 'C': c_values},
    ]

    # The default lbfgs solver only supports the l2 penalty, so the l1 and
    # elasticnet candidates failed to fit and were never really searched.
    # saga supports all three penalties; max_iter is raised because saga
    # often needs more than the default 100 iterations to converge.
    estimator = LogisticRegression(solver="saga", max_iter=1000, random_state=0)
    clf = GridSearchCV(estimator, param_grid, cv=ps)

    # NOTE(review): normalize() is applied to each subset on its own, so every
    # subset is scaled with its own statistics rather than the training
    # cohort's - confirm this matches the intended "Normalization 1".
    model = clf.fit(normalize(X_grid), y_grid)
    train_acc = model.score(normalize(X_train), y_train)
    val_acc = model.score(normalize(X_val), y_val)
    test_acc = model.score(normalize(X_test), y_test)
    print(f'training score: {round(train_acc, 3)}')
    print(f'validation score: {round(val_acc, 3)}')
    print(f'testing score: {round(test_acc, 3)}')
    print(f'Best model params: {model.best_params_}')