# Machine Learning pipeline

In this notebook, we go through the machine learning pipeline to reproduce Lydia Chougar's paper. The following sections will be covered:

1 - Convert CSV to DataFrame

2 - Normalize

3 - Train and predict models

4 - Cross Validation

5 - Results 

### Imports

In [1]:
import pandas as pd
import numpy as np
import glob, utils, sys
from bs4 import BeautifulSoup as bs

# Convert CSV to DataFrame

Loads the volumes CSV into a DataFrame and optionally applies a heuristic, selected by key:
- "combine": sums all Left and Right regions into one column

In [84]:
def get_data(csvFileName: str, ROI: list, heuristic=None, getDf=False):
    '''
    Load the volumes CSV, keep the requested columns and return it either as a
    DataFrame or as a feature matrix / label vector pair.

    @csvFileName: input volumes csv
    @ROI: regions of interests desired (forwarded to utils.remove_unwanted_columns)
    @heuristic: function key; "combine" applies utils.combine_left_right_vol,
                any other value leaves the frame untouched
    @getDf: when True, return the DataFrame with "subjectId" moved to the first
            column instead of the (X, y) arrays

    Returns the DataFrame when getDf is True. Otherwise returns (X, y), where X
    is every column except the last one (after dropping "subjectId") and y is
    the last column passed through utils.convert_Y.
    '''
    df = pd.read_csv(csvFileName)
    df = utils.remove_unwanted_columns(df, ROI)

    if heuristic == "combine":
        df = utils.combine_left_right_vol(df)

    if getDf:
        # Put the identifier first; the remaining columns keep their order.
        cols = list(df.columns.values)
        cols.remove("subjectId")
        return df[["subjectId"] + cols]

    # The keyword form is required: pandas 2.0 removed the positional `axis`
    # argument that `df.drop("subjectId", 1)` relied on.
    arr = df.drop(columns="subjectId").values
    X = arr[:, :-1]
    # The last remaining column is treated as the label column.
    y = utils.convert_Y(arr[:, -1])

    return X, y

Test *get_data()* function

In [85]:
# Column names handed to get_data() as its ROI argument. "subjectId" and
# "class" are listed alongside the brain regions because get_data() and
# normalize2() access those two columns by name.
ROI = [
      "subjectId", "class",
      "Left-Putamen", "Right-Putamen", 
      "Right-Caudate", "Left-Caudate", 
      "Right-Thalamus-Proper", "Left-Thalamus-Proper", 
      "Left-Pallidum", "Right-Pallidum", 
      "Left-Cerebellum-Cortex", "Right-Cerebellum-Cortex", "lhCortexVol", "rhCortexVol", "CortexVol",
      "Left-Cerebellum-White-Matter", "Right-Cerebellum-White-Matter",
      "CerebralWhiteMatterVol", 
      "3rd-Ventricle", "4th-Ventricle"
   ]
# Array form: feature matrix X and label vector y.
X, y = get_data("volumes.csv", ROI, "combine")
# DataFrame form of the same file, with "subjectId" as the first column.
df = get_data("volumes.csv", ROI, "combine", getDf=True)
df



Unnamed: 0,subjectId,Pallidum,Putamen,Caudate,Cerebellum-Cortex,Cerebellum-White-Matter,3rd-Ventricle,4th-Ventricle,lhCortexVol,rhCortexVol,CortexVol,CerebralWhiteMatterVol,class
0,3572,3234.8,9403.7,6425.5,101202.1,25991.9,809.7,1595.2,214211.281877,214614.080385,428825.362262,398502.0,NC
1,3756,4021.1,11431.6,8336.5,112507.9,27906.7,1677.0,3021.0,241712.658555,250985.173323,492697.831879,475569.0,NC
2,3768,4240.4,9828.9,7275.3,108180.9,31634.3,2087.1,2487.0,240319.225000,236183.634262,476502.859262,526061.0,NC
3,3369,3871.7,11068.1,7607.9,98919.4,27742.0,918.2,1360.3,227101.242448,229470.755367,456571.997815,420273.0,NC
4,4004,3584.6,9379.7,5883.9,93553.2,25869.0,1374.2,2325.2,215114.677435,213485.100332,428599.777767,388901.0,NC
...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,3366,5022.9,9982.7,6595.5,105510.8,38319.9,1169.8,1489.1,221069.501788,222936.985582,444006.487370,503456.0,PD
211,3367,4348.7,11044.1,6933.7,105346.1,31321.5,1101.7,1875.8,222235.310341,227057.206724,449292.517066,472333.0,PD
212,3116,4310.3,9121.3,6645.2,101564.1,28589.7,1863.4,2550.6,198129.877854,201670.182642,399800.060496,404765.0,PD
213,3587,4172.0,10429.8,8795.1,123481.9,28049.4,1282.8,2278.8,282485.457627,284700.760325,567186.217952,501236.0,PD


# 2. [Normalize](#normal)

In this section, we implement the two normalization techniques, "Normalization 1" and "Normalization 2".

Normalization 1:

$$\dfrac{\text{Variable} - \text{mean of PD and NC in the training cohort}}{\sigma \text{ of PD and NC in the training cohort}}$$

Normalization 2:

$$\dfrac{\text{Variable} - \text{mean of controls scanned using the same scanner}}{\sigma \text{ of controls scanned using the same scanner}}$$


In [179]:
def normalize1(data, mean, std):
    '''
    Column-wise z-score normalization ("Normalization 1").

    @data: array-like or DataFrame of feature values (rows are samples)
    @mean: per-column means to subtract, or None to compute them from data
    @std: per-column standard deviations to divide by, or None to compute them
          from data

    Fit mode (mean and std both None): returns (normalized_values, mean, std),
    where mean and std are the pandas Series computed from data so they can be
    reused on another data set.
    Transform mode (mean and std both given): returns only the normalized
    values.

    Raises ValueError when exactly one of mean / std is None.
    '''
    if (mean is None) != (std is None):
        raise ValueError(
            "mean and std must either both be None (fit) or both be provided (transform)"
        )

    df = pd.DataFrame(data=data)

    if mean is None:
        mean = df.mean(axis=0)
        std = df.std(axis=0)
        # A constant column has std == 0; dividing by it would produce NaN/inf.
        # Use a unit scale instead so the column normalizes to 0. The adjusted
        # std is returned so the same scale is applied in transform mode.
        std = std.mask(std == 0, 1.0)
        return ((df - mean) / std).values, mean, std

    return ((df - mean) / std).values

In [182]:
# Testing normalization1
# Fit the statistics on a tiny training frame, then reuse them on the test frame.
trainDf = pd.DataFrame(np.array([[1, 2, 3], [3, 4, 7]]),columns=['a', 'b', 'c'])
testDf = pd.DataFrame(np.array([[2, 6, 4], [3, 7, 9]]),columns=['a', 'b', 'c'])
# With mean/std set to None, normalize1 returns the normalized values together
# with the mean and std it computed.
normTrainDf, trainMean, trainStd = normalize1(trainDf, None, None)
print(trainMean)
# Passing the training statistics back in returns only the normalized values.
normTestDf = normalize1(testDf, trainMean, trainStd)
normTrainDf

a    2.0
b    3.0
c    5.0
dtype: float64


array([[-0.70710678, -0.70710678, -0.70710678],
       [ 0.70710678,  0.70710678,  0.70710678]])

In [140]:
def normalize2(df):
    '''
    Per-scanner normalization ("Normalization 2").

    @df: DataFrame containing "subjectId", "class" and the feature columns

    Each row is shifted and scaled with the mean / std that
    utils.get_mean_and_stats returns for that row's "scannerType", which is
    looked up by merging df with utils.parse_metadata() on "subjectId".

    Returns a float NumPy array of the normalized feature columns (df without
    "subjectId" and "class"), in the same row order as df.

    Raises ValueError when the metadata merge does not yield exactly one row per
    input row, because rows and scanner types could then no longer be paired.
    '''
    # Keyword form: pandas 2.0 removed the positional `axis` argument of drop().
    features = df.drop(columns=["subjectId", "class"])
    metadata_df = utils.parse_metadata()
    merged_df = pd.merge(df, metadata_df, on=["subjectId"])

    # Rows of `features` and `merged_df` are paired by position, so the inner
    # merge must neither drop nor duplicate subjects.
    if len(merged_df) != len(features):
        raise ValueError(
            f"metadata merge produced {len(merged_df)} rows for {len(features)} "
            "input rows; every subjectId needs exactly one metadata entry"
        )

    scanner_per_row = merged_df["scannerType"].to_numpy()
    # Work on a float copy so the caller's frame is untouched and integer
    # columns cannot reject the normalized values.
    normalized = features.to_numpy(dtype=float, copy=True)

    for scanner in merged_df["scannerType"].unique():
        mean, std = utils.get_mean_and_stats(merged_df.drop(columns="subjectId"), scanner)
        # The statistics are applied positionally to the feature columns.
        mean_vec = np.asarray(list(mean.to_dict().values()), dtype=float)
        std_vec = np.asarray(list(std.to_dict().values()), dtype=float)

        # Normalize every row of this scanner at once instead of row by row.
        rows = scanner_per_row == scanner
        normalized[rows] = (normalized[rows] - mean_vec) / std_vec

    return normalized

In [141]:
# Smoke-test normalize2 on test.csv, using the DataFrame form of get_data().
df_norm2 = get_data("test.csv", ROI, "combine", getDf=True)
# normalize2 returns a NumPy array (despite the "Df" suffix of the variable name).
normalize2Df = normalize2(df_norm2)
normalize2Df

  
  


array([[-0.70710678, -0.70710678, -0.70710678, -0.70710678, -0.70710678,
        -0.70710678, -0.70710678, -0.70710678, -0.70710678, -0.70710678,
        -0.70710678],
       [ 0.70710678,  0.70710678,  0.70710678,  0.70710678,  0.70710678,
         0.70710678,  0.70710678,  0.70710678,  0.70710678,  0.70710678,
         0.70710678],
       [ 1.82235096,  2.68279471,  0.65286694,  1.58555251,  6.58397649,
         0.69631596,  2.55149474, 11.59372081,  2.76593576,  4.94406413,
         2.81328748],
       [ 0.70710678,  0.70710678,  0.70710678, -0.70710678,  0.70710678,
        -0.70710678, -0.70710678, -0.70710678, -0.70710678, -0.70710678,
         0.70710678],
       [-0.70710678, -0.70710678, -0.70710678,  0.70710678, -0.70710678,
         0.70710678,  0.70710678,  0.70710678,  0.70710678,  0.70710678,
        -0.70710678]])

# 3. [Train and predict models](#predict)

In this section, we define four models: logistic regression, SVM with a linear kernel, SVM with a radial basis function kernel, and random forest. As per the paper:

_Using the scikit-learn package, four supervised
machine learning algorithms were used: logistic regression, support vector machine (SVM) with a linear kernel, SVM with a radial basis function kernel, and
random forest_ (Chougar et al.)

Additionally, we will implement a stratified cross validation loop for hyperparameter tuning. As per the paper:

_The cross-validation procedure on the training cohort included two nested loops: an outer loop with repeated stratified random splits with 50 repetitions evaluating the classification performances and an inner loop with 5 fold cross-validation used to optimize the hyperparameters of the algorithms_ (Chougar et al.)

### Imports

In [153]:
# Models
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Utils
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import PredefinedSplit
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score

## Utilities


In [163]:
def split_data(X, y, training_split):
    '''
    Partition the features and labels into a training part and a testing part.

    @X: features
    @y: labels
    @training_split: forwarded to train_test_split as train_size

    Returns the 4-tuple (X_train, X_test, y_train, y_test). The split uses the
    fixed random_state 42.
    '''
    return tuple(
        train_test_split(X, y, train_size=training_split, random_state=42)
    )

def get_model_score(model, X_train, y_train, X_test, y_test):
    '''
    Score a fitted model on the training and testing sets and print both values.

    @model: object exposing score(X, y)
    @X_train, @y_train: training features and labels
    @X_test, @y_test: testing features and labels

    Returns (training score, testing score); the printed values are rounded to
    three decimals, the returned ones are not.
    '''
    scores = (model.score(X_train, y_train), model.score(X_test, y_test))
    for label, value in zip(("training", "testing"), scores):
        print(f'{label} score: {round(value, 3)}')
    return scores

def model(X, y, modelType, dataSplit, normalize, paramGrid, dataFile, ROI, heuristic=None):
    '''
    Split the data, normalize it, tune one classifier with a cross-validated
    grid search on the training part and print its scores.

    @X, @y: features and labels
    @modelType: "SVM", "RF" or "LR"
    @dataSplit: training fraction handed to split_data
    @normalize: the normalize1 or normalize2 function (selected by its __name__)
    @paramGrid: parameter grid handed to GridSearchCV
    @dataFile, @ROI, @heuristic: only used with normalize2, which needs the
        DataFrame form of the data; they are forwarded to get_data and should
        describe the same data as X and y

    Prints the mean cross-validated score of the best parameters, the training
    and testing scores of the refitted best estimator, and the best parameters.

    Raises ValueError for an unknown modelType or normalization function.
    '''
    estimators = {
        "SVM": SVC,
        "RF": RandomForestClassifier,
        "LR": LogisticRegression,
    }
    if modelType not in estimators:
        raise ValueError(
            f"Unknown modelType {modelType!r}; expected one of {sorted(estimators)}"
        )
    normalizer_name = getattr(normalize, "__name__", None)
    if normalizer_name not in ("normalize1", "normalize2"):
        raise ValueError("normalize must be the normalize1 or normalize2 function")

    # Define training and test sets
    X_train, X_test, y_train, y_test = split_data(X, y, dataSplit)

    # Setup CV (seeded so repeated runs select the same hyperparameters)
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=50, random_state=0)
    clf = GridSearchCV(estimators[modelType](random_state=0), paramGrid, cv=cv, n_jobs=-1)

    # Normalize model data
    if normalizer_name == "normalize1":
        # The statistics are fitted on the training part only and reused on the test part.
        X_train_normalized, mean_train, std_train = normalize(X_train, None, None)
        X_test_normalized = normalize(X_test, mean_train, std_train)
    else:
        # normalize2 needs the subjectId / class columns, so the DataFrame form is
        # split again; split_data uses a fixed seed, so the partition is the same.
        df = get_data(dataFile, ROI, heuristic, getDf=True)
        X_train, X_test, y_train, y_test = split_data(df, y, dataSplit)
        X_train_normalized = normalize2(X_train)
        X_test_normalized = normalize2(X_test)

    # Fit once. The grid search already cross-validates every parameter set, so
    # its best_score_ is the CV estimate. Running cross_val_score on the search
    # object would repeat the whole grid search inside every outer fold, and on
    # the test set it would refit there instead of evaluating the trained model.
    fitted_search = clf.fit(X_train_normalized, y_train)
    print(f'cross-validation score: {round(fitted_search.best_score_, 3)}')
    get_model_score(fitted_search, X_train_normalized, y_train, X_test_normalized, y_test)
    print(f'Best model params: {fitted_search.best_params_}')

In [164]:
# Hyperparameter grid for the SVC search: every combination of C and gamma is tried.
param_grid = {
    'C': [1.0, 10.0, 100.0, 1000.0],
    'gamma': [0.01, 0.10, 1.00, 10.00]
}
# Reload the combined-volume features/labels, then run the SVM search with normalize2.
# On the normalize2 path model() re-reads the data file itself via get_data(), so the
# file, ROI and heuristic passed to model() should match the ones used to build X and y.
X, y = get_data("volumes.csv", ROI, "combine")
model(X, y, "SVM", 0.7, normalize2, param_grid, "volumes.csv", ROI, heuristic="combine")

  df = df.drop(column, 1)
  
  
  mean = queryDf.mean()
  std = queryDf.std()


KeyboardInterrupt: 

### SVM

In [28]:
# Shared SVM search space; the kernel is added per iteration below.
param_grid = {
    'C': [1.0, 10.0, 100.0, 1000.0],
    'gamma': [0.01, 0.10, 1.00, 10.00]
}
for kernelType in ["linear", "rbf"]:
    # Build a fresh grid per kernel instead of mutating the shared dict.
    # GridSearchCV requires every grid value to be a list, so the kernel is
    # wrapped in one rather than passed as a bare string.
    kernel_grid = {**param_grid, "kernel": [kernelType]}
    if kernelType == "linear":
        # gamma has no effect on the linear kernel; searching it only repeats fits.
        kernel_grid.pop("gamma")
    # dataFile / heuristic are only read on the normalize2 path, but they are kept
    # consistent with how X and y were built ("volumes.csv", "combine").
    model(X, y, "SVM", 0.7, normalize1, kernel_grid, "volumes.csv", ROI, heuristic="combine")

### Logistic Regression

In [29]:
# Logistic-regression search space.
# The default 'lbfgs' solver does not support the 'l1' or 'elasticnet' penalties,
# so 'saga' (which supports all three) is requested explicitly. 'elasticnet' also
# needs an l1_ratio, hence the second sub-grid.
# NOTE(review): the l1_ratio value is a placeholder mid-point - tune as needed.
param_grid = [
    {
        'solver': ['saga'],
        'penalty': ["l1", "l2"],
        'C': [1.0, 10.0, 100.0, 1000.0]
    },
    {
        'solver': ['saga'],
        'penalty': ["elasticnet"],
        'l1_ratio': [0.5],
        'C': [1.0, 10.0, 100.0, 1000.0]
    },
]
# Disabled run, updated to the current model() signature:
# model(X, y, "LR", 0.7, normalize1, param_grid, "volumes.csv", ROI, heuristic="combine")

### Random forest

In [30]:
# Random-forest search space: forest size, split criterion and the two
# minimum-sample constraints.
param_grid = dict(
    n_estimators=[100, 500, 1000],
    criterion=['gini', 'entropy'],
    min_samples_split=[2, 4, 5, 10, 13],
    min_samples_leaf=[1, 2, 5, 8, 13],
)
# The run itself is currently disabled:
# model(X, y, "RF", 0.7, normalize1, param_grid)