# Table of Contents
1. Introduction
2. Import
3. Analysis & Preprocessing
4. Model & Training
5. Analysis & Conclusion

# 1. Introduction
References:
- https://machinelearningmastery.com/feature-selection-for-regression-data/
- https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html#sklearn.neural_network.MLPRegressor
- https://towardsdatascience.com/deep-neural-multilayer-perceptron-mlp-with-scikit-learn-2698e77155e
- https://www.studytonight.com/post/what-is-mean-squared-error-mean-absolute-error-root-mean-squared-error-and-r-squared#:~:text=MAE%3A%20It%20is%20not%20very,the%20weighted%20individual%20differences%20equally.

With MLPRegressor, we can split the training data to also be used as validation data for early stopping.



# 2. Import

In [5]:
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
import pandas as pd
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from tensorflow.keras.losses import MeanSquaredLogarithmicError
import matplotlib.pyplot as plt
import numpy as np 
import sklearn.metrics as metrics
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

# 3. Analysis & Preprocessing


In [6]:
data = pd.read_excel('../../data_sets/Volumetric_features.xlsx')
data_feat = pd.DataFrame(data, columns = data.columns[:-1])
data_feat = data_feat.drop(['S.No','Age'], axis=1)

data.head(5)
data.describe()

Unnamed: 0,S.No,Left-Lateral-Ventricle,Left-Inf-Lat-Vent,Left-Cerebellum-White-Matter,Left-Cerebellum-Cortex,Left-Thalamus,Left-Caudate,Left-Putamen,Left-Pallidum,3rd-Ventricle,...,rh_supramarginal_thickness,rh_frontalpole_thickness,rh_temporalpole_thickness,rh_transversetemporal_thickness,rh_insula_thickness,rh_MeanThickness_thickness,BrainSegVolNotVent.2,eTIV.1,Age,dataset
count,4226.0,4226.0,4226.0,4226.0,4226.0,4226.0,4226.0,4226.0,4226.0,4226.0,...,4226.0,4226.0,4226.0,4226.0,4226.0,4226.0,4226.0,4226.0,4226.0,4226.0
mean,2113.5,13370.040795,574.849716,14646.696711,52002.811571,7164.947539,3337.653526,4505.158755,1958.214458,1418.947373,...,2.429779,2.684327,3.555803,2.288283,2.846123,2.372266,1085468.0,1514925.0,58.374586,4.533838
std,1220.085448,9194.928348,594.590387,2622.868798,6378.435917,1207.229615,502.352001,713.65858,287.139826,635.143286,...,0.185543,0.275245,0.332094,0.269851,0.195038,0.146944,124888.1,165179.8,20.064099,3.057928
min,1.0,2204.1,0.0,6920.1,29911.8,4145.4,1035.6,2294.0,851.9,39.7,...,1.345,1.655,1.94,1.176,1.533,1.48329,627960.0,832981.5,18.0,1.0
25%,1057.25,7031.625,243.2,12909.875,47359.675,6239.425,2984.5,4008.125,1764.7,941.825,...,2.309,2.51,3.36,2.105,2.72,2.274935,995758.5,1404471.0,43.0,1.0
50%,2113.5,10669.95,385.8,14277.0,51333.65,7032.15,3294.05,4438.1,1940.1,1225.45,...,2.4405,2.685,3.5865,2.297,2.851,2.383375,1075919.0,1511767.0,61.0,4.0
75%,3169.75,17332.65,720.825,15959.725,56287.775,7977.4,3655.125,4963.025,2128.0,1780.225,...,2.56275,2.851,3.79,2.476,2.975,2.483142,1168888.0,1625445.0,76.0,8.0
max,4226.0,79812.5,7533.8,35042.5,79948.2,13008.3,6018.0,8446.1,4357.7,4461.6,...,2.996,3.928,4.487,3.123,3.482,2.80373,1545129.0,2075213.0,96.0,9.0


In [7]:
scaler = StandardScaler()
x = scaler.fit_transform(data_feat)
n = 20
pca = PCA(n_components=n)
pca_data = pca.fit_transform(x)

labels = data_feat.columns.values.tolist()
label_index = [np.abs(pca.components_[i]).argmax() for i in range(n)]
columns = [labels[label_index[i]] for i in range(n)]

pca_df = pd.DataFrame(data=pca_data, columns=columns)
print(pca_df.head)
pca_df.head()

<bound method NDFrame.head of       rh_MeanThickness_thickness  CerebralWhiteMatterVol  \
0                       1.754401                1.293660   
1                       1.417516                1.506792   
2                       2.060537                1.356491   
3                       4.321472                1.316559   
4                       3.432616                1.645477   
...                          ...                     ...   
4221                    3.508241                2.349695   
4222                    4.445945               -2.409495   
4223                    8.016491                2.326576   
4224                   -0.596625                2.547035   
4225                   -2.307309               -3.481648   

      Left-Lateral-Ventricle  lh_pericalcarine_thickness  SurfaceHoles  \
0                   1.400081                   -1.479290     -1.847142   
1                   1.654037                   -0.614614     -1.383877   
2                   1.48897

Unnamed: 0,rh_MeanThickness_thickness,CerebralWhiteMatterVol,Left-Lateral-Ventricle,lh_pericalcarine_thickness,SurfaceHoles,CC_Posterior,rh_caudalanteriorcingulate_thickness,CC_Posterior.1,Right-Caudate,lh_parahippocampal_thickness,MaskVol-to-eTIV,Brain-Stem,Left-vessel,Right-vessel,non-WM-hypointensities,rh_isthmuscingulate_thickness,5th-Ventricle,5th-Ventricle.1,5th-Ventricle.2,5th-Ventricle.3
0,1.754401,1.29366,1.400081,-1.47929,-1.847142,2.542015,-1.133833,-0.41109,-0.388942,1.552638,-0.227492,0.218987,-1.63487,0.568368,-0.286874,0.586273,-0.233864,-0.268157,0.582861,-0.795788
1,1.417516,1.506792,1.654037,-0.614614,-1.383877,2.363133,-1.437472,-0.853719,-0.784845,1.706035,0.186719,0.4497,-1.530065,0.186152,-0.735079,0.253801,0.243168,-0.29146,1.046822,-0.603192
2,2.060537,1.356491,1.488975,-1.056305,-1.174194,2.159467,-2.170179,-0.817869,-1.138811,1.448323,0.080882,0.456564,-1.006849,0.633636,-0.617794,0.416272,0.486921,0.059992,0.948482,-0.39764
3,4.321472,1.316559,1.100205,-0.240387,-1.373084,2.410745,-1.140959,-1.565931,-0.841107,1.33451,0.308029,-0.308046,-1.411119,0.560599,-0.323439,0.204077,0.372448,-0.052275,0.765669,-0.579789
4,3.432616,1.645477,2.126721,-0.80497,-1.320604,3.072865,-1.201045,-0.820277,-0.543963,1.519215,-0.086463,0.460403,-1.529703,1.21149,-0.854428,0.128542,-0.277969,-0.205248,1.060636,-0.950348


In [11]:
# Split for validation --> train, val, test = 80/15/5
# train to test (val and test) --> include random shuffle
x_train, x_validation, y_train, y_validation = train_test_split(pca_df, data['Age'], test_size=0.20, random_state=33)

# (20% of total dataset -> 75% validation = 15% total, 25% validation = 5% total
# val and test --> include random shuffle
x_val, x_test, y_val, y_test = train_test_split(x_validation, y_validation, test_size=0.25, random_state=33)

print("x_train shape is:",x_train.shape)
print("y_train shape is:",y_train.shape, "\n")
print("x_val shape is:",x_val.shape)
print("y_val shape is:",y_val.shape, "\n")
print("x_test shape is:",x_test.shape)
print("y_test shape is:",y_test.shape)

x_train shape is: (3380, 20)
y_train shape is: (3380,) 

x_val shape is: (634, 20)
y_val shape is: (634,) 

x_test shape is: (212, 20)
y_test shape is: (212,)


# 4. Model & Train


In [14]:
model = MLPRegressor(hidden_layer_sizes=(64,64,64),activation="relu" ,random_state=1, max_iter=500).fit(x_train, y_train)
model.fit(x_train, y_train)

MLPRegressor(hidden_layer_sizes=(64, 64, 64), max_iter=500, random_state=1)

In [13]:
y_pred=model.predict(x_test)

print("Explained variance: " + str(metrics.explained_variance_score(y_test, y_pred)))
print("Max Error: " + str(metrics.max_error(y_test, y_pred)))
print("Mean absolute error: " + str(metrics.mean_absolute_error(y_test, y_pred)))
print("Mean squared error: " + str(metrics.mean_squared_error(y_test, y_pred)))
print("Root Mean squared error: " + str(metrics.mean_squared_error(y_test, y_pred, squared=False)))
print("R2: " + str(metrics.r2_score(y_test, y_pred)))

Explained variance: 0.8571969629974416
Max Error: 24.93274734352451
Mean absolute error: 5.508498039494388
Mean squared error: 56.76007959746956
Root Mean squared error: 7.533928563337295
R2: 0.8552366470350719
