# Table of Contents
1. Introduction
2. Import
3. Analysis & Preprocessing
4. Model & Training
5. Analysis & Conclusion

# 1. Introduction
References:
- https://machinelearningmastery.com/feature-selection-for-regression-data/
- https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html#sklearn.neural_network.MLPRegressor
- https://towardsdatascience.com/deep-neural-multilayer-perceptron-mlp-with-scikit-learn-2698e77155e
- https://www.studytonight.com/post/what-is-mean-squared-error-mean-absolute-error-root-mean-squared-error-and-r-squared#:~:text=MAE%3A%20It%20is%20not%20very,the%20weighted%20individual%20differences%20equally.

With MLPRegressor, we can split the training data to also be used as validation data for early stopping.

# 2. Import

In [1]:
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
import pandas as pd
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from tensorflow.keras.losses import MeanSquaredLogarithmicError
import matplotlib.pyplot as plt
import numpy as np 
import sklearn.metrics as metrics
from sklearn.metrics import r2_score

# 3. Anaysis & Preprocessing

In [2]:
data = pd.read_excel('../../data_sets/Volumetric_features.xlsx')
data_feat = pd.DataFrame(data, columns = data.columns[:-1])
data_feat = data_feat.drop(['S.No','Age'], axis=1)

data.head(5)
data.describe()

Unnamed: 0,S.No,Left-Lateral-Ventricle,Left-Inf-Lat-Vent,Left-Cerebellum-White-Matter,Left-Cerebellum-Cortex,Left-Thalamus,Left-Caudate,Left-Putamen,Left-Pallidum,3rd-Ventricle,...,rh_supramarginal_thickness,rh_frontalpole_thickness,rh_temporalpole_thickness,rh_transversetemporal_thickness,rh_insula_thickness,rh_MeanThickness_thickness,BrainSegVolNotVent.2,eTIV.1,Age,dataset
count,4226.0,4226.0,4226.0,4226.0,4226.0,4226.0,4226.0,4226.0,4226.0,4226.0,...,4226.0,4226.0,4226.0,4226.0,4226.0,4226.0,4226.0,4226.0,4226.0,4226.0
mean,2113.5,13370.040795,574.849716,14646.696711,52002.811571,7164.947539,3337.653526,4505.158755,1958.214458,1418.947373,...,2.429779,2.684327,3.555803,2.288283,2.846123,2.372266,1085468.0,1514925.0,58.374586,4.533838
std,1220.085448,9194.928348,594.590387,2622.868798,6378.435917,1207.229615,502.352001,713.65858,287.139826,635.143286,...,0.185543,0.275245,0.332094,0.269851,0.195038,0.146944,124888.1,165179.8,20.064099,3.057928
min,1.0,2204.1,0.0,6920.1,29911.8,4145.4,1035.6,2294.0,851.9,39.7,...,1.345,1.655,1.94,1.176,1.533,1.48329,627960.0,832981.5,18.0,1.0
25%,1057.25,7031.625,243.2,12909.875,47359.675,6239.425,2984.5,4008.125,1764.7,941.825,...,2.309,2.51,3.36,2.105,2.72,2.274935,995758.5,1404471.0,43.0,1.0
50%,2113.5,10669.95,385.8,14277.0,51333.65,7032.15,3294.05,4438.1,1940.1,1225.45,...,2.4405,2.685,3.5865,2.297,2.851,2.383375,1075919.0,1511767.0,61.0,4.0
75%,3169.75,17332.65,720.825,15959.725,56287.775,7977.4,3655.125,4963.025,2128.0,1780.225,...,2.56275,2.851,3.79,2.476,2.975,2.483142,1168888.0,1625445.0,76.0,8.0
max,4226.0,79812.5,7533.8,35042.5,79948.2,13008.3,6018.0,8446.1,4357.7,4461.6,...,2.996,3.928,4.487,3.123,3.482,2.80373,1545129.0,2075213.0,96.0,9.0


In [3]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x = scaler.fit_transform(data)
n = 20
pca = PCA(n_components=n)
pca_data = pca.fit_transform(x)

labels = data.columns.values.tolist()
label_index = [np.abs(pca.components_[i]).argmax() for i in range(n)]
columns = [labels[label_index[i]] for i in range(n)]

pca_df = pd.DataFrame(data=pca_data, columns=columns)
print(pca_df.head)
pca_df.head()

<bound method NDFrame.head of       rh_MeanThickness_thickness  CerebralWhiteMatterVol  \
0                       2.116693                1.364190   
1                       1.781763                1.577276   
2                       2.423064                1.424486   
3                       4.657487                1.366377   
4                       3.795704                1.701513   
...                          ...                     ...   
4221                    3.332053                2.220377   
4222                    4.258130               -2.535944   
4223                    7.826457                2.169783   
4224                   -0.702317                2.439426   
4225                   -2.373678               -3.566135   

      Left-Lateral-Ventricle  lh_lateralorbitofrontal_thickness  SurfaceHoles  \
0                   1.509802                          -2.003121     -1.880328   
1                   1.751954                          -1.118550     -1.486579   
2     

Unnamed: 0,rh_MeanThickness_thickness,CerebralWhiteMatterVol,Left-Lateral-Ventricle,lh_lateralorbitofrontal_thickness,SurfaceHoles,CC_Posterior,rh_entorhinal_thickness,CC_Posterior.1,Right-Caudate,MaskVol-to-eTIV,rh_frontalpole_thickness,MaskVol-to-eTIV.1,Right-Cerebellum-White-Matter,MaskVol-to-eTIV.2,Right-vessel,non-WM-hypointensities,rh_isthmuscingulate_thickness,5th-Ventricle,non-WM-hypointensities.1,non-WM-hypointensities.2
0,2.116693,1.36419,1.509802,-2.003121,-1.880328,2.276848,-1.649639,-0.007367,-0.453152,1.808253,-0.646168,0.146171,-1.069372,1.512286,-1.021502,-0.426147,0.024956,0.809091,0.052304,0.003078
1,1.781763,1.577276,1.751954,-1.11855,-1.486579,2.075661,-1.817474,-0.36995,-0.915827,1.988267,-0.58411,0.58018,-0.872833,1.436075,-1.493649,-0.715451,-0.488153,0.526728,-0.084644,0.472698
2,2.423064,1.424486,1.583733,-1.54259,-1.246739,1.773018,-2.459066,-0.637561,-1.271929,1.772413,-0.718069,0.537112,-0.485187,1.756954,-1.089075,-0.777283,-0.318507,0.173705,0.012735,0.006911
3,4.657487,1.366377,1.174636,-0.637782,-1.459907,2.253791,-1.243366,-0.936529,-1.087601,1.576583,-0.511822,0.073793,-0.487378,1.8274,-1.459152,-0.207134,-0.430751,0.646171,0.099156,-0.194418
4,3.795704,1.701513,2.22699,-1.242737,-1.389213,2.820539,-1.68886,-0.281402,-0.691087,1.803964,-0.80899,0.444286,-0.994916,2.086454,-0.699686,-0.605364,-0.497586,1.149907,0.034801,0.37891


In [4]:
# Split for validation --> train, val, test = 80/15/5
# train to test (val and test) --> include random shuffle
x_train, x_validation, y_train, y_validation = train_test_split(pca_df, data['Age'], test_size=0.20, random_state=33)

# (20% of total dataset -> 75% validation = 15% total, 25% validation = 5% total
# val and test --> include random shuffle
x_val, x_test, y_val, y_test = train_test_split(x_validation, y_validation, test_size=0.25, random_state=33)

print("x_train shape is:",x_train.shape)
print("y_train shape is:",y_train.shape, "\n")
print("x_val shape is:",x_val.shape)
print("y_val shape is:",y_val.shape, "\n")
print("x_test shape is:",x_test.shape)
print("y_test shape is:",y_test.shape)

x_train shape is: (3380, 20)
y_train shape is: (3380,) 

x_val shape is: (634, 20)
y_val shape is: (634,) 

x_test shape is: (212, 20)
y_test shape is: (212,)


# 4. Model & Train

In [5]:
model = MLPRegressor(hidden_layer_sizes=(64,64,64),activation="relu" ,random_state=1, max_iter=2000).fit(x_train, y_train)
model.fit(x_train, y_train)

MLPRegressor(hidden_layer_sizes=(64, 64, 64), max_iter=2000, random_state=1)

# 5. Analysis & Conclusion

In [6]:
y_pred=model.predict(x_test)

print("Explained variance: " + str(metrics.explained_variance_score(y_test, y_pred)))
print("Max Error: " + str(metrics.max_error(y_test, y_pred)))
print("Mean absolute error: " + str(metrics.mean_absolute_error(y_test, y_pred)))
print("Mean squared error: " + str(metrics.mean_squared_error(y_test, y_pred)))
print("Root Mean squared error: " + str(metrics.mean_squared_error(y_test, y_pred, squared=False)))
print("R2: " + str(metrics.r2_score(y_test, y_pred)))

Explained variance: 0.838112960338836
Max Error: 37.96850001837336
Mean absolute error: 5.912247785639341
Mean squared error: 65.6727049498043
Root Mean squared error: 8.103869751532555
R2: 0.8325055032651163
