# Peer-graded Assignment: Build a Regression Model in Keras
## Introduction to Deep Learning & Neural Networks with Keras
### Nicolò Cogno

In [55]:
# Import needed libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import tensorflow.keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [56]:
# Fetch the course's concrete dataset over HTTP and load it into a DataFrame
concrete_data = pd.read_csv(
    'https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/DL0101EN/labs/data/concrete_data.csv'
)
concrete_data.head()  # Preview call; not rendered, because only the cell's last expression is displayed
concrete_data.isnull().sum()  # Per-column count of missing values (all zeros means nothing to clean)

Cement                0
Blast Furnace Slag    0
Fly Ash               0
Water                 0
Superplasticizer      0
Coarse Aggregate      0
Fine Aggregate        0
Age                   0
Strength              0
dtype: int64

In [57]:
# Separate the column to be predicted from the columns used as model inputs
target = concrete_data['Strength']  # Series with the value the network must learn
predictors = concrete_data.drop(columns=['Strength'])  # Every remaining column is a predictor

In [58]:
# Width of the network input: one input feature per predictor column
n_cols = len(predictors.columns)

## Part A

In [59]:
def regression_model(n_hidden_layers=1, n_nodes=10):
    """Build and compile a fully connected regression network.

    Called with no arguments it returns the baseline architecture: one hidden
    layer of 10 ReLU nodes followed by a single linear output node.

    Parameters
    ----------
    n_hidden_layers : int, default 1
        Number of ReLU hidden layers; must be at least 1.
    n_nodes : int, default 10
        Number of nodes in every hidden layer.

    Returns
    -------
    Sequential
        A freshly initialised (untrained) model compiled with the Adam
        optimizer and mean squared error as the loss.

    Raises
    ------
    ValueError
        If ``n_hidden_layers`` is smaller than 1.
    """
    if n_hidden_layers < 1:
        raise ValueError('n_hidden_layers must be at least 1, got {}'.format(n_hidden_layers))

    model = Sequential()
    # The first hidden layer also declares the input width, read from the module-level n_cols
    model.add(Dense(n_nodes, activation = 'relu', input_shape = (n_cols,)))
    for _ in range(n_hidden_layers - 1):  # Any additional hidden layers
        model.add(Dense(n_nodes, activation = 'relu'))
    model.add(Dense(1)) # Output layer: one linear node for the regression target
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

In [60]:
mean_squared_errors = list()    # Collects one test-set MSE per training repetition

In [61]:
mean_squared_errors.clear()  # Start from an empty list so re-running this cell never accumulates stale results
for regr in range(50):  # Train and evaluate 50 independent repetitions
    model = regression_model()  # Fresh, untrained network for every repetition
    # Seed the split with the repetition index: a constant seed hands all 50 runs the very same
    # train/test partition, whereas the index keeps each split different yet reproducible.
    X_train, X_test, y_train, y_test = train_test_split(predictors, target, test_size=0.30, random_state=regr)
    # No validation_data here: the per-epoch validation history was never read, so it only slowed training.
    model.fit(X_train, y_train, epochs = 50, verbose = 0)   # Train the model
    scores = model.evaluate(X_test, y_test, verbose = 0)    # Test-set loss (MSE); verbose=0 hides the progress bar
    print('Loss: {}'.format(scores))
    y_pred = model.predict(X_test, verbose = 0)  # Predict Strength from the held-out predictors
    MSE = mean_squared_error(y_test, y_pred)
    mean_squared_errors.append(MSE)

Loss: 276.6073303222656
Loss: 97.82149505615234
Loss: 152.24343872070312
Loss: 257.4018859863281
Loss: 395.8190612792969
Loss: 284.0530090332031
Loss: 108.07923889160156
Loss: 115.56226348876953
Loss: 432.3526611328125
Loss: 118.11132049560547
Loss: 109.1284408569336
Loss: 125.69947052001953
Loss: 739.0977172851562
Loss: 112.54144287109375
Loss: 288.4467468261719
Loss: 119.91796875
Loss: 144.6535186767578
Loss: 106.30297088623047
Loss: 107.05835723876953
Loss: 437.7818603515625
Loss: 89.43695831298828
Loss: 625.1016235351562
Loss: 116.6165771484375
Loss: 305.1651611328125
Loss: 118.46409606933594
Loss: 100.81780242919922
Loss: 265.8695373535156
Loss: 168.82505798339844
Loss: 115.09770965576172
Loss: 146.27952575683594
Loss: 118.45795440673828
Loss: 309.5190734863281
Loss: 536.57861328125
Loss: 135.82176208496094
Loss: 131.7104034423828
Loss: 117.56607818603516
Loss: 1249.61181640625
Loss: 498.9057922363281
Loss: 110.87825012207031
Loss: 163.9605712890625
Loss: 449.4671325683594
Loss: 9

In [62]:
# Summarise the repetitions: average test MSE and how much it varies between runs
mean_squared_errors_array = np.asarray(mean_squared_errors)   # List -> ndarray
meanMSE = np.mean(mean_squared_errors_array)
stdMSE = np.std(mean_squared_errors_array)    # Population standard deviation (numpy's default ddof=0)
print('Mean MSE: {} \nStandard deviation MSE: {}'.format(meanMSE, stdMSE))

Mean MSE: 250.7087381033969 
Standard deviation MSE: 228.3624550675365


## Part B

In [63]:
# Z-score every predictor column: subtract the column mean, then divide by the column std dev.
# NOTE(review): the statistics come from the full dataset, i.e. before any train/test split - confirm this is intended.
predictors_norm = predictors.sub(predictors.mean()).div(predictors.std())

In [64]:
predictors_norm.head(n=5)  # Show the first five normalized rows as a sanity check

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age
0,2.476712,-0.856472,-0.846733,-0.916319,-0.620147,0.862735,-1.217079,-0.279597
1,2.476712,-0.856472,-0.846733,-0.916319,-0.620147,1.055651,-1.217079,-0.279597
2,0.491187,0.79514,-0.846733,2.174405,-1.038638,-0.526262,-2.239829,3.55134
3,0.491187,0.79514,-0.846733,2.174405,-1.038638,-0.526262,-2.239829,5.055221
4,-0.790075,0.678079,-0.846733,0.488555,-1.038638,0.070492,0.647569,4.976069


In [65]:
mean_squared_errors_norm = list()  # Collects one test-set MSE per repetition on normalized data

In [66]:
mean_squared_errors_norm.clear()  # Start from an empty list so re-running this cell never accumulates stale results
for regr in range(50):  # 50 independent repetitions on the normalized predictors
    model = regression_model()  # Fresh, untrained network for every repetition
    # Seed the split with the repetition index: a constant seed hands all 50 runs the very same
    # train/test partition, whereas the index keeps each split different yet reproducible.
    X_train, X_test, y_train, y_test = train_test_split(predictors_norm, target, test_size=0.30, random_state=regr)
    # No validation_data here: the per-epoch validation history was never read, so it only slowed training.
    model.fit(X_train, y_train, epochs = 50, verbose = 0)
    scores = model.evaluate(X_test, y_test, verbose = 0)  # Test-set loss (MSE); verbose=0 hides the progress bar
    print('Loss: {}'.format(scores))
    y_pred = model.predict(X_test, verbose = 0)
    MSE = mean_squared_error(y_test, y_pred)
    mean_squared_errors_norm.append(MSE)

Loss: 261.5780029296875
Loss: 493.7534484863281
Loss: 286.6724548339844
Loss: 246.83859252929688
Loss: 712.2465209960938
Loss: 389.51348876953125
Loss: 248.03787231445312
Loss: 263.8329162597656
Loss: 340.5679626464844
Loss: 251.8599853515625
Loss: 584.3123779296875
Loss: 344.10760498046875
Loss: 653.2900390625
Loss: 454.9606018066406
Loss: 402.8463439941406
Loss: 357.2767028808594
Loss: 229.3707733154297
Loss: 235.9031982421875
Loss: 365.38079833984375
Loss: 312.2326965332031
Loss: 480.4718933105469
Loss: 243.61331176757812
Loss: 286.0677185058594
Loss: 536.010009765625
Loss: 239.21156311035156
Loss: 385.92193603515625
Loss: 369.8065185546875
Loss: 289.7602233886719
Loss: 349.325439453125
Loss: 361.13140869140625
Loss: 404.69207763671875
Loss: 301.1956481933594
Loss: 269.2046813964844
Loss: 403.1210632324219
Loss: 454.1298522949219
Loss: 323.8584289550781
Loss: 472.3177490234375
Loss: 719.3666381835938
Loss: 306.0127868652344
Loss: 457.5511779785156
Loss: 265.0275573730469
Loss: 341.6

In [67]:
# Summarise the normalized-data repetitions: average test MSE and its spread
mean_squared_errors_norm_array = np.asarray(mean_squared_errors_norm)
meanMSE_norm = np.mean(mean_squared_errors_norm_array)
stdMSE_norm = np.std(mean_squared_errors_norm_array)  # Population standard deviation (numpy's default ddof=0)
print('Mean MSE: {} \nStandard deviation MSE: {}'.format(meanMSE_norm, stdMSE_norm))

Mean MSE: 369.64711999500037 
Standard deviation MSE: 117.91217155842706


### How does the mean of the mean squared errors compare to that from Step A?
The mean of the mean squared errors from Step B is higher than that from Step A, though the standard deviation is much lower, which means the errors are more tightly clustered around their mean. Normalizing the data, therefore, didn't provide better predictions.

## Part C

In [68]:
mean_squared_errors_norm_100 = list()  # Collects one test-set MSE per repetition of the 100-epoch runs

In [69]:
mean_squared_errors_norm_100.clear()  # Start from an empty list so re-running this cell never accumulates stale results
for regr in range(50):  # 50 independent repetitions, each trained for 100 epochs
    model = regression_model()  # Fresh, untrained network for every repetition
    # Seed the split with the repetition index: a constant seed hands all 50 runs the very same
    # train/test partition, whereas the index keeps each split different yet reproducible.
    X_train, X_test, y_train, y_test = train_test_split(predictors_norm, target, test_size=0.30, random_state=regr)
    # No validation_data here: the per-epoch validation history was never read, so it only slowed training.
    model.fit(X_train, y_train, epochs = 100, verbose = 0)
    scores = model.evaluate(X_test, y_test, verbose = 0)  # Test-set loss (MSE); verbose=0 hides the progress bar
    print('Loss: {}'.format(scores))
    y_pred = model.predict(X_test, verbose = 0)
    MSE = mean_squared_error(y_test, y_pred)
    mean_squared_errors_norm_100.append(MSE)

Loss: 153.69483947753906
Loss: 167.2458038330078
Loss: 184.14096069335938
Loss: 140.02243041992188
Loss: 142.9476776123047
Loss: 149.64903259277344
Loss: 148.46377563476562
Loss: 148.3480224609375
Loss: 146.82749938964844
Loss: 146.2733612060547
Loss: 150.4480743408203
Loss: 171.5102081298828
Loss: 137.39456176757812
Loss: 154.47264099121094
Loss: 153.71524047851562
Loss: 146.25306701660156
Loss: 141.7863311767578
Loss: 152.0480194091797
Loss: 149.4839324951172
Loss: 157.70729064941406
Loss: 151.86146545410156
Loss: 160.28370666503906
Loss: 166.5121307373047
Loss: 150.34878540039062
Loss: 155.58468627929688
Loss: 217.93234252929688
Loss: 157.04420471191406
Loss: 157.41871643066406
Loss: 145.88552856445312
Loss: 135.4132080078125
Loss: 156.16336059570312
Loss: 155.75286865234375
Loss: 157.2366180419922
Loss: 152.71990966796875
Loss: 170.14849853515625
Loss: 159.8982391357422
Loss: 159.30059814453125
Loss: 149.24298095703125
Loss: 162.79165649414062
Loss: 151.04595947265625
Loss: 167.581

In [70]:
# Summarise the 100-epoch repetitions: average test MSE and its spread
mean_squared_errors_norm_100_array = np.asarray(mean_squared_errors_norm_100)
meanMSE_norm_100 = np.mean(mean_squared_errors_norm_100_array)
stdMSE_norm_100 = np.std(mean_squared_errors_norm_100_array)  # Population standard deviation (numpy's default ddof=0)
print('Mean MSE: {} \nStandard deviation MSE: {}'.format(meanMSE_norm_100, stdMSE_norm_100))

Mean MSE: 155.52045926758984 
Standard deviation MSE: 12.894827450704993


### How does the mean of the mean squared errors compare to that from Step B?
The mean of the MSEs from Step C is less than half the value from Step B. Doubling the number of epochs, therefore, improved the accuracy of our NN.

## Part D

In [71]:
def regression_model_3_hidden():
    """Build and compile a regression network with three hidden layers.

    Each hidden layer has 10 ReLU nodes; the output layer is a single linear
    node. The input width is read from the module-level ``n_cols``.

    Returns
    -------
    Sequential
        A freshly initialised (untrained) model compiled with the Adam
        optimizer and mean squared error as the loss.
    """
    # Hidden layer 1 also declares the input shape
    stack = [Dense(10, activation = 'relu', input_shape = (n_cols,))]
    # Hidden layers 2 and 3
    stack += [Dense(10, activation = 'relu') for _ in range(2)]
    # Output layer
    stack.append(Dense(1))

    network = Sequential(stack)
    network.compile(loss='mean_squared_error', optimizer='adam')
    return network

In [72]:
mean_squared_errors_norm_3_hidden = list()  # Collects one test-set MSE per repetition of the 3-hidden-layer model

In [73]:
mean_squared_errors_norm_3_hidden.clear()  # Start from an empty list so re-running this cell never accumulates stale results
for regr in range(50):  # 50 independent repetitions of the three-hidden-layer model
    model = regression_model_3_hidden()  # Fresh, untrained network for every repetition
    # Seed the split with the repetition index: a constant seed hands all 50 runs the very same
    # train/test partition, whereas the index keeps each split different yet reproducible.
    X_train, X_test, y_train, y_test = train_test_split(predictors_norm, target, test_size=0.30, random_state=regr)
    # No validation_data here: the per-epoch validation history was never read, so it only slowed training.
    model.fit(X_train, y_train, epochs = 50, verbose = 0)
    scores = model.evaluate(X_test, y_test, verbose = 0)  # Test-set loss (MSE); verbose=0 hides the progress bar
    print('Loss: {}'.format(scores))
    y_pred = model.predict(X_test, verbose = 0)
    MSE = mean_squared_error(y_test, y_pred)
    mean_squared_errors_norm_3_hidden.append(MSE)

Loss: 92.87045288085938
Loss: 120.51066589355469
Loss: 136.85162353515625
Loss: 140.48004150390625
Loss: 130.45806884765625
Loss: 127.97142791748047
Loss: 129.89422607421875
Loss: 119.19283294677734
Loss: 141.9438934326172
Loss: 106.83126068115234
Loss: 138.22799682617188
Loss: 135.2830352783203
Loss: 133.78428649902344
Loss: 82.9016342163086
Loss: 128.78562927246094
Loss: 129.10418701171875
Loss: 149.78553771972656
Loss: 127.364013671875
Loss: 138.46392822265625
Loss: 135.64292907714844
Loss: 130.11729431152344
Loss: 125.68170928955078
Loss: 122.06484985351562
Loss: 137.12664794921875
Loss: 119.50796508789062
Loss: 127.14738464355469
Loss: 127.9810791015625
Loss: 115.96002960205078
Loss: 132.3997802734375
Loss: 133.86305236816406
Loss: 133.11825561523438
Loss: 134.95724487304688
Loss: 127.26692962646484
Loss: 126.78720092773438
Loss: 135.86228942871094
Loss: 130.66204833984375
Loss: 138.62240600585938
Loss: 131.9420928955078
Loss: 130.94625854492188
Loss: 130.78460693359375
Loss: 100.

In [74]:
# Summarise the three-hidden-layer repetitions: average test MSE and its spread
mean_squared_errors_norm_3_hidden_array = np.asarray(mean_squared_errors_norm_3_hidden)
meanMSE_norm_3_hidden = np.mean(mean_squared_errors_norm_3_hidden_array)
stdMSE_norm_3_hidden = np.std(mean_squared_errors_norm_3_hidden_array)  # Population standard deviation (numpy's default ddof=0)
print('Mean MSE: {} \nStandard deviation MSE: {}'.format(meanMSE_norm_3_hidden, stdMSE_norm_3_hidden))

Mean MSE: 127.49116445580167 
Standard deviation MSE: 12.39738802070658


### How does the mean of the mean squared errors compare to that from Step B?
The mean of the MSEs from Step D is almost one-third of the value from Step B and lower than the value from Step C. Therefore adding two hidden layers further improved the accuracy of our NN.