# 1. Importing Libraries

In [3]:
import pandas as pd
import numpy as np
from IPython.display import clear_output

import keras
from keras.models import Sequential
from keras.layers import Dense

from sklearn.model_selection import train_test_split as tts

# 2. Loading Data

In [4]:
# Remote CSV holding the concrete dataset used throughout this notebook
concrete_data_url = 'https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/DL0101EN/labs/data/concrete_data.csv'

# Load the table into a DataFrame and preview its first rows
concrete_data = pd.read_csv(concrete_data_url)
concrete_data.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [5]:
concrete_data.shape   # (number of rows, number of columns) of the loaded table

(1030, 9)

# 3. Target Definition and Feature Selection

* Target: 'Strength'
* Features: All columns but 'Strength' and 'Age'

In [6]:
# Every column except the target ('Strength') and the deliberately excluded 'Age' is a feature.
is_feature = ~concrete_data.columns.isin(['Strength', 'Age'])
features = concrete_data.columns[is_feature].tolist()
n_cols = len(features)  # number of inputs the network's first layer must accept

predictors = concrete_data[features]
target = concrete_data['Strength']  # Strength column

In [8]:
# Preview the first rows of the target ('Strength') series
target.head()

0    79.99
1    61.89
2    40.27
3    41.05
4    44.30
Name: Strength, dtype: float64

In [7]:
# Preview the first rows of the selected feature columns
predictors.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5


<a id='item33'></a>


# 4. Defining Model Building and Scoring Functions

### 4.1 Build a Regression Neural Network Model Given the Model Settings

In [9]:
# build regression model
def regression_model(n_cols, hidden_layers=1, nodes=[10],
                     activations=['relu'], optimizer='adam', loss='mean_squared_error'):
    """Build and compile a fully connected regression network.

    The network has one Dense hidden layer per entry in ``nodes`` followed by
    a single-unit Dense output layer with no activation specified.

    Parameters
    ----------
    n_cols : int
        Number of input features; used as the input shape of the first layer.
    hidden_layers : int, optional
        Accepted for backward compatibility only. It is not read: the number
        of hidden layers is always ``len(nodes)``.
    nodes : sequence of int, optional
        Units in each hidden layer, in order. Must not be empty.
    activations : sequence of str, optional
        Activation for each hidden layer, matched to ``nodes`` by position.
        Must have at least ``len(nodes)`` entries; extra entries are ignored.
    optimizer : str, optional
        Optimizer passed to ``model.compile``.
    loss : str, optional
        Loss passed to ``model.compile``.

    Returns
    -------
    The compiled ``Sequential`` model.

    Raises
    ------
    ValueError
        If ``nodes`` is empty or ``activations`` has fewer entries than ``nodes``.
    """
    # Validate before touching Keras so a bad configuration fails immediately with a
    # clear message instead of an IndexError part-way through model construction.
    # (The list defaults above are only read, never mutated.)
    if len(nodes) == 0:
        raise ValueError("nodes must contain at least one hidden layer size")
    if len(activations) < len(nodes):
        raise ValueError(
            f"activations has {len(activations)} entries but nodes has {len(nodes)}; "
            "one activation is required per hidden layer"
        )

    # create model
    model = Sequential()

    # The first hidden layer also declares the input width
    model.add(Dense(nodes[0], activation=activations[0], input_shape=(n_cols,)))

    # Remaining hidden layers, paired by position (the loop is empty when there is only one layer)
    for units, activation in zip(nodes[1:], activations[1:]):
        model.add(Dense(units, activation=activation))

    # Single output unit for the regression target
    model.add(Dense(1))

    # compile model
    model.compile(optimizer=optimizer, loss=loss)
    return model

### 4.2 Build and Evaluate a Regression Neural Network (RNN) Model Multiple Times (Using different random train/test splits).

In [10]:
# Build and score regression neural network (RNN) models multiple times, using a different random train/test split each time

def build_score_RNN(n_times, predictors, target, test_size, # Features and target data, and the test split size
                    n_cols, hidden_layers, nodes, activations, optimizer, loss, # Forwarded to the "regression_model" function defined in the cell above
                    validation_split, epochs # Forwarded to the model's fit call
                   ):
    """Train and evaluate a fresh regression network ``n_times`` times.

    Each repetition draws a new random train/test split (``test_size`` is
    passed straight to the splitter), builds a new model with
    ``regression_model``, fits it on the training part, and evaluates it on
    the held-out part.

    Returns a list with one entry per repetition: whatever
    ``model.evaluate`` returns for the test split (the model is compiled
    with a loss only, so this is presumably the loss value).
    """
    scores = []

    for run in range(n_times):
        # Replace the previous progress line with the current repetition counter
        clear_output(wait=True)
        print(f'Loop State: {run+1}/{n_times}')

        # Fresh random split every repetition (no fixed seed)
        train_x, test_x, train_y, test_y = tts(predictors, target, test_size=test_size, random_state=None)

        # New, untrained network for this repetition
        net = regression_model(n_cols=n_cols, hidden_layers=hidden_layers,
                               nodes=nodes, activations=activations,
                               optimizer=optimizer, loss=loss)

        # Train silently, holding back part of the training data for validation
        net.fit(train_x, train_y, validation_split=validation_split, epochs=epochs, verbose=0)

        # Score on the held-out test split and keep the result
        scores.append(net.evaluate(test_x, test_y, verbose=0))

    return scores

### 4.3 Report Average and Standard Deviation of Score List

In [18]:
def report_scoring(result, name='', column_label='Mean Squared Error - 50 times'):
    """Summarize a list of scores as a one-row table of mean and standard deviation.

    Parameters
    ----------
    result : sequence of numbers
        Scores to summarize (for example the list returned by ``build_score_RNN``).
    name : str, optional
        Title of the report; it ends up as the name of the column axis.
    column_label : str, optional
        Label of the single row in the returned table. The default keeps the
        text that used to be hard-coded; pass another label when the scores
        are not 50 mean-squared-error values.

    Returns
    -------
    pandas.DataFrame
        One row labelled ``column_label`` with the columns ``'Average'`` and
        ``'Standard Deviation'`` (``np.mean`` and ``np.std`` of ``result``).
    """
    report = pd.DataFrame([np.mean(result), np.std(result)],
                          index=['Average', 'Standard Deviation'],
                          columns=[column_label])
    report.index.name = name

    # Transpose so the two statistics become columns and the label becomes the row
    return report.T

In [48]:
# Usage example with eight hand-written sample scores.
# Note: the row label still reads "... 50 times" here, regardless of how many scores are passed.
report_scoring([70,74,85,75,81,78,74,82], 'Reporting Function Use Example')

Reporting Function Use Example,Average,Standard Deviation
Mean Squared Error - 50 times,77.375,4.68875


# 5. EXERCISE SOLUTION

# PART A. Build a Baseline Model (50 times)

- One hidden layer of 10 nodes, and a ReLU activation function

- Use the adam optimizer and the mean squared error as the loss function.

### Model Building and Evaluation (50 times)

In [12]:
# sampling settings (baseline values, reused by the later parts)
test_size = 0.3  # fraction of the rows held out for testing in each random split

# hidden layers settings
hidden_layers = 1
nodes = [10]
activations = ['relu']

# compile and fit settings
optimizer = 'adam'
loss = 'mean_squared_error'
epochs = 50 # baseline number of training epochs
validation_split = 0.3

In [13]:
# Train and score the baseline network on 50 independent random splits of the raw (non-normalized) predictors
n_times = 50
baseline_result = build_score_RNN(
    n_times=n_times,
    predictors=predictors,
    target=target,
    test_size=test_size,
    n_cols=n_cols,
    hidden_layers=hidden_layers,
    nodes=nodes,
    activations=activations,
    optimizer=optimizer,
    loss=loss,
    validation_split=validation_split,
    epochs=epochs,
)

Loop State: 50/50


### Mean and the standard deviation of the mean squared errors:

In [19]:
# Mean and standard deviation of the 50 baseline scores
report_scoring(baseline_result, 'Baseline Model')

Baseline Model,Average,Standard Deviation
Mean Squared Error - 50 times,554.92219,467.196829


# PART B. Normalize the data

- Repeat Part A but use a normalized version of the data. Recall that one way to normalize the data is by subtracting the mean from the individual predictors and dividing by the standard deviation.

### Normalizing Feature Variables

In [20]:
# Standardize every predictor column: subtract its mean, then divide by its standard deviation
feature_means = predictors.mean()
feature_stds = predictors.std()
predictors_norm = (predictors - feature_means) / feature_stds

predictors_norm.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate
0,2.476712,-0.856472,-0.846733,-0.916319,-0.620147,0.862735,-1.217079
1,2.476712,-0.856472,-0.846733,-0.916319,-0.620147,1.055651,-1.217079
2,0.491187,0.79514,-0.846733,2.174405,-1.038638,-0.526262,-2.239829
3,0.491187,0.79514,-0.846733,2.174405,-1.038638,-0.526262,-2.239829
4,-0.790075,0.678079,-0.846733,0.488555,-1.038638,0.070492,0.647569


### Model Building and Evaluation (50 times) 

#### Using normalized data but same model settings as the baseline model in part A.

In [21]:
# sampling settings (same as the baseline model in PART A)
test_size = 0.3             # only the input data differs from the baseline: the next cell passes the normalized predictors

# hidden layers settings
hidden_layers = 1
nodes = [10]
activations = ['relu']

# compile and fit settings
optimizer = 'adam'
loss = 'mean_squared_error'
epochs = 50
validation_split = 0.3

In [22]:
# Same experiment as the baseline, except that the normalized predictors are used
n_times = 50
norm_result = build_score_RNN(
    n_times=n_times,
    predictors=predictors_norm,  # normalized data instead of "predictors"
    target=target,
    test_size=test_size,
    n_cols=n_cols,
    hidden_layers=hidden_layers,
    nodes=nodes,
    activations=activations,
    optimizer=optimizer,
    loss=loss,
    validation_split=validation_split,
    epochs=epochs,
)

Loop State: 50/50


### How does the mean of the mean squared errors compare to that from Step A?

- The mean of the mean squared errors (MSE) for the model trained on normalized data is actually higher than the mean for the model trained on non-normalized data (671 vs. 554), which indicates that the model performance dropped. The standard deviation of the MSE is lower (134 vs. 467), meaning that the results were more consistent from run to run, even though the error itself was consistently higher. There is a chance, though, that the model would perform better on normalized data if, for example, a greater number of epochs or hidden layers were used. Normalized data usually facilitates learning, but a longer training process or a deeper network may be required so that the model can fully exploit the available information.

In [23]:
# Mean and standard deviation of the 50 scores obtained with normalized predictors
report_scoring(norm_result, 'Normalized Data Model')

Normalized Data Model,Average,Standard Deviation
Mean Squared Error - 50 times,671.275787,133.876281


# PART C. Increase the number of epochs (100 epochs)

- Repeat Part B but use 100 epochs this time for training.

### Model Building and Evaluation (50 times) - Using 100 epochs for training

In [24]:
# sampling settings (same as PART B)
test_size = 0.3 

# hidden layers settings
hidden_layers = 1
nodes = [10]
activations = ['relu']

# compile and fit settings
optimizer = 'adam'
loss = 'mean_squared_error'
epochs = 100                          # INCREASED NUMBER OF EPOCHS (PART B used 50)
validation_split = 0.3

In [25]:
# Normalized predictors again; the only change from PART B is the epoch count set in the cell above
n_times = 50
epochs_100_result = build_score_RNN(
    n_times=n_times,
    predictors=predictors_norm,  # normalized data
    target=target,
    test_size=test_size,
    n_cols=n_cols,
    hidden_layers=hidden_layers,
    nodes=nodes,
    activations=activations,
    optimizer=optimizer,
    loss=loss,
    validation_split=validation_split,
    epochs=epochs,
)

Loop State: 50/50


### How does the mean of the mean squared errors compare to that from Step B?
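- With 100 epochs the mean of the mean squared errors drops from about 671 (Part B) to about 239, and the standard deviation drops from about 134 to about 35. Training for longer on the normalized data therefore gives both a lower and a more consistent error.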

In [26]:
# Mean and standard deviation of the 50 scores obtained with 100 training epochs
report_scoring(epochs_100_result, '100 Epochs Model')

100 Epochs Model,Average,Standard Deviation
Mean Squared Error - 50 times,238.907568,35.085005


# PART D. Increase the number of hidden layers

Repeat part B but use a neural network with the following instead:

- Three hidden layers, each of 10 nodes and ReLU activation function.

### Model Building and Evaluation (50 times) -  Using three hidden layers of 10 nodes

In [27]:
# sampling settings  (same settings as PART A.)
test_size = 0.3 

# hidden layers settings
hidden_layers = 3  # NEW NUMBER OF HIDDEN LAYERS
nodes = [10, 10, 10] # 10 NODES IN EACH HIDDEN LAYER
activations = ['relu', 'relu', 'relu'] # "relu" ACTIVATION FUNCTION FOR EACH HIDDEN LAYER

# compile and fit settings
optimizer = 'adam'
loss = 'mean_squared_error'
epochs = 50 # SAME NUMBER OF EPOCHS AS PART B. AS REQUESTED.
validation_split = 0.3

In [28]:
# Normalized predictors again; the only change from PART B is the three-hidden-layer architecture set in the cell above
n_times = 50
layers_3_result = build_score_RNN(
    n_times=n_times,
    predictors=predictors_norm,  # normalized data
    target=target,
    test_size=test_size,
    n_cols=n_cols,
    hidden_layers=hidden_layers,
    nodes=nodes,
    activations=activations,
    optimizer=optimizer,
    loss=loss,
    validation_split=validation_split,
    epochs=epochs,
)

Loop State: 50/50


### How does the mean of the mean squared errors compare to that from Step B?
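- With three hidden layers the mean of the mean squared errors drops from about 671 (Part B) to about 186, and the standard deviation drops from about 134 to about 17. With the same 50 epochs, the deeper network therefore gives both a lower and a more consistent error than the single-hidden-layer model.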

In [29]:
# Mean and standard deviation of the 50 scores obtained with three hidden layers
report_scoring(layers_3_result, 'Three Hidden Layers Model')

Three Hidden Layers Model,Average,Standard Deviation
Mean Squared Error - 50 times,185.822932,17.015256


# 6. Model Comparison

In [45]:
# Show the four summary tables one after another for comparison.
# display() accepts several objects, so there is no need to build (and then
# suppress with ';') a throwaway list of None values via a list comprehension.
display(
    report_scoring(baseline_result, 'Baseline Model'),
    report_scoring(norm_result, 'Normalized Data Model'),
    report_scoring(epochs_100_result, '100 Epochs Model'),
    report_scoring(layers_3_result, 'Three Hidden Layers Model'),
)

Baseline Model,Average,Standard Deviation
Mean Squared Error - 50 times,554.92219,467.196829


Normalized Data Model,Average,Standard Deviation
Mean Squared Error - 50 times,671.275787,133.876281


100 Epochs Model,Average,Standard Deviation
Mean Squared Error - 50 times,238.907568,35.085005


Three Hidden Layers Model,Average,Standard Deviation
Mean Squared Error - 50 times,185.822932,17.015256


- Author: Luis Resende Silva
- Last Update: 10/10/2021