In [1]:
#Import pandas and numpy for data processing
import pandas as pd
import numpy as np

#Import sklearn libraries and packages for test/train data split
from sklearn import datasets, svm, metrics
from sklearn.model_selection import train_test_split

In [2]:
# Load the concrete dataset from its hosted CSV file into a DataFrame.
CONCRETE_DATA_URL = 'https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/DL0101EN/labs/data/concrete_data.csv'
concrete_data = pd.read_csv(CONCRETE_DATA_URL)
# Preview the first few rows as a quick sanity check of the load.
concrete_data.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [3]:
# Inspect the dataset dimensions as (rows, columns).
concrete_data.shape

(1030, 9)

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
concrete_data.describe()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
count,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0
mean,281.167864,73.895825,54.18835,181.567282,6.20466,972.918932,773.580485,45.662136,35.817961
std,104.506364,86.279342,63.997004,21.354219,5.973841,77.753954,80.17598,63.169912,16.705742
min,102.0,0.0,0.0,121.8,0.0,801.0,594.0,1.0,2.33
25%,192.375,0.0,0.0,164.9,0.0,932.0,730.95,7.0,23.71
50%,272.9,22.0,0.0,185.0,6.4,968.0,779.5,28.0,34.445
75%,350.0,142.95,118.3,192.0,10.2,1029.4,824.0,56.0,46.135
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,82.6


In [5]:
# Count the null (missing) values in each column.
concrete_data.isnull().sum()

Cement                0
Blast Furnace Slag    0
Fly Ash               0
Water                 0
Superplasticizer      0
Coarse Aggregate      0
Fine Aggregate        0
Age                   0
Strength              0
dtype: int64

In [6]:
# Split the dataset into training (70%) and test (30%) partitions.
# A fixed random_state makes the shuffle deterministic, so the split (and every
# result derived from it) is reproducible after a kernel restart.
# NOTE: despite the X_ prefix, both frames still contain the 'Strength' target
# column at this point; it is separated from the predictors later.
X_train, X_test = train_test_split(concrete_data, test_size=0.3, random_state=42)

In [7]:
# Separate the feature columns from the 'Strength' target in both partitions.
concrete_data_columns_train = X_train.columns
concrete_data_columns_test = X_test.columns

# Training partition: a boolean column mask keeps every column except the target.
predictors_train = X_train.loc[:, concrete_data_columns_train != 'Strength']
target_train = X_train['Strength']

# Test partition: same selection, driven by the test frame's own columns.
predictors_test = X_test.loc[:, concrete_data_columns_test != 'Strength']
target_test = X_test['Strength']

In [8]:
# Preview the first rows of the training predictors ('Strength' should no longer appear).
predictors_train.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age
640,400.0,0.0,0.0,187.0,0.0,1025.0,745.0,7
611,277.0,0.0,0.0,191.0,0.0,968.0,856.0,14
779,295.0,0.0,0.0,185.0,0.0,1069.0,769.0,28
603,339.0,0.0,0.0,197.0,0.0,968.0,781.0,180
580,290.2,193.5,0.0,185.7,0.0,998.2,704.3,7


In [9]:
# Preview the first values of the training target (the 'Strength' column).
target_train.head()

640    30.14
611    21.26
779    25.18
603    36.45
580    21.86
Name: Strength, dtype: float64

In [10]:
# Standardize the predictors (zero mean, unit variance per column) to help training.
# The scaling statistics come from the TRAINING set only and are applied to both
# partitions: scaling the test set with its own mean/std would mean the model is
# validated on features scaled differently from the ones it was trained on.
train_mean = predictors_train.mean()
train_std = predictors_train.std()
# A constant column has std == 0; divide by 1 instead so it maps to 0 rather than NaN.
train_std = train_std.replace(0, 1)

predictors_train_norm = (predictors_train - train_mean) / train_std
predictors_test_norm = (predictors_test - train_mean) / train_std

# Later cells read the normalized frames through the original names.
predictors_train = predictors_train_norm
predictors_test = predictors_test_norm

In [11]:
# Record how many predictor columns each partition has;
# n_cols_train is used as the network's input width.
n_cols_train = len(predictors_train.columns)
n_cols_test = len(predictors_test.columns)

In [12]:
#Baseline model code starts here
#Import all the necessary libraries and packages
import keras

from keras.models import Sequential
from keras.layers import Dense

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [13]:
# define regression model
import keras.backend as K

def mean_pred(predictors_test, target_test):
    """Keras metric: mean of the second argument over the batch.

    This function is passed to ``compile(metrics=...)``, and Keras invokes
    metric functions as ``fn(y_true, y_pred)``. So, despite the parameter
    names, the notebook-level ``predictors_test`` / ``target_test`` frames are
    not read here: ``predictors_test`` receives the true targets (unused) and
    ``target_test`` receives the model's predictions.
    """
    return K.mean(target_test)

def std_pred(predictors_test, target_test):
    """Keras metric: standard deviation of the second argument over the batch.

    This function is passed to ``compile(metrics=...)``, and Keras invokes
    metric functions as ``fn(y_true, y_pred)``. So, despite the parameter
    names, the notebook-level ``predictors_test`` / ``target_test`` frames are
    not read here: ``predictors_test`` receives the true targets (unused) and
    ``target_test`` receives the model's predictions.
    """
    return K.std(target_test)

def regression_model(n_cols=None):
    """Build and compile the baseline regression network.

    Architecture: two hidden Dense layers of 10 ReLU units each, followed by a
    single-unit output layer with no activation. The model is compiled with the
    'adam' optimizer, 'mean_squared_error' loss, and the ``mean_pred`` /
    ``std_pred`` metrics.

    Args:
        n_cols: Number of input features. Defaults to the notebook-level
            ``n_cols_train`` so existing ``regression_model()`` calls keep
            working unchanged.

    Returns:
        The compiled ``Sequential`` model.
    """
    if n_cols is None:
        # Fall back to the global computed from the training predictors.
        n_cols = n_cols_train

    # Note: the predictors fed to this model are standardized earlier in the notebook.
    model_train = Sequential()
    model_train.add(Dense(10, activation='relu', input_shape=(n_cols,)))
    model_train.add(Dense(10, activation='relu'))
    model_train.add(Dense(1))

    # compile model
    model_train.compile(optimizer='adam', loss='mean_squared_error', metrics=[mean_pred, std_pred])
    return model_train

In [14]:
# Build the compiled baseline network returned by regression_model().
model_train = regression_model()

In [15]:
# Train for 50 epochs on the training partition, evaluating on the test partition
# after every epoch (validation_data). With verbose=2 the log shows one line per
# epoch with the loss (mean squared error) and the mean_pred / std_pred metrics.
# Alternative kept for reference: validate on a 30% slice of the training data instead.
#model_train.fit(predictors_train, target_train, validation_split=0.3, epochs=50, verbose=2)

model_train.fit(predictors_train, target_train, validation_data=(predictors_test,target_test), epochs=50, verbose=2)



Train on 721 samples, validate on 309 samples
Epoch 1/50
 - 1s - loss: 1540.3301 - mean_pred: 0.0974 - std_pred: 0.2388 - val_loss: 1569.0250 - val_mean_pred: 0.2066 - val_std_pred: 0.2690
Epoch 2/50
 - 0s - loss: 1522.6692 - mean_pred: 0.3223 - std_pred: 0.3352 - val_loss: 1551.1536 - val_mean_pred: 0.4349 - val_std_pred: 0.3633
Epoch 3/50
 - 0s - loss: 1503.5809 - mean_pred: 0.5664 - std_pred: 0.4608 - val_loss: 1530.8152 - val_mean_pred: 0.6954 - val_std_pred: 0.4836
Epoch 4/50
 - 0s - loss: 1481.5741 - mean_pred: 0.8484 - std_pred: 0.6097 - val_loss: 1506.2504 - val_mean_pred: 1.0118 - val_std_pred: 0.6367
Epoch 5/50
 - 0s - loss: 1453.9711 - mean_pred: 1.2046 - std_pred: 0.8136 - val_loss: 1476.5020 - val_mean_pred: 1.4005 - val_std_pred: 0.8198
Epoch 6/50
 - 0s - loss: 1420.1984 - mean_pred: 1.6522 - std_pred: 1.0473 - val_loss: 1437.1872 - val_mean_pred: 1.9185 - val_std_pred: 1.0703
Epoch 7/50
 - 0s - loss: 1375.7685 - mean_pred: 2.2451 - std_pred: 1.3985 - val_loss: 1387.0532 

<keras.callbacks.History at 0x7f371c2976d8>

In [None]:
## Observation: the mean obtained with the normalized data is much lower than the mean obtained with the un-normalized dataset.