# Concrete Strength Prediction using Neural Networks
 This script builds a baseline regression model using Keras to predict concrete strength based on various ingredients and age.

In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

# Step 1: Load the dataset

In [3]:
print("Loading concrete strength dataset...")
# Read the CSV file into a pandas DataFrame
df = pd.read_csv('data/concrete_data.csv')

# Display basic information about the dataset
print("\nDataset Information:")
print(f"Number of samples: {df.shape[0]}")
print(f"Number of features: {df.shape[1] - 1}")  # Excluding the target variable
print("\nFeature names:")
for col in df.columns[:-1]:  # All columns except the last one
    print(f"- {col}")
print("\nFirst few rows of the dataset:")
print(df.head())

Loading concrete strength dataset...

Dataset Information:
Number of samples: 1030
Number of features: 8

Feature names:
- Cement
- Blast Furnace Slag
- Fly Ash
- Water
- Superplasticizer
- Coarse Aggregate
- Fine Aggregate
- Age

First few rows of the dataset:
   Cement  Blast Furnace Slag  Fly Ash  Water  Superplasticizer  \
0   540.0                 0.0      0.0  162.0               2.5   
1   540.0                 0.0      0.0  162.0               2.5   
2   332.5               142.5      0.0  228.0               0.0   
3   332.5               142.5      0.0  228.0               0.0   
4   198.6               132.4      0.0  192.0               0.0   

   Coarse Aggregate  Fine Aggregate  Age  Strength  
0            1040.0           676.0   28     79.99  
1            1055.0           676.0   28     61.89  
2             932.0           594.0  270     40.27  
3             932.0           594.0  365     41.05  
4             978.4           825.5  360     44.30  


# Step 2: Data Preprocessing

In [4]:
print("\nPreparing data for model training...")
# Separate features (X) and target variable (y)
# X contains all predictors: Cement, Blast Furnace Slag, Fly Ash, Water,
# Superplasticizer, Coarse Aggregate, Fine Aggregate, and Age
X = df.iloc[:, :-1].values
# y contains the target variable: Concrete Strength
y = df.iloc[:, -1].values


Preparing data for model training...


## Step 2.1: Normalize the data

In [5]:
# Normalization involves subtracting the mean and dividing by standard deviation
# This puts all features on a similar scale, which helps neural networks learn better
print("\nNormalizing the data...")
# Initialize the StandardScaler
scaler = StandardScaler()
# Fit the scaler to the data and transform it
# This calculates mean and standard deviation, then applies (x - mean) / std_dev
X_normalized = scaler.fit_transform(X)

# Display information about the normalization
print("\nNormalization statistics:")
print("Feature means before normalization:", np.mean(X, axis=0))
print("Feature standard deviations before normalization:", np.std(X, axis=0))
print("Feature means after normalization:", np.mean(X_normalized, axis=0))
print("Feature standard deviations after normalization:", np.std(X_normalized, axis=0))


Normalizing the data...

Normalization statistics:
Feature means before normalization: [281.16786408  73.89582524  54.18834951 181.56728155   6.20466019
 972.91893204 773.58048544  45.66213592]
Feature standard deviations before normalization: [104.45562093  86.2374484   63.9659301   21.34384992   5.97094077
  77.71620016  80.13705031  63.13923913]
Feature means after normalization: [-4.55299229e-16 -1.24172517e-16 -5.51877853e-17 -1.65563356e-16
 -8.27816780e-17  6.76050370e-16 -4.75994648e-16  2.06954195e-17]
Feature standard deviations after normalization: [1. 1. 1. 1. 1. 1. 1. 1.]


# Step 3-7: Repeat model training and evaluation 50 times

In [6]:
print("\nRepeating model training and evaluation 100 times with different random splits...")

# Initialize a list to store the MSE values
mse_list = []

# Create a function to build and compile the model (to reuse in each iteration)
def create_model(input_dim):
    # Sequential model allows stacking layers in sequence
    model = Sequential()
    
    # Add a hidden layer with 10 nodes and ReLU activation function as specified
    # ReLU (Rectified Linear Unit) activation: f(x) = max(0, x)
    # It introduces non-linearity and helps prevent the vanishing gradient problem
    model.add(Dense(10, input_dim=input_dim, activation='relu'))
    
    # Add an output layer with a single node (for regression)
    # No activation function means linear activation, appropriate for regression
    model.add(Dense(1))
    
    # Compile the model with Adam optimizer and MSE loss
    # Adam: Adaptive Moment Estimation, combines advantages of AdaGrad and RMSProp
    # MSE: Measures the average squared difference between predictions and actual values
    model.compile(optimizer='adam', loss='mean_squared_error')
    
    return model

# Suppress verbose output during the loop
import tensorflow as tf
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # Suppress TensorFlow logging
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

# Run the process 50 times
for i in range(100):
    print(f"\nIteration {i+1}/100")
    
    # Use scikit-learn's train_test_split with a different random state each time
    # This ensures each iteration has a different train/test split
    # test_size=0.3 holds 30% of the data for testing as specified
    # random_state ensures reproducibility but varies each iteration
    X_train, X_test, y_train, y_test = train_test_split(
        X_normalized, y, test_size=0.3, random_state=i
    )
    
    print(f"  Training: {X_train.shape[0]} samples, Testing: {X_test.shape[0]} samples")
    
    # Create and compile the model
    model = create_model(X_normalized.shape[1])
    
    # Train the model for 50 epochs as specified
    # An epoch is one complete pass through the entire training dataset
    # batch_size=32 means 32 samples are processed before the model is updated
    history = model.fit(
        X_train, y_train,
        epochs=50,
        batch_size=32,
        verbose=0  # No output during training
    )
    
    # Evaluate the model by making predictions on the test set
    y_pred = model.predict(X_test, verbose=0)
    
    # Compute MSE using scikit-learn's mean_squared_error function
    # MSE measures the average squared difference between predicted and actual values
    # Lower MSE indicates better model performance
    mse = mean_squared_error(y_test, y_pred)
    mse_list.append(mse)
    print(f"  Mean Squared Error: {mse:.4f}")

# Display statistics about the MSE values to understand model stability
print("\nMSE Statistics across 50 runs:")
print(f"Mean MSE: {np.mean(mse_list):.4f}")
print(f"Median MSE: {np.median(mse_list):.4f}")
print(f"Min MSE: {np.min(mse_list):.4f}")
print(f"Max MSE: {np.max(mse_list):.4f}")
print(f"Standard Deviation: {np.std(mse_list):.4f}")

# Display all MSE values
print("\nList of all 50 MSE values:")
for i, mse in enumerate(mse_list):
    print(f"Run {i+1}: {mse:.4f}")


Repeating model training and evaluation 100 times with different random splits...

Iteration 1/100
  Training: 721 samples, Testing: 309 samples


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


  Mean Squared Error: 279.8381

Iteration 2/100
  Training: 721 samples, Testing: 309 samples


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


  Mean Squared Error: 349.8759

Iteration 3/100
  Training: 721 samples, Testing: 309 samples


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


  Mean Squared Error: 260.4612

Iteration 4/100
  Training: 721 samples, Testing: 309 samples


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


  Mean Squared Error: 665.7235

Iteration 5/100
  Training: 721 samples, Testing: 309 samples


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


  Mean Squared Error: 384.8996

Iteration 6/100
  Training: 721 samples, Testing: 309 samples


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


  Mean Squared Error: 296.1377

Iteration 7/100
  Training: 721 samples, Testing: 309 samples


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


  Mean Squared Error: 374.7994

Iteration 8/100
  Training: 721 samples, Testing: 309 samples


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


  Mean Squared Error: 238.1713

Iteration 9/100
  Training: 721 samples, Testing: 309 samples


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


  Mean Squared Error: 491.1147

Iteration 10/100
  Training: 721 samples, Testing: 309 samples


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


  Mean Squared Error: 249.7501

Iteration 11/100
  Training: 721 samples, Testing: 309 samples


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


  Mean Squared Error: 472.8473

Iteration 12/100
  Training: 721 samples, Testing: 309 samples


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


  Mean Squared Error: 394.0919

Iteration 13/100
  Training: 721 samples, Testing: 309 samples


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


  Mean Squared Error: 295.3821

Iteration 14/100
  Training: 721 samples, Testing: 309 samples


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


  Mean Squared Error: 313.2202

Iteration 15/100
  Training: 721 samples, Testing: 309 samples


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


  Mean Squared Error: 370.7484

Iteration 16/100
  Training: 721 samples, Testing: 309 samples


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


  Mean Squared Error: 339.8910

Iteration 17/100
  Training: 721 samples, Testing: 309 samples


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


  Mean Squared Error: 322.3801

Iteration 18/100
  Training: 721 samples, Testing: 309 samples


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


  Mean Squared Error: 658.2767

Iteration 19/100
  Training: 721 samples, Testing: 309 samples


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


  Mean Squared Error: 479.2394

Iteration 20/100
  Training: 721 samples, Testing: 309 samples


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


  Mean Squared Error: 299.8523

Iteration 21/100
  Training: 721 samples, Testing: 309 samples


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


  Mean Squared Error: 587.5944

Iteration 22/100
  Training: 721 samples, Testing: 309 samples


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


  Mean Squared Error: 250.6851

Iteration 23/100
  Training: 721 samples, Testing: 309 samples


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


  Mean Squared Error: 429.9895

Iteration 24/100
  Training: 721 samples, Testing: 309 samples


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


  Mean Squared Error: 330.6196

Iteration 25/100
  Training: 721 samples, Testing: 309 samples


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


  Mean Squared Error: 380.6220

Iteration 26/100
  Training: 721 samples, Testing: 309 samples


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


  Mean Squared Error: 394.0840

Iteration 27/100
  Training: 721 samples, Testing: 309 samples


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


  Mean Squared Error: 475.7782

Iteration 28/100
  Training: 721 samples, Testing: 309 samples


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


  Mean Squared Error: 334.8646

Iteration 29/100
  Training: 721 samples, Testing: 309 samples


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


  Mean Squared Error: 371.3023

Iteration 30/100
  Training: 721 samples, Testing: 309 samples


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


  Mean Squared Error: 275.8426

Iteration 31/100
  Training: 721 samples, Testing: 309 samples


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


  Mean Squared Error: 336.2594

Iteration 32/100
  Training: 721 samples, Testing: 309 samples


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


  Mean Squared Error: 276.5448

Iteration 33/100
  Training: 721 samples, Testing: 309 samples


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


  Mean Squared Error: 248.1446

Iteration 34/100
  Training: 721 samples, Testing: 309 samples


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


  Mean Squared Error: 306.7739

Iteration 35/100
  Training: 721 samples, Testing: 309 samples


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


  Mean Squared Error: 282.3210

Iteration 36/100
  Training: 721 samples, Testing: 309 samples


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


  Mean Squared Error: 318.4738

Iteration 37/100
  Training: 721 samples, Testing: 309 samples


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


KeyboardInterrupt: 