# Concrete Strength Prediction using Neural Networks
This script builds a baseline Keras regression model that predicts concrete strength from the mix ingredients and the age of the sample.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

# Step 1: Load the dataset

In [None]:
# Load the concrete dataset from disk into a pandas DataFrame
print("Loading concrete strength dataset...")
df = pd.read_csv('data/concrete_data.csv')

# Summarize the dataset. The last column is treated as the target, so it is
# left out of both the feature count and the feature listing.
n_samples, n_columns = df.shape
feature_names = df.columns[:-1]
print("\nDataset Information:")
print(f"Number of samples: {n_samples}")
print(f"Number of features: {n_columns - 1}")
print("\nFeature names:")
for name in feature_names:
    print(f"- {name}")
print("\nFirst few rows of the dataset:")
print(df.head())

# Step 2: Data Preprocessing

In [None]:
print("\nPreparing data for model training...")
# Split the frame by column position: every column except the last is a
# predictor, and the last column is the target (concrete strength).
# Per the dataset notes the predictors are Cement, Blast Furnace Slag, Fly Ash,
# Water, Superplasticizer, Coarse Aggregate, Fine Aggregate, and Age.
predictor_frame = df.iloc[:, :-1]
target_column = df.iloc[:, -1]

# Convert to NumPy arrays for scikit-learn / Keras
X = predictor_frame.values
y = target_column.values

## Step 2.1: Normalize the data

In [None]:
# Standardize every feature column to zero mean and unit variance:
#     z = (x - mean) / std_dev
# Putting the features on a common scale helps the network train.
print("\nNormalizing the data...")
scaler = StandardScaler()
# Learn the per-column mean and standard deviation, then apply them
scaler.fit(X)
X_normalized = scaler.transform(X)

# Report per-feature statistics before and after scaling as a sanity check
print("\nNormalization statistics:")
for stage, values in (("before", X), ("after", X_normalized)):
    print(f"Feature means {stage} normalization:", np.mean(values, axis=0))
    print(f"Feature standard deviations {stage} normalization:", np.std(values, axis=0))

# Steps 3-7: Repeat model training and evaluation 50 times

In [None]:
# Accumulator for the test-set MSE of each run (one entry per iteration)
mse_list = list()

print("\nRepeating model training and evaluation 50 times with different random splits...")

# Create a function to build and compile the model (to reuse in each iteration)
def create_model(input_dim):
    """Build and compile the baseline regression network.

    The network has one hidden ``Dense`` layer of 10 ReLU units followed by a
    single output unit with no activation (linear), which suits regression.
    It is compiled with the Adam optimizer and mean-squared-error loss.

    Args:
        input_dim: Number of input features per sample.

    Returns:
        A compiled, untrained ``Sequential`` model.
    """
    # Imported locally so this function does not depend on edits to the
    # file-level import block.
    from tensorflow.keras.layers import Input

    # Declare the input shape with an explicit Input layer instead of passing
    # `input_dim=` to the first Dense layer: newer Keras releases deprecate
    # that keyword and emit a warning each time a model is built.
    model = Sequential([
        Input(shape=(input_dim,)),
        # Hidden layer: ReLU, f(x) = max(0, x), introduces non-linearity
        Dense(10, activation='relu'),
        # Output layer: one linear unit producing the predicted strength
        Dense(1),
    ])

    # MSE is the average squared difference between predictions and targets
    model.compile(optimizer='adam', loss='mean_squared_error')

    return model

# Suppress verbose output during the loop
# NOTE(review): TensorFlow was already loaded by the `tensorflow.keras` imports
# at the top of the file. TF_CPP_MIN_LOG_LEVEL is normally expected to be set
# before TensorFlow is first imported, so the assignment below may come too
# late to silence the native (C++) log output -- confirm, and if so move it
# above the first TensorFlow import.
import tensorflow as tf
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # Requests that INFO and WARNING native logs be hidden
# Raise the threshold of TensorFlow's Python-side logger to ERROR
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

# Train and evaluate a fresh model on 50 different random 70/30 splits and
# record the test-set MSE of each run in `mse_list`.
n_runs = 50
for i in range(n_runs):
    print(f"\nIteration {i+1}/{n_runs}")

    # Release the Keras global state left behind by the previous iteration's
    # model; without this, building many models in a loop keeps accumulating
    # state and memory.
    tf.keras.backend.clear_session()

    # Split the RAW features. random_state=i makes each split reproducible
    # while still giving every iteration a different train/test partition.
    # test_size=0.3 holds out 30% of the rows for testing.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=i
    )

    # Fit the scaler on the training rows only and reuse those statistics for
    # the test rows. Standardizing with statistics computed over the whole
    # dataset would leak information about the test rows into preprocessing
    # and bias the reported test error.
    split_scaler = StandardScaler()
    X_train = split_scaler.fit_transform(X_train_raw)
    X_test = split_scaler.transform(X_test_raw)

    print(f"  Training: {X_train.shape[0]} samples, Testing: {X_test.shape[0]} samples")

    # Build a fresh, untrained model sized to the number of feature columns
    model = create_model(X_train.shape[1])

    # Train for 50 epochs (one epoch = one full pass over the training split),
    # updating the weights after every batch of 32 samples.
    history = model.fit(
        X_train, y_train,
        epochs=50,
        batch_size=32,
        verbose=0  # No output during training
    )

    # Predict on the held-out split and score it
    y_pred = model.predict(X_test, verbose=0)

    # MSE: average squared difference between predicted and actual values;
    # lower is better.
    mse = mean_squared_error(y_test, y_pred)
    mse_list.append(mse)
    print(f"  Mean Squared Error: {mse:.4f}")

# Summarize the spread of the per-run MSE values to gauge model stability
print("\nMSE Statistics across 50 runs:")
summary_rows = (
    ("Mean MSE", np.mean),
    ("Median MSE", np.median),
    ("Min MSE", np.min),
    ("Max MSE", np.max),
    ("Standard Deviation", np.std),
)
for label, statistic in summary_rows:
    print(f"{label}: {statistic(mse_list):.4f}")

# Dump every individual run, numbered from 1
print("\nList of all 50 MSE values:")
for run_number, run_mse in enumerate(mse_list, start=1):
    print(f"Run {run_number}: {run_mse:.4f}")