In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

train_data_path = "Concrete_Data_Train.csv"
df = pd.read_csv(train_data_path)

df.columns

Index(['Cement (component 1)(kg in a m^3 mixture)',
       'Blast Furnace Slag (component 2)(kg in a m^3 mixture)',
       'Fly Ash (component 3)(kg in a m^3 mixture)',
       'Water  (component 4)(kg in a m^3 mixture)',
       'Superplasticizer (component 5)(kg in a m^3 mixture)',
       'Coarse Aggregate  (component 6)(kg in a m^3 mixture)',
       'Fine Aggregate (component 7)(kg in a m^3 mixture)', 'Age (day)',
       'Concrete compressive strength(MPa, megapascals) '],
      dtype='object')

In [3]:
# column names changed for ease of reading
df.columns = ['Cement','BlastFurnaceSlag','FlyAsh','Water','Superplasticizer','CoarseAggregate','FineAggregate','Age','ConcreteCompressiveStrength']
X_train = df.iloc[:, :-1]
y_train = df.iloc[:, -1:]

In [5]:
# neural network model
class ConcreteStrength(nn.Module):
        def __init__(self, input_dim):
            super(ConcreteStrength, self).__init__()
            self.model = nn.Sequential(
                nn.Linear(input_dim, 128),
                nn.ReLU(),
                nn.Linear(128, 64),
                nn.ReLU(),
                nn.Linear(64, 32),
                nn.ReLU(),
                nn.Linear(32, 1)
            )
    
        def forward(self, x):
            return self.model(x)
    

In [7]:
print(df.describe())
# we can see that BlastFurnaceSlag, FlyAsh, Superplasticizer have high number of zeroes and age has more smaller values. So let us use log transformation for these features

           Cement  BlastFurnaceSlag      FlyAsh       Water  Superplasticizer  \
count  824.000000        824.000000  824.000000  824.000000        824.000000   
mean   283.360801         74.371602   53.160801  181.797087          6.163956   
std    107.536404         86.977784   64.000646   21.321905          5.967258   
min    102.000000          0.000000    0.000000  121.800000          0.000000   
25%    192.000000          0.000000    0.000000  164.900000          0.000000   
50%    275.100000         22.000000    0.000000  185.350000          6.100000   
75%    359.900000        144.775000  118.300000  192.000000         10.125000   
max    540.000000        359.400000  195.000000  247.000000         32.200000   

       CoarseAggregate  FineAggregate         Age  ConcreteCompressiveStrength  
count       824.000000     824.000000  824.000000                   824.000000  
mean        973.548544     772.107403   44.661408                    35.857864  
std          78.694630     

In [9]:
X_train["BlastFurnaceSlag"] = np.log1p(X_train["BlastFurnaceSlag"])
X_train["FlyAsh"] = np.log1p(X_train["FlyAsh"])
X_train["Superplasticizer"] = np.log1p(X_train["Superplasticizer"])
X_train["Age"] = np.log1p(X_train["Age"])

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [11]:
import json
import numpy as np
scaler.feature_names_in_ = df.columns.tolist()

scaler_params = {
    "mean_": scaler.mean_.tolist(),
    "scale_": scaler.scale_.tolist(),
    "var_": scaler.var_.tolist(),
    "n_samples_seen_": int(scaler.n_samples_seen_),
    "feature_names_in_": scaler.feature_names_in_
}

print(json.dumps(scaler_params, indent=4))

{
    "mean_": [
        283.36080097087375,
        2.5552736017613937,
        2.091743575425426,
        181.79708737864075,
        1.4479760883048582,
        973.5485436893204,
        772.1074029126214,
        3.239097061274715
    ],
    "scale_": [
        107.47113139056532,
        2.4137541776391926,
        2.3698221380244933,
        21.308962544444068,
        1.1534634219125226,
        78.646864026356,
        80.93556052399222,
        1.1001286591127104
    ],
    "var_": [
        11550.044082368155,
        5.826209230070655,
        5.616056965870981,
        454.07188472052025,
        1.3304778656901461,
        6185.329221180129,
        6550.564957332807,
        1.21028306660113
    ],
    "n_samples_seen_": 824,
    "feature_names_in_": [
        "Cement",
        "BlastFurnaceSlag",
        "FlyAsh",
        "Water",
        "Superplasticizer",
        "CoarseAggregate",
        "FineAggregate",
        "Age",
        "ConcreteCompressiveStrength"
    ]
}


In [13]:
# return required metric parameters and model for given eval data ratio
def get_train_vs_eval(ratio_v):
    X_train_final, X_eval, y_train_final, y_eval = train_test_split(X_train_scaled, y_train, test_size=ratio_v, random_state=42)
    
    X_train_tensor = torch.tensor(X_train_final, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train_final.values, dtype=torch.float32).view(-1, 1)
    X_eval_tensor = torch.tensor(X_eval, dtype=torch.float32)
    y_eval_tensor = torch.tensor(y_eval.values, dtype=torch.float32).view(-1, 1)
    
    input_dim = X_train_tensor.shape[1]
    model = ConcreteStrength(input_dim)
    
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.01)
    
    epochs = 1000
    batch_size = 32
    
    for epoch in range(epochs):
        for i in range(0, X_train_tensor.shape[0], batch_size):
            batch_X = X_train_tensor[i:i+batch_size]
            batch_y = y_train_tensor[i:i+batch_size]
    
            optimizer.zero_grad()
            y_pred = model(batch_X)
            loss = criterion(y_pred, batch_y)
            loss.backward()
            optimizer.step()
    
    
    with torch.no_grad():
        y_train_pred = model(X_train_tensor)
    
    y_train_pred_np = y_train_pred.numpy().flatten()
    
    r2_train = r2_score(y_train_final, y_train_pred_np)
    mse_train = mean_squared_error(y_train_final, y_train_pred_np)
    mae_train = mean_absolute_error(y_train_final, y_train_pred_np)
    
    
    with torch.no_grad():
        y_eval_pred = model(X_eval_tensor)
    
    y_eval_pred_np = y_eval_pred.numpy().flatten()
    
    r2_eval = r2_score(y_eval, y_eval_pred_np)
    mse_eval = mean_squared_error(y_eval, y_eval_pred_np)
    mae_eval = mean_absolute_error(y_eval, y_eval_pred_np)
    torch.save(model, "entire_model.pth")
    
    return r2_train, mse_train, mae_train, r2_eval, mse_eval, mae_eval, model

In [15]:
# median of required metrics over some reps. Model in each rep is stored in models list
r2t_list = []
mset_list = []
mae_list =[]
r2e_list = []
msee_list = []
maee_list = []
reps = 20
models = []
for i in range(reps):
    r2_train, mse_train, mae_train, r2_eval, mse_eval, mae_eval, model = get_train_vs_eval(0.1)
    r2t_list.append(r2_train)
    mset_list.append(mse_train)
    mae_list.append(mae_train)
    r2e_list.append(r2_eval)
    msee_list.append(mse_eval)
    maee_list.append(mae_eval)
    models.append(model)

# Have not printed RMSE as MSE is already printed
print("For training data:")
print(f"R² Score: {np.median(r2t_list):.4f}")
print(f"MSE: {np.median(mset_list):.4f}")
print(f"MAE: {np.median(mae_list):.4f}")
print("For evaluation data:")
print(f"R² Score: {np.median(r2e_list):.4f}")
print(f"MSE: {np.median(msee_list):.4f}")
print(f"MAE: {np.median(maee_list):.4f}")

For training data:
R² Score: 0.9869
MSE: 3.6540
MAE: 1.2937
For evaluation data:
R² Score: 0.9510
MSE: 16.2898
MAE: 2.8434


__Following data is medians of metrics when evaluated 30 reps for respective ratios__

__For 0.1 :-__
For training data:
R² Score: 0.9874
MSE: 3.5141
MAE: 1.2278
For evaluation data:
R² Score: 0.9578
MSE: 14.0352
MAE: 2.6801

__For 0.15__
For training data:
R² Score: 0.9851
MSE: 4.1923
MAE: 1.3710
For evaluation data:
R² Score: 0.9519
MSE: 14.3603
MAE: 2.6941

__For 0.2__
For training data:
R² Score: 0.9753
MSE: 7.0188
MAE: 1.9683
For evaluation data:
R² Score: 0.8972
MSE: 28.8752
MAE: 3.7538

__For 0.25__
For training data:
R² Score: 0.9771
MSE: 6.3570
MAE: 1.7626
For evaluation data:
R² Score: 0.8861
MSE: 34.7276
MAE: 3.8904