In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
import seaborn as sns

In [2]:
# Download the CSV from Google Drive into a DataFrame and preview the first 10 rows.
DATA_URL = "https://drive.google.com/uc?export=download&id=1qAEfcDYnHVVU1TjKPj09evrYIFka9ol6"
data = pd.read_csv(DATA_URL)
data.head(10)

Unnamed: 0,Fly Ash,GGBFS,NaOH_Molarity,NaOH amount,Sodium Silicate,Extra Water,Coarse Agg,Fine Agg,Coarse/Fine Agg,SuperPlasticizer,Curing Time(h),Curing Temp(C),Age of Testing (day),Compressive Strength (MPa),Total GHG emission,Total Cost(USD)
0,400.0,0,12,57.0,143.0,40.0,950.0,850.0,1.117647,28.0,24.0,70.0,,53.46,645.335614,222.127
1,400.0,0,12,57.0,143.0,48.0,950.0,850.0,1.117647,28.0,24.0,70.0,,45.01,645.335614,222.127
2,400.0,0,12,57.0,143.0,60.0,950.0,850.0,1.117647,28.0,24.0,70.0,,37.31,645.335614,222.127
3,400.0,0,12,57.0,143.0,80.0,950.0,850.0,1.117647,28.0,24.0,70.0,,22.58,645.335614,222.127
4,400.0,0,12,57.0,143.0,48.0,950.0,850.0,1.117647,28.0,48.0,70.0,,51.03,645.335614,222.127
5,400.0,0,12,57.0,143.0,48.0,950.0,850.0,1.117647,28.0,72.0,70.0,,51.41,645.335614,222.127
6,400.0,0,12,57.0,143.0,48.0,950.0,850.0,1.117647,28.0,96.0,70.0,,51.68,645.335614,222.127
7,400.0,0,12,57.0,143.0,48.0,950.0,850.0,1.117647,28.0,48.0,60.0,,44.81,645.335614,222.127
8,400.0,0,12,57.0,143.0,48.0,950.0,850.0,1.117647,28.0,48.0,80.0,,48.56,645.335614,222.127
9,400.0,0,12,57.0,143.0,48.0,950.0,850.0,1.117647,28.0,48.0,90.0,,47.99,645.335614,222.127


In [3]:

# Count the missing values in each column.
data.isna().sum()

Unnamed: 0,0
Fly Ash,0
GGBFS,0
NaOH_Molarity,0
NaOH amount,0
Sodium Silicate,0
Extra Water,186
Coarse Agg,0
Fine Agg,0
Coarse/Fine Agg,0
SuperPlasticizer,858


In [4]:
# Target is the compressive strength; the GHG-emission and cost columns are
# excluded from the features together with the target itself.
target_col = 'Compressive Strength (MPa)'
non_feature_cols = [target_col, 'Total GHG emission', 'Total Cost(USD)']

y = data[target_col]
X = data.drop(columns=non_feature_cols)





In [5]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
import numpy as np # Import numpy for np.abs and np.mean


# Baseline: XGBoost with default hyperparameters, scored on a single 80/20
# hold-out split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model
xgb_model = xgb.XGBRegressor()
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)

# MAE and RMSE carry the unit of the target column, 'Compressive Strength (MPa)';
# R² and MAPE are unitless.
print(f"MAE: {mean_absolute_error(y_test, y_pred):.2f} MPa")
print(f"R²:  {r2_score(y_test, y_pred):.3f}")
print(f"MAPE: {np.mean(np.abs((y_test - y_pred) / y_test)) * 100:.2f}%")
print(f"RMSE: {np.sqrt(np.mean((y_test - y_pred)**2)):.2f} MPa")

MAE: 2.08 K
R²:  0.917
MAPE: 5.09%
RMSE: 4.43 K


# Random Search

In [6]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import numpy as np # Import numpy for sqrt


def model_random(X, y, n_iter=100, cv=3, random_state=42, n_jobs=-1):
    """Tune an XGBoost regressor with a randomized hyperparameter search.

    The data is split into roughly 70% train, 15% validation and 15% test.
    The search is cross-validated on train + validation together; the test
    portion is never passed to the search and is returned for final evaluation.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix.
    y : pandas.Series
        Target values aligned with ``X``.
    n_iter : int, default 100
        Number of hyperparameter combinations sampled by the search.
        Lower it for a quick run.
    cv : int, default 3
        Number of cross-validation folds evaluated per candidate.
    random_state : int, default 42
        Seed shared by both splits, the base regressor and the search.
    n_jobs : int, default -1
        Parallel jobs used by the search (-1 uses every available core).

    Returns
    -------
    tuple
        ``(best_params, best_estimator, X_train, X_val, y_train, y_val,
        X_test, y_test)``. With ``RandomizedSearchCV``'s default
        ``refit=True`` the returned estimator has been refit on
        train + validation using the best parameters.
    """
    # Hold out 15% of the data as the final test set.
    X_temp, X_test, y_temp, y_test = train_test_split(
        X, y, test_size=0.15, random_state=random_state)

    # 0.1765 of the remaining 85% is ~15% of the full data set.
    X_train, X_val, y_train, y_val = train_test_split(
        X_temp, y_temp, test_size=0.1765, random_state=random_state)

    # Base XGBoost regressor with a fixed random seed.
    xgb_regressor = xgb.XGBRegressor(random_state=random_state, verbosity=1)

    # Hyperparameter search space (discrete candidates, sampled at random).
    param_grid = {
        'n_estimators': [100, 200, 300, 400, 500],
        'learning_rate': [0.01, 0.03, 0.09, 0.1, 0.2],
        'max_depth': [3, 4, 5, 6],
        'min_child_weight': [1, 3, 5, 7],
        'subsample': [0.6, 0.75, 0.8],
        'colsample_bytree': [0.6, 0.65, 0.7, 0.75],
        'gamma': [0, 0.1, 0.3, 0.5, 1.0],
        'reg_alpha': [0, 0.01, 0.1, 1],
        'reg_lambda': [1, 1.5, 2, 3]
    }

    # Randomized search; candidates are ranked by cross-validated RMSE.
    tuner = RandomizedSearchCV(
        estimator=xgb_regressor,
        param_distributions=param_grid,
        n_iter=n_iter,
        scoring='neg_root_mean_squared_error',
        cv=cv,
        verbose=2,
        random_state=random_state,
        n_jobs=n_jobs
    )

    # The search does its own k-fold validation, so the validation split is
    # merged back into the training data for tuning.
    X_combined = pd.concat([X_train, X_val])
    y_combined = pd.concat([y_train, y_val])

    # Perform the randomized search.
    tuner.fit(X_combined, y_combined)

    # Return best parameters, best model, and the data splits.
    return tuner.best_params_, tuner.best_estimator_, X_train, X_val, y_train, y_val, X_test, y_test

In [7]:
# Run the randomized search. This rebinds X_train / X_test / y_train / y_test,
# replacing the 80/20 split created in the baseline cell.
(
    best_params,
    model_strength,
    X_train,
    X_val,
    y_train,
    y_val,
    X_test,
    y_test,
) = model_random(X, y)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [8]:
# Score the tuned model on the test split: R², then MAE, then RMSE.
test_predictions = model_strength.predict(X_test)

r2 = model_strength.score(X_test, y_test)
print(r2)

mae = mean_absolute_error(y_test, test_predictions)
print(mae)

rmse = np.sqrt(mean_squared_error(y_test, test_predictions))
print(rmse)


0.9541620224431195
2.0145475078600024
3.419952198233551


In [9]:
best_params

{'subsample': 0.75,
 'reg_lambda': 2,
 'reg_alpha': 0.1,
 'n_estimators': 500,
 'min_child_weight': 5,
 'max_depth': 6,
 'learning_rate': 0.1,
 'gamma': 0.3,
 'colsample_bytree': 0.65}