In [1]:
# Importing the libraries 

import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline

# Ignore harmless warnings 

import warnings 
warnings.filterwarnings("ignore")

# Set to display all the columns in dataset

pd.set_option("display.max_columns", None)

# Import psql to run queries 

import pandasql as psql

In [2]:
# load the Concrete dataset 

concrete = pd.read_csv(r"D:\00 Henotic\SRKR\Datasets\Concrete_Data_V1.0.csv", header=0)

# copy the file to back-up file

concrete_bk = concrete.copy()

# display first 5 records

concrete.head()

Unnamed: 0,Cement,Slag,Flyash,Water,SuperPlasticizer,CoarseAggregate,FineAggregate,Age,CSinMPa
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [3]:
# Display concrete data information

concrete.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1030 entries, 0 to 1029
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Cement            1030 non-null   float64
 1   Slag              1030 non-null   float64
 2   Flyash            1030 non-null   float64
 3   Water             1030 non-null   float64
 4   SuperPlasticizer  1030 non-null   float64
 5   CoarseAggregate   1030 non-null   float64
 6   FineAggregate     1030 non-null   float64
 7   Age               1030 non-null   int64  
 8   CSinMPa           1030 non-null   float64
dtypes: float64(8), int64(1)
memory usage: 72.5 KB


In [4]:
# Prepare cols1 for scaling

cols1 = ['Cement', 'Slag', 'Flyash', 'Water', 'SuperPlasticizer', 'CoarseAggregate', 'FineAggregate', 'Age']

In [5]:
# Identify the independent and Target variables

IndepVar = []
for col in concrete.columns:
    if col != 'CSinMPa':
        IndepVar.append(col)

TargetVar = 'CSinMPa'

x = concrete[IndepVar]
y = concrete[TargetVar]

In [6]:
# Split the data into train and test

from sklearn.model_selection import train_test_split 

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=143)

# Display the shape of the train_data and test_data

x_train.shape, x_test.shape, y_train.shape, y_test.shape

((721, 8), (309, 8), (721,), (309,))

In [7]:
# Scaling the features by using MinMaxScaler

from sklearn.preprocessing import MinMaxScaler

mmscaler = MinMaxScaler(feature_range=(0, 1))

x_train[cols1] = mmscaler.fit_transform(x_train[cols1])
x_train = pd.DataFrame(x_train)

x_test[cols1] = mmscaler.fit_transform(x_test[cols1])
x_test = pd.DataFrame(x_test)

# Gradient Boosting Regressor Algorithm

In [8]:
# Build the model with Gradient Boosting Regressor

from sklearn.ensemble import GradientBoostingRegressor  

modelGBR = GradientBoostingRegressor(loss='ls', learning_rate=0.1, n_estimators=100, subsample=1.0,
                                     criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1,
                                     min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0,
                                     min_impurity_split=None, init=None, random_state=None, max_features=None,
                                     alpha=0.9, verbose=0, max_leaf_nodes=None, warm_start=False,
                                     validation_fraction=0.1, n_iter_no_change=None, tol=0.0001, ccp_alpha=0.0)
modelGBR.fit(x_train, y_train)

# Predict the model with test dataset

y_pred = modelGBR.predict(x_test)

# Evaluation metrics for Regression analysis

# Evaluation metrics for Regression analysis

from sklearn import metrics

print('Mean Absolute Error (MAE):', round(metrics.mean_absolute_error(y_test, y_pred),3))  
print('Mean Squared Error (MSE):', round(metrics.mean_squared_error(y_test, y_pred),3))  
print('Root Mean Squared Error (RMSE):', round(np.sqrt(metrics.mean_squared_error(y_test, y_pred)),3))
print('R2_score:', round(metrics.r2_score(y_test, y_pred),6))
print('Root Mean Squared Log Error (RMSLE):', round(np.log(np.sqrt(metrics.mean_squared_error(y_test, y_pred))),3))

# Define the function to calculate the MAPE - Mean Absolute Percentage Error

def MAPE (y_test, y_pred): 
    y_test, y_pred = np.array(y_test), np.array(y_pred)
    return np.mean(np.abs((y_test - y_pred) / y_test)) * 100

# Evaluation of MAPE 

result = MAPE(y_test, y_pred)
print('Mean Absolute Percentage Error (MAPE):', round(result, 3), '%')

# Calculate Adjusted R squared values 

r_squared = round(metrics.r2_score(y_test, y_pred),6)
adjusted_r_squared = round(1 - (1-r_squared)*(len(y)-1)/(len(y)-x.shape[1]-1),6)
print('Adj R Square: ', adjusted_r_squared)

Mean Absolute Error (MAE): 4.837
Mean Squared Error (MSE): 42.405
Root Mean Squared Error (RMSE): 6.512
R2_score: 0.838145
Root Mean Squared Log Error (RMSLE): 1.874
Mean Absolute Percentage Error (MAPE): 15.878 %
Adj R Square:  0.836877


In [9]:
Results = pd.DataFrame({'CSinMPa_A':y_test, 'CSinMPa_P':y_pred})

# Merge two Dataframes on index of both the dataframes

ResultsFinal = concrete_bk.merge(Results, left_index=True, right_index=True)
ResultsFinal.sample(10)

Unnamed: 0,Cement,Slag,Flyash,Water,SuperPlasticizer,CoarseAggregate,FineAggregate,Age,CSinMPa,CSinMPa_A,CSinMPa_P
795,525.0,0.0,0.0,189.0,0.0,1125.0,613.0,180,61.92,61.92,63.943645
477,446.0,24.0,79.0,162.0,11.6,967.0,712.0,3,23.35,23.35,28.556917
823,322.0,0.0,0.0,203.0,0.0,974.0,800.0,180,29.59,29.59,37.425121
500,491.0,26.0,123.0,201.0,3.9,822.0,699.0,28,57.92,57.92,49.374486
766,385.0,0.0,0.0,186.0,0.0,966.0,763.0,14,27.92,27.92,24.804782
894,143.0,169.0,143.0,191.0,8.0,967.0,643.0,28,29.72,29.72,27.92824
607,236.0,0.0,0.0,194.0,0.0,968.0,885.0,28,18.42,18.42,21.296028
961,336.5,0.0,0.0,181.9,3.4,985.8,816.8,28,44.87,44.87,35.230039
883,149.0,236.0,0.0,176.0,13.0,847.0,893.0,28,32.96,32.96,35.439714
902,331.0,170.0,0.0,195.0,8.0,811.0,802.0,28,56.61,56.61,50.240116


In [None]:
# Calculate the Max, Min and ave values of target variable

print('Min of CSinMPa:', concrete.CSinMPa.min())
print('Max of CSinMPa:', concrete.CSinMPa.max())
print('Ave of CSinMPa:', concrete.CSinMPa.mean())

In [10]:
# Load the result dataset

RGRResults = pd.read_csv(r"D:\00 Henotic\SRKR\Datasets\RGRResults.csv", header=0)
RGRResults.head()

Unnamed: 0,Model Name,Mean_Absolute_Error_MAE,Adj_R_Square,Root_Mean_Squared_Error_RMSE,Mean_Absolute_Percentage_Error_MAPE,Mean_Squared_Error_MSE,Root_Mean_Squared_Log_Error_RMSLE,R2_score


# Gradient Boosting Regressor & Compare with all regressors

In [11]:
# Build the Regression / Regressor models

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import xgboost as xgb
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import ExtraTreesRegressor

from sklearn.ensemble import GradientBoostingRegressor

# Create objects of Regression / Regressor models with default hyper-parameters

modelmlg = LinearRegression()
modeldcr = DecisionTreeRegressor()
modelrfr = RandomForestRegressor()
modelSVR = SVR()
modelXGR = xgb.XGBRegressor()
modelKNN = KNeighborsRegressor(n_neighbors=5)
modelETR = ExtraTreesRegressor()

modelGBR = GradientBoostingRegressor(loss='ls', learning_rate=0.1, n_estimators=100, subsample=1.0,
                                     criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1,
                                     min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0,
                                     min_impurity_split=None, init=None, random_state=None, max_features=None,
                                     alpha=0.9, verbose=0, max_leaf_nodes=None, warm_start=False,
                                     validation_fraction=0.1, n_iter_no_change=None, tol=0.0001, ccp_alpha=0.0)

# Evalution matrix for all the algorithms

MM = [modelmlg, modeldcr, modelrfr, modelSVR, modelXGR, modelKNN, modelETR, modelGBR]

for models in MM:
    
    # Fit the model with train data
    
    models.fit(x_train, y_train)
    
    # Predict the model with test data

    y_pred = models.predict(x_test)
    
    # Print the model name
    
    print('Model Name: ', models)
    
    # Evaluation metrics for Regression analysis

    from sklearn import metrics

    print('Mean Absolute Error (MAE):', round(metrics.mean_absolute_error(y_test, y_pred),3))  
    print('Mean Squared Error (MSE):', round(metrics.mean_squared_error(y_test, y_pred),3))  
    print('Root Mean Squared Error (RMSE):', round(np.sqrt(metrics.mean_squared_error(y_test, y_pred)),3))
    print('R2_score:', round(metrics.r2_score(y_test, y_pred),6))
    print('Root Mean Squared Log Error (RMSLE):', round(np.log(np.sqrt(metrics.mean_squared_error(y_test, y_pred))),3))
    
    # Define the function to calculate the MAPE - Mean Absolute Percentage Error

    def MAPE (y_test, y_pred):
        y_test, y_pred = np.array(y_test), np.array(y_pred)
        return np.mean(np.abs((y_test - y_pred) / y_test)) * 100
    
    # Evaluation of MAPE 

    result = MAPE(y_test, y_pred)
    print('Mean Absolute Percentage Error (MAPE):', round(result, 2), '%')
    
    # Calculate Adjusted R squared values 

    r_squared = round(metrics.r2_score(y_test, y_pred),6)
    adjusted_r_squared = round(1 - (1-r_squared)*(len(y)-1)/(len(y)-x.shape[1]-1),6)
    print('Adj R Square: ', adjusted_r_squared)
    print('------------------------------------------------------------------------------------------------------------')
    #-------------------------------------------------------------------------------------------
    new_row = {'Model Name' : models,
               'Mean_Absolute_Error_MAE' : metrics.mean_absolute_error(y_test, y_pred),
               'Adj_R_Square' : adjusted_r_squared,
               'Root_Mean_Squared_Error_RMSE' : np.sqrt(metrics.mean_squared_error(y_test, y_pred)),
               'Mean_Absolute_Percentage_Error_MAPE' : result,
               'Mean_Squared_Error_MSE' : metrics.mean_squared_error(y_test, y_pred),
               'Root_Mean_Squared_Log_Error_RMSLE': np.log(np.sqrt(metrics.mean_squared_error(y_test, y_pred))),
               'R2_score' : metrics.r2_score(y_test, y_pred)}
    RGRResults = RGRResults.append(new_row, ignore_index=True)
    #-------------------------------------------------------------------------------------------

Model Name:  LinearRegression()
Mean Absolute Error (MAE): 8.386
Mean Squared Error (MSE): 115.568
Root Mean Squared Error (RMSE): 10.75
R2_score: 0.558885
Root Mean Squared Log Error (RMSLE): 2.375
Mean Absolute Percentage Error (MAPE): 31.33 %
Adj R Square:  0.555429
------------------------------------------------------------------------------------------------------------
Model Name:  DecisionTreeRegressor()
Mean Absolute Error (MAE): 6.181
Mean Squared Error (MSE): 81.297
Root Mean Squared Error (RMSE): 9.016
R2_score: 0.689695
Root Mean Squared Log Error (RMSLE): 2.199
Mean Absolute Percentage Error (MAPE): 19.67 %
Adj R Square:  0.687264
------------------------------------------------------------------------------------------------------------
Model Name:  RandomForestRegressor()
Mean Absolute Error (MAE): 4.657
Mean Squared Error (MSE): 41.005
Root Mean Squared Error (RMSE): 6.404
R2_score: 0.843486
Root Mean Squared Log Error (RMSLE): 1.857
Mean Absolute Percentage Error (MAP

In [12]:
# Results with comparing the all the algorithms 

RGRResults.to_csv("D://000 DataScience//01-Internship//RGRResults_03.csv")

RGRResults.head(15)

Unnamed: 0,Model Name,Mean_Absolute_Error_MAE,Adj_R_Square,Root_Mean_Squared_Error_RMSE,Mean_Absolute_Percentage_Error_MAPE,Mean_Squared_Error_MSE,Root_Mean_Squared_Log_Error_RMSLE,R2_score
0,LinearRegression(),8.385699,0.555429,10.750258,31.326908,115.568039,2.37493,0.558885
1,DecisionTreeRegressor(),6.181149,0.687264,9.016479,19.670641,81.29689,2.199054,0.689695
2,"(DecisionTreeRegressor(max_features='auto', ra...",4.656915,0.84226,6.403519,14.408596,41.005059,1.856848,0.843486
3,SVR(),8.445729,0.537147,10.969064,32.725504,120.320371,2.395079,0.540745
4,"XGBRegressor(base_score=0.5, booster='gbtree',...",5.066323,0.822676,6.789394,16.231458,46.095874,1.915362,0.824055
5,KNeighborsRegressor(),7.277146,0.650774,9.527991,26.922818,90.78261,2.254234,0.653489
6,"(ExtraTreeRegressor(random_state=1983063834), ...",4.442354,0.855982,6.118653,13.860194,37.437913,1.811342,0.857102
7,([DecisionTreeRegressor(criterion='friedman_ms...,4.838199,0.836708,6.515233,15.875667,42.448255,1.874143,0.837978
