In [183]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, explained_variance_score, mean_absolute_percentage_error, mean_squared_log_error
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

# Read the concrete dataset from its CSV file, then leave the column labels
# as the cell's final expression so they are displayed as the cell output.
df_raw = pd.read_csv(filepath_or_buffer='data/Concrete_Data.csv')
df_raw.columns



Index(['Cement (component 1)(kg in a m^3 mixture)',
       'Blast Furnace Slag (component 2)(kg in a m^3 mixture)',
       'Fly Ash (component 3)(kg in a m^3 mixture)',
       'Water  (component 4)(kg in a m^3 mixture)',
       'Superplasticizer (component 5)(kg in a m^3 mixture)',
       'Coarse Aggregate  (component 6)(kg in a m^3 mixture)',
       'Fine Aggregate (component 7)(kg in a m^3 mixture)', 'Age (day)',
       'Concrete compressive strength(MPa, megapascals) '],
      dtype='object')

In [184]:
# Independent variables: every column except the last one.
# Target variable: the last column only.
X, y = df_raw.iloc[:, :-1], df_raw.iloc[:, -1]

# Hold out 20% of the rows as the test split. Shuffling is enabled (to avoid
# ordering bias) and the seed is fixed at 50.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=50
)

# Multiple Linear Regression

In [185]:
# Create a linear regression estimator with default settings and fit it on
# the training split; the fit call is the cell's displayed value.
mlr_model = LinearRegression()
mlr_model.fit(X=X_train, y=y_train)

LinearRegression()

In [186]:
# Evaluate the fitted linear model on the held-out test split and print
# each regression metric on its own line.
mlr_y_predicted = mlr_model.predict(X_test)

# Mean Absolute Error
mlr_metric_mae = mean_absolute_error(y_test, mlr_y_predicted)
print(f'Mean Absolute Error {mlr_metric_mae}')

# Mean Absolute Percentage Error
mlr_metric_mape = mean_absolute_percentage_error(y_test, mlr_y_predicted)
print(f'Mean Absolute Percentage Error {mlr_metric_mape}')

# Mean Squared Error
mlr_metric_mse = mean_squared_error(y_test, mlr_y_predicted)
print(f'Mean Squared Error {mlr_metric_mse}')

# Mean Squared Log Error
# A linear model is unbounded, so it can predict negative values, and
# mean_squared_log_error raises ValueError on negative inputs. Report NaN in
# that case instead of aborting the rest of the cell.
if np.any(mlr_y_predicted < 0) or np.any(y_test < 0):
    mlr_metric_msle = np.nan
else:
    mlr_metric_msle = mean_squared_log_error(y_test, mlr_y_predicted)
print(f'Mean Squared Log Error {mlr_metric_msle}')

# Root Mean Squared Error (square root of the MSE already computed above)
mlr_metric_rmse = np.sqrt(mlr_metric_mse)
print(f'Root Mean Squared Error {mlr_metric_rmse}')

# Explained Variance Score
mlr_metric_evs = explained_variance_score(y_test, mlr_y_predicted)
print(f'Explained Variance Score {mlr_metric_evs}')

# R2 score
mlr_metric_r2 = r2_score(y_test, mlr_y_predicted)
print(f'R Squared Score {mlr_metric_r2}')


Mean Absolute Error 7.9100408151230575
Mean Absolute Percentage Error 0.3338653542629698
Mean Squared Error 98.115811061572
Mean Squared Log Error 0.12016603228350434
Root Mean Squared Error 9.905342551450303
Explained Variance Score 0.6360416556439641
R Squared Score 0.6359646484072214


In [187]:
# Show the fitted intercept, then the coefficient array, each on its own line.
print(mlr_model.intercept_, mlr_model.coef_, sep='\n')

-8.588336120681141
[ 0.11435239  0.09812924  0.07706773 -0.16950172  0.31739452  0.01450578
  0.01343895  0.11158763]


# Decision Tree Regression (Regression Tree)

In [188]:
# Fit a regression tree with the squared-error criterion, at least 3 samples
# per leaf and a maximum depth of 10.
# random_state is pinned because scikit-learn permutes the features at every
# split, so splits with identical improvement could otherwise be resolved
# differently from one run to the next. Previously recorded scores may shift
# slightly once the seed is fixed.
regression_model = DecisionTreeRegressor(
    criterion="squared_error",
    min_samples_leaf=3,
    max_depth=10,
    random_state=50,
)
regression_model.fit(X_train, y_train)

DecisionTreeRegressor(max_depth=10, min_samples_leaf=3)

In [189]:
# Predict on the held-out test split with the fitted regression tree.
dtr_y_predicted = regression_model.predict(X_test)

# Compute every metric first; each one keeps its own name for later cells.
dtr_metric_mae = mean_absolute_error(y_test, dtr_y_predicted)
dtr_metric_mape = mean_absolute_percentage_error(y_test, dtr_y_predicted)
dtr_metric_mse = mean_squared_error(y_test, dtr_y_predicted)
dtr_metric_msle = mean_squared_log_error(y_test, dtr_y_predicted)
dtr_metric_rmse = np.sqrt(dtr_metric_mse)  # root of the MSE computed above
dtr_metric_evs = explained_variance_score(y_test, dtr_y_predicted)
dtr_metric_r2 = r2_score(y_test, dtr_y_predicted)

# Report the metrics one per line, in a fixed order.
for metric_label, metric_value in (
    ('Mean Absolute Error', dtr_metric_mae),
    ('Mean Absolute Percentage Error', dtr_metric_mape),
    ('Mean Squared Error', dtr_metric_mse),
    ('Mean Squared Log Error', dtr_metric_msle),
    ('Root Mean Squared Error', dtr_metric_rmse),
    ('Explained Variance Score', dtr_metric_evs),
    ('R Squared Score', dtr_metric_r2),
):
    print(f'{metric_label} {metric_value}')


Mean Absolute Error 5.139526534044557
Mean Absolute Percentage Error 0.18768171867188754
Mean Squared Error 46.72907642729865
Mean Squared Log Error 0.05418031535021842
Root Mean Squared Error 6.835866911175104
Explained Variance Score 0.8276124326127476
R Squared Score 0.8266228900035051


# K-Nearest Neighbors (KNN)

In [190]:
# Build a KNN regressor that uses 5 neighbours, then train it on the
# training split; the fit call is the cell's displayed value.
# NOTE(review): the features are passed in unscaled and KNN is distance-based;
# confirm whether standardising them first is intended.
knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X=X_train, y=y_train)



KNeighborsRegressor()

In [191]:
# Predict on the held-out test split with the fitted KNN regressor.
knn_y_predicted = knn.predict(X_test)

# Error metrics
knn_metric_mae = mean_absolute_error(y_test, knn_y_predicted)
knn_metric_mape = mean_absolute_percentage_error(y_test, knn_y_predicted)
knn_metric_mse = mean_squared_error(y_test, knn_y_predicted)
knn_metric_msle = mean_squared_log_error(y_test, knn_y_predicted)
knn_metric_rmse = np.sqrt(knn_metric_mse)  # root of the MSE computed above

# Goodness-of-fit metrics
knn_metric_evs = explained_variance_score(y_test, knn_y_predicted)
knn_metric_r2 = r2_score(y_test, knn_y_predicted)

# Emit the whole report in one call, one metric per line.
print(
    f'Mean Absolute Error {knn_metric_mae}',
    f'Mean Absolute Percentage Error {knn_metric_mape}',
    f'Mean Squared Error {knn_metric_mse}',
    f'Mean Squared Log Error {knn_metric_msle}',
    f'Root Mean Squared Error {knn_metric_rmse}',
    f'Explained Variance Score {knn_metric_evs}',
    f'R Squared Score {knn_metric_r2}',
    sep='\n',
)

Mean Absolute Error 6.636009708737864
Mean Absolute Percentage Error 0.28334725091669005
Mean Squared Error 79.14396293203883
Mean Squared Log Error 0.10386696128789864
Root Mean Squared Error 8.896289278797022
Explained Variance Score 0.7141231271268944
R Squared Score 0.7063551729258979


In [192]:
# Collect the metrics of the three models, one column per model. Each list
# follows the same metric order as the row labels used below.
results = {
    'MLR': [
        mlr_metric_mae, mlr_metric_mape, mlr_metric_mse, mlr_metric_msle,
        mlr_metric_rmse, mlr_metric_evs, mlr_metric_r2,
    ],
    'DTR': [
        dtr_metric_mae, dtr_metric_mape, dtr_metric_mse, dtr_metric_msle,
        dtr_metric_rmse, dtr_metric_evs, dtr_metric_r2,
    ],
    'KNN': [
        knn_metric_mae, knn_metric_mape, knn_metric_mse, knn_metric_msle,
        knn_metric_rmse, knn_metric_evs, knn_metric_r2,
    ],
}

# Build the comparison table with the metric names as row labels.
results_df = pd.DataFrame(
    results,
    index=[
        'Mean Absolute Error',
        'Mean Absolute Percentage Error',
        'Mean Squared Error',
        'Mean Squared Log Error',
        'Root Mean Squared Error',
        'Explained Variance Score',
        'R Squared Score',
    ],
)

print(results_df)

                                      MLR        DTR        KNN
Mean Absolute Error              7.910041   5.139527   6.636010
Mean Absolute Percentage Error   0.333865   0.187682   0.283347
Mean Squared Error              98.115811  46.729076  79.143963
Mean Squared Log Error           0.120166   0.054180   0.103867
Root Mean Squared Error          9.905343   6.835867   8.896289
Explained Variance Score         0.636042   0.827612   0.714123
R Squared Score                  0.635965   0.826623   0.706355
