Import the required packages:

In [58]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk
import warnings
warnings.filterwarnings('ignore')

from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from math import sqrt
from sklearn import metrics #import utility functions to evaluate some metrics

Data loading:

In [43]:
df = pd.read_csv("DARTS output.csv")

Normalise dataset:

In [44]:
#Find minimum and maximum of flow rates
#print(df.min()[0:20])
#print(df.max()[0:20])

#Min-max normalisation of flow rates
def flow_norm(x):
    return (x + 1000) / 2000

df.iloc[:,0:20] = df.iloc[:,0:20].apply(flow_norm)

print(df.min()[0:20])
print(df.max()[0:20])

P01_Rate      0.3250
I01_Rate      0.3250
P02_Rate      0.3425
I02_Rate      0.3425
P03_Rate      0.3250
I03_Rate      0.3250
P04_Rate      0.0000
E03_Rate      0.0325
P05_Rate      0.0000
E02_Rate      0.3250
P06_Rate      0.0000
E02_Rate.1    0.0000
P07_Rate      0.4125
E05_Rate      0.5000
P08_Rate      0.3250
E01_Rate      0.5000
P09_Rate      0.3800
E06_Rate      0.3250
P10_Rate      0.2750
I10_Rate      0.2750
dtype: float64
P01_Rate      0.6750
I01_Rate      0.6750
P02_Rate      0.6575
I02_Rate      0.6575
P03_Rate      0.6750
I03_Rate      0.6750
P04_Rate      0.7500
E03_Rate      1.0000
P05_Rate      0.6750
E02_Rate      1.0000
P06_Rate      0.7500
E02_Rate.1    0.7500
P07_Rate      0.5000
E05_Rate      0.5875
P08_Rate      0.5000
E01_Rate      0.6750
P09_Rate      0.6750
E06_Rate      0.6200
P10_Rate      0.7250
I10_Rate      0.7250
dtype: float64


In [45]:
#Min-max normalisation of x-coord
def x_norm(x):
    return x / 230

df.iloc[:,20:40] = df.iloc[:,20:40].apply(x_norm)

print(df.min()[20:40])
print(df.max()[20:40])

P01_X    0.713043
I01_X    0.826087
P02_X    0.704348
I02_X    0.591304
P03_X    0.582609
I03_X    0.469565
P04_X    0.073913
E03_X    0.200000
P05_X    0.491304
E02_X    0.630435
P06_X    0.313043
E04_X    0.226087
P07_X    0.421739
E05_X    0.313043
P08_X    0.734783
E01_X    0.847826
P09_X    0.626087
E06_X    0.739130
P10_X    0.282609
I10_X    0.291304
dtype: float64
P01_X    0.713043
I01_X    0.826087
P02_X    0.704348
I02_X    0.591304
P03_X    0.582609
I03_X    0.469565
P04_X    0.304348
E03_X    0.200000
P05_X    0.526087
E02_X    0.630435
P06_X    0.382609
E04_X    0.226087
P07_X    0.421739
E05_X    0.313043
P08_X    0.782609
E01_X    0.847826
P09_X    0.626087
E06_X    0.739130
P10_X    0.469565
I10_X    0.382609
dtype: float64


In [46]:
#Min-max normalisation of y-coord
def y_norm(y):
    return y / 206

df.iloc[:,40:60] = df.iloc[:,40:60].apply(y_norm)

print(df.min()[40:60])
print(df.max()[40:60])

P01_Y    0.533981
I01_Y    0.485437
P02_Y    0.422330
I02_Y    0.364078
P03_Y    0.558252
I03_Y    0.548544
P04_Y    0.087379
E03_Y    0.223301
P05_Y    0.082524
E02_Y    0.199029
P06_Y    0.402913
E04_Y    0.485437
P07_Y    0.776699
E05_Y    0.737864
P08_Y    0.169903
E01_Y    0.339806
P09_Y    0.825243
E06_Y    0.825243
P10_Y    0.048544
I10_Y    0.082524
dtype: float64
P01_Y    0.533981
I01_Y    0.485437
P02_Y    0.422330
I02_Y    0.364078
P03_Y    0.558252
I03_Y    0.548544
P04_Y    0.271845
E03_Y    0.223301
P05_Y    0.252427
E02_Y    0.199029
P06_Y    0.402913
E04_Y    0.485437
P07_Y    0.776699
E05_Y    0.737864
P08_Y    0.315534
E01_Y    0.339806
P09_Y    0.825243
E06_Y    0.825243
P10_Y    0.417476
I10_Y    0.339806
dtype: float64


Splitting data into training and test:

In [53]:
np.random.seed(308)
df.is_train = np.random.uniform(0, 1, len(df)) <= 0.7
df_train = df[df.is_train]
df_test = df[df.is_train == False]

print(df_train.shape)
print(df_test.shape)

(27, 61)
(4, 61)


Training linear regression models:

In [56]:
#With L2 (ridge) regularisation
df_ridge_reg = Ridge(max_iter=1000, tol=0.01)
df_ridge_reg = df_ridge_reg.fit(df_train.drop('NPV', axis=1), df_train['NPV'])

#With L1 (lasso) regularisation
df_lasso_reg = Lasso(max_iter=1000, tol=0.01)
df_lasso_reg = df_lasso_reg.fit(df_train.drop('NPV', axis=1), df_train['NPV'])

Using the regression models to predict values:

In [57]:
df_ridge_pred = df_ridge_reg.predict(df_test.drop('NPV', axis=1))
df_lasso_pred = df_lasso_reg.predict(df_test.drop('NPV', axis=1))

Evaluating models using a set of metrics:

In [60]:
#Calculating MAE, RMSE, and R-squared for both models
df_ridge_mae = metrics.mean_absolute_error(df_test['NPV'], df_ridge_pred)
df_ridge_rmse = sqrt(metrics.mean_squared_error(df_test['NPV'], df_ridge_pred))
df_ridge_r2 = metrics.r2_score(df_test['NPV'], df_ridge_pred)

df_lasso_mae = metrics.mean_absolute_error(df_test['NPV'], df_lasso_pred)
df_lasso_rmse = sqrt(metrics.mean_squared_error(df_test['NPV'], df_lasso_pred))
df_lasso_r2 = metrics.r2_score(df_test['NPV'], df_lasso_pred)

df_eval = pd.DataFrame({"Ridge":{"MAE":df_ridge_mae, "RMSE":df_ridge_rmse,
                                    "R2":df_ridge_r2},
                           "Lasso":{"MAE":df_lasso_mae, "RMSE":df_lasso_mae,
                                    "R2":df_lasso_r2}
                          })

print(df_eval)

           Ridge      Lasso
MAE    82.532510  82.394061
RMSE  106.805248  82.394061
R2      0.553330   0.633009
