# Import libs

### Install LazyPredict, a library that brute-forces the search for the best model

In [None]:
!pip install lazypredict

Collecting lazypredict
  Downloading lazypredict-0.2.12-py2.py3-none-any.whl (12 kB)
Installing collected packages: lazypredict
Successfully installed lazypredict-0.2.12


### Import other libraries that will be used in this file

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.svm import NuSVR
from sklearn.base import clone
from sklearn.model_selection import KFold
from sklearn.preprocessing import MaxAbsScaler
from sklearn.metrics import max_error, mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, median_absolute_error, r2_score
from gtm_steps import MultiLabelGTM

### Helper classes and metric functions that simplify working with the various models in the rest of this notebook

In [None]:
class ModelStrategy(object):
    """Fit one independent copy of a template regressor per output column.

    The template passed to the constructor is never fitted itself; every call
    to ``fit`` / ``fit_transform`` clones it once per column of ``Y_train``.
    """

    def __init__(self, regressor) -> None:
        """Store the template regressor; no models are fitted yet."""
        self._strategy = regressor
        self._regressors = []

    def get_strategy(self):
        """Return the template regressor that gets cloned for each output."""
        return self._strategy

    def get_regressors(self):
        """Return the list of fitted regressors, one per output column."""
        return self._regressors

    def set_strategy(self, regressor) -> None:
        """Replace the template regressor; already fitted regressors are kept."""
        self._strategy = regressor

    def fit(self, X_train, Y_train) -> None:
        """Fit a fresh clone of the template on every column of ``Y_train``.

        Regressors from an earlier fit are discarded, so calling this twice
        leaves exactly one regressor per output column instead of
        accumulating stale ones.
        """
        fitted = []
        for y_train in Y_train.T:
            regressor = clone(self._strategy, safe=True)
            regressor.fit(X_train, y_train)
            fitted.append(regressor)
        # Replace in place so references obtained via get_regressors() stay valid.
        self._regressors[:] = fitted

    def fit_transform(self, X_train, Y_train):
        """Fit as in ``fit`` and return the predictions for ``X_train``.

        Returns an array with one column per output.
        """
        self.fit(X_train, Y_train)
        return self.transform(X_train)

    def transform(self, X):
        """Predict every output for ``X`` with the fitted regressors.

        Returns an array with one column per fitted regressor.
        Raises ValueError when no regressor has been fitted yet.
        """
        if not self._regressors:
            raise ValueError('ModelStrategy has no fitted regressors; call fit() first')
        # Collect each regressor's prediction and stack them as columns.
        y_pred = [regressor.predict(X) for regressor in self._regressors]
        return np.stack(y_pred, axis=1)

class AdapterSGTM(object):
    """Adapter exposing MultiLabelGTM through fit / fit_transform / transform.

    The wrapped model is always fitted with the input matrix passed as both
    of its ``fit`` arguments.
    """

    def __init__(self, number_of_steps = 1000, verbose=0):
        """Build the wrapped MultiLabelGTM; ``center_of_mass`` is always enabled."""
        gtm_options = dict(
            number_of_steps=number_of_steps,
            center_of_mass=True,
            verbose=verbose,
        )
        self._gtm = MultiLabelGTM(**gtm_options)

    def fit(self, X_train) -> None:
        """Fit the wrapped model using ``X_train`` as both fit arguments."""
        gtm = self._gtm
        gtm.fit(X_train, X_train)

    def fit_transform(self, X_train):
        """Fit on ``X_train`` and return the wrapped model's prediction for it."""
        gtm = self._gtm
        gtm.fit(X_train, X_train)
        projected = gtm.predict(X_train)
        return projected

    def transform(self, X):
        """Return the wrapped model's prediction for ``X``."""
        projected = self._gtm.predict(X)
        return projected


def relative_root_mean_squared_error(y, y_pred):
    """Return sqrt((sum((y - y_pred)^2) / len(y)) / sum(y_pred^2)).

    Note that the normalising term is the sum of squared *predictions*, and
    that ``len(y)`` counts the entries along the first axis only.
    """
    residual_energy = np.sum(np.square(y - y_pred))
    prediction_energy = np.sum(np.square(y_pred))
    mean_squared_residual = residual_energy / len(y)
    return np.sqrt(mean_squared_residual / prediction_energy)

def calculate_errors(y, y_pred):
    """Compute a fixed set of regression error metrics for ``y`` vs ``y_pred``.

    Returns a dict with the keys 'MaxError', 'MedError', 'MAE', 'MAPE',
    'MSE', 'RMSE', 'RRMSE' and 'R2'.
    """
    # RMSE is derived from the raw per-output MSEs (square root first, then
    # averaged). This matches what mean_squared_error(..., squared=False)
    # returned, without relying on the `squared` keyword that newer
    # scikit-learn releases removed.
    per_output_mse = mean_squared_error(y, y_pred, multioutput='raw_values')
    rmse = np.average(np.sqrt(per_output_mse))

    return {
        'MaxError': max_error(y, y_pred),
        'MedError': median_absolute_error(y, y_pred),
        'MAE': mean_absolute_error(y, y_pred),
        'MAPE': mean_absolute_percentage_error(y, y_pred),
        'MSE': mean_squared_error(y, y_pred),
        'RMSE': rmse,
        'RRMSE': relative_root_mean_squared_error(y, y_pred),
        'R2': r2_score(y, y_pred),
    }

In [None]:
# Selects which target columns are predicted: 'Task1' and 'Task2' pick the subsets listed in the
# data-loading cells; any other value (such as 'FullTask') selects all eight targets.
task_name = 'FullTask'

# Load data

### Load all the data and divide it into X (inputs) and Y (outputs to predict)

In [None]:
# Read the tab-separated dataset.
data = pd.read_csv('/content/Tunneling_Induced_building_damage_dataset_V2.txt', sep='\t')

# Drop the 'Number' and 'Identifier' columns together with the 'Unnamed: ...' columns.
# NOTE(review): 'Unnamed: 27' used to be listed twice here; it is now listed once.
# Confirm that no other column (e.g. a neighbouring 'Unnamed' one) was intended.
data = data.drop(columns=[
    'Number',
    'Identifier',
    'Unnamed: 17',
    'Unnamed: 19',
    'Unnamed: 21',
    'Unnamed: 22',
    'Unnamed: 27',
    'Unnamed: 28',
    'Unnamed: 29',
    'Unnamed: 30',
    'Unnamed: 31'
])

# Remove outliers: keep only the rows in which every non-object column has |z-score| < 3.
numeric_zscores = np.abs(stats.zscore(data.select_dtypes(exclude='object')))
data = data[(numeric_zscores < 3).all(axis=1)]
data_columns = data.columns

# Inputs are the first 15 remaining columns, selected by position.
X = data.iloc[:, :15].to_numpy()

# Targets depend on task_name; any value other than 'Task1' / 'Task2' selects the full set.
if task_name == 'Task1':
    Y = data[['L. Average', 'G. Average']]
elif task_name == 'Task2':
    Y = data[['C. Width', 'Tot. Cracks ', 'Slope', 'Tilt', 'Ang. Dist.', 'Max Strain']]
else:
    Y = data[['C. Width', 'Tot. Cracks ', 'L. Average', 'Slope', 'Tilt', 'Ang. Dist.', 'Max Strain', 'G. Average']]

# Keep the column labels for reporting before converting the targets to a plain array.
target_columns = Y.columns
Y = Y.to_numpy()

# Find First LazyRegressor

### Brute force the first model for our data using LazyRegressor.

* We use cross-validation
* We save the obtained results

In [None]:
from lazypredict.Supervised import LazyRegressor

# One leaderboard per (fold, target) pair is appended here, fold by fold.
results = []

kf = KFold(n_splits=5, shuffle=True, random_state=42)
for train_index, test_index in kf.split(X, Y):
    # Scalers are fitted on the training fold only and then applied to the held-out fold.
    scaler_x = MaxAbsScaler()
    scaler_y = MaxAbsScaler()

    X_train = scaler_x.fit_transform(X[train_index])
    X_test = scaler_x.transform(X[test_index])

    Y_train = scaler_y.fit_transform(Y[train_index])
    Y_test = scaler_y.transform(Y[test_index])

    # Transposing iterates over target columns: one LazyRegressor sweep per target.
    for i, (y_train, y_test) in enumerate(zip(Y_train.T, Y_test.T)):
        print(f'Training: {target_columns[i]}')
        reg = LazyRegressor(
            regressors='all',
            predictions=True,
            custom_metric=None,
            ignore_warnings=True,
            verbose=-10,
        )
        models, predictions = reg.fit(X_train, X_test, y_train, y_test)
        results.append(models)

### Using the saved results, calculate the final accuracy for each output (models ranked from best to worst by the Adjusted R-Squared metric)

In [None]:
# Sort every per-fold leaderboard by model name so that rows line up across folds.
sorted_results = [res.sort_values(by=['Model']) for res in results]

n_targets = len(target_columns)
if not sorted_results or len(sorted_results) % n_targets:
    raise ValueError('`results` must hold one leaderboard per (fold, target) pair; run the training cell first')

# `results` was filled fold by fold with one entry per target, so the number
# of folds follows from its length instead of being hard-coded.
n_folds = len(sorted_results) // n_targets

final_results = {target : 0 for target in target_columns}

for i, res in enumerate(sorted_results):
    target = target_columns[i % n_targets]
    # The sum below is positional, so every fold must list the same models in
    # the same order as the first fold for this target.
    if not res.index.equals(sorted_results[i % n_targets].index):
        raise ValueError(f'Model lists differ between folds for target {target!r}; cannot average positionally')
    final_results[target] = final_results[target] + res.to_numpy()

# Turn the per-target sums into per-fold averages.
for target in target_columns:
    final_results[target] = final_results[target] / n_folds

# Print the averaged leaderboard of every target, best Adjusted R-Squared first.
for i, target in enumerate(target_columns):
    print(f'Calculate: {target}')
    print(pd.DataFrame(final_results[target],
                 columns = sorted_results[i].columns,
                 index =   sorted_results[i].index
    ).sort_values(by=['Adjusted R-Squared'], ascending=False))

Calculate: C. Width
                               Adjusted R-Squared  R-Squared  RMSE  Time Taken
Model                                                                         
MLPRegressor                                 0.65       0.68  0.56        0.82
NuSVR                                        0.62       0.65  0.59        0.11
SVR                                          0.61       0.65  0.60        0.11
HistGradientBoostingRegressor                0.59       0.62  0.61        1.12
LGBMRegressor                                0.58       0.62  0.62        0.20
ExtraTreesRegressor                          0.56       0.60  0.63        0.40
XGBRegressor                                 0.53       0.57  0.65        0.76
RandomForestRegressor                        0.53       0.57  0.66        0.82
GradientBoostingRegressor                    0.50       0.55  0.67        0.63
BaggingRegressor                             0.45       0.50  0.71        0.11
BayesianRidge                   

# Find Second LazyRegressor

### Load all the data once again and divide it into X (inputs) and Y (outputs to predict)

* Because our data was corrupted during the previous experiment

In [None]:
######### TODO: Refactor
# Load all data with pandas
data = pd.read_csv('/content/Tunneling_Induced_building_damage_dataset_V2.txt', sep='\t')

# Drop the 'Number' and 'Identifier' columns together with the 'Unnamed: ...' columns.
# NOTE(review): 'Unnamed: 27' used to be listed twice here; it is now listed once.
# Confirm that no other column (e.g. a neighbouring 'Unnamed' one) was intended.
data = data.drop(columns=[
    'Number',
    'Identifier',
    'Unnamed: 17',
    'Unnamed: 19',
    'Unnamed: 21',
    'Unnamed: 22',
    'Unnamed: 27',
    'Unnamed: 28',
    'Unnamed: 29',
    'Unnamed: 30',
    'Unnamed: 31'
])

# Remove outliers: keep only the rows in which every non-object column has |z-score| < 3.
numeric_zscores = np.abs(stats.zscore(data.select_dtypes(exclude='object')))
data = data[(numeric_zscores < 3).all(axis=1)]
data_columns = data.columns

# Inputs are the first 15 remaining columns, selected by position.
X = data.iloc[:, :15].to_numpy()

# Targets depend on task_name; any value other than 'Task1' / 'Task2' selects the full set.
if task_name == 'Task1':
    Y = data[['L. Average', 'G. Average']]
elif task_name == 'Task2':
    Y = data[['C. Width', 'Tot. Cracks ', 'Slope', 'Tilt', 'Ang. Dist.', 'Max Strain']]
else:
    Y = data[['C. Width', 'Tot. Cracks ', 'L. Average', 'Slope', 'Tilt', 'Ang. Dist.', 'Max Strain', 'G. Average']]

# Keep the column labels for reporting before converting the targets to a plain array.
target_columns = Y.columns
Y = Y.to_numpy()
######### END: Refactor

### Brute force a second model for our data using LazyRegressor.

* We use cross-validation
* We use the best model from the first step (NuSVR in the code below) to predict each output
* We pass data through SGTM 
* We save the obtained results

In [None]:
# One leaderboard per (fold, target) pair is appended here, fold by fold.
results = []

model_1_errors_train = []
model_1_errors_test = []
model_2_errors_train = []
model_2_errors_test = []

kf = KFold(n_splits=5, shuffle=True, random_state=42)

for train_index, test_index in kf.split(X, Y):
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]

    # Fit the scalers once, on the raw training fold, and apply them to the
    # held-out fold. They are deliberately NOT fitted a second time on the
    # scaled data: doing so leaves the values unchanged but overwrites the
    # learned scale factors, so the scalers could no longer map back to the
    # original units.
    scaler_x = MaxAbsScaler()
    scaler_y = MaxAbsScaler()

    X_train = scaler_x.fit_transform(X_train)
    Y_train = scaler_y.fit_transform(Y_train)

    X_test = scaler_x.transform(X_test)
    Y_test = scaler_y.transform(Y_test)

    model1 = ModelStrategy(NuSVR())
    model2 = AdapterSGTM(verbose=1)

    # Model 1: one NuSVR per target; keep its predictions for both folds.
    Y_train_pred = model1.fit_transform(X_train, Y_train)
    Y_test_pred = model1.transform(X_test)

    # Model 2: pass the inputs concatenated with model 1's predictions through SGTM.
    X_train_new = model2.fit_transform(np.concatenate((X_train, Y_train_pred), axis=1))
    X_test_new = model2.transform(np.concatenate((X_test, Y_test_pred), axis=1))

    # One LazyRegressor sweep per target column on the SGTM outputs.
    for i, (y_train, y_test) in enumerate(zip(Y_train[:].T, Y_test[:].T)):
        print(f'Training: {target_columns[i]}')
        reg = LazyRegressor(verbose=-10,
                            ignore_warnings=True,
                            custom_metric=None,
                            predictions=True,
                            regressors='all')
        models,predictions = reg.fit(X_train_new, X_test_new, y_train, y_test)
        results.append(models)

### By using the saved results, we calculate the final accuracy for each output (rating of the models from the best to the worst Adjusted R-Squared metric)

In [None]:
# Sort every per-fold leaderboard by model name so that rows line up across folds.
sorted_results = [res.sort_values(by=['Model']) for res in results]

n_targets = len(target_columns)
if not sorted_results or len(sorted_results) % n_targets:
    raise ValueError('`results` must hold one leaderboard per (fold, target) pair; run the training cell first')

# `results` was filled fold by fold with one entry per target, so the number
# of folds follows from its length instead of being hard-coded.
n_folds = len(sorted_results) // n_targets

final_results = {target : 0 for target in target_columns}

for i, res in enumerate(sorted_results):
    target = target_columns[i % n_targets]
    # The sum below is positional, so every fold must list the same models in
    # the same order as the first fold for this target.
    if not res.index.equals(sorted_results[i % n_targets].index):
        raise ValueError(f'Model lists differ between folds for target {target!r}; cannot average positionally')
    final_results[target] = final_results[target] + res.to_numpy()

# Turn the per-target sums into per-fold averages.
for target in target_columns:
    final_results[target] = final_results[target] / n_folds

# Print the averaged leaderboard of every target, best Adjusted R-Squared first.
for i, target in enumerate(target_columns):
    print(f'Calculate: {target}')
    print(pd.DataFrame(final_results[target],
                 columns = sorted_results[i].columns,
                 index =   sorted_results[i].index
    ).sort_values(by=['Adjusted R-Squared'], ascending=False))

Calculate: C. Width
                               Adjusted R-Squared  R-Squared  RMSE  Time Taken
Model                                                                         
LGBMRegressor                                0.69       0.73  0.11        0.30
HistGradientBoostingRegressor                0.68       0.72  0.12        0.78
ExtraTreesRegressor                          0.68       0.72  0.12        0.38
GradientBoostingRegressor                    0.67       0.71  0.12        0.73
XGBRegressor                                 0.67       0.71  0.12        0.64
RandomForestRegressor                        0.66       0.71  0.12        1.02
AdaBoostRegressor                            0.65       0.70  0.12        0.26
NuSVR                                        0.65       0.69  0.12        0.30
BaggingRegressor                             0.65       0.69  0.12        0.12
KNeighborsRegressor                          0.64       0.68  0.12        0.04
SGDRegressor                    

In [None]:
# Show only the top-ranked model (highest averaged Adjusted R-Squared) for each target.
for i, target in enumerate(target_columns):
    print(f'Calculate: {target}')
    averaged = pd.DataFrame(
        final_results[target],
        index=sorted_results[i].index,
        columns=sorted_results[i].columns,
    )
    ranked = averaged.sort_values(by=['Adjusted R-Squared'], ascending=False)
    print(ranked.head(1))

Calculate: C. Width
               Adjusted R-Squared  R-Squared  RMSE  Time Taken
Model                                                         
LGBMRegressor                0.69       0.73  0.11        0.30
Calculate: Tot. Cracks 
                     Adjusted R-Squared  R-Squared  RMSE  Time Taken
Model                                                               
ExtraTreesRegressor                0.73       0.77  0.10        0.42
Calculate: L. Average
                               Adjusted R-Squared  R-Squared  RMSE  Time Taken
Model                                                                         
HistGradientBoostingRegressor                0.78       0.81  0.10        0.50
Calculate: Slope
       Adjusted R-Squared  R-Squared  RMSE  Time Taken
Model                                                 
NuSVR                0.88       0.90  0.08        0.31
Calculate: Tilt
       Adjusted R-Squared  R-Squared  RMSE  Time Taken
Model                                           

# Find Second LazyRegressor with PolyScaler

Do everything the same way as before, but here a polynomial feature expansion is applied to the second model's inputs

In [None]:
######### TODO: Refactor
# Load all data with pandas
data = pd.read_csv('/content/Tunneling_Induced_building_damage_dataset_V2.txt', sep='\t')

# Drop the 'Number' and 'Identifier' columns together with the 'Unnamed: ...' columns.
# NOTE(review): 'Unnamed: 27' used to be listed twice here; it is now listed once.
# Confirm that no other column (e.g. a neighbouring 'Unnamed' one) was intended.
data = data.drop(columns=[
    'Number',
    'Identifier',
    'Unnamed: 17',
    'Unnamed: 19',
    'Unnamed: 21',
    'Unnamed: 22',
    'Unnamed: 27',
    'Unnamed: 28',
    'Unnamed: 29',
    'Unnamed: 30',
    'Unnamed: 31'
])

# Remove outliers: keep only the rows in which every non-object column has |z-score| < 3.
numeric_zscores = np.abs(stats.zscore(data.select_dtypes(exclude='object')))
data = data[(numeric_zscores < 3).all(axis=1)]
data_columns = data.columns

# Inputs are the first 15 remaining columns, selected by position.
X = data.iloc[:, :15].to_numpy()

# Targets depend on task_name; any value other than 'Task1' / 'Task2' selects the full set.
if task_name == 'Task1':
    Y = data[['L. Average', 'G. Average']]
elif task_name == 'Task2':
    Y = data[['C. Width', 'Tot. Cracks ', 'Slope', 'Tilt', 'Ang. Dist.', 'Max Strain']]
else:
    Y = data[['C. Width', 'Tot. Cracks ', 'L. Average', 'Slope', 'Tilt', 'Ang. Dist.', 'Max Strain', 'G. Average']]

# Keep the column labels for reporting before converting the targets to a plain array.
target_columns = Y.columns
Y = Y.to_numpy()
######### END: Refactor

In [None]:
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# One leaderboard per (fold, target) pair is appended here, fold by fold.
results = []

model_1_errors_train = []
model_1_errors_test = []
model_2_errors_train = []
model_2_errors_test = []

kf = KFold(n_splits=5, shuffle=True, random_state=42)

for train_index, test_index in kf.split(X, Y):
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]

    # Fit the scalers once, on the raw training fold, and apply them to the
    # held-out fold. They are deliberately NOT fitted a second time on the
    # scaled data: doing so leaves the values unchanged but overwrites the
    # learned scale factors, so the scalers could no longer map back to the
    # original units.
    scaler_x = MaxAbsScaler()
    scaler_y = MaxAbsScaler()

    X_train = scaler_x.fit_transform(X_train)
    Y_train = scaler_y.fit_transform(Y_train)

    X_test = scaler_x.transform(X_test)
    Y_test = scaler_y.transform(Y_test)

    model1 = ModelStrategy(NuSVR())
    model2 = AdapterSGTM(verbose=1)

    # Model 1: one NuSVR per target; keep its predictions for both folds.
    Y_train_pred = model1.fit_transform(X_train, Y_train)
    Y_test_pred = model1.transform(X_test)

    # Model 2: pass the inputs concatenated with model 1's predictions through SGTM.
    X_train_new = model2.fit_transform(np.concatenate((X_train, Y_train_pred), axis=1))
    X_test_new = model2.transform(np.concatenate((X_test, Y_test_pred), axis=1))

    # Degree-2 polynomial expansion of the first three SGTM output columns,
    # fitted on the training fold and applied to the held-out fold.
    poly = PolynomialFeatures(2)
    X_train_poly = poly.fit_transform(X_train_new[:,:3])
    X_test_poly = poly.transform(X_test_new[:,:3])

    # Prepend the polynomial terms to the full SGTM output.
    X_train_new = np.concatenate((X_train_poly, X_train_new), axis=1)
    X_test_new =  np.concatenate((X_test_poly, X_test_new), axis=1)

    # One LazyRegressor sweep per target column on the expanded features.
    for i, (y_train, y_test) in enumerate(zip(Y_train[:].T, Y_test[:].T)):
        print(f'Training: {target_columns[i]}')
        reg = LazyRegressor(verbose=-10,
                            ignore_warnings=True,
                            custom_metric=None,
                            predictions=True,
                            regressors='all')
        models,predictions = reg.fit(X_train_new, X_test_new, y_train, y_test)
        results.append(models)

In [None]:
# Sort every per-fold leaderboard by model name so that rows line up across folds.
sorted_results = [res.sort_values(by=['Model']) for res in results]

n_targets = len(target_columns)
if not sorted_results or len(sorted_results) % n_targets:
    raise ValueError('`results` must hold one leaderboard per (fold, target) pair; run the training cell first')

# `results` was filled fold by fold with one entry per target, so the number
# of folds follows from its length instead of being hard-coded.
n_folds = len(sorted_results) // n_targets

final_results = {target : 0 for target in target_columns}

for i, res in enumerate(sorted_results):
    target = target_columns[i % n_targets]
    # The sum below is positional, so every fold must list the same models in
    # the same order as the first fold for this target.
    if not res.index.equals(sorted_results[i % n_targets].index):
        raise ValueError(f'Model lists differ between folds for target {target!r}; cannot average positionally')
    final_results[target] = final_results[target] + res.to_numpy()

# Turn the per-target sums into per-fold averages.
for target in target_columns:
    final_results[target] = final_results[target] / n_folds

# Print the averaged leaderboard of every target, best Adjusted R-Squared first.
for i, target in enumerate(target_columns):
    print(f'Calculate: {target}')
    print(pd.DataFrame(final_results[target],
                 columns = sorted_results[i].columns,
                 index =   sorted_results[i].index
    ).sort_values(by=['Adjusted R-Squared'], ascending=False))

Calculate: C. Width
                               Adjusted R-Squared  R-Squared  RMSE  Time Taken
Model                                                                         
LGBMRegressor                                0.66       0.72  0.12        0.43
HistGradientBoostingRegressor                0.66       0.72  0.12        0.98
ExtraTreesRegressor                          0.66       0.72  0.12        0.41
XGBRegressor                                 0.65       0.71  0.12        0.52
GradientBoostingRegressor                    0.65       0.71  0.12        1.05
RandomForestRegressor                        0.64       0.71  0.12        1.11
AdaBoostRegressor                            0.63       0.70  0.12        0.27
BaggingRegressor                             0.63       0.69  0.12        0.13
NuSVR                                        0.61       0.68  0.12        0.29
LarsCV                                       0.59       0.67  0.13        0.09
TweedieRegressor                

In [None]:
# Show only the top-ranked model (highest averaged Adjusted R-Squared) for each target.
for i, target in enumerate(target_columns):
    print(f'Calculate: {target}')
    averaged = pd.DataFrame(
        final_results[target],
        index=sorted_results[i].index,
        columns=sorted_results[i].columns,
    )
    ranked = averaged.sort_values(by=['Adjusted R-Squared'], ascending=False)
    print(ranked.head(1))

Calculate: C. Width
               Adjusted R-Squared  R-Squared  RMSE  Time Taken
Model                                                         
LGBMRegressor                0.66       0.72  0.12        0.43
Calculate: Tot. Cracks 
                               Adjusted R-Squared  R-Squared  RMSE  Time Taken
Model                                                                         
HistGradientBoostingRegressor                0.71       0.76  0.11        0.59
Calculate: L. Average
                               Adjusted R-Squared  R-Squared  RMSE  Time Taken
Model                                                                         
HistGradientBoostingRegressor                0.77       0.81  0.10        0.60
Calculate: Slope
       Adjusted R-Squared  R-Squared  RMSE  Time Taken
Model                                                 
NuSVR                0.87       0.89  0.08        0.33
Calculate: Tilt
                     Adjusted R-Squared  R-Squared  RMSE  Time Taken
Mode