### Homework 4
Michał Komorowski

In [1]:
import dalex as dx
import pandas as pd
import numpy as np
import xgboost as xgb

from math import sqrt

from sklearn.svm import SVR

from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from category_encoders import TargetEncoder
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error as mse, r2_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
apartments = dx.datasets.load_apartments()

In [3]:
apartments.head()

Unnamed: 0,m2_price,construction_year,surface,floor,no_rooms,district
1,5897,1953,25,3,1,Srodmiescie
2,1818,1992,143,9,5,Bielany
3,3643,1937,56,1,2,Praga
4,3517,1995,93,7,3,Ochota
5,3013,1992,144,6,5,Mokotow


In [4]:
apartments.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 1 to 1000
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   m2_price           1000 non-null   int64 
 1   construction_year  1000 non-null   int64 
 2   surface            1000 non-null   int64 
 3   floor              1000 non-null   int64 
 4   no_rooms           1000 non-null   int64 
 5   district           1000 non-null   object
dtypes: int64(5), object(1)
memory usage: 54.7+ KB


In [5]:
X = apartments.loc[:, apartments.columns != "m2_price"]
y = apartments["m2_price"]

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

#### With scaling

In [7]:
num_features = ['construction_year', 'surface', 'floor', 'no_rooms']
cat_features = ['district']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), num_features),
        ('cat', OneHotEncoder(), cat_features)
    ]
)

clf = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', SVR())
])

In [8]:
clf.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', MinMaxScaler(),
                                                  ['construction_year',
                                                   'surface', 'floor',
                                                   'no_rooms']),
                                                 ('cat', OneHotEncoder(),
                                                  ['district'])])),
                ('clf', SVR())])

In [9]:
exp = dx.Explainer(clf, X_test, y_test)
mp = exp.model_performance(model_type = 'regression')

mp.result

Preparation of a new explainer is initiated

  -> data              : 330 rows 5 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 330 values
  -> model_class       : sklearn.svm._classes.SVR (default)
  -> label             : Not specified, model's class short name will be used. (default)
  -> predict function  : <function yhat_default at 0x000002819C56D488> will be used (default)
  -> predict function  : Accepts only pandas.DataFrame, numpy.ndarray causes problems.
  -> predicted values  : min = 3.34e+03, mean = 3.38e+03, max = 3.43e+03
  -> model type        : regression will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -1.74e+03, mean = 1.4e+02, max = 3.17e+03
  -> model_info        : package sklearn

A new explainer has been created!


Unnamed: 0,mse,rmse,r2,mae,mad
SVR,868730.092788,932.056915,0.021402,725.87637,620.488866


In [10]:
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', SVR())
])

params = {
    'clf__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'clf__gamma': ['scale', 'auto'],
    'clf__degree': list(range(1, 5)),  # ignored if kernel != 'poly'
    'clf__C': [10**i for i in range(-2, 6)]
}

gs = RandomizedSearchCV(clf, params, n_iter=40, cv=5)

In [11]:
gs.fit(X_train, y_train)

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('preprocessor',
                                              ColumnTransformer(transformers=[('num',
                                                                               MinMaxScaler(),
                                                                               ['construction_year',
                                                                                'surface',
                                                                                'floor',
                                                                                'no_rooms']),
                                                                              ('cat',
                                                                               OneHotEncoder(),
                                                                               ['district'])])),
                                             ('clf', SVR())]),
               

In [12]:
gs.best_params_

{'clf__kernel': 'poly',
 'clf__gamma': 'scale',
 'clf__degree': 2,
 'clf__C': 10000}

In [13]:
exp = dx.Explainer(gs.best_estimator_, X_test, y_test)
mp_tuned = exp.model_performance(model_type = 'regression')

mp_tuned.result

Preparation of a new explainer is initiated

  -> data              : 330 rows 5 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 330 values
  -> model_class       : sklearn.svm._classes.SVR (default)
  -> label             : Not specified, model's class short name will be used. (default)
  -> predict function  : <function yhat_default at 0x000002819C56D488> will be used (default)
  -> predict function  : Accepts only pandas.DataFrame, numpy.ndarray causes problems.
  -> predicted values  : min = 1.71e+03, mean = 3.51e+03, max = 6.38e+03
  -> model type        : regression will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -3.6e+02, mean = 7.54, max = 4.9e+02
  -> model_info        : package sklearn

A new explainer has been created!


Unnamed: 0,mse,rmse,r2,mae,mad
SVR,25016.293541,158.165399,0.97182,123.319991,91.1975


#### Without scaling

In [14]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), cat_features)
    ]
)

clf_noscale = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', SVR())
])

In [15]:
clf_noscale.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat', OneHotEncoder(),
                                                  ['district'])])),
                ('clf', SVR())])

In [16]:
exp = dx.Explainer(clf_noscale, X_test, y_test)
mp_noscale_base = exp.model_performance(model_type = 'regression')

mp_noscale_base.result

Preparation of a new explainer is initiated

  -> data              : 330 rows 5 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 330 values
  -> model_class       : sklearn.svm._classes.SVR (default)
  -> label             : Not specified, model's class short name will be used. (default)
  -> predict function  : <function yhat_default at 0x000002819C56D488> will be used (default)
  -> predict function  : Accepts only pandas.DataFrame, numpy.ndarray causes problems.
  -> predicted values  : min = 3.37e+03, mean = 3.4e+03, max = 3.46e+03
  -> model type        : regression will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -1.77e+03, mean = 1.17e+02, max = 3.14e+03
  -> model_info        : package sklearn

A new explainer has been created!


Unnamed: 0,mse,rmse,r2,mae,mad
SVR,859540.423062,927.114029,0.031754,724.008214,636.913593


In [17]:
clf_noscale = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', SVR())
])

params = {
    'clf__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'clf__gamma': ['scale', 'auto'],
    'clf__degree': list(range(1, 5)),  # ignored if kernel != 'poly'
    'clf__C': [10**i for i in range(-2, 6)]
}

gs = RandomizedSearchCV(clf_noscale, params, n_iter=100, cv=5)

In [18]:
gs.fit(X_train, y_train)

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('preprocessor',
                                              ColumnTransformer(transformers=[('cat',
                                                                               OneHotEncoder(),
                                                                               ['district'])])),
                                             ('clf', SVR())]),
                   n_iter=100,
                   param_distributions={'clf__C': [0.01, 0.1, 1, 10, 100, 1000,
                                                   10000, 100000],
                                        'clf__degree': [1, 2, 3, 4],
                                        'clf__gamma': ['scale', 'auto'],
                                        'clf__kernel': ['linear', 'poly', 'rbf',
                                                        'sigmoid']})

In [19]:
gs.best_params_

{'clf__kernel': 'poly', 'clf__gamma': 'scale', 'clf__degree': 3, 'clf__C': 100}

In [20]:
exp = dx.Explainer(gs.best_estimator_, X_test, y_test)
mp_noscale_tuned = exp.model_performance(model_type = 'regression')

mp_noscale_tuned.result

Preparation of a new explainer is initiated

  -> data              : 330 rows 5 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 330 values
  -> model_class       : sklearn.svm._classes.SVR (default)
  -> label             : Not specified, model's class short name will be used. (default)
  -> predict function  : <function yhat_default at 0x000002819C56D488> will be used (default)
  -> predict function  : Accepts only pandas.DataFrame, numpy.ndarray causes problems.
  -> predicted values  : min = 3e+03, mean = 3.51e+03, max = 5.07e+03
  -> model type        : regression will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -1.5e+03, mean = 11.7, max = 1.58e+03
  -> model_info        : package sklearn

A new explainer has been created!


Unnamed: 0,mse,rmse,r2,mae,mad
SVR,400626.257917,632.950439,0.548707,514.807103,444.126758


### Conclusion

In [21]:
print("Model with scaling data")
display(mp_tuned.result)
print("\n")
print("Model without scaling data")
display(mp_noscale_tuned.result)


Model with scaling data


Unnamed: 0,mse,rmse,r2,mae,mad
SVR,25016.293541,158.165399,0.97182,123.319991,91.1975




Model without scaling data


Unnamed: 0,mse,rmse,r2,mae,mad
SVR,400626.257917,632.950439,0.548707,514.807103,444.126758


After tuning both models, there is a big difference in results. <br /> In conlusion, when using SVM algorithm the data should be scaled.

## Concrete Strength dataset

In [22]:
df = pd.read_csv("Concrete_Data_Yeh.csv")

df.head()

Unnamed: 0,cement,slag,flyash,water,superplasticizer,coarseaggregate,fineaggregate,age,csMPa
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [23]:
df.shape

(1030, 9)

In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1030 entries, 0 to 1029
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   cement            1030 non-null   float64
 1   slag              1030 non-null   float64
 2   flyash            1030 non-null   float64
 3   water             1030 non-null   float64
 4   superplasticizer  1030 non-null   float64
 5   coarseaggregate   1030 non-null   float64
 6   fineaggregate     1030 non-null   float64
 7   age               1030 non-null   int64  
 8   csMPa             1030 non-null   float64
dtypes: float64(8), int64(1)
memory usage: 72.5 KB


In [25]:
X = df.loc[:, df.columns != "csMPa"]
y = df["csMPa"]

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

#### With scaling

In [27]:
num_features = list(X.columns)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), num_features)
    ]
)

clf = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', SVR())
])

In [28]:
clf.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', MinMaxScaler(),
                                                  ['cement', 'slag', 'flyash',
                                                   'water', 'superplasticizer',
                                                   'coarseaggregate',
                                                   'fineaggregate', 'age'])])),
                ('clf', SVR())])

In [29]:
exp = dx.Explainer(clf, X_test, y_test)

mp_base = exp.model_performance(model_type = 'regression')
mp_base.result

Preparation of a new explainer is initiated

  -> data              : 340 rows 8 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 340 values
  -> model_class       : sklearn.svm._classes.SVR (default)
  -> label             : Not specified, model's class short name will be used. (default)
  -> predict function  : <function yhat_default at 0x000002819C56D488> will be used (default)
  -> predict function  : Accepts only pandas.DataFrame, numpy.ndarray causes problems.
  -> predicted values  : min = 16.2, mean = 34.9, max = 58.5
  -> model type        : regression will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -27.0, mean = 0.855, max = 31.7
  -> model_info        : package sklearn

A new explainer has been created!


Unnamed: 0,mse,rmse,r2,mae,mad
SVR,103.347055,10.165975,0.598675,8.18407,6.780102


In [30]:
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', SVR(max_iter=1000))
])

params = {
    'clf__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'clf__gamma': ['scale', 'auto'],
    'clf__degree': list(range(1, 5)),  # ignored if kernel != 'poly'
    'clf__C': [10**i for i in range(-2, 6)]
}

gs = RandomizedSearchCV(clf, params, n_iter=50, cv=5)

In [31]:
gs.fit(X_train, y_train)

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('preprocessor',
                                              ColumnTransformer(transformers=[('num',
                                                                               MinMaxScaler(),
                                                                               ['cement',
                                                                                'slag',
                                                                                'flyash',
                                                                                'water',
                                                                                'superplasticizer',
                                                                                'coarseaggregate',
                                                                                'fineaggregate',
                                                                               

In [32]:
gs.best_params_

{'clf__kernel': 'poly',
 'clf__gamma': 'auto',
 'clf__degree': 4,
 'clf__C': 100000}

In [33]:
exp = dx.Explainer(gs.best_estimator_, X_test, y_test)
mp_tuned = exp.model_performance(model_type = 'regression')
mp_tuned.result

Preparation of a new explainer is initiated

  -> data              : 340 rows 8 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 340 values
  -> model_class       : sklearn.svm._classes.SVR (default)
  -> label             : Not specified, model's class short name will be used. (default)
  -> predict function  : <function yhat_default at 0x000002819C56D488> will be used (default)
  -> predict function  : Accepts only pandas.DataFrame, numpy.ndarray causes problems.
  -> predicted values  : min = -0.0676, mean = 34.5, max = 88.7
  -> model type        : regression will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -21.8, mean = 1.26, max = 33.4
  -> model_info        : package sklearn

A new explainer has been created!


Unnamed: 0,mse,rmse,r2,mae,mad
SVR,53.012413,7.280962,0.794138,5.420213,4.184453


#### Without scaling

In [34]:
clf_noscale = SVR()

clf_noscale.fit(X_train, y_train)

SVR()

In [35]:
exp = dx.Explainer(clf_noscale, X_test, y_test)
mp_noscale_base = exp.model_performance(model_type = 'regression')
mp_noscale_base.result

Preparation of a new explainer is initiated

  -> data              : 340 rows 8 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 340 values
  -> model_class       : sklearn.svm._classes.SVR (default)
  -> label             : Not specified, model's class short name will be used. (default)
  -> predict function  : <function yhat_default at 0x000002819C56D488> will be used (default)
  -> predict function  : Accepts pandas.DataFrame and numpy.ndarray.
  -> predicted values  : min = 27.9, mean = 34.0, max = 42.7
  -> model type        : regression will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -26.2, mean = 1.8, max = 42.5
  -> model_info        : package sklearn

A new explainer has been created!


Unnamed: 0,mse,rmse,r2,mae,mad
SVR,199.650601,14.129777,0.224702,11.307331,9.898856


In [36]:
clf = SVR(max_iter=1000)  # Had to limit the number of iterations as it would not converge in a sensible time.

params = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': ['scale', 'auto'],
    'degree': list(range(1, 5)),  # ignored if kernel != 'poly'
    'C': [10**i for i in range(-2, 6)]
}

gs = RandomizedSearchCV(clf, params, n_iter=50, cv=5)

In [37]:
gs.fit(X_train, y_train)

RandomizedSearchCV(cv=5, estimator=SVR(max_iter=1000), n_iter=50,
                   param_distributions={'C': [0.01, 0.1, 1, 10, 100, 1000,
                                              10000, 100000],
                                        'degree': [1, 2, 3, 4],
                                        'gamma': ['scale', 'auto'],
                                        'kernel': ['linear', 'poly', 'rbf',
                                                   'sigmoid']})

In [38]:
gs.best_params_

{'kernel': 'rbf', 'gamma': 'scale', 'degree': 4, 'C': 100}

In [39]:
exp = dx.Explainer(gs.best_estimator_, X_test, y_test)

mp_noscale_tuned = exp.model_performance(model_type = 'regression')
mp_noscale_tuned.result

Preparation of a new explainer is initiated

  -> data              : 340 rows 8 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 340 values
  -> model_class       : sklearn.svm._classes.SVR (default)
  -> label             : Not specified, model's class short name will be used. (default)
  -> predict function  : <function yhat_default at 0x000002819C56D488> will be used (default)
  -> predict function  : Accepts pandas.DataFrame and numpy.ndarray.
  -> predicted values  : min = 7.48, mean = 35.4, max = 74.8
  -> model type        : regression will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -27.3, mean = 0.408, max = 25.3
  -> model_info        : package sklearn

A new explainer has been created!


Unnamed: 0,mse,rmse,r2,mae,mad
SVR,68.929421,8.302374,0.732328,6.29009,4.675846


#### Conclusion

In [40]:
print("Model with scaling data")
display(mp_tuned.result)
print("\n")
print("Model without scaling data")
display(mp_noscale_tuned.result)


Model with scaling data


Unnamed: 0,mse,rmse,r2,mae,mad
SVR,53.012413,7.280962,0.794138,5.420213,4.184453




Model without scaling data


Unnamed: 0,mse,rmse,r2,mae,mad
SVR,68.929421,8.302374,0.732328,6.29009,4.675846


Again, the model without scaled data did worse, although the difference is not as big as in the apartments dataset, but it is important to note, that without setting `max_iter` parameter in the unscaled data model, it takes really long time to converge. Therefore, for this dataset the conclusion is again that if using SVM algorithm, the data should be scaled beforehand.

### Extra task

In [41]:
df = pd.read_csv('allegro-api-transactions.csv', usecols=['main_category', 'categories', 'it_location', 'price'])

df

Unnamed: 0,categories,price,it_location,main_category
0,"['Komputery', 'Dyski i napędy', 'Nośniki', 'No...",59.99,Warszawa,Komputery
1,"['Odzież, Obuwie, Dodatki', 'Bielizna damska',...",4.90,Warszawa,"Odzież, Obuwie, Dodatki"
2,"['Dom i Ogród', 'Budownictwo i Akcesoria', 'Śc...",109.90,Leszno,Dom i Ogród
3,"['Książki i Komiksy', 'Poradniki i albumy', 'Z...",18.50,Wola Krzysztoporska,Książki i Komiksy
4,"['Odzież, Obuwie, Dodatki', 'Ślub i wesele', '...",19.90,BIAŁYSTOK,"Odzież, Obuwie, Dodatki"
...,...,...,...,...
420015,"['RTV i AGD', 'Sprzęt audio dla domu', 'Odtwar...",180.00,Kraśnik,RTV i AGD
420016,"['Uroda', 'Makijaż', 'Oczy', 'Tusze do rzęs']",14.99,Dzierżoniów,Uroda
420017,"['Odzież, Obuwie, Dodatki', 'Przebrania, kosti...",5.99,Supraśl,"Odzież, Obuwie, Dodatki"
420018,"['Dla Dzieci', 'Rowery i pojazdy', 'Rowery bie...",200.00,Poznań,Dla Dzieci


As in homework 2, the `it_location` feature could be a bit normalized by either changing values to all uppercase or lowercase. There are also some locations which are not just city names, but for this task I am not going to preprocess that.

In [42]:
df.it_location.value_counts()

Warszawa     23244
Kraków       15135
Łódź         10935
Poznań       10610
Wrocław       8646
             ...  
FB               1
Luban            1
Kępno            1
GROTKI           1
Bieńkówka        1
Name: it_location, Length: 10056, dtype: int64

In [43]:
df.it_location = df.it_location.str.lower()

In [44]:
df.it_location.value_counts()

warszawa                            27042
kraków                              16581
łódź                                12433
poznań                              11197
internet                            10992
                                    ...  
kotowa wola                             1
wielkopolska przeźmierowo poznań        1
jarosław (munina)                       1
nowy jasiniec                           1
chruszczobród                           1
Name: it_location, Length: 7903, dtype: int64

In [45]:
X = df.loc[:, df.columns != "price"]
y = df.price

In [46]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

The `smoothing` parameter in `TargetEncoder` is used to balance out the value counts.

In [47]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', TargetEncoder(smoothing=0.5), list(X.columns))
    ]
)

clf_lr = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', LinearRegression(n_jobs=-1))
])

clf_lr.fit(X_train, y_train)

y_pred = clf_lr.predict(X_test)

print(f"RMSE: {sqrt(mse(y_test, y_pred))}")
print(f"R2: {r2_score(y_test, y_pred)}")

RMSE: 359.54602952317157
R2: 0.09272522764147972


Lasso uses L1 regularization. The `alpha` parameter controls it, the default is `1.0` and when it's zero then it's same as Linear Regression.

In [48]:
clf_lasso = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', Lasso())
])

clf_lasso.fit(X_train, y_train)

y_pred = clf_lasso.predict(X_test)

print(f"RMSE: {sqrt(mse(y_test, y_pred))}")
print(f"R2: {r2_score(y_test, y_pred)}")

RMSE: 359.545250409118
R2: 0.09272915965408246


ElasticNet uses both L1 and L2 regularization. 

In [49]:
clf_en = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', ElasticNet())
])

clf_en.fit(X_train, y_train)

y_pred = clf_en.predict(X_test)

print(f"RMSE: {sqrt(mse(y_test, y_pred))}")
print(f"R2: {r2_score(y_test, y_pred)}")

RMSE: 359.54516188394405
R2: 0.09272960642009609


In [51]:
clf_xgb = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', xgb.XGBRegressor(max_depth=3))
])

clf_xgb.fit(X_train, y_train)

y_pred = clf_xgb.predict(X_test)

print(f"RMSE: {sqrt(mse(y_test, y_pred))}")
print(f"R2: {r2_score(y_test, y_pred)}")

RMSE: 356.641915767781
R2: 0.10732245832547005


The XgBoost regressor did the best in terms of RMSE and R2 metrics, but the data is poorly preprocessed and also tuning hyperparameters so other models could be good too.