#### Imports and configs

In [1]:

import pandas as pd
import numpy as np
from mlxtend.regressor import StackingCVRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso, Ridge
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.neural_network import MLPRegressor
from matplotlib import pyplot as plt
import seaborn as sns
# Seed handed to the estimators below via their random_state argument.
RANDOM_STATE = 42

# Plot styling: use the seaborn "Set2" palette as the default colour cycle
# and the "talk" plotting context.
pastel_palette = sns.color_palette("Set2")
sns.set_palette(pastel_palette)
sns.set_context("talk")

#### Read data and convert column data types

In [2]:
# Load the housing data set; "Sale_Price" is the regression target.
data = pd.read_csv('ames.csv', delimiter=',')

# Columns treated as categorical in addition to the ones pandas read as
# strings (presumably numeric codes / counts / years in the CSV - verify
# against the data dictionary).
extra_categorical_columns = [
    "Year_Built", "Year_Remod_Add",
    "BsmtFin_SF_1", "Bsmt_Full_Bath",
    "Bsmt_Half_Bath", "Full_Bath",
    "Bedroom_AbvGr", "Kitchen_AbvGr",
    "TotRms_AbvGrd", "Fireplaces",
    "Garage_Cars", "Mo_Sold", "Year_Sold",
]
categoricalColumns = (
    data.select_dtypes(include='object').columns.to_list()
    + extra_categorical_columns
)

# Everything that is neither categorical nor the target is numerical.
# Column order of `data` is preserved.
excluded_from_numerical = set(categoricalColumns) | {"Sale_Price"}
numericalColumns = [
    column for column in data.columns
    if column not in excluded_from_numerical
]

# Cast every categorical column to `object` in one pass so the
# categorical branch of the preprocessor sees a uniform dtype.
data = data.astype({column: 'object' for column in categoricalColumns})

#print(data[categoricalColumns].dtypes)


# Positional hold-out split: the first 2000 rows are used for training and
# the remaining rows for testing. The rows are NOT shuffled, so the split
# depends on the row order of the CSV.
train_rows = data.iloc[:2000]
test_rows = data.iloc[2000:]

# Targets as plain NumPy arrays.
y_train = train_rows["Sale_Price"].values
y_test = test_rows["Sale_Price"].values

# Feature frames (target removed). `train`/`test` and `X_train`/`X_test`
# are two names for the same objects.
X_train = train = train_rows.drop(columns="Sale_Price")
X_test = test = test_rows.drop(columns="Sale_Price")

#### Initialize ML-Models

In [3]:
# Estimators used by the stacked ensemble. Every estimator that is
# constructed with a random_state receives the shared seed.
rf = RandomForestRegressor(random_state=RANDOM_STATE)
lasso = Lasso(random_state=RANDOM_STATE)
ridge = Ridge(random_state=RANDOM_STATE)
svr = SVR()
mlp = MLPRegressor(
    random_state=RANDOM_STATE,
    # One hidden layer with as many units as there are raw input columns
    # (counted before one-hot encoding). The trailing comma matters:
    # `(n)` is just the int `n`, `(n,)` is the intended one-element tuple.
    hidden_layer_sizes=(X_train.shape[1],),
    activation='relu',
    solver='adam',
    # NOTE(review): per the scikit-learn docs `learning_rate` is only used
    # when solver='sgd', so 'adaptive' has no effect with 'adam'.
    learning_rate='adaptive',
    early_stopping=True,
)

#### Create transformers and preprocessor

In [4]:
# Numerical branch: fill missing values with the column mean, then
# standardize (remove the mean, scale to unit variance).
numericalColumnsTransformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' keeps transform from failing
# on categories that were not present during fit.
categoricalColumnsTransformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own branch.
preprocessor = ColumnTransformer([
    ('num', numericalColumnsTransformer, numericalColumns),
    ('cat', categoricalColumnsTransformer, categoricalColumns),
])

#### Create the stacking regressor and combine it with the preprocessor in a pipeline

In [5]:
# Stacked ensemble: the four base regressors produce out-of-fold
# predictions on which the Lasso meta-regressor is trained.
# StackingCVRegressor shuffles the training data before its internal
# cross-validation by default, so the seed is passed explicitly to make
# the folds (and therefore the fitted stack) reproducible between runs.
stack = StackingCVRegressor(
    regressors=(rf, ridge, svr, mlp),
    meta_regressor=lasso,
    random_state=RANDOM_STATE,
)

# Full model: preprocessing followed by the stack. The step names are part
# of the hyperparameter keys used for the grid search ('model__...').
stackPipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', stack)])

#### Fit the stack on the training data via grid search

In [6]:
# Hyperparameter search space for the stacked pipeline. Each key is
# '<pipeline step>__<regressor inside the stack>__<parameter>', where
# 'model' is the pipeline step that holds the stack.
param_grid = {}

# Random forest: number of trees in the forest.
param_grid['model__randomforestregressor__n_estimators'] = [50, 100, 200]

# Ridge: regularization strength. LARGER alpha means STRONGER regularization.
param_grid['model__ridge__alpha'] = [0.1, 1.0, 10.0]

# Support vector regressor: C is inversely proportional to the
# regularization strength, i.e. smaller C means stronger regularization.
param_grid['model__svr__C'] = [0.1, 1.0, 10.0]

# Multi-layer perceptron: initial learning rate and minibatch size.
param_grid['model__mlpregressor__learning_rate_init'] = [0.001, 0.01]
param_grid['model__mlpregressor__batch_size'] = [100, 200]


In [7]:
# Exhaustive search over param_grid with 5-fold cross-validation, scored by
# negative RMSE (higher is better) and run on all available cores.
# refit=True retrains the best configuration so gridSearch can predict.
gridSearch = GridSearchCV(
    stackPipeline,
    param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
    refit=True,
    n_jobs=-1,
    verbose=1,
)

In [8]:
# Run the search on the training split only.
gridSearch.fit(X=X_train, y=y_train)

# best_score_ is the mean cross-validated score of the best configuration.
# With scoring='neg_root_mean_squared_error' it is the NEGATED RMSE, so
# values closer to zero are better.
print(f'Best Parameters: {gridSearch.best_params_ }   Score: {gridSearch.best_score_}')

# Predict the held-out rows with the refitted best pipeline.
y_test_pred = gridSearch.predict(X=X_test)



#### Visualize results

In [None]:
fig, ax = plt.subplots(figsize=(15, 10))

# Draw both series against the original row index of the held-out rows.
for series, series_label in ((y_test, 'Actual'), (y_test_pred, 'Predicted')):
    ax.plot(test.index, series, label=series_label)

ax.set(
    xlabel='Index',
    ylabel='Sale Price',
    title='Actual vs. Predicted Sale Prices',
)
ax.legend()

plt.show()

# Hold-out metrics for the refitted best model.
r2 = r2_score(y_test, y_test_pred)
mse = mean_squared_error(y_test, y_test_pred)
# RMSE is in the same unit as Sale_Price and is the metric the grid search
# optimised, so report it next to the cross-validation score.
rmse = np.sqrt(mse)
# MAE divided by the mean actual price, i.e. the average absolute error
# expressed as a fraction of the average sale price (not the mean of the
# per-row relative errors).
mrae = mean_absolute_error(y_test, y_test_pred) / np.mean(y_test)
print(f"R²: {r2}")
print(f"RMSE: {rmse:.2f}")
print(f"Mean Relative Absolute Error: {mrae*100:.2f}%")


NameError: name 'plt' is not defined