In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score,
    mean_absolute_percentage_error,
    make_scorer
)


In [None]:
# Read the listings dataset from the notebook's working directory.
data = pd.read_csv(filepath_or_buffer="katowice.csv")

In [3]:
# Target is the "price" column; every other column is used as a feature.
y = data.loc[:, "price"]
X = data.drop("price", axis=1)

In [4]:
# Shared accumulator: evaluate_model_log_target adds one metrics row per model here.
results = list()

In [5]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split stable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# Preview of the target on the log(1 + price) scale that the evaluation helper trains on.
y_log = y.pipe(np.log1p)
y_log

0       13.212206
1       13.120363
2       13.066853
3       13.752348
4       13.761413
          ...    
1677    12.367345
1678    13.082585
1679    12.779876
1680    13.071072
1681    13.079458
Name: price, Length: 1682, dtype: float64

In [7]:
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error

def evaluate_model_log_target(model, X_train, y_train, X_test, y_test,
                              label=None, results_store=None):
    """Fit ``model`` on ``log1p(y_train)`` and score it on the original target scale.

    The model is trained on the log-transformed target, its test predictions
    are mapped back with ``expm1``, and all metrics are computed against the
    untransformed ``y_test``.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
        Fitted in place by this call.
    X_train, y_train : training features and untransformed target.
    X_test, y_test : held-out features and untransformed target.
    label : str, optional
        Name stored under ``"Model"`` in the leaderboard row. Defaults to
        ``"<ClassName> (log)"``.
    results_store : list of dict, optional
        List that receives the leaderboard row. Defaults to the notebook-level
        ``results`` list.

    Returns
    -------
    dict
        ``{"R2", "MAPE", "MAE_norm", "RMSE_norm"}``. MAE and RMSE are divided
        by the mean of ``y_test``; MAPE is a fraction, not a percentage.

    Notes
    -----
    The leaderboard row stores the *normalised* MAE/RMSE under the keys
    ``"MAE"`` and ``"RMSE"``. A row whose ``"Model"`` equals ``label`` is
    replaced rather than duplicated, so re-running a notebook cell does not
    inflate the leaderboard.
    """
    # 1. Train on the log-transformed target.
    model.fit(X_train, np.log1p(y_train))

    # 2-3. Predict in log space, then invert log1p to return to the original scale.
    y_pred = np.expm1(model.predict(X_test))

    # 4. Metrics on the original (untransformed) target.
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)  # decimal fraction

    # 5. Normalise the absolute errors by the mean test target.
    mean_price = np.mean(y_test)
    mae_norm = mae / mean_price
    rmse_norm = rmse / mean_price

    print("📊 Wyniki modelu (log(price)):", model.__class__.__name__)
    print(f"R²: {r2:.4f}")
    print(f"MAPE: {mape:.4f}")
    print(f"Znormalizowany MAE: {mae_norm:.4f}")
    print(f"Znormalizowany RMSE: {rmse_norm:.4f}")

    if label is None:
        label = model.__class__.__name__ + " (log)"
    # Fall back to the notebook-level list only when no explicit store is given.
    store = results if results_store is None else results_store

    row = {
        "Model": label,
        "MAE": mae_norm,
        "RMSE": rmse_norm,
        "R2": r2,
        "MAPE": mape
    }
    # Overwrite a previous row with the same label so cell re-runs stay idempotent.
    for position, existing in enumerate(store):
        if existing.get("Model") == label:
            store[position] = row
            break
    else:
        store.append(row)

    return {
        "R2": r2,
        "MAPE": mape,
        "MAE_norm": mae_norm,
        "RMSE_norm": rmse_norm
    }


In [8]:
# Baseline: ordinary least squares fitted on the log-transformed price.
model = LinearRegression()
evaluate_model_log_target(
    model,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

📊 Wyniki modelu (log(price)): LinearRegression
R²: 0.6485
MAPE: 0.1475
Znormalizowany MAE: 0.1423
Znormalizowany RMSE: 0.2170


{'R2': 0.6485455398802242,
 'MAPE': 0.14747181744891905,
 'MAE_norm': 0.14233137099395057,
 'RMSE_norm': 0.21700174917751142}

In [9]:
# Random forest on the log-transformed price. The seed is pinned to 42, matching
# the other seeded models in this notebook, so the metrics are reproducible on
# re-run (an unseeded forest gives different numbers every execution).
model_rf = RandomForestRegressor(random_state=42)
evaluate_model_log_target(model_rf, X_train, y_train, X_test, y_test)

📊 Wyniki modelu (log(price)): RandomForestRegressor
R²: 0.7782
MAPE: 0.1155
Znormalizowany MAE: 0.1109
Znormalizowany RMSE: 0.1724


{'R2': 0.778197046716264,
 'MAPE': 0.1154958649072625,
 'MAE_norm': 0.11086939936938313,
 'RMSE_norm': 0.1723901181209495}

In [10]:
# Gradient boosting: 200 depth-4 trees, learning rate 0.1, fixed seed.
model_gb = GradientBoostingRegressor(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=4,
    random_state=42,
)
results_gb = evaluate_model_log_target(
    model_gb, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test
)

📊 Wyniki modelu (log(price)): GradientBoostingRegressor
R²: 0.8279
MAPE: 0.1075
Znormalizowany MAE: 0.1038
Znormalizowany RMSE: 0.1518


In [11]:
# XGBoost: 200 depth-6 trees, learning rate 0.1, fixed seed.
model_xgb = XGBRegressor(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
)
results_xgb = evaluate_model_log_target(
    model_xgb, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test
)

📊 Wyniki modelu (log(price)): XGBRegressor
R²: 0.8109
MAPE: 0.1087
Znormalizowany MAE: 0.1046
Znormalizowany RMSE: 0.1592


In [12]:
# k-nearest neighbours with library defaults.
# NOTE(review): this model is fitted on the unscaled features, whereas the SVR and
# MLP cells use the standardised ones — confirm whether that is intentional.
model_knn = KNeighborsRegressor()
results_knn = evaluate_model_log_target(
    model_knn, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test
)

📊 Wyniki modelu (log(price)): KNeighborsRegressor
R²: 0.7292
MAPE: 0.1556
Znormalizowany MAE: 0.1435
Znormalizowany RMSE: 0.1905


In [13]:
# Learn the scaling statistics from the training split only, then apply the
# same transform to both splits so no test information leaks into the fit.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [14]:
# RBF support-vector regression on the standardised features. The helper fits on
# log1p(price), so epsilon is expressed in log-price units.
svr = SVR(
    kernel="rbf",
    C=100,
    gamma="scale",
    epsilon=0.1,
)
results_svr = evaluate_model_log_target(
    svr,
    X_train=X_train_scaled,
    y_train=y_train,
    X_test=X_test_scaled,
    y_test=y_test,
)

📊 Wyniki modelu (log(price)): SVR
R²: 0.7915
MAPE: 0.1338
Znormalizowany MAE: 0.1222
Znormalizowany RMSE: 0.1671


In [15]:
# Three-hidden-layer MLP trained with Adam on the standardised features.
# NOTE(review): the recorded run of this cell reports a negative R², far below
# every other model — the configuration likely needs tuning; verify before
# relying on this row of the leaderboard.
mlp_settings = dict(
    hidden_layer_sizes=(256, 128, 64),
    activation="relu",
    solver="adam",
    alpha=0.001,
    learning_rate_init=0.001,
    max_iter=2000,
    random_state=42,
)
model_mlp = MLPRegressor(**mlp_settings)

results_mlp = evaluate_model_log_target(
    model_mlp,
    X_train=X_train_scaled,
    y_train=y_train,
    X_test=X_test_scaled,
    y_test=y_test,
)


📊 Wyniki modelu (log(price)): MLPRegressor
R²: -0.1124
MAPE: 0.2305
Znormalizowany MAE: 0.2257
Znormalizowany RMSE: 0.3861


In [16]:
# Leaderboard of every evaluated model, best R² first, with a fresh 0..n-1 index.
results_df = (
    pd.DataFrame(results)
    .sort_values("R2", ascending=False)
    .reset_index(drop=True)
)

display(results_df)

Unnamed: 0,Model,MAE,RMSE,R2,MAPE
0,GradientBoostingRegressor (log),0.103801,0.151842,0.827921,0.107479
1,XGBRegressor (log),0.104565,0.159156,0.810945,0.108651
2,SVR (log),0.122247,0.167142,0.791496,0.133825
3,RandomForestRegressor (log),0.110869,0.17239,0.778197,0.115496
4,KNeighborsRegressor (log),0.143496,0.190495,0.729161,0.155581
5,LinearRegression (log),0.142331,0.217002,0.648546,0.147472
6,MLPRegressor (log),0.225703,0.386069,-0.112427,0.23049
