In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import ElasticNet, Lasso, Ridge
from sklearn.svm import SVR
from xgboost import XGBRegressor

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score,
    mean_absolute_percentage_error,
    make_scorer
)


In [None]:
# Load the dataset from the CSV file (path is relative to the working directory).
data = pd.read_csv(filepath_or_buffer="katowice.csv")

In [6]:
# Separate the regression target ("price") from the remaining columns,
# which are all used as features.
y = data["price"]
X = data.drop(labels=["price"], axis="columns")

In [7]:
# Hold out 20% of the rows as the test set; the fixed random_state keeps the
# partition the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)


In [8]:
# Accumulator for the per-model metric dicts returned by evaluate_with_pca.
results_PCA = list()

In [None]:
def evaluate_with_pca(name, model, X_train, X_test, y_train, y_test, use_log_target=False, pca_components=0.90):
    """
    Fit `model` inside a StandardScaler -> PCA -> model Pipeline (no tuning) and score it.

    All metrics are computed on the original (non-log) scale of the target.

    Parameters:
    - name: model name (str); used in the printed header and in the "Model" field
    - model: estimator object (e.g. LinearRegression()); it is placed in the
      pipeline as-is, it is not copied by this function
    - X_train, X_test, y_train, y_test: train/test features and targets
    - use_log_target: if True, fit on log1p(y_train) and map the predictions
      back with expm1 before scoring (bool)
    - pca_components: value passed to PCA(n_components=...) (e.g. 0.95 or 10)

    Returns:
    - dict with keys "Model", "MAE_norm", "RMSE_norm", "R2", "MAPE", meant to be
      appended to results_PCA. MAE_norm and RMSE_norm are MAE and RMSE divided
      by the mean of y_test; MAPE is a fraction (0.17 means 17%), not a percentage.

    Raises:
    - ValueError: if use_log_target is True and y_train contains a value <= -1,
      for which log1p is not finite.
    """
    if use_log_target:
        # log1p gives NaN / -inf for values <= -1; fail loudly here instead of
        # handing a corrupted target to the estimator.
        if np.any(np.asarray(y_train) <= -1):
            raise ValueError(
                "use_log_target=True requires all y_train values to be greater than -1"
            )
        y_train_fit = np.log1p(y_train)
    else:
        y_train_fit = y_train

    # Scoring always uses the untouched test target: predictions are mapped back
    # to the original scale below, so the reference values need no transformation.
    y_test_eval = y_test

    # Pipeline: standardisation + PCA + model
    pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("pca", PCA(n_components=pca_components)),  # re-express the features as principal components to cut dimensionality and correlation
        ("model", model)
    ])

    # Training and prediction
    pipe.fit(X_train, y_train_fit)
    y_pred = pipe.predict(X_test)

    # Undo the log transform so predictions are on the original target scale
    if use_log_target:
        y_pred = np.expm1(y_pred)

    mae = mean_absolute_error(y_test_eval, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test_eval, y_pred))
    r2 = r2_score(y_test_eval, y_pred)
    # Mean absolute percentage error as a fraction; a zero in y_test makes it infinite.
    mape = np.mean(np.abs((y_test_eval - y_pred) / y_test_eval))

    # Normalise the absolute errors by the mean test target so they are scale-free
    mean_price = np.mean(y_test_eval)
    mae_norm = mae / mean_price
    rmse_norm = rmse / mean_price

    print(f"\n📊 {name} (z PCA)")
    print(f"R²: {r2:.4f}")
    print(f"Znormalizowane MAE: {mae_norm:.4f}")
    print(f"Znormalizowane RMSE: {rmse_norm:.4f}")
    print(f"MAPE: {mape:.2f}")

    return {
        "Model": name + " (PCA)",
        "MAE_norm": mae_norm,
        "RMSE_norm": rmse_norm,
        "R2": r2,
        "MAPE": mape
    }


In [10]:
# Linear regression (constructed with no arguments) evaluated through the
# shared scaler + PCA pipeline on the raw target.
model = LinearRegression()
result = evaluate_with_pca(
    name="Linear Regression",
    model=model,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)
results_PCA.append(result)


📊 Linear Regression (z PCA)
R²: 0.6990
Znormalizowane MAE: 0.1531
Znormalizowane RMSE: 0.2008
MAPE: 0.17


In [11]:
# k-nearest-neighbours regressor (constructed with no arguments) evaluated
# through the shared scaler + PCA pipeline on the raw target.
model = KNeighborsRegressor()
result = evaluate_with_pca(
    name="kNN",
    model=model,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)
results_PCA.append(result)


📊 kNN (z PCA)
R²: 0.6640
Znormalizowane MAE: 0.1602
Znormalizowane RMSE: 0.2122
MAPE: 0.18


In [12]:
# Multi-layer perceptron (up to 1000 iterations, fixed seed) evaluated through
# the shared scaler + PCA pipeline on the raw target.
# NOTE(review): the output recorded for this cell (R² ≈ -6.97, MAPE ≈ 0.97)
# suggests the network barely fits the raw-scale target; trying
# use_log_target=True may help — TODO confirm.
model = MLPRegressor(random_state=42, max_iter=1000)
result = evaluate_with_pca(
    name="MLP",
    model=model,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)
results_PCA.append(result)


📊 MLP (z PCA)
R²: -6.9655
Znormalizowane MAE: 0.9723
Znormalizowane RMSE: 1.0331
MAPE: 0.97




SVR z logarytmowaniem zmiennej celu (log1p / expm1)

In [13]:
# NOTE(review): SVR is already imported in the first cell; this repeat import
# is redundant and could be dropped.
from sklearn.svm import SVR

# Support vector regressor (constructed with no arguments), fitted on the
# log1p-transformed target; metrics are reported on the original scale.
model = SVR()
result = evaluate_with_pca(
    name="SVR",
    model=model,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    use_log_target=True,
)
results_PCA.append(result)



📊 SVR (z PCA)
R²: 0.8035
Znormalizowane MAE: 0.1209
Znormalizowane RMSE: 0.1622
MAPE: 0.13


In [14]:
# NOTE(review): RandomForestRegressor is already imported in the first cell;
# this repeat import is redundant and could be dropped.
from sklearn.ensemble import RandomForestRegressor

# Random forest with a fixed seed, evaluated through the shared
# scaler + PCA pipeline on the raw target.
model = RandomForestRegressor(random_state=42)
result = evaluate_with_pca(
    name="Random Forest",
    model=model,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)
results_PCA.append(result)


📊 Random Forest (z PCA)
R²: 0.7254
Znormalizowane MAE: 0.1363
Znormalizowane RMSE: 0.1918
MAPE: 0.15


In [15]:
# NOTE(review): GradientBoostingRegressor is already imported in the first
# cell; this repeat import is redundant and could be dropped.
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting with a fixed seed, evaluated through the shared
# scaler + PCA pipeline on the raw target.
model = GradientBoostingRegressor(random_state=42)
result = evaluate_with_pca(
    name="Gradient Boosting",
    model=model,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)
results_PCA.append(result)


📊 Gradient Boosting (z PCA)
R²: 0.7332
Znormalizowane MAE: 0.1396
Znormalizowane RMSE: 0.1891
MAPE: 0.16


In [16]:
# NOTE(review): XGBRegressor is already imported in the first cell; this
# repeat import is redundant and could be dropped.
from xgboost import XGBRegressor

# XGBoost regressor with a fixed seed and logging silenced, evaluated through
# the shared scaler + PCA pipeline on the raw target.
model = XGBRegressor(verbosity=0, random_state=42)
result = evaluate_with_pca(
    name="XGBoost",
    model=model,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)
results_PCA.append(result)


📊 XGBoost (z PCA)
R²: 0.7191
Znormalizowane MAE: 0.1336
Znormalizowane RMSE: 0.1940
MAPE: 0.15


In [17]:
# Collect the per-model metric dicts into one table, best R² first,
# with a fresh 0..n-1 index, and render it.
results_df = (
    pd.DataFrame(results_PCA)
    .sort_values(by="R2", ascending=False)
    .reset_index(drop=True)
)

display(results_df)

Unnamed: 0,Model,MAE_norm,RMSE_norm,R2,MAPE
0,SVR (PCA),0.120929,0.16224,0.803547,0.130766
1,Gradient Boosting (PCA),0.13962,0.189063,0.733218,0.156433
2,Random Forest (PCA),0.136258,0.191798,0.725443,0.154221
3,XGBoost (PCA),0.133579,0.193994,0.71912,0.148991
4,Linear Regression (PCA),0.153115,0.200812,0.699031,0.166539
5,kNN (PCA),0.160162,0.212187,0.663968,0.180011
6,MLP (PCA),0.972325,1.033084,-6.965511,0.974691
