# Modele Bazowe - Regresja Liniowa i Prophet

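Ten notebook implementuje modele bazowe dla prognozowania popytu, które będą służyć jako punkt odniesienia dla modeli neuronowych. Uwaga: komórki poniżej trenują przez PyCaret cztery modele (XGBoost, regresja liniowa, Ridge, Lasso); Prophet, średnia ruchoma i naiwny model sezonowy z poniższej listy nie są zaimplementowane w tym notebooku.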

## Modele bazowe:
- Regresja liniowa z cechami czasowymi
- Prophet (model Facebook do szeregów czasowych)
- Średnia ruchoma
- Naiwny model sezonowy

## 0. Imports & logging helpers

In [1]:
!pip install -q --upgrade pip

!pip install pycaret[full]



In [34]:
from pathlib import Path
import pandas as pd, numpy as np, csv, uuid

from metrics import summary, rmsle
from pycaret.regression import RegressionExperiment

# Input directory holding the pre-processed parquet splits, and the CSV file
# that accumulates one metrics row per (model, run, split).
DATA = Path("/content/data_processed")
RESULT = Path("results.csv")

# Create the results file with a header row only on first use, so rows
# recorded by earlier runs are preserved.
if not RESULT.exists():
    columns = [
        "model", "run_id", "split",
        "MAE", "RMSE", "RMSLE", "MAPE", "SMAPE", "WAPE", "R2",
    ]
    header_only = pd.DataFrame(columns=columns)
    header_only.to_csv(RESULT, index=False)

def log(model_name, run_id, split, y_true, y_pred):
    """Append one metrics row for a model/split to the results CSV.

    The row is ``model_name, run_id, split`` followed by the values of
    ``summary(y_true, y_pred)`` in the order that mapping yields them.

    NOTE(review): the values are written positionally, so this assumes
    ``summary`` returns its metrics in the same order as the CSV header
    (MAE, RMSE, RMSLE, MAPE, SMAPE, WAPE, R2) -- confirm against ``metrics``.
    """
    row = [model_name, run_id, split, *summary(y_true, y_pred).values()]
    # Append mode keeps earlier rows; newline="" lets csv control line endings.
    with RESULT.open("a", newline="") as handle:
        writer = csv.writer(handle)
        writer.writerow(row)

# Directory that receives one parquet file per (model, split); created on
# first use and reused afterwards.
PRED_DIR = Path("predictions")
PRED_DIR.mkdir(exist_ok=True)


def dump_preds(model_name: str,
               split: str,
               dates: pd.Series,
               y_true: pd.Series,
               y_pred: pd.Series) -> None:
    """Write the dates, actuals and predictions of one split to parquet.

    The file is ``PRED_DIR/<model_name lower-cased>_<split>.parquet`` with the
    columns ``date``, ``y_true`` and ``y_pred``. Only the underlying values of
    the three series are stored, so their indexes are ignored and the rows are
    paired by position. An existing file with the same name is overwritten.

    Args:
        model_name: Model label; lower-cased for the file name.
        split: Split label used as the file-name suffix.
        dates: Dates of the observations.
        y_true: Observed target values.
        y_pred: Predicted target values.
    """
    frame = pd.DataFrame(
        dict(date=dates.values, y_true=y_true.values, y_pred=y_pred.values)
    )
    target_file = PRED_DIR / f"{model_name.lower()}_{split}.parquet"
    frame.to_parquet(target_file, index=False)
    print(f"Saved {target_file.name} ({len(frame):,} rows)")


## 1. Load train / val / test splits

In [35]:
# Name of the column to predict.
target = "sales"

# The three splits are read as-is from parquet files under DATA.
train_df, val_df, test_df = (
    pd.read_parquet(DATA / file_name)
    for file_name in ("train.parquet", "val.parquet", "test.parquet")
)

# Every training column except the target and the date is used as a feature.
feature_cols = train_df.columns.difference([target, "date"])


## 2. PyCaret configuration

In [36]:
# Columns handed to PyCaret: all features followed by the target
# (the date column is deliberately left out).
model_columns = feature_cols.tolist() + [target]

exp = RegressionExperiment()
exp.setup(
    # Training split for fitting; validation split passed as PyCaret's test data.
    data=train_df[model_columns],
    test_data=val_df[model_columns],
    target=target,
    # Fixed seed for repeatable runs.
    session_id=42,
    # Three time-series folds, with shuffling switched off everywhere.
    fold_strategy="timeseries",
    fold=3,
    fold_shuffle=False,
    data_split_shuffle=False,
    verbose=False,
    transform_target=False,
)

def rmsle_clip_pycaret(y_true, y_pred, **kwargs):
    """RMSLE scorer for PyCaret that floors predictions at zero first.

    Predictions below 0 are replaced by 0 and the result is passed to
    ``rmsle`` together with ``y_true``. Extra keyword arguments are accepted
    and ignored.

    NOTE(review): the clipping is presumably there because a log-based error
    is undefined for negative predictions -- confirm against ``metrics.rmsle``.
    """
    non_negative_pred = np.clip(y_pred, 0, None)
    return rmsle(y_true, non_negative_pred)

# Replace the experiment's "rmsle" metric with the clipped variant; lower
# values are better. Removing the old entry first frees the "rmsle" id for
# the replacement.
rmsle_metric_spec = dict(
    id="rmsle",
    name="RMSLE",
    score_func=rmsle_clip_pycaret,
    greater_is_better=False,
)
exp.remove_metric("rmsle")
exp.add_metric(**rmsle_metric_spec)

Unnamed: 0,rmsle
Name,RMSLE
Display Name,RMSLE
Score Function,<function rmsle_clip_pycaret at 0x78559472d1c0>
Scorer,"make_scorer(rmsle_clip_pycaret, greater_is_bet..."
Target,pred
Args,{}
Greater is Better,False
Custom,True


## 3. Training and selection of four base models

In [37]:
# PyCaret ids of the baseline estimators to train and compare.
baseline_ids = ["xgboost", "lr", "ridge", "lasso"]

# Rank the candidates by the RMSLE metric and keep all of them
# (n_select equals the number of candidates) for evaluation below.
best_models = exp.compare_models(
    include=baseline_ids,
    sort="RMSLE",
    n_select=len(baseline_ids),
)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,MAPE,RMSLE,TT (Sec)
xgboost,Extreme Gradient Boosting,92.8457,109497.276,325.603,0.9001,2.8168,1.1075,15.6267
lasso,Lasso Regression,91.2243,92111.4199,302.5573,0.9191,4.3087,1.4394,15.0233
ridge,Ridge Regression,98.3953,92961.7438,304.0065,0.9183,5.4525,1.543,1.84
lr,Linear Regression,98.3993,92961.6175,304.0063,0.9183,5.4533,1.5432,3.0867


Processing:   0%|          | 0/24 [00:00<?, ?it/s]

## 4. Evaluation on validation and test + record

In [38]:
def _score_split(model, label, run_id, split, frame):
    """Predict `frame` with `model`, record metrics and predictions, return the predictions."""
    preds = exp.predict_model(model, data=frame)["prediction_label"]
    log(label, run_id, split, frame[target], preds)
    dump_preds(label, split, frame["date"], frame[target], preds)
    return preds


for m in best_models:
    # The estimator's class name doubles as the model label; the run id adds a
    # short random suffix so rows from repeated runs can be told apart.
    name = m.__class__.__name__
    run  = f"{name}_{uuid.uuid4().hex[:6]}"

    # Validation first, then test -- each split is logged and dumped separately.
    pred_val  = _score_split(m, name, run, "val",  val_df)
    pred_test = _score_split(m, name, run, "test", test_df)

print("Metrics appended to results.csv  |  predictions written to /predictions")


Unnamed: 0,Model,MAE,MSE,RMSE,R2,MAPE,RMSLE
0,Extreme Gradient Boosting,86.7202,74768.2344,273.4378,0.9559,1.4928,0.9315


Saved xgbregressor_val.parquet (322,542 rows)


Unnamed: 0,Model,MAE,MSE,RMSE,R2,MAPE,RMSLE
0,Extreme Gradient Boosting,79.5057,56488.5508,237.6732,0.9657,1.2179,0.8433


Saved xgbregressor_test.parquet (81,972 rows)


Unnamed: 0,Model,MAE,MSE,RMSE,R2,MAPE,RMSLE
0,Lasso Regression,98.4997,101545.6854,318.6623,0.9401,3.0346,1.2975


Saved lasso_val.parquet (322,542 rows)


Unnamed: 0,Model,MAE,MSE,RMSE,R2,MAPE,RMSLE
0,Lasso Regression,84.3508,63211.0118,251.418,0.9616,2.4924,1.2079


Saved lasso_test.parquet (81,972 rows)


Unnamed: 0,Model,MAE,MSE,RMSE,R2,MAPE,RMSLE
0,Ridge Regression,99.7259,101506.2948,318.6005,0.9401,3.2474,1.3019


Saved ridge_val.parquet (322,542 rows)


Unnamed: 0,Model,MAE,MSE,RMSE,R2,MAPE,RMSLE
0,Ridge Regression,84.895,63096.6917,251.1905,0.9617,2.538,1.1846


Saved ridge_test.parquet (81,972 rows)


Unnamed: 0,Model,MAE,MSE,RMSE,R2,MAPE,RMSLE
0,Linear Regression,99.7271,101506.5366,318.6009,0.9401,3.2477,1.302


Saved linearregression_val.parquet (322,542 rows)


Unnamed: 0,Model,MAE,MSE,RMSE,R2,MAPE,RMSLE
0,Linear Regression,84.8959,63096.7714,251.1907,0.9617,2.5381,1.1846


Saved linearregression_test.parquet (81,972 rows)
Metrics appended to results.csv  |  predictions written to /predictions


## 5. Ranking table

In [40]:
# log() only ever appends to results.csv, so after the notebook has been run
# more than once the file holds several rows per (model, split) pair.
# DataFrame.pivot raises "Index contains duplicate entries, cannot reshape" on
# such input, so keep only the most recently appended row for each pair
# (rows are in append order, hence keep="last") before reshaping.
results = pd.read_csv(RESULT)
latest  = results.drop_duplicates(subset=["model", "split"], keep="last")

# One row per model, one column per split, best (lowest) test RMSLE first.
lb = (latest
        .pivot(index="model", columns="split", values="RMSLE")
        .sort_values("test"))
display(lb.round(4))

split,test,val
model,Unnamed: 1_level_1,Unnamed: 2_level_1
XGBRegressor,0.8433,0.9315
Ridge,1.1846,1.3019
LinearRegression,1.1846,1.302
Lasso,1.2079,1.2975
