# Full Training Pipeline 

In [1]:

from pathlib import Path
# Anchor every project folder to the directory the notebook is run from.
BASE_DIR = Path().resolve()
TRAIN_DIR, TEST_DIR, OUT_BASE = (
    BASE_DIR / sub for sub in ("TRAIN", "TEST", "OUTPUTS")
)

# Create the input/output folders up front so later cells can glob and write freely.
for p in (TRAIN_DIR, TEST_DIR, OUT_BASE):
    p.mkdir(exist_ok=True, parents=True)

# Last expression of the cell: display the resolved paths.
BASE_DIR, TRAIN_DIR, TEST_DIR, OUT_BASE


(WindowsPath('C:/Users/admin/Desktop/Nowy folder (3)'),
 WindowsPath('C:/Users/admin/Desktop/Nowy folder (3)/TRAIN'),
 WindowsPath('C:/Users/admin/Desktop/Nowy folder (3)/TEST'),
 WindowsPath('C:/Users/admin/Desktop/Nowy folder (3)/OUTPUTS'))

## Import

In [2]:

import json, random, csv
from datetime import datetime
from typing import Dict, List, Optional, Tuple

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import joblib
import tensorflow as tf
from keras.callbacks import CSVLogger, EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras.layers import Bidirectional, Conv1D, Dense, GRU, Dropout
from keras.models import Sequential
from keras.optimizers import Adam
from keras.regularizers import l2
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder


## Konfiguracja GPU

In [3]:

# Ask TensorFlow to grow GPU memory on demand instead of reserving it all at start-up.
# The documented entry point is tf.config.experimental.set_memory_growth; failures are
# reported (not silently swallowed) so the status line below reflects what really happened.
try:
    gpus = tf.config.list_physical_devices("GPU")
except Exception as exc:  # best effort: a broken GPU runtime must not stop the notebook
    gpus = []
    print(f"[tf] GPU query failed: {exc!r}")

if gpus:
    enabled = 0
    for g in gpus:
        try:
            tf.config.experimental.set_memory_growth(g, True)
            enabled += 1
        except (RuntimeError, ValueError) as exc:
            # RuntimeError: the device was already initialised; ValueError: invalid device.
            print(f"[tf] memory_growth not set for {g.name}: {exc}")
    if enabled == len(gpus):
        print(f"[tf] GPUs: {len(gpus)} — memory_growth ON")
    else:
        print(f"[tf] GPUs: {len(gpus)} — memory_growth ON for {enabled}/{len(gpus)}")
else:
    print("[tf] GPU not found")


[tf] GPUs: 2 — memory_growth ON


## Konfiguracja

In [4]:

# ========= CONFIG – COLUMN HEADERS MATCHING YOUR CSV FILES =========
OUTPUT_COLUMN   = "target"
ACTIVE_COMP_COL = "comp_active"   # compressor-activity signal

INPUT_COLUMNS: List[str] = [
    "feature_01", "feature_02", "feature_03", "feature_04",
    "feature_05", "feature_06", "feature_07", "feature_08",
    "feature_09", "feature_10", "feature_11", "feature_12",
    "comp_active",
]

# Categorical features (if INPUT_COLUMNS contains any, list them here as well):
CATEGORICAL_COLS: List[str] = []

# Optional explicit list of test files (empty = use every CSV found in TEST/)
TEST_FILES_OVERRIDE: List[str] = []  # e.g. ["test_A.csv","test_B.csv"]

# Fidelity / behaviour switches
STRICT_SCHEMA       = True   # True: a missing column is an error; False: fill missing columns with 0.0
USE_ONEHOT          = True   # enable/disable one-hot encoding of categorical columns
ROLLING_WINDOW      = 500    # fixed window for the rolling mean, as in the original
LOSS                = "mse"  # "mse" or "huber"
ACTIVE_WEIGHT_ALPHA = 0.0    # >0 to up-weight active samples (e.g. 0.5)


## Utils

In [5]:

def set_seed(seed: int = 50) -> None:
    """Seed the Python, NumPy and TensorFlow random generators and log the value used."""
    for seeder in (random.seed, np.random.seed, tf.random.set_seed):
        seeder(seed)
    print(f"[seed] {seed}")

def ensure_dir(p: Path) -> None:
    """Create directory ``p`` (and any missing parents); do nothing if it already exists."""
    p.mkdir(exist_ok=True, parents=True)

def make_session_dir(base: Path) -> Path:
    """Create and return a timestamped sub-directory of ``base``.

    The folder name is the current local time formatted as
    ``YYYY-MM-DD_HH-MM-SS``; ``base`` itself is created first if needed.
    An already existing folder with the same name is reused.
    """
    ensure_dir(base)
    stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    session_dir = base / stamp
    session_dir.mkdir(exist_ok=True, parents=True)
    return session_dir

def save_feature_config(cols_in: List[str], col_out: str, out_dir: Path) -> None:
    """Write the input/output column names to ``out_dir/feature_config.json``.

    The file is UTF-8 JSON (indent 2) with the keys ``input_columns`` and
    ``output_column``.
    """
    payload = {"input_columns": cols_in, "output_column": col_out}
    target = out_dir / "feature_config.json"
    target.write_text(json.dumps(payload, indent=2), encoding="utf-8")

def save_scalers(scaler: StandardScaler, out_scaler: StandardScaler, encoders: Dict, out_dir: Path) -> None:
    """Persist the preprocessing objects into ``out_dir`` with joblib.

    Files written: ``scaler.gz`` (input scaler), ``output_scaler.gz``
    (target scaler) and ``encoders.gz`` (the encoders dictionary).
    """
    artifacts = (
        ("scaler.gz", scaler),
        ("output_scaler.gz", out_scaler),
        ("encoders.gz", encoders),
    )
    for filename, obj in artifacts:
        joblib.dump(obj, out_dir / filename)

def read_csv_auto(path: Path) -> pd.DataFrame:
    """Load a CSV file, guessing its delimiter from the beginning of the file.

    The first 10000 characters are handed to ``csv.Sniffer`` restricted to the
    candidates ``,``, ``;``, ``|`` and tab. If the sniffed delimiter yields a
    single column although the sample contains ``;``, the file is re-read with
    ``;``. If sniffing or parsing fails for any reason, the function falls back
    to ``;`` and finally to the pandas default separator.

    Args:
        path: Location of the CSV file.

    Returns:
        The parsed DataFrame.

    Raises:
        Whatever the last ``pd.read_csv`` attempt raises (e.g. a missing file).
    """
    sniff_chars = 10000
    try:
        # Read only the head of the file: sniffing does not need the whole
        # (possibly very large) CSV in memory.
        with open(path, "r", encoding="utf-8", errors="ignore") as fh:
            sample = fh.read(sniff_chars)
        dialect = csv.Sniffer().sniff(sample, delimiters=",;|\t")
        df = pd.read_csv(path, sep=dialect.delimiter)
        if df.shape[1] == 1 and ";" in sample:
            df = pd.read_csv(path, sep=";")
    except Exception:
        # Deliberate best effort: any sniffing/parsing problem falls back to
        # ';' first, then to the pandas default; the final attempt is allowed
        # to raise so real problems are still reported.
        try:
            df = pd.read_csv(path, sep=";")
        except Exception:
            df = pd.read_csv(path)
    return df

def _init_onehot_encoder() -> OneHotEncoder:
    """Build a dense OneHotEncoder that ignores categories unseen during fit.

    Tries the ``sparse_output`` keyword first and falls back to ``sparse``
    when the installed scikit-learn rejects it with ``TypeError``.
    """
    shared = {"handle_unknown": "ignore"}
    try:
        encoder = OneHotEncoder(sparse_output=False, **shared)
    except TypeError:
        encoder = OneHotEncoder(sparse=False, **shared)
    return encoder


## Normalizacja danych

In [6]:

from typing import Tuple

def normalize_data(
    df: pd.DataFrame,
    expected_input_cols: List[str],
    output_col: str,
    scaler: Optional[StandardScaler] = None,
    out_scaler: Optional[StandardScaler] = None,
    encoders: Optional[Dict] = None,
    fit_encoders: bool = False,
) -> Tuple[Optional[np.ndarray], Optional[np.ndarray], StandardScaler, StandardScaler, Dict]:
    """Turn a raw DataFrame into model-ready ``X`` / ``y`` arrays.

    The caller's DataFrame is never modified; all work happens on a copy.

    Args:
        df: Raw data. ``None`` or an empty frame short-circuits the function.
        expected_input_cols: Input column names, in the order they should be used.
        output_col: Target column name. If it is absent, ``y`` is built from zeros.
        scaler: Input scaler. Fitted here only if it has no ``mean_`` attribute yet.
        out_scaler: Target scaler. Fitted here only if it has no ``mean_`` attribute yet
            (note: that also happens when ``y`` is the all-zero placeholder).
        encoders: Dict holding a ``"onehot"`` encoder; created when missing/empty.
        fit_encoders: Force (re)fitting of the one-hot encoder on this frame.

    Returns:
        ``(X, y, scaler, out_scaler, encoders)`` where ``X`` is float32 with shape
        ``(n, features, 1)`` — scaled numeric columns first, then one-hot columns —
        and ``y`` is the scaled target with shape ``(n, 1)``. For ``None``/empty
        input, ``X`` and ``y`` are ``None`` and the other objects are returned as given.

    Raises:
        ValueError: If input columns are missing and ``STRICT_SCHEMA`` is true.
    """
    if df is None or df.empty:
        return None, None, scaler, out_scaler, encoders

    # Schema validation
    missing = [c for c in expected_input_cols if c not in df.columns]
    if missing and STRICT_SCHEMA:
        raise ValueError(f"Brak kolumn w danych: {missing}")

    # Work on a private copy limited to the columns we need, so neither the
    # zero-filling below nor the NA handling leaks into the caller's frame.
    has_target = output_col in df.columns
    keep = [c for c in expected_input_cols if c in df.columns]
    if has_target and output_col not in keep:
        keep.append(output_col)
    df = df[keep].copy()
    for c in missing:
        df[c] = 0.0  # lenient fallback, only reachable when STRICT_SCHEMA is False

    # Split inputs into numeric and categorical columns (order follows expected_input_cols);
    # columns listed in CATEGORICAL_COLS are treated as categorical even if numeric.
    num_cols = [c for c in expected_input_cols if pd.api.types.is_numeric_dtype(df[c])]
    cat_cols = [c for c in expected_input_cols if c not in num_cols]
    for c in CATEGORICAL_COLS:
        if c in expected_input_cols and c not in cat_cols:
            cat_cols.append(c)
            if c in num_cols:
                num_cols.remove(c)

    # Missing values: numeric -> column mean of this frame, categorical -> forward fill.
    for c in num_cols:
        if df[c].isna().any():
            df[c] = df[c].fillna(df[c].mean())
    for c in cat_cols:
        if df[c].isna().any():
            # .ffill() replaces the deprecated fillna(method="ffill")
            df[c] = df[c].astype("object").ffill()

    scaler     = scaler or StandardScaler()
    out_scaler = out_scaler or StandardScaler()
    encoders   = encoders or {"onehot": _init_onehot_encoder()}

    # Fit the input scaler only once; afterwards reuse its statistics.
    if hasattr(scaler, "mean_"):
        X_num = scaler.transform(df[num_cols])
    else:
        X_num = scaler.fit_transform(df[num_cols])

    # Categorical columns are dropped entirely when USE_ONEHOT is off.
    X_cat = np.empty((len(df), 0))
    if USE_ONEHOT and cat_cols:
        if fit_encoders or not hasattr(encoders["onehot"], "categories_"):
            encoders["onehot"].fit(df[cat_cols])
        X_cat = encoders["onehot"].transform(df[cat_cols])

    X = np.concatenate([X_num, X_cat], axis=1).astype(np.float32)
    X = np.expand_dims(X, axis=-1)  # (n, features, 1)

    if has_target:
        y_raw = df[[output_col]].values
    else:
        y_raw = np.zeros((len(df), 1))  # placeholder target when the column is absent
    if hasattr(out_scaler, "mean_"):
        y = out_scaler.transform(y_raw)
    else:
        y = out_scaler.fit_transform(y_raw)
    return X, y, scaler, out_scaler, encoders


## Model i trening

In [7]:

def build_model(num_features: int) -> Sequential:
    """Build and compile the Conv1D + bidirectional-GRU regression network.

    The network expects inputs of shape ``(num_features, 1)`` and ends in a
    single linear unit. It is compiled with Adam (learning rate 1e-3) and the
    metrics ``mae`` and ``mape``. The loss is ``"mse"`` when the module-level
    ``LOSS`` equals "mse" (case-insensitive); any other value selects Huber.
    """
    layer_stack = [
        Conv1D(64, 1, activation="relu", input_shape=(num_features, 1)),
        Conv1D(128, 1, activation="relu"),
        Dropout(0.2),
        Conv1D(64, 1, activation="relu"),
        Dropout(0.2),
        Bidirectional(GRU(128, return_sequences=True, kernel_regularizer=l2(1e-3))),
        Dropout(0.3),
        Bidirectional(GRU(64, return_sequences=False, kernel_regularizer=l2(1e-3))),
        Dropout(0.3),
        Dense(1, activation="linear"),
    ]
    network = Sequential(layer_stack)

    if LOSS.lower() == "mse":
        objective = "mse"
    else:
        objective = tf.keras.losses.Huber()

    network.compile(optimizer=Adam(1e-3), loss=objective, metrics=["mae", "mape"])
    return network

def train_model(
    model: Sequential,
    X: np.ndarray,
    y: np.ndarray,
    out_dir: Path,
    epochs: int = 100,
    batch_size: int = 128,
    sample_weight: Optional[np.ndarray] = None,
) -> tf.keras.callbacks.History:
    """Fit ``model`` on ``(X, y)`` and return the Keras ``History``.

    Training uses ``validation_split=0.1`` and four callbacks, all watching
    or writing under ``out_dir`` (created if missing):

    * early stopping on ``val_loss`` (patience 5, best weights restored),
    * learning-rate halving on a ``val_loss`` plateau (patience 3, floor 1e-5),
    * a checkpoint of the best model to ``best_model.keras``,
    * a CSV log appended to ``training_log.csv``.

    Args:
        model: Compiled Keras model.
        X: Input array.
        y: Target array.
        out_dir: Directory for the checkpoint and the CSV log.
        epochs: Maximum number of epochs.
        batch_size: Mini-batch size.
        sample_weight: Optional per-sample weights forwarded to ``fit``.
    """
    ensure_dir(out_dir)

    early_stop = EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True)
    lr_schedule = ReduceLROnPlateau(monitor="val_loss", patience=3, factor=0.5, min_lr=1e-5, verbose=1)
    checkpoint = ModelCheckpoint(out_dir / "best_model.keras", monitor="val_loss", mode="min", save_best_only=True)
    csv_log = CSVLogger(out_dir / "training_log.csv", append=True)
    callbacks = [early_stop, lr_schedule, checkpoint, csv_log]

    print(f"[train] X={X.shape} y={y.shape} epochs={epochs} batch={batch_size}")
    return model.fit(
        X,
        y,
        epochs=epochs,
        batch_size=batch_size,
        validation_split=0.1,
        callbacks=callbacks,
        verbose=1,
        sample_weight=sample_weight,
    )


## Wykresy i metryki

In [8]:

def plot_training_curves(history: tf.keras.callbacks.History, out_dir: Path, tag: str) -> None:
    """Save a two-panel PNG (loss and MAE, train vs. validation) for one training run.

    The figure is written to ``out_dir/training_curves_<tag>.png`` at 150 dpi,
    then closed; the output path is printed.
    """
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    panels = (("loss", "Loss"), ("mae", "MAE"))
    for ax, (metric, title) in zip(axes, panels):
        ax.plot(history.history[metric])
        ax.plot(history.history[f"val_{metric}"])
        ax.set_title(title)
        ax.legend(["train", "val"])
    fig.tight_layout()

    path = out_dir / f"training_curves_{tag}.png"
    fig.savefig(path, dpi=150)
    plt.close(fig)
    print(f"[plot] {path}")

def plot_pred_diff_interactive(
    y_true: np.ndarray,
    y_pred: np.ndarray,
    X: Optional[np.ndarray] = None,
    in_cols: Optional[List[str]] = None,
    out_dir: Path = Path("."),
    tag: str = "plot",
) -> Dict[str, float]:
    """Write an interactive truth-vs-prediction report and return error metrics.

    The HTML file ``out_dir/prediction_analysis_<tag>.html`` has two rows:
    truth and prediction on top, the difference ``y_pred - y_true`` with its
    rolling mean (window ``ROLLING_WINDOW``) below. When ``X`` and ``in_cols``
    are given and ``in_cols`` contains ``ACTIVE_COMP_COL``, differences of
    samples whose ``X[:, idx, 0]`` equals 0 are blanked and forward-filled
    before the rolling mean is taken.

    NOTE(review): the mask is a plain ``!= 0`` test on ``X``; if ``X`` holds
    standardized features, an originally zero value is usually no longer 0 —
    confirm what the caller passes.

    Returns:
        Dict with the float metrics ``mse``, ``rmse``, ``mae`` and ``r2``.
    """
    residuals = y_pred - y_true
    window = ROLLING_WINDOW  # fixed window, as in the original

    active_idx = None
    if X is not None and in_cols and ACTIVE_COMP_COL in in_cols:
        active_idx = in_cols.index(ACTIVE_COMP_COL)

    if active_idx is None:
        base_series = pd.Series(residuals)
        roll_label = f"rolling({window})"
    else:
        is_active = X[:, active_idx, 0] != 0
        base_series = pd.Series(np.where(is_active, residuals, np.nan)).ffill()
        roll_label = f"rolling({window}) [active]"
    rolling_mean = base_series.rolling(window=window).mean()

    mse = mean_squared_error(y_true, y_pred)
    rmse = float(np.sqrt(mse))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    fig = make_subplots(rows=2, cols=1, subplot_titles=("Truth vs Pred", "Diff + Rolling"))
    traces = (
        (y_true, "true", 1),
        (y_pred, "pred", 1),
        (residuals, "diff", 2),
        (rolling_mean, roll_label, 2),
    )
    for values, label, row in traces:
        fig.add_trace(go.Scatter(y=values, mode="lines", name=label), row=row, col=1)
    fig.update_layout(height=800, title_text=f"Prediction analysis — {tag}", showlegend=True)

    path = out_dir / f"prediction_analysis_{tag}.html"
    fig.write_html(path)
    print(f"[plot] {path}")
    return {"mse": float(mse), "rmse": rmse, "mae": float(mae), "r2": float(r2)}


## Główny pipeline

In [9]:

def _raw_active_mask(frame: pd.DataFrame, n_rows: int) -> Optional[np.ndarray]:
    """Return a boolean 'active' mask from the unscaled ACTIVE_COMP_COL column, or None if unavailable."""
    if ACTIVE_COMP_COL not in frame.columns or len(frame) != n_rows:
        return None
    raw = pd.to_numeric(frame[ACTIVE_COMP_COL], errors="coerce").fillna(0)
    return raw.to_numpy() != 0


def run_pipeline(epochs: int = 5, batch_size: int = 128):
    """Train incrementally on every CSV in TRAIN/ and evaluate on the test CSVs.

    For each training file (sorted by name) the same model is trained further,
    its history, curves, scalers and a model snapshot are written to a fresh
    timestamped folder under ``OUT_BASE``, and every test file is evaluated.
    All test metrics are collected into ``test_metrics.csv``.

    The compressor-activity mask (sample weights, active-only rolling mean in
    the plots) is taken from the raw CSV column: after standardization a raw 0
    is generally no longer 0, so testing the scaled array would mark
    (almost) every sample as active.

    Args:
        epochs: Maximum epochs per training file.
        batch_size: Mini-batch size.

    Returns:
        None. Returns early (after printing a warning) when TRAIN/ has no CSV.
    """
    set_seed(50)
    ensure_dir(TRAIN_DIR); ensure_dir(TEST_DIR)
    out_dir = make_session_dir(OUT_BASE)
    save_feature_config(INPUT_COLUMNS, OUTPUT_COLUMN, out_dir)

    train_files = sorted(TRAIN_DIR.glob("*.csv"))
    if TEST_FILES_OVERRIDE:
        test_files = [TEST_DIR / name for name in TEST_FILES_OVERRIDE]
    else:
        test_files = sorted(TEST_DIR.glob("*.csv"))
    if not train_files:
        # was print(f("...")): a call to an undefined name `f`, i.e. a NameError
        print(f"[warn] wrzuć CSV do: {TRAIN_DIR}")
        return
    if not test_files:
        # training still runs, just without evaluation
        print(f"[warn] wrzuć CSV do: {TEST_DIR}")

    scaler, out_scaler = StandardScaler(), StandardScaler()
    enc = {"onehot": _init_onehot_encoder()}
    model, used_cols = None, INPUT_COLUMNS

    all_metrics = []
    for i, fpath in enumerate(train_files, start=1):
        df = read_csv_auto(fpath)
        X_tr, y_tr, scaler, out_scaler, enc = normalize_data(
            df, used_cols, OUTPUT_COLUMN, scaler, out_scaler, enc, fit_encoders=(i == 1)
        )
        if X_tr is None:
            print(f"[train] skip {fpath.name}")
            continue

        if model is None:
            model = build_model(X_tr.shape[1])
            model.summary()

        # Optional up-weighting of active samples, based on the raw activity signal.
        sample_weight = None
        if ACTIVE_WEIGHT_ALPHA > 0 and ACTIVE_COMP_COL in used_cols:
            active_tr = _raw_active_mask(df, len(X_tr))
            if active_tr is not None:
                sample_weight = 1.0 + ACTIVE_WEIGHT_ALPHA * active_tr.astype(np.float32)

        hist = train_model(model, X_tr, y_tr, out_dir, epochs=epochs, batch_size=batch_size, sample_weight=sample_weight)
        pd.DataFrame(hist.history).to_csv(out_dir / f"history_train_{i:02d}.csv", index=False)
        plot_training_curves(hist, out_dir, tag=f"train_{i:02d}")
        save_scalers(scaler, out_scaler, enc, out_dir)

        # Snapshot of the model after this training stage.
        model.save(out_dir / f"model_iter_{i:02d}.keras")
        (out_dir / f"model_iter_{i:02d}_config.json").write_text(
            json.dumps({"input_shape": int(X_tr.shape[1])}), encoding="utf-8"
        )

        # Evaluate on every test file after each training stage.
        for j, tpath in enumerate(test_files, start=1):
            if not tpath.exists():
                print(f"[test] missing {tpath.name}")
                continue
            df_t = read_csv_auto(tpath)
            X_te, y_te, _, _, _ = normalize_data(df_t, used_cols, OUTPUT_COLUMN, scaler, out_scaler, enc, fit_encoders=False)
            if X_te is None:
                print(f"[test] skip {tpath.name}")
                continue

            y_pred_n = model.predict(X_te, verbose=0).flatten()
            y_true = out_scaler.inverse_transform(y_te.reshape(-1, 1)).flatten()
            y_pred = out_scaler.inverse_transform(y_pred_n.reshape(-1, 1)).flatten()

            # The plot derives its activity mask from X[:, idx, 0] != 0, so hand it a copy
            # whose activity column carries the raw 0/1 signal instead of the scaled one.
            X_plot = X_te
            active_te = _raw_active_mask(df_t, len(X_te))
            if active_te is not None and ACTIVE_COMP_COL in used_cols:
                idx_act = used_cols.index(ACTIVE_COMP_COL)
                if idx_act < X_te.shape[1]:
                    X_plot = X_te.copy()
                    X_plot[:, idx_act, 0] = active_te

            m = plot_pred_diff_interactive(y_true, y_pred, X=X_plot, in_cols=used_cols, out_dir=out_dir, tag=f"tr{i:02d}_te{j:02d}")
            row = {"train_file": fpath.name, "test_file": tpath.name, **m}
            all_metrics.append(row)
            print(f"[metrics] {tpath.name}: MSE={m['mse']:.3f} RMSE={m['rmse']:.3f} MAE={m['mae']:.3f} R2={m['r2']:.3f}")

    if all_metrics:
        pd.DataFrame(all_metrics).to_csv(out_dir / "test_metrics.csv", index=False)
        print(f"[eval] metrics → {out_dir / 'test_metrics.csv'}")
    print(f"[done] artifacts → {out_dir}")


## Start

In [10]:

# Choose the run parameters here, then launch the whole pipeline.
EPOCHS, BATCH = 5, 128
run_pipeline(batch_size=BATCH, epochs=EPOCHS)


[seed] 50
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d (Conv1D)             (None, 13, 64)            128       
                                                                 
 conv1d_1 (Conv1D)           (None, 13, 128)           8320      
                                                                 
 dropout (Dropout)           (None, 13, 128)           0         
                                                                 
 conv1d_2 (Conv1D)           (None, 13, 64)            8256      
                                                                 
 dropout_1 (Dropout)         (None, 13, 64)            0         
                                                                 
 bidirectional (Bidirectiona  (None, 13, 256)          148992    
 l)                                                              
                                              