In [None]:
# =========================================================
# Server Machine Dataset — Time Series Failure Prediction
# =========================================================

import os
import numpy as np
import pandas as pd

from sklearn.metrics import roc_auc_score, classification_report
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import joblib


# -----------------------------
# 1. PATHS
# -----------------------------

# Directory holding the per-machine metric files.
TRAIN_PATH = "ServerMachineDataset/train"
# Directory holding the label files; a label file is looked up under the same
# file name as its metric file.
LABEL_PATH = "ServerMachineDataset/interpretation_label"


# Fail fast when the dataset folders are missing.
assert os.path.exists(TRAIN_PATH), "Папка train не найдена"
# Fixed: the check used the undefined name TRAIN_LABEL and crashed with a
# NameError before the assertion could even be evaluated.
assert os.path.exists(LABEL_PATH), "Папка interpretation_label не найдена"


# -----------------------------
# 2. PARAMETERS
# -----------------------------
# Seed handed to the classifier so that runs are reproducible.
RANDOM_STATE = 42

# Number of consecutive past time steps packed into one sample.
WINDOW = 10
# Number of steps right after the window that the target looks at
# (1 = predict one step ahead).
HORIZON = 1
# Column indices of the metrics used as features: the first four columns
# (described as CPU/MEM/IO/NET by the original author — TODO confirm against
# the dataset documentation).
FEATURES = list(range(4))


# -----------------------------
# 3. LOAD FILE LIST
# -----------------------------
# Collect every ".txt" entry of the train folder; sorting makes the
# processing order independent of the order os.listdir happens to return.
machine_files = [name for name in os.listdir(TRAIN_PATH) if name.endswith(".txt")]
machine_files.sort()

print(f"Найдено серверов: {len(machine_files)}")
print("Пример файлов:", machine_files[:3])


# -----------------------------
# 4. BUILD TIME SERIES DATASET
# -----------------------------
# Every sample is WINDOW consecutive rows of the selected metric columns,
# flattened row by row. Its target is the largest label among the HORIZON
# steps that immediately follow the window.
#
# NOTE(review): the loader assumes each label file contains one numeric label
# per time step. In the public SMD release the interpretation_label files
# appear to use a "start-end:dims" text format, with per-step labels stored
# under test_label for the test split — confirm the dataset layout.
X, y = [], []

for machine_file in machine_files:
    data_path = os.path.join(TRAIN_PATH, machine_file)
    # Fixed: the original used the undefined name TRAIN_LABEL_PATH (NameError).
    label_path = os.path.join(LABEL_PATH, machine_file)

    if not os.path.exists(label_path):
        print(f"[WARN] Нет label для {machine_file}, пропускаем")
        continue

    # Load the metrics and the labels of this machine.
    data = np.loadtxt(data_path, delimiter=",")
    labels = np.loadtxt(label_path, delimiter=",")

    # Consistency check: there must be exactly one label per data row.
    if len(data) != len(labels):
        print(f"[WARN] Несовпадение длины данных и label: {machine_file}")
        continue

    n_steps = len(data)

    # Skip series that cannot hold even one window plus its horizon.
    if n_steps < WINDOW + HORIZON:
        continue

    # Select the feature columns once; slicing the NumPy array directly avoids
    # the per-iteration cost of DataFrame.iloc and yields the same values.
    selected = data[:, FEATURES]

    # Sliding window. The last valid start of the horizon is
    # t = n_steps - HORIZON (so that t + HORIZON == n_steps), hence the "+ 1".
    # The original range stopped one step early and dropped that sample.
    for t in range(WINDOW, n_steps - HORIZON + 1):
        X.append(selected[t - WINDOW:t].flatten())
        y.append(labels[t:t + HORIZON].max())


# -----------------------------
# 5. FINAL DATASET
# -----------------------------
X = np.array(X)
# The labels are read with np.loadtxt and are therefore floats. np.bincount
# only accepts integer input (a float array raises TypeError), so the class
# labels are stored as integers.
y = np.array(y).astype(int)

print("\n=== Итоговый датасет ===")
print("X shape:", X.shape)
print("y shape:", y.shape)
print("Баланс классов:", np.bincount(y))


# -----------------------------
# 6. TRAIN / TEST SPLIT (TIME ORDER)
# -----------------------------
# No shuffling: the leading 80% of the samples, in the order they are stored
# in X, form the training set and the remaining tail forms the test set.
split_idx = int(len(X) * 0.8)

X_train, X_test = X[:split_idx], X[split_idx:]
y_train, y_test = y[:split_idx], y[split_idx:]


# -----------------------------
# 7. MODEL
# -----------------------------
# Two-step pipeline: missing values are replaced by the per-column median,
# then a histogram-based gradient boosting classifier is fitted.
model = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    (
        "clf",
        HistGradientBoostingClassifier(
            random_state=RANDOM_STATE,
            max_iter=200,
            learning_rate=0.08,
            max_depth=5,
        ),
    ),
])


# -----------------------------
# 8. TRAINING
# -----------------------------
print("\nОбучение модели...")
model.fit(X_train, y_train)


# -----------------------------
# 9. EVALUATION
# -----------------------------
# Column 1 of predict_proba belongs to the second entry of model.classes_,
# i.e. the positive class when the labels are 0/1.
proba = model.predict_proba(X_test)[:, 1]
# Hard predictions at a fixed 0.5 probability threshold.
pred = np.where(proba >= 0.5, 1, 0)

print("\n=== Метрики ===")
print("ROC-AUC:", roc_auc_score(y_true=y_test, y_score=proba))
print(classification_report(y_true=y_test, y_pred=pred, digits=4))


# -----------------------------
# 10. SAVE MODEL
# -----------------------------
# Bundle the fitted pipeline with the windowing settings needed to rebuild
# the same feature vectors at inference time.
artifact = dict(
    model=model,
    window=WINDOW,
    horizon=HORIZON,
    features_idx=FEATURES,
    # Length of one flattened sample: WINDOW rows of len(FEATURES) columns.
    n_features=WINDOW * len(FEATURES),
)

# Persisting is currently disabled; uncomment the lines below to write the
# artifact to disk.
# joblib.dump(artifact, "smd_ts_failure_model.joblib")

# print("\nМодель сохранена: smd_ts_failure_model.joblib")


AssertionError: Папка train_label не найдена