In [1]:
import pandas as pd
import numpy as np
from datetime import timedelta
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import joblib

from sklearn.preprocessing import LabelEncoder


In [2]:
telemetry = pd.read_csv("PdM_telemetry.csv", parse_dates=["datetime"])
errors = pd.read_csv("PdM_errors.csv", parse_dates=["datetime"])
failures = pd.read_csv("PdM_failures.csv", parse_dates=["datetime"])
machines = pd.read_csv("PdM_machines.csv")
maint = pd.read_csv("PdM_maint.csv", parse_dates=["datetime"])

In [3]:
## Feature Engineering

telemetry = telemetry.sort_values(["machineID", "datetime"])

# Estadísticas móviles (3h y 24h)
rolling_3h = telemetry.set_index("datetime").groupby("machineID")[["volt", "rotate", "pressure", "vibration"]].rolling("3h", min_periods=1).agg(['mean', 'std']).reset_index()
rolling_3h.columns = ["machineID", "datetime"] + [f"{col[0]}_{col[1]}_3h" for col in rolling_3h.columns[2:]]

rolling_24h = telemetry.set_index("datetime").groupby("machineID")[["volt", "rotate", "pressure", "vibration"]].rolling("24h", min_periods=1).agg(['mean', 'std']).reset_index()
rolling_24h.columns = ["machineID", "datetime"] + [f"{col[0]}_{col[1]}_24h" for col in rolling_24h.columns[2:]]

# Unir features
features = pd.merge(rolling_3h, rolling_24h, on=["machineID", "datetime"])

In [4]:
# Agregación de errores

error_counts = errors.copy()
error_counts["count"] = 1
error_counts = error_counts.set_index("datetime").groupby(["machineID", "errorID"]).rolling("24h", min_periods=1)["count"].sum().reset_index()
error_pivot = error_counts.pivot_table(index=["machineID", "datetime"], columns="errorID", values="count").fillna(0).reset_index()

In [5]:
features = pd.merge(features, error_pivot, on=["machineID", "datetime"], how="left").fillna(0)

In [6]:
## Información de maquina
features = pd.merge(features, machines, on="machineID", how="left")
features["model"] = LabelEncoder().fit_transform(features["model"])

In [7]:
features.columns

Index(['machineID', 'datetime', 'volt_mean_3h', 'volt_std_3h',
       'rotate_mean_3h', 'rotate_std_3h', 'pressure_mean_3h',
       'pressure_std_3h', 'vibration_mean_3h', 'vibration_std_3h',
       'volt_mean_24h', 'volt_std_24h', 'rotate_mean_24h', 'rotate_std_24h',
       'pressure_mean_24h', 'pressure_std_24h', 'vibration_mean_24h',
       'vibration_std_24h', 'error1', 'error2', 'error3', 'error4', 'error5',
       'model', 'age'],
      dtype='object')

In [8]:
## Etiquetado

failures["label"] = 1
future_failures = failures.copy()
future_failures["datetime"] = future_failures["datetime"] - pd.Timedelta("24h")
labels = telemetry[["machineID", "datetime"]].merge(future_failures, on=["machineID", "datetime"], how="left").fillna(0)
labels["label"] = labels["label"].apply(lambda x: 1 if x != 0 else 0)

# Dataset final
dataset = pd.merge(features, labels[["machineID", "datetime", "label"]], on=["machineID", "datetime"], how="left")
dataset["label"] = dataset["label"].fillna(0).astype(int)
dataset = dataset.dropna()

In [9]:
split_date = dataset["datetime"].quantile(0.8)

train = dataset[dataset["datetime"] < split_date]
test = dataset[dataset["datetime"] >= split_date]

X_train = train.drop(columns=["datetime", "label"])
y_train = train["label"]

X_test = test.drop(columns=["datetime", "label"])
y_test = test["label"]

print("Train class distribution:")
print(y_train.value_counts())

Train class distribution:
label
0    700229
1       603
Name: count, dtype: int64


In [10]:

from sklearn.utils import resample

train_data = pd.concat([X_train, y_train], axis=1)
class_0 = train_data[train_data.label == 0]
class_1 = train_data[train_data.label == 1]

# Undersample clase 0 para lograr un ratio 3:1
class_0_under = class_0.sample(n=len(class_1)*3, random_state=42)
balanced_train = pd.concat([class_0_under, class_1])

X_train_bal = balanced_train.drop(columns="label")
y_train_bal = balanced_train["label"]

print("Balanced class distribution:")
print(y_train_bal.value_counts())


Balanced class distribution:
label
0    1809
1     603
Name: count, dtype: int64


In [11]:
# Entrenamiento de modelo

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
model.fit(X_train_bal, y_train_bal)

y_pred = model.predict(X_test)
y_probs = model.predict_proba(X_test)[:, 1]

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(f"AUC: {roc_auc_score(y_test, y_probs):.4f}")

[[175059    110]
 [     0    140]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    175169
           1       0.56      1.00      0.72       140

    accuracy                           1.00    175309
   macro avg       0.78      1.00      0.86    175309
weighted avg       1.00      1.00      1.00    175309

AUC: 1.0000
