In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.utils import resample


In [2]:
# Load the raw inputs. The telemetry and failure logs both carry a
# "datetime" column that is parsed into timestamps at read time; the machine
# table is read without any date parsing.
telemetry, failures = (
    pd.read_csv(csv_name, parse_dates=["datetime"])
    for csv_name in ("train_telemetry.csv", "train_failures.csv")
)
machines = pd.read_csv("PdM_machines.csv")

In [3]:
# Define the cutoff point
# NOTE(review): everything in this cell is commented out, so it has no effect
# at run time. cutoff_date is presumably meant for a time-based split -- TODO
# confirm, or delete the cell if it is no longer needed.
#cutoff_date = pd.Timestamp("2015-10-01")

# Disabled alternative loading of the PdM_* files (the active loading cell
# reads train_telemetry.csv / train_failures.csv instead).
#telemetry = pd.read_csv("PdM_telemetry.csv", parse_dates=["datetime"])
#failures = pd.read_csv("PdM_failures.csv", parse_dates=["datetime"])
#machines = pd.read_csv("PdM_machines.csv")

# Disabled: the error and maintenance logs are not loaded here.
#errors = pd.read_csv("PdM_errors.csv", parse_dates=["datetime"])
#maint = pd.read_csv("PdM_maint.csv", parse_dates=["datetime"])

In [4]:
# Build 3-hour rolling sensor features per machine.
#
# Output: `rolling`, one row per (machineID, datetime) with
# <sensor>_mean_3h and <sensor>_std_3h columns for each of the four sensors.

# Time-based rolling windows need each machine's readings in time order.
telemetry = telemetry.sort_values(by=["machineID", "datetime"])

rolling = (
    telemetry.set_index("datetime")
    .groupby("machineID")[["volt", "rotate", "pressure", "vibration"]]
    .rolling("3h", min_periods=1)
    .agg(["mean", "std"])
    .reset_index()
)

# reset_index() leaves two-level (sensor, statistic) column labels; flatten
# them to "<sensor>_<statistic>_3h" and keep the two key columns first.
rolling.columns = ["machineID", "datetime"] + [
    f"{col[0]}_{col[1]}_3h" for col in rolling.columns[2:]
]

# min_periods=1 admits windows that hold a single reading, for which the
# sample standard deviation is undefined (NaN). Treat those windows as having
# zero spread so no NaN reaches the feature matrix (estimators that do not
# accept missing values would otherwise fail at fit time).
std_cols = [name for name in rolling.columns if name.endswith("_std_3h")]
rolling[std_cols] = rolling[std_cols].fillna(0.0)


In [5]:
# Attach a binary target to every feature row.
# Each recorded failure is a positive event.
failures = failures.assign(label=1)

# Forward as-of join: for each feature row, look for the next failure of the
# same machine at or after the row's timestamp, no more than 24 hours ahead.
# Both inputs are sorted on the join key, as merge_asof requires.
telemetry_labels = pd.merge_asof(
    left=rolling.sort_values("datetime"),
    right=failures.sort_values("datetime"),
    on="datetime",
    by="machineID",
    tolerance=pd.Timedelta(hours=24),
    direction="forward",
)

# Rows with no failure inside the look-ahead window come back with a missing
# label; mark them as the negative class.
telemetry_labels = telemetry_labels.fillna({"label": 0})


In [6]:
# Assemble the modelling table:
#   1. left-join the machine table onto the labelled features,
#   2. one-hot encode the machine "model" (cast to string first),
#   3. drop the textual "failure" column if the label join brought it along,
#   4. drop the "datetime" column.
# NOTE(review): machineID is not dropped, so it stays in X as a feature --
# confirm this is intended.
data = (
    telemetry_labels
    .merge(machines, on="machineID", how="left")
    .astype({"model": str})
    .pipe(pd.get_dummies, columns=["model"])
    .drop(columns=["failure"], errors="ignore")
    .drop(columns=["datetime"])
)

# Split into target and feature matrix.
y = data["label"]
X = data.drop(columns=["label"])


In [7]:
# Rebalance the classes by downsampling the negatives to (at most) three per
# positive row.
# NOTE(review): this runs on the full table before train_test_split, so the
# held-out split is drawn from the rebalanced data as well; the reported
# metrics therefore reflect a 3:1 class ratio, not the original one.
train_data = pd.concat([X, y], axis=1)
class_0 = train_data[train_data.label == 0]
class_1 = train_data[train_data.label == 1]

# resample(replace=False) raises ValueError when asked for more rows than are
# available, so cap the request at the number of negatives actually present.
n_negatives = min(len(class_0), len(class_1) * 3)
class_0_downsampled = resample(
    class_0, replace=False, n_samples=n_negatives, random_state=42
)
balanced = pd.concat([class_1, class_0_downsampled])

X_bal = balanced.drop(columns=["label"])
y_bal = balanced["label"]

In [8]:
# Hold out 20% of the rebalanced rows for evaluation. Stratifying on the
# target keeps the class ratio the same in both partitions; the fixed seed
# makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y_bal)
X_train, X_test, y_train, y_test = train_test_split(X_bal, y_bal, **split_options)

In [10]:
# Fit a 100-tree random forest with a fixed seed (fit() returns the
# estimator itself, so construction and training can be chained).
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Hard class predictions for the held-out rows.
y_pred = model.predict(X_test)
# Second column of predict_proba, used as the positive-class score for AUC
# (assumes the labels are 0/1 so that column 1 is class 1).
y_prob = model.predict_proba(X_test)[:, 1]

In [11]:
# Evaluate on the held-out split: confusion matrix, per-class report, then
# ROC AUC computed from the positive-class scores.
holdout_confusion = confusion_matrix(y_test, y_pred)
print(holdout_confusion)

holdout_report = classification_report(y_test, y_pred)
print(holdout_report)

holdout_auc = roc_auc_score(y_test, y_prob)
print("AUC:", holdout_auc)

[[7546  631]
 [ 650 2075]]
              precision    recall  f1-score   support

         0.0       0.92      0.92      0.92      8177
         1.0       0.77      0.76      0.76      2725

    accuracy                           0.88     10902
   macro avg       0.84      0.84      0.84     10902
weighted avg       0.88      0.88      0.88     10902

AUC: 0.9324118555850882


In [15]:
# Same evaluation as the earlier metrics cell (confusion matrix, per-class
# report, ROC AUC).
# NOTE(review): the output recorded under this cell covers a different number
# of test rows than the earlier one, so it was presumably produced from a
# different kernel state -- confirm which run is authoritative.
print(confusion_matrix(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))
print("AUC:", roc_auc_score(y_true=y_test, y_score=y_prob))

[[9891  851]
 [ 839 2741]]
              precision    recall  f1-score   support

         0.0       0.92      0.92      0.92     10742
         1.0       0.76      0.77      0.76      3580

    accuracy                           0.88     14322
   macro avg       0.84      0.84      0.84     14322
weighted avg       0.88      0.88      0.88     14322

AUC: 0.9310577366136576
