# Import

In [15]:
import numpy as np
import pandas as pd
import mysql.connector
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, accuracy_score

# Data Load

In [16]:
DB_CONFIG = {
    "user": "root",
    "password": "Gkp>e!eSwrFEi4t",
    "host": "localhost",
    "database": "maintanence_db"
}
def connect_to_database():
    
    conn = mysql.connector.connect(**DB_CONFIG)
    print("Successfully connected to the database")
    return conn


def load_data_from_db():
    conn = connect_to_database()
    if conn is None:
        print("Connection is None - please implement connect_to_database()")
        return None

    try:
        query = """
        SELECT 
            ft.time_key,
            ft.machine_id,
            ft.component_id,
            ft.temperature_c,
            ft.vibration_rms,
            ft.power_kw,
            ft.ambient_temp_c,
            ft.accumulated_stress_index,
            ft.days_since_last_repair,
            ft.units_produced,
            ft.quality_pass,
            ft.failure_imminent,
            ft.component_failed,
            ft.operational_efficiency,
            
            dm.department,
            dm.model,
            dm.rated_capacity_tons,
            dm.software_version,
            dm.is_critical_asset,
            dm.machine_age_days,
            dm.total_repairs_lifetime,
            dm.base_temp,
            dm.base_vibe,
            dm.base_power,
            
            dc.component_type,
            dc.component_category,
            dc.expected_lifetime_hours,
            
            dt.hour,
            dt.shift,
            dt.is_weekend,
            
            do.experience_level,
            
            dtool.tool_type,
            dtool.current_usage_hours,
            dtool.expected_life_hours
            
        FROM fact_machine_telemetry ft
        LEFT JOIN dim_machine dm ON ft.machine_id = dm.machine_id
        LEFT JOIN dim_component dc ON ft.component_id = dc.component_id
        LEFT JOIN dim_time dt ON ft.time_key = dt.time_key
        LEFT JOIN dim_operator do ON ft.operator_id = do.operator_id
        LEFT JOIN dim_tool dtool ON ft.tool_id = dtool.tool_id
        WHERE ft.temperature_c IS NOT NULL 
          AND ft.vibration_rms IS NOT NULL 
          AND ft.power_kw IS NOT NULL
        LIMIT 10000
        """
        
        df = pd.read_sql(query, conn)
        print(f"Loaded {len(df)} records from database")
        return df
        
    except Exception as e:
        print(f"Error loading data: {e}")
        return None
    finally:
        if conn:
            conn.close()

df = load_data_from_db()

if df is None or df.empty:
    raise ValueError("No data loaded. Fix database connection or query.")

print(f"Dataset shape: {df.shape}")
print("\nColumns:", df.columns.tolist())
print("\nFirst few rows:")
print(df.head())


Successfully connected to the database
Loaded 10000 records from database
Dataset shape: (10000, 34)

Columns: ['time_key', 'machine_id', 'component_id', 'temperature_c', 'vibration_rms', 'power_kw', 'ambient_temp_c', 'accumulated_stress_index', 'days_since_last_repair', 'units_produced', 'quality_pass', 'failure_imminent', 'component_failed', 'operational_efficiency', 'department', 'model', 'rated_capacity_tons', 'software_version', 'is_critical_asset', 'machine_age_days', 'total_repairs_lifetime', 'base_temp', 'base_vibe', 'base_power', 'component_type', 'component_category', 'expected_lifetime_hours', 'hour', 'shift', 'is_weekend', 'experience_level', 'tool_type', 'current_usage_hours', 'expected_life_hours']

First few rows:
   time_key machine_id  component_id  temperature_c  vibration_rms   power_kw  \
0     99975    MAC-026             5        39.2497        1.38607  38.399900   
1    180981    ASS-002             5        34.5400        1.63788   0.706663   
2     97030    MAC

  df = pd.read_sql(query, conn)


# Create Failure Lables

In [17]:
for col in ['base_temp', 'base_vibe', 'base_power']:
    if col in df.columns:
        df[col] = df[col].replace(0, np.nan)
        df[col] = df[col].fillna(df[col].median())

df['temp_ratio'] = df['temperature_c'] / df['base_temp']
df['vibe_ratio'] = df['vibration_rms'] / df['base_vibe']
df['power_ratio'] = df['power_kw'] / df['base_power']

df['risk_score'] = (
    0.4 * df['temp_ratio'] +
    0.4 * df['vibe_ratio'] +
    0.2 * (df['accumulated_stress_index'] / 100.0)
)

df['failure_imminent'] = (df['risk_score'] > 1.5).astype(int)

failure_prob = np.clip(df['risk_score'] / 4.0, 0, 1)
df['component_failed'] = (np.random.random(len(df)) < failure_prob).astype(int)

# Feature Engineering

In [18]:
sort_cols = []
if 'machine_id' in df.columns:
    sort_cols.append('machine_id')
if 'time_key' in df.columns:
    sort_cols.append('time_key')

if sort_cols:
    df = df.sort_values(sort_cols)
if 'machine_id' in df.columns:
    group = df.groupby('machine_id')
    df['temp_rolling_mean'] = group['temperature_c'].rolling(5, min_periods=1).mean().reset_index(level=0, drop=True)
    df['vibe_rolling_std'] = group['vibration_rms'].rolling(5, min_periods=1).std().reset_index(level=0, drop=True)
    df['power_gradient'] = group['power_kw'].diff().reset_index(level=0, drop=True)
else:
    df['temp_rolling_mean'] = df['temperature_c'].rolling(5, min_periods=1).mean()
    df['vibe_rolling_std'] = df['vibration_rms'].rolling(5, min_periods=1).std()
    df['power_gradient'] = df['power_kw'].diff()

df['temp_rolling_mean'] = df['temp_rolling_mean'].fillna(df['temperature_c'])
df['vibe_rolling_std'] = df['vibe_rolling_std'].fillna(0)
df['power_gradient'] = df['power_gradient'].fillna(0)

# Stress growth
df['stress_growth'] = df['accumulated_stress_index'].diff().clip(lower=0).fillna(0)

# Handle Categorical Variables

In [19]:
print("\nPreprocessing data (label encoding)...")

categorical_cols = [
    'machine_id',
    'component_type',
    'experience_level',
    'department',
    'model',
    'shift',
    'tool_type'
]

label_encoders = {}
for col in categorical_cols:
    if col in df.columns:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))
        label_encoders[col] = le

print("Categorical encoding done.")


Preprocessing data (label encoding)...
Categorical encoding done.


# BUILD FEATURE MATRIX X AND TARGETS y

In [20]:
feature_columns = [
    'temperature_c',
    'vibration_rms',
    'power_kw',
    'ambient_temp_c',
    'accumulated_stress_index',
    'days_since_last_repair',
    'units_produced',
    'quality_pass',
    'machine_age_days',
    'total_repairs_lifetime',
    'hour',
    'is_weekend',
    'current_usage_hours',
    'temp_ratio',
    'vibe_ratio',
    'power_ratio',
    'temp_rolling_mean',
    'vibe_rolling_std',
    'power_gradient',
    'stress_growth',
    'risk_score'
]

for col in ['component_type', 'experience_level', 'department', 'model', 'shift', 'tool_type']:
    if col in df.columns:
        feature_columns.append(col)

feature_columns = [c for c in feature_columns if c in df.columns]

X = df[feature_columns].fillna(0)
if 'RUL' in df.columns:
    y_rul = df['RUL']
else:
    base_rul = 6000
    penalty_age = df['machine_age_days'].fillna(df['machine_age_days'].median()) * 0.5
    penalty_stress = df['accumulated_stress_index'].fillna(df['accumulated_stress_index'].median()) * 5
    noise = np.random.normal(0, 500, size=len(df))
    y_rul = np.maximum(100, base_rul - penalty_age - penalty_stress + noise)

y_machine_failure = df['failure_imminent']
y_component_failure = df['component_failed']

print(f"\nFeature matrix shape: {X.shape}")
print(f"RUL target shape: {y_rul.shape}")
print(f"Machine failure target shape: {y_machine_failure.shape}")
print(f"Component failure target shape: {y_component_failure.shape}")


Feature matrix shape: (10000, 27)
RUL target shape: (10000,)
Machine failure target shape: (10000,)
Component failure target shape: (10000,)


# SINGLE TRAIN/TEST SPLIT FOR ALL TARGETS

In [21]:
train_idx, test_idx = train_test_split(
    df.index,
    test_size=0.2,
    random_state=42
)

X_train = X.loc[train_idx]
X_test  = X.loc[test_idx]

y_rul_train = y_rul.loc[train_idx]
y_rul_test  = y_rul.loc[test_idx]

y_machine_train = y_machine_failure.loc[train_idx]
y_machine_test  = y_machine_failure.loc[test_idx]

y_component_train = y_component_failure.loc[train_idx]
y_component_test  = y_component_failure.loc[test_idx]

print("\nTrain/test split done.")
print(f"X_train: {X_train.shape}, X_test: {X_test.shape}")


Train/test split done.
X_train: (8000, 27), X_test: (2000, 27)


In [22]:
num_cols = X_train.select_dtypes(include=['int64', 'float64']).columns

scaler = StandardScaler()
X_train_scaled = X_train.copy()
X_test_scaled  = X_test.copy()

X_train_scaled[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test_scaled[num_cols]  = scaler.transform(X_test[num_cols])

print("Scaling done.")

Scaling done.


# SIMPLE AUTOML FOR RUL 
####   Try multiple models + hyperparameters

In [23]:
print("\n" + "="*60)
print("AUTO ML FOR RUL PREDICTION (REGRESSION)")
print("="*60)

models_and_params = {
    "LinearRegression": {
        "model": LinearRegression(),
        "params": {}
    },
    "RandomForestRegressor": {
        "model": RandomForestRegressor(random_state=42, n_jobs=-1),
        "params": {
            "n_estimators": [100, 300, 500],
            "max_depth": [10, 20, None],
            "min_samples_split": [2, 5, 10],
            "min_samples_leaf": [1, 2, 4]
        }
    },
    "GradientBoostingRegressor": {
        "model": GradientBoostingRegressor(random_state=42),
        "params": {
            "n_estimators": [100, 200],
            "learning_rate": [0.01, 0.05, 0.1],
            "max_depth": [3, 5, 7]
        }
    },
    "SVR": {
        "model": SVR(),
        "params": {
            "C": [0.1, 1, 10],
            "gamma": ['scale', 'auto'],
            "kernel": ['rbf']
        }
    }
}

rul_results = {}
best_overall_model = None
best_overall_name = None
best_overall_mae = float('inf')

for name, cfg in models_and_params.items():
    print(f"\n>>> Tuning {name}...")
    base_model = cfg["model"]
    param_dist = cfg["params"]

    if param_dist:
        search = RandomizedSearchCV(
            base_model,
            param_distributions=param_dist,
            n_iter=10,
            scoring="neg_mean_absolute_error",
            cv=3,
            random_state=42,
            n_jobs=-1
        )
        search.fit(X_train_scaled, y_rul_train)
        best_model = search.best_estimator_
        print(f"Best params for {name}: {search.best_params_}")
    else:
        best_model = base_model
        best_model.fit(X_train_scaled, y_rul_train)
        print(f"{name} has no hyperparameters tuned (used default).")

    y_pred = best_model.predict(X_test_scaled)
    mae = mean_absolute_error(y_rul_test, y_pred)
    mse = mean_squared_error(y_rul_test, y_pred)

    rul_results[name] = {
        "model": best_model,
        "MAE": mae,
        "MSE": mse
    }

    print(f"{name} - Test MAE: {mae:.2f}, MSE: {mse:.2f}")

    if mae < best_overall_mae:
        best_overall_mae = mae
        best_overall_model = best_model
        best_overall_name = name

print("\n" + "="*60)
print("RUL MODEL COMPARISON (AUTO ML RESULT)")
print("="*60)
for name, res in rul_results.items():
    print(f"{name:25} - MAE: {res['MAE']:.2f}, MSE: {res['MSE']:.2f}")

print(f"\nBest RUL Model: {best_overall_name} with MAE = {best_overall_mae:.2f}")


AUTO ML FOR RUL PREDICTION (REGRESSION)

>>> Tuning LinearRegression...
LinearRegression has no hyperparameters tuned (used default).
LinearRegression - Test MAE: 397.65, MSE: 248608.81

>>> Tuning RandomForestRegressor...
Best params for RandomForestRegressor: {'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_depth': 10}
RandomForestRegressor - Test MAE: 401.71, MSE: 252205.80

>>> Tuning GradientBoostingRegressor...
Best params for GradientBoostingRegressor: {'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.01}
GradientBoostingRegressor - Test MAE: 400.04, MSE: 250739.31

>>> Tuning SVR...




Best params for SVR: {'kernel': 'rbf', 'gamma': 'scale', 'C': 10}
SVR - Test MAE: 404.09, MSE: 255878.69

RUL MODEL COMPARISON (AUTO ML RESULT)
LinearRegression          - MAE: 397.65, MSE: 248608.81
RandomForestRegressor     - MAE: 401.71, MSE: 252205.80
GradientBoostingRegressor - MAE: 400.04, MSE: 250739.31
SVR                       - MAE: 404.09, MSE: 255878.69

Best RUL Model: LinearRegression with MAE = 397.65


# MACHINE FAILURE PREDICTION (CLASSIFICATION)

In [24]:
print("\n" + "="*60)
print("AUTO ML FOR MACHINE FAILURE PREDICTION (CLASSIFICATION)")
print("="*60)

machine_models_and_params = {
    "LogisticRegression": {
        "model": LogisticRegression(max_iter=500),
        "params": {
            "C": [0.1, 1, 10],
            "solver": ["lbfgs", "liblinear"]
        }
    },
    "RandomForestClassifier": {
        "model": RandomForestClassifier(random_state=42, n_jobs=-1),
        "params": {
            "n_estimators": [100, 300],
            "max_depth": [10, 20, None],
            "min_samples_split": [2, 5]
        }
    },
    "GradientBoostingClassifier": {
        "model": GradientBoostingClassifier(random_state=42),
        "params": {
            "n_estimators": [100, 200],
            "learning_rate": [0.05, 0.1],
            "max_depth": [3, 5]
        }
    },
    "SVC": {
        "model": SVC(probability=True),
        "params": {
            "C": [0.1, 1, 10],
            "gamma": ['scale', 'auto'],
            "kernel": ['rbf']
        }
    }
}

machine_results = {}
best_machine_model = None
best_machine_name = None
best_machine_auc = -1  # higher is better

for name, cfg in machine_models_and_params.items():
    print(f"\n>>> Tuning {name}...")
    base_model = cfg["model"]
    param_dist = cfg["params"]

    search = RandomizedSearchCV(
        base_model,
        param_distributions=param_dist,
        n_iter=6,
        scoring="roc_auc",
        cv=2,
        random_state=42,
        n_jobs=-1
    )
    search.fit(X_train_scaled, y_machine_train)

    best_model = search.best_estimator_
    print(f"Best params for {name}: {search.best_params_}")

    # Predictions
    y_proba = best_model.predict_proba(X_test_scaled)[:, 1]

    auc = roc_auc_score(y_machine_test, y_proba)
    acc = accuracy_score(y_machine_test, best_model.predict(X_test_scaled))

    machine_results[name] = {"AUC": auc, "Accuracy": acc}

    print(f"{name} - AUC: {auc:.4f}, Accuracy: {acc:.4f}")

    if auc > best_machine_auc:
        best_machine_auc = auc
        best_machine_model = best_model
        best_machine_name = name


print("\n" + "="*60)
print("MACHINE FAILURE MODEL COMPARISON")
print("="*60)
for name, res in machine_results.items():
    print(f"{name:25} - AUC: {res['AUC']:.4f}, ACC: {res['Accuracy']:.4f}")

print(f"\nBest Machine Failure Model: {best_machine_name} with AUC = {best_machine_auc:.4f}")



AUTO ML FOR MACHINE FAILURE PREDICTION (CLASSIFICATION)

>>> Tuning LogisticRegression...
Best params for LogisticRegression: {'solver': 'lbfgs', 'C': 1}
LogisticRegression - AUC: 1.0000, Accuracy: 0.9995

>>> Tuning RandomForestClassifier...
Best params for RandomForestClassifier: {'n_estimators': 100, 'min_samples_split': 5, 'max_depth': None}
RandomForestClassifier - AUC: 1.0000, Accuracy: 1.0000

>>> Tuning GradientBoostingClassifier...
Best params for GradientBoostingClassifier: {'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.05}
GradientBoostingClassifier - AUC: 1.0000, Accuracy: 1.0000

>>> Tuning SVC...
Best params for SVC: {'kernel': 'rbf', 'gamma': 'scale', 'C': 0.1}
SVC - AUC: 1.0000, Accuracy: 0.9925

MACHINE FAILURE MODEL COMPARISON
LogisticRegression        - AUC: 1.0000, ACC: 0.9995
RandomForestClassifier    - AUC: 1.0000, ACC: 1.0000
GradientBoostingClassifier - AUC: 1.0000, ACC: 1.0000
SVC                       - AUC: 1.0000, ACC: 0.9925

Best Machine Failure

# COMPONENT FAILURE PREDICTION (CLASSIFICATION)

In [25]:
print("\n" + "="*60)
print("AUTO ML FOR COMPONENT FAILURE PREDICTION (CLASSIFICATION)")
print("="*60)

component_models_and_params = {
    "LogisticRegression": {
        "model": LogisticRegression(max_iter=500),
        "params": {
            "C": [0.1, 1, 10],
            "solver": ["lbfgs", "liblinear"]
        }
    },
    "RandomForestClassifier": {
        "model": RandomForestClassifier(random_state=42, n_jobs=-1),
        "params": {
            "n_estimators": [100, 300],
            "max_depth": [10, 20, None],
            "min_samples_split": [2, 5]
        }
    },
    "GradientBoostingClassifier": {
        "model": GradientBoostingClassifier(random_state=42),
        "params": {
            "n_estimators": [100, 200],
            "learning_rate": [0.05, 0.1],
            "max_depth": [3, 5]
        }
    },
    "SVC": {
        "model": SVC(probability=True),
        "params": {
            "C": [0.1, 1, 10],
            "gamma": ['scale', 'auto'],
            "kernel": ['rbf']
        }
    }
}

component_results = {}
best_component_model = None
best_component_name = None
best_component_auc = -1

for name, cfg in component_models_and_params.items():

    print(f"\n>>> Tuning {name}...")
    base_model = cfg["model"]
    param_dist = cfg["params"]

    search = RandomizedSearchCV(
        base_model,
        param_distributions=param_dist,
        n_iter=6,
        scoring="roc_auc",
        cv=2,
        random_state=42,
        n_jobs=-1
    )

    search.fit(X_train_scaled, y_component_train)
    best_model = search.best_estimator_

    print(f"Best params for {name}: {search.best_params_}")

    y_proba = best_model.predict_proba(X_test_scaled)[:, 1]
    auc = roc_auc_score(y_component_test, y_proba)
    acc = accuracy_score(y_component_test, best_model.predict(X_test_scaled))

    component_results[name] = {"AUC": auc, "Accuracy": acc}

    print(f"{name} - AUC: {auc:.4f}, Accuracy: {acc:.4f}")

    if auc > best_component_auc:
        best_component_auc = auc
        best_component_model = best_model
        best_component_name = name


print("\n" + "="*60)
print("COMPONENT FAILURE MODEL COMPARISON")
print("="*60)
for name, res in component_results.items():
    print(f"{name:25} - AUC: {res['AUC']:.4f}, ACC: {res['Accuracy']:.4f}")

print(f"\nBest Component Failure Model: {best_component_name} with AUC = {best_component_auc:.4f}")



AUTO ML FOR COMPONENT FAILURE PREDICTION (CLASSIFICATION)

>>> Tuning LogisticRegression...
Best params for LogisticRegression: {'solver': 'lbfgs', 'C': 10}
LogisticRegression - AUC: 0.5167, Accuracy: 0.8080

>>> Tuning RandomForestClassifier...
Best params for RandomForestClassifier: {'n_estimators': 100, 'min_samples_split': 2, 'max_depth': 10}
RandomForestClassifier - AUC: 0.5161, Accuracy: 0.8070

>>> Tuning GradientBoostingClassifier...
Best params for GradientBoostingClassifier: {'n_estimators': 100, 'max_depth': 3, 'learning_rate': 0.05}
GradientBoostingClassifier - AUC: 0.5083, Accuracy: 0.8090

>>> Tuning SVC...
Best params for SVC: {'kernel': 'rbf', 'gamma': 'scale', 'C': 1}
SVC - AUC: 0.5061, Accuracy: 0.8065

COMPONENT FAILURE MODEL COMPARISON
LogisticRegression        - AUC: 0.5167, ACC: 0.8080
RandomForestClassifier    - AUC: 0.5161, ACC: 0.8070
GradientBoostingClassifier - AUC: 0.5083, ACC: 0.8090
SVC                       - AUC: 0.5061, ACC: 0.8065

Best Component Fail