In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report

# Step 1: Read the engineered sensor readings into a DataFrame
df = pd.read_csv('../data/engineered_sensor_data.csv')

# Step 2: Derive one alert threshold per sensor channel
#   - temperature: 95th percentile of the observed values
#   - vibration:   mean plus two standard deviations of the observed values
#   - pressure:    fixed placeholder value (adjust as per domain knowledge)
temperature_readings = df['temperature']
vibration_readings = df['vibration']

temp_threshold = temperature_readings.quantile(0.95)
vibration_threshold = vibration_readings.mean() + 2 * vibration_readings.std()
pressure_threshold = 100

# Echo the thresholds (one per line) so they can be sanity-checked
print(
    f"Temperature Threshold: {temp_threshold}",
    f"Vibration Threshold: {vibration_threshold}",
    f"Pressure Threshold: {pressure_threshold}",
    sep="\n",
)

# Step 3: Flag every row in which at least one sensor reaches its threshold
temperature_alert = df['temperature'] >= temp_threshold
vibration_alert = df['vibration'] >= vibration_threshold
pressure_alert = df['pressure'] >= pressure_threshold

# 1 when any of the three alerts fires, 0 otherwise
any_alert = temperature_alert | vibration_alert | pressure_alert
df['maintenance_needed'] = any_alert.astype('int64')

# The label column 'maintenance_required' is assumed to exist in the CSV.
# Every other column, including the derived 'maintenance_needed' flag,
# is kept as a feature.
target_column = 'maintenance_required'
y = df[target_column]  # Target
X = df.drop(columns=[target_column])  # Features

# Step 4: Partition the feature columns by dtype
numeric_cols = X.select_dtypes(include='number').columns
categorical_cols = X.select_dtypes(include='object').columns

# Step 5: Build the preprocessing stage
# Numeric columns: fill missing values with the column mean.
mean_imputer = SimpleImputer(strategy='mean')
numeric_transformer = Pipeline(steps=[('imputer', mean_imputer)])

# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode; categories unseen during fit are ignored at transform time.
mode_imputer = SimpleImputer(strategy='most_frequent')
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')
categorical_transformer = Pipeline(steps=[
    ('imputer', mode_imputer),
    ('onehot', one_hot_encoder),
])

# Route each group of columns through its own transformer
column_routes = [
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_routes)

# Step 6: Fill gaps in the target (y), if there are any
# NOTE(review): this replaces missing labels with the most frequent class, so
# those rows are trained and scored against filled-in labels; confirm this is
# intended rather than dropping the unlabeled rows.
if y.isnull().any():
    y_imputer = SimpleImputer(strategy='most_frequent')
    y_as_column = y.values.reshape(-1, 1)  # the imputer expects 2-D input
    y = y_imputer.fit_transform(y_as_column).ravel()  # back to 1-D

# Step 7: Hold out 20% of the rows as a test set (fixed seed for repeatability)
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Step 8: Instantiate the four classifiers with a common random seed
seed_kwargs = {'random_state': 42}
rf_model = RandomForestClassifier(**seed_kwargs)
svm_model = SVC(**seed_kwargs)
lr_model = LogisticRegression(**seed_kwargs)
dt_model = DecisionTreeClassifier(**seed_kwargs)


# Step 9: Put the preprocessing stage in front of every classifier
def _attach_preprocessing(estimator):
    """Return a pipeline that runs ``preprocessor`` and then *estimator*.

    The same ``preprocessor`` object is placed in every pipeline built here.
    """
    return Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', estimator)])


# Parallel lists (same order) used when iterating over the models
pipeline_names = ['Random Forest', 'SVM', 'Logistic Regression', 'Decision Tree']
pipelines = [
    _attach_preprocessing(estimator)
    for estimator in (rf_model, svm_model, lr_model, dt_model)
]
rf_pipeline, svm_pipeline, lr_pipeline, dt_pipeline = pipelines

# Step 10: Fit each pipeline on the training split and report its metrics
# on the held-out test split.
for pipeline, name in zip(pipelines, pipeline_names):
    print(f"Training {name}...")
    pipeline.fit(X_train, y_train)
    print(f"Evaluating {name}...")
    y_pred = pipeline.predict(X_test)
    print(f"Classification Report for {name}:")
    # A class can be absent from y_test or from y_pred, which makes its
    # precision/recall undefined. zero_division=0 reports those metrics as 0
    # (the same value the default "warn" setting uses) without emitting an
    # UndefinedMetricWarning for every affected report.
    print(classification_report(y_test, y_pred, zero_division=0))
    print("-----------------------------------------------------")

# Step 11: Turn each pipeline's test-set predictions into a boolean
# maintenance flag. The pipelines must already have been fitted (Step 10).
for name, pipeline in zip(pipeline_names, pipelines):
    print(f"Predicting maintenance requirement using {name}...")
    y_pred = pipeline.predict(X_test)
    # True where class 1 is predicted (assumed to mean "maintenance needed")
    maintenance_pred = y_pred == 1
    print(
        f"{name} Predictions:",
        maintenance_pred,
        "-----------------------------------------------------",
        sep="\n",
    )

# # Step 12: Evaluate model performance
# for pipeline, name in zip(pipelines, pipeline_names):
#     print(f"Evaluating {name} model performance...")
#     y_pred = pipeline.predict(X_test)
#     print(f"Classification Report for {name}:")
#     print(classification_report(y_test, y_pred))
#     print("-----------------------------------------------------")



Temperature Threshold: 25.855
Vibration Threshold: 0.0610818510677892
Pressure Threshold: 100
Training Random Forest...
Evaluating Random Forest...
Classification Report for Random Forest:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00         3

    accuracy                           1.00         3
   macro avg       1.00      1.00      1.00         3
weighted avg       1.00      1.00      1.00         3

-----------------------------------------------------
Training SVM...
Evaluating SVM...
Classification Report for SVM:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00         3

    accuracy                           1.00         3
   macro avg       1.00      1.00      1.00         3
weighted avg       1.00      1.00      1.00         3

-----------------------------------------------------
Training Logistic Regression...
Evaluating Logistic Regression...
Classification Report for 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
