In [1]:
# Step 1 & 2 - Prepare the data (from ChatGPT)
import pandas as pd

# Columns fed to the model as inputs, and the column we want to predict.
FEATURE_COLUMNS = ['Pressure', 'TempA', 'TempB', 'Speed', 'Thickness']
TARGET_COLUMN = 'Status'

# Load the raw dataset (path is relative to the notebook's working directory).
df = pd.read_csv("../data/raw/predictive_maintenance_data.csv")

# Quick sanity checks on the frame's dimensions.
print("Rows:", df.shape[0])
print("Columns:", df.shape[1])
# NOTE: a bare `df.head()` used to sit here. IPython only displays the value
# of the LAST expression in a cell, so that mid-cell call computed a preview
# and silently threw it away. It has been removed; the feature preview printed
# below covers the same need.

# Select features (inputs).
X = df[FEATURE_COLUMNS]

# Target (what we want to predict).
y = df[TARGET_COLUMN]

# Preview the inputs and show how many rows fall into each target class;
# the class counts make any imbalance visible before modelling.
print(X.head())
print(y.value_counts())


Rows: 34560
Columns: 7
   Pressure      TempA      TempB      Speed  Thickness
0  9.433970  83.107422  91.113042  62.784998  20.830466
1  8.746509  83.461400  92.776336  68.370730  20.933260
2  8.632601  83.832500  94.443207  61.081876  21.586823
3  8.329277  83.021493  91.271475  64.555394  20.522823
4  8.732501  81.608363  93.732991  60.089657  20.275720
Status
0    34485
1       75
Name: count, dtype: int64


In [2]:
# Step 3 - Train/Test split + Scaling (from ChatGPT)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 20% of the rows for testing. `stratify=y` keeps the proportion of
# each Status value the same in both partitions, and the fixed seed makes the
# split repeatable.
partitions = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = partitions

print("Train size:", X_train.shape, " Test size:", X_test.shape)

# Standardise the features (important for Logistic Regression). The scaler
# learns its statistics from the training rows only, and those same statistics
# are then applied to both partitions so nothing from the test set leaks in.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Scaled shapes:", X_train_scaled.shape, X_test_scaled.shape)


Train size: (27648, 5)  Test size: (6912, 5)
Scaled shapes: (27648, 5) (6912, 5)


In [3]:
# Step 4 — Train Logistic Regression model
from sklearn.linear_model import LogisticRegression

# Build the classifier and fit it on the scaled training data in one
# expression (`fit` returns the estimator itself). `max_iter=1000` raises the
# solver's iteration cap.
model = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)

print("Model trained ✅")


Model trained ✅


In [4]:
# Evaluate the trained model on the held-out test set using the labels
# returned by `model.predict`.
from sklearn.metrics import confusion_matrix, classification_report

y_pred = model.predict(X_test_scaled)

# Confusion matrix: rows are the true classes, columns the predicted classes.
print("Confusion matrix:", confusion_matrix(y_test, y_pred), sep="\n")

# Per-class precision / recall / F1 plus the overall averages.
print("\nClassification report:", classification_report(y_test, y_pred), sep="\n")


Confusion matrix:
[[6897    0]
 [   7    8]]

Classification report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6897
           1       1.00      0.53      0.70        15

    accuracy                           1.00      6912
   macro avg       1.00      0.77      0.85      6912
weighted avg       1.00      1.00      1.00      6912



In [5]:
# Re-evaluate the model with a custom decision threshold instead of the labels
# produced by `model.predict`.
import numpy as np

# Predicted probability for the second entry of `model.classes_` (column 1),
# which is class 1 for the 0/1 Status labels used here.
y_prob = model.predict_proba(X_test_scaled)[:, 1]

# Try a lower threshold: flag a row as class 1 once its probability reaches
# this value, trading some precision for higher recall on the rare class.
# NOTE(review): the threshold is being assessed on the test set itself, so the
# metrics below are optimistic; choose it on a validation split (or via
# cross-validation) before reporting final numbers.
threshold = 0.2
y_pred_custom = (y_prob >= threshold).astype(int)

# Build the label from `threshold` so the printed heading can never drift out
# of sync with the value actually applied above.
print(f"Confusion matrix (threshold = {threshold}):")
print(confusion_matrix(y_test, y_pred_custom))

print("\nClassification report:")
print(classification_report(y_test, y_pred_custom))


Confusion matrix (threshold = 0.2):
[[6893    4]
 [   3   12]]

Classification report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6897
           1       0.75      0.80      0.77        15

    accuracy                           1.00      6912
   macro avg       0.87      0.90      0.89      6912
weighted avg       1.00      1.00      1.00      6912

