In [None]:
# ==============================
# Step 0: Imports
# ==============================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier



Dataset shape: (20631, 26)


Unnamed: 0,unit_nr,time_cycles,operational_setting_1,operational_setting_2,operational_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_12,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,522.19,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044


In [None]:

# ==============================
# Step 1: Load Dataset (CMAPSS FD001)
# ==============================
def load_cmapss_data(filename):
    # CMAPSS FD001 has 26 columns -> unit_nr, time_cycles + 21 sensors + 3 settings
    col_names = ['unit_nr', 'time_cycles'] + [f'operational_setting_{i}' for i in range(1,4)] + [f'sensor_{i}' for i in range(1,22)]
    df = pd.read_csv(filename, sep=" ", header=None)
    df = df.dropna(axis=1, how='all')   # remove empty columns
    df.columns = col_names
    return df

df = load_cmapss_data("/Users/rohanpatil/Desktop/ai_ml_projects/project_1/data/CMAPSSData/train_FD001.txt")
print("Dataset shape:", df.shape)
df.head()


In [46]:
# ==============================
# Step 2: Add Rolling Features
# ==============================
def add_features(df):
    for col in [c for c in df.columns if 'sensor' in c]:
        df[col+"_rollmean"] = df[col].rolling(window=5, min_periods=1).mean()
        df[col+"_diff"] = df[col].diff().fillna(0)
    return df

df = add_features(df)

# ==============================
# Step 3: Calculate Remaining Useful Life (RUL)
# ==============================
def calculate_rul(df):
    rul_df = df.groupby('unit_nr')['time_cycles'].max().reset_index()
    rul_df.columns = ['unit_nr','max_cycles']
    df = df.merge(rul_df, on='unit_nr', how='left')
    df['RUL'] = df['max_cycles'] - df['time_cycles']
    return df,rul_df

df,rul_df = calculate_rul(df)

# ==============================
# Step 4: Binary Classification Label (Failure Risk <= 30 cycles)
# ==============================
def add_labels(df, threshold=30):
    df['label'] = (df['RUL'] <= threshold).astype(int)
    return df

df = add_labels(df)
df[['unit_nr','time_cycles','RUL','label']].head(10)


Unnamed: 0,unit_nr,time_cycles,RUL,label
0,1,1,191,0
1,1,2,190,0
2,1,3,189,0
3,1,4,188,0
4,1,5,187,0
5,1,6,186,0
6,1,7,185,0
7,1,8,184,0
8,1,9,183,0
9,1,10,182,0


In [48]:
df

Unnamed: 0,unit_nr,time_cycles,operational_setting_1,operational_setting_2,operational_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_18_diff,sensor_19_rollmean,sensor_19_diff,sensor_20_rollmean,sensor_20_diff,sensor_21_rollmean,sensor_21_diff,max_cycles,RUL,label
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.70,1400.60,14.62,...,0.0,100.0,0.0,39.060000,0.00,23.419000,0.0000,192,191,0
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,0.0,100.0,0.0,39.030000,-0.06,23.421300,0.0046,192,190,0
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.20,14.62,...,0.0,100.0,0.0,39.003333,-0.05,23.395600,-0.0794,192,189,0
3,1,4,0.0007,0.0000,100.0,518.67,642.35,1582.79,1401.87,14.62,...,0.0,100.0,0.0,38.972500,-0.07,23.390175,0.0297,192,188,0
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,0.0,100.0,0.0,38.958000,0.02,23.393020,0.0305,192,187,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20626,100,196,-0.0004,-0.0003,100.0,518.67,643.49,1597.98,1428.63,14.62,...,0.0,100.0,0.0,38.408000,0.35,23.079640,-0.2188,200,4,1
20627,100,197,-0.0016,-0.0005,100.0,518.67,643.54,1604.50,1433.58,14.62,...,0.0,100.0,0.0,38.356000,-0.19,23.096120,0.1859,200,3,1
20628,100,198,0.0004,0.0000,100.0,518.67,643.42,1602.46,1428.18,14.62,...,0.0,100.0,0.0,38.350000,0.14,23.078180,-0.2261,200,2,1
20629,100,199,-0.0011,0.0003,100.0,518.67,643.23,1605.26,1426.53,14.62,...,0.0,100.0,0.0,38.332000,-0.15,23.064500,0.1307,200,1,1


In [47]:
df.head()

Unnamed: 0,unit_nr,time_cycles,operational_setting_1,operational_setting_2,operational_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_18_diff,sensor_19_rollmean,sensor_19_diff,sensor_20_rollmean,sensor_20_diff,sensor_21_rollmean,sensor_21_diff,max_cycles,RUL,label
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,0.0,100.0,0.0,39.06,0.0,23.419,0.0,192,191,0
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,0.0,100.0,0.0,39.03,-0.06,23.4213,0.0046,192,190,0
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,0.0,100.0,0.0,39.003333,-0.05,23.3956,-0.0794,192,189,0
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,0.0,100.0,0.0,38.9725,-0.07,23.390175,0.0297,192,188,0
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,0.0,100.0,0.0,38.958,0.02,23.39302,0.0305,192,187,0


In [23]:
# ==============================
# Step 5: Train/Test Split
# ==============================
features = [c for c in df.columns if 'sensor' in c]
X = df[features]
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# ==============================
# Logistic Regression
# ==============================
lr = LogisticRegression(max_iter=500)
lr.fit(X_train, y_train)
preds_lr = lr.predict(X_test)
print("Logistic Regression Report:\n", classification_report(y_test, preds_lr))


Logistic Regression Report:
               precision    recall  f1-score   support

           0       0.97      0.97      0.97      3538
           1       0.84      0.85      0.85       589

    accuracy                           0.96      4127
   macro avg       0.91      0.91      0.91      4127
weighted avg       0.96      0.96      0.96      4127



In [24]:
# ==============================
# Random Forest
# ==============================
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train, y_train)
preds_rf = rf.predict(X_test)
print("Random Forest Report:\n", classification_report(y_test, preds_rf))

Random Forest Report:
               precision    recall  f1-score   support

           0       0.97      0.98      0.98      3538
           1       0.89      0.84      0.86       589

    accuracy                           0.96      4127
   macro avg       0.93      0.91      0.92      4127
weighted avg       0.96      0.96      0.96      4127



In [27]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
preds_dt = dt.predict(X_test)
print("Decision Tree Report:\n", classification_report(y_test, preds_dt))

Decision Tree Report:
               precision    recall  f1-score   support

           0       0.97      0.97      0.97      3538
           1       0.80      0.82      0.81       589

    accuracy                           0.95      4127
   macro avg       0.89      0.89      0.89      4127
weighted avg       0.95      0.95      0.95      4127

