In [2]:
import pickle
from pathlib import Path
from typing import List, Any, Dict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from fastapi import HTTPException
from pandas import DataFrame
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder


In [37]:
# Read the preprocessed FD001 table into the working frame. The path is
# relative to the notebook's working directory and has no file extension.
# NOTE(review): the preview of this frame lists an 'Unnamed: 0' header. If that
# is a saved index column rather than the displayed row index, it stays in the
# feature matrix built later -- confirm.
path = '../data/processed/FD001'
df = pd.read_csv(filepath_or_buffer=path)

In [38]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,Unit,Cycle,sensor_2,sensor_3,sensor_4,sensor_7,sensor_8,sensor_9,sensor_11,sensor_12,...,sensor_11_rolling_std,sensor_12_rolling_std,sensor_13_rolling_std,sensor_14_rolling_std,sensor_15_rolling_std,sensor_17_rolling_std,sensor_20_rolling_std,sensor_21_rolling_std,RUL,RUL_class
0,1,10,641.71,1591.24,1400.46,553.59,2388.05,9051.7,47.03,521.79,...,0.144207,0.400949,0.020111,3.333946,0.022654,0.918937,0.070111,0.044857,182,medium
1,1,11,642.28,1581.75,1400.64,554.54,2388.05,9049.61,47.15,521.4,...,0.130213,0.442267,0.021628,3.779342,0.023876,0.918937,0.065794,0.052522,181,medium
2,1,12,642.06,1583.41,1400.15,554.52,2388.09,9049.37,47.18,521.8,...,0.096661,0.448969,0.02044,3.804252,0.023,0.948683,0.071149,0.052264,180,medium
3,1,13,643.07,1582.19,1400.83,553.44,2388.12,9046.82,47.38,521.85,...,0.109747,0.435871,0.02406,4.061997,0.022867,0.816497,0.072296,0.062411,179,medium
4,1,14,642.35,1592.95,1399.16,554.48,2388.09,9047.37,47.44,521.67,...,0.124316,0.328843,0.023688,4.093274,0.019155,0.875595,0.088468,0.062331,178,medium


In [39]:
# Raw sensor channels to remove from the working frame. The columns that remain
# include the derived `*_rolling_std` features visible in the preview.
sensor_names = [
    f'sensor_{i}'
    for i in (2, 3, 4, 7, 8, 9, 11, 12, 13, 14, 15, 17, 20, 21)
]

# `errors='ignore'` keeps this cell idempotent: `df` is overwritten in place, so
# a second run would otherwise raise KeyError because the columns are already gone.
df = df.drop(columns=sensor_names, errors='ignore')

In [40]:
# Columns that must not be used as model inputs; everything else in `df`
# becomes a feature.
non_feature_cols = ['Unit', 'Cycle', 'RUL', 'RUL_class']

X = df.drop(columns=non_feature_cols)
y = df['RUL_class']  # classification target

# Sanity check: both must have the same number of rows.
print(f"Shape of X: {X.shape}")
print(f"Shape of y: {y.shape}")

Shape of X: (19731, 42)
Shape of y: (19731,)


In [41]:
# Encode the RUL class labels as integer codes for the classifiers.
le = LabelEncoder()

# Fit from the source column rather than from `y` itself: `y` is overwritten
# here, so re-running the cell with `fit_transform(y)` would re-fit the encoder
# on the integer codes and lose the original class names in `le.classes_`.
y = le.fit_transform(df['RUL_class'])

# Class label -> integer code, cast to built-in types so the printed mapping
# does not depend on how NumPy renders its scalar types.
le_mapping = {
    str(label): int(code)
    for label, code in zip(le.classes_, le.transform(le.classes_))
}

print(le_mapping)


{'long': 0, 'medium': 1, 'short': 2, 'urgent': 3}


In [42]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the shuffle
# reproducible.
# NOTE(review): the split is row-wise and unstratified. `df` carries a `Unit`
# column and rolling-window features, so rows from the same unit presumably end
# up on both sides of the split -- confirm whether a per-unit and/or stratified
# split is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Wrap the encoded labels so the class proportions can be tabulated.
y_train_df = pd.DataFrame(y_train, columns=['class'])
y_test_df = pd.DataFrame(y_test, columns=['class'])

print(f"Shape of the training set: {X_train.shape}")
print(f"Shape of the test set: {X_test.shape}")

print("Percentage of classes in the training set:")
print(y_train_df.value_counts(normalize=True))

print("Percentage of classes in the test set:")
print(y_test_df.value_counts(normalize=True))


Shape of the training set: (15784, 42)
Shape of the test set: (3947, 42)
Percentage of classes in the training set:
class
2        0.378294
1        0.281298
3        0.259883
0        0.080525
Name: proportion, dtype: float64
Percentage of classes in the test set:
class
2        0.385609
1        0.281733
3        0.252850
0        0.079807
Name: proportion, dtype: float64


In [43]:
# Persist the train/test partitions (features plus encoded label) as CSV files.
#
# `.assign` returns a new frame, so the label column is never added to
# `X_train` / `X_test`, which are still used as feature matrices afterwards.
# Re-wrapping them with `pd.DataFrame(X_train, ...)` depends on the
# constructor's non-copying default for DataFrame input.
train_df = X_train.assign(RUL_class=y_train)
test_df = X_test.assign(RUL_class=y_test)

# Create the target directory if needed so the cell also works on a fresh checkout.
production_dir = Path('../data/production')
production_dir.mkdir(parents=True, exist_ok=True)

# index=False: the shuffled original row index is not written out.
train_df.to_csv(production_dir / 'train.csv', index=False)
test_df.to_csv(production_dir / 'test.csv', index=False)

In [44]:
# Fit the standardisation statistics on the training rows only, then apply the
# same fitted transform to both partitions.
scaler = StandardScaler().fit(X_train)

X_train_scaled, X_test_scaled = (
    scaler.transform(part) for part in (X_train, X_test)
)

In [45]:
# Random forest with default hyper-parameters and a fixed seed, trained on the
# scaled training features.
rf = RandomForestClassifier(random_state=42).fit(X_train_scaled, y_train)

# Predictions on the held-out rows.
y_pred = rf.predict(X_test_scaled)

# Per-class precision / recall / F1 on the test set.
print('Random Forest Classifier', classification_report(y_test, y_pred), sep='\n')

Random Forest Classifier
              precision    recall  f1-score   support

           0       0.99      0.74      0.85       315
           1       0.93      0.96      0.94      1112
           2       0.91      0.97      0.94      1522
           3       0.98      0.92      0.95       998

    accuracy                           0.93      3947
   macro avg       0.95      0.90      0.92      3947
weighted avg       0.94      0.93      0.93      3947



In [46]:
# k-nearest-neighbours classifier with default settings, trained on the same
# scaled training features as the random forest.
model = KNeighborsClassifier().fit(X_train_scaled, y_train)

# Predictions on the held-out rows (this rebinds `y_pred`).
y_pred = model.predict(X_test_scaled)

# Per-class precision / recall / F1 on the test set.
print("KNN Classifier", classification_report(y_test, y_pred), sep='\n')


KNN Classifier
              precision    recall  f1-score   support

           0       0.95      0.95      0.95       315
           1       0.97      0.96      0.96      1112
           2       0.96      0.97      0.96      1522
           3       0.98      0.97      0.98       998

    accuracy                           0.97      3947
   macro avg       0.96      0.96      0.96      3947
weighted avg       0.97      0.97      0.97      3947



In [28]:
# Export the fitted estimators and preprocessing objects as pickle files.
# Run this cell after the training cells so the files on disk correspond to the
# models evaluated in this notebook.
models_dir = Path('../models')
models_dir.mkdir(parents=True, exist_ok=True)  # works on a fresh checkout

# File name -> object to serialise; `model` is the KNN classifier.
artifacts = {
    'rf_model.pkl': rf,
    'scaler.pkl': scaler,
    'le.pkl': le,
    'knn_model.pkl': model,
}

for filename, artifact in artifacts.items():
    with open(models_dir / filename, 'wb') as f:
        pickle.dump(artifact, f)

print("Models exported successfully")

Models exported successfully
