In [1]:
from scipy.stats import pointbiserialr
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE 
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.preprocessing import StandardScaler
import numpy as np

In [2]:
#  ^ VERİ YÜKLEME ^
# Veri incelendi. Fail_Occurrence = 1 olan kayıtlar özellikle incelendi,
# bu kayıtlar ile Fail_Occurrence = 0 olan kayıtlar karşılaştırıldı.
# Bu karşılaştırma sonucunda önemli olabilecek özellikler (feature selection) seçildi.

In [3]:
use_cols = ['Timestamp', 'Machine_ID', 'Machine_Type', 'Failure_Occurrence',
            'Idle_Time_Duration', 'Component_Health_Score', 'Oil_Viscosity',
            'RUL', 'Lighting_Condition', 'Energy_Efficiency_Index',
            'Ventilation_Level', 'Ambient_Temperature',
            'Operational_Mode', 'Job_Code', 'Maintenance_Frequency','Voltage_Phase_A','Maintenance_Personnel_ID',
            'Maintenance_Type_Label', 'Replaced_Components_List','Maintenance_Type',
            'Shift_Code', 'Operator_ID', 'Machine_Location_Zone','Power_Consumption',
            'Last_Maintenance_Date','Peak_Vibration','Communication_Latency','Nearby_Machine_Load','Alarm_Count_24hr','Data_Packet_Loss_Percent']

df = pd.read_csv("RealTime_IoT_PredictiveMaintenance_Dataset.csv", usecols=use_cols)

In [4]:
# ^ PENCERELEME ^

In [5]:
df['Timestamp'] = pd.to_datetime(df['Timestamp'])  # Zaman damgaları datetime formatına çevrilir
df['Hour'] = df['Timestamp'].dt.floor('12H')  # 12 saatlik zaman pencerelerine yuvarlama yapılır (bu yöntem deneysel olarak iyi sonuç vermiştir)
df['Machine_Unique'] = df['Machine_ID'].astype(str) + '_' + df['Machine_Type']  # Her makine için benzersiz kimlik oluşturulur

In [6]:
# ^ SÜTUNLARIN AYRILMASI ^

In [7]:
numeric_cols = [
    'Oil_Viscosity', 'Power_Consumption', 'Energy_Efficiency_Index','Voltage_Phase_A',
    'Idle_Time_Duration', 'Component_Health_Score', 'Ambient_Temperature',
    'Lighting_Condition', 'Ventilation_Level', 'RUL','Peak_Vibration','Communication_Latency','Nearby_Machine_Load','Alarm_Count_24hr','Data_Packet_Loss_Percent'
]

In [8]:
# ^ AGGREGATION ^

In [9]:
agg_funcs = ['mean', 'median', 'min', 'max']
df_numeric = df.groupby(['Machine_Unique', 'Hour'])[numeric_cols].agg(agg_funcs)
df_numeric.columns = ['_'.join(col) for col in df_numeric.columns]
df_numeric = df_numeric.reset_index()
df_failure = df.groupby(['Machine_Unique', 'Hour'])['Failure_Occurrence'].max().reset_index()

In [10]:
# ^ FEATURE EXTRACTION ^

In [11]:
df['Last_Maintenance_Date'] = pd.to_datetime(df['Last_Maintenance_Date'], errors='coerce') # Eğer ki son maintenance tarihi geçmiş ise 1, değilse 0 
df_maintenance = df.groupby(['Machine_Unique', 'Hour'])[['Last_Maintenance_Date']].max().reset_index()
df_maintenance['Maintenance_Before_Hour'] = (df_maintenance['Last_Maintenance_Date'] < df_maintenance['Hour']).astype(int)
df_maintenance = df_maintenance.drop(columns=['Last_Maintenance_Date'])

In [12]:
# ^ MERGE ^

In [13]:
merged_df = pd.merge(df_numeric, df_failure, on=['Machine_Unique', 'Hour'])
merged_df = pd.merge(merged_df, df_maintenance, on=['Machine_Unique', 'Hour'])

In [14]:
# Train dosyasında korelasyon hesaplanmış ve ona göre kullanılacak featurelar seçilmiştir.
# ^ FEATURE SELECTION ^
# Fail_Occurence ile yüksek korelasyona sahip olan (yaklaşık 0.4 civarı) featurelar eğitim için seçildi.

In [15]:
features = [
    'Alarm_Count_24hr_max',
    'Voltage_Phase_A_max',
    'Power_Consumption_min',
    'Nearby_Machine_Load_max',
    'Energy_Efficiency_Index_max',
    'Oil_Viscosity_max',
    'RUL_max',
    'Idle_Time_Duration_max',
    'Component_Health_Score_max',
    'Ventilation_Level_max',
    'Peak_Vibration_min',
    'Ambient_Temperature_max',
    'Communication_Latency_max',
    'Lighting_Condition_max',
    'Data_Packet_Loss_Percent_max'
]

In [16]:
# ^ GEÇMİŞ 6 ZAMAN ADIMINDAN ÖZELLİK MATRİSİ OLUŞTURMA (SLIDING WINDOW) ^
# Farklı pencere boyutları (örneğin 3, 6, 12) denenmiş; model performansı ve veri kaybı dengesi göz önünde bulundurularak
# 6 zaman adımı (örneğin 72 saat) ile oluşturulan pencere boyutunun en uygun sonuçları verdiği gözlemlenmiştir.
# Her bir pencere için geçmiş veriler tek satıra indirgenmiş, ilgili pencerenin hemen sonraki arıza durumu etiketlenmiştir.

In [17]:
window_size = 6

X_windows = []
y_labels = []

for machine_id, group in merged_df.groupby('Machine_Unique'):
    group = group.sort_values('Hour')
    values = group[features].values
    labels = group['Failure_Occurrence'].values

    for i in range(len(group) - window_size):
        X_windows.append(values[i:i+window_size].flatten())
        y_labels.append(labels[i + window_size])

X_sliding = pd.DataFrame(X_windows)
y_sliding = pd.Series(y_labels)

In [18]:
# ^ VERİYİ BÖLME ^

In [19]:
X_train, X_test, y_train, y_test = train_test_split(
    X_sliding, y_sliding, test_size=0.2, stratify=y_sliding, random_state=50
)

In [20]:
# ^ SMOTE ^
# Fail_Occurence=1 olan durumların az olduğu gözlemlendi ve veride dengesizlik olduğu anlaşıldı.
# Bu yüzden SMOTE ile eğitim seti için sentetik veri oluşturuldu

In [21]:
smote = SMOTE(random_state=50)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

In [22]:
# ^ THRESHOLD TUNING ^
# Sınıflardaki dengesizlik nedeniyle modelin varsayılan 0.5 eşik değeri uygun olmayabilir.
# Bu yüzden F1 skorunu maksimize eden en iyi eşik değeri bulunuyor.

In [23]:
def find_best_threshold(y_true, y_proba):
    thresholds = np.linspace(0.1, 0.9, 81)
    f1_scores = []
    for thr in thresholds:
        preds = (y_proba >= thr).astype(int)
        f1_scores.append(f1_score(y_true, preds))
    best_idx = np.argmax(f1_scores)
    return thresholds[best_idx]

In [24]:
# ^ RANDOM FOREST MODELİ TEST ^

In [25]:
rf = joblib.load("models/rf_model_v1.joblib")
y_proba_rf = rf.predict_proba(X_test)[:, 1]
rf_threshold = find_best_threshold(y_test, y_proba_rf)
y_pred_rf_thresh = (y_proba_rf >= rf_threshold).astype(int)

print(f"\nRandom Forest (Threshold = {rf_threshold:.2f}):")
print(confusion_matrix(y_test, y_pred_rf_thresh))
print(classification_report(y_test, y_pred_rf_thresh, digits=3))


Random Forest (Threshold = 0.41):
[[1883  769]
 [ 548 2034]]
              precision    recall  f1-score   support

           0      0.775     0.710     0.741      2652
           1      0.726     0.788     0.755      2582

    accuracy                          0.748      5234
   macro avg      0.750     0.749     0.748      5234
weighted avg      0.750     0.748     0.748      5234



In [26]:
# ^ XGBOOST MODELİ TEST ^

In [27]:
xgb = joblib.load("models/xgb_model_v1.joblib")
y_proba_xgb = xgb.predict_proba(X_test)[:, 1]
xgb_threshold = find_best_threshold(y_test, y_proba_xgb)
y_pred_xgb_thresh = (y_proba_xgb >= xgb_threshold).astype(int)

print(f"\nXGBoost (Threshold = {xgb_threshold:.2f}):")
print(confusion_matrix(y_test, y_pred_xgb_thresh))
print(classification_report(y_test, y_pred_xgb_thresh, digits=3))


XGBoost (Threshold = 0.40):
[[1879  773]
 [ 556 2026]]
              precision    recall  f1-score   support

           0      0.772     0.709     0.739      2652
           1      0.724     0.785     0.753      2582

    accuracy                          0.746      5234
   macro avg      0.748     0.747     0.746      5234
weighted avg      0.748     0.746     0.746      5234



In [28]:
# ^ ENSEMBLE MODEL DENEMESİ (RANDOM FOREST+XGBOOST)

In [29]:
rf_model = joblib.load("models/rf_model_v1.joblib")
xgb_model = joblib.load("models/xgb_model_v1.joblib")
y_proba_rf = rf_model.predict_proba(X_test)[:, 1]
y_proba_xgb = xgb_model.predict_proba(X_test)[:, 1]
ensemble_proba = (y_proba_rf + y_proba_xgb) / 2

ensemble_threshold = find_best_threshold(y_test, ensemble_proba)
y_pred_ensemble = (ensemble_proba >= ensemble_threshold).astype(int)

print(f"\nENSEMBLE (Mean of RF + XGB, Threshold = {ensemble_threshold:.2f}):")
print(confusion_matrix(y_test, y_pred_ensemble))
print(classification_report(y_test, y_pred_ensemble, digits=3))


ENSEMBLE (Mean of RF + XGB, Threshold = 0.41):
[[1919  733]
 [ 576 2006]]
              precision    recall  f1-score   support

           0      0.769     0.724     0.746      2652
           1      0.732     0.777     0.754      2582

    accuracy                          0.750      5234
   macro avg      0.751     0.750     0.750      5234
weighted avg      0.751     0.750     0.750      5234



In [30]:
# ^ MLP MODELİ TEST ^

In [31]:
mlp = joblib.load('models/mlp_model.joblib')
scaler = joblib.load('models/scaler.joblib')
X_test_scaled = scaler.transform(X_test)  
y_proba_mlp = mlp.predict_proba(X_test_scaled)[:, 1]
mlp_threshold = find_best_threshold(y_test, y_proba_mlp)
y_pred_mlp_thresh = (y_proba_mlp >= mlp_threshold).astype(int)

print(f"\nMLP (Scaled + Deep, Threshold = {mlp_threshold:.2f}):")
print(confusion_matrix(y_test, y_pred_mlp_thresh))
print(classification_report(y_test, y_pred_mlp_thresh, digits=3))


MLP (Scaled + Deep, Threshold = 0.36):
[[1744  908]
 [ 473 2109]]
              precision    recall  f1-score   support

           0      0.787     0.658     0.716      2652
           1      0.699     0.817     0.753      2582

    accuracy                          0.736      5234
   macro avg      0.743     0.737     0.735      5234
weighted avg      0.743     0.736     0.735      5234



In [32]:
# ^ ENSEMBLE MODEL DENEMESİ (RANDOM FOREST+XGBOOST+MLP)

In [33]:
rf_model = joblib.load("models/rf_model_v1.joblib")
xgb_model = joblib.load("models/xgb_model_v1.joblib")
mlp_model = joblib.load('models/mlp_model.joblib')

y_proba_rf = rf_model.predict_proba(X_test)[:, 1]
y_proba_xgb = xgb_model.predict_proba(X_test)[:, 1]
y_proba_mlp = mlp_model.predict_proba(X_test)[:, 1]

ensemble_proba_3 = (y_proba_rf + y_proba_xgb + y_proba_mlp) / 3
ensemble_threshold_3 = find_best_threshold(y_test, ensemble_proba_3)
y_pred_ensemble_3 = (ensemble_proba_3 >= ensemble_threshold_3).astype(int)
print(f"\nENSEMBLE (RF + XGB + MLP):")
print(confusion_matrix(y_test, y_pred_ensemble_3))
print(classification_report(y_test, y_pred_ensemble_3, digits=3))  



ENSEMBLE (RF + XGB + MLP):
[[1938  714]
 [ 585 1997]]
              precision    recall  f1-score   support

           0      0.768     0.731     0.749      2652
           1      0.737     0.773     0.755      2582

    accuracy                          0.752      5234
   macro avg      0.752     0.752     0.752      5234
weighted avg      0.753     0.752     0.752      5234

