In [1]:
from scipy.stats import pointbiserialr
from catboost import CatBoostClassifier
from sklearn.metrics import confusion_matrix, classification_report
import joblib
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE 
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
import numpy as np
from sklearn.metrics import f1_score

In [2]:
#  ^ VERİ YÜKLEME ^
# Veri incelendi. Fail_Occurrence = 1 olan kayıtlar özellikle incelendi,
# bu kayıtlar ile Fail_Occurrence = 0 olan kayıtlar karşılaştırıldı.
# Bu karşılaştırma sonucunda önemli olabilecek özellikler (feature selection) seçildi.

In [3]:
use_cols = ['Timestamp', 'Machine_ID', 'Machine_Type', 'Failure_Occurrence',
            'Idle_Time_Duration', 'Component_Health_Score', 'Oil_Viscosity',
            'RUL', 'Lighting_Condition', 'Energy_Efficiency_Index',
            'Ventilation_Level', 'Ambient_Temperature',
            'Operational_Mode', 'Job_Code', 'Maintenance_Frequency','Voltage_Phase_A','Maintenance_Personnel_ID',
            'Maintenance_Type_Label', 'Replaced_Components_List','Maintenance_Type',
            'Shift_Code', 'Operator_ID', 'Machine_Location_Zone','Power_Consumption',
            'Last_Maintenance_Date','Peak_Vibration','Communication_Latency','Nearby_Machine_Load','Alarm_Count_24hr','Data_Packet_Loss_Percent']

df = pd.read_csv("RealTime_IoT_PredictiveMaintenance_Dataset.csv", usecols=use_cols)

In [4]:
# ^ PENCERELEME ^

In [5]:
df['Timestamp'] = pd.to_datetime(df['Timestamp'])  # Zaman damgaları datetime formatına çevrilir
df['Hour'] = df['Timestamp'].dt.floor('12H')  # 12 saatlik zaman pencerelerine yuvarlama yapılır (bu yöntem deneysel olarak iyi sonuç vermiştir)
df['Machine_Unique'] = df['Machine_ID'].astype(str) + '_' + df['Machine_Type']  # Her makine için benzersiz kimlik oluşturulur

In [6]:
# ^ SÜTUNLARIN AYRILMASI ^

In [7]:
numeric_cols = [
    'Oil_Viscosity', 'Power_Consumption', 'Energy_Efficiency_Index','Voltage_Phase_A',
    'Idle_Time_Duration', 'Component_Health_Score', 'Ambient_Temperature',
    'Lighting_Condition', 'Ventilation_Level', 'RUL','Peak_Vibration','Communication_Latency','Nearby_Machine_Load','Alarm_Count_24hr','Data_Packet_Loss_Percent'
]

In [8]:
# ^ AGGREGATION ^

In [9]:
agg_funcs = ['mean', 'median', 'min', 'max']
df_numeric = df.groupby(['Machine_Unique', 'Hour'])[numeric_cols].agg(agg_funcs)
df_numeric.columns = ['_'.join(col) for col in df_numeric.columns]
df_numeric = df_numeric.reset_index()
df_failure = df.groupby(['Machine_Unique', 'Hour'])['Failure_Occurrence'].max().reset_index()

In [10]:
# ^ FEATURE EXTRACTION ^

In [11]:
df['Last_Maintenance_Date'] = pd.to_datetime(df['Last_Maintenance_Date'], errors='coerce') # Eğer ki son maintenance tarihi geçmiş ise 1, değilse 0 
df_maintenance = df.groupby(['Machine_Unique', 'Hour'])[['Last_Maintenance_Date']].max().reset_index()
df_maintenance['Maintenance_Before_Hour'] = (df_maintenance['Last_Maintenance_Date'] < df_maintenance['Hour']).astype(int)
df_maintenance = df_maintenance.drop(columns=['Last_Maintenance_Date'])

In [12]:
# ^ MERGE ^

In [13]:
merged_df = pd.merge(df_numeric, df_failure, on=['Machine_Unique', 'Hour'])
merged_df = pd.merge(merged_df, df_maintenance, on=['Machine_Unique', 'Hour'])

In [14]:
# Train dosyasında korelasyon hesaplanmış ve ona göre kullanılacak featurelar seçilmiştir.
# ^ FEATURE SELECTION ^
# Fail_Occurence ile yüksek korelasyona sahip olan (yaklaşık 0.4 civarı) featurelar eğitim için seçildi.

In [15]:
features = [
    'Alarm_Count_24hr_max',
    'Voltage_Phase_A_max',
    'Power_Consumption_min',
    'Nearby_Machine_Load_max',
    'Energy_Efficiency_Index_max',
    'Oil_Viscosity_max',
    'RUL_max',
    'Idle_Time_Duration_max',
    'Component_Health_Score_max',
    'Ventilation_Level_max',
    'Peak_Vibration_min',
    'Ambient_Temperature_max',
    'Communication_Latency_max',
    'Lighting_Condition_max',
    'Data_Packet_Loss_Percent_max'
]

In [16]:
# ^ GEÇMİŞ 6 ZAMAN ADIMINDAN ÖZELLİK MATRİSİ OLUŞTURMA (SLIDING WINDOW) ^
# Farklı pencere boyutları (örneğin 3, 6, 12) denenmiş; model performansı ve veri kaybı dengesi göz önünde bulundurularak
# 6 zaman adımı (örneğin 72 saat) ile oluşturulan pencere boyutunun en uygun sonuçları verdiği gözlemlenmiştir.
# Her bir pencere için geçmiş veriler tek satıra indirgenmiş, ilgili pencerenin hemen sonraki arıza durumu etiketlenmiştir.

In [17]:
window_size = 6

X_windows = []
y_labels = []

for machine_id, group in merged_df.groupby('Machine_Unique'):
    group = group.sort_values('Hour')
    values = group[features].values
    labels = group['Failure_Occurrence'].values

    for i in range(len(group) - window_size):
        X_windows.append(values[i:i+window_size].flatten())
        y_labels.append(labels[i + window_size])

X_sliding = pd.DataFrame(X_windows)
y_sliding = pd.Series(y_labels)

In [18]:
# ^ VERİYİ BÖLME ^

In [19]:
X_train, X_test, y_train, y_test = train_test_split(
    X_sliding, y_sliding, test_size=0.2, stratify=y_sliding, random_state=50
)

In [20]:
# ^ SMOTE ^
# Fail_Occurence=1 olan durumların az olduğu gözlemlendi ve veride dengesizlik olduğu anlaşıldı.
# Bu yüzden SMOTE ile eğitim seti için sentetik veri oluşturuldu

In [21]:
smote = SMOTE(random_state=50)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

In [22]:
# ^ THRESHOLD TUNING ^
# Sınıflardaki dengesizlik nedeniyle modelin varsayılan 0.5 eşik değeri uygun olmayabilir.
# Bu yüzden F1 skorunu maksimize eden en iyi eşik değeri bulunuyor.

In [23]:
def find_best_threshold(y_true, y_proba):
    thresholds = np.linspace(0.1, 0.9, 81)
    f1_scores = []
    for thr in thresholds:
        preds = (y_proba >= thr).astype(int)
        f1_scores.append(f1_score(y_true, preds))
    best_idx = np.argmax(f1_scores)
    return thresholds[best_idx]

In [24]:
# ^ RANDOM FOREST MODELİ TEST ^

In [25]:
rf = joblib.load("rf_model_v1.joblib")

# Test verisi için tahmin olasılıkları
y_proba_rf = rf.predict_proba(X_test)[:, 1]

# En iyi threshold'u belirle
rf_threshold = find_best_threshold(y_test, y_proba_rf)

# Threshold'a göre sınıflandırma
y_pred_rf_thresh = (y_proba_rf >= rf_threshold).astype(int)

print(f"\n📌 Random Forest (Threshold = {rf_threshold:.2f}):")
print(confusion_matrix(y_test, y_pred_rf_thresh))
print(classification_report(y_test, y_pred_rf_thresh, digits=3))


📌 Random Forest (Threshold = 0.41):
[[1826  826]
 [ 517 2065]]
              precision    recall  f1-score   support

           0      0.779     0.689     0.731      2652
           1      0.714     0.800     0.755      2582

    accuracy                          0.743      5234
   macro avg      0.747     0.744     0.743      5234
weighted avg      0.747     0.743     0.743      5234



In [26]:
# ^ XGBOOST MODELİ TEST ^

In [27]:
# Modeli yükle
xgb = joblib.load("xgb_model_v1.joblib")

# Test setinde olasılık tahmini
y_proba_xgb = xgb.predict_proba(X_test)[:, 1]

# En iyi threshold’u bul
xgb_threshold = find_best_threshold(y_test, y_proba_xgb)

# Threshold ile sınıflandır
y_pred_xgb_thresh = (y_proba_xgb >= xgb_threshold).astype(int)

print(f"\n📌 XGBoost (Threshold = {xgb_threshold:.2f}):")
print(confusion_matrix(y_test, y_pred_xgb_thresh))
print(classification_report(y_test, y_pred_xgb_thresh, digits=3))


📌 XGBoost (Threshold = 0.38):
[[1785  867]
 [ 499 2083]]
              precision    recall  f1-score   support

           0      0.782     0.673     0.723      2652
           1      0.706     0.807     0.753      2582

    accuracy                          0.739      5234
   macro avg      0.744     0.740     0.738      5234
weighted avg      0.744     0.739     0.738      5234



In [28]:
# ^ ENSEMBLE MODEL DENEMESİ (RANDOM FOREST+XGBOOST)

In [29]:
rf_model = joblib.load("rf_model_v1.joblib")
xgb_model = joblib.load("xgb_model_v1.joblib")

# Test setinde olasılık tahminleri
y_proba_rf = rf_model.predict_proba(X_test)[:, 1]
y_proba_xgb = xgb_model.predict_proba(X_test)[:, 1]

# Ensemble olasılıkları (ortalama)
ensemble_proba = (y_proba_rf + y_proba_xgb) / 2

# En iyi threshold'u bul
ensemble_threshold = find_best_threshold(y_test, ensemble_proba)

# Threshold'a göre sınıflandırma yap
y_pred_ensemble = (ensemble_proba >= ensemble_threshold).astype(int)

print(f"\n📌 ENSEMBLE (Mean of RF + XGB, Threshold = {ensemble_threshold:.2f}):")
print(confusion_matrix(y_test, y_pred_ensemble))
print(classification_report(y_test, y_pred_ensemble, digits=3))


📌 ENSEMBLE (Mean of RF + XGB, Threshold = 0.38):
[[1718  934]
 [ 458 2124]]
              precision    recall  f1-score   support

           0      0.790     0.648     0.712      2652
           1      0.695     0.823     0.753      2582

    accuracy                          0.734      5234
   macro avg      0.742     0.735     0.732      5234
weighted avg      0.743     0.734     0.732      5234



In [30]:
# ^ MLP MODELİ TEST ^

In [31]:
# --- Kaydedilen model ve scaler'ı yükle ---
mlp = joblib.load('mlp_model.joblib')
scaler = joblib.load('scaler.joblib')

# --- Test verisini scaler ile dönüştür ---
X_test_scaled = scaler.transform(X_test)  # asla testte fit yapma!

# --- Tahmin olasılıkları ---
y_proba_mlp = mlp.predict_proba(X_test_scaled)[:, 1]

# --- En iyi threshold'u bul ---
mlp_threshold = find_best_threshold(y_test, y_proba_mlp)

# --- Tahminleri belirlenen threshold ile yap ---
y_pred_mlp_thresh = (y_proba_mlp >= mlp_threshold).astype(int)

print(f"\n📌 MLP (Scaled + Deep, Threshold = {mlp_threshold:.2f}):")
print(confusion_matrix(y_test, y_pred_mlp_thresh))
print(classification_report(y_test, y_pred_mlp_thresh, digits=3))


📌 MLP (Scaled + Deep, Threshold = 0.33):
[[1583 1069]
 [ 394 2188]]
              precision    recall  f1-score   support

           0      0.801     0.597     0.684      2652
           1      0.672     0.847     0.749      2582

    accuracy                          0.720      5234
   macro avg      0.736     0.722     0.717      5234
weighted avg      0.737     0.720     0.716      5234



In [32]:
# ^ ENSEMBLE MODEL DENEMESİ (RANDOM FOREST+XGBOOST+MLP)

In [33]:
rf_model = joblib.load("rf_model_v1.joblib")
xgb_model = joblib.load("xgb_model_v1.joblib")
mlp_model = joblib.load('mlp_model.joblib')

# Test setinde olasılık tahminleri
y_proba_rf = rf_model.predict_proba(X_test)[:, 1]
y_proba_xgb = xgb_model.predict_proba(X_test)[:, 1]
y_proba_mlp = mlp_model.predict_proba(X_test)[:, 1]

ensemble_proba_3 = (y_proba_rf + y_proba_xgb + y_proba_mlp) / 3
ensemble_threshold_3 = find_best_threshold(y_test, ensemble_proba_3)
y_pred_ensemble_3 = (ensemble_proba_3 >= ensemble_threshold_3).astype(int)
print(f"\n📌 ENSEMBLE (RF + XGB + MLP, Threshold = {ensemble_threshold_3:.2f}):")
print(confusion_matrix(y_test, y_pred_ensemble_3))
print(classification_report(y_test, y_pred_ensemble_3, digits=3))  



📌 ENSEMBLE (RF + XGB + MLP, Threshold = 0.58):
[[1671  981]
 [ 437 2145]]
              precision    recall  f1-score   support

           0      0.793     0.630     0.702      2652
           1      0.686     0.831     0.752      2582

    accuracy                          0.729      5234
   macro avg      0.739     0.730     0.727      5234
weighted avg      0.740     0.729     0.727      5234



In [34]:
# ^ CATBOOST MODELİ TEST ^

In [35]:
# Modeli yükle
cat_model = CatBoostClassifier()
cat_model.load_model('catboost_model.cbm')

# Test verisi üzerinde tahmin olasılıkları
y_proba_cat_test = cat_model.predict_proba(X_test)[:, 1]

# En iyi threshold'u bul
cat_threshold = find_best_threshold(y_test, y_proba_cat_test)

# Bu threshold ile tahminleri oluştur
y_pred_cat_thresh = (y_proba_cat_test >= cat_threshold).astype(int)

print(f"\n📌 CatBoost (Threshold = {cat_threshold:.2f}):")
print(confusion_matrix(y_test, y_pred_cat_thresh))
print(classification_report(y_test, y_pred_cat_thresh, digits=3))



📌 CatBoost (Threshold = 0.31):
[[1491 1161]
 [ 326 2256]]
              precision    recall  f1-score   support

           0      0.821     0.562     0.667      2652
           1      0.660     0.874     0.752      2582

    accuracy                          0.716      5234
   macro avg      0.740     0.718     0.710      5234
weighted avg      0.741     0.716     0.709      5234



In [36]:
# ^ ENSEMBLE MODEL DENEMESİ (RANDOM FOREST+XGBOOST+MLP+XGBOOST)

In [37]:
rf_model = joblib.load("rf_model_v1.joblib")
xgb_model = joblib.load("xgb_model_v1.joblib")
mlp_model = joblib.load('mlp_model.joblib')
cat_model = CatBoostClassifier()
cat_model.load_model('catboost_model.cbm')

# Test verisi üzerinde tahmin olasılıkları
y_proba_cat_test = cat_model.predict_proba(X_test)[:, 1]

# Test setinde olasılık tahminleri
y_proba_rf = rf_model.predict_proba(X_test)[:, 1]
y_proba_xgb = xgb_model.predict_proba(X_test)[:, 1]
y_proba_mlp = mlp_model.predict_proba(X_test)[:, 1]

ensemble_proba_4 = (y_proba_rf + y_proba_xgb + y_proba_mlp + y_proba_cat_test) / 4
ensemble_threshold_4 = find_best_threshold(y_test, ensemble_proba_4)
y_pred_ensemble_4 = (ensemble_proba_4 >= ensemble_threshold_4).astype(int)
print(f"\n📌 ENSEMBLE (RF + XGB + MLP+ CATBOOST, Threshold = {ensemble_threshold_4:.2f}):")
print(confusion_matrix(y_test, y_pred_ensemble_4))
print(classification_report(y_test, y_pred_ensemble_4, digits=4))  



📌 ENSEMBLE (RF + XGB + MLP+ CATBOOST, Threshold = 0.54):
[[1776  876]
 [ 489 2093]]
              precision    recall  f1-score   support

           0     0.7841    0.6697    0.7224      2652
           1     0.7050    0.8106    0.7541      2582

    accuracy                         0.7392      5234
   macro avg     0.7445    0.7401    0.7382      5234
weighted avg     0.7451    0.7392    0.7380      5234

