In [1]:
#Kütüphanelerin Yüklenmesi
import os

import joblib
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from scipy.stats import pointbiserialr
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

In [2]:
#  ^ VERİ YÜKLEME ^
# The data was inspected. Records with Failure_Occurrence = 1 were examined in particular
# and compared with the records where Failure_Occurrence = 0.
# Based on this comparison, the features that might be important were selected (feature selection).

In [3]:
# Columns to read from the CSV; every other column in the file is skipped.
use_cols = [
    # identifiers, timestamp and target
    'Timestamp',
    'Machine_ID',
    'Machine_Type',
    'Failure_Occurrence',
    # sensor / condition readings
    'Idle_Time_Duration',
    'Component_Health_Score',
    'Oil_Viscosity',
    'RUL',
    'Lighting_Condition',
    'Energy_Efficiency_Index',
    'Ventilation_Level',
    'Ambient_Temperature',
    # operational and maintenance descriptors
    'Operational_Mode',
    'Job_Code',
    'Maintenance_Frequency',
    'Voltage_Phase_A',
    'Maintenance_Personnel_ID',
    'Maintenance_Type_Label',
    'Replaced_Components_List',
    'Maintenance_Type',
    'Shift_Code',
    'Operator_ID',
    'Machine_Location_Zone',
    'Power_Consumption',
    # maintenance date and additional readings
    'Last_Maintenance_Date',
    'Peak_Vibration',
    'Communication_Latency',
    'Nearby_Machine_Load',
    'Alarm_Count_24hr',
    'Data_Packet_Loss_Percent',
]

# Read the raw dataset, restricted to the columns listed in use_cols.
df = pd.read_csv(
    "RealTime_IoT_PredictiveMaintenance_Dataset.csv",
    usecols=use_cols,
)

In [4]:
# ^ PENCERELEME ^

In [5]:
# Parse the raw timestamps into datetime values so the .dt accessor can be used below.
df['Timestamp'] = pd.to_datetime(df['Timestamp'])
# Round every timestamp down to the start of its 12-hour window
# (this window length gave good results experimentally).
# Lowercase 'h' is used because the uppercase 'H' alias is deprecated since pandas 2.2.
# NOTE: despite its name, 'Hour' holds the start of the 12-hour window, not an hour of day.
df['Hour'] = df['Timestamp'].dt.floor('12h')
# Build a per-machine key by joining the machine ID and the machine type.
df['Machine_Unique'] = df['Machine_ID'].astype(str) + '_' + df['Machine_Type']

In [6]:
# ^ SÜTUNLARIN AYRILMASI ^

In [7]:
# Numeric columns that are aggregated per machine and time window.
numeric_cols = [
    'Oil_Viscosity',
    'Power_Consumption',
    'Energy_Efficiency_Index',
    'Voltage_Phase_A',
    'Idle_Time_Duration',
    'Component_Health_Score',
    'Ambient_Temperature',
    'Lighting_Condition',
    'Ventilation_Level',
    'RUL',
    'Peak_Vibration',
    'Communication_Latency',
    'Nearby_Machine_Load',
    'Alarm_Count_24hr',
    'Data_Packet_Loss_Percent',
]

# Categorical / identifier-like columns, listed separately from the numeric ones.
categorical_cols = [
    'Operational_Mode',
    'Job_Code',
    'Maintenance_Frequency',
    'Maintenance_Personnel_ID',
    'Maintenance_Type_Label',
    'Replaced_Components_List',
    'Shift_Code',
    'Operator_ID',
    'Machine_Location_Zone',
    'Maintenance_Type',
]

In [8]:
# ^ AGGREGATION ^

In [9]:
# Statistics computed for every numeric column inside each (machine, window) group.
agg_funcs = ['mean', 'median', 'min', 'max']

df_numeric = df.groupby(['Machine_Unique', 'Hour'])[numeric_cols].agg(agg_funcs)
# Flatten the two-level (column, statistic) header into names such as 'RUL_max'.
df_numeric.columns = [f"{column}_{stat}" for column, stat in df_numeric.columns]
df_numeric = df_numeric.reset_index()

# Window-level label: the maximum of Failure_Occurrence within each (machine, window) group.
df_failure = (
    df.groupby(['Machine_Unique', 'Hour'])['Failure_Occurrence']
    .max()
    .reset_index()
)

In [10]:
# ^ FEATURE EXTRACTION ^

In [11]:
# Parse the maintenance dates; values that cannot be parsed become NaT.
df['Last_Maintenance_Date'] = pd.to_datetime(df['Last_Maintenance_Date'], errors='coerce')

# Maintenance_Before_Hour is 1 when the latest maintenance date seen in a
# (machine, window) group lies before the start of that window, otherwise 0.
df_maintenance = (
    df.groupby(['Machine_Unique', 'Hour'])[['Last_Maintenance_Date']]
    .max()
    .reset_index()
    .assign(
        Maintenance_Before_Hour=lambda frame: frame['Last_Maintenance_Date']
        .lt(frame['Hour'])
        .astype(int)
    )
    .drop(columns=['Last_Maintenance_Date'])
)

In [12]:
# ^ MERGE ^

In [13]:
# Join the aggregated features, the window label and the maintenance flag
# on the (machine, window) key; both joins use the default inner join.
merged_df = (
    df_numeric
    .merge(df_failure, on=['Machine_Unique', 'Hour'])
    .merge(df_maintenance, on=['Machine_Unique', 'Hour'])
)

In [14]:
# ^ KORELASYON HESABI ^

In [15]:
# Point-biserial correlation between the window-level failure label and every
# aggregated numeric feature.
correlations = {}
for col in df_numeric.columns:
    # Skip the grouping keys; only the aggregated feature columns are scored.
    if col in ('Machine_Unique', 'Hour'):
        continue
    try:
        corr, _ = pointbiserialr(merged_df['Failure_Occurrence'], merged_df[col])
        correlations[col] = corr
    except Exception as e:
        # Best effort: report the failing column and continue with the rest.
        print(f"Hata: {col} için hesaplama yapılamadı. {e}")

# Drop undefined (NaN) correlations once, then rank that same series by absolute value.
# Reindexing against the index of the un-dropped series would put the NaN rows back.
corr_series = pd.Series(correlations, dtype=float).dropna()
top_20_corr = corr_series.reindex(
    corr_series.abs().sort_values(ascending=False).index
).head(20)
print("Failure_Occurrence ile en yüksek 20 korelasyon:")
print(top_20_corr)

Failure_Occurrence ile en yüksek 20 korelasyon:
Alarm_Count_24hr_max            0.416025
Voltage_Phase_A_max             0.414161
Power_Consumption_min          -0.413841
Nearby_Machine_Load_max         0.412925
Energy_Efficiency_Index_max     0.411379
Oil_Viscosity_max               0.409890
RUL_max                         0.409762
Idle_Time_Duration_max          0.409564
Component_Health_Score_max      0.407822
Ventilation_Level_max           0.407243
Peak_Vibration_min             -0.406950
Ambient_Temperature_max         0.404662
Communication_Latency_max       0.404176
Lighting_Condition_max          0.403778
Data_Packet_Loss_Percent_max    0.403485
Alarm_Count_24hr_min           -0.245313
Component_Health_Score_min     -0.242346
Peak_Vibration_max              0.241104
Ambient_Temperature_min        -0.238486
RUL_min                        -0.237864
dtype: float64


In [16]:
# ^ FEATURE SELECTION ^
# Features whose absolute correlation with Failure_Occurrence is high (around 0.4) were selected for training.

In [17]:
# Selected model inputs, written as (signal, aggregation statistic) pairs and
# expanded to the '<signal>_<statistic>' column names of the aggregated frame.
features = [
    f"{signal}_{stat}"
    for signal, stat in (
        ('Alarm_Count_24hr', 'max'),
        ('Voltage_Phase_A', 'max'),
        ('Power_Consumption', 'min'),
        ('Nearby_Machine_Load', 'max'),
        ('Energy_Efficiency_Index', 'max'),
        ('Oil_Viscosity', 'max'),
        ('RUL', 'max'),
        ('Idle_Time_Duration', 'max'),
        ('Component_Health_Score', 'max'),
        ('Ventilation_Level', 'max'),
        ('Peak_Vibration', 'min'),
        ('Ambient_Temperature', 'max'),
        ('Communication_Latency', 'max'),
        ('Lighting_Condition', 'max'),
        ('Data_Packet_Loss_Percent', 'max'),
    )
]

In [18]:
#  ^ BUILDING THE FEATURE MATRIX FROM THE PREVIOUS 6 TIME STEPS (SLIDING WINDOW) ^
# Different window sizes (e.g. 3, 6, 12) were tried; weighing model performance against data loss,
# a window of 6 time steps (6 x 12-hour windows = 72 hours) was observed to give the best results.
# For each window the past data is flattened into a single row and labelled with the failure status of the time step immediately after the window.

In [19]:
# Number of consecutive time steps that make up one training sample.
window_size = 6

X_windows = []
y_labels = []

# Build the samples machine by machine so that a window never mixes machines.
for _machine, machine_rows in merged_df.groupby('Machine_Unique'):
    ordered = machine_rows.sort_values('Hour')
    feature_matrix = ordered[features].values
    failure_flags = ordered['Failure_Occurrence'].values

    # Machines with at most window_size rows yield no samples (empty range).
    for start in range(len(ordered) - window_size):
        stop = start + window_size
        # Rows start..stop-1 are flattened into one feature vector ...
        X_windows.append(feature_matrix[start:stop].flatten())
        # ... and labelled with the failure flag of the row right after the window.
        y_labels.append(failure_flags[stop])

X_sliding = pd.DataFrame(X_windows)
y_sliding = pd.Series(y_labels)

In [20]:
# ^ VERİYİ BÖLME ^

In [21]:
# Stratified 80/20 split so both parts keep the class ratio of y_sliding.
X_train, X_test, y_train, y_test = train_test_split(
    X_sliding,
    y_sliding,
    test_size=0.2,
    random_state=50,
    stratify=y_sliding,
)

In [22]:
# ^ SMOTE ^
# Cases with Failure_Occurrence = 1 were observed to be rare, i.e. the data is imbalanced.
# For this reason, synthetic samples were generated with SMOTE for the training set.

In [23]:
# Oversample the training split only; X_test / y_test are left untouched.
smote = SMOTE(random_state=50)
X_train_res, y_train_res = smote.fit_resample(X=X_train, y=y_train)

In [24]:
# ^ RANDOM FOREST MODELİ ^

In [25]:
# Fit a depth-limited random forest on the SMOTE-resampled training set.
rf = RandomForestClassifier(max_depth=10, n_estimators=100, class_weight='balanced', random_state=50)
rf.fit(X_train_res, y_train_res)

# Column 1 of predict_proba is the probability of the second class (label 1 here).
y_train_proba_rf = rf.predict_proba(X_train_res)[:, 1]

# Convert probabilities to hard labels with a 0.5 decision threshold.
y_train_pred_rf = (y_train_proba_rf >= 0.5).astype(int)
# NOTE: the metrics printed below are computed on the resampled TRAINING data,
# so they do not measure performance on the held-out X_test / y_test split.
print("\nRandom Forest - Eğitim Verisi Sonuçları:")
print(confusion_matrix(y_train_res, y_train_pred_rf))
print(classification_report(y_train_res, y_train_pred_rf, digits=3))

# joblib.dump does not create missing parent directories, so make sure the
# output folder exists; otherwise a fresh run fails with FileNotFoundError.
os.makedirs("models", exist_ok=True)
joblib.dump(rf, "models/rf_model_v1.joblib")


Random Forest - Eğitim Verisi Sonuçları:
[[9461 1146]
 [1525 9082]]
              precision    recall  f1-score   support

           0      0.861     0.892     0.876     10607
           1      0.888     0.856     0.872     10607

    accuracy                          0.874     21214
   macro avg      0.875     0.874     0.874     21214
weighted avg      0.875     0.874     0.874     21214



['models/rf_model_v1.joblib']

In [26]:
# ^ XGBOOST MODELİ ^

In [27]:
# Ratio of negative to positive samples in the resampled training labels,
# passed to XGBoost as the weight of the positive class.
pos_weight = len(y_train_res[y_train_res == 0]) / len(y_train_res[y_train_res == 1])

xgb = XGBClassifier(
    n_estimators=200,
    max_depth=4,
    scale_pos_weight=pos_weight,
    eval_metric='logloss',
    random_state=50,
    learning_rate=0.1
)
xgb.fit(X_train_res, y_train_res)

# Column 1 of predict_proba is the probability of the second class (label 1 here).
y_train_proba_xgb = xgb.predict_proba(X_train_res)[:, 1]

# Convert probabilities to hard labels with a 0.5 decision threshold.
y_train_pred_xgb = (y_train_proba_xgb >= 0.5).astype(int)

# NOTE: the metrics printed below are computed on the resampled TRAINING data,
# so they do not measure performance on the held-out X_test / y_test split.
print("\nXGBoost - Eğitim Verisi Sonuçları:")
print(confusion_matrix(y_train_res, y_train_pred_xgb))
print(classification_report(y_train_res, y_train_pred_xgb, digits=3))

# joblib.dump does not create missing parent directories, so make sure the
# output folder exists; otherwise a fresh run fails with FileNotFoundError.
os.makedirs("models", exist_ok=True)
joblib.dump(xgb, "models/xgb_model_v1.joblib")


XGBoost - Eğitim Verisi Sonuçları:
[[9160 1447]
 [2497 8110]]
              precision    recall  f1-score   support

           0      0.786     0.864     0.823     10607
           1      0.849     0.765     0.804     10607

    accuracy                          0.814     21214
   macro avg      0.817     0.814     0.814     21214
weighted avg      0.817     0.814     0.814     21214



['models/xgb_model_v1.joblib']

In [28]:
# ^ MLP MODELİ ^

In [29]:
# Standardise the features; the scaler is fitted on the resampled training set
# and saved below so the same scaling can be reapplied at prediction time.
scaler = StandardScaler()
X_train_res_scaled = scaler.fit_transform(X_train_res)
# Two hidden layers (50 and 30 units); early stopping holds out 10% of the
# training data and stops after 5 iterations without improvement.
mlp = MLPClassifier(
    hidden_layer_sizes=(50,30),
    activation='relu',
    solver='adam',
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=5,
    random_state=50
)
mlp.fit(X_train_res_scaled, y_train_res)
# Column 1 of predict_proba is the probability of the second class (label 1 here).
y_train_proba_mlp = mlp.predict_proba(X_train_res_scaled)[:, 1]
# Convert probabilities to hard labels with a 0.5 decision threshold.
y_train_pred_mlp = (y_train_proba_mlp >= 0.5).astype(int)
# NOTE: the metrics printed below are computed on the resampled TRAINING data,
# so they do not measure performance on the held-out X_test / y_test split.
print("\nMLP - Eğitim Verisi Sonuçları:")
print(confusion_matrix(y_train_res, y_train_pred_mlp))
print(classification_report(y_train_res, y_train_pred_mlp, digits=3))

# joblib.dump does not create missing parent directories, so make sure the
# output folder exists; otherwise a fresh run fails with FileNotFoundError.
os.makedirs("models", exist_ok=True)
joblib.dump(mlp, 'models/mlp_model.joblib')
joblib.dump(scaler, 'models/scaler.joblib')


MLP - Eğitim Verisi Sonuçları:
[[8817 1790]
 [3307 7300]]
              precision    recall  f1-score   support

           0      0.727     0.831     0.776     10607
           1      0.803     0.688     0.741     10607

    accuracy                          0.760     21214
   macro avg      0.765     0.760     0.758     21214
weighted avg      0.765     0.760     0.758     21214



['models/scaler.joblib']