In [5]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.over_sampling import SMOTE
from collections import Counter

# 1. Load the dataset
df = pd.read_csv("fault type.csv")
df.fillna(df.mean(numeric_only=True), inplace=True)

# 2. Split features and target
X = df.drop("fault type", axis=1)
y = df["fault type"]

# 3. Feature selection
selector = SelectKBest(score_func=f_classif, k=min(15, X.shape[1]))
X_selected = selector.fit_transform(X, y)

# 4. Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_selected)

# 5. Handle class imbalance using SMOTE
class_counts = Counter(y)
min_class_count = min(class_counts.values())
k_neighbors = min(5, min_class_count - 1)
if k_neighbors < 1:
    k_neighbors = 1

smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

# 6. Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.50, random_state=42, stratify=y_resampled
)

# 7. Define machine learning models
models = {
    "Random Forest": RandomForestClassifier(n_estimators=200, max_depth=15, min_samples_leaf=2, random_state=42),
    "Extra Trees": ExtraTreesClassifier(n_estimators=200, max_depth=15, min_samples_leaf=2, random_state=42),
    "Decision Tree": DecisionTreeClassifier(max_depth=10, min_samples_leaf=5, random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=2000, C=1.0, penalty='l2', random_state=42),
    "SVM": SVC(C=1.0, kernel='rbf', gamma='scale')
}

accuracies = {}

# 🔧 Ensure 'outputs' folder exists
os.makedirs("outputs", exist_ok=True)

# 8. Train and evaluate models
for name, model in models.items():
    print(f"\n--- {name} ---")
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    train_acc = accuracy_score(y_train, y_train_pred)
    test_acc = accuracy_score(y_test, y_test_pred)
    accuracies[name] = test_acc

    print(f"Train Accuracy: {train_acc:.3f}")
    print(f"Test Accuracy:  {test_acc:.3f}")
    print("Classification Report:\n", classification_report(y_test, y_test_pred))

    # Plot confusion matrix
    cm = confusion_matrix(y_test, y_test_pred)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
    plt.title(f"{name} - Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.tight_layout()
    plt.savefig(f"outputs/{name}_confusion_matrix_improved.png", dpi=300)
    plt.close()

# 9. Plot accuracy comparison
plt.figure(figsize=(8, 5))
sns.barplot(x=list(accuracies.keys()), y=list(accuracies.values()), palette="viridis")
plt.title("Improved Model Accuracy Comparison")
plt.ylabel("Accuracy")
plt.ylim(0, 1)
plt.xticks(rotation=30)
plt.tight_layout()
plt.savefig("outputs/improved_accuracy_comparison.png", dpi=300)
plt.close()



--- Random Forest ---
Train Accuracy: 0.999
Test Accuracy:  0.892
Classification Report:
               precision    recall  f1-score   support

          D1       0.85      0.82      0.84        85
          D2       0.82      0.80      0.81        85
        HCCD       0.98      0.94      0.96        86
        LCCD       0.89      0.86      0.88        86
        MCCD       0.88      0.97      0.92        86
           N       0.93      1.00      0.97        85
          PD       1.00      0.95      0.98        86
          T1       0.95      0.92      0.93        85
          T2       0.77      0.88      0.82        85
          T3       0.86      0.78      0.82        86

    accuracy                           0.89       855
   macro avg       0.89      0.89      0.89       855
weighted avg       0.89      0.89      0.89       855


--- Extra Trees ---
Train Accuracy: 0.935
Test Accuracy:  0.842
Classification Report:
               precision    recall  f1-score   support

      


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=list(accuracies.keys()), y=list(accuracies.values()), palette="viridis")


In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import os

# تأكد من وجود مجلد لحفظ الصور
os.makedirs("outputs", exist_ok=True)

# 1. تحميل البيانات الأصلية للتدريب
df_full = pd.read_csv("fault type.csv")
df_full.fillna(df_full.mean(numeric_only=True), inplace=True)

# 2. فصل الخصائص (X) والهدف (y)
X_train = df_full.drop("fault type", axis=1)
y_train = df_full["fault type"]

# 3. تحميل البيانات بدون عمود "fault type" للتنبؤ
df_unlabeled = pd.read_csv("fault type.csv").drop("fault type", axis=1)
df_unlabeled.fillna(df_unlabeled.mean(numeric_only=True), inplace=True)

# 4. اختيار أفضل الخصائص
selector = SelectKBest(score_func=f_classif, k=min(15, X_train.shape[1]))
X_train_selected = selector.fit_transform(X_train, y_train)
X_unlabeled_selected = selector.transform(df_unlabeled)

# 5. تطبيع البيانات
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_selected)
X_unlabeled_scaled = scaler.transform(X_unlabeled_selected)

# 6. تدريب النموذج
model = RandomForestClassifier(n_estimators=200, max_depth=15, min_samples_leaf=2, random_state=42)
model.fit(X_train_scaled, y_train)

# 7. التنبؤ بنوع العطل للبيانات غير المصنفة
predictions = model.predict(X_unlabeled_scaled)

# 8. حفظ النتائج في ملف جديد
df_predictions = df_unlabeled.copy()
df_predictions["Predicted Fault Type"] = predictions
df_predictions.to_csv("predicted_faults.csv", index=False)
print("✅ تم التنبؤ بنوع العطل، والنتائج محفوظة في ملف predicted_faults.csv")

# 9. حساب الدقة ونسبة التنبؤات الصحيحة
accuracy = accuracy_score(y_train, predictions)
print(f"🔍 نسبة التنبؤات الصحيحة: {accuracy * 100:.2f}%")
print("📊 تقرير التصنيف:\n", classification_report(y_train, predictions))

# 10. رسم مصفوفة الارتباك
cm = confusion_matrix(y_train, predictions)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=model.classes_, yticklabels=model.classes_)
plt.title("Confusion Matrix - Random Forest")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.tight_layout()
plt.savefig("outputs/confusion_matrix.png", dpi=300)
plt.close()
print("📈 تم حفظ مصفوفة الارتباك في outputs/confusion_matrix.png")

✅ تم التنبؤ بنوع العطل، والنتائج محفوظة في ملف predicted_faults.csv
🔍 نسبة التنبؤات الصحيحة: 98.71%
📊 تقرير التصنيف:
               precision    recall  f1-score   support

          D1       1.00      1.00      1.00        63
          D2       0.99      1.00      0.99        80
        HCCD       1.00      0.82      0.90        11
        LCCD       0.98      1.00      0.99       171
        MCCD       0.98      1.00      0.99        61
           N       1.00      1.00      1.00        97
          PD       1.00      0.96      0.98        23
          T1       1.00      0.92      0.96        13
          T2       0.98      0.89      0.93        45
          T3       0.98      1.00      0.99       131

    accuracy                           0.99       695
   macro avg       0.99      0.96      0.97       695
weighted avg       0.99      0.99      0.99       695

📈 تم حفظ مصفوفة الارتباك في outputs/confusion_matrix.png
