<a href="https://colab.research.google.com/github/nourhan-transformerML/Advanced-Intelligent-Fault-Diagnosis-of-Power-Transformers-Using-Machine-Learning-on-DGA-Data/blob/main/predicted_output_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
# Transformer Fault Diagnosis with SHAP and CatBoost - Complete Optimized Code
# Author: [Your Name]
# Date: [Current Date]

# Install required packages
CATBOOST_AVAILABLE = False
try:
    from catboost import CatBoostClassifier
    CATBOOST_AVAILABLE = True
except ImportError:
    print("CatBoost not available. Proceeding without CatBoost.")

# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time  # IMPORT TIME AT TOP LEVEL
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix,
                           f1_score, precision_score, recall_score, roc_auc_score,
                           roc_curve, auc)
from sklearn.ensemble import (GradientBoostingClassifier,
                            ExtraTreesClassifier, VotingClassifier, StackingClassifier)
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from docx import Document
from docx.shared import Inches, Pt, RGBColor
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from imblearn.over_sampling import SMOTE, RandomOverSampler
from collections import Counter
import shap
import warnings
import os
warnings.filterwarnings('ignore')

# ======================
# CUSTOM ROC PLOT FUNCTION
# ======================
def plot_roc_curve(y_true, y_proba, title='ROC Curves'):
    """Custom function to plot ROC curves"""
    plt.figure(figsize=(10, 8))
    if y_proba.ndim == 1 or y_proba.shape[1] == 2:
        fpr, tpr, _ = roc_curve(y_true, y_proba if y_proba.ndim == 1 else y_proba[:, 1])
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.2f})')
    else:
        n_classes = y_proba.shape[1]
        for i in range(n_classes):
            fpr, tpr, _ = roc_curve(y_true == i, y_proba[:, i])
            roc_auc = auc(fpr, tpr)
            plt.plot(fpr, tpr, label=f'Class {i} (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(title)
    plt.legend(loc="lower right")
    plt.tight_layout()
    return plt

# ======================
# MODEL TRAINING AND EVALUATION
# ======================
def main():
    try:
        print("Starting analysis...")
        start_time = time.time()  # NOW TIME IS DEFINED

        # Parameters
        random_state = 42
        test_size = 0.25
        n_splits = 5
        top_features = 5

        # 1. Load and preprocess data
        print("\n[1/5] Loading and preprocessing data...")
        try:
            df = pd.read_csv("data_transformer.csv")
            if df.empty:
                raise ValueError("Dataset is empty")
            if "fault type" not in df.columns:
                raise ValueError("'fault type' column not found")

            # Store original fault type names
            fault_types = df["fault type"].unique()
            print("Found fault types:", fault_types)

            # Encode target
            label_encoder = LabelEncoder()
            original_labels = df["fault type"].copy()
            df["fault type encoded"] = label_encoder.fit_transform(df["fault type"])
            print("\nClass distribution:", Counter(df["fault type encoded"]))

        except Exception as e:
            print(f"Data loading error: {str(e)}")
            return None

        # 2. Prepare data
        print("\n[2/5] Preparing data...")
        X = df.drop(["fault type", "fault type encoded"], axis=1)
        y = df["fault type encoded"]
        feature_names = X.columns.tolist()
        print("Features used:", feature_names)

        # Scale features
        scaler = RobustScaler()
        X_scaled = scaler.fit_transform(X)
        X_scaled = pd.DataFrame(X_scaled, columns=feature_names)

        # Handle imbalance
        min_samples = min(Counter(y).values())
        n_neighbors = min(5, min_samples - 1) if min_samples > 1 else 1

        try:
            smote = SMOTE(k_neighbors=n_neighbors, random_state=random_state)
            X_res, y_res = smote.fit_resample(X_scaled, y)
            print(f"Used SMOTE with k_neighbors={n_neighbors}")
        except:
            print("Using RandomOverSampler")
            ros = RandomOverSampler(random_state=random_state)
            X_res, y_res = ros.fit_resample(X_scaled, y)

        # Train-test split
        X_train, X_test, y_train, y_test = train_test_split(
            X_res, y_res,
            test_size=test_size,
            random_state=random_state,
            stratify=y_res
        )

        # 3. Initialize models
        print("\n[3/5] Initializing models...")
        models = {
            "LightGBM": LGBMClassifier(
                n_estimators=200,
                max_depth=-1,
                learning_rate=0.05,
                random_state=random_state,
                class_weight='balanced'
            ),
            "Gradient Boosting": GradientBoostingClassifier(
                n_estimators=200,
                max_depth=5,
                learning_rate=0.05,
                random_state=random_state
            ),
            "Extra Trees": ExtraTreesClassifier(
                n_estimators=200,
                max_depth=10,
                random_state=random_state,
                class_weight='balanced'
            ),
            "MLP": MLPClassifier(
                hidden_layer_sizes=(100, 50),
                max_iter=300,
                random_state=random_state,
                early_stopping=True
            ),
            "StackingClassifier": StackingClassifier(
                estimators=[
                    ('lgbm', LGBMClassifier(n_estimators=200, max_depth=-1, learning_rate=0.05,
                                          random_state=random_state, class_weight='balanced')),
                    ('et', ExtraTreesClassifier(n_estimators=200, max_depth=10, random_state=random_state, class_weight='balanced'))
                ],
                final_estimator=LogisticRegression(max_iter=1000, random_state=random_state),
                stack_method='auto'
            )
        }

        if CATBOOST_AVAILABLE:
            models["CatBoost"] = CatBoostClassifier(
                iterations=200,
                depth=6,
                learning_rate=0.05,
                random_state=random_state,
                verbose=0
            )

        # 4. Train and evaluate models
        print("\n[4/5] Training and evaluating models...")
        results = []
        best_model = {'name': None, 'model': None, 'accuracy': 0}

        for name, model in models.items():
            try:
                print(f"\nTraining {name}...")
                model_start = time.time()

                # Cross-validation
                cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
                cv_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy')

                # Train and predict
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)
                y_proba = model.predict_proba(X_test) if hasattr(model, "predict_proba") else None

                # Calculate metrics
                accuracy = accuracy_score(y_test, y_pred)
                precision = precision_score(y_test, y_pred, average='macro')
                recall = recall_score(y_test, y_pred, average='macro')
                f1 = f1_score(y_test, y_pred, average='macro')
                roc_auc = roc_auc_score(y_test, y_proba, multi_class='ovr') if y_proba is not None else None

                # Store results
                results.append({
                    "Model": name,
                    "Accuracy": accuracy,
                    "Precision": precision,
                    "Recall": recall,
                    "F1-Score": f1,
                    "ROC AUC": roc_auc if roc_auc is not None else 'N/A',
                    "CV Score": f"{np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}",
                    "Time": f"{time.time() - model_start:.2f}s"
                })

                # Update best model
                if accuracy > best_model['accuracy']:
                    best_model = {
                        'name': name,
                        'model': model,
                        'accuracy': accuracy,
                        'predictions': y_pred,
                        'probabilities': y_proba
                    }

                print(f"{name} completed in {time.time() - model_start:.2f} seconds")

            except Exception as e:
                print(f"Error with {name}: {str(e)}")
                continue

        # 5. Generate final report
        print("\n[5/5] Generating final report...")
        report_time = time.time()

        # Create comparison table
        results_df = pd.DataFrame(results)
        print("\nModel Performance Comparison:")
        print(results_df.to_markdown(index=False))

        # Prepare return object
        result = {
            'best_model': best_model,
            'label_encoder': label_encoder,
            'scaler': scaler,
            'feature_names': feature_names,
            'original_data': {
                'X': X,
                'y_true': original_labels,
                'y_encoded': y
            }
        }

        print(f"\nTotal execution time: {time.time() - start_time:.2f} seconds")
        return result

    except KeyboardInterrupt:
        print("\nProcess interrupted by user")
        return None
    except Exception as e:
        print(f"\nError in main process: {str(e)}")
        return None

if __name__ == "__main__":
    result = main()

    if result is not None:
        from joblib import dump
        import numpy as np
        from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

        # Extract components from result
        best_model = result['best_model']
        label_encoder = result['label_encoder']
        scaler = result['scaler']
        feature_names = result['feature_names']
        X_original = result['original_data']['X']
        y_true_string = result['original_data']['y_true']
        y_true_encoded = result['original_data']['y_encoded']

        # Scale original data
        X_original_scaled = scaler.transform(X_original)
        X_original_scaled = pd.DataFrame(X_original_scaled, columns=feature_names)

        # Predict using best model
        model = best_model['model']
        y_pred = model.predict(X_original_scaled)
        y_proba = model.predict_proba(X_original_scaled)

        # Calculate overall accuracy
        overall_accuracy = accuracy_score(y_true_encoded, y_pred)

        # Calculate accuracy per fault type
        fault_types = label_encoder.classes_
        fault_results = []

        for i, fault_type in enumerate(fault_types):
            # Create mask for current fault type
            mask = (y_true_encoded == i)

            # Calculate metrics
            correct = np.sum(y_pred[mask] == i)
            total = np.sum(mask)
            accuracy = correct / total if total > 0 else 0.0
            avg_prob = np.mean(y_proba[mask, i])

            fault_results.append({
                'Fault Type': fault_type,
                'Accuracy': accuracy,
                'Correct Predictions': correct,
                'Total Samples': total,
                'Avg Probability': f"{avg_prob:.2%}"
            })

        # Create DataFrame for fault accuracy
        fault_accuracy_df = pd.DataFrame(fault_results)

        # Add overall accuracy as a row
        overall_row = pd.DataFrame({
            'Fault Type': ['OVERALL ACCURACY'],
            'Accuracy': [overall_accuracy],
            'Correct Predictions': [np.sum(y_pred == y_true_encoded)],
            'Total Samples': [len(y_true_encoded)],
            'Avg Probability': ['N/A']
        })

        final_results_df = pd.concat([fault_accuracy_df, overall_row], ignore_index=True)

        # Create model summary
        model_summary_df = pd.DataFrame({
            'Best Model': [best_model['name']],
            'Overall Accuracy': [f"{overall_accuracy:.2%}"],
            'Macro Precision': [f"{precision_score(y_true_encoded, y_pred, average='macro'):.2%}"],
            'Macro Recall': [f"{recall_score(y_true_encoded, y_pred, average='macro'):.2%}"],
            'Macro F1-Score': [f"{f1_score(y_true_encoded, y_pred, average='macro'):.2%}"]
        })

        # Save to Excel
        with pd.ExcelWriter('fault_accuracy_report.xlsx') as writer:
            final_results_df.to_excel(writer, sheet_name='Fault Accuracy', index=False)
            model_summary_df.to_excel(writer, sheet_name='Model Summary', index=False)

        print("\nReport saved to 'fault_accuracy_report.xlsx'")
        print("\nAccuracy per fault type and overall accuracy:")
        print(final_results_df.to_markdown(index=False))

    else:
        print("\nAnalysis failed or was interrupted")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000116 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1172
[LightGBM] [Info] Number of data points in the train set: 1128, number of used features: 7
[LightGBM] [Info] Start training from score -2.397895
[LightGBM] [Info] Start training from score -2.397895
[LightGBM] [Info] Start training from score -2.397895
[LightGBM] [Info] Start training from score -2.397895
[LightGBM] [Info] Start training from score -2.397895
[LightGBM] [Info] Start training from score -2.397895
[LightGBM] [Info] Start training from score -2.397895
[LightGBM] [Info] Start training from score -2.397895
[LightGBM] [Info] Start training from score -2.397895
[LightGBM] [Info] Start training from score -2.397895
[LightGBM] [Info] Start training from score -2.397895
[LightGBM] [Info] Auto-choosing col-wise multi-threadi