In [None]:
import pandas as pd
import numpy as np
import json
from tqdm import tqdm
import os
import math
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC  # Import SVM Classifier
from sklearn.metrics import ( # Import evaluation metrics
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
    roc_curve
)
import matplotlib.pyplot as plt # For plotting ROC curve
import traceback

    # 5. --- Train SVM Classifier ---
                    print("\n--- Training Support Vector Machine (SVM) ---")
                    # Baseline classifier: RBF-kernel SVC where C sets the regularization strength.
                    # probability=True is what makes predict_proba available afterwards,
                    # at the price of a slower fit.
                    svm_hyperparams = dict(
                        kernel='rbf',
                        C=1.0,
                        probability=True,
                        random_state=42,
                    )
                    svm_model = SVC(**svm_hyperparams)

                    # Report the training-set size before the (potentially slow) fit starts.
                    n_train_samples = X_train_scaled.shape[0]
                    print(f"Training SVM on {n_train_samples} samples...")
                    svm_model.fit(X_train_scaled, y_train)
                    print("Training complete.")

                    # 6. --- Evaluate SVM Model ---
                    # Evaluation only runs when a non-empty test split exists.
                    if X_test_scaled is not None and y_test is not None and X_test_scaled.shape[0] > 0:
                        print("\n--- Evaluating SVM Model on Test Set ---")
                        y_pred_svm = svm_model.predict(X_test_scaled)
                        # Column 1 of predict_proba is used as the score for class 1.
                        y_pred_proba_svm = svm_model.predict_proba(X_test_scaled)[:, 1]

                        # Label-based metrics.
                        accuracy_svm = accuracy_score(y_test, y_pred_svm)
                        report_svm = classification_report(y_test, y_pred_svm)
                        cm_svm = confusion_matrix(y_test, y_pred_svm)

                        # ROC AUC is undefined when the test labels contain a single class:
                        # roc_auc_score raises ValueError in that case, which the surrounding
                        # `except ValueError` handler would report as a processing/splitting
                        # error and which would also skip the tuning step. Guard it explicitly.
                        roc_auc_defined = np.unique(y_test).size > 1
                        if roc_auc_defined:
                            roc_auc_svm = roc_auc_score(y_test, y_pred_proba_svm)
                        else:
                            roc_auc_svm = float("nan")

                        # Print metrics
                        print(f"SVM Accuracy: {accuracy_svm:.4f}")
                        print("\nSVM Classification Report:")
                        print(report_svm)
                        print("\nSVM Confusion Matrix:")
                        print(cm_svm)

                        if roc_auc_defined:
                            print(f"\nSVM ROC AUC Score: {roc_auc_svm:.4f}")

                            # Plot ROC curve against the chance diagonal.
                            fpr, tpr, _ = roc_curve(y_test, y_pred_proba_svm)
                            plt.figure(figsize=(8, 6))
                            plt.plot(fpr, tpr, color='blue', lw=2, label=f'SVM ROC curve (area = {roc_auc_svm:.2f})')
                            plt.plot([0, 1], [0, 1], color='grey', lw=2, linestyle='--')
                            plt.xlim([0.0, 1.0])
                            plt.ylim([0.0, 1.05])
                            plt.xlabel('False Positive Rate')
                            plt.ylabel('True Positive Rate')
                            plt.title('Receiver Operating Characteristic (ROC) - SVM')
                            plt.legend(loc="lower right")
                            plt.grid(alpha=0.3)
                            # To persist the figure, call plt.savefig('svm_roc_curve.png') before plt.show().
                            plt.show()
                        else:
                            print("\nSVM ROC AUC Score: undefined (test set contains a single class); skipping ROC curve.")

                    else:
                        print("\nNo test data available for evaluation.")


                    # 7. --- Hyperparameter Tuning (Optional but Recommended) ---
                    print("\n--- Hyperparameter Tuning for SVM (using GridSearchCV) ---")
                    # Search space: 4 values of C x 4 values of gamma, RBF kernel only.
                    # Widen or narrow the ranges once initial results are in.
                    param_grid_svm = dict(
                        C=[0.1, 1, 10, 50],                 # regularization parameter
                        gamma=['scale', 'auto', 0.1, 1],    # kernel coefficient for 'rbf'
                        kernel=['rbf'],                     # 'linear' / 'poly' could be added here
                    )

                    # probability=True is kept so that the refit best estimator exposes
                    # predict_proba, which the test-set evaluation that follows relies on.
                    base_svm_estimator = SVC(probability=True, random_state=42)

                    # 3-fold CV keeps tuning quick (raise to e.g. 5 for more robust estimates);
                    # candidates are ranked by ROC AUC, all CPU cores are used, and progress is shown.
                    grid_search_svm = GridSearchCV(
                        estimator=base_svm_estimator,
                        param_grid=param_grid_svm,
                        scoring='roc_auc',
                        cv=3,
                        n_jobs=-1,
                        verbose=1,
                    )

                    print("Running GridSearchCV...")
                    grid_search_svm.fit(X_train_scaled, y_train)

                    # Summarize the winning configuration and its mean cross-validated score.
                    print("\nGridSearchCV Complete.")
                    print(f"Best Parameters found: {grid_search_svm.best_params_}")
                    print(f"Best cross-validation ROC AUC score: {grid_search_svm.best_score_:.4f}")

                    # Evaluate the best model found by GridSearchCV on the test set.
                    # Skipped entirely when no non-empty test split exists.
                    if X_test_scaled is not None and y_test is not None and X_test_scaled.shape[0] > 0:
                        print("\n--- Evaluating Best SVM Model from GridSearchCV on Test Set ---")
                        best_svm_model = grid_search_svm.best_estimator_
                        y_pred_best_svm = best_svm_model.predict(X_test_scaled)
                        # Column 1 of predict_proba is used as the score for class 1.
                        y_pred_proba_best_svm = best_svm_model.predict_proba(X_test_scaled)[:, 1]

                        accuracy_best_svm = accuracy_score(y_test, y_pred_best_svm)
                        report_best_svm = classification_report(y_test, y_pred_best_svm)

                        # roc_auc_score raises ValueError when y_test holds a single class; the
                        # surrounding `except ValueError` handler would then mislabel it as a
                        # processing/splitting error. Only compute the score when it is defined.
                        roc_auc_defined = np.unique(y_test).size > 1
                        if roc_auc_defined:
                            roc_auc_best_svm = roc_auc_score(y_test, y_pred_proba_best_svm)
                        else:
                            roc_auc_best_svm = float("nan")

                        print(f"Best SVM Accuracy: {accuracy_best_svm:.4f}")
                        print("\nBest SVM Classification Report:")
                        print(report_best_svm)
                        if roc_auc_defined:
                            print(f"\nBest SVM ROC AUC Score: {roc_auc_best_svm:.4f}")
                        else:
                            print("\nBest SVM ROC AUC Score: undefined (test set contains a single class).")
                    else:
                        print("\nNo test data available for evaluating the best tuned model.")

                else:
                     print("\nSkipping SVM training and evaluation as training data is unavailable.")

            except ValueError as e:
                 print(f"\nError during processing or splitting: {e}")

        else:
            print("\nFeature extraction failed or resulted in empty data. Cannot proceed.")
    else:
        print("\nFailed to load JSON data. Cannot proceed.")