In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import roc_curve
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [None]:
from google.colab import drive

# Mount Google Drive so the CSV shards under MyDrive can be read from Colab.
drive.mount('/content/drive')

# The dataset is stored as three CSV shards; each is kept as its own frame.
data1 = pd.read_csv('/content/drive/MyDrive/dosdata/data1.csv')
data2 = pd.read_csv('/content/drive/MyDrive/dosdata/data2.csv')
data3 = pd.read_csv('/content/drive/MyDrive/dosdata/data3.csv')

# Stack the shards row-wise; ignore_index=True renumbers the rows 0..n-1.
train_data = pd.concat([data1, data2, data3], ignore_index=True)

# Remove the 'Unnamed: 0' column (presumably a saved row index -- TODO confirm).
train_data = train_data.drop(columns='Unnamed: 0')

# One mapping drives both the encoding and the printed legend, so the two
# can never disagree.
label_encoding = {"Benign": 0, "Anomaly": 1}

# The explicit astype() guarantees an integer label column instead of relying
# on replace() to downcast the object column on its own; it also fails fast
# if any value is left that cannot be interpreted as an integer.
train_data["Label"] = train_data["Label"].replace(label_encoding).astype("int64")

for label, index in label_encoding.items():
    print(f"{label}: {index}")

Mounted at /content/drive
Benign: 0
Anomaly: 1


In [None]:
# Class distribution before balancing.
print(train_data["Label"].value_counts())

from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

# Split the frame into the feature table and the label column.
X_all = train_data.drop(columns="Label")
y_all = train_data["Label"]

# Randomly discard majority-class rows until both classes have the same size.
# The fixed seed makes the selected rows -- and therefore every result computed
# from X and y further down -- repeatable across kernel restarts.
undersample = RandomUnderSampler(sampling_strategy='majority', random_state=42)
X, y = undersample.fit_resample(X_all, y_all)

# Class distribution after balancing.
print(Counter(y))

Label
0    1567950
1    1000448
Name: count, dtype: int64
Counter({0: 1000448, 1: 1000448})


In [None]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import confusion_matrix, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns

def evaluate_fold_performance(estimator, X, y, fold_idx, fold):
    """Fit a fresh copy of ``estimator`` on one CV fold and score the held-out rows.

    The fold's training rows are standardized with a ``StandardScaler`` fitted
    on those rows only; the validation rows are transformed with that same
    scaler before prediction.

    Args:
        estimator: scikit-learn classifier exposing ``fit``, ``predict`` and
            ``predict_proba``. It is cloned before fitting, so the object the
            caller passed in is never refitted.
        X: Feature table supporting positional ``.iloc`` indexing.
        y: Label vector supporting positional ``.iloc`` indexing. The
            confusion matrix is unpacked into four cells, so the labels are
            expected to be binary.
        fold_idx: Zero-based fold number (reported one-based under ``'fold'``).
        fold: ``(train_idx, val_idx)`` pair of positional row indices.

    Returns:
        tuple: ``(fold_metrics, y_pred_proba, y_fold_val)`` where
        ``fold_metrics`` is a dict with the keys ``fold``, ``accuracy``,
        ``precision``, ``recall``, ``f1``, ``auc_roc``, ``tn``, ``fp``, ``fn``
        and ``tp``; ``y_pred_proba`` is column 1 of ``predict_proba`` for the
        validation rows; ``y_fold_val`` holds the validation labels.
    """
    # Local import: none of the notebook's import cells bring `clone` into scope.
    from sklearn.base import clone

    train_idx, val_idx = fold
    X_fold_train, X_fold_val = X.iloc[train_idx], X.iloc[val_idx]
    y_fold_train, y_fold_val = y.iloc[train_idx], y.iloc[val_idx]

    # Scaling statistics come from the training part of the fold only.
    scaler = StandardScaler()
    X_fold_train = scaler.fit_transform(X_fold_train)
    X_fold_val = scaler.transform(X_fold_val)

    # Fit an unfitted copy with the same hyper-parameters. Fitting `estimator`
    # itself would overwrite whatever fit the caller already holds (for
    # example the model refitted by GridSearchCV).
    fold_model = clone(estimator)
    fold_model.fit(X_fold_train, y_fold_train)
    y_pred = fold_model.predict(X_fold_val)
    y_pred_proba = fold_model.predict_proba(X_fold_val)[:, 1]

    # Named `fold_metrics` so the imported `sklearn.metrics` module is not shadowed.
    fold_metrics = {
        'fold': fold_idx + 1,
        'accuracy': accuracy_score(y_fold_val, y_pred),
        'precision': precision_score(y_fold_val, y_pred),
        'recall': recall_score(y_fold_val, y_pred),
        'f1': f1_score(y_fold_val, y_pred),
        'auc_roc': roc_auc_score(y_fold_val, y_pred_proba)
    }

    # For a 2x2 matrix ravel() yields the cells in the order tn, fp, fn, tp.
    cm = confusion_matrix(y_fold_val, y_pred)
    (fold_metrics['tn'], fold_metrics['fp'],
     fold_metrics['fn'], fold_metrics['tp']) = cm.ravel()

    return fold_metrics, y_pred_proba, y_fold_val

def optimize_svm_detailed(X, y, cv=5):
    """Grid-search an SVC per kernel, report per-fold metrics and pick the best kernel.

    For each of the linear, RBF and polynomial kernels the function:

    1. runs ``GridSearchCV`` (accuracy, stratified K-fold) over a
       ``StandardScaler`` -> ``SVC`` pipeline, so the scaler is refitted inside
       every CV split and never sees the validation rows;
    2. re-evaluates the winning configuration fold by fold with
       ``evaluate_fold_performance`` and prints the metrics;
    3. plots one ROC curve per fold.

    Args:
        X: Raw (unscaled) feature table; must support ``.iloc``.
        y: Binary label vector; must support ``.iloc``.
        cv: Number of stratified folds.

    Returns:
        tuple: ``(best_model, best_results, metrics_df)``.
        ``best_model`` is the fitted ``SVC`` of the best-scoring kernel. It was
        trained on features standardized with a scaler fitted on all of ``X``,
        so callers must standardize new data with a scaler fitted on the same
        training data before calling ``predict``.
        ``best_results`` maps kernel name to ``{'model', 'params', 'score'}``.
        ``metrics_df`` has one row per (kernel, fold).
    """
    # Local imports: the notebook's import cells do not provide these names.
    from sklearn.base import clone
    from sklearn.pipeline import Pipeline

    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)

    # The parameter ranges were deliberately narrowed.
    # The kernel must be set explicitly: SVC() defaults to the RBF kernel, so
    # without it every entry would train the same kind of model.
    # probability=True is required for the predict_proba call made in
    # evaluate_fold_performance.
    svm_kernels = {
        'linear': (SVC(kernel='linear', probability=True), {
            'C': [0.1, 1],
        }),
        'rbf': (SVC(kernel='rbf', probability=True), {
            'C': [1],
            'gamma': ['scale', 0.1],
        }),
        'poly': (SVC(kernel='poly', probability=True), {
            'C': [1],
            'degree': [2, 3],
            'gamma': ['scale'],
        })
    }

    best_results = {}
    all_metrics = []

    for kernel_type, (classifier, param_grid) in svm_kernels.items():
        print(f"\n{'-'*50}")
        print(f"Evaluating SVM with {kernel_type} kernel...")

        # Scaling lives inside the pipeline so each CV split fits its own
        # scaler on the training part only.
        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('svc', classifier),
        ])

        grid_search = GridSearchCV(
            estimator=pipeline,
            # Pipeline parameters are addressed as '<step>__<parameter>'.
            param_grid={f'svc__{name}': values for name, values in param_grid.items()},
            cv=skf,
            scoring='accuracy',
            n_jobs=-1,
            verbose=1
        )

        grid_search.fit(X, y)

        # Strip the 'svc__' prefix so the reported parameters are plain SVC names.
        best_params = {
            name.split('__', 1)[1]: value
            for name, value in grid_search.best_params_.items()
        }

        print(f"\nBest Parameters for {kernel_type}:", best_params)
        print(f"Best Cross-Validation Score: {grid_search.best_score_:.4f}")

        # The SVC step of the refitted pipeline: trained on all of X after
        # standardization.
        best_model = grid_search.best_estimator_.named_steps['svc']
        fold_metrics = []
        fold_predictions = []
        fold_true_values = []

        for fold_idx, fold in enumerate(skf.split(X, y)):
            # Pass raw X (the helper standardizes per fold) and a clone, so
            # the refitted best_model is not overwritten by a fold-level fit.
            fold_scores, y_pred_proba, y_true = evaluate_fold_performance(
                clone(best_model), X, y, fold_idx, fold
            )
            fold_scores['kernel_type'] = kernel_type
            fold_metrics.append(fold_scores)
            fold_predictions.append(y_pred_proba)
            fold_true_values.append(y_true)

            print(f"\nFold {fold_idx + 1} Metrics:")
            print(f"Accuracy: {fold_scores['accuracy']:.4f}")
            print(f"Precision: {fold_scores['precision']:.4f}")
            print(f"Recall: {fold_scores['recall']:.4f}")
            print(f"F1-Score: {fold_scores['f1']:.4f}")
            print(f"AUC-ROC: {fold_scores['auc_roc']:.4f}")
            print("\nConfusion Matrix:")
            print(f"TN: {fold_scores['tn']}, FP: {fold_scores['fp']}")
            print(f"FN: {fold_scores['fn']}, TP: {fold_scores['tp']}")

        all_metrics.extend(fold_metrics)

        # One ROC curve per fold, plus the chance diagonal for reference.
        fig, ax = plt.subplots(figsize=(8, 6))
        for fold_scores, y_true, y_score in zip(
            fold_metrics, fold_true_values, fold_predictions
        ):
            fpr, tpr, _ = roc_curve(y_true, y_score)
            ax.plot(
                fpr, tpr,
                label=f'Fold {fold_scores["fold"]} (AUC = {fold_scores["auc_roc"]:.4f})'
            )

        ax.plot([0, 1], [0, 1], 'k--')
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive Rate')
        ax.set_title(f'ROC Curves for SVM with {kernel_type} kernel (All Folds)')
        ax.legend()
        plt.show()

        best_results[kernel_type] = {
            'model': best_model,
            'params': best_params,
            'score': grid_search.best_score_
        }

    metrics_df = pd.DataFrame(all_metrics)

    print("\nAverage Metrics by Kernel Type:")
    avg_metrics = metrics_df.groupby('kernel_type').agg({
        'accuracy': ['mean', 'std'],
        'precision': ['mean', 'std'],
        'recall': ['mean', 'std'],
        'f1': ['mean', 'std'],
        'auc_roc': ['mean', 'std']
    })
    print(avg_metrics)

    # The kernel with the highest mean CV accuracy wins (first one on ties).
    best_kernel_type = max(best_results, key=lambda name: best_results[name]['score'])
    best_model = best_results[best_kernel_type]['model']

    print(f"\nBest performing kernel: {best_kernel_type}")
    print(f"Best parameters: {best_results[best_kernel_type]['params']}")

    return best_model, best_results, metrics_df

In [None]:
# Hold out 20% of X / y as a test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Tune and evaluate the SVMs on the training split only.
best_model, best_results, metrics_df = optimize_svm_detailed(
    X=X_train,
    y=y_train,
)


--------------------------------------------------
Evaluating SVM with linear kernel...
Fitting 5 folds for each of 2 candidates, totalling 10 fits


In [None]:
# Standardize the test set with statistics learned from the training split.
# Fitting the scaler on X_test itself would shift and rescale the test
# features differently from the features the model was trained on.
scaler = StandardScaler().fit(X_train)
X_test_scaled = scaler.transform(X_test)
y_pred = best_model.predict(X_test_scaled)
print("\nFinal Test Set Performance:")
print(classification_report(y_test, y_pred))

# Confusion matrix heatmap: rows are true classes, columns are predictions.
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title('Confusion Matrix on Test Set')
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
plt.show()

In [None]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import confusion_matrix, classification_report, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns

def evaluate_fold_performance_svm(estimator, X, y, fold_idx, fold):
    """
    Evaluate model performance for a single fold in SVM.

    A fresh copy of ``estimator`` is fitted on the fold's training rows (after
    standardizing them with a ``StandardScaler`` fitted on those rows only)
    and scored on the held-out rows.

    Args:
        estimator: scikit-learn classifier exposing ``fit``, ``predict`` and
            ``decision_function``. It is cloned before fitting, so the object
            the caller passed in is never refitted.
        X: Feature table supporting positional ``.iloc`` indexing.
        y: Label vector supporting positional ``.iloc`` indexing. The
            confusion matrix is unpacked into four cells, so the labels are
            expected to be binary.
        fold_idx: Zero-based fold number (reported one-based under ``'fold'``).
        fold: ``(train_idx, val_idx)`` pair of positional row indices.

    Returns:
        tuple: ``(fold_metrics, y_scores, y_fold_val)`` where ``fold_metrics``
        is a dict with the keys ``fold``, ``accuracy``, ``precision``,
        ``recall``, ``f1``, ``auc_roc``, ``tn``, ``fp``, ``fn`` and ``tp``;
        ``y_scores`` holds the ``decision_function`` values for the validation
        rows (signed scores, not probabilities); ``y_fold_val`` holds the
        validation labels.
    """
    # Local import: none of the notebook's import cells bring `clone` into scope.
    from sklearn.base import clone

    train_idx, val_idx = fold
    X_fold_train, X_fold_val = X.iloc[train_idx], X.iloc[val_idx]
    y_fold_train, y_fold_val = y.iloc[train_idx], y.iloc[val_idx]

    # Scaling statistics come from the training part of the fold only.
    scaler = StandardScaler()
    X_fold_train = scaler.fit_transform(X_fold_train)
    X_fold_val = scaler.transform(X_fold_val)

    # Fit an unfitted copy with the same hyper-parameters. Fitting `estimator`
    # itself would overwrite whatever fit the caller already holds (for
    # example the model refitted by GridSearchCV).
    fold_model = clone(estimator)
    fold_model.fit(X_fold_train, y_fold_train)
    y_pred = fold_model.predict(X_fold_val)
    # roc_auc_score / roc_curve only need a ranking score, so the raw decision
    # values are sufficient here.
    y_scores = fold_model.decision_function(X_fold_val)

    # Named `fold_metrics` so the imported `sklearn.metrics` module is not shadowed.
    fold_metrics = {
        'fold': fold_idx + 1,
        'accuracy': accuracy_score(y_fold_val, y_pred),
        'precision': precision_score(y_fold_val, y_pred),
        'recall': recall_score(y_fold_val, y_pred),
        'f1': f1_score(y_fold_val, y_pred),
        'auc_roc': roc_auc_score(y_fold_val, y_scores)
    }

    # For a 2x2 matrix ravel() yields the cells in the order tn, fp, fn, tp.
    cm = confusion_matrix(y_fold_val, y_pred)
    (fold_metrics['tn'], fold_metrics['fp'],
     fold_metrics['fn'], fold_metrics['tp']) = cm.ravel()

    return fold_metrics, y_scores, y_fold_val

def optimize_svm_detailed(X, y, cv=5):
    """Grid-search one SVC over C, kernel and gamma, then report per-fold metrics.

    NOTE: this redefines ``optimize_svm_detailed`` from an earlier cell and
    returns two values instead of three; whichever definition cell ran last
    is the one in effect.

    The search runs over a ``StandardScaler`` -> ``SVC`` pipeline, so the
    scaler is refitted inside every CV split and never sees the validation
    rows. The winning configuration is then re-evaluated fold by fold with
    ``evaluate_fold_performance_svm`` and one ROC curve per fold is plotted.

    Args:
        X: Raw (unscaled) feature table; must support ``.iloc``.
        y: Binary label vector; must support ``.iloc``.
        cv: Number of stratified folds.

    Returns:
        tuple: ``(best_model, metrics_df)``.
        ``best_model`` is the fitted ``SVC`` with the best parameters. It was
        trained on features standardized with a scaler fitted on all of ``X``,
        so callers must standardize new data with a scaler fitted on the same
        training data before calling ``predict``.
        ``metrics_df`` has one row per fold.
    """
    # Local imports: the notebook's import cells do not provide these names.
    from sklearn.base import clone
    from sklearn.pipeline import Pipeline

    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)

    svc_param_grid = {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf', 'poly'],
        'gamma': ['scale', 'auto']
    }

    # Scaling lives inside the pipeline so each CV split fits its own scaler
    # on the training part only.
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('svc', SVC(probability=True)),
    ])

    grid_search = GridSearchCV(
        estimator=pipeline,
        # Pipeline parameters are addressed as '<step>__<parameter>'.
        param_grid={f'svc__{name}': values for name, values in svc_param_grid.items()},
        cv=skf,
        scoring='accuracy',
        n_jobs=-1,
        verbose=1
    )

    grid_search.fit(X, y)

    # Strip the 'svc__' prefix so the reported parameters are plain SVC names.
    best_params = {
        name.split('__', 1)[1]: value
        for name, value in grid_search.best_params_.items()
    }

    print("\nBest Parameters for SVM:", best_params)
    print(f"Best Cross-Validation Score: {grid_search.best_score_:.4f}")

    # The SVC step of the refitted pipeline: trained on all of X after
    # standardization.
    best_model = grid_search.best_estimator_.named_steps['svc']
    fold_metrics = []
    fold_predictions = []
    fold_true_values = []

    for fold_idx, fold in enumerate(skf.split(X, y)):
        # Pass raw X (the helper standardizes per fold) and a clone, so the
        # refitted best_model is not overwritten by a fold-level fit.
        fold_scores, y_scores, y_true = evaluate_fold_performance_svm(
            clone(best_model), X, y, fold_idx, fold
        )
        fold_metrics.append(fold_scores)
        fold_predictions.append(y_scores)
        fold_true_values.append(y_true)

        print(f"\nFold {fold_idx + 1} Metrics:")
        print(f"Accuracy: {fold_scores['accuracy']:.4f}")
        print(f"Precision: {fold_scores['precision']:.4f}")
        print(f"Recall: {fold_scores['recall']:.4f}")
        print(f"F1-Score: {fold_scores['f1']:.4f}")
        print(f"AUC-ROC: {fold_scores['auc_roc']:.4f}")
        print("\nConfusion Matrix:")
        print(f"TN: {fold_scores['tn']}, FP: {fold_scores['fp']}")
        print(f"FN: {fold_scores['fn']}, TP: {fold_scores['tp']}")

    # One ROC curve per fold, plus the chance diagonal for reference.
    fig, ax = plt.subplots(figsize=(8, 6))
    for fold_scores, y_true, y_scores in zip(
        fold_metrics, fold_true_values, fold_predictions
    ):
        fpr, tpr, _ = roc_curve(y_true, y_scores)
        ax.plot(
            fpr, tpr,
            label=f'Fold {fold_scores["fold"]} (AUC = {fold_scores["auc_roc"]:.4f})'
        )

    ax.plot([0, 1], [0, 1], 'k--')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('ROC Curves for SVM (All Folds)')
    ax.legend()
    plt.show()

    metrics_df = pd.DataFrame(fold_metrics)

    print("\nAverage Metrics for SVM:")
    avg_metrics = metrics_df.agg({
        'accuracy': ['mean', 'std'],
        'precision': ['mean', 'std'],
        'recall': ['mean', 'std'],
        'f1': ['mean', 'std'],
        'auc_roc': ['mean', 'std']
    })
    print(avg_metrics)

    return best_model, metrics_df

# Hold out 20% of the data, then tune the SVM on the remaining 80%.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
best_model, metrics_df = optimize_svm_detailed(X_train, y_train)

# The returned model was trained on standardized features, so the test set has
# to go through the same transformation. The scaler is fitted on the training
# split only; predicting on raw X_test would feed the model features on a
# different scale from the one it was trained on.
test_scaler = StandardScaler().fit(X_train)
X_test_scaled = test_scaler.transform(X_test)

y_pred = best_model.predict(X_test_scaled)
print("\nFinal Test Set Performance:")
print(classification_report(y_test, y_pred))

# Confusion matrix heatmap: rows are true classes, columns are predictions.
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title('Confusion Matrix on Test Set')
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
plt.show()


Fitting 5 folds for each of 18 candidates, totalling 90 fits


In [None]:
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

def optimize_svm(X, y, cv=5):
    """Exhaustively search SVC hyper-parameters with stratified K-fold CV.

    Args:
        X: Feature matrix handed to ``GridSearchCV.fit`` as-is (no scaling is
            applied inside this function).
        y: Target labels.
        cv: Number of stratified, shuffled folds (seeded with 42).

    Returns:
        tuple: ``(best_estimator, top_results)`` -- the SVC refitted by the
        grid search with the best parameters, and a DataFrame holding the
        ``params``, ``mean_test_score`` and ``std_test_score`` of the five
        best-scoring candidates, best first.
    """
    search_space = {
        'C': [0.1, 1, 10, 100],
        'gamma': ['scale', 'auto'],
        'kernel': ['rbf', 'linear', 'poly', 'sigmoid']
    }
    folds = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)

    # Accuracy-scored search across every (C, gamma, kernel) combination,
    # using all available cores.
    search = GridSearchCV(
        estimator=SVC(random_state=42),
        param_grid=search_space,
        cv=folds,
        scoring='accuracy',
        n_jobs=-1,
        verbose=1
    )
    search.fit(X, y)

    print("Best Parameters:", search.best_params_)
    print("Best Cross-Validation Score:", search.best_score_)

    # Keep only the summary columns and the five highest mean scores.
    summary_columns = ['params', 'mean_test_score', 'std_test_score']
    top_results = (
        pd.DataFrame(search.cv_results_)[summary_columns]
        .sort_values('mean_test_score', ascending=False)
        .head()
    )

    return search.best_estimator_, top_results

In [None]:
# 80/20 train/test split of X and y with a fixed seed so it can be reproduced.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Run the hyper-parameter search on the training split only.
best_model, cv_results = optimize_svm(
    X=X_train,
    y=y_train,
)

In [None]:
# Show the best-scoring parameter combinations returned by optimize_svm.
print("\nTop 5 model performance:", cv_results, sep="\n")

In [None]:
# Score the tuned model on the held-out test split.
y_pred = best_model.predict(X_test)
print("\nTest set performance metrics:")
print(classification_report(y_true=y_test, y_pred=y_pred))