In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import roc_curve
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [None]:
from google.colab import drive

# Folder on the mounted Google Drive that holds the three CSV shards.
DATA_DIR = '/content/drive/MyDrive/dosdata'
# Single source of truth for the label encoding: it drives both the
# conversion below and the legend printed at the end of the cell, so the
# two cannot drift apart.
LABEL_ENCODING = {"Benign": 0, "Anomaly": 1}

drive.mount('/content/drive')
data1 = pd.read_csv(f'{DATA_DIR}/data1.csv')
data2 = pd.read_csv(f'{DATA_DIR}/data2.csv')
data3 = pd.read_csv(f'{DATA_DIR}/data3.csv')

# Stack the shards into one frame with a fresh 0..n-1 index.
train_data = pd.concat([data1, data2, data3], ignore_index=True)

# Drop the 'Unnamed: 0' column (presumably the row index written out when
# the CSVs were saved -- TODO confirm); it must not be used as a feature.
train_data = train_data.drop(columns='Unnamed: 0')

# Encode the labels with an explicit mapping instead of Series.replace():
# replace() relies on implicit object -> int downcasting, which pandas 2.2
# deprecates (the warning is hidden by warnings.filterwarnings('ignore')).
# map() turns every value outside the mapping into NaN, so unexpected or
# missing labels are detected here rather than deep inside model fitting.
encoded_labels = train_data["Label"].map(LABEL_ENCODING)
unmapped = encoded_labels.isna()
if unmapped.any():
    unexpected = train_data.loc[unmapped, "Label"].unique().tolist()
    raise ValueError(f"Unexpected values in 'Label' column: {unexpected}")
train_data["Label"] = encoded_labels.astype("int64")

# Print the legend so the integer codes can be read back later.
for label, index in LABEL_ENCODING.items():
    print(f"{label}: {index}")

Mounted at /content/drive
Benign: 0
Anomaly: 1


In [None]:
from collections import Counter

from imblearn.under_sampling import RandomUnderSampler

# Class distribution before balancing.
print(train_data["Label"].value_counts())

# Split the frame into the feature matrix and the target vector.
X = train_data.drop("Label", axis=1)
y = train_data["Label"]

# Randomly discard rows of the majority class until both classes have the
# same number of samples. The seed is fixed (matching the random_state=42
# used by the split and the cross-validation) so that the retained rows --
# and therefore every downstream result -- are the same on each run.
undersample = RandomUnderSampler(sampling_strategy='majority', random_state=42)
X, y = undersample.fit_resample(X, y)

# Class distribution after balancing.
print(Counter(y))

Label
0    1567950
1    1000448
Name: count, dtype: int64
Counter({0: 1000448, 1: 1000448})


In [None]:
def optimize_svc(X, y, cv=3):
    """Grid-search a linear-kernel SVC with stratified k-fold cross-validation.

    Parameters
    ----------
    X : feature matrix passed unchanged to ``GridSearchCV.fit``.
    y : target labels passed unchanged to ``GridSearchCV.fit``.
    cv : int, default 3
        Number of stratified folds (shuffled, ``random_state=42``).

    Returns
    -------
    tuple
        ``(best_estimator, cv_results)`` where ``best_estimator`` is
        ``GridSearchCV.best_estimator_`` and ``cv_results`` is a DataFrame
        with the columns ``params``, ``mean_test_score`` and
        ``std_test_score`` for at most the five best candidates, sorted by
        descending mean accuracy.

    Side effects
    ------------
    Prints the best parameter set and its cross-validation accuracy.
    """
    # SVC is not imported by the notebook's import cell; importing it here
    # keeps the function usable on a fresh kernel (Restart & Run All).
    from sklearn.svm import SVC

    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)

    # Only C is actually tuned. 'gamma' does not influence the linear kernel
    # but is kept so that the reported 'params' entries stay unchanged.
    param_grid = {
        'C': [0.1, 1],
        'gamma': ['scale'],
        'kernel': ['linear']
    }

    # NOTE(review): the scikit-learn docs warn that SVC fit time grows at
    # least quadratically with the number of samples -- confirm the runtime
    # is acceptable for the size of X passed in.
    svc = SVC(random_state=42)

    grid_search = GridSearchCV(
        estimator=svc,
        param_grid=param_grid,
        cv=skf,
        scoring='accuracy',
        n_jobs=-1,  # use every available core
        verbose=1
    )

    grid_search.fit(X, y)

    print("Best Parameters:", grid_search.best_params_)
    print("Best Cross-Validation Score:", grid_search.best_score_)

    # Condense the full results table to the score summary of the top rows.
    cv_results = pd.DataFrame(grid_search.cv_results_)
    cv_results = cv_results[['params', 'mean_test_score', 'std_test_score']]
    cv_results = cv_results.sort_values('mean_test_score', ascending=False).head()

    return grid_search.best_estimator_, cv_results


In [None]:
# Hold out 20% of the samples as a test set; the seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Run the hyper-parameter search on the training split only.
best_model, cv_results = optimize_svc(X=X_train, y=y_train)

Fitting 3 folds for each of 2 candidates, totalling 6 fits


In [None]:
# Show the cross-validation summary returned by optimize_svc under a heading.
print("\nTop 5 model performance:", cv_results, sep="\n")

In [None]:
# Score the tuned model on the held-out test split and print the per-class
# precision / recall / F1 report under a heading.
y_pred = best_model.predict(X_test)
print(f"\nTest set performance metrics:\n{classification_report(y_test, y_pred)}")