### Import necessary Libraries

In [2]:
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import *
from collections import Counter
from sklearn.model_selection import LearningCurveDisplay, learning_curve

### Load and save model

In [3]:
import pickle
# Save model
def save_model(file_name, model):
    with open('/kaggle/working/'+file_name,'wb') as f:
        pickle.dump(model,f)
    f.close()

# Load model
def load_model(file_name):
    with open('/kaggle/working/'+file_name,'rb') as f:
        model = pickle.load(f)
    f.close()
    return model

### Load data

In [4]:
df = pd.read_csv('/kaggle/input/balance-data/balanced.csv')

### Split train and test set

In [5]:
X = df.iloc[:,:-1].to_numpy()
Y = df.iloc[:,-1].to_numpy()

In [6]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.2, random_state=random.randint(1,999))
print(X_train.shape,X_test.shape)

(8224, 37) (2056, 37)


#### Plot learning curve function

In [44]:
def plot_learning_curve(model):
    train_sizes, train_scores, test_scores = learning_curve(estimator=model, X=X_train, y=Y_train,
                                                        cv=10, train_sizes=np.linspace(0.1, 1.0, 10),
                                                        n_jobs=-1)
    #
    # Calculate training and test mean and std
    #
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    #
    # Plot the learning curve
    #
    plt.plot(train_sizes, train_mean, color='blue', marker='o', markersize=5, label='Training Accuracy')
    plt.fill_between(train_sizes, train_mean + train_std, train_mean - train_std, alpha=0.15, color='blue')
    plt.plot(train_sizes, test_mean, color='green', marker='o', markersize=5, label='Validation Accuracy')
    plt.fill_between(train_sizes, test_mean + test_std, test_mean - test_std, alpha=0.15, color='green')
    plt.title('Learning Curve')
    plt.xlabel('Training Data Size')
    plt.ylabel('Model accuracy')
    plt.grid()
    plt.legend(loc='lower right')
    plt.show()

In [10]:
from sklearn.model_selection import GridSearchCV

### RF

In [8]:
RF_clf = RandomForestClassifier()

In [13]:
param_grid = {
    'n_estimators': [100, 200, 300, 400],  
    'max_leaf_nodes': [1000, 5000, 10000, 20000]  
}

In [14]:
grid_search = GridSearchCV(RF_clf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, Y_train)

In [15]:
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(best_params)
print(best_score)

{'max_leaf_nodes': 1000, 'n_estimators': 400}
0.8771894482284294


In [None]:
RF_clf = RandomForestClassifier(max_leaf_nodes= 1000, n_estimators= 400)
RF_clf.fit(X_train, Y_train)
save_model('RF.pkl', RF_clf)

In [48]:
RF_pred = RF_clf.predict(X_test)
print(classification_report(Y_test, RF_pred))

              precision    recall  f1-score   support

      Benign       0.84      0.84      0.84       261
  Bruteforce       0.74      0.80      0.77       262
        DDos       1.00      0.99      1.00       263
         Dos       1.00      0.99      0.99       220
       Mirai       1.00      0.99      0.99       274
       Recon       0.83      0.70      0.76       243
    Spoofing       0.88      0.79      0.84       272
   Web-based       0.70      0.84      0.77       261

    accuracy                           0.87      2056
   macro avg       0.87      0.87      0.87      2056
weighted avg       0.87      0.87      0.87      2056



### SVM

In [7]:
SVM_clf = make_pipeline(
    StandardScaler(),
    PCA(n_components=20),
    SVC()
)

In [33]:
param_grid2 = {
    'svc__C': [1, 10, 18, 20], 
    'svc__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'svc__gamma': ['scale', 'auto']
}

In [34]:
grid_search2 = GridSearchCV(SVM_clf, param_grid2, cv=5, scoring='accuracy', n_jobs=-1)
grid_search2.fit(X_train, Y_train)

In [35]:
best_params2 = grid_search2.best_params_
best_score2 = grid_search2.best_score_

print(best_params2)
print(best_score2)

{'svc__C': 18, 'svc__gamma': 'auto', 'svc__kernel': 'rbf'}
0.6856773826163483


In [None]:
SVM_clf = make_pipeline(StandardScaler(),PCA(n_components=20), SVC(C=18,gamma='auto'))
SVM_clf.fit(X_train, Y_train)               
save_model('SVM.pkl', SVM_clf)

In [50]:
SVM_pred = SVM_clf.predict(X_test)
print(classification_report(Y_test, SVM_pred))

              precision    recall  f1-score   support

      Benign       0.63      0.65      0.64       261
  Bruteforce       0.52      0.53      0.52       262
        DDos       0.83      0.60      0.70       263
         Dos       0.65      0.85      0.73       220
       Mirai       1.00      0.99      1.00       274
       Recon       0.80      0.53      0.64       243
    Spoofing       0.77      0.53      0.63       272
   Web-based       0.47      0.75      0.58       261

    accuracy                           0.68      2056
   macro avg       0.71      0.68      0.68      2056
weighted avg       0.71      0.68      0.68      2056

