In [1]:
#Import the necessary libraries
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
df = pd.read_csv("Datasets.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5472 entries, 0 to 5471
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID                  5472 non-null   object 
 1   Sender_IP           5472 non-null   object 
 2   Sender_Port         5472 non-null   int64  
 3   Target_IP           5472 non-null   object 
 4   Target_Port         5472 non-null   int64  
 5   Transport_Protocol  5472 non-null   int64  
 6   Duration            5472 non-null   float64
 7   AvgDuration         5472 non-null   float64
 8   PBS                 5472 non-null   int64  
 9   AvgPBS              5472 non-null   float64
 10  TBS                 5472 non-null   int64  
 11  PBR                 5472 non-null   int64  
 12  AvgPBR              5472 non-null   float64
 13  TBR                 5472 non-null   int64  
 14  Missed_Bytes        5472 non-null   int64  
 15  Packets_Sent        5472 non-null   int64  
 16  Packet

In [4]:
df.isnull().sum()

ID                    0
Sender_IP             0
Sender_Port           0
Target_IP             0
Target_Port           0
Transport_Protocol    0
Duration              0
AvgDuration           0
PBS                   0
AvgPBS                0
TBS                   0
PBR                   0
AvgPBR                0
TBR                   0
Missed_Bytes          0
Packets_Sent          0
Packets_Received      0
SRPR                  0
class                 0
dtype: int64

In [6]:
X = df.drop('class', axis=1) # Defining Independend Varible
y = df['class']              # Defining Target Varible

## Preprocess and modeling using pipeline

In [5]:
#Classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [7]:
# As "info()" funtion show dataset contains both numeric and categorical features. we need to enocode them separately.
numeric_columns = X.select_dtypes(include=['float64', 'int64']).columns   # Select numeric columns
categorical_columns = X.select_dtypes(include=['object']).columns         # Select categorical columns

In [8]:
# Create transformers for numeric and categorical columns
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Create a column transformer to apply transformers to respective columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns)
    ])

In [9]:
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', RandomForestClassifier(random_state=42))])

## Fit and train the model

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
pipeline.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  Index(['Sender_Port', 'Target_Port', 'Transport_Protocol', 'Duration',
       'AvgDuration', 'PBS', 'AvgPBS', 'TBS', 'PBR', 'AvgPBR', 'TBR',
       'Missed_Bytes', 'Packets_Sent', 'Packets_Received', 'SRPR'],
      dtype='object')),
                                                 ('cat',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  Index(['ID', 'Sender_IP', 'Target_IP'], dtype='object'))])),
                ('classifier', RandomForestClassifier(random_state=42))])

In [16]:
# Make predictions on the testing set with default estimators
y_pred_a = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_a)
print(f'Accuracy: \033[1m{accuracy*100}\033[0m')
print(classification_report(y_test,y_pred_a))

Accuracy: [1m91.05022831050228[0m
              precision    recall  f1-score   support

           0       0.88      0.88      0.88       407
           1       0.93      0.93      0.93       688

    accuracy                           0.91      1095
   macro avg       0.90      0.90      0.90      1095
weighted avg       0.91      0.91      0.91      1095



## Hyper parameter tuning using GridSearchCV

In [13]:
# Define hyperparameters to tune
param_grid = {
    'classifier__n_estimators': [50, 100, 150],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}

# Use Grid Search with Cross-Validation to find the best parameters
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X, y)
print("Best Parameters:", grid_search.best_params_)

Best Parameters: {'classifier__max_depth': None, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 100}


In [20]:
# Make predictions on the testing set with Best estimators by GridSearchCv
y_pred_b = grid_search.best_estimator_.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_b)
print(f'Accuracy: \033[1m{accuracy*100}\033[0m')
print(classification_report(y_test,y_pred_b))

Accuracy: [1m99.54337899543378[0m
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       407
           1       0.99      1.00      1.00       688

    accuracy                           1.00      1095
   macro avg       1.00      0.99      1.00      1095
weighted avg       1.00      1.00      1.00      1095

