<a href="https://colab.research.google.com/github/mallikourtis/george4/blob/main/100620241432askisi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# In this step we load the dataset, handle missing values (if any), and encode categorical features.
# We also split the data into training and testing sets.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Of the three classification models examined (Random Forest, Neural Networks, Support Vector Machines),
# we choose Random Forest, an ensemble method based on decision trees.
# The selection is based on the articles in our bibliography and on our existing level of knowledge.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report



In [None]:
# Read the DarkNet CSV straight from GitHub into a DataFrame.
# low_memory=False makes pandas parse the file in one pass instead of in chunks.
url = 'https://github.com/kdemertzis/EKPA/raw/main/Data/DarkNet.csv'
df = pd.read_csv(url, low_memory=False)

# Quick sanity check: print the (rows, columns) shape, then the column labels.
for summary in (df.shape, df.columns):
    print(summary)

(68580, 83)
Index(['Src_IP', 'Src_Port', 'Dst_IP', 'Dst_Port', 'Protocol', 'Flow_Duration',
       'Total_Fwd_Packet', 'Total_Bwd_packets', 'Total_Length_of_Fwd_Packet',
       'Total_Length_of_Bwd_Packet', 'Fwd_Packet_Length_Max',
       'Fwd_Packet_Length_Min', 'Fwd_Packet_Length_Mean',
       'Fwd_Packet_Length_Std', 'Bwd_Packet_Length_Max',
       'Bwd_Packet_Length_Min', 'Bwd_Packet_Length_Mean',
       'Bwd_Packet_Length_Std', 'Flow_Bytes/s', 'Flow_Packets/s',
       'Flow_IAT_Mean', 'Flow_IAT_Std', 'Flow_IAT_Max', 'Flow_IAT_Min',
       'Fwd_IAT_Total', 'Fwd_IAT_Mean', 'Fwd_IAT_Std', 'Fwd_IAT_Max',
       'Fwd_IAT_Min', 'Bwd_IAT_Total', 'Bwd_IAT_Mean', 'Bwd_IAT_Std',
       'Bwd_IAT_Max', 'Bwd_IAT_Min', 'Fwd_PSH_Flags', 'Bwd_PSH_Flags',
       'Fwd_URG_Flags', 'Bwd_URG_Flags', 'Fwd_Header_Length',
       'Bwd_Header_Length', 'Fwd_Packets/s', 'Bwd_Packets/s',
       'Packet_Length_Min', 'Packet_Length_Max', 'Packet_Length_Mean',
       'Packet_Length_Std', 'Packet_Length_Variance

In [None]:

# Categorical encoding of the 'Protocol' column, done in two passes:
#   1. LabelEncoder replaces every distinct protocol value with an integer class index.
label_encoder = LabelEncoder()
df['Protocol'] = label_encoder.fit_transform(df['Protocol'])
#   2. get_dummies expands that coded column into one indicator column per class index;
#      the original 'Protocol' column is removed from the resulting frame.
df = pd.get_dummies(data=df, columns=['Protocol'])


In [None]:
# Separate the prediction target from the predictors:
# y is the 'Label-2' column, X is every other column of df.
y = df['Label-2']
X = df.drop('Label-2', axis=1)



In [None]:
# Columns that must not be passed to the imputer/scaler.
# 'Label-2' is the target; the feature/target split has already removed it from X,
# so it may legitimately be absent here.
non_numeric_columns = ['Src_IP', 'Dst_IP', 'Label-2']

# Drop the excluded columns and turn the '?' placeholder into NaN.
# errors='ignore' skips labels that X no longer contains; without it the drop raises
# KeyError because 'Label-2' is not a column of X.
# NOTE(review): any other text-valued column still present in X would make the scaler
# fail later — confirm against df.columns.
X_numeric = X.drop(columns=non_numeric_columns, errors='ignore').replace('?', np.nan)

# Split BEFORE imputing so that the imputation statistics are learned from the training
# rows only (fitting on the full data would leak test-set information).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_numeric, y, test_size=0.2, random_state=42
)

# Fill missing values with each column's most frequent value.
imputer_numeric = SimpleImputer(strategy='most_frequent')

# Fit on the training split, then apply the same fitted imputer to the test split.
# The original index is kept so the rows stay aligned with y_train / y_test.
X_train = pd.DataFrame(
    imputer_numeric.fit_transform(X_train_raw),
    columns=X_train_raw.columns,
    index=X_train_raw.index,
)
X_test = pd.DataFrame(
    imputer_numeric.transform(X_test_raw),
    columns=X_test_raw.columns,
    index=X_test_raw.index,
)

# Scaler for the numeric features; it is fitted in the next cell on X_train only.
scaler_numeric = StandardScaler()

In [None]:
# Standardize both splits with a single scaler.
# The tuple is evaluated left to right: fit_transform learns the statistics from the
# training split and scales it, then transform applies those same statistics to the
# test split.
X_train_scaled_numeric, X_test_scaled_numeric = (
    scaler_numeric.fit_transform(X_train),
    scaler_numeric.transform(X_test),
)

print("Data preprocessing completed successfully.")

In [None]:
# Random Forest classifier: an ensemble of 100 decision trees, with a fixed seed so that
# repeated runs build the same forest.
model = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)

In [None]:
# Train the model on the standardized training features.
# The scaling cell stores its results as X_train_scaled_numeric / X_test_scaled_numeric;
# the previously used names X_train_scaled / X_test_scaled were never defined and
# raised NameError.
model.fit(X_train_scaled_numeric, y_train)

# Predict labels for the held-out test split
y_pred = model.predict(X_test_scaled_numeric)

# Overall accuracy: share of test rows whose predicted label matches the true label
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Per-class precision, recall, F1-score and support
print(classification_report(y_test, y_pred))


NameError: name 'X_train_scaled' is not defined