# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ipaddress
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Dataset

In [None]:
# Load the dataset; the first row of the CSV supplies the column names.
df = pd.read_csv(
    'dataset-final.csv',
    header=0,
    encoding='utf-8',
)
df.head()

In [None]:
# Print a concise summary of the DataFrame.
df.info()

In [None]:
# Show descriptive statistics for the DataFrame.
df.describe()

In [None]:
# Count the missing (NaN) values in each column
df.isna().sum()

# Data Preprocessing

## Handling IP Address Values

In [None]:
# Convert an IPv4 address into its integer value
def ip_to_numeric(ip):
    """Return the integer representation of the IPv4 address *ip*.

    *ip* is parsed with ``ipaddress.IPv4Address``; any error that
    constructor raises for an invalid address propagates to the caller.
    """
    address = ipaddress.IPv4Address(ip)
    return int(address)

In [None]:
# Replace both address columns with their integer form
for ip_column in ('netflow.ipv4_src_addr', 'netflow.ipv4_dst_addr'):
    df[ip_column] = df[ip_column].apply(ip_to_numeric)
df.head()

## Handling Timestamp Values

In [None]:
# Flow duration in seconds: last_switched minus first_switched.
# Timestamps that fail to parse are coerced to NaT, so their duration is NaN.
first_seen = pd.to_datetime(df['netflow.first_switched'], errors='coerce')
last_seen = pd.to_datetime(df['netflow.last_switched'], errors='coerce')
df['duration'] = (last_seen - first_seen).dt.total_seconds()
df.head()

## Drop Unused Columns

In [None]:
# Remove the index column and the raw timestamp columns
df = df.drop(
    columns=[
        'Unnamed: 0',
        "@timestamp",
        "netflow.last_switched",
        "netflow.first_switched",
    ],
)
df.head()

# Correlation between Independent Variables

In [None]:
# Re-arrange the columns so the features come first and 'label' is last
feature_order = [
    'netflow.ipv4_src_addr',
    'netflow.in_bytes',
    'netflow.protocol',
    'netflow.tcp_flags',
    'netflow.in_pkts',
    'netflow.ipv4_dst_addr',
    'netflow.l4_src_port',
    'netflow.l4_dst_port',
    'duration',
    'label',
]
df = df[feature_order]

In [None]:
# Pairwise correlation of all columns, drawn as an annotated heatmap
corr_matrix = df.corr()
plt.figure(figsize=(30, 20))
heatmap_options = dict(annot=True, linewidths=0.5, fmt=".2f", cmap="YlGnBu")
sns.heatmap(corr_matrix, **heatmap_options);

# Data Partitioning

In [None]:
# Target vector (y) is the 'label' column; feature matrix (X) is everything else
y = df["label"]
X = df.drop("label", axis=1)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Modeling

In [None]:
# XGBoost classifier with n_estimators=10; other hyperparameters keep their defaults.
xgboost = xgb.XGBClassifier(n_estimators=10)

In [None]:
# Fit on the training split only. Fitting on the full X/y would expose the
# model to the rows held out in X_test and inflate every metric computed on them.
xgboost.fit(X_train, y_train)

In [None]:
# Predict labels for the test features.
y_pred = xgboost.predict(X_test)

# Evaluation

## Accuracy, Precision, Recall, and F1 Score

In [None]:
accuration = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
classification_report = classification_report(y_test, y_pred)

In [None]:
print(f'Accuracy: {accuration}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print(f'Confusion Matrix:')
print(cm)
print(classification_report)

## Confusion Matrix

In [None]:
# Annotate each confusion-matrix cell with its name, raw count and share of all samples
group_names = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
cell_values = cm.flatten()
group_counts = [f"{value:0.0f}" for value in cell_values]
group_percentages = [f"{value:.2%}" for value in cell_values / cm.sum()]

labels = np.asarray(
    [
        f"{name}\n{count}\n{share}"
        for name, count, share in zip(group_names, group_counts, group_percentages)
    ]
).reshape(2, 2)

sns.heatmap(cm, annot=labels, fmt='', cmap='Blues')