In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, f1_score, precision_score, confusion_matrix

In [2]:
# Load dataset
# The file is read with header=None, so the 41 feature names and the trailing
# "label" column are supplied explicitly, in column order.
col_names = [
    "duration", "protocol_type", "service", "flag",
    "src_bytes", "dst_bytes", "land", "wrong_fragment",
    "urgent", "hot", "num_failed_logins", "logged_in",
    "num_compromised", "root_shell", "su_attempted", "num_root",
    "num_file_creations", "num_shells", "num_access_files", "num_outbound_cmds",
    "is_host_login", "is_guest_login", "count", "srv_count",
    "serror_rate", "srv_serror_rate", "rerror_rate", "srv_rerror_rate",
    "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate", "dst_host_count",
    "dst_host_srv_count", "dst_host_same_srv_rate", "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate", "dst_host_srv_serror_rate",
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate",
    "label",
]

df = pd.read_csv(
    "../../../data/kddcup99/kddcup.data_10_percent",
    names=col_names,
    header=None,
)

In [3]:
# Encode categorical variables
# Each string-valued feature gets its own LabelEncoder; the fitted encoders are
# kept in `label_encoders` so the integer codes can be mapped back later.
# NOTE: re-running this cell refits the encoders on the already-encoded integer
# columns. The codes in `df` stay the same, but the stored `classes_` become
# integers instead of the original category strings.
categorical_features = ["protocol_type", "service", "flag"]
label_encoders = {}
for feature in categorical_features:
    le = LabelEncoder()
    df[feature] = le.fit_transform(df[feature])
    label_encoders[feature] = le

# Encode target variable (attack label): Binary classification (normal vs attack)
# 0 = 'normal.', 1 = anything else.
# Guard on the dtype so the cell is idempotent: once the column is numeric,
# comparing it against 'normal.' again would turn every row (including the 0s)
# into 1 and silently relabel the whole dataset as attacks.
if not pd.api.types.is_numeric_dtype(df["label"]):
    df["label"] = (df["label"] != "normal.").astype("int64")

In [4]:
# Separate the target column from the predictor columns
y = df["label"]
x = df.drop(columns=["label"])

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=42,
    test_size=0.2,
)

In [5]:
# Standardize the features
# The scaler is fitted on the training rows only and then applied to both
# partitions, so no statistics from the test rows enter the fit.
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
# Fit a random forest classifier on the training partition
model = RandomForestClassifier(
    n_estimators=100,      # number of trees in the forest
    max_features="sqrt",   # features considered at each split
    min_samples_leaf=1,    # minimum number of samples per leaf
    n_jobs=-1,             # use all available CPU cores
    random_state=42,       # fixed seed for repeatable results
)
model.fit(x_train, y_train)

In [None]:
# Predict labels for the held-out test rows
y_pred = model.predict(x_test)

# Score the predictions: every scorer is called as scorer(y_test, y_pred) and
# its result is stored under the paired key, in the order listed here.
metrics = {
    metric_name: scorer(y_test, y_pred)
    for metric_name, scorer in (
        ("accuracy", accuracy_score),
        ("f1_score", f1_score),
        ("precision", precision_score),
        ("confusion_matrix", confusion_matrix),
    )
}
print(metrics)