In [1]:
import pandas as pd
from data_preprocessing import read_data, handle_missing_values, drop_constant_columns, fill_missing_values_with_mode,replace_dash_with_mode
from outlier_detection import detect_outliers_zscore, remove_outliers
from data_analysis import plot_correlation_heatmap, plot_pairplot
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, IsolationForest
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score, classification_report
)
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.utils import resample

: 

In [2]:

# Step 1: Read Data
file_path = 'dataset/http.csv'
column_names = [
    'timestamp', 'session_id', 'src_ip', 'src_port', 'dst_ip', 'dst_port', 'protocol', 
    'method', 'host', 'url', 'params', 'http_version', 'user_agent', 'unknown1', 'response_size',
    'status_code', 'status_message', 'unknown2', 'unknown3', 'empty1', 'unknown4', 'unknown5',
    'unknown6', 'unknown7', 'unknown8', 'unknown9', 'unknown10', 'token', 'unknown11', 'content_type'
]

print("\nStep 1: Reading Data...")
data = read_data(file_path, column_names)



Step 1: Reading Data...


In [3]:
# Step 2: Preprocess Data
print("\nStep 2: Handling Missing Values and Dropping Constant Columns...")
data = handle_missing_values(data, threshold=0.5)
data = drop_constant_columns(data, threshold=0.9)
data = fill_missing_values_with_mode(data)


Step 2: Handling Missing Values and Dropping Constant Columns...


In [4]:
# Step 3: Detect and Remove Outliers
print("\nStep 3: Detecting and Removing Outliers...")
outliers = detect_outliers_zscore(data, threshold=3)
data_cleaned = remove_outliers(data, outliers)

# Save cleaned data
data_cleaned.to_csv('result/preprocessed_http_cleaned.csv', index=False)
print("Preprocessed data saved as 'result/preprocessed_http_cleaned.csv'.")


Step 3: Detecting and Removing Outliers...
Preprocessed data saved as 'result/preprocessed_http_cleaned.csv'.


In [5]:
# Step 4: Analyze Patterns and Correlations (currently disabled)
# print("\nStep 4: Analyzing Patterns and Correlations...")
# plot_correlation_heatmap(data_cleaned)
# plot_pairplot(data_cleaned)


In [6]:
# Step 5: Process Second File
print("\nStep 5: Processing 'dataset/httpnormal.csv'...")
file_path_normal = 'dataset/httpnormal.csv'
data_normal = read_data(file_path_normal, column_names)


# Align columns and preprocess
data_normal = data_normal[data_cleaned.columns]
data_normal = fill_missing_values_with_mode(data_normal)
data_normal = replace_dash_with_mode(data_normal)
data_normal.to_csv('result/preprocessed_httpnormal_cleaned.csv', index=False)
print("Preprocessed 'dataset/httpnormal.csv' saved as 'result/preprocessed_httpnormal_cleaned.csv'.")



Step 5: Processing 'dataset/httpnormal.csv'...
Preprocessed 'dataset/httpnormal.csv' saved as 'result/preprocessed_httpnormal_cleaned.csv'.


In [7]:
# Step 6: Balance and Combine Data
print("\nStep 6: Balancing and Combining Data...")
num_normal = len(data_normal)
data_cleaned_sampled = data_cleaned.sample(n=num_normal, random_state=42)

data_cleaned_sampled['attack'] = 1
data_normal['attack'] = 0

final_data = pd.concat([data_cleaned_sampled, data_normal], axis=0)
final_data.to_csv('result/final_http_data_balanced.csv', index=False)
print("Balanced final data saved as 'result/final_http_data_balanced.csv'.")



Step 6: Balancing and Combining Data...
Balanced final data saved as 'result/final_http_data_balanced.csv'.


In [8]:
# Step 1: Read Balanced Data
print("Step 1: Reading Balanced Data...")
file_path = 'result/final_http_data_balanced.csv'
data = pd.read_csv(file_path)

Step 1: Reading Balanced Data...


In [9]:
# Step 2: Preprocess Data
print("\nStep 2: Preprocessing Data...")
# Drop unnecessary columns (timestamp, session_id, unknowns)
columns_to_drop = ['timestamp', 'session_id', 'status_message'] + [
    col for col in data.columns if 'unknown' in col
]
data = data.drop(columns=columns_to_drop, axis=1)

print("Remaining Columns:", data.columns)

# Encode Categorical Variables
print("\nEncoding categorical variables...")
categorical_cols = ['src_ip', 'dst_ip', 'protocol', 'method', 'host', 'url', 'user_agent', 'content_type']
label_encoder = LabelEncoder()

for col in categorical_cols:
    if col in data.columns:
        data[col] = label_encoder.fit_transform(data[col])

# Correlation Analysis
print("\nStep 2.1: Correlation Analysis...")
correlation_matrix = data.corr()
print("Top Correlated Features:")
print(correlation_matrix['attack'].sort_values(ascending=False))

# Scale Numerical Data
print("\nScaling numerical data...")
scaler = StandardScaler()
numerical_cols = ['src_port', 'dst_ip', 'response_size', 'status_code']

# Avoid Data Leakage: fit scaler only on the training set
X = data.drop(columns=['attack'])
y = data['attack']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])


Step 2: Preprocessing Data...
Remaining Columns: Index(['src_ip', 'src_port', 'dst_ip', 'host', 'url', 'user_agent',
       'response_size', 'status_code', 'attack'],
      dtype='object')

Encoding categorical variables...

Step 2.1: Correlation Analysis...
Top Correlated Features:
attack           1.00
status_code      0.98
user_agent       0.79
src_ip           0.15
dst_ip           0.14
response_size   -0.05
host            -0.12
url             -0.20
src_port        -0.24
Name: attack, dtype: float64

Scaling numerical data...


In [10]:
# Step 3: Lazy Predict to Compare Models
print("\nStep 3: Comparing Models using Lazy Predict...")
clf = LazyClassifier()
models, predictions = clf.fit(X_train, X_test, y_train, y_test)
print(models)


Step 3: Comparing Models using Lazy Predict...


 97%|█████████▋| 31/32 [00:01<00:00, 26.22it/s]

[LightGBM] [Info] Number of positive: 845, number of negative: 855
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000193 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 850
[LightGBM] [Info] Number of data points in the train set: 1700, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.497059 -> initscore=-0.011765
[LightGBM] [Info] Start training from score -0.011765


100%|██████████| 32/32 [00:01<00:00, 19.45it/s]

                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
AdaBoostClassifier                 1.00               1.00     1.00      1.00   
RandomForestClassifier             1.00               1.00     1.00      1.00   
BaggingClassifier                  1.00               1.00     1.00      1.00   
DecisionTreeClassifier             1.00               1.00     1.00      1.00   
LabelSpreading                     1.00               1.00     1.00      1.00   
ExtraTreeClassifier                1.00               1.00     1.00      1.00   
ExtraTreesClassifier               1.00               1.00     1.00      1.00   
LabelPropagation                   1.00               1.00     1.00      1.00   
KNeighborsClassifier               1.00               1.00     1.00      1.00   
LGBMClassifier                     1.00               1.00     1.00      1.00   
SGDClassifier               




In [11]:
# Step 4: Fine-Tune Random Forest with Cost Sensitivity
print("\nStep 4: Fine-Tuning Random Forest Model with Cost Sensitivity...")
rf = RandomForestClassifier(class_weight='balanced', random_state=42)
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20],
    'min_samples_split': [2, 5],
}

grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='f1')
grid_search.fit(X_train, y_train)

best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)

print("\nRandom Forest Model Results:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Cross-Validation for Random Forest
print("\nCross-Validation for Random Forest...")
rf_cv_scores = cross_val_score(best_rf, X, y, cv=5, scoring='f1')
print("Cross-Validation F1 Scores:", rf_cv_scores)
print("Mean F1 Score:", rf_cv_scores.mean())


Step 4: Fine-Tuning Random Forest Model with Cost Sensitivity...

Random Forest Model Results:
Accuracy: 1.0
F1 Score: 1.0
Precision: 1.0
Recall: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       208
           1       1.00      1.00      1.00       218

    accuracy                           1.00       426
   macro avg       1.00      1.00      1.00       426
weighted avg       1.00      1.00      1.00       426


Cross-Validation for Random Forest...
Cross-Validation F1 Scores: [0.98383372 1.         1.         1.         0.99297424]
Mean F1 Score: 0.9953615914241365


In [12]:
# Step 5: Ensemble Method (Voting Classifier)
print("\nStep 5: Implementing Ensemble Method with Cost Sensitivity...")
lr = LogisticRegression(class_weight='balanced', max_iter=1000)
svm = SVC(probability=True, class_weight='balanced')

ensemble = VotingClassifier(estimators=[
    ('rf', best_rf),
    ('lr', lr),
    ('svm', svm)
], voting='soft')

ensemble.fit(X_train, y_train)
y_pred_ensemble = ensemble.predict(X_test)

print("\nEnsemble Model Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_ensemble))
print("F1 Score:", f1_score(y_test, y_pred_ensemble))
print("Precision:", precision_score(y_test, y_pred_ensemble))
print("Recall:", recall_score(y_test, y_pred_ensemble))
print("\nClassification Report:\n", classification_report(y_test, y_pred_ensemble))

# Cross-Validation for Ensemble
print("\nCross-Validation for Ensemble Model...")
ensemble_cv_scores = cross_val_score(ensemble, X, y, cv=5, scoring='f1')
print("Cross-Validation F1 Scores:", ensemble_cv_scores)
print("Mean F1 Score:", ensemble_cv_scores.mean())


Step 5: Implementing Ensemble Method with Cost Sensitivity...

Ensemble Model Results:
Accuracy: 0.9976525821596244
F1 Score: 0.9977011494252873
Precision: 1.0
Recall: 0.9954128440366973

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       208
           1       1.00      1.00      1.00       218

    accuracy                           1.00       426
   macro avg       1.00      1.00      1.00       426
weighted avg       1.00      1.00      1.00       426


Cross-Validation for Ensemble Model...
Cross-Validation F1 Scores: [0.96818182 0.99528302 0.99764706 0.99763593 0.99297424]
Mean F1 Score: 0.9903444137110593


In [13]:
# Step 6: Outlier Analysis and Anomaly Detection
print("\nStep 6: Outlier Analysis using Isolation Forest...")
iso_forest = IsolationForest(contamination=0.01, random_state=42)
outliers = iso_forest.fit_predict(X)
data['anomaly'] = outliers

print("Outliers Detected:")
print(data['anomaly'].value_counts())



Step 6: Outlier Analysis using Isolation Forest...
Outliers Detected:
anomaly
 1    2104
-1      22
Name: count, dtype: int64


In [14]:
# Step 7: Save Final Preprocessed Data (Optional)
print("\nStep 7: Saving Processed Data...")
data.to_csv('result/processed_http_data.csv', index=False)
print("Processed data saved to 'result/processed_http_data.csv'")



Step 7: Saving Processed Data...
Processed data saved to 'result/processed_http_data.csv'
