<a href="https://colab.research.google.com/github/nsyaghis/MUDS/blob/main/ENSEMBEL_LEARNING_1_XBRF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install pandas numpy scikit-learn xgboost seaborn joblib



In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from xgboost import XGBRFClassifier
import joblib
import logging
from datetime import datetime
import os

# Configure the root logger once, at import time. Every record at INFO level
# or above is written both to a per-run log file (named with the start
# timestamp, relative to the current working directory) and to a stream
# handler so it also shows up in the notebook/console output.
_run_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(f'ensemble_training_{_run_stamp}.log'),
        logging.StreamHandler(),
    ],
)

class URLEnsembleClassifier:
    """Train and evaluate a soft-voting ensemble of XGBoost random-forest models.

    Typical use is ``load_and_preprocess_data`` followed by
    ``train_and_evaluate``. After preprocessing, the fitted preprocessing
    objects are kept on the instance so that the same transformations can be
    re-applied to new data:

    * ``label_encoder``     - encoder fitted on the ``domain`` target column
    * ``feature_encoders``  - ``{column: LabelEncoder}`` for encoded feature columns
    * ``scaler``            - ``StandardScaler`` fitted on the training features
    * ``feature_names``     - feature column names, in training order
    """

    # Values of the 'domain' column (compared case-insensitively) that are
    # mapped to the 'benign' binary label; everything else is 'malicious'.
    _BENIGN_DOMAINS = frozenset({'com', 'net', 'org', 'edu', 'gov'})

    def __init__(self):
        self.models = {}
        self.best_model = None
        self.feature_names = None
        self.label_encoder = LabelEncoder()
        self.scaler = None
        self.feature_encoders = {}

    @staticmethod
    def _to_binary_label(domain):
        """Map one 'domain' value to 'benign' or 'malicious'."""
        is_benign = str(domain).lower() in URLEnsembleClassifier._BENIGN_DOMAINS
        return 'benign' if is_benign else 'malicious'

    @staticmethod
    def _undersample(df, sample_size):
        """Cap every 'domain' class at ``sample_size`` rows.

        Classes larger than ``sample_size`` are randomly down-sampled with a
        fixed seed (42); smaller classes are kept whole.

        Raises:
            ValueError: if ``df`` contains no classes at all.
        """
        capped = [
            group.sample(n=sample_size, random_state=42) if len(group) > sample_size else group
            for _, group in df.groupby('domain')
        ]
        if not capped:
            raise ValueError("No samples left to train on after filtering rare classes")
        return pd.concat(capped, axis=0)

    @staticmethod
    def _print_detailed_report(y_true, y_pred, dataset_type="Training"):
        """Print accuracy, macro/weighted F1 and the per-class report.

        All figures are computed from the predictions; precision, recall and
        F1 come from ``classification_report`` rather than being copies of the
        accuracy value.
        """
        # zero_division=0 avoids warnings for classes that are never predicted.
        summary = classification_report(y_true, y_pred, output_dict=True, zero_division=0)

        print(f"\nClassification Report - {dataset_type} Data:")
        print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
        print(f"Macro Avg F1-Score: {summary['macro avg']['f1-score']:.4f}")
        print(f"Weighted Avg F1-Score: {summary['weighted avg']['f1-score']:.4f}")

        print("\nDetailed Report:")
        print(classification_report(y_true, y_pred, zero_division=0))

    def load_and_preprocess_data(self, file_path, sample_size=40000, min_samples=2):
        """Load the feature CSV and produce scaled train/test splits.

        Steps: read the CSV, label-encode object-typed feature columns, drop
        classes with fewer than ``min_samples`` rows, cap each class at
        ``sample_size`` rows, encode the target, split 80/20 stratified on the
        encoded target, and standardise the features (scaler fitted on the
        training portion only).

        Args:
            file_path: Path of the CSV file to read.
            sample_size: Maximum number of rows kept per 'domain' class.
            min_samples: Minimum number of rows a class needs to be kept.

        Returns:
            ``(X_train, X_test, y_train, y_test, y_binary_train, y_binary_test)``.
            The binary labels come from the same split as the features, so
            they are row-aligned with ``X_train`` / ``X_test``.

        Raises:
            Exception: any error is logged and re-raised unchanged.
        """
        try:
            # Define columns
            columns = [
                'Querylength', 'domain_tokens', 'path_tokens', 'avgdomain_length',
                'domain_token_length', 'avgpath_token_length', 'tld', 'charcount',
                'charcompv', 'charcompalds', 'url_length', 'ldl_domain', 'ldl_path',
                'ldl_filename', 'ldl_getArg', 'dld_url', 'dld_domain', 'dld_path',
                'dld_filename', 'dld_getArg', 'urlLen', 'domain'
            ]

            # Load data
            # NOTE(review): passing names= means the first line of the file is
            # read as data; if the CSV has a header row it becomes a sample -
            # confirm against the actual file.
            logging.info("Loading dataset...")
            df = pd.read_csv(file_path, names=columns, low_memory=False)

            # Handle categorical columns; keep the fitted encoders so the same
            # mapping can be applied to new data later.
            self.feature_encoders = {}
            categorical_columns = df.select_dtypes(include=['object']).columns
            for col in categorical_columns:
                if col != 'domain':
                    encoder = LabelEncoder()
                    df[col] = encoder.fit_transform(df[col].astype(str))
                    self.feature_encoders[col] = encoder

            # Check class distribution before sampling
            class_counts = df['domain'].value_counts()
            logging.info("\nOriginal class distribution:")
            logging.info(class_counts)

            # Filter out classes with too few samples
            valid_classes = class_counts[class_counts >= min_samples].index
            df = df[df['domain'].isin(valid_classes)]

            logging.info(f"\nRemoved classes with less than {min_samples} samples")
            logging.info("Remaining class distribution:")
            logging.info(df['domain'].value_counts())

            # Perform undersampling
            logging.info("\nPerforming undersampling...")
            df_sampled = self._undersample(df, sample_size)

            # Print final class distribution
            logging.info("\nFinal class distribution after sampling:")
            logging.info(df_sampled['domain'].value_counts())

            # Separate features and target
            X = df_sampled.drop('domain', axis=1)
            y = df_sampled['domain']
            self.feature_names = list(X.columns)

            # Create binary labels
            y_binary = y.apply(self._to_binary_label)

            # Encode target
            y_encoded = self.label_encoder.fit_transform(y)

            # Split features, multi-class target and binary target in ONE call
            # so all three share the same row partition. A second, separately
            # stratified split would assign different rows to train/test and
            # leave the binary labels misaligned with X_train / X_test.
            (X_train, X_test,
             y_train, y_test,
             y_binary_train, y_binary_test) = train_test_split(
                X, y_encoded, y_binary,
                test_size=0.2, stratify=y_encoded, random_state=42
            )

            # Scale features (fit on the training portion only)
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)
            self.scaler = scaler

            logging.info("\nData preprocessing completed:")
            logging.info(f"Training set size: {X_train.shape[0]}")
            logging.info(f"Test set size: {X_test.shape[0]}")
            logging.info(f"Number of classes: {len(np.unique(y_encoded))}")

            return X_train, X_test, y_train, y_test, y_binary_train, y_binary_test

        except Exception as e:
            logging.error(f"Error in data preprocessing: {str(e)}")
            logging.error("Detailed error info:", exc_info=True)
            raise

    def create_models(self):
        """
        Create XGBoost RF models with different parameters
        """
        # NOTE(review): every parameter that differs between these three
        # models is also a key of the grid searched in train_and_evaluate, and
        # all three share random_state=42, so after tuning they will typically
        # end up with the same configuration. Consider distinct seeds or
        # distinct grids if a diverse ensemble is intended.
        self.models = {
            # Model 1: Conservative
            'xgbrf1': XGBRFClassifier(
                n_estimators=100,
                max_depth=3,
                learning_rate=0.01,
                subsample=0.8,
                colsample_bytree=0.8,
                random_state=42
            ),
            # Model 2: Moderate
            'xgbrf2': XGBRFClassifier(
                n_estimators=200,
                max_depth=6,
                learning_rate=0.1,
                subsample=0.8,
                colsample_bytree=0.8,
                random_state=42
            ),
            # Model 3: Aggressive
            'xgbrf3': XGBRFClassifier(
                n_estimators=300,
                max_depth=4,
                learning_rate=0.05,
                subsample=0.9,
                colsample_bytree=0.9,
                random_state=42
            )
        }

    def create_ensemble(self):
        """
        Create voting ensemble from XGBoost RF models
        """
        return VotingClassifier(
            estimators=list(self.models.items()),
            voting='soft'
        )

    def train_and_evaluate(self, X_train, X_test, y_train, y_test):
        """Tune each base model, fit the voting ensemble and report metrics.

        Each model from ``create_models`` is tuned with a 5-fold grid search
        (accuracy scoring) and replaced in ``self.models`` by its best
        estimator. A soft-voting ensemble of those estimators is then fitted,
        evaluated on both splits, its test confusion matrix is saved as a PNG
        and the ensemble is dumped with joblib.

        Args:
            X_train, X_test: Feature matrices for the train / test split.
            y_train, y_test: Encoded class labels for the train / test split.

        Returns:
            The fitted ``VotingClassifier`` (also stored in ``self.best_model``).

        Raises:
            Exception: any error is logged and re-raised unchanged.
        """
        try:
            # Create models
            self.create_models()

            # The search space is the same for every base model, so build it once.
            param_grid = {
                'n_estimators': [100, 200, 300],
                'max_depth': [3, 4, 6],
                'learning_rate': [0.01, 0.05, 0.1],
                'subsample': [0.8, 0.9],
                'colsample_bytree': [0.8, 0.9]
            }

            # Train and evaluate each XGBoost RF model. Iterate over a snapshot
            # because self.models is updated inside the loop.
            for name, model in list(self.models.items()):
                print(f"\nTraining {name}...")

                # Grid Search
                grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
                grid_search.fit(X_train, y_train)

                print(f"\n{name} Hyperparameter Tuning Results:")
                print(f"Best Parameters: {grid_search.best_params_}")

                best_model = grid_search.best_estimator_
                self.models[name] = best_model

                # Get predictions
                train_pred = best_model.predict(X_train)
                test_pred = best_model.predict(X_test)

                # Print detailed reports
                self._print_detailed_report(y_train, train_pred, "Training")
                self._print_detailed_report(y_test, test_pred, "Test")

            # Create and train ensemble
            print("\nTraining Ensemble Model...")
            ensemble = self.create_ensemble()
            ensemble.fit(X_train, y_train)

            # Evaluate ensemble
            train_pred = ensemble.predict(X_train)
            test_pred = ensemble.predict(X_test)

            # Print ensemble results
            print("\nEnsemble Model Results:")
            self._print_detailed_report(y_train, train_pred, "Training")
            self._print_detailed_report(y_test, test_pred, "Test")

            # One timestamp for both artifacts so their file names match.
            stamp = datetime.now().strftime("%Y%m%d_%H%M%S")

            # Plot confusion matrix
            plt.figure(figsize=(10, 8))
            cm = confusion_matrix(y_test, test_pred)
            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
            plt.title('Ensemble Model - Confusion Matrix (Test Set)')
            plt.ylabel('True Label')
            plt.xlabel('Predicted Label')
            plt.savefig(f'confusion_matrix_{stamp}.png')
            plt.close()

            # Save model
            self.best_model = ensemble
            model_filename = f'xgbrf_ensemble_{stamp}.joblib'
            joblib.dump(ensemble, model_filename)

            return ensemble

        except Exception as e:
            logging.error(f"Error in training and evaluation: {str(e)}")
            logging.error("Detailed error info:", exc_info=True)
            raise

def main():
    """Run the full pipeline: locate the dataset, preprocess it, train and evaluate.

    Returns None in every case. A missing dataset file is logged and ends the
    run early; any exception raised along the way is logged (with traceback)
    and not propagated.
    """
    try:
        # Initialize classifier
        url_classifier = URLEnsembleClassifier()

        # Load and preprocess data
        # Adjust this to the location of your file
        file_path = '/content/drive/MyDrive/Extracted CSV/converted_features.csv'

        logging.info(f"Checking file at: {file_path}")
        dataset_missing = not os.path.exists(file_path)
        if dataset_missing:
            logging.error(f"File not found at: {file_path}")
            return

        # Data preprocessing with min_samples=2
        splits = url_classifier.load_and_preprocess_data(
            file_path, sample_size=40000, min_samples=2
        )

        if splits is None:
            logging.error("Data preprocessing failed")
        else:
            # The binary-label splits are returned but not used by training.
            X_train, X_test, y_train, y_test, _y_bin_train, _y_bin_test = splits
            url_classifier.train_and_evaluate(X_train, X_test, y_train, y_test)
            logging.info("Training and evaluation completed successfully")

    except Exception as e:
        logging.error(f"Error in main execution: {str(e)}")
        logging.error("Detailed error info:", exc_info=True)


if __name__ == "__main__":
    main()


Training xgbrf1...

xgbrf1 Hyperparameter Tuning Results:
Best Parameters: {'colsample_bytree': 0.9, 'learning_rate': 0.05, 'max_depth': 6, 'n_estimators': 100, 'subsample': 0.8}

Binary Classification Report - Training Data:
Accuracy: 0.8373
Macro Avg F1-Score: 0.8373
Weighted Avg F1-Score: 0.8373

Detailed Report:
                precision  recall  f1-score  support
0               0.8      0.8      0.8     122016.0
accuracy         0.8      0.8      0.8     1.0
macro avg        0.8      0.8      0.8     122016.0
weighted avg     0.8      0.8      0.8     122016.0

Binary Classification Report - Test Data:
Accuracy: 0.8354
Macro Avg F1-Score: 0.8354
Weighted Avg F1-Score: 0.8354

Detailed Report:
                precision  recall  f1-score  support
0               0.8      0.8      0.8     30504.0
accuracy         0.8      0.8      0.8     1.0
macro avg        0.8      0.8      0.8     30504.0
weighted avg     0.8      0.8      0.8     30504.0

Training xgbrf2...

xgbrf2 Hyperparame

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
