In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from imblearn.combine import SMOTEENN
from collections import Counter
from sklearn.feature_selection import mutual_info_classif, SelectKBest
from sklearn.decomposition import PCA

# Ensure the directories for saving data exist
def create_directories():
    """Create the ``data/`` and ``models/`` output folders when they are missing.

    Both paths are relative to the current working directory. A message is
    printed for each folder that is actually created; folders that already
    exist are left untouched and produce no output.
    """
    for folder in ('data/', 'models/'):
        if os.path.exists(folder):
            continue
        os.makedirs(folder)
        print(f"Created directory: {folder}")

# Run up front so the later cells can write their outputs into data/ and models/.
create_directories()


In [None]:
'''
duration: continuous.
protocol_type: symbolic.
service: symbolic.
flag: symbolic.
src_bytes: continuous.
dst_bytes: continuous.
land: symbolic.
wrong_fragment: continuous.
urgent: continuous.
hot: continuous.
num_failed_logins: continuous.
logged_in: symbolic.
num_compromised: continuous.
root_shell: continuous.
su_attempted: continuous.
num_root: continuous.
num_file_creations: continuous.
num_shells: continuous.
num_access_files: continuous.
num_outbound_cmds: continuous.
is_host_login: symbolic.
is_guest_login: symbolic.
count: continuous.
srv_count: continuous.
serror_rate: continuous.
srv_serror_rate: continuous.
rerror_rate: continuous.
srv_rerror_rate: continuous.
same_srv_rate: continuous.
diff_srv_rate: continuous.
srv_diff_host_rate: continuous.
dst_host_count: continuous.
dst_host_srv_count: continuous.
dst_host_same_srv_rate: continuous.
dst_host_diff_srv_rate: continuous.
dst_host_same_src_port_rate: continuous.
dst_host_srv_diff_host_rate: continuous.
dst_host_serror_rate: continuous.
dst_host_srv_serror_rate: continuous.
dst_host_rerror_rate: continuous.
dst_host_srv_rerror_rate: continuous.
'''


In [None]:
# Load and split dataset
def load_and_split_data(df, target_column='outcome', test_size=0.2, random_state=42):
    """Split ``df`` into stratified train/test partitions and write them to ``data/``.

    Despite the name, nothing is read from disk here: the caller passes in the
    frame that has already been loaded.

    Args:
        df: DataFrame holding the feature columns plus the target column.
        target_column: Name of the label column; it is removed from the features.
        test_size: Forwarded to ``train_test_split``.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        ``(X_train, X_test, Y_train, Y_test)`` as produced by ``train_test_split``.

    Side effects:
        Writes ``data/X_train.csv``, ``data/X_test.csv``, ``data/Y_train.csv``
        and ``data/Y_test.csv`` (without the index column).
    """
    print("Loading and splitting the dataset...")
    features = df.drop(target_column, axis=1)
    labels = df[target_column]
    splits = train_test_split(
        features,
        labels,
        test_size=test_size,
        random_state=random_state,
        stratify=labels,  # stratify the split on the target labels
    )

    # Persist every partition as data/<name>.csv, in the order they are returned.
    for name, part in zip(('X_train', 'X_test', 'Y_train', 'Y_test'), splits):
        part.to_csv(f'data/{name}.csv', index=False)

    print("Data split into training and testing sets.")
    X_train, X_test, Y_train, Y_test = splits
    return X_train, X_test, Y_train, Y_test


# Categorize the attack types into the 5 classes
def categorize_attack_type(label):
    """Map a raw connection label onto one of five traffic classes.

    Membership is tested against the module-level lists ``dos_attacks``,
    ``r2l_attacks``, ``u2r_attacks`` and ``probe_attacks``, in that order.
    A label found in none of them is reported as ``'normal'``.
    """
    return (
        'DOS' if label in dos_attacks
        else 'R2L' if label in r2l_attacks
        else 'U2R' if label in u2r_attacks
        else 'probing' if label in probe_attacks
        else 'normal'
    )
    

# Example usage: collapse the raw labels into five classes, then split.
target_column = 'outcome'  # Define your target column

# Raw label strings (each ends with a dot) grouped by attack family.
# categorize_attack_type reads these four lists as module-level globals.
dos_attacks = ['smurf.', 'neptune.', 'back.', 'teardrop.', 'pod.', 'land.']
r2l_attacks = [
    'warezclient.', 'guess_passwd.', 'imap.', 'warezmaster.',
    'ftp_write.', 'phf.', 'spy.', 'multihop.',
]
u2r_attacks = ['buffer_overflow.', 'loadmodule.', 'rootkit.', 'perl.']
probe_attacks = ['satan.', 'ipsweep.', 'portsweep.', 'nmap.']

df = pd.read_csv('data.csv')  # Replace with your actual dataset
df['outcome'] = df['outcome'].apply(categorize_attack_type)

X_train, X_test, Y_train, Y_test = load_and_split_data(df, target_column)


In [None]:
# One-hot encoding categorical features
def one_hot_encode(X_train, X_test, categorical_columns):
    """One-hot encode the given columns and append them to the remaining features.

    A separate ``OneHotEncoder`` is fitted per column on the training data only
    and then applied to the test data, using ``handle_unknown='ignore'``. The
    encoded columns are placed after the untouched columns, in the order given
    by ``categorical_columns``.

    Args:
        X_train: Training features (DataFrame).
        X_test: Test features (DataFrame) containing the same columns.
        categorical_columns: Names of the columns to encode. May be empty, in
            which case the features are returned unchanged as arrays.

    Returns:
        ``(X_train_final, X_test_final, encoders)``: two NumPy arrays and a dict
        mapping each encoded column name to its fitted encoder.

    Raises:
        KeyError: If a name in ``categorical_columns`` is not a column of the
            input frames.

    Side effects:
        Writes ``data/X_train_encoded.npy`` and ``data/X_test_encoded.npy``,
        creating ``data/`` if needed.
    """
    print("One-hot encoding categorical features...")
    encoders = {}

    # Seed each list with the untouched columns so the final concatenation
    # always has at least one array, even when there is nothing to encode.
    train_parts = [X_train.drop(columns=categorical_columns).values]
    test_parts = [X_test.drop(columns=categorical_columns).values]

    for col in categorical_columns:
        encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        # Fit on the training column only; the test column is just transformed.
        train_parts.append(encoder.fit_transform(X_train[[col]]))
        test_parts.append(encoder.transform(X_test[[col]]))
        # Keep the fitted encoder so the same mapping can be reused later.
        encoders[col] = encoder

    X_train_final = np.concatenate(train_parts, axis=1)
    X_test_final = np.concatenate(test_parts, axis=1)

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs('data', exist_ok=True)
    np.save('data/X_train_encoded.npy', X_train_final)
    np.save('data/X_test_encoded.npy', X_test_final)

    print("Categorical features encoded and original columns removed.")
    return X_train_final, X_test_final, encoders

# Example usage: encode the three symbolic columns of the split data.
categorical_columns = ['protocol_type', 'service', 'flag']
X_train_encoded, X_test_encoded, encoders = one_hot_encode(
    X_train=X_train,
    X_test=X_test,
    categorical_columns=categorical_columns,
)

In [None]:
# Apply Robust Scaling
def robust_scale(X_train, X_test):
    """Scale both feature sets with a ``RobustScaler`` fitted on the training set.

    Args:
        X_train: Training features; the scaler is fitted on these.
        X_test: Test features; only transformed with the fitted scaler.

    Returns:
        ``(X_train_scaled, X_test_scaled, scaler)``.

    Side effects:
        Writes ``data/X_train_scaled.npy`` and ``data/X_test_scaled.npy``.
    """
    print("Applying robust scaling...")
    scaler = RobustScaler()
    scaled_train = scaler.fit_transform(X_train)
    scaled_test = scaler.transform(X_test)

    # Persist both scaled matrices, training set first.
    outputs = (
        ('data/X_train_scaled.npy', scaled_train),
        ('data/X_test_scaled.npy', scaled_test),
    )
    for path, matrix in outputs:
        np.save(path, matrix)

    print("Data scaled using RobustScaler.")
    return scaled_train, scaled_test, scaler

X_train_scaled, X_test_scaled, scaler = robust_scale(
    X_train=X_train_encoded,
    X_test=X_test_encoded,
)


In [None]:
import numpy as np
from collections import Counter
from imblearn.combine import SMOTEENN  # Import SMOTEENN

# Apply SMOTEENN resampling
def apply_smoteenn(X_train, Y_train):
    """Resample the training set with SMOTEENN and save the result.

    The resampler is built with ``sampling_strategy='auto'`` and
    ``random_state=42``; every other setting is left at the library default.

    Args:
        X_train: Training features.
        Y_train: Training labels aligned with ``X_train``.

    Returns:
        ``(X_train_resampled, Y_train_resampled)``.

    Side effects:
        Writes ``data/X_train_resampled.npy`` and ``data/Y_train_resampled.npy``
        and prints the class counts after resampling.
    """
    print("Applying SMOTEENN for resampling...")

    resampler = SMOTEENN(sampling_strategy='auto', random_state=42)
    X_train_resampled, Y_train_resampled = resampler.fit_resample(X_train, Y_train)

    # Persist features and labels side by side under data/.
    for stem, values in (('X_train_resampled', X_train_resampled),
                         ('Y_train_resampled', Y_train_resampled)):
        np.save(f'data/{stem}.npy', values)

    print("Data resampled using SMOTEENN.")
    class_counts = Counter(Y_train_resampled)
    print(f"Resampled Y distribution: {class_counts}")

    return X_train_resampled, Y_train_resampled

# Resample only the training partition; requires X_train_scaled and Y_train
# from the cells above.
X_train_resampled, Y_train_resampled = apply_smoteenn(
    X_train=X_train_scaled,
    Y_train=Y_train,
)


In [None]:
from sklearn.preprocessing import LabelEncoder

def feature_selection(X_train, Y_train, variance_threshold=0.01, k_best=20, correlation_threshold=0.1, high_corr_threshold=0.9):
    """Reduce the training features with four successive filters.

    Steps, in order:
      1. Drop columns whose variance is not above ``variance_threshold``.
      2. Drop columns whose absolute correlation with the target is below
         ``correlation_threshold``.
      3. For every pair of columns correlated above ``high_corr_threshold``,
         drop the later one.
      4. Keep the ``k_best`` columns ranked by mutual information.

    Args:
        X_train: 2-D array of training features.
        Y_train: Training labels, one per row of ``X_train``. Non-numeric
            labels are converted to sorted integer codes for step 2 only.
        variance_threshold: Threshold passed to ``VarianceThreshold``.
        k_best: Number of features to keep in step 4, or ``'all'``. Capped at
            the number of columns that survive steps 1-3.
        correlation_threshold: Minimum absolute feature/target correlation.
        high_corr_threshold: Absolute pairwise correlation above which the
            later column of a pair is dropped.

    Returns:
        ``(X_train_selected, selector)``. ``selector`` is the fitted
        ``SelectKBest``; it expects the columns left after steps 1-3. It also
        carries ``selected_input_indices_``, the column positions in the
        original ``X_train`` of the finally selected features, so other data
        with the same column layout can be reduced with
        ``X[:, selector.selected_input_indices_]``.

    Raises:
        ValueError: If no feature survives the correlation filters.

    Side effects:
        Writes ``data/X_train_selected.npy``.
    """
    print("Starting feature selection...")

    # Variance Threshold
    vt = VarianceThreshold(threshold=variance_threshold)
    X_train_var = np.asarray(vt.fit_transform(X_train))
    # Positions (in the original X_train) of the columns still present.
    kept_columns = vt.get_support(indices=True)

    # Correlation needs a numeric target. String class labels are mapped to
    # sorted integer codes; wrapping the raw values in a fresh Series also
    # guarantees row-position alignment with the feature frame.
    # NOTE: correlating against arbitrary codes of a nominal target is only a
    # coarse screen; the mutual-information step below is the principled ranking.
    y_values = np.asarray(Y_train)
    if not np.issubdtype(y_values.dtype, np.number):
        y_values = pd.factorize(y_values, sort=True)[0]
    target_codes = pd.Series(y_values)

    target_corr = pd.DataFrame(X_train_var).corrwith(target_codes)
    low_corr_features = target_corr[target_corr.abs() < correlation_threshold].index
    low_corr_positions = low_corr_features.tolist()
    X_train_var = np.delete(X_train_var, low_corr_positions, axis=1)
    kept_columns = np.delete(kept_columns, low_corr_positions)

    print(f"Removed low correlation features: {low_corr_positions}")

    if X_train_var.shape[1] == 0:
        raise ValueError(
            "No features left after the target-correlation filter; "
            "lower correlation_threshold or variance_threshold."
        )

    # Keep one feature from each highly correlated pair: only the strict upper
    # triangle is inspected, so the earlier column of a pair always survives.
    corr_matrix = pd.DataFrame(X_train_var).corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > high_corr_threshold)]

    X_train_var = np.delete(X_train_var, to_drop, axis=1)
    kept_columns = np.delete(kept_columns, to_drop)
    print(f"Removed highly correlated features: {to_drop}")

    # Never ask for more features than are left.
    n_available = X_train_var.shape[1]
    k_effective = k_best if k_best == 'all' else min(k_best, n_available)

    # Mutual Information to select K best features
    selector = SelectKBest(mutual_info_classif, k=k_effective)
    X_train_selected = selector.fit_transform(X_train_var, Y_train)

    # Record which original columns were finally selected.
    selector.selected_input_indices_ = kept_columns[selector.get_support()]

    # Save the selected features
    np.save('data/X_train_selected.npy', X_train_selected)

    print(f"Selected {X_train_selected.shape[1]} best features using mutual information.")
    return X_train_selected, selector

X_train_selected, selector = feature_selection(
    X_train=X_train_resampled,
    Y_train=Y_train_resampled,
)


In [None]:
# Save encoders and scalers
def save_model_instance(instance, filename):
    """Persist ``instance`` to ``models/<filename>.npy`` using ``np.save``.

    Objects that are not arrays are stored by NumPy as a pickled 0-d object
    array, so read them back with
    ``np.load(path, allow_pickle=True).item()``.

    Args:
        instance: Object to store (e.g. a fitted encoder, scaler or selector).
        filename: File stem; the ``models/`` folder and ``.npy`` suffix are added.
    """
    print(f"Saving model instance: {filename}...")
    destination = f'models/{filename}.npy'
    np.save(destination, instance)

# Persist the fitted preprocessing objects produced by the cells above.
save_model_instance(instance=encoders, filename='encoders')
save_model_instance(instance=scaler, filename='scaler')
save_model_instance(instance=selector, filename='selector')

print("All models and instances saved.")


In [None]:
# Complete pipeline function to execute all steps
def complete_pipeline(df, target_column, categorical_columns):
    """Run every preprocessing stage in order and persist the fitted objects.

    Stages: create output folders, split, one-hot encode, robust-scale,
    SMOTEENN-resample the training set, select features, then save the
    encoders, scaler and selector. Nothing is returned; results are only
    available through the files each stage writes.

    Args:
        df: DataFrame containing the features and the target column.
        target_column: Name of the label column in ``df``.
        categorical_columns: Columns to one-hot encode.
    """
    create_directories()

    train_X, test_X, train_y, _test_y = load_and_split_data(df, target_column)
    encoded_train, encoded_test, encoders = one_hot_encode(train_X, test_X, categorical_columns)
    scaled_train, _scaled_test, scaler = robust_scale(encoded_train, encoded_test)
    resampled_X, resampled_y = apply_smoteenn(scaled_train, train_y)
    _selected, selector = feature_selection(resampled_X, resampled_y)

    # Save the fitted objects in the same order as before.
    fitted_objects = (('encoders', encoders), ('scaler', scaler), ('selector', selector))
    for name, fitted in fitted_objects:
        save_model_instance(fitted, name)


In [None]:
complete_pipeline(
    df=df,
    target_column='outcome',
    categorical_columns=categorical_columns,
)
