In [None]:
%pip install catboost


In [None]:
%pip install numpy==1.26.4 pandas==2.2.2 --force-reinstall


In [None]:
import kagglehub

# Download latest version of the "hassan06/nslkdd" dataset through kagglehub.
# The returned value is kept in `path`; the loading cell below builds the
# KDDTrain+/KDDTest+ file locations from it.
# NOTE(review): presumably this is a local directory containing the dataset
# files — confirm the exact layout against the printed value.
path = kagglehub.dataset_download("hassan06/nslkdd")

print("Path to dataset files:", path)

In [None]:
import pandas as pd
from pathlib import Path


def _resolve_dataset_file(root, filename):
    """Return the location of ``filename`` inside the downloaded dataset.

    kagglehub typically returns the version directory itself, so the file is
    looked up directly under ``root`` first. The nested ``versions/1`` layout
    that this notebook originally hard-coded is tried second, and is also the
    value returned when neither candidate exists, so a missing file still
    fails in ``read_csv`` with the historical path in the error message.

    Parameters
    ----------
    root : str or os.PathLike
        Directory returned by ``kagglehub.dataset_download``.
    filename : str
        Name of the data file, e.g. ``'KDDTrain+.txt'``.

    Returns
    -------
    str
        Path to the first existing candidate, or the legacy path as fallback.
    """
    base_dir = Path(root)
    legacy_location = base_dir / 'versions' / '1' / filename
    for candidate in (base_dir / filename, legacy_location):
        if candidate.is_file():
            return str(candidate)
    return str(legacy_location)


# Define file paths
train_file_path = _resolve_dataset_file(path, 'KDDTrain+.txt')
test_file_path = _resolve_dataset_file(path, 'KDDTest+.txt')

# Names assigned to the 43 columns of the header-less NSL-KDD text files
column_names = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in',
    'num_compromised', 'root_shell', 'su_attempted', 'num_root',
    'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds',
    'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate',
    'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
    'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'label', 'difficulty'
]

# Define Wireshark-exportable features (manually confirmed)
wireshark_features = [
    'duration',
    'protocol_type',
    'service',
    'src_bytes',
    'dst_bytes',
    'flag',
    'land',
    'wrong_fragment',
    'urgent',
    'label'  # include the label for classification
]

# Load datasets
df_train = pd.read_csv(train_file_path, names=column_names, index_col=False)
df_test = pd.read_csv(test_file_path, names=column_names, index_col=False)

# Drop non-Wireshark features. `.copy()` yields independent frames so later
# in-place edits do not operate on a slice of the full table.
df_train = df_train[wireshark_features].copy()
df_test = df_test[wireshark_features].copy()


In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# Drop the 'difficulty' column if it is still present. The loading cell keeps
# only the Wireshark-exportable columns, which do not include 'difficulty', so
# an unconditional drop raises KeyError; errors='ignore' makes this a no-op in
# that case. Rebinding (instead of inplace=True) keeps the cell re-runnable.
df_train = df_train.drop(columns=['difficulty'], errors='ignore')
df_test = df_test.drop(columns=['difficulty'], errors='ignore')

# Stack train and test so features and labels are derived in one pass;
# the rows are split back by position further down.
df_combined = pd.concat([df_train, df_test])

# Identify categorical and numerical columns
categorical_features = ['protocol_type', 'service', 'flag']
numerical_features = [
    col for col in df_train.columns
    if col not in categorical_features + ['label']
]

# Define preprocessing for categorical and numerical data.
# handle_unknown='ignore' lets categories that only occur outside the
# training rows be encoded without raising at transform time.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', StandardScaler(), numerical_features)
    ])

# Prepare features and labels
X_combined = df_combined.drop('label', axis=1)
# Binary classification target: 0 for 'normal' traffic, 1 for anything else
y_combined = df_combined['label'].ne('normal').astype(int)

# Split back into train and test sets (train rows come first in df_combined)
n_train_rows = len(df_train)
X_train = X_combined.iloc[:n_train_rows, :]
X_test = X_combined.iloc[n_train_rows:, :]
y_train = y_combined.iloc[:n_train_rows]
y_test = y_combined.iloc[n_train_rows:]

# Fit the preprocessor on the training rows only, then apply it to both splits
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)


In [None]:
from catboost import CatBoostClassifier
import pickle

# Hyper-parameters of the CatBoost classifier, collected in one place
catboost_params = dict(iterations=1000, learning_rate=0.1, depth=6, verbose=200)
model = CatBoostClassifier(**catboost_params)

# Train with early stopping monitored on the evaluation set.
# NOTE(review): the evaluation set here is the test split itself, so metrics
# later reported on X_test/y_test are presumably optimistic — confirm whether
# a separate validation split is wanted.
model.fit(
    X_train,
    y_train,
    eval_set=(X_test, y_test),
    early_stopping_rounds=10,
)

# Persist the fitted classifier to disk as a pickle
model_filename = "catboost_model.pkl"
with open(model_filename, "wb") as model_file:
    pickle.dump(model, model_file)

print(f"Model saved as {model_filename}")

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Class predictions of the CatBoost model for the held-out rows
y_pred = model.predict(X_test)

# Evaluate the four classification metrics in a fixed order
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report each metric rounded to two decimals
for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"Model {metric_name}: {metric_value:.2f}")


In [None]:
import pandas as pd
import numpy as np

# Seeded generator so the demo record is identical on every Restart & Run All
DUMMY_SEED = 42
_rng = np.random.default_rng(DUMMY_SEED)


def _rand_int(high):
    """Draw one integer uniformly from [0, high)."""
    return int(_rng.integers(0, high))


def _rand_rate():
    """Draw one float uniformly from [0.0, 1.0)."""
    return float(_rng.random())


# Single synthetic connection record covering all 41 original feature columns.
# The notebook's preprocessor is fitted on the reduced Wireshark feature set
# only; the remaining columns are kept for schema completeness.
# NOTE(review): confirm the extra columns are ignored at transform time.
dummy_data = pd.DataFrame({
    'duration': [_rand_int(1000)],
    'protocol_type': ['tcp'],  # Example categorical value
    'service': ['http'],       # Example categorical value
    'flag': ['SF'],            # Example categorical value
    'src_bytes': [_rand_int(10000)],
    'dst_bytes': [_rand_int(10000)],
    'land': [0],
    'wrong_fragment': [0],
    'urgent': [0],
    'hot': [0],
    'num_failed_logins': [0],
    'logged_in': [1],
    'num_compromised': [0],
    'root_shell': [0],
    'su_attempted': [0],
    'num_root': [0],
    'num_file_creations': [0],
    'num_shells': [0],
    'num_access_files': [0],
    'num_outbound_cmds': [0],
    'is_host_login': [0],
    'is_guest_login': [0],
    'count': [_rand_int(100)],
    'srv_count': [_rand_int(100)],
    'serror_rate': [_rand_rate()],
    'srv_serror_rate': [_rand_rate()],
    'rerror_rate': [_rand_rate()],
    'srv_rerror_rate': [_rand_rate()],
    'same_srv_rate': [_rand_rate()],
    'diff_srv_rate': [_rand_rate()],
    'srv_diff_host_rate': [_rand_rate()],
    'dst_host_count': [_rand_int(255)],
    'dst_host_srv_count': [_rand_int(255)],
    'dst_host_same_srv_rate': [_rand_rate()],
    'dst_host_diff_srv_rate': [_rand_rate()],
    'dst_host_same_src_port_rate': [_rand_rate()],
    'dst_host_srv_diff_host_rate': [_rand_rate()],
    'dst_host_serror_rate': [_rand_rate()],
    'dst_host_srv_serror_rate': [_rand_rate()],
    'dst_host_rerror_rate': [_rand_rate()],
    'dst_host_srv_rerror_rate': [_rand_rate()]
})


In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Requires `preprocessor` to have been fitted by the preprocessing cell above;
# this cell only reuses it and does not fit anything.
# NOTE(review): the preprocessor was fitted on the reduced Wireshark feature
# columns, while `dummy_data` carries the full original schema — confirm that
# the additional columns are simply not used by the transform.

# Transform the dummy data
dummy_data_transformed = preprocessor.transform(dummy_data)


In [None]:
import pickle

# Reload the CatBoost classifier that the training cell pickled to disk.
# NOTE(review): pickle.load executes code embedded in the file; only load
# pickles this notebook produced itself, never files from untrusted sources.
model_filename = 'catboost_model.pkl'  # Replace with your actual model filename
with open(model_filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)


In [None]:
# Run the reloaded classifier on the single preprocessed dummy record
predictions = loaded_model.predict(dummy_data_transformed)

# Show the class assigned to that record
first_prediction = predictions[0]
print(f"Prediction for dummy data: {first_prediction}")


In [None]:
import lightgbm as lgb
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Normalize the data.
# X_train/X_test come out of the ColumnTransformer as a sparse matrix (mostly
# one-hot columns). StandardScaler refuses to centre sparse input and raises
# ValueError with its default with_mean=True, so only variance scaling is
# applied here; the numeric columns were already centred by the preprocessor.
scaler = StandardScaler(with_mean=False)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Convert data to LightGBM Dataset format; the validation set references the
# training Dataset so both share the same feature binning.
train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# Define parameters for LightGBM
params = {
    'objective': 'binary',
    'metric': 'binary_error',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.1,
    'feature_fraction': 0.9
}

# Train the model
# NOTE(review): the validation set is the test split, so the error logged
# during training is measured on the same rows used for the final metrics.
lgb_model = lgb.train(
    params,
    train_data,
    num_boost_round=1000,
    valid_sets=[valid_data],  # Pass the Dataset object for validation
)

# Save the trained LightGBM model as a pickle file
lgb_model_filename = "lightgbm_model.pkl"
with open(lgb_model_filename, "wb") as model_file:
    pickle.dump(lgb_model, model_file)

print(f"LightGBM Model saved as {lgb_model_filename}")

In [None]:
%pip install pytorch-tabnet


In [None]:
from pytorch_tabnet.tab_model import TabNetClassifier
import pickle
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score


# Architecture and optimiser settings for the TabNet classifier
tabnet_params = {
    "n_d": 8,
    "n_a": 8,
    "n_steps": 3,
    "gamma": 1.3,
    "lambda_sparse": 1e-5,
    "clip_value": 2.,
    "optimizer_fn": torch.optim.Adam,  # the optimizer class itself, not an instance
    "optimizer_params": dict(lr=2e-2),
    "mask_type": "entmax",
}
tabnet_model = TabNetClassifier(**tabnet_params)

# The evaluation set is passed as a list of (features, labels) tuples.
# NOTE(review): X_train/X_test are whatever the preceding cells left bound to
# these names — confirm their type is one TabNet accepts.
evaluation_sets = [(X_test, y_test)]
tabnet_model.fit(X_train, y_train, eval_set=evaluation_sets)

# Persist the fitted TabNet model to disk as a pickle
tabnet_model_filename = "tabnet_model.pkl"
with open(tabnet_model_filename, "wb") as model_file:
    pickle.dump(tabnet_model, model_file)

print(f"TabNet Model saved as {tabnet_model_filename}")


In [None]:
import pickle

# Reload the LightGBM booster that the training cell pickled.
# NOTE(review): unpickling runs code stored in the file — only load pickles
# produced by this notebook.
lgb_pickle_path = "lightgbm_model.pkl"
with open(lgb_pickle_path, "rb") as model_file:
    loaded_lgb_model = pickle.load(model_file)

# Score the held-out rows with the reloaded booster
predictions_lgb = loaded_lgb_model.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Scores produced by the reloaded LightGBM booster for the test rows
predictions_lgb = loaded_lgb_model.predict(X_test)

# Scores at or above the cut-off are labelled 1, everything else 0
decision_threshold = 0.5
y_pred = (predictions_lgb >= decision_threshold).astype(int)

# Compute the metrics once, keyed by the label used in the report
lgb_scores = {
    "Accuracy": accuracy_score(y_test, y_pred),
    "Precision": precision_score(y_test, y_pred),
    "Recall": recall_score(y_test, y_pred),
    "F1 Score": f1_score(y_test, y_pred),
}
accuracy, precision, recall, f1 = lgb_scores.values()

# Print results
for metric_label, metric_value in lgb_scores.items():
    print(f"Model {metric_label}: {metric_value:.2f}")


In [None]:
# Reload the TabNet classifier that the training cell pickled.
# NOTE(review): unpickling runs code stored in the file — only load pickles
# produced by this notebook.
tabnet_pickle_path = "tabnet_model.pkl"
with open(tabnet_pickle_path, "rb") as model_file:
    loaded_tabnet_model = pickle.load(model_file)

# Class predictions of the reloaded model for the held-out rows
predictions_tabnet = loaded_tabnet_model.predict(X_test)


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Class predictions of the reloaded TabNet model for the test rows
y_pred = loaded_tabnet_model.predict(X_test)

# Score the predictions against the true test labels
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Emit the four report lines in one call, each value rounded to two decimals
tabnet_report = [
    f"Model Accuracy: {accuracy:.2f}",
    f"Model Precision: {precision:.2f}",
    f"Model Recall: {recall:.2f}",
    f"Model F1 Score: {f1:.2f}",
]
print("\n".join(tabnet_report))
