## Load data sets and split into features and labels

In [1]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import Perceptron
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier

# dataset title: network intrusion detection
# dataset source: https://www.kaggle.com/datasets/sampadab17/network-intrusion-detection/data

# Load the labelled training set and the unlabelled test set from the working directory.
nid_train_data = pd.read_csv('network_intrusion_detection_train.csv')
nid_test_data = pd.read_csv('network_intrusion_detection_test.csv')

# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() would append a stray "None".
nid_train_data.info()

# Check how the training rows are split between the values of the 'class' column.
class_distribution = nid_train_data['class'].value_counts()
print(class_distribution)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25192 entries, 0 to 25191
Data columns (total 42 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   duration                     25192 non-null  int64  
 1   protocol_type                25192 non-null  object 
 2   service                      25192 non-null  object 
 3   flag                         25192 non-null  object 
 4   src_bytes                    25192 non-null  int64  
 5   dst_bytes                    25192 non-null  int64  
 6   land                         25192 non-null  int64  
 7   wrong_fragment               25192 non-null  int64  
 8   urgent                       25192 non-null  int64  
 9   hot                          25192 non-null  int64  
 10  num_failed_logins            25192 non-null  int64  
 11  logged_in                    25192 non-null  int64  
 12  num_compromised              25192 non-null  int64  
 13  root_shell      

In [2]:
# Encode the string class labels as integers for the classifiers.
# The raw 'class' column of nid_train_data is deliberately left untouched:
# overwriting it in place would make this cell non-idempotent (a second run
# would refit the encoder on the integer codes and lose the original label
# names stored in label_encoder.classes_).
label_encoder = LabelEncoder()

X_train = nid_train_data.drop(columns=['class'])  # X - features
y_train = pd.Series(                              # y - integer-encoded labels
    label_encoder.fit_transform(nid_train_data['class']),
    index=nid_train_data.index,
    name='class',
)

In [3]:
# Categorical columns are one-hot encoded; every other feature column is treated as numeric.
categorical_cols = ['protocol_type', 'service', 'flag']
numeric_cols = X_train.columns.difference(categorical_cols)

# One-hot encode the categorical columns and pass the numeric columns through unchanged.
# handle_unknown='ignore' lets the encoder cope with categories that appear only in the test set.
# sparse_threshold=0 forces a dense result: OneHotEncoder emits a sparse matrix, and if the
# stacked output stayed sparse the StandardScaler below (which centers the data) would reject it.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numeric_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ],
    sparse_threshold=0,
)

# Fit the encoding on the training features only, then apply the same encoding to the test set.
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(nid_test_data)

# Standardize after encoding; scaling statistics come from the training set only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_preprocessed)
X_test_scaled = scaler.transform(X_test_preprocessed)

In [4]:
# Linear Perceptron: train on the scaled training features (at most 1000 passes),
# then label the scaled test set.
linear_perceptron = Perceptron(max_iter=1000).fit(X_train_scaled, y_train)
lp_test_predictions = linear_perceptron.predict(X_test_scaled)

In [5]:
# Random Forest: 100 trees with a fixed seed, trained on the scaled training
# features and then used to label the scaled test set.
random_forest = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train_scaled, y_train)
rf_test_predictions = random_forest.predict(X_test_scaled)

In [6]:
# Voting Classifier: majority ('hard') vote over RandomForest, DecisionTree, KNN and MLP.
# NOTE(review): with four voters a 2-2 split is possible; scikit-learn's documented
# tie-break favours the lowest class label -- confirm that bias is acceptable here.
decision_tree = DecisionTreeClassifier(random_state=42)
knn = KNeighborsClassifier(n_neighbors=5)
mlp = MLPClassifier(max_iter=300, random_state=42)

ensemble_members = [
    ('rf', random_forest),
    ('dt', decision_tree),
    ('knn', knn),
    ('mlp', mlp),
]
voting_clf = VotingClassifier(estimators=ensemble_members, voting='hard')

# Train the ensemble on the scaled training features, then label the scaled test set.
voting_clf.fit(X_train_scaled, y_train)
vc_test_predictions = voting_clf.predict(X_test_scaled)

In [7]:
# Save each model's test-set predictions to its own CSV file and report how many
# rows were assigned to each encoded class.
predictions = [
    (lp_test_predictions, 'Linear_Perceptron_Predictions', 'lp_test_predictions.csv'),
    (rf_test_predictions, 'Random_Forest_Predictions', 'rf_test_predictions.csv'),
    (vc_test_predictions, 'Voting_Classifier_Predictions', 'vc_test_predictions.csv')
]

# Write every file exactly once, keeping the frames in memory so the counts below
# do not need to re-read the CSVs from disk.
prediction_frames = []
for pred, col_name, file_name in predictions:
    pred_df = pd.DataFrame(pred, columns=[col_name])
    pred_df.to_csv(file_name, index=False)
    prediction_frames.append((pred_df, col_name, file_name))
print("Predictions saved to CSV files. (done on test dataset)")

# LabelEncoder assigns integer codes in sorted label order, so the meaning of
# 0 and 1 is printed from the fitted encoder instead of being assumed in a comment.
print("Encoded class meanings:")
for code, label in enumerate(label_encoder.classes_):
    print(f"  {code} = {label}")
print()

for pred_df, col_name, file_name in prediction_frames:
    pred_count = pred_df[col_name].value_counts()
    print(f"Counts in {file_name}:")
    print(pred_count)
    print()

Predictions saved to CSV files. (done on test dataset)
Counts in lp_test_predictions.csv:
Linear_Perceptron_Predictions
1    13682
0     8862
Name: count, dtype: int64

Counts in rf_test_predictions.csv:
Random_Forest_Predictions
1    14196
0     8348
Name: count, dtype: int64

Counts in vc_test_predictions.csv:
Voting_Classifier_Predictions
1    13000
0     9544
Name: count, dtype: int64



In [8]:
def evaluate_on_training_data(model, features, labels, cv=5):
    """Score a fitted model on the training data and cross-validate it.

    Args:
        model: A fitted estimator exposing ``score``; it is also passed to
            ``cross_val_score``.
        features: Feature matrix used for both the score and the cross-validation.
        labels: Target labels aligned with ``features``.
        cv: Number of cross-validation folds.

    Returns:
        A ``(train_accuracy, fold_scores)`` tuple: the value of
        ``model.score(features, labels)`` and the per-fold scores returned by
        ``cross_val_score``.
    """
    train_accuracy = model.score(features, labels)
    fold_scores = cross_val_score(model, features, labels, cv=cv)
    return train_accuracy, fold_scores


# NOTE: X_train_scaled was encoded and standardized using the whole training set
# before cross-validation, so each fold's held-out rows already influenced the
# preprocessing statistics; treat the CV figures as slightly optimistic.

# Linear Perceptron
lp_train_accuracy, lp_cv_scores = evaluate_on_training_data(linear_perceptron, X_train_scaled, y_train)
print(f'Linear Perceptron Training Accuracy: {lp_train_accuracy}')
print(f'Linear Perceptron Cross-Validation Accuracy: {lp_cv_scores.mean()}')

# Random Forest
rf_train_accuracy, rf_cv_scores = evaluate_on_training_data(random_forest, X_train_scaled, y_train)
cv_scores = rf_cv_scores  # previous name for the random forest fold scores, kept as an alias
print(f'Random Forest Training Accuracy: {rf_train_accuracy}')
print(f'Random Forest Cross-Validation Accuracy: {rf_cv_scores.mean()}')

# Voting Classifier
vc_train_accuracy, vc_cv_scores = evaluate_on_training_data(voting_clf, X_train_scaled, y_train)
print(f'Voting Classifier Training Accuracy: {vc_train_accuracy}')
print(f'Voting Classifier Cross-Validation Accuracy: {vc_cv_scores.mean()}')

Linear Perceptron Training Accuracy: 0.9438710701810098
Linear Perceptron Cross-Validation Accuracy: 0.9682837582615818
Random Forest Training Accuracy: 0.9999603048586853
Random Forest Cross-Validation Accuracy: 0.997221316447076
Voting Classifier Training Accuracy: 0.9996824388694824
Voting Classifier Cross-Validation Accuracy: 0.9970228722514605
