In [1]:
import pandas as pd

In [2]:
train_data= pd.read_parquet('UNSW_NB15_training-set.parquet')
test_data= pd.read_parquet('UNSW_NB15_testing-set.parquet')

In [3]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

columns_to_use = ['dur', 'proto', 'sbytes', 'dbytes', 'spkts', 'dpkts', 'rate', 'sload', 'dload', 'label']

train_selected = train_data[columns_to_use]
test_selected = test_data[columns_to_use]

preprocessing_steps = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['dur', 'sbytes', 'dbytes', 'spkts', 'dpkts', 'rate', 'sload', 'dload']),
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['proto'])
    ]
)

preprocess_pipeline = Pipeline(steps=[('preprocessor', preprocessing_steps)])

X_train_preprocessed = preprocess_pipeline.fit_transform(train_selected.drop('label', axis=1))
X_test_preprocessed = preprocess_pipeline.transform(test_selected.drop('label', axis=1))

X_train_preprocessed.shape, X_test_preprocessed.shape


((82332, 139), (175341, 139))

In [4]:
y_train = train_selected['label']
y_test = test_selected['label']

# Experiment 1

In [5]:
import numpy as np
from sklearn.metrics import accuracy_score, classification_report

population_size = 100
chromosome_length = X_train_preprocessed.shape[1]
population = np.random.randint(0, 2, (population_size, chromosome_length))

def calculate_fitness(individual, features, labels):
    if isinstance(features, np.ndarray):
        features_dense = features
    else:
        features_dense = features.toarray()
    individual = np.array(individual).reshape(-1)
    prediction_scores = np.dot(features_dense, individual)
    predictions = prediction_scores > 0.5
    true_positives = np.sum((predictions == 1) & (labels == 1))
    true_negatives = np.sum((predictions == 0) & (labels == 0))
    false_positives = np.sum((predictions == 1) & (labels == 0))
    false_negatives = np.sum((predictions == 0) & (labels == 1))
    return true_positives * 2 + true_negatives - false_positives - 2 * false_negatives

def select(population, fitness):
    fitness_shifted = fitness - np.min(fitness) + 1e-3
    probability = fitness_shifted / np.sum(fitness_shifted)
    indices = np.random.choice(np.arange(population_size), size=population_size, p=probability)
    return population[indices]

def crossover(parent1, parent2):
    point = np.random.randint(1, chromosome_length - 1)
    child1 = np.concatenate((parent1[:point], parent2[point:]))
    child2 = np.concatenate((parent2[:point], parent1[point:]))
    return child1, child2

def mutate(individual, mutation_rate=0.01):
    for i in range(chromosome_length):
        if np.random.rand() < mutation_rate:
            individual[i] = 1 - individual[i]
    return individual

best_fitness_score = -np.inf
best_chromosome = None
for generation in range(50):
    fitness = np.array([calculate_fitness(ind, X_train_preprocessed, y_train) for ind in population])
    if np.max(fitness) > best_fitness_score:
        best_fitness_score = np.max(fitness)
        best_chromosome = population[np.argmax(fitness)]
    population = select(population, fitness)
    next_population = []
    for i in range(0, population_size, 2):
        parent1, parent2 = population[i], population[i+1]
        child1, child2 = crossover(parent1, parent2)
        next_population.extend([child1, child2])
    population = np.array([mutate(ind) for ind in next_population])
    print(f"Generation {generation}: Best Fitness - {best_fitness_score}")

def predict(data, chromosome):
    if isinstance(data, np.ndarray):
        data_dense = data
    else:
        data_dense = data.toarray()
    chromosome = np.array(chromosome).reshape(-1)
    prediction_scores = np.dot(data_dense, chromosome)
    predictions = prediction_scores > 0.5  
    return predictions

predictions = predict(X_test_preprocessed, best_chromosome)
print("Classification Report:")
print(classification_report(y_test, predictions))
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)


Generation 0: Best Fitness - 56844
Generation 1: Best Fitness - 56844
Generation 2: Best Fitness - 56844
Generation 3: Best Fitness - 56844
Generation 4: Best Fitness - 56844
Generation 5: Best Fitness - 56844
Generation 6: Best Fitness - 56844
Generation 7: Best Fitness - 56844
Generation 8: Best Fitness - 56844
Generation 9: Best Fitness - 56844
Generation 10: Best Fitness - 56844
Generation 11: Best Fitness - 56844
Generation 12: Best Fitness - 56844
Generation 13: Best Fitness - 56844
Generation 14: Best Fitness - 56844
Generation 15: Best Fitness - 56844
Generation 16: Best Fitness - 56844
Generation 17: Best Fitness - 56844
Generation 18: Best Fitness - 56844
Generation 19: Best Fitness - 57192
Generation 20: Best Fitness - 57352
Generation 21: Best Fitness - 57352
Generation 22: Best Fitness - 57428
Generation 23: Best Fitness - 57712
Generation 24: Best Fitness - 57904
Generation 25: Best Fitness - 61508
Generation 26: Best Fitness - 62088
Generation 27: Best Fitness - 66816
Ge

In [6]:
np.save("best_chromosome.npy", best_chromosome)

## Predictions

In [7]:
import numpy as np
def predict_new(data, chromosome):
    if isinstance(data, np.ndarray):
        data_dense = data
    else:
        data_dense = data.toarray()

    chromosome = np.array(chromosome).reshape(-1)
    
    prediction_scores = np.dot(data_dense, chromosome)
    predictions = prediction_scores > 0.5  
    return predictions.astype(int)
best_chromosome = np.load("best_chromosome.npy")

predicted_labels = predict_new(X_test_preprocessed, best_chromosome)

print("Predicted labels:")
print(predicted_labels)


Predicted labels:
[0 1 1 ... 1 1 1]


# Experiment 2

In [15]:
import numpy as np
from sklearn.metrics import accuracy_score, classification_report

population_size = 150
chromosome_length = X_train_preprocessed.shape[1]
population = np.random.randint(0, 2, (population_size, chromosome_length))

def calculate_fitness(individual, features, labels):
    if isinstance(features, np.ndarray):
        features_dense = features
    else:
        features_dense = features.toarray()
    individual = np.array(individual).reshape(-1)
    prediction_scores = np.dot(features_dense, individual)
    predictions = prediction_scores > 0.5
    true_positives = np.sum((predictions == 1) & (labels == 1))
    true_negatives = np.sum((predictions == 0) & (labels == 0))
    false_positives = np.sum((predictions == 1) & (labels == 0))
    false_negatives = np.sum((predictions == 0) & (labels == 1))
    return true_positives * 2 + true_negatives - false_positives - 2 * false_negatives

def select(population, fitness):
    fitness_shifted = fitness - np.min(fitness) + 1e-3
    probability = fitness_shifted / np.sum(fitness_shifted)
    indices = np.random.choice(np.arange(population_size), size=population_size, p=probability)
    return population[indices]

def crossover(parent1, parent2):
    points = sorted(np.random.randint(1, chromosome_length - 1, size=2))
    child1 = np.concatenate([parent1[:points[0]], parent2[points[0]:points[1]], parent1[points[1]:]])
    child2 = np.concatenate([parent2[:points[0]], parent1[points[0]:points[1]], parent2[points[1]:]])
    return child1, child2

def mutate(individual, mutation_rate=0.05):
    for i in range(chromosome_length):
        if np.random.rand() < mutation_rate:
            individual[i] = 1 - individual[i]
    return individual

best_fitness_score = -np.inf
best_chromosome = None
for generation in range(50):
    fitness = np.array([calculate_fitness(ind, X_train_preprocessed, y_train) for ind in population])
    if np.max(fitness) > best_fitness_score:
        best_fitness_score = np.max(fitness)
        best_chromosome = population[np.argmax(fitness)]
    population = select(population, fitness)
    next_population = []
    for i in range(0, population_size, 2):
        parent1, parent2 = population[i], population[i+1]
        child1, child2 = crossover(parent1, parent2)
        next_population.extend([child1, child2])
    population = np.array([mutate(ind) for ind in next_population])
    print(f"Generation {generation}: Best Fitness - {best_fitness_score}")

def predict(data, chromosome):
    if isinstance(data, np.ndarray):
        data_dense = data
    else:
        data_dense = data.toarray()
    chromosome = np.array(chromosome).reshape(-1)
    prediction_scores = np.dot(data_dense, chromosome)
    predictions = prediction_scores > 0.5  
    return predictions

predictions = predict(X_test_preprocessed, best_chromosome)
print("Classification Report:")
print(classification_report(y_test, predictions))
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)


Generation 0: Best Fitness - 51250
Generation 1: Best Fitness - 51250
Generation 2: Best Fitness - 52680
Generation 3: Best Fitness - 52680
Generation 4: Best Fitness - 60086
Generation 5: Best Fitness - 60340
Generation 6: Best Fitness - 60340
Generation 7: Best Fitness - 60340
Generation 8: Best Fitness - 60340
Generation 9: Best Fitness - 60340
Generation 10: Best Fitness - 65652
Generation 11: Best Fitness - 65652
Generation 12: Best Fitness - 65652
Generation 13: Best Fitness - 65652
Generation 14: Best Fitness - 66156
Generation 15: Best Fitness - 66156
Generation 16: Best Fitness - 66156
Generation 17: Best Fitness - 66156
Generation 18: Best Fitness - 66156
Generation 19: Best Fitness - 66156
Generation 20: Best Fitness - 66156
Generation 21: Best Fitness - 66156
Generation 22: Best Fitness - 66156
Generation 23: Best Fitness - 66156
Generation 24: Best Fitness - 66156
Generation 25: Best Fitness - 67324
Generation 26: Best Fitness - 67388
Generation 27: Best Fitness - 67388
Ge

# Experiment 3

In [16]:
import numpy as np
from sklearn.metrics import accuracy_score, classification_report

population_size = 200
chromosome_length = X_train_preprocessed.shape[1]
population = np.random.randint(0, 2, (population_size, chromosome_length))

def calculate_fitness(individual, features, labels):
    if isinstance(features, np.ndarray):
        features_dense = features
    else:
        features_dense = features.toarray()
    individual = np.array(individual).reshape(-1)
    prediction_scores = np.dot(features_dense, individual)
    predictions = prediction_scores > 0.5
    true_positives = np.sum((predictions == 1) & (labels == 1))
    true_negatives = np.sum((predictions == 0) & (labels == 0))
    false_positives = np.sum((predictions == 1) & (labels == 0))
    false_negatives = np.sum((predictions == 0) & (labels == 1))
    return true_positives * 2 + true_negatives - false_positives - 2 * false_negatives

def select(population, fitness):
    fitness_shifted = fitness - np.min(fitness) + 1e-3
    probability = fitness_shifted / np.sum(fitness_shifted)
    indices = np.random.choice(np.arange(population_size), size=population_size, p=probability)
    return population[indices]

def crossover(parent1, parent2):
    mask = np.random.randint(0, 2, size=chromosome_length)
    child1 = np.where(mask, parent1, parent2)
    child2 = np.where(mask, parent2, parent1)
    return child1, child2

def mutate(individual, mutation_rate=0.1):
    for i in range(chromosome_length):
        if np.random.rand() < mutation_rate:
            individual[i] = 1 - individual[i]
    return individual

best_fitness_score = -np.inf
best_chromosome = None
for generation in range(50):
    fitness = np.array([calculate_fitness(ind, X_train_preprocessed, y_train) for ind in population])
    if np.max(fitness) > best_fitness_score:
        best_fitness_score = np.max(fitness)
        best_chromosome = population[np.argmax(fitness)]
    population = select(population, fitness)
    next_population = []
    for i in range(0, population_size, 2):
        parent1, parent2 = population[i], population[i+1]
        child1, child2 = crossover(parent1, parent2)
        next_population.extend([child1, child2])
    population = np.array([mutate(ind) for ind in next_population])
    print(f"Generation {generation}: Best Fitness - {best_fitness_score}")

def predict(data, chromosome):
    if isinstance(data, np.ndarray):
        data_dense = data
    else:
        data_dense = data.toarray()
    chromosome = np.array(chromosome).reshape(-1)
    prediction_scores = np.dot(data_dense, chromosome)
    predictions = prediction_scores > 0.5  
    return predictions

predictions = predict(X_test_preprocessed, best_chromosome)
print("Classification Report:")
print(classification_report(y_test, predictions))
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)


Generation 0: Best Fitness - 52002
Generation 1: Best Fitness - 58946
Generation 2: Best Fitness - 58946
Generation 3: Best Fitness - 60950
Generation 4: Best Fitness - 66748
Generation 5: Best Fitness - 66748
Generation 6: Best Fitness - 66748
Generation 7: Best Fitness - 66748
Generation 8: Best Fitness - 66748
Generation 9: Best Fitness - 66748
Generation 10: Best Fitness - 66748
Generation 11: Best Fitness - 66748
Generation 12: Best Fitness - 66748
Generation 13: Best Fitness - 66748
Generation 14: Best Fitness - 66748
Generation 15: Best Fitness - 66748
Generation 16: Best Fitness - 66748
Generation 17: Best Fitness - 66748
Generation 18: Best Fitness - 66748
Generation 19: Best Fitness - 66748
Generation 20: Best Fitness - 66748
Generation 21: Best Fitness - 66748
Generation 22: Best Fitness - 66748
Generation 23: Best Fitness - 66748
Generation 24: Best Fitness - 66748
Generation 25: Best Fitness - 66748
Generation 26: Best Fitness - 66748
Generation 27: Best Fitness - 66748
Ge