In [11]:
# **Genetic Algorithms Exploration**

### The objective of this notebook is to learn more about and experiment with genetic algorithms

In [12]:
import pandas as pd
import random
import numpy as np

In [13]:
data = pd.read_csv("./Url_Processed.csv")
data.head(5)

Unnamed: 0,url,label,result,url_length,hostname_length,path_length,fd_length,count_-,count_@,count_?,...,count_.,count_=,count_http,count_https,count_www,count_digits,count_letters,count_dir,use_of_ip,short_url
0,https://www.google.com,benign,0,22,14,0,0,0,0,0,...,2,0,1,1,1,0,17,0,1,1
1,https://www.youtube.com,benign,0,23,15,0,0,0,0,0,...,2,0,1,1,1,0,18,0,1,1
2,https://www.facebook.com,benign,0,24,16,0,0,0,0,0,...,2,0,1,1,1,0,19,0,1,1
3,https://www.baidu.com,benign,0,21,13,0,0,0,0,0,...,2,0,1,1,1,0,16,0,1,1
4,https://www.wikipedia.org,benign,0,25,17,0,0,0,0,0,...,2,0,1,1,1,0,20,0,1,1


In [14]:
# Hyperparameters
population_size = 10
mutation_rate = 0.1
num_generations = 10

# Function to classify domain as malicious or benign based on prediction score
def classify_domain(individual, domain_data):
    prediction_score = np.dot(individual, domain_data[3:].values)  # Vectorized dot product
    return 'malicious' if prediction_score > 0 else 'benign'

# Function to calculate the fitness of an individual
def calculate_fitness(individual, data):
    # Vectorized prediction and comparison
    predictions = np.dot(data.iloc[:, 3:].values, individual)
    predicted_labels = np.where(predictions > 0, 'malicious', 'benign')
    return np.mean(predicted_labels == data['label'].values)  # Compare predictions with actual labels

# Initialize population with random individuals
population = np.random.uniform(-1, 1, (population_size, len(data.columns) - 3))


In [15]:
# Evolution process
for generation in range(num_generations):
    # Calculate fitness scores for the population
    fitness_scores = np.array([calculate_fitness(ind, data) for ind in population])

    # Normalize fitness scores to use as selection probabilities
    fitness_probs = fitness_scores / fitness_scores.sum()

    # Select parents based on their fitness scores
    parents_indices = np.random.choice(np.arange(population_size), size=2, p=fitness_probs)
    parents = population[parents_indices]

    # Crossover (uniform crossover)
    child = np.array([random.choice([parents[0][i], parents[1][i]]) for i in range(len(parents[0]))])

    # Mutation (random mutation with small change)
    mutation_mask = np.random.rand(len(child)) < mutation_rate
    child[mutation_mask] += np.random.uniform(-0.1, 0.1, size=mutation_mask.sum())

    # Replace a random individual with the child
    random_index = random.randint(0, population_size - 1)
    population[random_index] = child

    # Print the best fitness of the current generation
    best_fitness = fitness_scores.max()
    print(f"Generation {generation + 1}: Best Fitness = {best_fitness:.2f}")

# Find and print the best individual after all generations
best_individual = population[np.argmax(fitness_scores)]
print("\nBest individual:", best_individual)


Generation 1: Best Fitness = 0.78
Generation 2: Best Fitness = 0.78
Generation 3: Best Fitness = 0.78
Generation 4: Best Fitness = 0.78
Generation 5: Best Fitness = 0.78
Generation 6: Best Fitness = 0.78
Generation 7: Best Fitness = 0.78
Generation 8: Best Fitness = 0.78
Generation 9: Best Fitness = 0.77
Generation 10: Best Fitness = 0.77

Best individual: [ 0.56078119 -0.66981529 -0.40013185 -0.48991546  0.34951569 -0.24227564
  0.71042028 -0.85296104 -0.31198728 -0.76484059 -0.99297229  0.04723614
  0.98982016  0.30834335 -0.89382837 -0.65475514 -0.78357431  0.14711586]


Fitness score=
Total number of URLs
Number of correct predictions
​

