# Phishing Domain Detection
## Genetic Algorithm for 53 features

[Dataset Link](https://data.mendeley.com/datasets/72ptz43s9v/1)<br>
[Dataset Description](https://www.sciencedirect.com/science/article/pii/S2352340920313202)

In [1]:
# **Genetic Algorithms Exploration**

### The objective of this notebook is to learn more about genetic algorithms and to experiment with them by evolving the weights of a simple linear phishing-domain classifier.

In [1]:
import pandas as pd
import random
import numpy as np

In [2]:
# Read the dataset from its CSV file into a DataFrame and show it as the cell output.
# NOTE(review): the displayed frame contains an "Unnamed: 0" column, which looks like
# a row index that was written to the CSV — confirm, and keep it in mind wherever
# columns are selected by position further down.
DATA_PATH = "data/final_data.csv"
data = pd.read_csv(DATA_PATH)
data

Unnamed: 0,directory_length,time_domain_activation,qty_slash_directory,qty_at_file,qty_slash_file,qty_equal_file,qty_dot_file,ttl_hostname,qty_equal_directory,qty_plus_file,...,qty_and_directory,qty_questionmark_file,qty_hashtag_directory,params_length,qty_dot_params,qty_params,url_shortened,qty_equal_params,qty_space_params,phishing
0,8,-1,1,0,0,0,1,892,0,0,...,0,0,0,-1,-1,-1,0,-1,-1,1
1,42,579,3,0,0,0,1,9540,0,0,...,0,0,0,165,0,3,0,3,0,1
2,1,-1,1,0,0,0,0,589,0,0,...,0,0,0,-1,-1,-1,0,-1,-1,0
3,62,-1,5,0,0,0,1,292,0,0,...,0,0,0,-1,-1,-1,0,-1,-1,1
4,-1,6998,-1,-1,-1,-1,-1,3597,-1,-1,...,-1,-1,-1,-1,-1,-1,0,-1,-1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87198,-1,5509,-1,-1,-1,-1,-1,3597,-1,-1,...,-1,-1,-1,-1,-1,-1,0,-1,-1,0
87199,-1,5046,-1,-1,-1,-1,-1,591,-1,-1,...,-1,-1,-1,-1,-1,-1,0,-1,-1,0
87200,48,1844,5,0,0,0,1,14391,0,0,...,0,0,0,-1,-1,-1,0,-1,-1,1
87201,1,-1,1,0,0,0,0,52,0,0,...,0,0,0,-1,-1,-1,0,-1,-1,1


In [11]:
# Hyperparameters that control the genetic algorithm
num_generations = 2  # how many evolution rounds are run
population_size = 2  # how many candidate weight vectors are kept at once
mutation_rate = 0.1  # per-gene chance that a child's weight is perturbed

In [12]:
LABEL_COLUMN = "phishing"  # target column; it must never be used as a classifier input
FEATURE_START = 3  # position of the first column used as a feature (same offset as the original slice)


def _align_weights(individual, n_features):
    """Return the weights of `individual` that pair up with `n_features` feature values.

    Weight vectors elsewhere in this notebook are created with
    ``len(data.columns) - 3`` entries, i.e. one more than the number of
    feature columns once the label is excluded. That extra trailing entry is
    the slot that used to be multiplied by the label; it is ignored here so
    such vectors keep working. Any other length is returned untouched and a
    mismatch surfaces as a shape error in the dot product.
    """
    weights = np.asarray(individual, dtype=float)
    if weights.shape[0] == n_features + 1:
        weights = weights[:-1]
    return weights


def classify_domain(individual, domain_data):
    """Classify one domain with a linear decision rule.

    Parameters
    ----------
    individual : array-like of float
        Weight vector (one candidate solution of the genetic algorithm).
    domain_data : pandas.Series or array-like
        One row of the dataset. For a Series, the ``phishing`` entry (if
        present) is removed before scoring so the label cannot leak into the
        prediction; the first ``FEATURE_START`` entries are skipped as before.
        For any other array-like there are no column names to go by, so the
        original positional slice ``domain_data[3:]`` is used unchanged.

    Returns
    -------
    int
        1 if the weighted sum of the features is strictly positive, else 0.
    """
    if isinstance(domain_data, pd.Series):
        feature_row = domain_data.drop(labels=LABEL_COLUMN, errors="ignore")
        features = feature_row.to_numpy(dtype=float)[FEATURE_START:]
        weights = _align_weights(individual, features.shape[0])
    else:
        features = np.asarray(domain_data)[FEATURE_START:]
        weights = individual
    prediction_score = np.dot(weights, features)
    return 1 if prediction_score > 0 else 0


def calculate_fitness(individual, data):
    """Return the accuracy of `individual` on `data` as a float in [0, 1].

    Applies the same rule as `classify_domain` to every row, but with a
    single matrix product instead of a Python loop with two ``iloc`` lookups
    per row. The label column is excluded from the feature matrix.

    Parameters
    ----------
    individual : array-like of float
        Weight vector to evaluate.
    data : pandas.DataFrame
        Dataset containing the feature columns and a ``phishing`` column.

    Returns
    -------
    float
        Fraction of rows whose prediction equals the ``phishing`` value;
        0.0 for an empty frame (previously a ZeroDivisionError).
    """
    n_rows = len(data)
    if n_rows == 0:
        return 0.0
    features = (
        data.drop(columns=LABEL_COLUMN)
        .iloc[:, FEATURE_START:]
        .to_numpy(dtype=float)
    )
    weights = _align_weights(individual, features.shape[1])
    predictions = (features @ weights > 0).astype(int)
    labels = data[LABEL_COLUMN].to_numpy()
    correct_predictions = int(np.count_nonzero(predictions == labels))
    return correct_predictions / n_rows

In [13]:
# Build the starting population: one weight vector per individual, each weight
# drawn uniformly from [-1, 1). The vector length is the column count minus 3.
n_weights = len(data.columns) - 3
population = np.array(
    [np.random.uniform(-1, 1, n_weights) for _ in range(population_size)]
)
print("Population: ", population)

Population:  [[-0.01411975 -0.65738758 -0.176372    0.54168177  0.1310757  -0.62976523
   0.3328868  -0.35056602  0.65445039  0.67965691  0.66407654 -0.22879775
   0.53098342  0.42203059  0.48449749 -0.8265045  -0.4754313   0.20907673
  -0.59955661  0.83898026 -0.26365702  0.09068103 -0.94654361 -0.1585287
  -0.04774452  0.37612185 -0.70105853  0.88737336  0.60339146 -0.45175657
  -0.65522313  0.62490395  0.00465915 -0.2679552  -0.10264045  0.70492411
   0.0342192  -0.08687828 -0.62432399  0.40942183 -0.0333303  -0.98871539
   0.94580682  0.04132617  0.28101019 -0.81726286  0.37037486 -0.4799266
   0.64876859  0.73160418 -0.81080038]
 [ 0.83304907  0.54025493  0.73869738 -0.61488191 -0.15457973 -0.1487985
  -0.04751261 -0.42925333 -0.4753048  -0.55173922  0.75868001 -0.78303049
  -0.21233694  0.92574186 -0.30481672 -0.40304303 -0.46915029  0.66697318
  -0.40002797 -0.02776688 -0.80032836  0.27395587 -0.36069562 -0.49475736
  -0.68817619  0.27638768  0.61168006  0.92659994 -0.37275513  

In [14]:
# Evolve the population for `num_generations` rounds.
# Each round: score every individual, pick two parents with probability
# proportional to fitness, build one child by uniform crossover plus mutation,
# and overwrite a randomly chosen individual with that child.
genome_length = population.shape[1]  # number of weights per individual
best_individual = None               # best evaluated individual seen so far (a copy)
best_overall_fitness = -np.inf       # raw fitness of `best_individual`

for generation in range(num_generations):
    # Raw fitness (accuracy) of each individual; kept un-normalised for reporting
    fitness_scores = np.array(
        [calculate_fitness(individual, data) for individual in population]
    )

    # Record the generation's best BEFORE the population is modified, so the
    # reported individual and its score always belong together.
    best_index = int(np.argmax(fitness_scores))
    best_fitness = fitness_scores[best_index]
    if best_fitness > best_overall_fitness:
        best_overall_fitness = best_fitness
        best_individual = population[best_index].copy()

    # Selection probabilities; fall back to uniform if every score is zero,
    # since dividing by a zero sum would yield NaNs that np.random.choice rejects.
    total_fitness = fitness_scores.sum()
    if total_fitness > 0:
        selection_probabilities = fitness_scores / total_fitness
    else:
        selection_probabilities = np.full(len(population), 1.0 / len(population))

    # Select parents based on fitness (sampling with replacement, so both
    # parents may be the same individual)
    parents_indices = np.random.choice(
        len(population), size=2, p=selection_probabilities
    )
    parents = population[parents_indices]

    # Crossover (uniform): each gene comes from either parent with probability 0.5
    take_from_first = np.random.rand(genome_length) < 0.5
    child = np.where(take_from_first, parents[0], parents[1])

    # Mutation: each gene is nudged by a value in [-0.1, 0.1) with probability mutation_rate
    mutation_mask = np.random.rand(genome_length) < mutation_rate
    mutation_noise = np.random.uniform(-0.1, 0.1, genome_length)
    child = np.where(mutation_mask, child + mutation_noise, child)

    # Replace a random individual in the population with the child
    random_index = random.randint(0, len(population) - 1)
    population[random_index] = child

    # Print the best (raw, un-normalised) fitness in this generation
    print(f"Generation {generation + 1}: Best Fitness = {best_fitness:.2f}")

# Best evaluated individual across all generations. The population itself may
# now hold children that were never scored, so it is not indexed here.
print("\nBest individual:", best_individual)

Generation 1: Best Fitness = 0.51
Generation 2: Best Fitness = 0.50

Best individual: [-0.01411975 -0.65738758 -0.176372    0.54168177  0.1310757  -0.62976523
  0.3328868  -0.35056602  0.65445039  0.67965691  0.66407654 -0.22879775
  0.53098342  0.42203059  0.48449749 -0.8265045  -0.4754313   0.20907673
 -0.59955661  0.83898026 -0.26365702  0.09068103 -0.94654361 -0.1585287
 -0.04774452  0.37612185 -0.70105853  0.88737336  0.60339146 -0.45175657
 -0.65522313  0.62490395  0.00465915 -0.2679552  -0.10264045  0.70492411
  0.0342192  -0.08687828 -0.62432399  0.40942183 -0.0333303  -0.98871539
  0.94580682  0.04132617  0.28101019 -0.81726286  0.37037486 -0.4799266
  0.64876859  0.73160418 -0.81080038]
