# **Import Required Libraries**

In [None]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Every seed passed to hybrid_ga_bfo is appended here so that runs can be reproduced
random_seeds = []

# **Load the Adult Dataset**

In [None]:
# Load the Adult dataset from 'adult.csv' (path is relative to the working directory)
df = pd.read_csv('adult.csv')

In [None]:
# Preview the first 10 rows to inspect column names and typical values
df.head(10)

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K
5,34,Private,198693,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K
6,29,?,227026,HS-grad,9,Never-married,?,Unmarried,Black,Male,0,0,40,United-States,<=50K
7,63,Self-emp-not-inc,104626,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,White,Male,3103,0,32,United-States,>50K
8,24,Private,369667,Some-college,10,Never-married,Other-service,Unmarried,White,Female,0,0,40,United-States,<=50K
9,55,Private,104996,7th-8th,4,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,10,United-States,<=50K


In [None]:
# Print the column dtypes, non-null counts and memory usage of the frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              48842 non-null  int64 
 1   workclass        48842 non-null  object
 2   fnlwgt           48842 non-null  int64 
 3   education        48842 non-null  object
 4   educational-num  48842 non-null  int64 
 5   marital-status   48842 non-null  object
 6   occupation       48842 non-null  object
 7   relationship     48842 non-null  object
 8   race             48842 non-null  object
 9   gender           48842 non-null  object
 10  capital-gain     48842 non-null  int64 
 11  capital-loss     48842 non-null  int64 
 12  hours-per-week   48842 non-null  int64 
 13  native-country   48842 non-null  object
 14  income           48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


In [None]:
# Count NaN values per column. The '?' placeholders seen in the preview are still
# plain strings at this point, so they are not counted as missing here.
df.isna().sum()

Unnamed: 0,0
age,0
workclass,0
fnlwgt,0
education,0
educational-num,0
marital-status,0
occupation,0
relationship,0
race,0
gender,0


# **Preprocessing**

In [None]:
# Treat the dataset's '?' placeholders as missing values and drop those rows
df.replace('?', np.nan, inplace=True)
df.dropna(inplace=True)

# Label encoding for categorical variables (this includes the 'income' target).
# The fitted encoders are kept so the integer codes can be mapped back to labels.
label_encoders = {}
for col in df.select_dtypes(include='object'):
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Feature/target split
X = df.drop("income", axis=1)
y = df["income"]

# Split BEFORE scaling: fitting the scaler on the full dataset would leak the
# test rows' mean/variance into the training features.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then apply the same transform to
# the held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Kept for any later cell that still expects the fully scaled feature matrix;
# it is scaled with the training-set statistics.
X_scaled = scaler.transform(X)


# **PROBLEM FORMALISATION (Requirement b)**
This is a constrained optimisation problem.
We want to find the hyperparameters (lr, alpha, hidden) of an MLP classifier
that maximise classification accuracy while remaining within feasible bounds:
lr ∈ [0.0001, 0.1], alpha ∈ [0.0001, 1], hidden ∈ [10, 200] (integer).

# **DEFINE FITNESS FUNCTION (Part of e)**

In [None]:
def evaluate(individual):
    """Fitness of one candidate: accuracy of an MLP trained with its genes.

    `individual` is indexed as [learning rate, L2 penalty (alpha), hidden units].
    Constraint handling (Requirement c) is done by repair: each gene is clipped
    into its feasible range before the classifier is built, so out-of-range
    candidates are evaluated at the nearest bound.

    Returns the accuracy of the fitted classifier on (X_test, y_test).

    NOTE(review): the fitness is measured on X_test, so the search tunes the
    hyperparameters against the same split that is reported as the result.
    """
    # Repair step: project every gene onto its allowed interval
    mlp_params = {
        "learning_rate_init": np.clip(individual[0], 0.0001, 0.1),
        "alpha": np.clip(individual[1], 0.0001, 1),
        "hidden_layer_sizes": (int(np.clip(individual[2], 10, 200)),),
        "max_iter": 300,
        # Fixed seed: the same genes always yield the same fitness
        "random_state": 42,
    }

    model = MLPClassifier(**mlp_params)
    model.fit(X_train, y_train)
    return accuracy_score(y_test, model.predict(X_test))

# **DEFINE GENETIC OPERATORS (Part of e)**

In [None]:
def crossover_average(p1, p2):
    """Arithmetic crossover: the child is the gene-wise mean of both parents.

    Only the first three genes (lr, alpha, hidden) are combined; a new list is
    returned and neither parent is modified.
    """
    child = []
    for gene in range(3):
        child.append((p1[gene] + p2[gene]) / 2)
    return child

def crossover_uniform(p1, p2):
    """Uniform crossover: each of the three genes is copied from a random parent.

    One `random.choice` draw is made per gene, in gene order. A new list is
    returned and neither parent is modified.
    """
    child = []
    for gene in range(3):
        candidates = [p1[gene], p2[gene]]
        child.append(random.choice(candidates))
    return child

def mutate_scalar(individual, rate):
    """With probability `rate`, perturb one randomly chosen gene in place.

    The perturbation is uniform and scaled per gene: +/-0.01 for gene 0 (lr),
    +/-0.05 for gene 1 (alpha) and an integer in [-10, 10] for gene 2 (hidden).
    No bounds are enforced here. The same list object is returned.
    """
    if not random.random() < rate:
        return individual

    gene = random.randint(0, 2)
    if gene == 2:
        # Hidden-unit count moves in whole steps
        delta = random.randint(-10, 10)
    else:
        half_width = 0.01 if gene == 0 else 0.05
        delta = np.random.uniform(-half_width, half_width)

    individual[gene] += delta
    return individual

def mutate_gaussian(individual, rate):
    """With probability `rate`, add zero-mean Gaussian noise to one gene in place.

    The standard deviation is 0.01 for genes 0 and 1 (lr, alpha) and 5 for
    gene 2 (hidden). No bounds are enforced here. The same list object is
    returned.
    """
    if not random.random() < rate:
        return individual

    gene = random.randint(0, 2)
    sigma = 5 if gene == 2 else 0.01
    individual[gene] += np.random.normal(0, sigma)
    return individual

# **BFO-LIKE LOCAL SEARCH (Chemotaxis)**

In [None]:
def chemotaxis(individual):
    """BFO-style tumble: move each of the three genes one random step.

    A direction in [-1, 1) is drawn per gene and scaled by that gene's step
    size (0.01 for lr, 0.05 for alpha, 10 for hidden). A new list is returned
    and the input is left untouched. No bounds are enforced here.
    """
    step_sizes = [0.01, 0.05, 10]
    direction = np.random.uniform(-1, 1, size=3)

    moved = []
    for gene, (heading, scale) in enumerate(zip(direction, step_sizes)):
        moved.append(individual[gene] + heading * scale)
    return moved

# **DIVERSITY PRESERVATION (Requirement h)**

In [None]:
def calculate_diversity(population):
    """Return the population spread: per-gene standard deviation, averaged over genes.

    The genes are not normalised, so the gene with the largest numeric range
    contributes most to the value.
    """
    genomes = np.asarray(population)
    per_gene_std = np.std(genomes, axis=0)
    return np.mean(per_gene_std)

# **HYBRID GA+BFO FUNCTION (Satisfying e, f, g, k)**

In [None]:
def hybrid_ga_bfo(mutation_rate=0.3, crossover_type='average', mutation_type='scalar', seed=None):
    """Run one hybrid GA + BFO-style search over MLP hyperparameters.

    Each generation evaluates the whole population, keeps the better half as
    parents (elitism), and refills the population with children produced by
    crossover, mutation and one chemotaxis step.

    Parameters:
        mutation_rate: probability that a child is mutated.
        crossover_type: 'uniform' selects crossover_uniform; any other value
            selects crossover_average.
        mutation_type: 'gaussian' selects mutate_gaussian; any other value
            selects mutate_scalar.
        seed: seeds both `random` and `np.random`; it is also appended to the
            module-level `random_seeds` list.

    Returns:
        (best_individual, best_score, history) where `best_individual` is the
        [lr, alpha, hidden_units] genome that achieved `best_score`, and
        `history` is a list of (generation index, best score so far,
        diversity of the next population) tuples.

    Notes:
        - Genomes are stored unrepaired; evaluate() clips them into the
          feasible bounds, so the returned genome may lie outside those bounds.
        - Fitness is computed at the start of each generation, so children
          created in the final generation are never evaluated.
    """
    random.seed(seed)
    np.random.seed(seed)
    random_seeds.append(seed)

    pop_size = 10
    generations = 10

    # Representation: list of [lr, alpha, hidden_units]
    population = [
        [np.random.uniform(0.0001, 0.1), np.random.uniform(0.0001, 1), np.random.randint(10, 200)]
        for _ in range(pop_size)
    ]

    best_score = -np.inf
    best_individual = None
    history = []

    for gen in range(generations):
        scores = [evaluate(ind) for ind in population]

        # Record the best individual while `scores` still lines up with
        # `population`. Once the population is rebuilt below, the indices in
        # `scores` no longer refer to the same individuals.
        gen_best_idx = int(np.argmax(scores))
        gen_best = scores[gen_best_idx]
        if gen_best > best_score:
            best_score = gen_best
            # Copy so the stored best is independent of the population lists
            best_individual = list(population[gen_best_idx])

        # Survivor selection: elitism (keep the better-scoring half)
        sorted_indices = np.argsort(scores)[-pop_size//2:]
        parents = [population[i] for i in sorted_indices]

        children = []
        for _ in range(pop_size - len(parents)):
            p1, p2 = random.sample(parents, 2)
            if crossover_type == 'uniform':
                child = crossover_uniform(p1, p2)
            else:
                child = crossover_average(p1, p2)

            if mutation_type == 'gaussian':
                child = mutate_gaussian(child, mutation_rate)
            else:
                child = mutate_scalar(child, mutation_rate)

            child = chemotaxis(child)  # BFO-inspired local refinement
            children.append(child)

        population = parents + children

        diversity = calculate_diversity(population)  # Used for analysis
        history.append((gen, best_score, diversity))
        print(f"Gen {gen+1} | Best Acc: {best_score:.4f} | Diversity: {diversity:.4f}")

    return best_individual, best_score, history

# **RUN REPEATED RUNS PER SETTING (Requirement k: 30 runs; the loop below currently performs 5)**

In [1]:
# Operator configurations to compare; the keys match hybrid_ga_bfo's keyword arguments
settings = [
    dict(mutation_rate=0.2, crossover_type='average', mutation_type='scalar'),
    dict(mutation_rate=0.4, crossover_type='uniform', mutation_type='scalar'),
    dict(mutation_rate=0.4, crossover_type='average', mutation_type='gaussian'),
]

# One record per run: setting, 1-based run number, best accuracy, seed used
results = []

for setting in settings:
    print("\n=== Running setting ===", setting)
    for run in range(5):
        # The seed is drawn from the global `random` state, which
        # hybrid_ga_bfo re-seeds on every call; it is stored with the result
        # so the run can be repeated.
        seed = random.randint(1, 1000)
        best_ind, best_acc, history = hybrid_ga_bfo(seed=seed, **setting)
        results.append(
            dict(setting=str(setting), run=run + 1, accuracy=best_acc, seed=seed)
        )


=== Running setting === {'mutation_rate': 0.2, 'crossover_type': 'average', 'mutation_type': 'scalar'}


NameError: name 'random' is not defined

# **Save results and seeds (Requirement k)**

In [None]:
# Tabulate the per-run records and persist them (no index column) for later analysis
df_results = pd.DataFrame(data=results)
df_results.to_csv(path_or_buf="ga_bfo_experiment_results.csv", index=False)

print("\nFinal results saved to ga_bfo_experiment_results.csv")

NameError: name 'pd' is not defined

# **Visualise Accuracy Distribution**

In [None]:
# Best accuracy of every run, taken from the records collected in `results`
all_scores = [record['accuracy'] for record in results]

plt.figure(figsize=(10, 6))
# Pass the scores as `y` so the box is vertical and matches the y-axis label
sns.boxplot(y=all_scores)
# The run count comes from the data so the title always matches what was executed
plt.title(f"Final Accuracy Distribution Over {len(all_scores)} Runs (GA+BFO Hybrid)")
plt.ylabel("Accuracy")
plt.grid(True)
plt.show()

print("Average Accuracy:", np.mean(all_scores))
print("Max Accuracy:", np.max(all_scores))