In [81]:
import kagglehub
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, make_scorer
import time
from sklearn.model_selection import cross_val_score
import random
import math

In [82]:
import os

# Fetch the dataset through kagglehub; the returned `path` is the local
# directory that is searched below for the CSV file.
path = kagglehub.dataset_download("wenruliu/adult-income-dataset")

expected_file = 'adult.csv'
csv_file_path = None
for root, _, files in os.walk(path):
    if expected_file in files:
        csv_file_path = os.path.join(root, expected_file)
        break

# Fail here with a clear message instead of letting pd.read_csv(None) fail
# later with an unrelated-looking error.
if csv_file_path is None:
    raise FileNotFoundError(
        f"'{expected_file}' was not found anywhere under the downloaded dataset directory: {path}"
    )

In [83]:
# Load the raw CSV; cells containing '?' are parsed as missing values (NaN).
df = pd.read_csv(csv_file_path, na_values=['?'])
# Display (rows, columns) of the raw frame.
df.shape

(48842, 15)

In [84]:
# Preview the first five rows of the raw frame.
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,,103497,Some-college,10,Never-married,,Own-child,White,Female,0,0,30,United-States,<=50K


In [85]:
# Column dtypes and non-null counts of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              48842 non-null  int64 
 1   workclass        46043 non-null  object
 2   fnlwgt           48842 non-null  int64 
 3   education        48842 non-null  object
 4   educational-num  48842 non-null  int64 
 5   marital-status   48842 non-null  object
 6   occupation       46033 non-null  object
 7   relationship     48842 non-null  object
 8   race             48842 non-null  object
 9   gender           48842 non-null  object
 10  capital-gain     48842 non-null  int64 
 11  capital-loss     48842 non-null  int64 
 12  hours-per-week   48842 non-null  int64 
 13  native-country   47985 non-null  object
 14  income           48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


In [86]:
# Number of missing values per column (the '?' entries were read as NaN).
df.isnull().sum()

age                   0
workclass          2799
fnlwgt                0
education             0
educational-num       0
marital-status        0
occupation         2809
relationship          0
race                  0
gender                0
capital-gain          0
capital-loss          0
hours-per-week        0
native-country      857
income                0
dtype: int64

In [87]:
# --- Configuration for this stage -------------------------------------------
SUBSET_SIZE = 5000
TARGET_COLUMN = 'income'
NEW_TARGET_NAME = 'Outcome'

# Work on a fixed-seed random subset of a copy so the raw frame stays intact.
df_processed = (
    df.copy()
    .sample(n=SUBSET_SIZE, random_state=42)
    .reset_index(drop=True)
)

# --- Split features / target -------------------------------------------------
X = df_processed.drop(TARGET_COLUMN, axis=1)
y = df_processed[TARGET_COLUMN]

# 'fnlwgt' is always dropped; 'ID' / 'policy_id' are dropped only when present.
COLUMNS_TO_DROP = ['fnlwgt'] + [col for col in ('ID', 'policy_id') if col in X.columns]
X = X.drop(columns=COLUMNS_TO_DROP, errors='ignore')

y = y.rename(NEW_TARGET_NAME)

# String class labels are mapped to integer codes for the metrics used later.
if y.dtype == 'object':
    le = LabelEncoder()
    y = pd.Series(le.fit_transform(y), name=NEW_TARGET_NAME, index=y.index)

# --- Identify feature types --------------------------------------------------
numerical_features = list(X.select_dtypes(include=np.number).columns)
categorical_features = list(X.select_dtypes(include=['object', 'category']).columns)

all_identified_features = numerical_features + categorical_features
if len(all_identified_features) != X.shape[1]:
    print("Warning: Not all columns were classified as numerical or categorical!")
    print("Unclassified columns:", [col for col in X.columns if col not in all_identified_features])

# --- Train / test split (stratified on the target) ---------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

for split_name, split_data in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
):
    print(f"{split_name} shape:", split_data.shape)

# --- Preprocessing pipelines -------------------------------------------------
# Numeric columns: median imputation, then standardisation.
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: most-frequent imputation, then one-hot encoding that
# ignores categories unseen during fit.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Register a transformer only for feature groups that actually have columns.
transformers = [
    (step_name, step_pipeline, step_columns)
    for step_name, step_pipeline, step_columns in (
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features),
    )
    if step_columns
]

if not transformers:
    raise ValueError("No numerical or categorical features identified for preprocessing.")

# Columns not covered by either group are passed through unchanged.
preprocessor = ColumnTransformer(transformers=transformers, remainder='passthrough')

X_train shape: (4000, 13)
y_train shape: (4000,)
X_test shape: (1000, 13)
y_test shape: (1000,)


In [88]:
def evaluate_classifier(model, X, y):
    """Compute standard binary-classification metrics for a fitted model.

    Parameters
    ----------
    model : fitted estimator
        Must implement ``predict``. ROC AUC additionally needs either
        ``predict_proba`` or ``decision_function``.
    X : array-like
        Features to predict on.
    y : pandas.Series or numpy.ndarray
        True labels; must expose a ``dtype`` attribute.

    Returns
    -------
    dict
        Keys ``'accuracy'``, ``'precision'``, ``'recall'``, ``'f1_score'`` and
        ``'roc_auc'``. ``'roc_auc'`` is ``np.nan`` when the labels are not
        numeric, the problem is not binary, the model exposes no score output,
        or the AUC computation raises.
    """
    y_pred = model.predict(X)
    auc = np.nan

    # `y.dtype not in [np.number, np.int64, np.float64]` emitted a NumPy
    # deprecation warning and rejected numeric dtypes such as int32; use the
    # pandas dtype predicate instead.
    if not pd.api.types.is_numeric_dtype(y.dtype):
        print(f"Warning: Target labels are not numerical (dtype is {y.dtype}). Cannot calculate ROC AUC.")
    elif len(np.unique(y)) == 2:
        try:
            if hasattr(model, 'predict_proba'):
                # Probability of the positive (second) class.
                y_score = model.predict_proba(X)[:, 1]
            elif hasattr(model, 'decision_function'):
                # roc_auc_score also accepts non-thresholded decision values,
                # so models without probability estimates still get an AUC.
                y_score = model.decision_function(X)
            else:
                y_score = None

            if y_score is not None:
                auc = roc_auc_score(y, y_score)
        except Exception as e:
            # Best effort: AUC is optional, the other metrics are still reported.
            print(f"Warning: Could not calculate ROC AUC. Error: {e}")
            auc = np.nan

    metrics = {
        'accuracy': accuracy_score(y, y_pred),
        'precision': precision_score(y, y_pred),
        'recall': recall_score(y, y_pred),
        'f1_score': f1_score(y, y_pred),
        'roc_auc': auc
    }
    return metrics

In [89]:
# Baseline: an SVC with default hyper-parameters (probability estimates
# enabled) behind the shared preprocessor.
svm_baseline_estimator = SVC(random_state=42, probability=True)
baseline_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('svc', svm_baseline_estimator),
])

# Show the hyper-parameters the baseline runs with.
print(baseline_pipeline.named_steps['svc'].get_params())

# Time the fit on the training split.
start_time = time.time()
baseline_pipeline.fit(X_train, y_train)
train_time_baseline = time.time() - start_time

print(f"\nBaseline model training time: {train_time_baseline:.4f} seconds")

# Score on the held-out test split and print one metric per line.
baseline_metrics = evaluate_classifier(baseline_pipeline, X_test, y_test)
for metric_name in baseline_metrics:
    print(f"{metric_name}: {baseline_metrics[metric_name]:.4f}")

{'C': 1.0, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': -1, 'probability': True, 'random_state': 42, 'shrinking': True, 'tol': 0.001, 'verbose': False}

Baseline model training time: 3.2889 seconds
accuracy: 0.8700
precision: 0.7865
recall: 0.6292
f1_score: 0.6991
roc_auc: 0.9041


  if y.dtype not in [np.number, np.int64, np.float64]:


In [90]:
print("\n--- Traditional Tuning: Grid Search for SVM Parameters ---")

# Same pipeline layout as the baseline, with a fresh SVC to be tuned.
svm_gs_estimator = SVC(random_state=42, probability=True)
gs_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('svc', svm_gs_estimator),
])

# Candidate values; the `svc__` prefix routes each one to the pipeline's SVC step.
param_grid = dict(
    svc__C=[0.1, 1, 10],
    svc__gamma=[0.01, 0.1, 'scale'],
    svc__kernel=['rbf'],
)
print(param_grid)

# 3-fold stratified CV with a fixed shuffle seed; candidates are ranked by F1.
cv_strategy = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    estimator=gs_pipeline,
    param_grid=param_grid,
    scoring='f1',
    cv=cv_strategy,
    n_jobs=-1,
    verbose=2,
)

# Time the complete search.
start_time = time.time()
grid_search.fit(X_train, y_train)
tuning_time_gs = time.time() - start_time

best_params_gs = grid_search.best_params_
best_svm_model_gs = grid_search.best_estimator_

print(best_params_gs)
print(f"Best cross-validation score ({grid_search.scorer_}): {grid_search.best_score_:.4f}")

# Score the best estimator on the held-out test split.
tuned_metrics_gs = evaluate_classifier(best_svm_model_gs, X_test, y_test)
for metric_name in tuned_metrics_gs:
    print(f"{metric_name}: {tuned_metrics_gs[metric_name]:.4f}")


--- Traditional Tuning: Grid Search for SVM Parameters ---
{'svc__C': [0.1, 1, 10], 'svc__gamma': [0.01, 0.1, 'scale'], 'svc__kernel': ['rbf']}
Fitting 3 folds for each of 9 candidates, totalling 27 fits
{'svc__C': 10, 'svc__gamma': 0.01, 'svc__kernel': 'rbf'}
Best cross-validation score (make_scorer(f1_score, response_method='predict', average=binary)): 0.6572
accuracy: 0.8660
precision: 0.7650
recall: 0.6375
f1_score: 0.6955
roc_auc: 0.9080


  if y.dtype not in [np.number, np.int64, np.float64]:


In [91]:
# Side-by-side table of the baseline and grid-search metrics.
print("Metric         | Baseline SVM | Grid Search Tuned SVM")
print("---------------------------------------------------")

# Union of the metric names from both result dicts, in alphabetical order.
all_metrics = sorted(set(baseline_metrics) | set(tuned_metrics_gs))

for metric in all_metrics:
    # A metric that is missing or NaN on either side is shown as "N/A".
    baseline_str, tuned_gs_str = (
        f"{metric_value:.4f}" if pd.notna(metric_value) else "N/A     "
        for metric_value in (
            baseline_metrics.get(metric, np.nan),
            tuned_metrics_gs.get(metric, np.nan),
        )
    )
    print(f"{metric:<14} | {baseline_str}     | {tuned_gs_str}")

print(f"\nTuning Time (seconds): Baseline: {train_time_baseline:.4f}, Grid Search: {tuning_time_gs:.4f}")

Metric         | Baseline SVM | Grid Search Tuned SVM
---------------------------------------------------
accuracy       | 0.8700     | 0.8660
f1_score       | 0.6991     | 0.6955
precision      | 0.7865     | 0.7650
recall         | 0.6292     | 0.6375
roc_auc        | 0.9041     | 0.9080

Tuning Time (seconds): Baseline: 3.2889, Grid Search: 31.3232


In [92]:
# Firefly-algorithm settings. The functions defined below capture these values
# as default arguments, so this cell has to run before those definitions.
num_fireflies = 20  # Number of candidate (C, gamma) points in the population
max_generations = 10  # Evaluate-then-move iterations per run
alpha = 0.2        # Randomness 0--1; multiplies the Levy-flight step in the position update
beta0 = 1          # Attractiveness at r=0 (distance zero between two fireflies)
gamma = 1          # Absorption coefficient of the attractiveness term -- not the SVM gamma
lower_bounds = [0.01, 0.0001]  # For C and gamma: [C_min, gamma_min] of the search box
upper_bounds = [100, 10]  # [C_max, gamma_max] of the search box


In [93]:
def levy_flight(beta=1.5, size=2):
    """Draw a heavy-tailed random step as ``u / |v| ** (1 / beta)``.

    ``u`` is sampled from ``N(0, sigma)`` and ``v`` from ``N(0, 1)``, with
    ``sigma`` derived from ``beta`` by the closed-form expression below.
    Samples come from NumPy's global random state, so ``np.random.seed``
    controls reproducibility.

    Parameters
    ----------
    beta : float, default 1.5
        Exponent of the step distribution.
    size : int or tuple of int, default 2
        Shape of the returned step. The default matches the two-dimensional
        (C, gamma) search space, so existing callers are unaffected.

    Returns
    -------
    numpy.ndarray
        Array of shape ``size`` holding the step.
    """
    numerator = math.gamma(1 + beta) * math.sin(math.pi * beta / 2)
    denominator = math.gamma((1 + beta) / 2) * beta * 2 ** ((beta - 1) / 2)
    sigma = (numerator / denominator) ** (1 / beta)

    # Draw u before v: keeps the random stream identical to the original for a given seed.
    u = np.random.normal(0, sigma, size=size)
    v = np.random.normal(0, 1, size=size)
    return u / (np.abs(v) ** (1 / beta))

In [94]:
def fitness(firefly, X, y, cv=5, scoring=None):
    """Cross-validated score of an RBF SVC for one candidate ``(C, gamma)``.

    Parameters
    ----------
    firefly : sequence of two floats
        ``firefly[0]`` is the SVC ``C``, ``firefly[1]`` the SVC ``gamma``.
    X, y : array-like
        Data handed to ``cross_val_score``.
    cv : int or CV splitter, default 5
        Cross-validation strategy, forwarded to ``cross_val_score``.
    scoring : str, callable or None, default None
        Scoring forwarded to ``cross_val_score``. ``None`` keeps the
        estimator's default score, as before. Note that the grid-search cell
        in this notebook ranks candidates by ``'f1'``; pass ``scoring='f1'``
        (and a matching ``cv``) when the two tuners should optimise the same
        objective.

    Returns
    -------
    float
        Mean score over the folds, or ``-9999`` when the candidate lies
        outside the module-level ``lower_bounds`` / ``upper_bounds`` box.
    """
    C, gamma = firefly[0], firefly[1]

    # Penalise out-of-range candidates without fitting anything.
    c_in_range = lower_bounds[0] <= C <= upper_bounds[0]
    gamma_in_range = lower_bounds[1] <= gamma <= upper_bounds[1]
    if not (c_in_range and gamma_in_range):
        return -9999

    model = Pipeline([
        ('preprocessor', preprocessor),  # module-level ColumnTransformer
        ('svc', SVC(C=C, gamma=gamma, kernel='rbf'))
    ])

    scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)
    return scores.mean()


In [95]:
def initialize_fireflies(num_fireflies=num_fireflies):
    """Sample the starting population uniformly inside the search box.

    Column 0 (C) and column 1 (gamma) are each drawn from the range given by
    the module-level ``lower_bounds`` / ``upper_bounds`` entry of the same
    index, using NumPy's global random state.

    Returns an array of shape ``(num_fireflies, 2)``.
    """
    # All C values are drawn before any gamma value.
    c_column = np.random.uniform(lower_bounds[0], upper_bounds[0], size=num_fireflies)
    gamma_column = np.random.uniform(lower_bounds[1], upper_bounds[1], size=num_fireflies)
    return np.column_stack((c_column, gamma_column))

In [96]:
def update_fireflies_with_levy(fireflies, fitness_values, alpha, beta_0, gamma_param, lower_bounds, upper_bounds, levy_beta=1.5):
    """Move every firefly towards each strictly brighter one, plus a Levy step.

    For firefly ``i`` and every ``j`` with ``fitness_values[j] > fitness_values[i]``
    (visited in index order), the running position is shifted by
    ``beta_0 * exp(-gamma_param * r**2) * (x_j - x_i)`` plus
    ``alpha * levy_flight(levy_beta)`` and then clipped to the bounds. ``r`` is
    the Euclidean distance between the running position of ``i`` and the
    *original* position of ``j``. A firefly with no brighter peer is returned
    unchanged.

    Returns a new array; the input ``fireflies`` array is not modified.
    """
    moved = []
    for i, start in enumerate(fireflies):
        position = start.copy()
        for j, target in enumerate(fireflies):
            if not fitness_values[j] > fitness_values[i]:
                continue  # only strictly brighter fireflies attract
            distance = np.linalg.norm(position - target)
            attractiveness = beta_0 * math.exp(-gamma_param * distance ** 2)
            position = np.clip(
                position + attractiveness * (target - position) + alpha * levy_flight(levy_beta),
                lower_bounds,
                upper_bounds,
            )
        moved.append(position)
    return np.array(moved)

In [97]:
def record_best_individual(fireflies, fitness_values, generation):
    """Summarise the highest-fitness firefly of one generation.

    Returns a dict with the keys ``'Generation'``, ``'C'``, ``'Gamma'`` and
    ``'Fitness'`` describing the firefly whose index comes last in the
    ascending argsort of ``fitness_values``.
    """
    top = np.argsort(fitness_values)[-1]  # last index of the ascending order
    winner = fireflies[top]
    return dict(
        Generation=generation,
        C=winner[0],
        Gamma=winner[1],
        Fitness=fitness_values[top],
    )

In [99]:
def firefly_algorithm(X, y, num_fireflies=num_fireflies, generations=max_generations,
                      alpha=alpha, beta_0=beta0, gamma_param=gamma,
                      lower_bounds=lower_bounds, upper_bounds=upper_bounds):
    """Run one firefly search over the SVM ``(C, gamma)`` space.

    Each generation evaluates every firefly with ``fitness``, records the
    population and its best member, prints a progress line, and moves the
    population with ``update_fireflies_with_levy``.

    Parameters
    ----------
    X, y : array-like
        Data forwarded to ``fitness``.
    num_fireflies : int
        Population size.
    generations : int
        Number of evaluate/move iterations.
    alpha, beta_0, gamma_param : float
        Randomness scale, base attractiveness and absorption coefficient,
        forwarded to the update step.
    lower_bounds, upper_bounds : sequence of float
        Clipping box used by the update step. ``initialize_fireflies`` and
        ``fitness`` read the module-level bounds instead of these arguments.

    Returns
    -------
    tuple
        ``(fireflies, fitness_values, history, best_individuals)``:
        the last evaluated population, its fitness values (row ``i`` of
        ``fireflies`` corresponds to ``fitness_values[i]``), the
        per-generation list of ``(C, gamma, fitness)`` tuples, and the
        per-generation best-individual dicts.
    """
    fireflies = initialize_fireflies(num_fireflies)
    fitness_values = np.zeros(num_fireflies)
    history = []
    best_individuals = []

    for generation in range(generations):
        for i in range(num_fireflies):
            fitness_values[i] = fitness(fireflies[i], X, y)

        history.append([(position[0], position[1], score)
                        for position, score in zip(fireflies, fitness_values)])
        best_individual = record_best_individual(fireflies, fitness_values, generation)
        best_individuals.append(best_individual)

        print(f"Generation {generation}: Best => C: {best_individual['C']:.4f}, Gamma: {best_individual['Gamma']:.6f}, Fitness: {best_individual['Fitness']:.4f}")
        # "Diversity" is the spread (standard deviation) of the fitness values.
        diversity = np.std(fitness_values)
        print(f"Generation {generation} Diversity: {diversity:.6f}")

        # Do not move the swarm after the final evaluation: the moved
        # positions would never be scored, so the returned `fireflies` would
        # no longer correspond to the returned `fitness_values`.
        if generation < generations - 1:
            fireflies = update_fireflies_with_levy(fireflies, fitness_values, alpha, beta_0, gamma_param, lower_bounds, upper_bounds)

    return fireflies, fitness_values, history, best_individuals

In [100]:
def run_optimization(X, y, num_runs=30, max_generations=max_generations):
    """Repeat the firefly search with a different seed per run and save the results.

    Run ``k`` (0-based) seeds both NumPy's and Python's global random
    generators with ``k + 1234``. Every evaluated firefly of every generation
    is written to ``firefly_algorithm_history.csv``, and each generation's
    best individual to ``best_individuals_per_generation.csv``. Both files are
    written once, after the last run has finished. Returns ``None``.
    """
    all_history = []
    all_best = []

    for run in range(num_runs):
        run_number = run + 1
        seed = run + 1234
        np.random.seed(seed)
        random.seed(seed)

        print(f"\nStarting run {run_number} with seed {seed}")
        _, _, history, best_individuals = firefly_algorithm(
            X, y, num_fireflies=num_fireflies, generations=max_generations
        )

        # One row per (generation, firefly) evaluation.
        all_history.extend(
            {
                'Run': run_number,
                'Seed': seed,
                'Generation': generation,
                'Firefly': idx,
                'C': C_val,
                'Gamma': gamma_val,
                'Fitness': fit_val,
            }
            for generation, gen_population in enumerate(history)
            for idx, (C_val, gamma_val, fit_val) in enumerate(gen_population)
        )

        # One row per generation: that generation's best individual.
        all_best.extend({'Run': run_number, 'Seed': seed, **bi} for bi in best_individuals)

    history_df = pd.DataFrame(all_history)
    best_df = pd.DataFrame(all_best)

    history_df.to_csv('firefly_algorithm_history.csv', index=False)
    best_df.to_csv('best_individuals_per_generation.csv', index=False)

    print("\nOptimization complete for all runs. Results saved to CSV files.")

In [None]:
# Launch the firefly search on the training split: 30 separately seeded runs
# of 10 generations each. run_optimization writes the results to CSV files.
run_optimization(X_train, y_train, num_runs=30, max_generations=10)


Starting run 1 with seed 1234
Generation 0: Best => C: 43.7784, Gamma: 0.753905, Fitness: 0.7900
Generation 0 Diversity: 0.005540
Generation 1: Best => C: 43.7784, Gamma: 0.753905, Fitness: 0.7900
Generation 1 Diversity: 0.007631
Generation 2: Best => C: 91.4809, Gamma: 0.000100, Fitness: 0.8450
Generation 2 Diversity: 0.024067
Generation 3: Best => C: 91.4809, Gamma: 0.000100, Fitness: 0.8450
Generation 3 Diversity: 0.024216
Generation 4: Best => C: 91.4809, Gamma: 0.000100, Fitness: 0.8450
Generation 4 Diversity: 0.022570
Generation 5: Best => C: 91.4809, Gamma: 0.000100, Fitness: 0.8450
Generation 5 Diversity: 0.023431
Generation 6: Best => C: 91.4809, Gamma: 0.000100, Fitness: 0.8450
Generation 6 Diversity: 0.023701
Generation 7: Best => C: 91.4809, Gamma: 0.000100, Fitness: 0.8450
Generation 7 Diversity: 0.024534
Generation 8: Best => C: 44.1373, Gamma: 0.014991, Fitness: 0.8528
Generation 8 Diversity: 0.025552
Generation 9: Best => C: 44.1373, Gamma: 0.014991, Fitness: 0.8528
Ge

In [60]:
# NOTE(review): this repeats the run_optimization call of the previous cell
# (30 runs; generations taken from the module-level max_generations) and would
# overwrite the same CSV files. Its execution count and saved output use an
# older print format -- looks like a leftover cell; confirm before re-running.
run_optimization(X_train, y_train, 30,max_generations)

Generation 0: Best Individual => C: 8.388980864459342, Gamma: 0.012466268120326685, Fitness: 0.8495011683955472
Generation 0 Diversity: 0.014716472122671737
Generation 1: Best Individual => C: 8.388980864459342, Gamma: 0.012466268120326685, Fitness: 0.8495011683955472
Generation 1 Diversity: 0.003727253403170672
Generation 2: Best Individual => C: 4.374762009508561, Gamma: 0.03664650147931552, Fitness: 0.8497508560048558
Generation 2 Diversity: 0.0026177573337152716
Generation 3: Best Individual => C: 4.703193214182945, Gamma: 0.035225017613312065, Fitness: 0.8505008560985824
Generation 3 Diversity: 0.0019496111735924446
Generation 4: Best Individual => C: 8.858580795760531, Gamma: 0.026073911021231293, Fitness: 0.8507501688015707
Generation 4 Diversity: 0.0015882165214977041
Generation 5: Best Individual => C: 8.858580795760531, Gamma: 0.026073911021231293, Fitness: 0.8507501688015707
Generation 5 Diversity: 0.0010650849223458577
Generation 6: Best Individual => C: 8.799179317482183, 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.interpolate import griddata


# Contour plot of the best fitness found per generation over the (C, gamma) plane.
#
# Input: the per-generation best individuals that run_optimization() writes
# (columns 'Run', 'Seed', 'Generation', 'C', 'Gamma', 'Fitness').
BEST_PER_GENERATION_CSV = 'best_individuals_per_generation.csv'

# Distinct names are used on purpose: rebinding `df`, `gamma` or `fitness` here
# would shadow the raw dataset, the absorption coefficient and the fitness()
# function, and break re-running the earlier cells.
best_per_gen_df = pd.read_csv(BEST_PER_GENERATION_CSV)
# Match column names case-insensitively so 'Gamma'/'Fitness' and
# 'gamma'/'fitness' are both accepted.
best_per_gen_df.columns = best_per_gen_df.columns.str.lower()

C_values = best_per_gen_df['c'].values
gamma_values = best_per_gen_df['gamma'].values
fitness_scores = best_per_gen_df['fitness'].values

# Regular 100 x 100 grid spanning the observed parameter ranges.
C_lin = np.linspace(min(C_values), max(C_values), 100)
gamma_lin = np.linspace(min(gamma_values), max(gamma_values), 100)
C_mesh, gamma_mesh = np.meshgrid(C_lin, gamma_lin)

# Interpolation of the scattered observations onto the grid.
fitness_grid = griddata((C_values, gamma_values), fitness_scores, (C_mesh, gamma_mesh), method='cubic')

fig, ax = plt.subplots(figsize=(10, 8))
cp = ax.contourf(C_mesh, gamma_mesh, fitness_grid, levels=20, cmap='viridis')
fig.colorbar(cp, ax=ax, label='Fitness')
ax.set_xlabel('C')
ax.set_ylabel('Gamma')
ax.set_title('Contour Plot of Fitness by C and Gamma')
fig.tight_layout()
fig.savefig("contour_fitness.png")
