In [1]:
# --- 0. Install necessary library ---
# Use %pip rather than !pip so the packages are installed into the running kernel's environment (equivalent to: python -m pip install <module>)
%pip install gdown pandas numpy scikit-learn matplotlib seaborn

Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp312-cp312-win_amd64.whl.metadata (15 kB)
Collecting matplotlib
  Downloading matplotlib-3.10.1-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.15.2-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.5.0-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Downloading contourpy-1.3.2-cp312-cp312-win_amd64.whl.metadata (5.5 kB)
Collecting cycler>=0.10 (from matplotlib)
  Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Downloading fonttools-4.57.0-cp312-cp312-win_amd64.whl.metadata (104 kB)
Collecting kiwisolver>=1.3.1 (f

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler # Keep for potential future use
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import gdown # Library to download from Google Drive
from sklearn.impute import SimpleImputer # Import imputer

In [None]:

# --- 1. Download Data from Google Drive ---
# Fetches the CSV identified by `file_id` into `output_file` in the working directory.
file_id = '1dRE9RSdR3pCnDbr5iYHfH2sdR4hOdG3H'
output_file = 'synthetic_data_loaded_with_header.csv' # Changed filename for clarity
url = f'https://drive.google.com/uc?id={file_id}'

print(f"Downloading file from Google Drive (ID: {file_id})...")
try:
    # Using fuzzy=True might help if the direct download link changes slightly
    downloaded_path = gdown.download(url, output_file, quiet=False, fuzzy=True)
except Exception as e:
    # Raise instead of calling exit(): in a notebook exit() tears down the kernel
    # session rather than surfacing a readable error in the failing cell.
    raise RuntimeError(f"Failed to download file: {e}") from e

# Some gdown versions signal a failed download by returning None rather than raising.
if downloaded_path is None:
    raise RuntimeError(f"Failed to download file: gdown returned no output for {url}")
print(f"File saved as {output_file}")

In [11]:
# --- 2. Load Data using Pandas (Assuming Header IS Present) ---
# Reads `output_file` into `data`, treating the first row as the column header.
try:
    data = pd.read_csv(output_file)
except Exception as e:
    # Raise instead of calling exit(): exit() would shut the kernel down and
    # discard the session instead of reporting the failure in this cell.
    raise RuntimeError(f"Failed to load data from {output_file}: {e}") from e

print(f"Data loaded successfully from {output_file} with shape: {data.shape}")
# Display first few rows and info to check data types
print("\nFirst 5 rows of loaded data:")
print(data.head())
print("\nData info:")
data.info()

Data loaded successfully from synthetic_data_loaded_with_header.csv with shape: (500, 590)

First 5 rows of loaded data:
   feature_0  feature_1  feature_2  feature_3  feature_4  feature_5  \
0    2996.01    2500.01  2194.6444  1116.2950     1.3529      100.0   
1    3062.47    2494.43  2160.3667  1302.6607     1.4656      100.0   
2    3024.75    2288.05  2257.1667  1437.5003     1.6769      100.0   
3    2971.75    2462.09  2244.1111  1969.7867     1.2678      100.0   
4    2898.08    2484.55  2207.0444  1085.3232     1.1369      100.0   

   feature_6  feature_7  feature_8  feature_9  ...  feature_580  feature_581  \
0   103.0567     0.1195     1.5362     0.0104  ...       0.0052     737.3048   
1    98.1244     0.1253     1.5291    -0.0038  ...       0.0031      55.8468   
2    98.6889     0.1246     1.2956     0.0088  ...       0.0066     112.8617   
3   100.9333     0.1233     1.4826    -0.0106  ...       0.0040      46.4594   
4    98.0211     0.1214     1.6030    -0.0047  ...  

In [12]:


# --- 3. Prepare Data ---
# Splits `data` into a numeric feature matrix `X` and target vector `y`
# (last column = target), coercing non-numeric entries to NaN and mean-imputing them.
if data.shape[1] <= 1:
    # Raise instead of exit(): exit() would shut the notebook kernel down.
    raise ValueError("Error: Loaded data has only one column. Cannot separate features and target.")

# Assume last column is target 'y', all others are features 'X'
feature_names = data.columns[:-1].tolist() # Get actual feature names from header
X_df = data[feature_names] # Select feature columns using names
y_series = data.iloc[:, -1] # Select target column by position

# Convert to numeric, coercing errors (handles potential non-numeric strings WITHIN data)
X_df_numeric = X_df.apply(pd.to_numeric, errors='coerce')
y_series_numeric = pd.to_numeric(y_series, errors='coerce')

# Columns with no numeric value at all cannot be mean-imputed. SimpleImputer would
# silently discard them, leaving X with fewer columns than `feature_names` and making
# every later index -> name lookup point at the wrong feature. Drop them explicitly
# and rebuild the name list so the two always stay aligned.
empty_columns = X_df_numeric.columns[X_df_numeric.isna().all()].tolist()
if empty_columns:
    print(f"Warning: dropping {len(empty_columns)} feature column(s) with no numeric values: {empty_columns[:5]}...")
    X_df_numeric = X_df_numeric.drop(columns=empty_columns)
feature_names = X_df_numeric.columns.tolist()

print(f"\nFeature names extracted from header: {feature_names[:5]}... (Total: {len(feature_names)})")

# A target without a single numeric value cannot be imputed or learned from.
if y_series_numeric.notna().sum() == 0:
    raise ValueError("Target column contains no numeric values; cannot build a regression target.")
n_missing_target = int(y_series_numeric.isna().sum())
if n_missing_target:
    print(f"Warning: {n_missing_target} target value(s) were missing or non-numeric and will be mean-imputed.")

# Handle NaN values resulting from coercion or missing values in the original file
imputer_X = SimpleImputer(strategy='mean')
X_imputed = imputer_X.fit_transform(X_df_numeric)

imputer_y = SimpleImputer(strategy='mean')
y_imputed = imputer_y.fit_transform(y_series_numeric.values.reshape(-1, 1)).flatten()

# Sanity checks: imputation must leave no NaN and must not change the column count.
if np.isnan(X_imputed).any() or np.isnan(y_imputed).any():
    print("Warning: NaNs still present after imputation. Check columns with all invalid values.")
assert X_imputed.shape[1] == len(feature_names), "feature_names is out of sync with the feature matrix"

X = X_imputed
y = y_imputed

print(f"Features shape after cleaning: {X.shape}")
print(f"Target shape after cleaning: {y.shape}")
n_features_loaded = X.shape[1] # Keep track of the number of features


Feature names extracted from header: ['feature_0', 'feature_1', 'feature_2', 'feature_3', 'feature_4']... (Total: 589)
Features shape after cleaning: (500, 589)
Target shape after cleaning: (500,)


In [None]:

# --- 4. Split Data for Fitness Evaluation ---
# Hold out 30% of the cleaned rows as the validation set used by the GA fitness function.
#
# Stratification only makes sense for a class-label target. The fitness function fits a
# regressor and scores it with MSE, and train_test_split raises ValueError when asked to
# stratify on values that occur only once (the normal case for a continuous target).
# So stratify only when the target looks discrete: few distinct values, each seen >= 2 times.
MAX_STRATIFY_CLASSES = 20
_target_values, _target_counts = np.unique(y, return_counts=True)
_target_is_discrete = (0 < len(_target_values) <= MAX_STRATIFY_CLASSES) and _target_counts.min() >= 2
stratify_labels = y if _target_is_discrete else None

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=stratify_labels
)
n_features = X_train.shape[1] # Use the actual number of features
print(f"\nTraining features shape: {X_train.shape}")
print(f"Validation features shape: {X_val.shape}")


Training features shape: (350, 589)
Validation features shape: (150, 589)


In [None]:
# --- 5. Hybrid Genetic Algorithm for Feature Selection ---

# GA parameters (tuned for a shorter runtime).

# Every GA operator below draws from NumPy's global random generator. Seed it here so
# that a fresh "Restart & Run All" reproduces the same search trajectory.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Population / generations
POP_SIZE = 25           # individuals per generation (previously 30)
N_GEN = 20              # number of generations (previously 30)

# Variation operators
CROSSOVER_RATE = 0.8    # probability that a parent pair is recombined
MUTATION_RATE = 0.1     # per-bit flip probability

# Elitism
ELITISM = True          # carry the best individual(s) into the next generation
N_ELITE = 1             # how many elites are carried over

# Local search
LOCAL_SEARCH_STEPS = 2  # max improvement steps per refined individual (previously 5)
LS_TOP_K = 2            # how many of the best offspring are refined (previously 3)

# Fitness
PENALTY_COEFF = 0.01    # fitness penalty per selected feature

# --- Fitness Function (using Random Forest) ---
def fitness_function(individual):
    """Score a binary feature mask with a small random forest.

    Reads the module-level ``X_train``, ``X_val``, ``y_train``, ``y_val`` and
    ``PENALTY_COEFF``.

    Parameters
    ----------
    individual : numpy array of 0/1 flags, one per feature column.

    Returns
    -------
    float
        ``-MSE - PENALTY_COEFF * (number of selected features)`` measured on the
        validation split, or ``-inf`` when no feature is selected, when NaNs are
        found in the selected data, or when fitting/prediction raises ValueError.
    """
    selected_indices = np.where(individual == 1)[0]
    num_selected = selected_indices.size

    # An empty feature set cannot be modelled at all.
    if num_selected == 0:
        return -np.inf

    X_train_sel = X_train[:, selected_indices]
    X_val_sel = X_val[:, selected_indices]

    # Safeguard: should not trigger after imputation, but keeps the model from failing on NaNs.
    if any(np.isnan(block).any() for block in (X_train_sel, y_train, X_val_sel)):
        print("Warning: NaN detected in data subset for fitness evaluation.")
        return -np.inf

    # Few estimators keep each fitness evaluation cheap; n_jobs=-1 parallelizes the trees.
    model = RandomForestRegressor(n_estimators=15, random_state=42, n_jobs=-1)
    try:
        model.fit(X_train_sel, y_train)
        mse = mean_squared_error(y_val, model.predict(X_val_sel))
    except ValueError as e:
        print(f"Error during model fitting/prediction: {e}")
        return -np.inf

    # Maximization problem: lower error and fewer features both raise the score.
    return -mse - PENALTY_COEFF * num_selected

# --- GA Operators (Initialization, Selection, Crossover, Mutation) ---
# (These functions remain unchanged structurally)
def initialize_population(pop_size, n_features):
    """Create a random 0/1 population of shape ``(pop_size, n_features)``.

    Any row that comes out all zeros gets one randomly chosen bit switched on,
    so every individual selects at least one feature (when ``n_features > 0``).
    """
    pop = np.random.randint(0, 2, size=(pop_size, n_features))
    if n_features > 0:  # nothing to switch on when there are no feature columns
        for row in np.where(pop.sum(axis=1) == 0)[0]:
            pop[row, np.random.randint(0, n_features)] = 1
    return pop

def tournament_selection(pop, fitnesses, k=3):
    """Pick ``len(pop)`` parents by size-``k`` tournaments among finite-fitness individuals.

    Fallbacks:
    - no individual has a finite fitness: a warning is printed and parents are
      drawn uniformly at random from the whole population;
    - fewer than ``k`` finite individuals: parents are drawn at random (with
      replacement) from the finite ones.

    Returns an array with one selected individual per row.
    """
    n_slots = len(pop)
    finite_idx = np.where(np.isfinite(fitnesses))[0]

    if len(finite_idx) == 0:
        print("Warning: No individuals with valid fitness in selection.")
        return pop[np.random.choice(n_slots, size=n_slots)]

    if len(finite_idx) < k:
        # Too few candidates for a full tournament.
        return pop[np.random.choice(finite_idx, size=n_slots, replace=True)]

    winners = []
    for _ in range(n_slots):
        # Contenders are distinct and always have a finite fitness.
        contenders = np.random.choice(finite_idx, k, replace=False)
        champion = contenders[np.argmax(fitnesses[contenders])]
        winners.append(pop[champion])
    return np.array(winners)

def uniform_crossover(parent1, parent2):
    """Build one child by taking each gene from ``parent1`` or ``parent2`` at random."""
    # True -> gene comes from parent1, False -> gene comes from parent2.
    take_from_first = np.random.randint(0, 2, size=len(parent1), dtype=bool)
    return np.where(take_from_first, parent1, parent2)

def mutate(individual, mutation_rate):
    """Return a copy of ``individual`` with each bit flipped with probability ``mutation_rate``.

    The input is left untouched. If the flips leave no bit set, one randomly
    chosen position is switched back on so the result is never all zeros.
    """
    flip = np.random.rand(len(individual)) < mutation_rate
    mutant = individual.copy()
    mutant[flip] = 1 - mutant[flip]

    # Rescue an individual that lost every selected feature.
    if len(mutant) > 0 and np.sum(mutant) == 0:
        mutant[np.random.randint(0, len(mutant))] = 1
    return mutant

# --- Local Search ---
# (This function remains unchanged structurally)
def local_search(individual, fitness_func, steps=LOCAL_SEARCH_STEPS): # Uses tuned steps
    """First-improvement bit-flip hill climbing around ``individual``.

    In each of at most ``steps`` rounds the bit positions are visited in a freshly
    shuffled order; the first single-bit flip that yields a finite, strictly better
    fitness is accepted and the round ends. The search stops early when a full pass
    finds no improving flip. Flips that would produce an all-zero individual are skipped.

    Returns the best individual found (a copy; the input is not modified). If the
    starting individual has a non-finite fitness it is returned unchanged.
    """
    best = individual.copy()
    current_fitness = fitness_func(best)
    if not np.isfinite(current_fitness):
        return best  # nothing to compare against

    flip_order = np.arange(len(best))
    for _ in range(steps):
        np.random.shuffle(flip_order)
        for idx in flip_order:
            candidate = best.copy()
            candidate[idx] = 1 - candidate[idx]
            if np.sum(candidate) == 0:
                continue  # an empty feature set is never a valid candidate
            candidate_fitness = fitness_func(candidate)
            if np.isfinite(candidate_fitness) and candidate_fitness > current_fitness:
                best, current_fitness = candidate, candidate_fitness
                break  # first improvement: move on to the next round
        else:
            # Full pass without any improving flip: local optimum reached.
            break
    return best

# --- Main HGA Loop ---
# (This function remains unchanged structurally, but uses the tuned parameters)
def hybrid_ga_feature_selection():
    """Run the hybrid GA (genetic operators + local search) for feature selection.

    Uses the module-level settings ``POP_SIZE``, ``N_GEN``, ``CROSSOVER_RATE``,
    ``MUTATION_RATE``, ``ELITISM``, ``N_ELITE``, ``LS_TOP_K`` and ``n_features``, and the
    sibling functions ``initialize_population``, ``fitness_function``,
    ``tournament_selection``, ``uniform_crossover``, ``mutate`` and ``local_search``.

    Each generation: select parents, recombine and mutate them, refine the best
    ``LS_TOP_K`` offspring with local search, re-insert the elites, then update the
    overall best with the resulting population. Fitness values are carried over from
    one generation to the next instead of being recomputed, and the population produced
    by the final generation is included when the overall best is determined.

    Returns
    -------
    tuple
        ``(best_individual, best_fitnesses, features_selected_history)`` where
        ``best_individual`` is the best 0/1 mask found (``None`` if no individual ever had
        a finite fitness), and the two lists hold, per generation, the best fitness so far
        and the number of features in that best mask.
    """

    def _evaluate(pop):
        # Fitness of every individual, as a float array aligned with `pop`.
        return np.array([fitness_function(ind) for ind in pop])

    def _update_best(pop, fits, best_ind, best_fit):
        # Return the (individual, fitness) pair to keep after looking at `pop`.
        finite = np.where(np.isfinite(fits))[0]
        if len(finite) > 0:
            top = finite[np.argmax(fits[finite])]
            if fits[top] > best_fit:
                return pop[top].copy(), fits[top]
        return best_ind, best_fit

    def _n_selected(best_ind):
        return np.sum(best_ind) if best_ind is not None else 0

    population = initialize_population(POP_SIZE, n_features)
    best_fitnesses = []
    features_selected_history = []

    print("\nStarting Hybrid Genetic Algorithm...")
    fitnesses = _evaluate(population)
    best_individual_overall, best_fit_overall = _update_best(population, fitnesses, None, -np.inf)

    for gen in range(N_GEN): # Uses tuned N_GEN
        valid_fitness_indices = np.where(np.isfinite(fitnesses))[0]
        if len(valid_fitness_indices) == 0:
            print(f"Warning: No valid individuals in generation {gen+1}. Stopping early.")
            # Keep the last known best if available, otherwise record NaN / 0
            best_fitnesses.append(best_fit_overall if np.isfinite(best_fit_overall) else np.nan)
            features_selected_history.append(_n_selected(best_individual_overall))
            break # Stop the GA run

        # Elitism: remember the best individuals together with their known fitness.
        # The N_ELITE > 0 guard matters because a [-0:] slice would select everything.
        if ELITISM and N_ELITE > 0:
            ranked_valid = valid_fitness_indices[np.argsort(fitnesses[valid_fitness_indices])]
            elite_indices = ranked_valid[-N_ELITE:]
            elites = population[elite_indices].copy()
            elite_fitnesses = fitnesses[elite_indices].copy()
        else:
            elites = np.empty((0, n_features), dtype=int)
            elite_fitnesses = np.empty(0)

        # Selection
        selected_parents = tournament_selection(population, fitnesses)

        # Crossover & Mutation
        offspring = np.empty_like(population)
        for i in range(0, POP_SIZE, 2): # Uses tuned POP_SIZE
            # With an odd POP_SIZE the last parent is paired with the first one.
            p1_idx, p2_idx = i, (i + 1) % POP_SIZE
            parent1, parent2 = selected_parents[p1_idx], selected_parents[p2_idx]
            if np.random.rand() < CROSSOVER_RATE:
                child1 = uniform_crossover(parent1, parent2)
                child2 = uniform_crossover(parent2, parent1)
            else:
                child1, child2 = parent1.copy(), parent2.copy()
            offspring[i] = mutate(child1, MUTATION_RATE)
            if i + 1 < POP_SIZE:
                offspring[i+1] = mutate(child2, MUTATION_RATE)

        offspring_fitnesses = _evaluate(offspring)

        # Local search on the top K offspring; only the refined ones are re-scored.
        # The LS_TOP_K > 0 guard prevents [-0:] from selecting every offspring.
        if LS_TOP_K > 0:
            valid_offspring = np.where(np.isfinite(offspring_fitnesses))[0]
            ranked_offspring = valid_offspring[np.argsort(offspring_fitnesses[valid_offspring])]
            for idx in ranked_offspring[-LS_TOP_K:]:
                offspring[idx] = local_search(offspring[idx], fitness_function)
                offspring_fitnesses[idx] = fitness_function(offspring[idx])

        # Replacement with elitism: elites overwrite the worst valid offspring.
        if len(elites) > 0:
            valid_post_ls = np.where(np.isfinite(offspring_fitnesses))[0]
            if len(valid_post_ls) >= N_ELITE:
                ranked_post_ls = valid_post_ls[np.argsort(offspring_fitnesses[valid_post_ls])]
                worst_indices = ranked_post_ls[:N_ELITE]
                num_to_replace = min(len(worst_indices), len(elites))
                offspring[worst_indices[:num_to_replace]] = elites[:num_to_replace]
                offspring_fitnesses[worst_indices[:num_to_replace]] = elite_fitnesses[:num_to_replace]

        # The new population keeps its already computed fitness values.
        population, fitnesses = offspring, offspring_fitnesses

        # Update the overall best with this generation's result, so improvements
        # found in the last generation are not lost.
        best_individual_overall, best_fit_overall = _update_best(
            population, fitnesses, best_individual_overall, best_fit_overall
        )
        best_fitnesses.append(best_fit_overall)
        current_best_features = _n_selected(best_individual_overall)
        features_selected_history.append(current_best_features)

        print(f"Generation {gen+1}/{N_GEN}: Best Fitness = {best_fit_overall:.4f}, Features Selected = {current_best_features}")

    print("Hybrid Genetic Algorithm finished.")
    return best_individual_overall, best_fitnesses, features_selected_history



In [None]:

# --- 6. Run the Hybrid GA ---
best_solution, fitness_history, features_selected_history = hybrid_ga_feature_selection()

# --- 7. Results ---
# Report the winning feature subset, translating column indices back to header names.
if best_solution is None:
    print("\nNo solution found or algorithm stopped early.")
else:
    selected_feature_indices = np.where(best_solution == 1)[0]
    # Use ACTUAL feature names from the header
    selected_names = [feature_names[i] for i in selected_feature_indices]

    print("\n--- HGA Results ---")
    print("Best feature subset found:")
    print("Selected feature indices:", selected_feature_indices)
    print("Selected feature names:", selected_names)
    print("Number of features selected:", len(selected_feature_indices))
    print(f"Final Best Fitness: {fitness_history[-1]:.4f}")


Starting Hybrid Genetic Algorithm...
Generation 1/30: Best Fitness = -9622.4052, Features Selected = 306
Generation 2/30: Best Fitness = -9271.5723, Features Selected = 306
Generation 3/30: Best Fitness = -9271.5723, Features Selected = 306
Generation 4/30: Best Fitness = -9271.5723, Features Selected = 306
Generation 5/30: Best Fitness = -8850.7527, Features Selected = 299
Generation 6/30: Best Fitness = -8850.7527, Features Selected = 299
Generation 7/30: Best Fitness = -8850.7527, Features Selected = 299
Generation 8/30: Best Fitness = -8850.7527, Features Selected = 299
Generation 9/30: Best Fitness = -8850.7527, Features Selected = 299
Generation 10/30: Best Fitness = -8850.7527, Features Selected = 299
Generation 11/30: Best Fitness = -8850.7527, Features Selected = 299
Generation 12/30: Best Fitness = -8850.7527, Features Selected = 299
Generation 13/30: Best Fitness = -8850.7527, Features Selected = 299
Generation 14/30: Best Fitness = -8850.7527, Features Selected = 299
Gener

In [None]:

# --- 8. Plot Fitness Curve and Number of Features Selected ---
# Two side-by-side panels: best fitness so far (left) and size of the best subset (right).
has_results = best_solution is not None and fitness_history and features_selected_history
if not has_results:
    print("Cannot plot results as no valid solution was found or history is empty.")
else:
    generations = range(1, len(fitness_history) + 1)
    fig, (ax_fitness, ax_features) = plt.subplots(1, 2, figsize=(14, 6))

    ax_fitness.plot(generations, fitness_history, marker='o', linestyle='-')
    ax_fitness.set_xlabel('Generation')
    ax_fitness.set_ylabel('Best Fitness')
    ax_fitness.set_title('HGA Convergence: Best Fitness per Generation')
    ax_fitness.grid(True)

    ax_features.plot(generations, features_selected_history, marker='s', linestyle='-', color='orange')
    ax_features.set_xlabel('Generation')
    ax_features.set_ylabel('Number of Features Selected')
    ax_features.set_title('HGA: Number of Features in Best Solution per Generation')
    ax_features.grid(True)

    fig.tight_layout()
    plt.show()
