In [4]:
import numpy as np
import gymnasium as gym
import random

# Hyperparameters
# Number of chromosomes created by initialize_population().
population_size = 100
# Number of iterations of the main evolution loop.
num_generations = 200
# Probability threshold compared against random.random() in mutate().
mutation_rate = 0.1
# Probability threshold compared against random.random() in crossover().
crossover_rate = 0.5
# NOTE(review): this flag is not read anywhere in the visible code; elites are
# always carried over regardless of its value.
elitism = True
# Number of top-scoring chromosomes carried over each generation; selection()
# also subtracts it from the number of parents it draws.
elite_size = 5

# Environment
env = gym.make('MountainCar-v0')
# A chromosome is a (state_size, action_size) matrix: the observation is
# multiplied by it and the argmax of the result is taken as the action.
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

# Initialize population
def initialize_population():
    """Build the initial population of random weight matrices.

    Returns:
        A list of ``population_size`` arrays, each of shape
        ``(state_size, action_size)`` drawn from a standard normal
        distribution.
    """
    chromosomes = []
    for _ in range(population_size):
        weights = np.random.randn(state_size, action_size)
        chromosomes.append(weights)
    return chromosomes

# Fitness function
def fitness(chromosome, max_steps=200):
    """Score a chromosome by running one episode in the module-level ``env``.

    The action at every step is the argmax of ``state @ chromosome``.

    Args:
        chromosome: Weight matrix of shape ``(state_size, action_size)``.
        max_steps: Upper bound on the number of environment steps to take.

    Returns:
        The sum of the rewards returned by ``env.step`` during the episode
        (negative in the logged MountainCar runs; higher is better).
    """
    state, _ = env.reset()
    total_reward = 0
    for _ in range(max_steps):
        action = np.argmax(np.dot(state, chromosome))
        # Gymnasium reports episode end through two flags; stepping past
        # either one is invalid, so both must stop the rollout.
        state, reward, terminated, truncated, _ = env.step(action)
        total_reward += reward
        if terminated or truncated:
            break
    return total_reward

# Selection
def selection(population, fitnesses, num_selected=None):
    """Draw parents with fitness-proportional (roulette-wheel) sampling.

    Args:
        population: Sequence of chromosomes.
        fitnesses: Fitness value for each chromosome, in the same order.
        num_selected: How many parents to draw (with replacement). Defaults
            to ``len(population) - elite_size`` so that the elites added by
            the caller bring the population back to its original size.

    Returns:
        A list of ``num_selected`` chromosomes taken from ``population``
        (the same objects, not copies). Empty when nothing is to be drawn.
    """
    if num_selected is None:
        num_selected = len(population) - elite_size
    if num_selected <= 0 or not population:
        return []

    # Shift negative scores so that the worst chromosome gets weight 1 and
    # every weight is positive, as random.choices requires.
    min_fitness = min(fitnesses)
    offset = abs(min_fitness) + 1 if min_fitness < 0 else 0
    adjusted_fitnesses = [f + offset for f in fitnesses]

    # All-zero weights make random.choices raise ValueError; fall back to a
    # uniform draw in that case.
    weights = adjusted_fitnesses if any(adjusted_fitnesses) else None
    return random.choices(population, weights=weights, k=num_selected)

# Crossover
def crossover(parent1, parent2, rate=None):
    """Recombine two parents with single-point, row-wise crossover.

    Args:
        parent1: First parent weight matrix.
        parent2: Second parent weight matrix with the same shape.
        rate: Probability of performing crossover. Defaults to the
            module-level ``crossover_rate``.

    Returns:
        A ``(child1, child2)`` tuple. The children are always new arrays, so
        modifying them never alters the parents.
    """
    if rate is None:
        rate = crossover_rate

    num_rows = parent1.shape[0]
    # A split point needs at least two rows; with fewer, or when the random
    # draw says "no crossover", hand back independent copies of the parents.
    if num_rows < 2 or random.random() >= rate:
        return parent1.copy(), parent2.copy()

    point = random.randint(1, num_rows - 1)
    child1 = np.vstack((parent1[:point], parent2[point:]))
    child2 = np.vstack((parent2[:point], parent1[point:]))
    return child1, child2

# Mutation
def mutate(chromosome, rate=None):
    """Possibly replace one row of a chromosome with fresh random weights.

    Args:
        chromosome: Weight matrix to mutate. It is never modified.
        rate: Probability of mutating. Defaults to the module-level
            ``mutation_rate``.

    Returns:
        The original array when no mutation occurs, otherwise a new array
        in which one randomly chosen row has been redrawn from a standard
        normal distribution.
    """
    if rate is None:
        rate = mutation_rate
    if random.random() >= rate:
        return chromosome

    # Work on a copy: the same array object may also be an elite or appear
    # several times among the selected parents, and must stay intact.
    mutated = chromosome.copy()
    index = random.randrange(mutated.shape[0])
    mutated[index] = np.random.randn(*mutated.shape[1:])
    return mutated

# Main algorithm
population = initialize_population()
best_fitness = -float('inf')
best_chromosome = None

for generation in range(num_generations):
    # Score every chromosome with one episode each.
    fitnesses = [fitness(chromosome) for chromosome in population]
    best_index = int(np.argmax(fitnesses))
    max_fitness = fitnesses[best_index]
    if max_fitness > best_fitness:
        best_fitness = max_fitness
        # Copy so later in-place changes to the population cannot alter it.
        best_chromosome = population[best_index].copy()
    print(f'Generation {generation} | Best fitness: {max_fitness}')

    # Apply elitism. The slice start is computed explicitly because
    # `[-elite_size:]` would select the whole population when elite_size is 0.
    # Elites are copied so that mutation of children sharing the same array
    # object cannot corrupt them.
    # NOTE(review): the `elitism` flag is not consulted; elites are always kept.
    ranked_indices = np.argsort(fitnesses)
    elite_indices = ranked_indices[len(ranked_indices) - elite_size:]
    elites = [population[i].copy() for i in elite_indices]

    selected_population = selection(population, fitnesses)
    next_population = []
    # Pair consecutive parents: (0, 1), (2, 3), ...
    for parent1, parent2 in zip(selected_population[::2], selected_population[1::2]):
        child1, child2 = crossover(parent1, parent2)
        # crossover may hand back the parents themselves and mutate may write
        # in place, so mutate private copies to keep shared arrays intact.
        next_population.extend([mutate(child1.copy()), mutate(child2.copy())])

    # Add elites to the next population
    next_population.extend(elites)

    # Handle the case when the selected population size is odd: the unpaired
    # last parent is carried over unchanged.
    if len(selected_population) % 2 != 0:
        next_population.append(selected_population[-1])

    population = next_population

# Save the best chromosome
np.save('best_chromosome.npy', best_chromosome)
print('Best solution found:', best_chromosome)

# Example of using the saved model
# Release the training environment before `env` is rebound to the rendering
# one; otherwise the first environment is never closed.
env.close()
env = gym.make('MountainCar-v0', render_mode='human')
def load_model():
    """Load the chromosome stored in ``best_chromosome.npy``.

    Returns:
        The array read from ``best_chromosome.npy`` in the current working
        directory.
    """
    saved_weights = np.load('best_chromosome.npy')
    return saved_weights

def run_model(chromosome):
    """Play one rendered episode in the module-level ``env`` and close it.

    The action at every step is the argmax of ``state @ chromosome``.

    Args:
        chromosome: Weight matrix of shape ``(state_size, action_size)``.

    Returns:
        The sum of the rewards returned by ``env.step`` over at most 200
        steps (negative in the logged MountainCar runs).
    """
    total_reward = 0
    try:
        state, _ = env.reset()
        for _ in range(200):
            # NOTE(review): with render_mode='human' Gymnasium is documented
            # to render inside step(); this explicit call is likely
            # redundant. Confirm before removing.
            env.render()
            action = np.argmax(np.dot(state, chromosome))
            # Stop on either end-of-episode flag, not only `terminated`.
            state, reward, terminated, truncated, _ = env.step(action)
            total_reward += reward
            if terminated or truncated:
                break
    finally:
        # Always release the render window, even if the episode raised.
        env.close()
    return total_reward

# Load the model and run it
# Reload the chromosome from disk (rebinding `best_chromosome`) so that the
# demonstration uses the saved file rather than the in-memory array.
best_chromosome = load_model()
# Play one rendered episode; run_model() closes the environment afterwards.
total_reward = run_model(best_chromosome)
print('Total reward using the best model:', total_reward)

Generation 0 | Best fitness: -200.0
Generation 1 | Best fitness: -200.0
Generation 2 | Best fitness: -200.0
Generation 3 | Best fitness: -121.0
Generation 4 | Best fitness: -121.0
Generation 5 | Best fitness: -121.0
Generation 6 | Best fitness: -122.0
Generation 7 | Best fitness: -200.0
Generation 8 | Best fitness: -131.0
Generation 9 | Best fitness: -123.0
Generation 10 | Best fitness: -118.0
Generation 11 | Best fitness: -117.0
Generation 12 | Best fitness: -116.0
Generation 13 | Best fitness: -117.0
Generation 14 | Best fitness: -117.0
Generation 15 | Best fitness: -117.0
Generation 16 | Best fitness: -116.0
Generation 17 | Best fitness: -117.0
Generation 18 | Best fitness: -116.0
Generation 19 | Best fitness: -117.0
Generation 20 | Best fitness: -116.0
Generation 21 | Best fitness: -116.0
Generation 22 | Best fitness: -117.0
Generation 23 | Best fitness: -116.0
Generation 24 | Best fitness: -116.0
Generation 25 | Best fitness: -116.0
Generation 26 | Best fitness: -116.0
Generation 



Total reward using the best model: -132.0
