### A regression problem

We want to predict y for x in a sine function.

In [1]:
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset

# Generating a sine wave with noise
x = torch.linspace(0, 2 * torch.pi, 2000).view(-1, 1)  # 1000 samples
y = torch.sin(x) + 0.1 * torch.randn(x.size())         # Add noise

# Task: Split into training and validation sets
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=42)

# Wrap the datasets in PyTorch DataLoaders
train_dataset = TensorDataset(torch.tensor(x_train).float(), torch.tensor(y_train).float())
val_dataset = TensorDataset(torch.tensor(x_val).float(), torch.tensor(y_val).float())

# Leave batch size for tuning later
print(f"Training samples: {len(train_dataset)}, Validation samples: {len(val_dataset)}")


Training samples: 1600, Validation samples: 400


  train_dataset = TensorDataset(torch.tensor(x_train).float(), torch.tensor(y_train).float())
  val_dataset = TensorDataset(torch.tensor(x_val).float(), torch.tensor(y_val).float())


#### This is our deep Neural Network.

In [2]:
import torch.nn as nn

# Define a deeper feedforward neural network
class DeeperNN(nn.Module):
    def __init__(self, hidden_units=128):
        super(DeeperNN, self).__init__()
        self.hidden1 = nn.Linear(1, hidden_units)
        self.hidden2 = nn.Linear(hidden_units, hidden_units)
        self.hidden3 = nn.Linear(hidden_units, hidden_units)
        self.output = nn.Linear(hidden_units, 1)

    def forward(self, x):
        x = torch.relu(self.hidden1(x))
        x = torch.relu(self.hidden2(x))
        x = torch.relu(self.hidden3(x))
        return self.output(x)

# Initialize the model
model = DeeperNN(hidden_units=128)
print(model)


DeeperNN(
  (hidden1): Linear(in_features=1, out_features=128, bias=True)
  (hidden2): Linear(in_features=128, out_features=128, bias=True)
  (hidden3): Linear(in_features=128, out_features=128, bias=True)
  (output): Linear(in_features=128, out_features=1, bias=True)
)


#### Here we define basic a training function. 

In [3]:
import torch.optim as optim

# Define the training function
def train_model(model, criterion, optimizer, train_loader, val_loader, epochs):
    model.train()
    for epoch in range(epochs):
        for batch_x, batch_y in train_loader:
            optimizer.zero_grad()
            predictions = model(batch_x)
            loss = criterion(predictions, batch_y)
            loss.backward()
            optimizer.step()

    # Evaluate on validation set
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch_x, batch_y in val_loader:
            predictions = model(batch_x)
            val_loss += criterion(predictions, batch_y).item()
    return val_loss / len(val_loader)


# Task 1

Expand the hyperparameters for Grid Search (directly below). 
The more values you add, the bigger the search space will be - this is usually good practice, as you don't want to get stuck in local minima but rather find the global minumum -> The hyperparameter combination with the least validation loss. 


![Local Minimum](https://upload.wikimedia.org/wikipedia/commons/thumb/6/68/Extrema_example_original.svg/500px-Extrema_example_original.svg.png)



Run the cell after you expanded the hyperparameter sets by as many values as you think are enough. The training might take a few minutes. 



Compare the results with the Random Search results one block below Grid Search. What did you notice? 

In [4]:
# Define hyperparameters for grid search
hidden_units_grid = [8, 16, 32, 64, 128, 256]
learning_rates = [1, 0.1, 0.001]
batch_sizes = [2, 4, 8, 16, 32]
epochs = 20

Our Grid Search algorithm.

In [5]:
best_loss = float('inf')
best_params = None

# Grid Search iterates over all possible hyperparameter combinations
for hidden_units in hidden_units_grid:
    for lr in learning_rates:
        for batch_size in batch_sizes:
            
            # Prepare DataLoader
            train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
            val_loader = DataLoader(val_dataset, batch_size=batch_size)

            # Initialize model, criterion, and optimizer
            model = DeeperNN(hidden_units=hidden_units)
            criterion = nn.MSELoss()
            optimizer = optim.Adam(model.parameters(), lr=lr)

            # Train the model and get validation loss
            val_loss = train_model(model, criterion, optimizer, train_loader, val_loader, epochs)
            print(f"hidden_units={hidden_units}, lr={lr}, batch_size={batch_size}, val_loss={val_loss:.4f}")

            # Track the best configuration
            if val_loss < best_loss:
                best_loss = val_loss
                best_params = (hidden_units, lr, batch_size)

print(f"Best Grid Search Params: hidden_units={best_params[0]}, lr={best_params[1]}, batch_size={best_params[2]} with val_loss={best_loss:.4f}")


hidden_units=8, lr=1, batch_size=2, val_loss=0.5614
hidden_units=8, lr=1, batch_size=4, val_loss=0.5555
hidden_units=8, lr=1, batch_size=8, val_loss=0.5607
hidden_units=8, lr=1, batch_size=16, val_loss=0.5721
hidden_units=8, lr=1, batch_size=32, val_loss=0.5550
hidden_units=8, lr=0.1, batch_size=2, val_loss=0.5774
hidden_units=8, lr=0.1, batch_size=4, val_loss=0.0609
hidden_units=8, lr=0.1, batch_size=8, val_loss=0.0747
hidden_units=8, lr=0.1, batch_size=16, val_loss=0.0128
hidden_units=8, lr=0.1, batch_size=32, val_loss=0.0227
hidden_units=8, lr=0.001, batch_size=2, val_loss=0.0116
hidden_units=8, lr=0.001, batch_size=4, val_loss=0.0120
hidden_units=8, lr=0.001, batch_size=8, val_loss=0.0116
hidden_units=8, lr=0.001, batch_size=16, val_loss=0.0135
hidden_units=8, lr=0.001, batch_size=32, val_loss=0.0283
hidden_units=16, lr=1, batch_size=2, val_loss=0.5742
hidden_units=16, lr=1, batch_size=4, val_loss=0.5498
hidden_units=16, lr=1, batch_size=8, val_loss=0.6383
hidden_units=16, lr=1, ba

### Random Search

We will use the same parameters you defined in the first task. However we won't explore every possible combination, but take random combinations for a number of trials - you can tweak the number of trials if you like, however this is not necessary. 

# Task 1.1

Implement an algorithm that will randomly chose values from the sets you defined above. 

Hint: Check the import statement. 

In [6]:
import random

# Random Search
num_trials = 10
best_loss = float('inf')
best_params = None

for _ in range(num_trials):
    # Randomly sample hyperparameters
    hidden_units = random.choice(hidden_units_grid)
    lr = random.choice(learning_rates)
    batch_size = random.choice(batch_sizes)

    # Prepare DataLoader
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    # Initialize model, criterion, and optimizer
    model = DeeperNN(hidden_units=hidden_units)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Train the model and get validation loss
    val_loss = train_model(model, criterion, optimizer, train_loader, val_loader, epochs)
    print(f"hidden_units={hidden_units}, lr={lr}, batch_size={batch_size}, val_loss={val_loss:.4f}")

    # Track the best configuration
    if val_loss < best_loss:
        best_loss = val_loss
        best_params = (hidden_units, lr, batch_size)

print(f"Best Random Search Params: hidden_units={best_params[0]}, lr={best_params[1]}, batch_size={best_params[2]} with val_loss={best_loss:.4f}")


hidden_units=128, lr=1, batch_size=32, val_loss=17.6688
hidden_units=128, lr=0.001, batch_size=2, val_loss=0.0139
hidden_units=64, lr=0.001, batch_size=2, val_loss=0.0138
hidden_units=64, lr=1, batch_size=16, val_loss=0.5496
hidden_units=64, lr=0.1, batch_size=4, val_loss=0.1012
hidden_units=64, lr=0.001, batch_size=16, val_loss=0.0114
hidden_units=64, lr=0.1, batch_size=2, val_loss=0.5616
hidden_units=128, lr=0.001, batch_size=32, val_loss=0.0121
hidden_units=16, lr=0.1, batch_size=2, val_loss=0.5497
hidden_units=32, lr=0.1, batch_size=8, val_loss=0.0724
Best Random Search Params: hidden_units=64, lr=0.001, batch_size=16 with val_loss=0.0114


#### What did you notice? What are key differences? 

Put your answers here. 

#### Even if results may be better by a small margin, what makes Grid Search unpractial compared to Random Search?

Hint: Think about the relation between the number of Hyperparameters and the number of combinations that result from that. Especially for harder problems with more dimensions. 

Put your answers here







## Evolutionary Algorithms

The following codeblock is a basic implementation of an Evolutionary Algorithm. Go over it and try to understand what happens. Remember what was told in the theoretical part. 

In [None]:
# Evaluate the model on the validation set
def evaluate_model(model, train_loader, val_loader, learning_rate, epochs):
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.MSELoss()

    # Train the model
    for epoch in range(epochs):
        for batch_x, batch_y in train_loader:
            optimizer.zero_grad()
            predictions = model(batch_x)
            loss = criterion(predictions, batch_y)
            loss.backward()
            optimizer.step()

    # Evaluate on the validation set
    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        for batch_x, batch_y in val_loader:
            predictions = model(batch_x)
            loss = criterion(predictions, batch_y)
            total_loss += loss.item()
    
    return total_loss / len(val_loader)

# Perform mutation on hyperparameters
def mutate(params, mutation_rate=0.1):
    new_params = params.copy()
    if random.random() < mutation_rate:
        if random.random() < 0.5:
            new_params['hidden_units'] = random.randint(4, 128)  # Choose hidden units randomly between 4 and 128
            new_params['learning_rate'] = random.uniform(0.0001, 1)  # Choose learning rate randomly in range
            new_params['batch_size'] = random.randint(2, 128)  # Choose batch size between 2 and 128
            new_params['epochs'] = random.randint(10, 100)  # Mutate epochs within a range
        else:
            new_params['hidden_units'] += random.randint(-8, 8)  # Choose hidden units randomly between 4 and 128
            new_params['learning_rate'] += random.uniform(-0.001, 0.001)  # Choose learning rate randomly in range
            new_params['batch_size'] += random.randint(-8, 8)  # Choose batch size between 2 and 128
            new_params['epochs'] += random.randint(-8, 8)  # Mutate epochs within a range
    return new_params

# Perform crossover between two sets of hyperparameters
def crossover(parent1, parent2):
    child = {}
    for key in parent1:
        child[key] = parent1[key] if random.random() > 0.5 else parent2[key]
    return child

# Main evolutionary algorithm
def evolutionary_algorithm(num_generations=20, population_size=10, mutation_rate=0.5):
    # Initialize population with random hyperparameters
    population = []
    for _ in range(population_size):
        params = {
            'hidden_units': random.randint(4, 1024),  # Randomly chosen hidden_units
            'learning_rate': random.uniform(0.0001, 1),  # Randomly chosen learning_rate
            'batch_size': random.randint(2, 1048),  # Randomly chosen batch_size
            'epochs': random.randint(1, 50)  # Randomly chosen epochs
        }
        population.append(params)

    best_hyperparams = None
    best_loss = float('inf')

    for generation in range(num_generations):
        # Evaluate fitness of each set of hyperparameters in the population
        fitness = []
        for params in population:
            # Create DataLoaders with the specified batch size
            train_loader = DataLoader(train_dataset, batch_size=params['batch_size'], shuffle=True)
            val_loader = DataLoader(val_dataset, batch_size=params['batch_size'])

            # Initialize a new model with the current hyperparameters
            model = DeeperNN(hidden_units=params['hidden_units'])

            # Evaluate the model's validation loss
            val_loss = evaluate_model(model, train_loader, val_loader, params['learning_rate'], params['epochs'])
            fitness.append((val_loss, params))

            # Update the best hyperparameters if the current model is better
            if val_loss < best_loss:
                best_loss = val_loss
                best_hyperparams = params

        # Sort population based on fitness (lower loss = better fitness)
        fitness.sort(key=lambda x: x[0])
        population = [individual[1] for individual in fitness]

        # Print the best fitness in the current generation
        print(f"Generation {generation + 1}: Best Validation Loss = {fitness[0][0]:.6f}")

        # Elitism: Keep the top 2 hyperparameter sets unchanged
        next_population = population[:2]

        # Generate the rest of the population via crossover and mutation
        while len(next_population) < population_size:
            parent1 = random.choice(population[:population_size // 2])
            parent2 = random.choice(population[:population_size // 2])
            child = crossover(parent1, parent2)
            child = mutate(child, mutation_rate=mutation_rate)  # Mutate the child's hyperparameters
            next_population.append(child)

        # Update the population
        population = next_population

    return best_hyperparams, best_loss

# Task 2: Evolutionary Algorithms

![Local Minimum](https://www.americanscientist.org/sites/americanscientist.org/files/20144141249210337-2014-05TechnologueFp170.jpg)

Now this one is a bit more tricky! We now want to use a Evolutionary Algorithm to find good Hyperparameters. Remember the theory behind selections. 


Run the algorithm with the default values first. What do you notice? 

If your machine is not that powerful training might take more than a few minutes. In that case you can stop the calculations after few generations. 

- Remember what the mutation rate does? Maybe try tweaking this value first in a useful manner. Hint: Check the mutation function. Write down why the default value is maybe not the best fit. 

- Maybe you want to check the population size next. Check how the algorithm choses the parents to influence the next generations population in the code above and tweak the value. So, why is the default population size of 4 maybe not the best choice? 

- Alright, step by step in the right direction. Maybe it makes sense to increase the number of generations? Check the comic in respect to the randomness in mutations to understand why ;)

Here is space for your notes and answers..

In [24]:
# Run the evolutionary algorithm
if __name__ == "__main__":
    num_generations = 10
    population_size = 8
    mutation_rate = 0.2

    best_hyperparams, best_loss = evolutionary_algorithm(
        num_generations=num_generations, 
        population_size=population_size,
        mutation_rate=mutation_rate
    )

    # Print the best hyperparameters and corresponding loss
    print("\nBest Hyperparameters:")
    print(f"Hidden Units: {best_hyperparams['hidden_units']}")
    print(f"Learning Rate: {best_hyperparams['learning_rate']:.6f}")
    print(f"Batch Size: {best_hyperparams['batch_size']}")
    print(f"Epochs: {best_hyperparams['epochs']}")
    print(f"Best Validation Loss: {best_loss:.6f}")

Generation 1: Best Validation Loss = 0.012560


TypeError: 'float' object cannot be interpreted as an integer

# Task 2.1 Some theory

Let's think about mutations and if the implementation in the current state can be optimized. 

- Is it useful, that mutation in late generations occur in the same rate and same "brutality" as in early generations? Should we address this? If yes, how? 

1) Adaptive Mutation Rate based on generations (inversely proportional)


In [9]:
# mutation_rate = base_mutation_rate * (1 - generation / max_generations)


2) Gradual Changes

In [10]:
# if random.random() < mutation_rate:
#         new_params['hidden_units'] += random.randint(-8, 8)  # Adjust within ±8
#         new_params['hidden_units'] = max(4, min(new_params['hidden_units'], 128))  # Ensure bounds

3) Hybrid approach (combination of 1 and 2)

In [11]:
# if random.random() < mutation_rate:
#     if random.random() < 0.5:
#         new_params['hidden_units'] = random.randint(4, 128)  # Abrupt change
#     else:
#         new_params['hidden_units'] += random.randint(-8, 8)  # Small adjustment
#         new_params['hidden_units'] = max(4, min(new_params['hidden_units'], 128))
