<a href="https://colab.research.google.com/github/marsggbo/AutoMLDemos/blob/master/ch3/RL-NAS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1. 搜索空间

In [42]:
import random
import torch
import torch.optim as optim
import torch.nn.functional as F
from torch import nn

# Search space: for each of the three conv stages, the candidate output
# channel counts and the candidate kernel sizes. Key order matters because
# encodings are positional (channels_1..3 first, then kernel_size_1..3).
spaces = {
    f'{prefix}_{stage}': list(options)
    for prefix, options in (
        ('channels', (16, 32, 64)),
        ('kernel_size', (3, 5, 7)),
    )
    for stage in (1, 2, 3)
}

def sample_encoding():
    """Return a random model encoding.

    One option is drawn uniformly with ``random.choice`` from every entry of
    the module-level ``spaces`` dict, in the dict's iteration order.
    """
    picked = []
    for options in spaces.values():
        picked.append(random.choice(options))
    return picked

def build_model(c1, c2, c3, ks1, ks2, ks3, num_classes=10, in_channels=3):
    """Build a three-stage CNN classifier from an architecture encoding.

    Each stage is Conv2d -> BatchNorm2d -> ReLU. The first stage uses
    stride 1 and the following two use stride 2. The stages are followed by
    global average pooling, flattening and a linear classifier.

    Args:
        c1, c2, c3: Output channels of conv stages 1, 2 and 3.
        ks1, ks2, ks3: Kernel sizes of conv stages 1, 2 and 3; each conv is
            padded by ``kernel_size // 2``.
        num_classes: Number of outputs of the final linear layer
            (defaults to 10, the value that used to be hard-coded).
        in_channels: Number of channels of the input image
            (defaults to 3, the value that used to be hard-coded).

    Returns:
        An ``nn.Sequential`` holding the layers in the order described above.
    """
    # (in_channels, out_channels, kernel_size, stride) for every conv stage.
    stages = (
        (in_channels, c1, ks1, 1),
        (c1, c2, ks2, 2),
        (c2, c3, ks3, 2),
    )

    layers = []
    for n_in, n_out, kernel, stride in stages:
        layers += [
            nn.Conv2d(n_in, n_out, kernel_size=kernel, stride=stride,
                      padding=kernel // 2),
            nn.BatchNorm2d(n_out),
            nn.ReLU(),
        ]

    # Classification head.
    layers += [
        nn.AdaptiveAvgPool2d(1),
        nn.Flatten(1),
        nn.Linear(c3, num_classes),
    ]
    return nn.Sequential(*layers)
    
def evaluate_model(model, num_episodes=10):
    """Return a performance score for ``model``.

    Placeholder implementation: a real evaluation (for example, accuracy on
    a validation set) would go here. To keep the demo simple, a single value
    drawn with ``torch.rand`` is returned; ``model`` and ``num_episodes``
    are currently unused.
    """
    random_score = torch.rand(1)
    return random_score[0].item()

## 2. 策略梯度

### 2.1 策略网络

In [43]:
def policy_network(input_size, action_space):
    """Create the policy MLP: Linear(input_size, 128) -> ReLU -> Linear(128, action_space)."""
    hidden_units = 128
    layers = [
        nn.Linear(input_size, hidden_units),
        nn.ReLU(),
        nn.Linear(hidden_units, action_space),
    ]
    return nn.Sequential(*layers)

### 2.2 定义智能体（策略梯度）

In [44]:

class PolicyGradient:
    """Policy-gradient agent that samples architecture encodings.

    The policy network maps the current encoding to ``action_size`` logits.
    These are reshaped into ``state_size`` rows, one independent categorical
    decision per entry of the module-level ``spaces`` dict.

    Args:
        state_size: Number of decisions in an encoding (length of the state).
        action_size: Total number of logits produced by the policy network.
            Must be a multiple of ``state_size``; the quotient is the number
            of options per decision and must not exceed the number of
            choices of the corresponding ``spaces`` entry.
        lr: Learning rate of the Adam optimizer.
        gamma: Discount factor used when accumulating returns.
    """

    def __init__(self, state_size, action_size, lr, gamma):
        self.state_size = state_size
        self.action_size = action_size
        self.lr = lr
        self.gamma = gamma

        self.policy_network = policy_network(state_size, action_size)
        self.optimizer = optim.Adam(self.policy_network.parameters(), lr=self.lr)

        # Per-episode buffers; filled by add_experience(), cleared by
        # update_policy().
        self.episode_rewards = []
        self.episode_log_probs = []

    def select_action(self, state):
        """Sample a new encoding from the policy given the current one.

        Args:
            state: Sequence of ``state_size`` numbers (the current encoding).

        Returns:
            A tuple ``(encoding, log_probs)``: ``encoding`` is a list with
            one chosen value per ``spaces`` entry, and ``log_probs`` is a
            1-D tensor with the log-probability of each of those choices.
        """
        state = torch.as_tensor(state, dtype=torch.float32).view(1, -1)

        # One row of logits per decision. The column count is derived from
        # action_size instead of being hard-coded to 3.
        logits = self.policy_network(state).view(self.state_size, -1)

        # A single batched distribution handles every decision at once;
        # passing logits avoids an explicit softmax.
        dist = torch.distributions.Categorical(logits=logits)
        actions = dist.sample()
        log_probs = dist.log_prob(actions)

        encoding = [
            choices[index]
            for choices, index in zip(spaces.values(), actions.tolist())
        ]
        return encoding, log_probs

    def update_policy(self):
        """Run one policy-gradient step on the buffered episode, then clear it.

        Does nothing when no experience has been recorded.
        """
        if not self.episode_rewards:
            # torch.cat() on an empty list would raise; nothing to learn from.
            return

        # Discounted returns, accumulated back-to-front and then reversed
        # (avoids the quadratic cost of inserting at the list head).
        returns = []
        running_return = 0.0
        for reward in reversed(self.episode_rewards):
            running_return = reward + self.gamma * running_return
            returns.append(running_return)
        returns.reverse()

        returns = torch.tensor(returns, dtype=torch.float32)
        if returns.numel() > 1:
            # The sample standard deviation of a single value is NaN, which
            # would turn the loss (and then every weight) into NaN, so only
            # normalise when there are at least two returns.
            returns = (returns - returns.mean()) / (returns.std() + 1e-9)

        policy_loss = torch.cat([
            -log_prob.reshape(-1) * ret
            for log_prob, ret in zip(self.episode_log_probs, returns)
        ]).sum()

        self.optimizer.zero_grad()
        policy_loss.backward()
        self.optimizer.step()

        self.episode_rewards = []
        self.episode_log_probs = []

    def add_experience(self, reward, log_prob):
        """Store the reward and log-probability tensor of one step."""
        self.episode_rewards.append(reward)
        self.episode_log_probs.append(log_prob)

## 3. 主函数

In [45]:

def main():
    """Run the policy-gradient architecture search and print the results.

    For every episode a random starting encoding is sampled, the agent then
    proposes ``num_steps`` encodings in a row (each one fed back as the next
    state), every proposal is built and scored, and the policy is updated
    once at the end of the episode. The best-scoring encoding seen over all
    episodes is printed at the end.
    """
    # Search hyper-parameters.
    num_episodes, num_steps = 20, 50

    agent = PolicyGradient(
        state_size=len(spaces),
        action_size=6*3,
        lr=1e-3,
        gamma=0.99,
    )

    # Best candidate seen so far across all episodes.
    top_encoding, top_score = None, float('-inf')

    for episode in range(num_episodes):
        state = sample_encoding()
        print(f'Episode {episode}: Model encoding = {state}')

        episode_reward = 0
        for _step in range(num_steps):
            proposal, log_prob = agent.select_action(state)
            candidate = build_model(*proposal)
            score = evaluate_model(candidate)

            agent.add_experience(score, log_prob)
            episode_reward += score

            if top_score < score:
                top_encoding, top_score = proposal, score

            # The proposal becomes the state for the next step.
            state = proposal

        agent.update_policy()
        print(f'Episode {episode}: Average reward = {episode_reward / num_steps}')

    print("\nBest model encoding:", top_encoding)
    print("Best model performance:", top_score)
# Run the search when this cell is executed.
main()

Episode 0: Model encoding = [32, 16, 64, 7, 5, 3]
Episode 0: Average reward = 0.46390264987945556
Episode 1: Model encoding = [16, 64, 64, 7, 7, 7]
Episode 1: Average reward = 0.5476399421691894
Episode 2: Model encoding = [32, 32, 16, 5, 3, 7]
Episode 2: Average reward = 0.5930303144454956
Episode 3: Model encoding = [64, 16, 32, 7, 5, 7]
Episode 3: Average reward = 0.4120137870311737
Episode 4: Model encoding = [32, 16, 64, 5, 7, 3]
Episode 4: Average reward = 0.49418020963668824
Episode 5: Model encoding = [32, 32, 16, 5, 5, 3]
Episode 5: Average reward = 0.5506968915462493
Episode 6: Model encoding = [32, 32, 16, 3, 3, 7]
Episode 6: Average reward = 0.501968320608139
Episode 7: Model encoding = [16, 32, 32, 7, 3, 3]
Episode 7: Average reward = 0.49421594381332395
Episode 8: Model encoding = [32, 64, 32, 7, 7, 5]
Episode 8: Average reward = 0.43238434433937073
Episode 9: Model encoding = [64, 16, 64, 7, 3, 5]
Episode 9: Average reward = 0.5080550241470337
Episode 10: Model encoding 