# Boilerplate

In [1]:
import torch as t
import torch.nn as nn

In [None]:
# Run on CUDA when torch can see a GPU; fall back to the CPU otherwise.

if t.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")


# Federated Averaging

Federated Averaging (`FedAvg` or `FederatedAveraging`) is an algorithm[1] for distributed training of deep learning models, targeting mostly memory- (although also compute- and network-) constrained devices.

The algorithm can be summarized as taking a random group of clients, training a minibatch on each of them, and then averaging the results at the central server. Then, the weights are distributed out to the client devices and the process begins again.

[1] H. B. McMahan, E. Moore, D. Ramage, S. Hampson, and B. Agüera y Arcas,
“Communication-Efficient Learning of Deep Networks from Decentralized
Data,” arXiv:1602.05629 [cs], Feb. 2017, arXiv: 1602.05629. [Online].
Available: http://arxiv.org/abs/1602.05629

In [None]:
def make_models(model_class,n):
    '''
    Build `n` independent models.

    `model_class` is called with no arguments once per model, so every
    returned element is a distinct object.
    '''
    built = []
    for _ in range(n):
        built.append(model_class())
    return built

def get_params_data(m):
    '''
    Snapshot the parameter values of model `m`.

    Returns a list holding a clone of each parameter's `.data`, in the order
    produced by `m.parameters()`. The clones do not share storage with the
    model, so editing them leaves the model untouched.
    '''
    snapshot = []
    for param in m.parameters():
        snapshot.append(param.data.clone())
    return snapshot

def avg_weights(ms):
    '''
    Average the weights of a list of models.

    All models must have exactly the same parameter shapes, and matching
    parameters must live on the same device.

    Returns a list with one tensor per parameter (in `parameters()` order),
    each the element-wise mean of that parameter across `ms`. The returned
    tensors are freshly allocated, detached from autograd, and stay on the
    device of the input parameters. An empty `ms` yields an empty list.
    '''
    # avg_weights([]) == []
    if len(ms) == 0:
        return []
    # NOTE: no local variable here may be called `t`; that would shadow the
    # `torch as t` alias for the whole function body.
    with t.no_grad():
        return [
            # Stack the i-th parameter of every model and average over models.
            t.stack([p.data for p in group]).mean(dim=0)
            for group in zip(*(m.parameters() for m in ms))
        ]


def apply_weights(w, m):
    '''
    Overwrite the parameters of `m` with the tensors in `w`.

    `w` is paired with `m.parameters()` positionally. Each parameter receives
    its own clone, so `m` never shares storage with `w`.
    '''
    model_params = m.parameters()
    for param, new_value in zip(model_params, w):
        fresh_copy = new_value.clone()
        param.data = fresh_copy

def federated_avg(representative_model, models):
    '''
    Complete one round of Federated Averaging.

    The parameters of `models` are averaged and the result is written into
    `representative_model` in place. Nothing is returned.
    '''
    apply_weights(avg_weights(models), representative_model)

# Q Learning Environment Setup

Typical environment setup similar to any tutorial on solving simple Q learning tasks.

A simple Q learning task is used because it allows for quick evaluation without long training times.

Since I'm experimenting not with new _architectures_, but new ways of _training models_, this saves me a lot of time.

In [None]:
import gym
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple, deque
from itertools import count
from PIL import Image

import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

In [None]:
# Use the older CartPole-v0 task: it is known to work with the gym version
# this notebook was written against and is sufficient for these experiments.
# The raw (`.unwrapped`) environment is kept because get_cart_location()
# reads `env.state` and `env.x_threshold` directly from it.
env = gym.make('CartPole-v0').unwrapped

# Set up matplotlib. IPython's display helpers are imported only when an
# inline backend is active; plot_durations() uses them to redraw in place.
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

# Interactive mode, so figures update while training is running.
plt.ion()

# Use the GPU if one is available. This rebinds `device`, which was set to a
# plain string earlier in the file, to a torch.device object.
device = t.device("cuda" if t.cuda.is_available() else "cpu")

In [None]:
# One step of experience: (state, action, next_state, reward).
Transition = namedtuple(
    'Transition',
    ['state', 'action', 'next_state', 'reward'],
)


class ReplayMemory:
    """Fixed-capacity buffer of Transitions; the oldest entries are dropped first."""

    def __init__(self, capacity):
        # A bounded deque discards from the left once `capacity` is reached.
        self.memory = deque(maxlen=capacity)

    def push(self, *args):
        """Save a transition"""
        record = Transition(*args)
        self.memory.append(record)

    def sample(self, batch_size):
        """Return `batch_size` stored transitions drawn without replacement."""
        return random.sample(self.memory, batch_size)

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.memory)

In [None]:
class DQN(nn.Module):
    """Three conv + batch-norm stages followed by a linear head with one output per action."""

    def __init__(self, h, w, outputs):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 16, kernel_size=5, stride=2)
        self.bn1 = nn.BatchNorm2d(16)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=5, stride=2)
        self.bn2 = nn.BatchNorm2d(32)
        self.conv3 = nn.Conv2d(32, 32, kernel_size=5, stride=2)
        self.bn3 = nn.BatchNorm2d(32)

        # The linear layer's input size depends on the spatial size left after
        # the three unpadded convolutions, which in turn depends on the input
        # image size, so derive it from h and w.
        def conv2d_size_out(size, kernel_size = 5, stride = 2):
            return (size - kernel_size) // stride + 1

        feature_w, feature_h = w, h
        for _ in range(3):
            feature_w = conv2d_size_out(feature_w)
            feature_h = conv2d_size_out(feature_h)
        self.head = nn.Linear(feature_w * feature_h * 32, outputs)

    # Called with either a single element (to pick the next action) or a
    # batch (during optimization). Returns one row of action values per input.
    def forward(self, x):
        x = x.to(device)
        stages = (
            (self.conv1, self.bn1),
            (self.conv2, self.bn2),
            (self.conv3, self.bn3),
        )
        for conv, norm in stages:
            x = F.relu(norm(conv(x)))
        flattened = x.view(x.size(0), -1)
        return self.head(flattened)

In [None]:
# Frame preprocessing: tensor -> PIL image -> resize so the shorter side is
# 40 px (bicubic) -> tensor.
# Image.BICUBIC is used instead of the legacy alias Image.CUBIC, which newer
# Pillow releases no longer provide; both name the same bicubic filter.
resize = T.Compose([T.ToPILImage(),
                    T.Resize(40, interpolation=Image.BICUBIC),
                    T.ToTensor()])


def get_cart_location(screen_width):
    '''
    Return the horizontal pixel position of the middle of the cart.

    The cart's world x-coordinate (`env.state[0]`) is scaled so that a world
    width of `2 * env.x_threshold` spans `screen_width` pixels, then shifted
    so world x = 0 lands on the screen centre.
    '''
    pixels_per_unit = screen_width / (env.x_threshold * 2)
    cart_x = env.state[0]
    return int(cart_x * pixels_per_unit + screen_width / 2.0)  # MIDDLE OF CART

def get_screen():
    '''
    Render the environment and return a cropped, resized frame.

    The frame is cropped vertically to the 40%-80% band and horizontally to a
    window 60% of the frame width that follows the cart. It is then scaled to
    [0, 1] floats, passed through `resize`, and given a leading batch
    dimension (BCHW).
    '''
    # gym returns an HWC array (e.g. 400x600x3, sometimes larger such as
    # 800x1200x3); move channels first to get torch order (CHW).
    frame = env.render(mode='rgb_array').transpose((2, 0, 1))
    _, frame_h, frame_w = frame.shape

    # The cart sits in the lower half, so drop the top and bottom bands.
    top = int(frame_h*0.4)
    bottom = int(frame_h * 0.8)
    frame = frame[:, top:bottom]

    view_width = int(frame_w * 0.6)
    half_view = view_width // 2
    cart_x = get_cart_location(frame_w)
    if cart_x < half_view:
        # Cart near the left edge: pin the window to the left border.
        columns = slice(view_width)
    elif cart_x > (frame_w - half_view):
        # Cart near the right edge: pin the window to the right border.
        columns = slice(-view_width, None)
    else:
        # Otherwise centre the window on the cart.
        columns = slice(cart_x - half_view, cart_x + half_view)
    frame = frame[:, :, columns]

    # Convert to float32 in [0, 1], then wrap as a torch tensor.
    frame = np.ascontiguousarray(frame, dtype=np.float32) / 255
    as_tensor = t.from_numpy(frame)
    # Resize, and add a batch dimension (BCHW).
    return resize(as_tensor).unsqueeze(0)


# Sanity check: reset the environment, grab one preprocessed frame and show it.
env.reset()
plt.figure()
# get_screen() returns a 1xCxHxW tensor; drop the batch dimension and move
# channels last (HWC), which is the layout imshow expects.
plt.imshow(get_screen().cpu().squeeze(0).permute(1, 2, 0).numpy(),
           interpolation='none')
plt.title('Example extracted screen')
plt.show()

## Parameters

A number of global variables are defined here. This is not ideal, so if you come back to modify this code, feel free to refactor them into function parameters or a configuration object.

Most of them are self-explanatory. `GAMMA` is $\gamma$, the discount factor. `EPS_START` is the initial value of $\epsilon$, the exploration probability used for $\epsilon$-greedy action selection; `EPS_END` is the value it decays towards, and `EPS_DECAY` controls how quickly it decays.

`n_actions` is the number of actions available to the agent, which we take directly from `gym`.

In [None]:
# NOTE(review): BATCH_SIZE is not referenced by the code visible in this file;
# the training functions take `local_batch_size` instead.
BATCH_SIZE = 128
# Discount factor applied to next-state values in optimize_model().
GAMMA = 0.999
# Epsilon schedule used by select_action(): the threshold starts at EPS_START
# and decays exponentially towards EPS_END with time constant EPS_DECAY steps.
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 200
# do_episodes() copies policy_net into target_net whenever the episode index
# is a multiple of this value.
TARGET_UPDATE = 10

# Get screen size so that we can initialize layers correctly based on shape
# returned from AI gym. Typical dimensions at this point are close to 3x40x90
# which is the result of a clamped and down-scaled render buffer in get_screen()
init_screen = get_screen()
_, _, screen_height, screen_width = init_screen.shape

# Get number of actions from gym action space
n_actions = env.action_space.n

# Single-agent setup kept for reference only; the federated setup later in
# the file builds one (policy, target, optimizer, memory) tuple per client.
# policy_net = DQN(screen_height, screen_width, n_actions).to(device)
# target_net = DQN(screen_height, screen_width, n_actions).to(device)
# target_net.load_state_dict(policy_net.state_dict())
# target_net.eval()
# 
# optimizer = optim.RMSprop(policy_net.parameters())
# memory = ReplayMemory(10000)


def select_action(policy_net, state, steps_done):
    '''
    Pick an action epsilon-greedily.

    With probability `eps_threshold` a uniformly random action is returned;
    otherwise the action with the largest value predicted by `policy_net`
    for `state` is returned. The threshold decays exponentially from
    EPS_START towards EPS_END as `steps_done` grows (time constant EPS_DECAY).

    `steps_done` is only read. The caller owns the step counter and is
    responsible for advancing it between calls.

    Returns a 1x1 tensor holding the chosen action index.
    '''
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
        math.exp(-1. * steps_done / EPS_DECAY)
    if random.random() > eps_threshold:
        with t.no_grad():
            # max(1) returns (values, indices) along the action dimension;
            # the indices identify the action with the largest predicted
            # value, reshaped to 1x1.
            return policy_net(state).max(1)[1].view(1, 1)
    return t.tensor([[random.randrange(n_actions)]], device=device, dtype=t.long)


# Length (in steps) of every finished episode, appended to by do_episodes().
episode_durations = []


def plot_durations():
    '''
    Redraw figure 2 with the per-episode durations recorded so far.

    Once at least 100 episodes exist, a 100-episode moving average is drawn
    as well, left-padded with 99 zeros so it lines up with the raw curve.
    '''
    plt.figure(2)
    plt.clf()
    durations = t.tensor(episode_durations, dtype=t.float)
    plt.title('Training...')
    plt.xlabel('Episode')
    plt.ylabel('Duration')
    plt.plot(durations.numpy())

    if len(durations) >= 100:
        rolling_mean = durations.unfold(0, 100, 1).mean(1).view(-1)
        padded = t.cat((t.zeros(99), rolling_mean))
        plt.plot(padded.numpy())

    plt.pause(0.001)  # brief pause so that the plot is refreshed
    if not is_ipython:
        return
    display.clear_output(wait=True)
    display.display(plt.gcf())

## Training loop (`optimize_model`)



In [None]:

def optimize_model(policy_net: nn.Module, target_net: nn.Module, optimizer: optim.Optimizer, local_batch_size: int, memory: ReplayMemory) -> None:
    '''
    Perform one optimization step on `policy_net` from replayed experience.

    A batch of `local_batch_size` transitions is sampled from `memory` (the
    replay buffer belonging to this agent) and used for a single Huber-loss
    gradient step. If the buffer holds fewer transitions than that, the call
    returns without doing anything.

    Only `policy_net` is updated (through `optimizer`). `target_net` is used
    solely for a forward pass, without gradient tracking, to bootstrap the
    next-state values; its weights are not changed here.
    '''
    if len(memory) < local_batch_size:
        return

    transitions = memory.sample(local_batch_size)

    # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
    # detailed explanation). This converts batch-array of Transitions
    # to Transition of batch-arrays.
    batch = Transition(*zip(*transitions))

    # A transition is final when its next_state is None (the episode ended
    # right after it); such entries contribute no bootstrapped value.
    non_final_mask = t.tensor([s is not None for s in batch.next_state],
                              device=device, dtype=t.bool)
    non_final_next_states = [s for s in batch.next_state if s is not None]
    state_batch = t.cat(batch.state)
    action_batch = t.cat(batch.action)
    reward_batch = t.cat(batch.reward)

    # Compute Q(s_t, a): the model computes Q(s_t) for every action, then we
    # pick the column of the action that was actually taken.
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # Compute V(s_{t+1}) = max_a Q_target(s_{t+1}, a) for non-final next
    # states; final states keep the value 0.
    next_state_values = t.zeros(local_batch_size, device=device)
    # Guard: t.cat() raises on an empty list, which would happen if every
    # sampled transition were final.
    if non_final_next_states:
        with t.no_grad():
            next_state_values[non_final_mask] = \
                target_net(t.cat(non_final_next_states)).max(1)[0]
    # Compute the expected Q values
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Compute Huber loss
    criterion = nn.SmoothL1Loss()
    loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    # Clamp every gradient element to [-1, 1] before the update.
    nn.utils.clip_grad_value_(policy_net.parameters(), 1)
    optimizer.step()

In [None]:
def do_episodes(policy_net: nn.Module, target_net: nn.Module, optimizer: optim.Optimizer, memory: ReplayMemory, episodes: int, local_batch_size: int) -> None:
    '''
    Run `episodes` training episodes for one agent.

    Each step selects an action with `select_action`, steps the shared global
    `env`, stores the transition in `memory`, and calls `optimize_model` once.
    A state is the difference between two consecutive frames from
    `get_screen()`. When an episode ends its length is appended to the global
    `episode_durations` and the duration plot is refreshed.

    `target_net` is overwritten with `policy_net`'s state dict whenever the
    episode index is a multiple of TARGET_UPDATE (including episode 0).
    '''
    num_episodes = episodes
    for i_episode in range(num_episodes):
        # Initialize the environment and state
        env.reset()
        last_screen = get_screen()
        current_screen = get_screen()
        state = current_screen - last_screen
        for timestep in count():
            # Select and perform an action
            # NOTE(review): `timestep` restarts at 0 every episode, so the
            # epsilon schedule in select_action() restarts each episode as
            # well. Confirm whether a counter that persists across episodes
            # was intended.
            action = select_action(policy_net, state, timestep)
            # The step result is unpacked as a 4-tuple; only reward and the
            # done flag are used.
            _, reward, done, _ = env.step(action.item())
            reward = t.tensor([reward], device=device)

            # Observe new state
            last_screen = current_screen
            current_screen = get_screen()
            if not done:
                next_state = current_screen - last_screen
            else:
                # None marks a final transition; optimize_model() masks these.
                next_state = None

            # Store the transition in memory
            memory.push(state, action, next_state, reward)

            # Move to the next state
            state = next_state

            # Perform one step of the optimization (on the policy network)
            optimize_model(policy_net, target_net, optimizer, local_batch_size, memory)
            if done:
                episode_durations.append(timestep + 1)
                plot_durations()
                break
        # Update the target network, copying all weights and biases in DQN
        if i_episode % TARGET_UPDATE == 0:
            target_net.load_state_dict(policy_net.state_dict())

# env.render()
# env.close()
#plt.ioff()
#plt.show()

# policy_net = DQN(screen_height, screen_width, n_actions).to(device)
# target_net = DQN(screen_height, screen_width, n_actions).to(device)
# target_net.load_state_dict(policy_net.state_dict())
# target_net.eval()
# 
# optimizer = optim.RMSprop(policy_net.parameters())
# memory = ReplayMemory(10000)

NUM_MODELS = 10         # total number of simulated clients
C = 4                   # number of clients sampled in each round
MEMORY_SIZE = 10000     # replay-buffer capacity per client
ROUNDS = 100            # number of federated rounds
EPISODES = 10           # local episodes each sampled client trains per round
LOCAL_BATCH_SIZE = 128  # replay batch size for each local optimization step

# Set up one (policy_net, target_net, optimizer, memory) tuple per client.
models = []
for _ in range(NUM_MODELS):
    policy_net = DQN(screen_height, screen_width, n_actions).to(device)
    target_net = DQN(screen_height, screen_width, n_actions).to(device)
    # The target network starts as a copy of the policy network and stays in
    # eval mode; it is only ever used for forward passes.
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()
    models.append((policy_net, target_net,
                   optim.RMSprop(policy_net.parameters()),
                   ReplayMemory(MEMORY_SIZE)))

for round_idx in range(ROUNDS):
    # Pick this round's participating clients.
    local_models = random.sample(models, k=C)

    # Each participant trains locally for EPISODES episodes.
    for policy_net, target_net, optimizer, agent_memory in local_models:
        do_episodes(policy_net, target_net, optimizer, agent_memory, EPISODES, LOCAL_BATCH_SIZE)

    # Server step: once every participant has trained, average the weights of
    # the clients that took part in this round.
    # NOTE(review): only tensors returned by `parameters()` are averaged;
    # BatchNorm running statistics are buffers and so presumably stay
    # per-client. Confirm that this is acceptable.
    policy_ws = avg_weights([policy for policy, _, _, _ in local_models])
    target_ws = avg_weights([target for _, target, _, _ in local_models])

    # Distribute the averaged weights to every client.
    with t.no_grad():
        for policy_net, target_net, _, _ in models:
            apply_weights(policy_ws, policy_net)
            apply_weights(target_ws, target_net)

In [None]:
# Render one last time, then close the environment now that training is done.
env.render()
env.close()