## Обучение агента - ДЗ1 ##



## Подготовка к обучению ##

In [2]:
import gymnasium as gym
import numpy as np
import torch
from random import randint

# Shared CartPole-v1 environment; the training and evaluation cells below
# reference this module-level `env`.
env = gym.make('CartPole-v1')

In [3]:
from torch import nn

class PolicyModel(nn.Module):
    """Small fully connected policy network: 4 inputs -> 8 -> 4 -> 2 outputs.

    The two outputs are the raw result of the last linear layer, one value
    per action; no softmax is applied inside the module.
    """

    def __init__(self):
        super().__init__()
        layers = [
            nn.Linear(4, 8),  # 4 environment parameters taken from the observation
            nn.ReLU(),
            nn.Linear(8, 4),
            nn.ReLU(),
            nn.Linear(4, 2),  # one output per each of the two actions
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Pass `x` through the layer stack and return its output."""
        scores = self.model(x)
        return scores

In [4]:
from random import randint

def comlete_session(env, policy_model, eval=False, seed=None):
    """Play one full episode with `policy_model` and record the trajectory.

    Args:
        env: environment exposing `reset(seed=...)` -> (state, info) and
            `step(action)` -> (state, reward, terminated, truncated, info).
        policy_model: torch module mapping one observation to per-action
            scores (logits); it is switched to eval/train mode here.
        eval: if true, act greedily (argmax); otherwise sample the action
            from the softmax distribution.
        seed: seed passed to `env.reset`. When omitted (None) a fresh random
            seed in [0, 1000] is drawn on every call. (Previously the default
            was computed once at definition time, so all default calls shared
            the same seed.)

    Returns:
        Tuple of tensors, one entry per step of the episode:
        (states, actions, log-probabilities of the taken actions, rewards,
        full action distributions).
    """
    if seed is None:
        seed = randint(0, 1000)

    if eval:
        policy_model.eval()
    else:
        policy_model.train()

    state, info = env.reset(seed=seed)
    finished = False

    states, actions, action_log_probs, rewards, all_probs = [], [], [], [], []
    while not finished:
        # Pin the dtype so observations of any float width match the float32
        # weights of the policy network.
        state_tensor = torch.tensor(state, dtype=torch.float32)
        logits = policy_model(state_tensor)
        probs = torch.softmax(logits, dim=0)
        # log_softmax is numerically safer than log(softmax(...)): it never
        # produces -inf from an underflowed probability.
        log_probs = torch.log_softmax(logits, dim=0)
        all_probs.append(probs)

        if eval:
            action_i = int(probs.argmax())
        else:
            action_i = int(torch.multinomial(probs, num_samples=1))

        actions.append(action_i)
        action_log_probs.append(log_probs[action_i])
        states.append(state_tensor)

        state, reward, terminated, truncated, info = env.step(action_i)
        finished = bool(terminated or truncated)
        rewards.append(float(reward))

    return (
        torch.stack(states),
        torch.tensor(actions),
        torch.stack(action_log_probs),
        torch.tensor(rewards),
        torch.stack(all_probs),
    )



## REINFORCEMENT LEARNING ##

### Vanilla Policy Gradient ###

In [5]:
def TrainVanilaPG(model, N_epochs, lr, gamma, seed=0, entropy_regulirization=False):
    """Train `model` with the vanilla (REINFORCE) policy gradient.

    One episode is played per iteration and followed by one Adam step on
    the loss `-(G * log_prob).mean()`, where `G` is the discounted
    reward-to-go.

    Args:
        model: policy network; optimised in place and also returned.
        N_epochs: number of episodes (= optimiser steps).
        lr: Adam learning rate.
        gamma: discount factor for the reward-to-go.
        seed: environment seed. A falsy value (default 0) draws a fresh
            random seed for every episode; a non-zero value is reused for
            every episode, i.e. every episode is reset with the same seed.
        entropy_regulirization: if true, subtract 0.001 * mean policy
            entropy from the loss.

    Returns:
        `[model, metrics]` where `metrics` is an array with rows
        `[episode index, mean episode length over the last 100 episodes]`;
        the first row is `[0, 0]`.
    """
    PolicyVPG = model
    optimizer = torch.optim.Adam(PolicyVPG.parameters(), lr=lr)
    metrics = [[0, 0]]
    check = 100  # rolling-window size and logging period, in episodes
    ns = np.zeros(check)

    for ei in range(N_epochs):
        # Do not overwrite `seed`: doing so froze the "random" seed after
        # the first episode.
        episode_seed = seed if seed else randint(0, 1000)
        _, _, log_probs, rewards, all_probs = comlete_session(env, PolicyVPG, seed=episode_seed)

        # Discounted reward-to-go, accumulated from the last step backwards.
        n = rewards.size()[0]
        G = torch.zeros(n)
        running = 0.0
        for t in reversed(range(n)):
            running = rewards[t] + running * gamma
            G[t] = running

        loss = -(G * log_probs).mean()
        if entropy_regulirization:
            # Clamp inside the log so that an exactly-zero probability
            # contributes 0 instead of 0 * -inf = NaN.
            H = -(all_probs * all_probs.clamp_min(1e-8).log()).sum(dim=1)
            loss = loss - 0.001 * H.mean()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        ns[ei % check] = n
        if ei and ei % check == 0:
            metrics.append([ei, ns.mean()])
    return [PolicyVPG, np.array(metrics)]


In [56]:
# Vanilla policy-gradient run: 5000 episodes, environment seed fixed to 1.
N_epochs, gamma, lr = 5000, 0.99, 0.001
model = PolicyModel()
PolicyVPG, VPGmetrics = TrainVanilaPG(
    model,
    N_epochs=N_epochs,
    lr=lr,
    gamma=gamma,
    seed=1,
)

In [57]:
import plotly
import plotly.graph_objects as go

# Learning curve of the vanilla policy-gradient run
# (column 0 = episode index, column 1 = rolling mean episode length).
traceVPG = go.Scatter(
    name='Vanilla Policy Gradient',
    x=VPGmetrics[:, 0],
    y=VPGmetrics[:, 1],
)
stat = [traceVPG]
fig = go.Figure(data=stat)
fig.update_layout(
    legend=dict(xanchor='left', yanchor='top'),
    yaxis_title='Количество действий до падения',
    xaxis_title='Количество эпизодов',
)
fig.show()

---

### PG с бейзлайном: Средняя награда, наблюдаемая за время обучения. ###

In [6]:
def TrainPGwBM(model, N_epochs, lr, gamma, seed=0, entropy_regulirization=False):
    """Train `model` with policy gradient and a mean-return baseline.

    Same loop as the vanilla variant, but the mean of the episode's
    discounted reward-to-go values is subtracted from them before they
    weight the log-probabilities.

    Args:
        model: policy network; optimised in place and also returned.
        N_epochs: number of episodes (= optimiser steps).
        lr: Adam learning rate.
        gamma: discount factor for the reward-to-go.
        seed: environment seed. A falsy value (default 0) draws a fresh
            random seed for every episode; a non-zero value is reused for
            every episode.
        entropy_regulirization: if true, subtract 0.001 * mean policy
            entropy from the loss.

    Returns:
        `[model, metrics]` where `metrics` is an array with rows
        `[episode index, mean episode length over the last 100 episodes]`;
        the first row is `[0, 0]`.
    """
    PolicyPGBM = model
    optimizer = torch.optim.Adam(PolicyPGBM.parameters(), lr=lr)
    metrics = [[0, 0]]
    check = 100  # rolling-window size and logging period, in episodes
    ns = np.zeros(check)

    for ei in range(N_epochs):
        # Do not overwrite `seed`: doing so froze the "random" seed after
        # the first episode.
        episode_seed = seed if seed else randint(0, 1000)
        _, _, log_probs, rewards, all_probs = comlete_session(env, PolicyPGBM, seed=episode_seed)

        # Discounted reward-to-go, accumulated from the last step backwards.
        n = rewards.size()[0]
        G = torch.zeros(n)
        running = 0.0
        for t in reversed(range(n)):
            running = rewards[t] + running * gamma
            G[t] = running

        # Baseline: mean return observed within this episode.
        G = G - G.mean()

        loss = -(G * log_probs).mean()
        if entropy_regulirization:
            # Clamp inside the log so that an exactly-zero probability
            # contributes 0 instead of 0 * -inf = NaN.
            H = -(all_probs * all_probs.clamp_min(1e-8).log()).sum(dim=1)
            loss = loss - 0.001 * H.mean()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        ns[ei % check] = n
        if ei and ei % check == 0:
            metrics.append([ei, ns.mean()])
    return [PolicyPGBM, np.array(metrics)]


In [59]:
# Mean-baseline policy-gradient run: 5000 episodes, environment seed fixed to 1.
N_epochs, gamma, lr = 5000, 0.99, 0.001
model = PolicyModel()
PolicyPGBM, PGBMmetrics = TrainPGwBM(
    model,
    N_epochs=N_epochs,
    lr=lr,
    gamma=gamma,
    seed=1,
)

In [60]:
import plotly
import plotly.graph_objects as go

# Overlay the learning curves of the vanilla and the mean-baseline runs.
traceVPG = go.Scatter(
    name='Vanilla Policy Gradient',
    x=VPGmetrics[:, 0],
    y=VPGmetrics[:, 1],
)
tracePGBM = go.Scatter(
    name='PG with b | b = mean',
    x=PGBMmetrics[:, 0],
    y=PGBMmetrics[:, 1],
)
stat = [traceVPG, tracePGBM]
fig = go.Figure(data=stat)
fig.update_layout(
    legend=dict(xanchor='left', yanchor='top'),
    yaxis_title='Количество действий до падения',
    xaxis_title='Количество эпизодов',
)
fig.show()

### PG с бейзлайном: Value function ###

In [7]:
class ValueFunction(nn.Module):
    """Small fully connected network: 4 inputs -> 8 -> 4 -> 1 scalar output."""

    def __init__(self):
        super().__init__()
        layers = [
            nn.Linear(4, 8),
            nn.ReLU(),
            nn.Linear(8, 4),
            nn.ReLU(),
            nn.Linear(4, 1),  # single scalar output (labelled "reward" by the author)
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Pass `x` through the layer stack and return its output."""
        value = self.model(x)
        return value

In [8]:
def TrainPGwBVF(Policy, N_epochs, lr, gamma, seed=0, entropy_regulirization=False):
    """Train `Policy` with policy gradient and a learned value baseline.

    A fresh `ValueFunction` network is trained alongside the policy to
    regress the discounted reward-to-go `G`; the policy is updated with
    the advantage `G - V(s)`.

    Args:
        Policy: policy network; optimised in place and also returned.
        N_epochs: number of episodes (= optimiser steps per network).
        lr: Adam learning rate, used for both the policy and the value net.
        gamma: discount factor for the reward-to-go.
        seed: environment seed. A falsy value (default 0) draws a fresh
            random seed for every episode; a non-zero value is reused for
            every episode.
        entropy_regulirization: if true, subtract 0.001 * mean policy
            entropy from the policy loss.

    Returns:
        `[Policy, metrics]` where `metrics` is an array with rows
        `[episode index, mean episode length over the last 100 episodes]`;
        the first row is `[0, 0]`.
    """
    PolicyPGBVF = Policy
    optimizer_policy = torch.optim.Adam(PolicyPGBVF.parameters(), lr=lr)
    V = ValueFunction()
    optimizer_V = torch.optim.Adam(V.parameters(), lr=lr)

    metrics = [[0, 0]]
    check = 100  # rolling-window size and logging period, in episodes
    ns = np.zeros(check)

    for ei in range(N_epochs):
        # Do not overwrite `seed`: doing so froze the "random" seed after
        # the first episode.
        episode_seed = seed if seed else randint(0, 1000)
        states, _, log_probs, rewards, all_probs = comlete_session(env, PolicyPGBVF, seed=episode_seed)

        # Discounted reward-to-go, accumulated from the last step backwards.
        n = rewards.size()[0]
        G = torch.zeros(n)
        running = 0.0
        for t in reversed(range(n)):
            running = rewards[t] + running * gamma
            G[t] = running

        # V(states) has shape (n, 1) while G has shape (n,). Without the
        # squeeze, `G - b` broadcasts to an (n, n) matrix, which turns the
        # baseline into mean(V) and makes V regress every return from every
        # state.
        b = V(states).squeeze(-1)

        Adv = G - b.detach()  # detach: the policy loss must not train V
        loss_policy = -(Adv * log_probs).mean()
        if entropy_regulirization:
            # Clamp inside the log so that an exactly-zero probability
            # contributes 0 instead of 0 * -inf = NaN.
            H = -(all_probs * all_probs.clamp_min(1e-8).log()).sum(dim=1)
            loss_policy = loss_policy - 0.001 * H.mean()
        optimizer_policy.zero_grad()
        loss_policy.backward()
        optimizer_policy.step()

        # Value network: mean squared error against the observed returns.
        loss_V = ((G - b) ** 2).mean()
        optimizer_V.zero_grad()
        loss_V.backward()
        optimizer_V.step()

        ns[ei % check] = n
        if ei and ei % check == 0:
            metrics.append([ei, ns.mean()])
    return [PolicyPGBVF, np.array(metrics)]


In [63]:
# Value-function-baseline run: 5000 episodes, environment seed fixed to 1.
N_epochs, gamma, lr = 5000, 0.99, 0.001
model = PolicyModel()
PolicyPGBVF, PGBVFmetrics = TrainPGwBVF(
    model,
    N_epochs=N_epochs,
    lr=lr,
    gamma=gamma,
    seed=1,
)

In [64]:
import plotly
import plotly.graph_objects as go

# Overlay the learning curves of the three runs trained so far.
traceVPG = go.Scatter(
    name='Vanilla Policy Gradient',
    x=VPGmetrics[:, 0],
    y=VPGmetrics[:, 1],
)
tracePGBM = go.Scatter(
    name='PG with b | b = mean',
    x=PGBMmetrics[:, 0],
    y=PGBMmetrics[:, 1],
)
tracePGBVF = go.Scatter(
    name='PG with b | b = V(s)',
    x=PGBVFmetrics[:, 0],
    y=PGBVFmetrics[:, 1],
)
stat = [traceVPG, tracePGBM, tracePGBVF]
fig = go.Figure(data=stat)
fig.update_layout(
    legend=dict(xanchor='left', yanchor='top'),
    yaxis_title='Количество действий до падения',
    xaxis_title='Количество эпизодов',
)
fig.show()

### PG с бейзлайном: RLOO ###

In [9]:
def TrainPGwBRLOO(model, N_epochs, lr, gamma, seed=0, entropy_regulirization=False):
    """Train `model` with policy gradient and a leave-one-out (RLOO) baseline.

    Each iteration plays a batch of 10 episodes. The baseline for an
    episode is the mean total (undiscounted) reward of the other episodes
    in the batch; it is subtracted from that episode's discounted
    reward-to-go. One Adam step is taken on the batch-averaged loss.

    Args:
        model: policy network; optimised in place and also returned.
        N_epochs: number of optimiser steps (each uses 10 episodes).
        lr: Adam learning rate.
        gamma: discount factor for the reward-to-go.
        seed: environment seed. A falsy value (default 0) draws a fresh
            random seed for every episode; a non-zero value is reused for
            every episode, so all episodes are reset with the same seed.
        entropy_regulirization: if true, subtract 0.001 * mean policy
            entropy of each episode from the loss.

    Returns:
        `[model, metrics]` where `metrics` is an array with rows
        `[update index, rolling mean of the last episode length of each
        batch]`, logged every 10 updates; the first row is `[0, 0]`.
    """
    PolicyPGBRLOO = model
    optimizer = torch.optim.Adam(PolicyPGBRLOO.parameters(), lr=lr)

    batch_size = 10

    metrics = [[0, 0]]
    check = 100  # capacity of the rolling window of episode lengths
    ns = np.zeros(check)

    for ei in range(N_epochs):
        log_probsB, rewardsB, all_probsB, rewards_sums = [], [], [], []
        for _ in range(batch_size):
            # Do not overwrite `seed`: doing so froze the "random" seed
            # after the very first episode, so every episode of every batch
            # was reset identically.
            episode_seed = seed if seed else randint(0, 1000)
            _, _, log_probs, rewards, all_probs = comlete_session(env, PolicyPGBRLOO, seed=episode_seed)
            log_probsB.append(log_probs)
            all_probsB.append(all_probs)
            rewards_sums.append(rewards.sum())
            rewardsB.append(rewards)

        rewards_sum = sum(rewards_sums)
        loss = torch.tensor(0.0)
        for bi in range(batch_size):
            # Leave-one-out baseline: mean total reward of the other episodes.
            baseline = (rewards_sum - rewards_sums[bi]) / (batch_size - 1)

            # Discounted reward-to-go, accumulated from the last step backwards.
            n = rewardsB[bi].size()[0]
            G = torch.zeros(n)
            running = 0.0
            for t in reversed(range(n)):
                running = rewardsB[bi][t] + running * gamma
                G[t] = running
            G = G - baseline

            loss = loss - (G * log_probsB[bi]).mean()
            if entropy_regulirization:
                # Clamp inside the log so that an exactly-zero probability
                # contributes 0 instead of 0 * -inf = NaN.
                H = -(all_probsB[bi] * all_probsB[bi].clamp_min(1e-8).log()).sum(dim=1)
                loss = loss - 0.001 * H.mean()

        # `n` is the length of the last episode of the batch.
        ns[ei % check] = n
        if ei and ei % (check // batch_size) == 0:
            # Average only over the slots written so far; before the window
            # is full the untouched zeros would drag the mean down.
            filled = min(ei + 1, check)
            metrics.append([ei, ns[:filled].mean()])

        loss_mean = loss / batch_size
        optimizer.zero_grad()
        loss_mean.backward()
        optimizer.step()
    return [PolicyPGBRLOO, np.array(metrics)]

In [66]:
# RLOO-baseline run: 2000 updates, environment seed fixed to 1.
N_epochs, gamma, lr = 2000, 0.99, 0.001
model = PolicyModel()
PolicyPGBRLOO, PGBRLOOmetrics = TrainPGwBRLOO(
    model,
    N_epochs=N_epochs,
    lr=lr,
    gamma=gamma,
    seed=1,
)

In [67]:
import plotly
import plotly.graph_objects as go


# Overlay the learning curves of all four training variants.
traceVPG = go.Scatter(
    name='Vanilla Policy Gradient',
    x=VPGmetrics[:, 0],
    y=VPGmetrics[:, 1],
)
tracePGBM = go.Scatter(
    name='PG with b | b = mean',
    x=PGBMmetrics[:, 0],
    y=PGBMmetrics[:, 1],
)
tracePGBVF = go.Scatter(
    name='PG with b | b = V(s)',
    x=PGBVFmetrics[:, 0],
    y=PGBVFmetrics[:, 1],
)
tracePGBRLOO = go.Scatter(
    name='PG with b | b = RLOO',
    x=PGBRLOOmetrics[:, 0],
    y=PGBRLOOmetrics[:, 1],
)
stat = [traceVPG, tracePGBM, tracePGBVF, tracePGBRLOO]
fig = go.Figure(data=stat)
fig.update_layout(
    title='Сравнение всех моделей при gamma = 0.99, lr=0.001',
    legend=dict(xanchor='left', yanchor='top'),
    yaxis_title='Количество действий до падения',
    xaxis_title='Количество эпизодов',
)
fig.show()

### Регуляризация на энтропию и изменение гиперпараметров ###

In [10]:
# Registry of training routines keyed by the label used as the plot title
# in the comparison cells below.
model_dict = {
    'Vanilla Policy Gradient': TrainVanilaPG,
    'PG with b | b = mean': TrainPGwBM,
    'PG with b | b = V(s)': TrainPGwBVF,
    'PG with b | b = RLOO': TrainPGwBRLOO
}

### Сравнение поведения моделей с регуляризацией на энтропию и без ###

In [69]:
from plotly.subplots import make_subplots

N_epochs = 2001
print(f'Сравнение всех моделей при gamma = 0.99, lr=0.001 на первых {N_epochs-1} играх c регуляризацией на энтропию и без')

# For every algorithm train a fresh policy with and then without entropy
# regularisation and draw both learning curves on one figure.
for model_k, train_fn in model_dict.items():
    fig = go.Figure()
    for use_entropy, label in ((True, 'with Reg'), (False, 'without Reg')):
        model, metrics = train_fn(
            PolicyModel(), N_epochs, 0.001, 0.99,
            entropy_regulirization=use_entropy, seed=1,
        )
        fig.add_trace(go.Scatter(name=label, x=metrics[:, 0], y=metrics[:, 1]))

    fig.update_layout(
        title=model_k,
        legend=dict(xanchor='left', yanchor='top'),
        yaxis_title='Количество действий до падения',
        xaxis_title='Количество эпизодов',
    )
    fig.show()

Сравнение всех моделей при gamma = 0.99, lr=0.001 на первых 2000 играх c регуляризацией на энтропию и без


### Сравнение поведения моделей при изменении lr на ±20% от 0.001 ###

In [None]:
from plotly.subplots import make_subplots
import plotly
import plotly.graph_objects as go

N_epochs = 2001
print(f'Сравнение всех моделей при gamma = 0.99, lr = 0.001 +- 20% on первых {N_epochs-1} играх'.replace(' on ', ' на '))

# For every algorithm train a fresh policy at 80 %, 100 % and 120 % of the
# base learning rate and draw the three learning curves on one figure.
for model_k, train_fn in model_dict.items():
    fig = go.Figure()
    for lr in (0.001 * 0.8, 0.001, 0.001 * 1.2):
        model, metrics = train_fn(
            PolicyModel(), N_epochs, lr, 0.99,
            entropy_regulirization=False, seed=1,
        )
        fig.add_trace(go.Scatter(name=f'lr = {lr}', x=metrics[:, 0], y=metrics[:, 1]))

    fig.update_layout(
        title=model_k,
        legend=dict(xanchor='left', yanchor='top'),
        yaxis_title='Количество действий до падения',
        xaxis_title='Количество эпизодов',
    )
    fig.show()

Сравнение всех моделей при gamma = 0.99, lr = 0.001 +- 20% на первых 2000 играх


### Сравнение поведения моделей при gamma = 0.9, 0.99, 0.999 ###

In [11]:
from plotly.subplots import make_subplots
import plotly
import plotly.graph_objects as go

N_epochs = 2001
print(f'Сравнение всех моделей при lr = 0.001, gamma = 0.9, 0.99, 0.999 на первых {N_epochs-1} играх')

# For every algorithm train a fresh policy for each discount factor and
# draw the resulting learning curves on one figure.
for model_k, train_fn in model_dict.items():
    fig = go.Figure()
    for gamma in (0.9, 0.99, 0.999):
        model, metrics = train_fn(
            PolicyModel(), N_epochs, 0.001, gamma,
            entropy_regulirization=False, seed=1,
        )
        fig.add_trace(go.Scatter(name=f'gamma = {gamma}', x=metrics[:, 0], y=metrics[:, 1]))

    fig.update_layout(
        title=model_k,
        legend=dict(xanchor='left', yanchor='top'),
        yaxis_title='Количество действий до падения',
        xaxis_title='Количество эпизодов',
    )
    fig.show()

Сравнение всех моделей при lr = 0.001, gamma = 0.9, 0.99, 0.999 на первых 2000 играх


KeyboardInterrupt: 

## BEHAVIOUR CLONING ##

In [12]:
# Train the expert policy for behaviour cloning with the RLOO variant
# (default seed argument) and plot its learning curve.
N = 1000
expert, PGBVLROOmetrics = TrainPGwBRLOO(PolicyModel(), N_epochs=N, lr=0.001, gamma=0.99)

tracePGBRLOO = go.Scatter(
    name='PG with b | b = RLOO',
    x=PGBVLROOmetrics[:, 0],
    y=PGBVLROOmetrics[:, 1],
)
stat = [tracePGBRLOO]
fig = go.Figure(data=stat)
fig.show()

In [13]:
# Roll the greedy expert out for 2000 episodes and record one
# (state..., action) row per visited step in `stat`.
ns = []    # number of recorded steps per episode
stat = []  # recorded rows: the observation followed by the chosen action
expert.eval()
# Pure inference: disabling autograd avoids building a graph for every one
# of the forward passes.
with torch.no_grad():
    for ei in range(2000):
        state, info = env.reset(seed=randint(0, 100000))
        finished = False
        n = 0
        # Cap at 100 steps: in long games the same positions keep repeating.
        while not finished and n < 100:
            n += 1
            state = torch.tensor(state)
            logits = expert(state)  # raw scores; argmax equals the greedy action
            action = int(logits.argmax())
            res = torch.cat((state, torch.tensor([action])))
            stat.append(res)

            state, reward, terminated, truncated, info = env.step(action)
            finished = bool(terminated or truncated)
        ns.append(n)
print(sum(ns)/len(ns))




100.0


In [14]:
# Convert the list of recorded rows into one 2-D array so that the cells
# below can slice it by column (`stat[:, i]`).
stat = np.array(stat)

In [15]:
import plotly
import plotly.graph_objects as go

# One histogram per observation component of the recorded expert states.
# The labels live in a named list: reusing the enclosing quote character
# inside an f-string replacement field is a SyntaxError before Python 3.12.
feature_names = ['Cart Position', 'Cart Velocity', 'Pole Angle', 'Pole Angular Velocity']
for i, feature_name in enumerate(feature_names):
    fig = go.Figure()
    fig.add_trace(go.Histogram(x=stat[:, i]))
    fig.update_layout(
        legend={'xanchor': 'left', 'yanchor': 'top'},
        title=f'Distibution of {feature_name}'
    )
    fig.show()

In [16]:
# Training subsets for behaviour cloning: the full expert dataset plus
# four subsets, each restricted by a threshold on one state column.
cart_position = stat[:, 0]
cart_velocity = stat[:, 1]
pole_angle = stat[:, 2]
pole_angular_velocity = stat[:, 3]

DataSets = {
    'Include all states': stat[:],
    'Limit Cart Position': stat[np.abs(cart_position) < 0.05],
    'Limit Cart Velocity': stat[np.abs(cart_velocity) < 0.1],
    'Limit Pole Angulat Velocity': stat[np.abs(pole_angular_velocity + 0.05) < 0.1],
    'Limit Pole Angle': stat[np.abs(pole_angle) < 0.005],
}

In [17]:
# Report how many training rows each subset contains.
for keys, subset in DataSets.items():
    print(f'Для случая {keys} у нас будет {len(subset)} состояний для обучения')

Для случая Include all states у нас будет 200000 состояний для обучения
Для случая Limit Cart Position у нас будет 107980 состояний для обучения
Для случая Limit Cart Velocity у нас будет 89188 состояний для обучения
Для случая Limit Pole Angulat Velocity у нас будет 81058 состояний для обучения
Для случая Limit Pole Angle у нас будет 84237 состояний для обучения


### BC на всех парах ###

In [18]:
# Behaviour cloning: for every dataset variant, fit a fresh policy to the
# expert's actions, then measure how long it survives over 800 episodes.
for limit, dataset in DataSets.items():
    # Train: one Adam step per recorded (state, action) row, first 10 000 rows.
    model = PolicyModel()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    model.train()
    for item in dataset[:10_000]:
        state = torch.tensor(item[:4])
        action_e = int(item[-1])
        # Cross-entropy against the expert's one-hot action reduces to the
        # negative log-probability of that action; log_softmax is the
        # numerically stable form of log(softmax(...)).
        log_probs_m = torch.log_softmax(model(state), dim=0)
        loss = -log_probs_m[action_e]
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Test: greedy roll-outs. Results go into `lifetimes`, not `stat`, so
    # the expert dataset held in the global `stat` is not overwritten.
    model.eval()
    lifetimes = []
    with torch.no_grad():  # pure inference, no autograd graph needed
        for ei in range(800):
            state, info = env.reset(seed=randint(0, 100000))
            finished = False
            n = 0
            while not finished:
                n += 1
                logits = model(torch.tensor(state))
                action = int(logits.argmax())
                state, reward, terminated, truncated, info = env.step(action)
                finished = bool(terminated or truncated)
            lifetimes.append([ei, n])

    # Results: histogram of episode lengths for this dataset variant.
    lifetimes = np.array(lifetimes)
    fig = go.Figure()
    fig.add_trace(go.Histogram(x=lifetimes[:, 1], xbins={'size': 10}, name="Distill model Lifetime [0, 500]"))
    fig.update_layout(
        legend={'xanchor': 'left', 'yanchor': 'top'},
        title=f'Distill model Lifetime [0, 500] with {limit}'
    )
    fig.show()