In [1]:
import gymnasium as gym
from tqdm import tqdm
#from .autonotebook import tqdm as notebook_tqdm
import numpy as np
import torch as th
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
# from stable_baselines3 import PPO, A2C, SAC, TD3
# from stable_baselines3.common.evaluation import evaluate_policy
from statistics import mean

In [2]:
#define env
env_id = "CartPole-v1"
#env_id = "Acrobot-v1"
env = gym.make(env_id)

In [3]:
#define expert agent
ppo_expert = PPO('MlpPolicy', env_id, verbose=1, create_eval_env=True)

#train expert
ppo_expert.learn(total_timesteps=3e4, eval_freq=10000)

#save expert
ppo_expert.save("ppo_expert")

#evaluate expert
mean_reward, std_reward = evaluate_policy(ppo_expert, env, n_eval_episodes=10)
print(f"Mean reward expert agent= {mean_reward} +/- {std_reward}")

Using cpu device
Creating environment from the given name 'CartPole-v1'
Creating environment from the given name 'CartPole-v1'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


  ppo_expert = PPO('MlpPolicy', env_id, verbose=1, create_eval_env=True)
  ppo_expert.learn(total_timesteps=3e4, eval_freq=10000)


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 20.7     |
|    ep_rew_mean     | 20.7     |
| time/              |          |
|    fps             | 1470     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 24.4       |
|    ep_rew_mean          | 24.4       |
| time/                   |            |
|    fps                  | 998        |
|    iterations           | 2          |
|    time_elapsed         | 4          |
|    total_timesteps      | 4096       |
| train/                  |            |
|    approx_kl            | 0.00946904 |
|    clip_fraction        | 0.117      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.686     |
|    explained_variance   | -0.0142    |
|    learning_rate        | 0.0003     |
|   

New best mean reward!
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 150      |
|    ep_rew_mean     | 150      |
| time/              |          |
|    fps             | 725      |
|    iterations      | 10       |
|    time_elapsed    | 28       |
|    total_timesteps | 20480    |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 168          |
|    ep_rew_mean          | 168          |
| time/                   |              |
|    fps                  | 731          |
|    iterations           | 11           |
|    time_elapsed         | 30           |
|    total_timesteps      | 22528        |
| train/                  |              |
|    approx_kl            | 0.0056254724 |
|    clip_fraction        | 0.0453       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.549       |
|    explained_variance   | 0.736   



Mean reward expert agent= 500.0 +/- 0.0


In [4]:
##create expert dataset

#empty dataset
num_interactions = int(4e4)

expert_observations = np.empty((num_interactions,) + env.observation_space.shape)
expert_actions = np.empty((num_interactions,) + env.action_space.shape)

print(expert_observations.shape)
print(expert_actions.shape)

#collect experience usign expert policy
obs = env.reset()
for i in tqdm(range(num_interactions)):
    action, _ = ppo_expert.predict(obs, deterministic=True)
    expert_observations[i] = obs
    expert_actions[i] = action
    obs, reward, done, info = env.step(action)
    if done:
        obs = env.reset()
        


(40000, 4)
(40000,)


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 40000/40000 [00:22<00:00, 1784.40it/s]


In [5]:
#save dataset
np.savez_compressed(
   "expert_data",
   expert_actions=expert_actions,
   expert_observations=expert_observations,
   )

In [6]:
##dataset class
from torch.utils.data.dataset import Dataset, random_split

class ExpertDataSet(Dataset):

    def __init__(self, expert_observations, expert_actions):
        self.observations = expert_observations
        self.actions = expert_actions

    def __getitem__(self, index):
        return (self.observations[index], self.actions[index])

    def __len__(self):
        return len(self.observations)

In [7]:
expert_dataset = ExpertDataSet(expert_observations, expert_actions)

#split in 80% training and 20%test
batch_size = 64
train_prop = 0.8
train_size = int(train_prop * len(expert_dataset))
test_size = len(expert_dataset) - train_size
train_expert_dataset, test_expert_dataset = random_split(expert_dataset, [train_size, test_size])

train_loader = th.utils.data.DataLoader(  dataset=train_expert_dataset, batch_size=batch_size, shuffle=True)
test_loader = th.utils.data.DataLoader(  dataset=test_expert_dataset, batch_size=batch_size, shuffle=True)

In [8]:
###### Define student agent
no_cuda = False
use_cuda = not no_cuda and th.cuda.is_available()
   
device = th.device("cuda" if use_cuda else "cpu")

class StudentAgent:
    def __init__(self, env, train_loader, test_loader, learning_rate):
        self.env = env
        self.train_loader = train_loader
        self.test_loader = test_loader
        
        n_inputs = env.observation_space.shape[0]
        n_outputs = env.action_space.n
        
        self.policy = nn.Sequential(
            nn.Linear(n_inputs, 16), 
            nn.ReLU(), 
            nn.Linear(16, n_outputs),
            nn.Softmax(dim=-1))
        
        print("policy net: ", self.policy)
        
        self.loss_criterion = nn.CrossEntropyLoss()
        
        self.optimizer =  optim.Adam(self.policy.parameters(), lr=learning_rate)
        
        self.num_eval_episodes = 10
        
    def train(self, num_epochs):
        self.policy.train()
        self.policy.to(device)
        for epoch in range(num_epochs):
            for batch_idx, (data, target) in enumerate(train_loader):
                obs, expert_action = data.to(device), target.to(device)
                self.optimizer.zero_grad()
                obs = obs.float()
                student_action = self.policy(obs)
                expert_action = expert_action.long()
                loss = self.loss_criterion(student_action, expert_action)
                loss.backward()
                self.optimizer.step()
            #compute accuracy
            train_acc = self.compute_accuracy(self.train_loader)
            test_acc = self.compute_accuracy(self.test_loader)
            policy_return = self.evaluate_policy(self.num_eval_episodes)
            print("Epoch {}:\ttrain accuracy: {}\ttest accuracy: {}\tpolicy return:{}".format(epoch, train_acc, test_acc, policy_return))

    def compute_accuracy(self, loader):
        total = 0
        correct = 0
        
        self.policy.eval()
        test_loss = 0
        with th.no_grad():
            for data, target in test_loader:
                obs, expert_action = data.to(device), target.to(device)
                obs = obs.float()
            
                student_action = self.policy_action(obs)
            
                total += student_action.size()[0]
                correct += sum(student_action==expert_action).item()
            
        accuracy = 100. * correct/(float)(total)
            
        return accuracy
            
        
    
    def policy_action(self, obs):
        policy_act = self.policy(obs)
        return th.argmax(policy_act, dim= 1)
        
    def evaluate_policy(self, num_episodes, render=False):
        rewards = []
        for ep in range(num_episodes):
            done = False
            tot_rew = 0
            obs = self.env.reset()

            while not done:
                obs = th.FloatTensor(obs).unsqueeze(0)
                action = self.policy_action(obs)
                obs, reward, done, info = env.step(action.item())
                if render:
                    env.render()
                tot_rew += reward
            rewards.append(tot_rew)
        return mean(rewards)
    

            

    

In [9]:
student = StudentAgent(env, train_loader, test_loader, 0.01)
student.train(50)

policy net:  Sequential(
  (0): Linear(in_features=4, out_features=16, bias=True)
  (1): ReLU()
  (2): Linear(in_features=16, out_features=2, bias=True)
  (3): Softmax(dim=-1)
)
Epoch 0:	train accuracy: 92.675	test accuracy: 92.675	policy return:500.0
Epoch 1:	train accuracy: 94.975	test accuracy: 94.975	policy return:500.0
Epoch 2:	train accuracy: 95.55	test accuracy: 95.55	policy return:500.0
Epoch 3:	train accuracy: 96.3875	test accuracy: 96.3875	policy return:500.0
Epoch 4:	train accuracy: 96.5875	test accuracy: 96.5875	policy return:500.0
Epoch 5:	train accuracy: 96.7125	test accuracy: 96.7125	policy return:500.0
Epoch 6:	train accuracy: 96.7375	test accuracy: 96.7375	policy return:500.0
Epoch 7:	train accuracy: 97.7375	test accuracy: 97.7375	policy return:500.0
Epoch 8:	train accuracy: 97.4375	test accuracy: 97.4375	policy return:500.0
Epoch 9:	train accuracy: 97.825	test accuracy: 97.825	policy return:500.0
Epoch 10:	train accuracy: 97.725	test accuracy: 97.725	policy return:500