In [1]:
# coding: utf-8
import gym
import torch.utils.data as data
from dynamics import *
from controller import *
from utils import *
from quanser_robots.common import GentlyTerminating

# Data layout used by the dataset helpers in this notebook:
#   datasets: numpy array, shape [number of samples, input dimension]
#   labels:   numpy array, shape [number of samples, output dimension]

# Environment id handed to gym.make; "CartPole-v0" is the alternative the author left here.
env_id ="Qube-v0" # "CartPole-v0"
# The raw gym environment is wrapped in quanser_robots' GentlyTerminating wrapper.
env = GentlyTerminating(gym.make(env_id))
# Experiment settings are read from a YAML file next to the notebook.
# NOTE(review): load_config is not imported by name; presumably it comes from one of the star imports above.
config_path = "config.yml"
config = load_config(config_path)
# Uncomment to dump the loaded configuration.
#print_config(config_path)

In [2]:
# Dynamics model built from the loaded configuration.
# NOTE(review): DynamicModel is a project class (presumably from `dynamics`); its internals are not shown here.
model = DynamicModel(config)

# Dataset helper bound to the environment. Going by the DatasetFactory class listed at the
# end of this notebook, collect_random_dataset() rolls out randomly sampled actions and
# fills data_fac.random_trainset / data_fac.random_testset, which the next cell trains on.
data_fac = DatasetFactory(env,config)
data_fac.collect_random_dataset()

  if datasets == None:
  if labels == None:


Collect random dataset shape:  (4329, 7)


In [3]:
# First fit of the dynamics model, using the train/test split of the random rollouts.
# NOTE(review): `train` is a project method; what it returns in `loss` is not visible here - confirm.
loss = model.train(data_fac.random_trainset,data_fac.random_testset)

Total training step per epoch [55]
Epoch [50/500], Training Loss: 0.06064808, Test Loss: 0.06149180
Epoch [100/500], Training Loss: 0.05886089, Test Loss: 0.05924280
Epoch [150/500], Training Loss: 0.05818230, Test Loss: 0.06072282
Epoch [200/500], Training Loss: 0.05725430, Test Loss: 0.05917601
Epoch [250/500], Training Loss: 0.05566457, Test Loss: 0.05872102
Epoch [300/500], Training Loss: 0.05542131, Test Loss: 0.05889234
Epoch [350/500], Training Loss: 0.05565484, Test Loss: 0.06071570
Epoch [400/500], Training Loss: 0.05715850, Test Loss: 0.06050992
Epoch [450/500], Training Loss: 0.05410156, Test Loss: 0.06346160
Epoch [500/500], Training Loss: 0.05326184, Test Loss: 0.06036323


In [4]:
# Override the MPC settings from the config file for this run, then build the controller.
mpc_overrides = (
    ("horizon", 15),
    ("numb_bees", 8),
    ("max_itrs", 20),
    ("gamma", 0.95),
)
for option_name, option_value in mpc_overrides:
    config["mpc_config"][option_name] = option_value
mpc = MPC(env, config)

In [None]:
# Alternate between collecting MPC-driven episodes and refitting the dynamics model.
# rewards_list accumulates the per-episode rewards of every iteration so far.
rewards_list = []
n_mpc_iterations = config["dataset_config"]["n_mpc_itrs"]
for itr in range(n_mpc_iterations):
    print("Begin the reinforce process [%s], collecting data ..." % itr)
    rewards = data_fac.collect_mpc_dataset(mpc, model)
    trainset, testset = data_fac.make_dataset()
    rewards_list.extend(rewards)

    # Redraw the reward curve from scratch and overwrite the saved image.
    plt.close("all")
    fig, ax = plt.subplots(figsize=(12, 5))
    ax.set_title('Reward Trend with %s iteration' % itr)
    ax.plot(rewards_list)
    figure_path = "storage/reward-" + str(model.exp_number) + ".png"
    fig.savefig(figure_path)

    # Refit the model on the dataset assembled for this iteration.
    loss = model.train(trainset, testset)

Begin the [0] reinforce process, collecting data ...


  self.Variable = lambda *args, **kwargs: autograd.Variable(*args, **kwargs).cuda()


In [None]:

class DatasetFactory(object):
    """Collect transition data from a gym-style environment and build train/test sets.

    Each sample row is ``[state, action]`` concatenated; its label is the state
    change ``next_state - state`` observed after applying that action.
    Datasets are stored as dictionaries ``{"data": ndarray, "label": ndarray}``
    whose arrays have one row per sample.
    """

    def __init__(self, env, config):
        """Read the ``"dataset_config"`` section of ``config`` and reset all buffers.

        Args:
            env: environment providing ``reset()``, ``step(action)`` (returning
                ``state, reward, done, info``) and ``action_space.sample()``.
            config: mapping with a ``"dataset_config"`` entry holding the keys
                read below.
        """
        self.env = env
        dataset_config = config["dataset_config"]
        self.load_flag = dataset_config["load_flag"]
        self.load_path = dataset_config["load_path"]
        self.n_max_steps = dataset_config["n_max_steps"]
        self.n_random_episodes = dataset_config["n_random_episodes"]
        self.testset_split = dataset_config["testset_split"]
        self.n_mpc_episodes = dataset_config["n_mpc_episodes"]
        self.mpc_dataset_split = dataset_config["mpc_dataset_split"]
        self.n_mpc_itrs = dataset_config["n_mpc_itrs"]
        self.save_flag = dataset_config["save_flag"]
        self.save_path = dataset_config["save_path"]
        self.random_dataset = None
        self.random_trainset = None
        self.random_testset = None
        self.mpc_dataset = None
        self.mpc_dataset_len = 0
        self.trainset = None
        # Pool of every sample gathered so far; optionally seeded from disk.
        if self.load_flag:
            self.all_dataset = self.load_dataset()
        else:
            self.all_dataset = None

    def _rollout(self, choose_action):
        """Run one episode of at most ``n_max_steps`` steps.

        Args:
            choose_action: callable mapping the current state to the action
                array that is both stored in the sample and passed to the env.

        Returns:
            Tuple ``(inputs, deltas, total_reward)``: the stacked
            ``[state, action]`` rows, the stacked state differences and the
            sum of the rewards returned by the environment.
        """
        inputs = []
        deltas = []
        total_reward = 0
        state_old = self.env.reset()
        for _step in range(self.n_max_steps):
            action = choose_action(state_old)
            inputs.append(np.concatenate((state_old, action)))
            state_new, reward, done, _info = self.env.step(action)
            total_reward += reward
            deltas.append(state_new - state_old)
            if done:
                break
            state_old = state_new
        return np.array(inputs), np.array(deltas), total_reward

    # numpy array, collect n_random_episodes data with maximum n_max_steps steps per episode
    def collect_random_dataset(self):
        """Collect episodes driven by randomly sampled actions.

        Fills ``random_dataset`` (unshuffled), ``random_trainset`` and
        ``random_testset`` (a shuffled split, the first ``testset_split``
        fraction being the test set) and resets ``all_dataset`` to the
        collected random data.
        """
        # Episodes are gathered in lists and concatenated once. The previous
        # `datasets == None` test compared a NumPy array with None, which is an
        # elementwise comparison and not a valid truth value.
        episode_inputs = []
        episode_deltas = []
        for _episode in range(self.n_random_episodes):
            inputs, deltas, _reward = self._rollout(
                lambda state: self.env.action_space.sample())
            episode_inputs.append(inputs)
            episode_deltas.append(deltas)
        datasets = np.concatenate(episode_inputs)
        labels = np.concatenate(episode_deltas)

        # Merge the data and label into one array and then shuffle
        data_and_label = np.concatenate((datasets, labels), axis=1)
        np.random.shuffle(data_and_label)
        print("Collect random dataset shape: ", datasets.shape)
        testset_len = int(datasets.shape[0] * self.testset_split)
        data_len = datasets.shape[1]
        self.random_testset = {"data": data_and_label[:testset_len, :data_len],
                               "label": data_and_label[:testset_len, data_len:]}
        self.random_trainset = {"data": data_and_label[testset_len:, :data_len],
                                "label": data_and_label[testset_len:, data_len:]}
        self.random_dataset = {"data": datasets, "label": labels}
        self.all_dataset = self.random_dataset

    def collect_mpc_dataset(self, mpc, dynamic_model):
        """Collect episodes whose actions are chosen by the MPC controller.

        Args:
            mpc: controller; ``mpc.act(state, dynamic_model)`` supplies the
                action, which is wrapped into a one-element array.
            dynamic_model: model forwarded unchanged to ``mpc.act``.

        Returns:
            List with the summed reward of every collected episode.

        Side effects: replaces ``mpc_dataset`` / ``mpc_dataset_len`` with the
        newly collected samples, prepends them to ``all_dataset`` and, when
        ``save_flag`` is set, pickles the updated pool to ``save_path``.
        """
        episode_inputs = []
        episode_deltas = []
        reward_episodes = []
        for i in range(self.n_mpc_episodes):
            inputs, deltas, reward_episode = self._rollout(
                lambda state: np.array([mpc.act(state, dynamic_model)]))
            episode_inputs.append(inputs)
            episode_deltas.append(deltas)
            reward_episodes.append(reward_episode)
            print(f"Episode [{i}/{self.n_mpc_episodes}], Reward: {reward_episode:.8f}")
        datasets = np.concatenate(episode_inputs)
        labels = np.concatenate(episode_deltas)

        self.mpc_dataset = {"data": datasets, "label": labels}
        self.mpc_dataset_len = datasets.shape[0]
        print("Totally collect %s data based on MPC" % self.mpc_dataset_len)
        if self.all_dataset is None:
            # Nothing was collected or loaded before: the MPC data starts the pool.
            all_datasets, all_labels = datasets, labels
        else:
            all_datasets = np.concatenate((datasets, self.all_dataset["data"]))
            all_labels = np.concatenate((labels, self.all_dataset["label"]))
        self.all_dataset = {"data": all_datasets, "label": all_labels}
        if self.save_flag:
            self.save_datasets(self.all_dataset)
        return reward_episodes

    def make_dataset(self):
        """Build the train/test sets for the next model fit.

        The training set is the latest MPC data plus rows sampled from the
        shuffled pool so that the MPC data makes up about ``mpc_dataset_split``
        of it (fewer rows if the pool is too small). The test set is
        ``testset_len`` rows taken from the tail of the same shuffled pool.

        Returns:
            Tuple ``(trainset, testset)`` of ``{"data", "label"}`` dictionaries.

        Raises:
            RuntimeError: if no MPC data has been collected yet.
        """
        if self.mpc_dataset is None or self.all_dataset is None:
            raise RuntimeError("collect_mpc_dataset() must be called before make_dataset()")

        # The datasets are dictionaries, so sizes are read from their "data" arrays.
        pool_size = self.all_dataset["data"].shape[0]
        data_len = self.mpc_dataset["data"].shape[1]

        # calculate how many samples needed from the all datasets
        all_length = int(self.mpc_dataset_len / self.mpc_dataset_split)
        sample_length = min(pool_size, max(0, all_length - self.mpc_dataset_len))
        print("Sample %s training data from all previous dataset, total training sample: %s" % (
            sample_length, all_length))

        # Merge the data and label into one array and then shuffle
        data_and_label = np.concatenate((self.all_dataset["data"], self.all_dataset["label"]), axis=1)
        np.random.shuffle(data_and_label)
        testset_len = min(int(all_length * self.testset_split), pool_size)

        trainset_data = np.concatenate((self.mpc_dataset["data"], data_and_label[:sample_length, :data_len]))
        trainset_label = np.concatenate((self.mpc_dataset["label"], data_and_label[:sample_length, data_len:]))
        # Take exactly testset_len rows from the end of the shuffled pool, away
        # from the head used for training. The two ranges can still overlap
        # when sample_length + testset_len exceeds the pool size.
        test_start = pool_size - testset_len
        testset_data = data_and_label[test_start:, :data_len]
        testset_label = data_and_label[test_start:, data_len:]
        trainset = {"data": trainset_data, "label": trainset_label}
        testset = {"data": testset_data, "label": testset_label}
        return trainset, testset

    # Save dictionary dataset
    def save_datasets(self,data):
        """Pickle ``data`` to ``self.save_path`` with the highest protocol."""
        print("Saving all datas to %s" % self.save_path)
        with open(self.save_path, 'wb') as f:  # open file with write-mode
            pickle.dump(data, f, -1)  # serialize and save object

    def load_dataset(self):
        """Unpickle and return the dataset stored at ``self.load_path``."""
        print("Load datas from %s" % self.load_path)
        # SECURITY: pickle.load can execute arbitrary code; only point
        # load_path at files produced by save_datasets() on a trusted machine.
        with open(self.load_path, 'rb') as f:
            dataset = pickle.load(f)
        return dataset