In [21]:
%load_ext autoreload
%autoreload 2

from __future__ import absolute_import, division, print_function
import argparse
from datetime import datetime
import imp
import numpy as np
import torch
from utils.monitor import Monitor
from envs.mo_env import MultiObjectiveEnv
# from gym_env_moll.multiobjective import LunarLander
# import gym
import json


# Choose the tensor constructors once, depending on whether CUDA is available,
# so later code can build tensors through these aliases.
use_cuda = torch.cuda.is_available()
if use_cuda:
    FloatTensor = torch.cuda.FloatTensor
    LongTensor = torch.cuda.LongTensor
    ByteTensor = torch.cuda.ByteTensor
else:
    FloatTensor = torch.FloatTensor
    LongTensor = torch.LongTensor
    ByteTensor = torch.ByteTensor
# `Tensor` is simply another name for `FloatTensor`.
Tensor = FloatTensor


def generate_next_preference(preference, alpha=10000):
    """Sample a new preference vector from a Dirichlet centred on `preference`.

    The Dirichlet concentration is ``alpha * (preference + 1e-6)``; the small
    offset keeps every concentration strictly positive even when a weight is
    exactly zero.

    Args:
        preference: current preference weights, given as a sequence, a numpy
            array, or a torch tensor (CPU or CUDA). The input is not modified.
        alpha: concentration scale. Larger values keep the sample closer to
            the normalised input.

    Returns:
        A ``FloatTensor`` holding one Dirichlet sample (non-negative entries
        that sum to 1).
    """
    if isinstance(preference, torch.Tensor):
        # np.array() cannot read CUDA tensors (or tensors that require grad),
        # so bring the values to host memory explicitly.
        preference = preference.detach().cpu().numpy()

    # Work on a fresh float64 copy: with integer input such as [1, 0] an
    # in-place `+= 1e-6` raises a casting error.
    concentration = np.array(preference, dtype=np.float64) + 1e-6

    return FloatTensor(np.random.dirichlet(alpha * concentration))

def init_log_file(log_file_str):
    """Create (or truncate) the log file and write the opening ``[`` line.

    Args:
        log_file_str: path of the log file. Missing parent directories are
            created so a fresh checkout without a log directory still works.
    """
    import os  # local import keeps this helper self-contained

    parent_dir = os.path.dirname(log_file_str)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(log_file_str, mode='w') as log_file:
        log_file.write('[\n')

def write_log(log_file_str, data, is_json = False):
    """Append `data` to the log file.

    Args:
        log_file_str: path of the log file; it is opened in append mode.
        data: a string to write verbatim, or (when `is_json` is true) an
            object to serialise with ``json.dump``.
        is_json: choose JSON serialisation instead of a raw write.
    """
    with open(log_file_str, mode='a+') as sink:
        if not is_json:
            sink.write(data)
            return
        json.dump(data, sink)

def train(env, agent, args):
    """Train `agent` on `env` for a fixed number of episodes and return it.

    Every episode starts from a random preference vector (absolute values of
    a standard-normal draw, L1-normalised). At each step the agent acts under
    the current preference, the transition is stored together with a
    Dirichlet-perturbed next preference, and ``agent.learn()`` is called once.
    An episode is cut short once its step counter exceeds `max_steps_in_env`.

    Side effects:
        * Creates ``./logs/multihead3_Q_log_<env>_<date>.json``. For every
          50th episode (when ``args.log`` is truthy) one JSON array of
          per-step records is appended; the finished file is a valid JSON
          array of those per-episode arrays.
        * Prints per-step diagnostics for logged episodes and one summary
          line per episode.
        * Calls ``agent.save`` with ``./saved_models/`` every 500 episodes and
          once more after the last episode.

    Args:
        env: environment providing ``reset()``, ``observe()``,
            ``step(action) -> (next_state, reward, terminal)`` and
            ``reward_spec``.
        agent: learner providing ``act``, ``memorize``, ``learn``, ``reset``,
            ``predict``, ``save`` and an ``epsilon`` attribute.
        args: namespace; reads ``env_name``, ``method``, ``gamma``, ``alpha``
            and ``log``.

    Returns:
        The same `agent` object after training.

    Raises:
        ValueError: if ``args.env_name`` is not one of dst, ft, ft5, ft7.
    """
    # Which entries of the predicted q-values go into the summary line.
    # Checked up front so an unsupported environment fails before any work is
    # done, rather than with a NameError after the first episode.
    if args.env_name == "dst":
        act_1_idx, act_2_idx = 3, 1
    elif args.env_name in ['ft', 'ft5', 'ft7']:
        act_1_idx, act_2_idx = 1, 0
    else:
        raise ValueError(
            "unsupported env_name %r; expected one of: dst, ft, ft5, ft7" % (args.env_name,))

    # Take the date once so the log name and checkpoint name always agree.
    run_name = 'multihead3_Q_log_' + args.env_name + '_' + datetime.today().strftime("%Y_%m_%d")
    log_file_str = './logs/' + run_name + '.json'
    save_loc = './saved_models/'
    save_file_name = run_name

    init_log_file(log_file_str)

    # Fixed evaluation preference: 0.8 / 0.2 on the first two objectives and
    # zero elsewhere, sized to the environment's reward vector.
    reward_size = len(env.reward_spec)
    fixed_probe = FloatTensor(([0.8, 0.2] + [0.0] * reward_size)[:reward_size])
    fixed_probe_np = fixed_probe.cpu().numpy()  # loop-invariant, used every step
    env.reset()
    alpha = args.alpha

    dirichet_param = 0.1  # only reported in the summary line; never updated here

    max_steps_in_env = 100
    logged_episodes = 0
    # NOTE(review): the episode count is hard-coded to 60 and ignores
    # args.episode_num -- confirm this is an intentional short run.
    for num_eps in range(60):
        terminal = False
        env.reset()
        q_loss = 0
        exploration_loss = 0
        cnt = 0
        tot_reward = 0

        probe = np.random.randn(reward_size)
        probe = FloatTensor(np.abs(probe)/np.linalg.norm(probe, ord=1))

        # Open a per-episode JSON array only for episodes that will also
        # close it; opening one for every episode left the file unbalanced.
        log_this_episode = bool(args.log) and (num_eps % 50) == 0
        if log_this_episode:
            if logged_episodes:
                write_log(log_file_str, ',\n')
            write_log(log_file_str, '[')
            logged_episodes += 1

        while not terminal:
            state = env.observe()
            action = agent.act(state, probe)
            next_state, reward, terminal = env.step(action)
            next_preference = generate_next_preference(probe, alpha)

            agent.memorize(state, action, next_state, reward, terminal, probe, next_preference)
            loss = agent.learn()
            q_loss += loss[0]
            exploration_loss += loss[1]

            # Force termination of over-long episodes.
            if cnt > max_steps_in_env:
                terminal = True
                agent.reset()

            # Discounted return, scalarised with the fixed evaluation preference.
            tot_reward = tot_reward + fixed_probe_np.dot(reward) * np.power(args.gamma, cnt)
            probe = next_preference
            cnt = cnt + 1

            if reward[0] > 8:
                print(reward, state)

            if log_this_episode:
                # `probe` already holds the next preference at this point.
                _, Q, q = agent.predict(probe, state)
                # .cpu() before .numpy() so this also works with CUDA tensors.
                probe_list = probe.detach().cpu().numpy().tolist()

                log = {
                    'state':state.tolist(),
                    'action':action,
                    'reward':reward.tolist(),
                    'terminal':terminal,
                    'probe':probe_list,
                    'q_val': q.tolist(),
                    'cnt': cnt,
                    'num_eps': num_eps
                }

                print('probe', probe_list)
                print('state', log['state'])
                print('action', log['action'])
                print('reward', log['reward'])
                print('q_val', log['q_val'])
                print('Q_val', Q.detach().cpu().numpy().tolist())
                print('tot_reward', tot_reward)
                print('cnt', log['cnt'])
                print('num_eps', log['num_eps'])
                print('eps', agent.epsilon)
                print('---------------------------------------')

                write_log(log_file_str, log, True)
                # Separate records with a comma; close the episode array on
                # its last record.
                write_log(log_file_str, '\n]' if terminal else ',\n')

        _, _, q = agent.predict(fixed_probe)
        act_1 = q[0, act_1_idx]
        act_2 = q[0, act_2_idx]

        if args.method == "crl-naive":
            act_1 = act_1.data.cpu()
            act_2 = act_2.data.cpu()
        elif args.method in ("crl-envelope", "crl-energy"):
            # NOTE(review): this scalarises with the episode's last sampled
            # `probe`, not `fixed_probe` -- confirm that is intended.
            act_1 = probe.dot(act_1.data)
            act_2 = probe.dot(act_2.data)
        print("eps %d reward (1) %0.2f, the Q is %0.2f | %0.2f; the probe is %0.2f | %0.2f; dirichet: %0.3f; q_loss: %0.4f; exploration_loss: %0.4f" % (
            num_eps,
            tot_reward,
            act_1,
            act_2,
            probe[0],
            probe[1],
            dirichet_param,
            q_loss / cnt,
            exploration_loss/cnt))

        if (num_eps+1) % 500 == 0:
            agent.save(save_loc, save_file_name+"_eps_"+str(num_eps))

    # Close the outer JSON array opened by init_log_file.
    write_log(log_file_str, '\n]\n')

    agent.save(save_loc, save_file_name+"_eps_"+str(num_eps))
    return agent


# Command-line style configuration for the run. The notebook parses an empty
# argument list, so the defaults below are the values actually used.
parser = argparse.ArgumentParser(description='MORL')
# CONFIG
parser.add_argument('--env-name', default='ft', metavar='ENVNAME',
                    help='environment to train on: dst | ft | ft5 | ft7')
parser.add_argument('--method', default='crl-naive', metavar='METHODS',
                    help='methods: crl-naive | crl-envelope | crl-energy')
parser.add_argument('--model', default='linear', metavar='MODELS',
                    help='linear | cnn | cnn + lstm')
parser.add_argument('--gamma', type=float, default=0.99, metavar='GAMMA',
                    help='gamma for infinite horizonal MDPs')
# TRAINING
parser.add_argument('--mem-size', type=int, default=4000, metavar='M',
                    help='max size of the replay memory')
parser.add_argument('--batch-size', type=int, default=256, metavar='B',
                    help='batch size')
parser.add_argument('--lr', type=float, default=1e-3, metavar='LR',
                    help='learning rate')
parser.add_argument('--epsilon', type=float, default=0.5, metavar='EPS',
                    help='epsilon greedy exploration')
# NOTE(review): default=True together with action='store_true' means this
# flag can never switch the decay off -- confirm that is intended.
parser.add_argument('--epsilon-decay', default=True, action='store_true',
                    help='linear epsilon decay to zero')
parser.add_argument('--weight-num', type=int, default=16, metavar='WN',
                    help='number of sampled weights per iteration')
parser.add_argument('--episode-num', type=int, default=10000, metavar='EN',
                    help='number of episodes for training')
parser.add_argument('--optimizer', default='Adam', metavar='OPT',
                    help='optimizer: Adam | RMSprop')
# The help text and metavar here used to be copy-pasted from --optimizer.
parser.add_argument('--update-freq', type=int, default=100, metavar='FREQ',
                    help='update frequency')
parser.add_argument('--beta', type=float, default=0.01, metavar='BETA',
                    help='(initial) beta for evelope algorithm, default = 0.01')
parser.add_argument('--homotopy', default=False, action='store_true',
                    help='use homotopy optimization method')
# LOG & SAVING
parser.add_argument('--serialize', default=False, action='store_true',
                    help='serialize a model')
parser.add_argument('--save', default='crl/naive/saved/', metavar='SAVE',
                    help='path for saving trained models')
parser.add_argument('--name', default='', metavar='name',
                    help='specify a name for saving the model')
parser.add_argument('--log', default='crl/naive/logs/', metavar='LOG',
                    help='path for recording training informtion')

# Device handle plus the tensor aliases (the aliases get the same values as
# the definitions near the top of this cell).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
use_cuda = device.type == 'cuda'

if use_cuda:
    FloatTensor = torch.cuda.FloatTensor
    LongTensor = torch.cuda.LongTensor
    ByteTensor = torch.cuda.ByteTensor
else:
    FloatTensor = torch.FloatTensor
    LongTensor = torch.LongTensor
    ByteTensor = torch.ByteTensor
Tensor = FloatTensor

# Parse an empty argument list so the notebook runs with the parser defaults.
args = parser.parse_args(args=[])
# Scale that train() hands to generate_next_preference(); it is not a parser
# option, so it is attached to the namespace by hand.
args.alpha = 4000

# Placeholder until the agent is constructed in a later cell.
agent = None

# Build the environment (a gym LunarLander variant was tried here previously)
# and read the sizes that the model constructors take.
env = MultiObjectiveEnv(args.env_name)
state_size = len(env.state_spec)
reward_size = len(env.reward_spec)
action_size = env.action_spec[2][1] - env.action_spec[2][0]

from crl.envelope.meta_mod import MetaAgent
# from crl.envelope.models.multiheadoutput import EnvelopeLinearCQN
from crl.envelope.models.multihead3 import EnvelopeLinearCQN
from crl.envelope.exemplar import Exemplar

if args.serialize:
    # Load a previously pickled model from
    # <args.save>m.<model>_e.<env_name>_n.<name>.pkl
    # SECURITY: torch.load unpickles arbitrary Python objects -- only load
    # files from a trusted source.
    # map_location lets a checkpoint saved on a GPU load on a CPU-only
    # machine (and the other way round).
    model = torch.load("{}{}.pkl".format(args.save,
                                 "m.{}_e.{}_n.{}".format(args.model, args.env_name, args.name)),
                       map_location=device)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [162]:
# Build a fresh Q-network unless the setup cell loaded a serialized one.
# (The loaded model used to be overwritten here unconditionally, so
# --serialize had no effect.)
if not args.serialize:
    model = EnvelopeLinearCQN(state_size, action_size, reward_size)
# NOTE(review): the positional constants (1e-3, 0, 3) are passed straight to
# Exemplar; their meaning is not visible in this notebook -- confirm against
# crl.envelope.exemplar.
exemplar_model = Exemplar(reward_size, reward_size, 1e-3, 0, device, 3)
agent = MetaAgent(model, exemplar_model, args, is_train=True)

# train() returns the agent it was given, so `agent` stays bound to it.
agent = train(env, agent, args)

probe [0.25677719712257385, 0.33327680826187134, 0.10112029314041138, 0.20036625862121582, 0.012989888899028301, 0.09546953439712524]
state [0, 0]
action 0
reward [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
q_val [[-0.04470448195934296, 0.03817763179540634]]
Q_val [[[0.019332872703671455, 0.051032863557338715, -0.06491155922412872, -0.014705541543662548, -0.025676142424345016, -0.01069581974297762], [0.06739066541194916, -0.010870634578168392, -0.051332782953977585, -0.03402823954820633, -0.04388321191072464, 0.02700396440923214]]]
tot_reward 0.0
cnt 1
num_eps 0
eps 0.5
---------------------------------------
probe [0.2548563480377197, 0.3373427391052246, 0.0946146696805954, 0.20052200555801392, 0.01254075113683939, 0.10012347996234894]
state [1, 0]
action 1
reward [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
q_val [[-0.04460306093096733, 0.03795444220304489]]
Q_val [[[0.017160890623927116, 0.0507727712392807, -0.0655260905623436, -0.011765580624341965, -0.025441974401474, -0.013694685883820057], [0.073931425809

eps 42 reward (1) 5.88, the Q is 0.04 | -0.05; the probe is 0.18 | 0.30; dirichet: 0.100; q_loss: 0.8532; exploration_loss: 0.2917
eps 43 reward (1) 2.78, the Q is 0.04 | -0.05; the probe is 0.03 | 0.22; dirichet: 0.100; q_loss: 1.9425; exploration_loss: 0.7428
eps 44 reward (1) 0.88, the Q is 0.04 | -0.05; the probe is 0.10 | 0.08; dirichet: 0.100; q_loss: 1.6693; exploration_loss: 0.7838
eps 45 reward (1) 3.12, the Q is 0.04 | -0.05; the probe is 0.18 | 0.18; dirichet: 0.100; q_loss: 1.6190; exploration_loss: 0.8724
eps 46 reward (1) 0.88, the Q is 0.04 | -0.05; the probe is 0.12 | 0.02; dirichet: 0.100; q_loss: 1.5443; exploration_loss: 0.7686
eps 47 reward (1) 5.58, the Q is 0.04 | -0.05; the probe is 0.15 | 0.34; dirichet: 0.100; q_loss: 1.4249; exploration_loss: 0.7190
eps 48 reward (1) 4.27, the Q is 0.04 | -0.05; the probe is 0.14 | 0.23; dirichet: 0.100; q_loss: 1.3143; exploration_loss: 0.7120
eps 49 reward (1) 5.65, the Q is 0.04 | -0.05; the probe is 0.00 | 0.02; dirichet: 

In [152]:
from torch.autograd import Variable

# Re-run one exemplar-model update by hand on a sampled minibatch, to look at
# the value it returns.
minibatch = agent.sample(agent.trans_mem, agent.priority_mem, agent.batch_size)


def batchify(items):
    """Materialise `items` as a list and repeat it `agent.weight_num` times."""
    return list(items) * agent.weight_num


# Transition fields, each repeated weight_num times.
state_batch = batchify(t.s.unsqueeze(0) for t in minibatch)
action_batch = batchify(LongTensor([t.a]) for t in minibatch)
reward_batch = batchify(t.r.unsqueeze(0) for t in minibatch)
next_state_batch = batchify(t.s_.unsqueeze(0) for t in minibatch)
terminal_batch = batchify(t.d for t in minibatch)

# Stored preferences (current and next), stacked once and then expanded by
# the agent's own neighbour generation.
w_batch = Variable(torch.stack([t.w for t in minibatch]), requires_grad=False).type(FloatTensor)
next_w_batch = Variable(torch.stack([t.w_ for t in minibatch]), requires_grad=False).type(FloatTensor)
w_batch, next_w_batch = agent.generate_neighbours(w_batch, next_w_batch, agent.weight_num)

# Draw a few preference rows at random (with replacement).
exemplar_batch_size = 10
index_list = np.random.randint(0, w_batch.shape[0], size=exemplar_batch_size)

# First half of the drawn rows are the "positive" exemplars, the rest the
# "negative" ones. Only the preference vectors are used; the states are not
# concatenated in.
sample1 = w_batch[index_list]
positive, negative = sample1[:sample1.shape[0] // 2], sample1[sample1.shape[0] // 2:]

# sample1 repeats the positives; sample2 is the positives followed by the
# negatives. The target is 1 for the first block of rows and 0 for the second.
sample1 = torch.cat((positive, positive), dim=0)
sample2 = torch.cat((positive, negative), dim=0)

target = torch.cat((torch.ones((positive.shape[0], 1)), torch.zeros((negative.shape[0], 1))))

exploration_loss = agent.exemplar_exploration.update(sample1, sample2, target)
exploration_loss

(array(0.6931633, dtype=float32),
 array([0.09749418, 0.09749418, 0.09749418, 0.09749418, 0.09749418,
        0.09749418, 0.09749418, 0.09749418, 0.09749418, 0.09749418],
       dtype=float32),
 array([0.72320235, 0.72320235, 0.72320235, 0.72320235, 0.72320235,
        0.72320235, 0.72320235, 0.72320235, 0.72320235, 0.72320235],
       dtype=float32))

In [159]:
# Show the first input batch of the exemplar update: the `positive` rows stacked twice.
sample1

tensor([[0.0297, 0.2281, 0.1234, 0.0945, 0.2127, 0.3115],
        [0.1338, 0.5137, 0.1799, 0.0336, 0.0319, 0.1071],
        [0.2994, 0.2420, 0.1056, 0.0280, 0.0977, 0.2273],
        [0.0331, 0.2781, 0.0769, 0.2023, 0.3133, 0.0962],
        [0.0329, 0.1789, 0.1710, 0.3797, 0.0533, 0.1842],
        [0.0297, 0.2281, 0.1234, 0.0945, 0.2127, 0.3115],
        [0.1338, 0.5137, 0.1799, 0.0336, 0.0319, 0.1071],
        [0.2994, 0.2420, 0.1056, 0.0280, 0.0977, 0.2273],
        [0.0331, 0.2781, 0.0769, 0.2023, 0.3133, 0.0962],
        [0.0329, 0.1789, 0.1710, 0.3797, 0.0533, 0.1842]])

In [158]:
# Show the second input batch of the exemplar update: `positive` rows followed by `negative` rows.
sample2

tensor([[2.9691e-02, 2.2813e-01, 1.2344e-01, 9.4515e-02, 2.1273e-01, 3.1149e-01],
        [1.3377e-01, 5.1370e-01, 1.7990e-01, 3.3646e-02, 3.1887e-02, 1.0710e-01],
        [2.9945e-01, 2.4204e-01, 1.0557e-01, 2.7976e-02, 9.7674e-02, 2.2730e-01],
        [3.3106e-02, 2.7814e-01, 7.6940e-02, 2.0231e-01, 3.1331e-01, 9.6196e-02],
        [3.2943e-02, 1.7889e-01, 1.7100e-01, 3.7972e-01, 5.3261e-02, 1.8418e-01],
        [5.8789e-02, 1.1202e-02, 2.8961e-01, 2.2498e-01, 3.1856e-01, 9.6868e-02],
        [2.8426e-01, 3.0438e-01, 2.4056e-02, 2.6303e-01, 4.1852e-02, 8.2426e-02],
        [2.2463e-02, 2.2332e-01, 1.9728e-01, 1.6033e-01, 1.6904e-01, 2.2756e-01],
        [8.4553e-05, 4.4013e-01, 9.1140e-02, 3.5121e-01, 9.4354e-02, 2.3085e-02],
        [1.1008e-01, 3.6935e-03, 2.0084e-02, 2.0717e-01, 4.8589e-01, 1.7308e-01]])

In [160]:
# Evaluate the exemplar model's get_prob on sample2.
# NOTE(review): the saved output is the same value for every row even though
# the input rows differ -- check whether the model distinguishes its inputs.
agent.exemplar_exploration.get_prob(torch.Tensor(sample2))

tensor([0.5028, 0.5028, 0.5028, 0.5028, 0.5028, 0.5028, 0.5028, 0.5028, 0.5028,
        0.5028])

In [161]:
# Run encoder1 on sample1 and show what it returns.
# NOTE(review): the saved output has identical rows for different input rows
# -- confirm the encoder has not collapsed to a constant.
agent.exemplar_exploration.encoder1(sample1)

(tensor([[-0.3842, -0.2174, -0.0095],
         [-0.3842, -0.2174, -0.0095],
         [-0.3842, -0.2174, -0.0095],
         [-0.3842, -0.2174, -0.0095],
         [-0.3842, -0.2174, -0.0095],
         [-0.3842, -0.2174, -0.0095],
         [-0.3842, -0.2174, -0.0095],
         [-0.3842, -0.2174, -0.0095],
         [-0.3842, -0.2174, -0.0095],
         [-0.3842, -0.2174, -0.0095]], grad_fn=<AddmmBackward0>),
 tensor([1.0000, 1.0000, 1.0000], grad_fn=<ExpBackward0>))

In [150]:
# Run encoder2 on sample2 and show what it returns.
# NOTE(review): as with encoder1, the saved output rows are all identical.
agent.exemplar_exploration.encoder2(sample2)

(tensor([[ 0.7905, -0.8554, -0.2997],
         [ 0.7905, -0.8554, -0.2997],
         [ 0.7905, -0.8554, -0.2997],
         ...,
         [ 0.7905, -0.8554, -0.2997],
         [ 0.7905, -0.8554, -0.2997],
         [ 0.7905, -0.8554, -0.2997]], grad_fn=<AddmmBackward0>),
 tensor([1.0000, 1.0000, 1.0000], grad_fn=<ExpBackward0>))

In [107]:
# Show sample2 again. The saved output comes from an earlier execution
# (lower execution count) than the cells above, so its values differ.
sample2

tensor([[0.4491, 0.0546, 0.2129, 0.0268, 0.1296, 0.1270],
        [0.1129, 0.4247, 0.1316, 0.0431, 0.0427, 0.2449],
        [0.4848, 0.0818, 0.0580, 0.0354, 0.0662, 0.2738],
        [0.2462, 0.1596, 0.1394, 0.1993, 0.0489, 0.2066],
        [0.1032, 0.4228, 0.0668, 0.0909, 0.2036, 0.1128],
        [0.0998, 0.1017, 0.2919, 0.1661, 0.0968, 0.2438],
        [0.0471, 0.1219, 0.2037, 0.0927, 0.3346, 0.2000],
        [0.2262, 0.1354, 0.2125, 0.2736, 0.0224, 0.1299],
        [0.1114, 0.1518, 0.1869, 0.1177, 0.3797, 0.0525],
        [0.2703, 0.0382, 0.0501, 0.3743, 0.1896, 0.0775]])

In [48]:
# Inspect the weight matrix of encoder1's input layer.
agent.exemplar_exploration.encoder1.input_layer.weight

Parameter containing:
tensor([[-0.1517,  0.0255,  0.2918, -0.6622,  0.2170,  0.5774],
        [-0.2126,  0.0381,  0.7031,  0.0107, -0.4459,  0.6690],
        [-0.5074, -0.7704, -0.5007,  0.3616,  0.0365,  0.7566],
        [-0.6614,  0.5753,  0.4212, -0.7270, -0.6620, -0.1003],
        [-0.0516, -0.2310, -0.4327,  0.3027,  0.0174,  0.2454],
        [-0.0792,  0.5761,  0.3265,  0.4939,  0.6704, -0.3653]],
       requires_grad=True)

In [49]:
# Inspect the weight matrix of encoder1's first middle layer.
agent.exemplar_exploration.encoder1.middle_layers[0].weight

Parameter containing:
tensor([[-0.4052, -0.2013,  0.1476,  0.0046, -0.1058, -0.1568],
        [ 0.6673,  0.0222, -0.4492,  0.0455, -0.2971,  0.2218],
        [-0.0672, -0.0889, -0.3498, -0.5678, -0.6732,  0.3505],
        [-0.3164,  0.2907, -0.4615, -0.3610,  0.0238,  0.6773],
        [ 0.5307,  0.3437, -0.3207, -0.5510,  0.5744,  0.1655],
        [-0.1069,  0.1500,  0.8121,  0.5394,  0.0563, -0.1241]],
       requires_grad=True)

In [50]:
# Inspect the weight matrix of encoder1's second middle layer.
agent.exemplar_exploration.encoder1.middle_layers[1].weight

Parameter containing:
tensor([[ 0.0508,  0.1165, -0.4575, -0.5614, -0.5016, -0.3066],
        [-0.6025,  0.4505,  0.3224,  0.1770, -0.4793,  0.5280],
        [ 0.4596,  0.3312, -0.0333,  0.6614,  0.4940,  0.1429],
        [ 0.5125,  0.3081, -0.7372, -0.3431, -0.1594,  0.3183],
        [-0.0357, -0.5719, -0.4612,  0.1695,  0.5028,  0.4007],
        [-0.6890,  0.4767, -0.2983, -0.1980, -0.4688,  0.4817]],
       requires_grad=True)

In [51]:
# Inspect the weight matrix of encoder1's output layer.
agent.exemplar_exploration.encoder1.output_layer.weight

Parameter containing:
tensor([[-0.2280, -0.0511,  0.0087, -0.1226,  0.1295, -0.4337],
        [ 0.7213, -0.6539, -0.5365,  0.8129, -0.0790,  0.1673],
        [ 0.7710,  0.1478, -0.3960,  0.1658, -0.3018, -0.0182]],
       requires_grad=True)

In [52]:
# Inspect the weight matrix of encoder2's input layer.
agent.exemplar_exploration.encoder2.input_layer.weight

Parameter containing:
tensor([[-0.4219,  0.0388,  0.6717,  0.5690,  0.0200,  0.3857],
        [-0.2483, -0.8850, -0.5738, -0.2108, -0.6203,  0.4135],
        [ 0.3906, -0.6704, -0.3467,  0.4073,  0.5902, -0.4055],
        [ 0.0849,  0.4339, -0.4858, -0.1623, -0.0183, -0.3314],
        [ 0.2226,  0.7325, -0.3647,  0.0252,  0.2816, -0.7248],
        [ 0.3811,  0.2219, -0.3775, -0.5197, -0.0470,  0.2148]],
       requires_grad=True)

In [55]:
# Inspect the weight matrix of encoder2's second middle layer.
agent.exemplar_exploration.encoder2.middle_layers[1].weight

Parameter containing:
tensor([[-0.0442, -0.6948,  0.0905, -0.5359,  0.3059, -0.6966],
        [ 0.4607, -0.4929, -0.6293,  0.3777, -0.0472, -0.6598],
        [ 0.5017, -0.2959,  0.6547, -0.4876,  0.4253, -0.1499],
        [-0.7312,  0.4837, -0.5116, -0.2307, -0.5350, -0.3990],
        [ 0.0530, -0.3585,  0.5933, -0.4743, -0.4582,  0.5733],
        [-0.1381,  0.5811,  0.1330,  0.5490,  0.2900, -0.3450]],
       requires_grad=True)

In [56]:
# Inspect the weight matrix of encoder2's first middle layer.
agent.exemplar_exploration.encoder2.middle_layers[0].weight

Parameter containing:
tensor([[ 0.2358,  0.7143, -0.0748,  0.3639,  0.4208, -0.1441],
        [ 0.6619, -0.5357, -0.3894,  0.5004,  0.4789,  0.0025],
        [-0.2524,  0.2116,  0.2926, -0.1054, -0.0636,  0.4401],
        [-0.7706,  0.2716, -0.7749, -0.4349, -0.9102,  0.6747],
        [-0.4367, -0.2666, -0.1968, -0.6260, -0.5411, -0.2329],
        [ 0.2322,  0.3201,  0.5454, -0.6749, -0.2609, -0.1297]],
       requires_grad=True)

In [57]:
# Inspect the weight matrix of encoder2's output layer.
agent.exemplar_exploration.encoder2.output_layer.weight

Parameter containing:
tensor([[ 0.4517, -0.5651,  0.3831, -0.7905, -0.5621,  0.5282],
        [ 0.5757,  0.6838, -0.4832,  0.7202,  0.5067, -0.0118],
        [ 0.1186, -0.5849,  0.3356,  0.5019, -0.4892, -0.7697]],
       requires_grad=True)