In [None]:
"""
Generating the experts.
"""
# ----------------------------------------------------------------------------------------------------------------------
# imports
import sys
import pickle
import numpy as np
sys.path.append('/Users/tingtingni/Documents/cirl copy/')
from env.gridworld import Gridworld
from algs.cmdp import cmdp_gda, cmdp_gda_occ, regularization
from examples.log import *
import visualization.gridworld_vis as gv
from einops import rearrange, reduce, repeat, einsum
import copy

import random
import pandas as pd
import argparse
from pathlib import Path
from scipy.stats import entropy


def float_formatter(value):
    """Render a scalar in fixed-point notation with exactly eight decimals."""
    return "{:.8f}".format(value)


# Route every floating-point array element through the formatter whenever
# numpy prints an array, so printed diagnostics line up column by column.
np.set_printoptions(formatter=dict(float_kind=float_formatter))

# ----------------------------------------------------------------------------------------------------------------------
# Experiment configuration
# These are fixed in-notebook defaults; nothing is read from the command line here.
b_str, noise = 'low', 0.1
N, seed = 10000, 20
plotting = True

# Output folder for expert data, keyed by the noise level and the budget label.
# It is created up front (including parents) so later writes cannot fail on a
# missing directory.
data_path = f'data/expert_data/{noise}_noise/{b_str}_b/'
Path(data_path).mkdir(parents=True, exist_ok=True)

# ----------------------------------------------------------------------------------------------------------------------
# Seed both random number sources (numpy's legacy global RNG and the stdlib
# `random` module) with the same value so repeated runs are reproducible.
np.random.seed(seed)
random.seed(seed)

# ----------------------------------------------------------------------------------------------------------------------
# create environment
params = {
    'grid_height': 6,
    'grid_width' : 6,
    'noise': float(noise),
    'gamma': 0.7
}
n = params['grid_height'] * params['grid_width']  # number of states
m = 4 # number of actions
k = 2 # number of constraints

# Constraint features on the 2-D grid: each constraint gets one strip of four
# cells in which every action has feature value 10; all other entries are 0.
Psi = np.zeros((params['grid_height'], params['grid_width'], m, k))
Psi[1:5, 1, :, 0] = 10
Psi[1:5, 3, :, 1] = 10
# Collapse the two grid axes into a single state axis. With the '(sy sx)'
# grouping the flat state index is sy * grid_height + sx.
Psi = rearrange(Psi, 'sx sy a k -> (sy sx) a k')
# Not used in this cell; presumably the rectangles handed to the visualisation
# helpers -- TODO confirm the (origin, extent) convention against gridworld_vis.
constraint_patches = np.array([[[1,1],[4,1]], [[1,3],[4,1]]])

# Initial state distribution: uniform over the states that carry no constraint
# feature. A boolean mask over *states* is used so a state that belongs to
# several constraint strips is still counted once (counting the non-zero
# (state, constraint) entries would over-count such states and break the
# normalisation). As in the original, only action 0 is inspected, which is
# sufficient here because the features are identical across actions.
constrained_states = np.any(Psi[:, 0, :] != 0, axis=1)
nu0 = np.ones(n) / (n - np.count_nonzero(constrained_states))
nu0[constrained_states] = 0.0

# Reward 1 for every action taken in the goal state, 0 everywhere else.
goal_state = 15
r = np.zeros((n, m))
r[goal_state, :] = 1


def make_goal_absorbing_env(b):
    """Build the gridworld for constraint thresholds `b` with an absorbing goal.

    Reads the notebook-level `params`, `Psi`, `nu0`, `r` and `goal_state`.
    After construction the transition tensor is patched so that every action
    taken in `goal_state` leads back to `goal_state` with probability 1.

    Parameters
    ----------
    b : array-like
        Threshold values passed to `Gridworld` together with `Psi` as the
        `constraints` argument.

    Returns
    -------
    The patched `Gridworld` instance.
    """
    goal_env = Gridworld(**params, constraints = (Psi, b), nu0 =  nu0, r = r)
    goal_env.P[goal_state, :, :] = 0.0
    goal_env.P[goal_state, :, goal_state] = 1.0
    return goal_env


# One expert policy per threshold level; the same level is used for all k constraints.
distance = [1, 1.5, 1.7, 1.83]
policysize = len(distance)
policy_soft = np.zeros((n, m, policysize))

beta = 0.3    # passed to cmdp_gda and regularization below
eta_xi = 0.1  # second step size handed to cmdp_gda
for i in range(policysize):
    b = np.array([distance[i]] * k)
    env = make_goal_absorbing_env(b)
    # Approximate solution to regularized problem
    eta_p = (1-env.gamma) /beta * 1
    print('-----------')
    print('GDA solution')
    policy_soft[:,:,i], xi, values_soft = cmdp_gda(env, beta , eta_p, eta_xi, max_iters=1e4, tol=-1, mode='alt_gda',
                                        n_v_tot_eval_steps=50, n_v_c_eval_steps=50, logging=False, check_steps=1000)
    occ_soft = env.policy2stateactionocc(policy_soft[:,:,i])

    # Compute each diagnostic once and reuse it across the printed lines.
    objective = np.sum(occ_soft * env.r / (1-env.gamma))
    constraint_values = np.einsum('jki,jk->i', env.Psi, occ_soft) / (1-env.gamma)
    primal = objective - regularization(env, occ_soft, beta)
    print('objective: ', objective)
    print('constraints: ', constraint_values)
    print('primal: ', primal)
    # xi weights the slack between the thresholds env.b and the constraint values.
    print('dual value: ', primal + np.sum(xi * (env.b - constraint_values)))


# set up the env
# Environment used by the later cells: same grid, thresholds fixed to 2.
b = np.array([2, 2])
env = make_goal_absorbing_env(b)


In [None]:
# Settings forwarded to LBPG for every run.
eta = 0.01
T = 50
thr = 0.001
batch = 10  # number of independent repetitions per (policy, sample size) pair
sample = [100, 300, 500, 700, 900, 1500, 3000]  # values passed as LBPG's fourth argument
samplesize = len(sample)

# Size the result arrays from the policies that actually exist rather than from
# the mutable global `policysize`, which a later plotting cell reassigns.
n_experts = policy_soft.shape[2]
gap = np.zeros((batch, n_experts, samplesize))
stepsize = np.zeros((batch, n_experts, samplesize))

# Descriptive loop names are used on purpose: the notebook-level `k` holds the
# number of constraints and must not be clobbered by a loop index.
for run in range(batch):
    for policy_idx in range(n_experts):
        policy = policy_soft[:, :, policy_idx]
        for size_idx, sample_n in enumerate(sample):
            # LBPG returns two scalars; they are stored as `gap` and `stepsize`.
            gap[run, policy_idx, size_idx], stepsize[run, policy_idx, size_idx] = LBPG(
                env, policy, T, sample_n, thr, eta)

gap_path = "data/gap.pkl"
stepsize_path = "data/stepsize.pkl"
# Make sure the target folder exists even if this cell is run on its own.
Path("data").mkdir(parents=True, exist_ok=True)
# Serialize and write to file using pickle.dump()
for out_path, result in ((gap_path, gap), (stepsize_path, stepsize)):
    with open(out_path, 'wb') as file:
        pickle.dump(result, file)


In [None]:
# Plot
# Aggregate over the repetition axis (axis 0): the mean gives the curve and the
# 10% / 90% quantiles give the shaded band.
g_avg = np.mean(gap, axis=0)
g_max = np.quantile(gap, 0.9, axis=0)
g_min = np.quantile(gap, 0.1, axis=0)

s_avg = np.mean(stepsize, axis=0)
s_max = np.quantile(stepsize, 0.9, axis=0)
s_min = np.quantile(stepsize, 0.1, axis=0)

import matplotlib.pyplot as plt
import math
import matplotlib.gridspec as gridspec
# Legend label, colour and line style for each policy, indexed by policy.
mode_w = [r'$V_{i}^{\theta}(\rho)=-1,\, i\in[2]$',r'$V_{i}^{\theta}(\rho)=-1.5,\, i\in[2]$',
          r'$V_{i}^{\theta}(\rho)=-1.7,\,i\in[2]$',
          r'$V_{1}^{\theta}(\rho) \approx -1.73 ,V_{2}^{\theta}(\rho)\approx -1.83$']
colorlist = ['tab:blue','tab:green','tab:purple','tab:orange', 'tab:gray','tab:brown', 'tab:pink','tab:red']
linestyle_str = ['dotted','dashed','solid', 'dashdot']  # Same as '-.'
# One row with two side-by-side panels.
gs = gridspec.GridSpec(1, 2)
fig = plt.figure(figsize=(30,12))

# One curve per legend label (four policies).
policysize = len(mode_w)

# (mean, lower band, upper band, y-label, title, legend location, legend font size)
panels = [
    (g_avg, g_min, g_max, "Gradient estimation error", 'Gradient estimation error', 'upper right', 30),
    (s_avg, s_min, s_max, "Stepsize", 'Stepsize', 'lower right', 26),
]
for col, (avg, low, high, ylabel, title, legend_loc, legend_fontsize) in enumerate(panels):
    ax = fig.add_subplot(gs[0, col])
    for i in range(policysize):
        ax.plot(sample, avg[i, :], color=colorlist[i], label=mode_w[i], linewidth=2, linestyle=linestyle_str[i])
        ax.fill_between(sample, low[i, :], high[i, :], alpha=0.3, facecolor=colorlist[i])

    # tick_params sets the label size on the axis itself, so it also applies to
    # tick labels created on later redraws.
    ax.tick_params(axis='both', labelsize=20)
    ax.set_xlabel("Batch size", color='black', fontsize=24)
    ax.set_ylabel(ylabel, color='black', fontsize=24)
    ax.set_title(title, fontsize=24)
    ax.legend(loc=legend_loc, fontsize=legend_fontsize)


# fig.suptitle(r'$6\times 6$ gridworld with $\eta=10^{-2}$', fontsize = 'xx-large')
# Write the file before showing: some backends release the figure once
# plt.show() returns, so saving first is the safe order.
fig.savefig('sample.svg', format='svg', dpi = 1200)
plt.show()



In [None]:
# Compact variant of the two-panel figure that shows only the first three policies.
# A local count is used instead of reassigning the notebook-level `policysize`:
# the sampling cell reads that global, so overwriting it here would silently
# change the experiment if that cell were re-run afterwards.
n_shown = 3

# Draw into a fresh, default-sized figure rather than whatever figure happens to
# be current, so this cell can never paint over the previous plot.
fig_compact = plt.figure()

# (mean, lower band, upper band, y-label, title, legend location)
compact_panels = [
    (g_avg, g_min, g_max, "Gradient estimation error", 'Gradient estimation error', 'center right'),
    (s_avg, s_min, s_max, "Stepsize", 'Stepsize', 'lower right'),
]
for col, (avg, low, high, ylabel, title, legend_loc) in enumerate(compact_panels, start=1):
    # 1 x 2 layout, panels numbered from 1.
    ax = fig_compact.add_subplot(1, 2, col)
    for i in range(n_shown):
        ax.plot(sample, avg[i, :], color=colorlist[i], label=mode_w[i], linewidth=2, linestyle=linestyle_str[i])
        ax.fill_between(sample, low[i, :], high[i, :], alpha=0.3, facecolor=colorlist[i])

    ax.tick_params(axis='both', labelsize=12)
    ax.set_xlabel("Batch size", color='black', fontsize=18)
    ax.set_ylabel(ylabel, color='black', fontsize=18)
    ax.set_title(title, fontsize=18)
    ax.legend(loc=legend_loc, fontsize=17)
