In [51]:
import numpy as np
from scipy import stats

In [41]:
# Number of state indices: row count of the policy tables built below and the
# upper bound of every loop over states.
num_states = 18
# Number of action indices: column count of the policy tables built below.
num_actions = 4
# Discount factor; the reward at timestep t is weighted by gamma ** t.
gamma = 0.95

In [2]:
# Load the episode histories from "data.csv".
#
# Layout, as parsed below:
#   first line                 -> number of episodes
#   a line with a single field -> number of timesteps of the episode that follows
#   a line "St,At,Rt,pib"      -> one timestep of the current episode
# Every episode becomes a (num_time_steps, 4) float array whose columns are
# [St, At, Rt, pib]; the arrays are collected in `histories`.
num_episodes = -1
histories = []
with open("data.csv") as file:
    cur_episode = -1
    cur_timestep = -1
    for idx, line in enumerate(file):
        if idx == 0:
            num_episodes = int(line)
            continue

        line = line.strip()
        if not line:
            # A blank line (typically a trailing newline at the end of the
            # file) carries no data; without this guard int() would raise.
            continue

        data = line.split(",")
        if len(data) == 1:
            # Episode header: allocate storage for the timesteps that follow.
            num_time_steps = int(line)
            cur_episode += 1
            cur_timestep = 0
            histories.append(np.zeros((num_time_steps, 4)))
            continue

        St, At, Rt, pib = data
        # State and action are parsed as ints, reward and pib as floats; the
        # row itself is stored as floats because the array dtype is float.
        histories[cur_episode][cur_timestep] = (int(St), int(At), float(Rt), float(pib))
        cur_timestep += 1
        
        

In [52]:
# Move 30% of the episodes (taken from the end of `histories`, last episode
# first) into the held-out list `s_histories`.
# NOTE: this pops from `histories` in place, so re-running the cell moves
# another 30% of whatever is left.
num_held_out = int(len(histories) * 0.3)
s_histories = [histories.pop() for _ in range(num_held_out)]

In [56]:
# Report how many episodes remain in `histories` and how many were held out.
print(len(histories), len(s_histories), sep="\n")

700000
300000


In [4]:
def SanityCheckConsistentPolicy():
    """Print the index of every episode with an inconsistent column 3 per state.

    For each episode in the global `histories` and each state in
    `range(num_states)`, the rows whose column 0 equals that state are
    collected. If their column-3 values are not all equal to the first one,
    the episode index is printed (once per offending state, so the same index
    can be printed several times). States that do not occur are skipped.
    """
    for ep_idx, episode in enumerate(histories):
        visited_states = episode[:, 0]
        recorded_probs = episode[:, 3]
        for s in range(num_states):
            probs_in_state = recorded_probs[visited_states == s]
            if probs_in_state.size and np.any(probs_in_state != probs_in_state[0]):
                print(ep_idx)

In [9]:
def GetPolicy(ep_num):
    """Build a (num_states, num_actions) table from one recorded episode.

    Args:
        ep_num: index into the global `histories` list.

    Returns:
        An array in which entry [s, a] holds the column-3 value of the first
        row of the episode whose column 0 is `s` and column 1 is `a`. Pairs
        that never occur in the episode keep the value 0.
    """
    episode = histories[ep_num]
    states, actions, probs = episode[:, 0], episode[:, 1], episode[:, 3]

    table = np.zeros((num_states, num_actions))
    # np.ndindex walks (state, action) pairs in row-major order.
    for s, a in np.ndindex(num_states, num_actions):
        matches = probs[(states == s) & (actions == a)]
        if matches.size > 0:
            table[s, a] = matches[0]

    return table
    

In [33]:
# Merge per-episode tables from GetPolicy into a single table `cur_policy`.
cur_policy = np.zeros((num_states, num_actions))

# Only the first 10 episodes are scanned here; loop over
# range(len(histories)) instead to use every episode.
for ep in range(10):
    temp_p = GetPolicy(ep)
    # Fill only the entries that are still unset (0) in the merged table and
    # that this episode actually provides (non-zero); earlier episodes win.
    fillable = (cur_policy == 0) & (temp_p != 0)
    cur_policy[fillable] = temp_p[fillable]



In [46]:
def ImportanceSampling(ep_num, new_policy):
    """Importance-sampling estimate of the discounted return of one episode.

    Args:
        ep_num: index into the global `histories` list.
        new_policy: table indexed as new_policy[state, action].

    Returns:
        The product over all timesteps of
        new_policy[St, At] / cur_policy[St, At], multiplied by the discounted
        return sum_t gamma ** t * Rt of the episode.

    The denominator comes from the global `cur_policy`; column 3 of the
    episode is not used here. A zero entry in `cur_policy` for a visited
    (state, action) pair makes the ratio non-finite (assuming NumPy arrays).
    """
    episode = histories[ep_num]
    weight = 1
    discounted_return = 0
    for step, row in enumerate(episode):
        s = int(row[0])
        a = int(row[1])
        reward = row[2]
        weight = weight * (new_policy[s, a] / cur_policy[s, a])
        discounted_return = discounted_return + (gamma ** step) * reward
    return weight * discounted_return

def PDImportanceSampling(ep_num, new_policy):
    """Per-decision importance-sampling estimate for one episode.

    Args:
        ep_num: index into the global `histories` list.
        new_policy: table indexed as new_policy[state, action].

    Returns:
        sum_t gamma ** t * w_t * Rt, where w_t is the product of
        new_policy[Sj, Aj] / cur_policy[Sj, Aj] over timesteps j = 0..t.
        An episode with no timesteps yields 0.

    The denominator comes from the global `cur_policy`; column 3 of the
    episode is not used here. A zero entry in `cur_policy` for a visited
    (state, action) pair makes the ratio non-finite (assuming NumPy arrays).
    """
    traj = histories[ep_num]

    result = 0
    # w_t only differs from w_{t-1} by the ratio of step t, so it is carried
    # forward as a running product instead of being rebuilt from step 0 at
    # every timestep. The multiplications happen in the same order as before,
    # so the value is unchanged while the cost drops from O(T^2) to O(T).
    is_weight = 1
    for t in range(traj.shape[0]):
        St, At, Rt, _ = traj[t]
        St = int(St)
        At = int(At)
        is_weight *= new_policy[St, At] / cur_policy[St, At]
        result += (gamma ** t) * is_weight * Rt
    return result

In [60]:
def _EpisodeEstimates(new_policy, ISFunc):
    """Evaluate ISFunc(ep, new_policy) exactly once per episode in `histories`."""
    return [ISFunc(ep, new_policy) for ep in range(len(histories))]


def _Mean(estimates):
    """Arithmetic mean of the estimates (ZeroDivisionError if there are none)."""
    return sum(estimates) / len(estimates)


def _SampleStdDev(estimates, avg):
    """Sample standard deviation (n - 1 denominator) of the estimates around avg."""
    total = sum((estimate - avg) ** 2 for estimate in estimates)
    return np.sqrt((1 / (len(estimates) - 1)) * total)


def CalcAvgIS(new_policy, ISFunc):
    """Mean of ISFunc(ep, new_policy) over every episode in the global `histories`.

    Args:
        new_policy: passed through unchanged to ISFunc.
        ISFunc: callable taking (episode_index, new_policy) and returning a number.

    Raises:
        ZeroDivisionError: if `histories` is empty.
    """
    return _Mean(_EpisodeEstimates(new_policy, ISFunc))


def CalcStdDev(new_policy, ISFunc):
    """Sample standard deviation of ISFunc(ep, new_policy) over `histories`.

    The per-episode estimates are computed once and reused for both the mean
    and the squared deviations, so ISFunc runs once per episode.

    Raises:
        ZeroDivisionError: if `histories` holds fewer than two episodes.
    """
    estimates = _EpisodeEstimates(new_policy, ISFunc)
    return _SampleStdDev(estimates, _Mean(estimates))


def TTest(new_policy, ISFunc, delta, num_safety):
    """One-sided t-based lower bound on the mean importance-sampling estimate.

    Args:
        new_policy: passed through unchanged to ISFunc.
        ISFunc: callable taking (episode_index, new_policy) and returning a number.
        delta: the bound uses the (1 - delta) quantile of the t distribution.
        num_safety: sample size used for the degrees of freedom
            (num_safety - 1) and for the sqrt(num_safety) scaling.

    Returns:
        mean - (std_dev / sqrt(num_safety)) * t_value, where mean and std_dev
        are computed over the episodes in the global `histories`.

    ISFunc is evaluated once per episode; the same estimates feed both the
    mean and the standard deviation.
    """
    t_value = stats.t.ppf(1 - delta, num_safety - 1)
    estimates = _EpisodeEstimates(new_policy, ISFunc)
    avgIS = _Mean(estimates)
    std_dev = _SampleStdDev(estimates, avgIS)

    return avgIS - (std_dev / np.sqrt(num_safety)) * t_value
    

In [62]:
# Lower bound for `cur_policy` itself under per-decision importance sampling,
# with delta = 0.1 and the size of the held-out set as the sample size.
result = TTest(
    new_policy=cur_policy,
    ISFunc=PDImportanceSampling,
    delta=0.1,
    num_safety=len(s_histories),
)

KeyboardInterrupt: 

In [63]:
import cupy as cp

ModuleNotFoundError: No module named 'cupy'