In [1]:
import torch
import torch.nn.functional as F
from torch.distributions.uniform import Uniform

In [2]:
# simple hack to support import module from parent directory
import sys
sys.path.append('../')

from instruct_llama.utils import RunningMeanStd


In [7]:
# torch.range is deprecated (and, unlike Python's range, includes its end point);
# torch.linspace builds the same six evenly spaced values without the warning.
a = torch.linspace(0.1, 0.6, steps=6)
b = torch.linspace(0.4, 0.9, steps=6)

# log(a / b) and log(a) - log(b) are the same quantity; print both to compare.
print(torch.log(a/b))
print(torch.log(a) - torch.log(b))

# Swapping the operands flips the sign of the log-ratio.
print(torch.log(b/a))
print(torch.log(b) - torch.log(a))

tensor([-1.3863, -0.9163, -0.6931, -0.5596, -0.4700, -0.4055])
tensor([-1.3863, -0.9163, -0.6931, -0.5596, -0.4700, -0.4055])
tensor([1.3863, 0.9163, 0.6931, 0.5596, 0.4700, 0.4055])
tensor([1.3863, 0.9163, 0.6931, 0.5596, 0.4700, 0.4055])


  a = torch.range(0.1, 0.6, 0.1)
  b = torch.range(0.4, 0.9, 0.1)


In [4]:
# Two 1-D tensors of unequal length
tensor1 = torch.tensor([1, 2, 3])
tensor2 = torch.tensor([4, 5, 6, 7, 8])

# How many elements tensor1 is short of tensor2 (non-positive if it is not shorter)
length_diff = tensor2.size(0) - tensor1.size(0)

# Right-pad tensor1 with zeros only when it is the shorter tensor;
# otherwise it is passed through untouched.
padded_tensor1 = F.pad(tensor1, (0, length_diff), value=0) if length_diff > 0 else tensor1

print(padded_tensor1)

tensor([1, 2, 3, 0, 0])


In [9]:
def masked_mean(tensor: torch.Tensor, mask: torch.Tensor, dim: int = 1) -> torch.Tensor:
    """Mean of `tensor` along `dim`, weighted by `mask`.

    Entries are multiplied by `mask` before summing, so positions where the
    mask is 0 contribute nothing; the sum is divided by the sum of the mask.

    Args:
        tensor: Values to average.
        mask: Weights multiplied element-wise with `tensor`.
        dim: Dimension that is reduced.

    Returns:
        The masked mean with `dim` removed. Where the mask sums to zero (or
        less) the divisor is replaced by 1e-8.
    """
    weighted_total = (tensor * mask).sum(dim=dim)
    weight_total = mask.sum(dim=dim)

    # Guard against dividing by zero when nothing is selected along `dim`.
    divisor = torch.where(weight_total <= 0, 1e-8, weight_total)

    return weighted_total / divisor


def masked_sum(tensor: torch.Tensor, mask: torch.Tensor, dim: int = 1) -> torch.Tensor:
    """Sum of `tensor * mask` along `dim`.

    Args:
        tensor: Values to add up.
        mask: Weights multiplied element-wise with `tensor` (0 drops an entry).
        dim: Dimension that is reduced.

    Returns:
        The masked sum with `dim` removed.
    """
    return (tensor * mask).sum(dim=dim)


def masked_whiten(
    tensor: torch.Tensor, mask: torch.Tensor, dim: int = 1, eps: float = 1e-8, shift_mean: bool = True
) -> torch.Tensor:
    """Whiten `tensor` along `dim` using statistics of the unmasked entries.

    The mean and (biased) variance are computed with `masked_mean`, i.e. only
    positions with a non-zero mask contribute to them. The input is first
    multiplied by `mask`, so masked positions enter the centering step as 0
    and come out as `-mean / std` rather than 0.

    Args:
        tensor: Values to whiten.
        mask: Weights multiplied element-wise with `tensor`.
        dim: Dimension over which the statistics are computed.
        eps: Lower bound applied to the variance before taking `rsqrt`.
        shift_mean: When False, the mean is added back after scaling.

    Returns:
        A tensor with the same shape as `tensor * mask`.
    """
    tensor = tensor * mask
    mean = masked_mean(tensor, mask, dim=dim)

    # Re-insert the reduced axis at `dim` (not at a hard-coded axis 1) so the
    # statistics broadcast correctly against `tensor` for any choice of `dim`.
    if len(tensor.shape) > len(mean.shape):
        mean = mean.unsqueeze(dim)

    mean_centered = tensor - mean
    var = masked_mean(mean_centered**2, mask, dim=dim)
    if len(tensor.shape) > len(var.shape):
        var = var.unsqueeze(dim)

    # Clamp before rsqrt so an all-masked (zero-variance) slice cannot divide by zero.
    whitened = mean_centered * var.clamp(min=eps).rsqrt()

    if not shift_mean:
        whitened += mean

    return whitened


In [4]:
# Draw a (32, 100) batch of rewards from a uniform distribution between 1 and 5.
rewards = Uniform(1, 5).sample((32, 100))
# Mask is 0 wherever the reward exceeds 4 and 1 everywhere else.
mask = torch.where(rewards > 4, 0, 1)

# masked_mean reduces over dim=1 by default, giving one mean per row.
mean = masked_mean(rewards, mask)
print(mean.shape)

torch.Size([32])


In [10]:
# Whiten whichever `rewards` / `mask` are currently bound in the kernel (default dim=1).
print(masked_whiten(rewards, mask))

tensor([[-1.6464, -1.2026,  0.2889,  ...,  0.9180, -0.2109, -1.2180],
        [-1.2525,  1.0659,  0.3704,  ..., -1.5527,  1.3320, -2.9028],
        [-3.1446, -1.2506,  0.5387,  ..., -3.1446, -3.1446, -3.1446],
        ...,
        [-2.8881,  0.5655, -0.8311,  ...,  1.2368, -2.8881,  1.0424],
        [ 0.0600, -2.9786,  0.3721,  ..., -1.6684, -1.1566,  0.3989],
        [-0.5544,  0.6849, -3.1889,  ..., -3.1889,  0.5617, -1.3843]])


In [6]:
def find_pattern_index(lst, pattern):
    """Locate the first occurrence of `pattern` as a contiguous run inside `lst`.

    Args:
        lst: Sequence to search; slices of it are compared to `pattern` with `==`.
        pattern: Sub-sequence to look for.

    Returns:
        The index in `lst` where the first match starts, or -1 when there is none.
    """
    width = len(pattern)
    last_start = len(lst) - width

    # Slide a window of len(pattern) over lst; the first equal window wins.
    hits = (start for start in range(last_start + 1) if lst[start:start + width] == pattern)
    return next(hits, -1)

# Sample id list and the sub-sequence to search for
my_list = [123, 456, 331, 518, 25580, 29962, 789, 518, 25580, 29962]
my_pattern = [518, 25580, 29962]

# Where does the pattern first start?
index = find_pattern_index(my_list, my_pattern)

# -1 is the "no match" sentinel returned by find_pattern_index
if index == -1:
    print("Pattern not found.")
else:
    print("Pattern found at index:", index)

Pattern found at index: 3


In [8]:
# Create the normalizer with its default arguments (RunningMeanStd comes from instruct_llama.utils).
normalizer = RunningMeanStd()

In [9]:
# Push ten random reward batches of size 32 through the normalizer.
for i in range(10):
    rewards = Uniform(1, 5).sample((32,))
    # normalize() runs before update(), so presumably the batch is scaled with
    # statistics from earlier batches only — confirm against RunningMeanStd.
    rewards_normed = normalizer.normalize(rewards)

    print(rewards)
    print(rewards_normed)
    print('-'*80)
    # Only now is the current batch handed to the normalizer's update().
    normalizer.update(rewards)

tensor([2.3305, 4.2210, 3.0239, 1.5279, 2.9952, 4.6518, 4.1763, 3.7775, 3.2323,
        3.7950, 1.6351, 2.1434, 2.0246, 2.2833, 2.7809, 4.1443, 1.5020, 4.1390,
        3.3725, 4.0163, 3.6551, 3.6362, 3.1837, 1.4609, 1.1217, 4.0624, 1.6084,
        3.0058, 1.8046, 4.4765, 1.0956, 2.8427])
tensor([2.3305, 4.2210, 3.0239, 1.5279, 2.9952, 4.6518, 4.1763, 3.7775, 3.2323,
        3.7950, 1.6351, 2.1434, 2.0246, 2.2833, 2.7809, 4.1443, 1.5020, 4.1390,
        3.3725, 4.0163, 3.6551, 3.6362, 3.1837, 1.4609, 1.1217, 4.0624, 1.6084,
        3.0058, 1.8046, 4.4765, 1.0956, 2.8427])
--------------------------------------------------------------------------------
tensor([2.5978, 3.3823, 3.3522, 1.5136, 3.6318, 4.8554, 1.6120, 1.1887, 2.0239,
        2.0285, 3.2207, 1.5096, 3.1732, 1.4625, 3.8015, 3.5378, 3.9452, 2.4192,
        3.6837, 1.1092, 3.4495, 2.7614, 4.3623, 1.1907, 3.7079, 4.1499, 2.5371,
        3.7045, 4.5393, 3.6307, 3.6911, 3.8957])
tensor([-0.1060,  0.1451,  0.1355, -0.4531,  0.2250,

In [17]:
# Settings read as globals by the sampling and return/advantage cells that follow.
max_len = 32  # number of columns in the sampled `values` / `rewards`
response_length=32  # number of steps compute_returns_advantages_openai iterates over
gamma = 1.0  # multiplier applied to the next-step value (discount)
gae_lambda = 0.95  # multiplier applied, together with gamma, to the running advantage

In [20]:
# Random value estimates and rewards, each of shape (2, max_len).
values = Uniform(-2, 2).sample((2, max_len))
rewards = Uniform(-1, 1).sample((2, max_len))
print(values)
print(rewards)

tensor([[-0.9976, -1.9505,  0.3911,  0.5321, -0.8263, -0.8723, -0.4773,  0.9948,
         -0.8882,  0.0343, -0.8034,  1.3666,  0.5070,  0.1407,  0.2962,  1.2644,
          1.2184,  0.3032,  0.2709,  0.9851, -1.0642,  1.8545,  0.5080,  1.0328,
         -0.0914, -1.0718, -0.6253, -1.9257, -0.7694,  1.5111, -1.4778, -0.0999],
        [ 0.0704,  1.2708,  1.3660, -1.4567, -1.7659,  0.6596,  0.1654,  1.8031,
          1.3858, -0.3607,  0.9536,  0.3245,  0.0842, -1.5950, -0.8751, -1.8475,
         -0.7938, -0.7384,  1.8661,  0.4114,  0.9599, -1.1225, -1.6356,  0.5733,
          1.7450, -1.6083,  0.8860, -1.0427,  1.3411,  1.1768,  0.7282, -1.8170]])
tensor([[-0.5760, -0.3617, -0.7945, -0.8109, -0.8843, -0.1325,  0.7195, -0.8992,
          0.2091, -0.4879,  0.8301,  0.0372,  0.1386, -0.9673,  0.1793, -0.9158,
         -0.8970, -0.1553,  0.0554,  0.7365,  0.5238,  0.3480, -0.7212,  0.2770,
          0.8277,  0.6249,  0.2147,  0.0265,  0.0330,  0.3872,  0.2065, -0.0400],
        [-0.5509, -0.130

In [11]:
def compute_returns_advantages_openai(rewards, values):
    """Backward recursion for advantages and returns over `response_length` steps.

    Reads the module-level globals `response_length`, `gamma` and `gae_lambda`.
    Both inputs are indexed as `[:, t]`, i.e. one time step per column.

    Args:
        rewards: Per-step rewards.
        values: Per-step value estimates.

    Returns:
        `(returns, advantages)`, where `returns = advantages + values`.
    """
    horizon = response_length
    running_advantage = 0
    collected = []

    # Walk from the last step back to the first.
    for step in range(horizon - 1, -1, -1):
        # The final step has no successor, so its next value is 0.
        next_value = 0.0 if step >= horizon - 1 else values[:, step + 1]
        td_error = rewards[:, step] + gamma * next_value - values[:, step]
        running_advantage = td_error + gamma * gae_lambda * running_advantage
        collected.append(running_advantage)

    # Entries were gathered last-to-first; restore chronological order.
    collected.reverse()
    advantages = torch.stack(collected, dim=1)

    return advantages + values, advantages

In [21]:
# Run compute_returns_advantages_openai on the currently bound rewards and values.
returns, advantages = compute_returns_advantages_openai(rewards, values)
print(returns)
print(advantages)

tensor([[-3.0821, -2.5354, -2.3086, -1.6218, -0.8101,  0.1241,  0.2952, -0.4990,
          0.4680,  0.2707,  0.8408, -0.0607, -0.1297, -0.2899,  0.6974,  0.4789,
          1.4039,  2.4060,  2.6819,  2.7130,  2.1365,  1.6000,  1.2911,  2.0639,
          1.8857,  1.1701,  0.6068,  0.5141,  0.5537,  0.4686,  0.1635, -0.0400],
        [ 2.5998,  3.2496,  3.4855,  3.2318,  3.7110,  3.1253,  3.0992,  2.2227,
          1.5220,  1.8387,  0.9354,  1.6541,  1.4918,  0.6452,  1.3835,  1.1005,
          0.2335,  1.3172,  0.4781,  1.4135,  1.1548,  0.5110,  1.2928,  1.4081,
          2.4366,  1.8509,  0.9607,  1.4270,  0.3798,  0.8345, -0.0089,  0.0988]])
tensor([[-2.0845, -0.5849, -2.6997, -2.1539,  0.0163,  0.9964,  0.7725, -1.4938,
          1.3562,  0.2364,  1.6442, -1.4273, -0.6367, -0.4306,  0.4012, -0.7855,
          0.1855,  2.1029,  2.4110,  1.7279,  3.2007, -0.2545,  0.7832,  1.0310,
          1.9772,  2.2420,  1.2322,  2.4398,  1.3231, -1.0425,  1.6413,  0.0599],
        [ 2.5294,  1.978

In [23]:
def compute_returns_and_advantages(rewards, values, values_tp1, done_tp1):
    """Advantages and returns for 2-D (row, step) inputs via a backward recursion.

    Reads the module-level globals `gamma` and `gae_lambda`.

    Args:
        rewards: Per-step rewards.
        values: Per-step value estimates; must be 2-D.
        values_tp1: Value estimates of the following step.
        done_tp1: Boolean tensor; True zeroes the discount for that step, which
            cuts both the bootstrap term and the carried-over advantage.

    Returns:
        `(returns, advantages)` as float32 tensors, with
        `returns = advantages + values`.
    """
    # TODO investigate whether a special normalization process is needed, since
    # the rewards and state values contain lots of 0s.

    _, num_steps = values.shape
    step_discount = (~done_tp1).float() * gamma

    # Per-element factor applied to the advantage carried over from the next step.
    carry = step_discount * (torch.ones_like(step_discount) * gae_lambda)
    td_error = rewards + step_discount * values_tp1 - values

    advantages = torch.zeros_like(td_error, dtype=torch.float)

    # Every row follows the same recursion, so all rows are advanced together,
    # one column at a time, from the last step to the first.
    running = 0
    for col in range(num_steps - 1, -1, -1):
        running = td_error[:, col] + carry[:, col] * running
        advantages[:, col] = running

    returns = torch.zeros_like(advantages, dtype=torch.float)
    returns.copy_(advantages + values)

    return returns, advantages

In [24]:
# Flag only the final column of every row as done.
dones = torch.zeros_like(values, dtype=torch.bool)
dones[:, -1] = True
# Build the "t+1" tensors by shifting one column to the left. The last column
# has nothing to its right, so it keeps next-value 0 and done flag True.
values_tp1 = torch.zeros_like(values)
done_tp1 = torch.ones_like(values, dtype=torch.bool)
values_tp1[:, :-1] = values[:, 1:]
# NOTE(review): after this shift done_tp1[:, -2] is True as well (it copies
# dones[:, -1]), so the second-to-last step does not bootstrap from the last
# value. The recorded output differs from compute_returns_advantages_openai
# for that reason — confirm which convention is intended.
done_tp1[:, :-1] = dones[:, 1:]

returns, advantages = compute_returns_and_advantages(rewards, values, values_tp1, done_tp1)
print(returns)
print(advantages)

tensor([[-3.0729, -2.5257, -2.2984, -1.6110, -0.7987,  0.1360,  0.3078, -0.4858,
          0.4819,  0.2854,  0.8562, -0.0445, -0.1126, -0.2719,  0.7164,  0.4988,
          1.4249,  2.4281,  2.7052,  2.7374,  2.1623,  1.6271,  1.3197,  2.0939,
          1.9174,  1.2034,  0.6419,  0.5510,  0.5925,  0.5095,  0.2065, -0.0400],
        [ 2.5991,  3.2489,  3.4848,  3.2311,  3.7102,  3.1244,  3.0984,  2.2217,
          1.5211,  1.8377,  0.9344,  1.6530,  1.4906,  0.6439,  1.3822,  1.0992,
          0.2321,  1.3156,  0.4765,  1.4118,  1.1530,  0.5091,  1.2908,  1.4060,
          2.4344,  1.8486,  0.9583,  1.4245,  0.3771,  0.8316, -0.0119,  0.0988]])
tensor([[-2.0753, -0.5751, -2.6895, -2.1431,  0.0276,  1.0084,  0.7850, -1.4806,
          1.3701,  0.2511,  1.6596, -1.4111, -0.6197, -0.4126,  0.4201, -0.7656,
          0.2065,  2.1250,  2.4343,  1.7523,  3.2265, -0.2274,  0.8117,  1.0611,
          2.0088,  2.2753,  1.2672,  2.4767,  1.3619, -1.0017,  1.6843,  0.0599],
        [ 2.5287,  1.978

In [6]:
# Token ids 1..10 as an integer tensor
tokens = torch.arange(1, 11)
print(tokens)

# Matching all-False done flags, created directly as booleans
dones = torch.zeros_like(tokens, dtype=torch.bool)
print(dones)

tensor([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])
tensor([False, False, False, False, False, False, False, False, False, False])


In [7]:
# Boundaries used for the slice assignments below
start = 4
end = 7

# Templates shaped like `tokens` (all 0 / all 1) and an all-False flag tensor shaped like `dones`
masked_values = torch.zeros_like(tokens)
masked_returns = torch.ones_like(tokens)
masked_dones = torch.zeros_like(dones, dtype=torch.bool)

# Ones on indices start-1 .. end-1
masked_values[start - 1:end] = 1
# Zeros from index end onward
masked_returns[end:] = 0.0
# True from index end-1 onward
masked_dones[end - 1:] = True

print(masked_values)
print(masked_returns)
print(masked_dones)

tensor([0, 0, 0, 1, 1, 1, 1, 0, 0, 0])
tensor([1, 1, 1, 1, 1, 1, 1, 0, 0, 0])
tensor([False, False, False, False, False, False,  True,  True,  True,  True])


In [8]:
# Overwrite every token from index `end` onward with -1 (modifies `tokens` in place).
tokens[end:] = -1
print(tokens)

tensor([ 1,  2,  3,  4,  5,  6,  7, -1, -1, -1])


In [2]:
def whiten_with_mask(values, mask, shift_mean=True, eps=1e-8):
    """Whiten `values` with the mean and variance of its mask-selected entries.

    One scalar mean and one scalar variance are computed over the entries of
    `values` where `mask` is True. Every entry of `values`, selected or not, is
    then shifted and scaled by those two statistics.

    Args:
        values: Tensor to whiten.
        mask: Boolean tensor marking which entries feed the statistics.
        shift_mean: When False, the mean is added back after scaling.
        eps: Small constant added to the variance before `rsqrt`.

    Returns:
        The whitened tensor, same shape as `values`.
    """
    selected = torch.masked_select(values, mask)
    mean = torch.mean(selected)
    # torch.var applies Bessel's correction by default; unbiased=False asks for
    # the population (biased) variance instead.
    var = torch.var(selected, unbiased=False)

    whitened = (values - mean) * torch.rsqrt(var + eps)
    if not shift_mean:
        whitened += mean
    return whitened

# Example usage: only four of the six entries are selected by the mask, so the
# mean and variance come from those four values alone.
values = torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
mask = torch.tensor([[True, False, True], [False, True, True]])
whitened = whiten_with_mask(values, mask, shift_mean=True)
print(whitened)

tensor([[-1.4321, -0.9113, -0.3906],
        [ 0.1302,  0.6509,  1.1717]])


In [3]:
# Example usage: the mask selects every entry, so the statistics cover the whole tensor.
values = torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
mask = torch.tensor([[True, True, True], [True, True, True]])
whitened = whiten_with_mask(values, mask, shift_mean=True)
print(whitened)

tensor([[-1.4639, -0.8783, -0.2928],
        [ 0.2928,  0.8783,  1.4639]])


In [5]:
# Three boolean rows with 4, 3 and 6 True entries respectively
mask = torch.tensor(
    [
        [False, False, False, False, True, True, True, True, False, False],
        [False, False, False, False, False, True, True, True, False, False],
        [False, False, False, False, True, True, True, True, True, True],
    ],
    dtype=bool,
)

# Booleans become 0.0 / 1.0 so they can be summed
mask_float = mask.float()

# Number of True entries in each row (dim=1 is assumed to be the per-episode axis)
sum_mask_values = mask_float.sum(dim=1)

# Rows that have at least four True entries
condition_met = sum_mask_values >= 4

# How many rows satisfy the condition
print(condition_met.sum().item())

2


In [6]:
def find_pattern_index(tensor, pattern):
    """Locate the first occurrence of `pattern` as a contiguous slice of `tensor`.

    Both arguments are sized with `.size(0)` and sliced along their first
    dimension; windows are compared with `torch.equal`.

    Args:
        tensor: Tensor to search.
        pattern: Tensor to look for.

    Returns:
        The start index of the first matching window, or -1 when none matches.
    """
    width = pattern.size(0)
    candidate_starts = range(tensor.size(0) - width + 1)

    # Lazily test each window; stop at the first one equal to the pattern.
    hits = (s for s in candidate_starts if torch.equal(tensor[s:s + width], pattern))
    return next(hits, -1)

# Sample id tensor and the sub-sequence to search for
my_tensor = torch.tensor([123, 456, 331, 518, 25580, 29962, 789, 518, 25580, 29962])
my_pattern = torch.tensor([518, 25580, 29962])

# Where does the pattern first start?
index = find_pattern_index(my_tensor, my_pattern)

# -1 is the "no match" sentinel returned by find_pattern_index
if index == -1:
    print("Pattern not found.")
else:
    print("Pattern found at index:", index)

Pattern found at index: 3


In [8]:
def find_pattern_index(tensor, pattern, start_index=0):
    """Locate the first occurrence of `pattern` in `tensor` at or after `start_index`.

    Both tensors are sized with `.size(0)` and sliced along their first
    dimension; windows are compared with `torch.equal`.

    Args:
        tensor: Tensor to search.
        pattern: Tensor to look for.
        start_index: First window start position that is considered.

    Returns:
        The start index of the first matching window, or -1 when none matches.
    """
    width = pattern.size(0)
    stop = tensor.size(0) - width + 1

    # Lazily test each window from start_index on; stop at the first match.
    hits = (s for s in range(start_index, stop) if torch.equal(tensor[s:s + width], pattern))
    return next(hits, -1)

# Example tensor and pattern (the pattern occurs at indices 3, 6 and 10)
my_tensor = torch.tensor([123, 456, 331, 518, 25580, 29962, 518, 25580, 29962, 789, 518, 25580, 29962])
my_pattern = torch.tensor([518, 25580, 29962])

# Find the index of the first occurrence of the pattern starting from index 7,
# which skips the two earlier occurrences
start_index = 7
index = find_pattern_index(my_tensor, my_pattern, start_index)

if index != -1:
    print("Pattern found at index:", index)
else:
    print("Pattern not found.")

Pattern found at index: 10


In [12]:
# Globals read by the compute_returns_and_advantages definitions and the inline GAE cell below.
discount = 1.0  # multiplier applied to the next-step value
gae_lambda = 0.95  # multiplier applied, together with the discount, to the running advantage


def compute_returns_and_advantages(v_t, r_t, v_tp1, done_tp1):
    """Advantages and returns via a backward recursion over the first dimension.

    Reads the module-level globals `discount` and `gae_lambda`.

    Args:
        v_t: Value estimates per step.
        r_t: Rewards per step.
        v_tp1: Value estimates of the following step.
        done_tp1: Boolean tensor; True zeroes the discount for that step.

    Returns:
        `(return_t, advantage_t)`, where `return_t = advantage_t + v_t`.
    """
    step_discount = (~done_tp1).float() * discount

    # Broadcast the scalar lambda to the shape of the discounts.
    step_lambda = torch.ones_like(step_discount) * gae_lambda

    td_error = r_t + step_discount * v_tp1 - v_t

    advantage_t = torch.zeros_like(td_error, dtype=torch.float32)

    # Accumulate from the last index back to the first.
    running = 0
    for idx in range(len(td_error) - 1, -1, -1):
        running = td_error[idx] + step_discount[idx] * step_lambda[idx] * running
        advantage_t[idx] = running

    return advantage_t + v_t, advantage_t

In [14]:
def compute_returns_and_advantages(v_t, r_t, v_tp1, done_tp1):
    """Advantages and returns for 2-D (row, step) inputs via a backward recursion.

    Reads the module-level globals `discount` and `gae_lambda`.

    Args:
        v_t: Value estimates per step; must be 2-D.
        r_t: Rewards per step.
        v_tp1: Value estimates of the following step.
        done_tp1: Boolean tensor; True zeroes the discount for that step, which
            cuts both the bootstrap term and the carried-over advantage.

    Returns:
        `(return_t, advantage_t)` as float32 tensors, with
        `return_t = advantage_t + v_t`.
    """
    _, num_steps = v_t.shape
    step_discount = (~done_tp1).float() * discount

    # Per-element factor applied to the advantage carried over from the next step.
    carry = step_discount * (torch.ones_like(step_discount) * gae_lambda)

    td_error = r_t + step_discount * v_tp1 - v_t

    advantage_t = torch.zeros_like(td_error, dtype=torch.float32)

    # Every row follows the same recursion, so all rows are advanced together,
    # one column at a time, from the last step to the first.
    running = 0
    for col in range(num_steps - 1, -1, -1):
        running = td_error[:, col] + carry[:, col] * running
        advantage_t[:, col] = running

    return_t = torch.zeros_like(advantage_t, dtype=torch.float32)
    return_t.copy_(advantage_t + v_t)

    return return_t, advantage_t

In [17]:
# Toy batch: 3 rows x 10 steps, reward 0.02 everywhere plus an extra 0.5 at step 5.
rewards = torch.ones((3, 10)) * 0.02
rewards[:, 5] += 0.5
# Current and next-step value estimates are all zero.
values = torch.zeros((3, 10))
values_tp1 = torch.zeros((3, 10))
# Flag step 5 and every later step as done.
dones = torch.zeros((3, 10))
dones[:, 5:] = 1

dones = dones.to(dtype=torch.bool)

# Arguments are passed as (values, rewards, values_tp1, dones), i.e. `dones`
# fills the done_tp1 slot.
# NOTE(review): compute_returns_and_advantages is defined more than once in
# this notebook with different parameter orders — make sure the
# (v_t, r_t, v_tp1, done_tp1) definition is the one bound when this cell runs.
returns, advantages = compute_returns_and_advantages(values, rewards, values_tp1, dones)

print(returns)
print(advantages)

tensor([[0.4929, 0.4977, 0.5029, 0.5083, 0.5140, 0.5200, 0.0200, 0.0200, 0.0200,
         0.0200],
        [0.4929, 0.4977, 0.5029, 0.5083, 0.5140, 0.5200, 0.0200, 0.0200, 0.0200,
         0.0200],
        [0.4929, 0.4977, 0.5029, 0.5083, 0.5140, 0.5200, 0.0200, 0.0200, 0.0200,
         0.0200]])
tensor([[0.4929, 0.4977, 0.5029, 0.5083, 0.5140, 0.5200, 0.0200, 0.0200, 0.0200,
         0.0200],
        [0.4929, 0.4977, 0.5029, 0.5083, 0.5140, 0.5200, 0.0200, 0.0200, 0.0200,
         0.0200],
        [0.4929, 0.4977, 0.5029, 0.5083, 0.5140, 0.5200, 0.0200, 0.0200, 0.0200,
         0.0200]])


In [18]:
# Inline backward advantage recursion over 10 steps. It uses the global
# `discount` / `gae_lambda` and takes no done flags, so nothing cuts the
# accumulation.
lastgaelam = 0
advantages_reversed = []
gen_length = 10
for t in reversed(range(gen_length)):
    # The final step has no successor, so its next value is 0.
    nextvalues = values[:, t + 1] if t < gen_length - 1 else 0.0
    delta = rewards[:, t] + discount * nextvalues - values[:, t]
    lastgaelam = delta + discount * gae_lambda * lastgaelam
    advantages_reversed.append(lastgaelam)
# Entries were collected last-to-first; reverse them and stack along dim=1.
advantages = torch.stack(advantages_reversed[::-1], dim=1)
returns = advantages + values


print(returns)
print(advantages)

tensor([[0.5474, 0.5552, 0.5633, 0.5719, 0.5810, 0.5905, 0.0742, 0.0570, 0.0390,
         0.0200],
        [0.5474, 0.5552, 0.5633, 0.5719, 0.5810, 0.5905, 0.0742, 0.0570, 0.0390,
         0.0200],
        [0.5474, 0.5552, 0.5633, 0.5719, 0.5810, 0.5905, 0.0742, 0.0570, 0.0390,
         0.0200]])
tensor([[0.5474, 0.5552, 0.5633, 0.5719, 0.5810, 0.5905, 0.0742, 0.0570, 0.0390,
         0.0200],
        [0.5474, 0.5552, 0.5633, 0.5719, 0.5810, 0.5905, 0.0742, 0.0570, 0.0390,
         0.0200],
        [0.5474, 0.5552, 0.5633, 0.5719, 0.5810, 0.5905, 0.0742, 0.0570, 0.0390,
         0.0200]])


In [4]:
a  = [0, 1, 2, 3, 4, 5]

len_a = len(a)

# The slice stop is one past the list length; Python clamps it instead of
# raising, so this prints the same elements as a[2:].
print(a[2:len_a+1])

[2, 3, 4, 5]
