fix an error with the way action log prob is collected during the episode rollouts, addressing #31 and thanks to @kisseternity
lucidrains · Feb 22, 2023 · a159310
1 parent bfcffe7
commit a159310
Show file tree

Hide file tree

Showing 2 changed files with 6 additions and 3 deletions.
diff --git a/palm_rlhf_pytorch/ppo.py b/palm_rlhf_pytorch/ppo.py
@@ -250,6 +250,7 @@ def log(t, eps = 1e-20):
     return torch.log(t.clamp(min = eps))
 
 def log_prob(prob, indices):
+    assert prob.shape[:2] == indices.shape, f'preceding shapes of prob {prob.shape[:2]} and indices {indices.shape} must match'
     return log(prob.gather(-1, indices[..., None])).squeeze(-1)
 
 def shift(t, value = 0, shift = 1, dim = -1):
@@ -608,11 +609,13 @@ def train(
                     temperature = temperature,
                     return_values = True
                 )
-
                 action_logits = shift(action_logits, shift = 1, dim = -2) # need to shift along sequence dimension by 1, since actions start from the last prompt (state) token
 
                 action_prob = action_logits.softmax(dim = -1)
-                action_log_prob = log_prob(action_prob, actions)
+
+                action_len = actions.shape[-1]
+                action_log_prob = log_prob(action_prob, sequence)
+                action_log_prob = action_log_prob[:, -action_len:]
 
                 actions = rearrange(actions, '1 ... -> ...')
 

diff --git a/setup.py b/setup.py
@@ -3,7 +3,7 @@
 setup(
   name = 'PaLM-rlhf-pytorch',
   packages = find_packages(exclude=[]),
-  version = '0.0.64',
+  version = '0.0.65',
   license='MIT',
   description = 'PaLM + Reinforcement Learning with Human Feedback - Pytorch',
   author = 'Phil Wang',