feature(whl): add tabmwp env and prompt pg policy #667

Merged: 61 commits, Sep 4, 2023
Changes from 41 commits
Commits (61)
c0f0cac wrong (May 11, 2023)
918171c update config (May 11, 2023)
de33a2c update config (May 11, 2023)
8ef6e1a update command policy (May 15, 2023)
fdb74c0 debug (May 15, 2023)
353de6d debug (May 15, 2023)
6223146 debug (May 15, 2023)
c2ecc48 debug (May 15, 2023)
6773bee debug (May 15, 2023)
25f6b3a debug (May 19, 2023)
9abda20 debug (May 19, 2023)
bfdc122 debug (May 19, 2023)
0510c83 debug (May 19, 2023)
2385df5 debug (May 19, 2023)
4cef99b debug (May 19, 2023)
f18fafd add glm (May 19, 2023)
c12b2a2 add glm (May 19, 2023)
dd9589e add glm model (May 20, 2023)
a783416 add glm model (May 20, 2023)
0bb2df2 add glm model (May 20, 2023)
4335a3f add glm model (May 20, 2023)
79b2598 add eval return (May 22, 2023)
61e4694 reformat (May 22, 2023)
59f4098 modify action space (May 23, 2023)
c6afc5d modify action space (May 23, 2023)
9345de6 polish answer process (May 24, 2023)
d89e39a update policy (May 24, 2023)
e805a0a update rwkv (May 24, 2023)
1b3f2b4 update policy (May 24, 2023)
40b6c46 polish (May 25, 2023)
e1f7cac polish (May 25, 2023)
0213f32 Merge branch 'main' of https://github.com/kxzxvbk/DI-engine (May 25, 2023)
c1c22fd resolve conflict (May 25, 2023)
39e520d debug prompt pg (Jun 15, 2023)
11bc0ad add parse (Jun 25, 2023)
8c9c40d update load env (Jun 26, 2023)
9cac14e add merge files (Jul 5, 2023)
ff5ad2d add merge files (Jul 5, 2023)
f6d6ac4 feature(whl): add internlm (Jul 10, 2023)
c716308 feature(whl): add internlm (Jul 10, 2023)
43a8168 update fix parse (Jul 11, 2023)
56068b1 add new dataset (Jul 25, 2023)
d32b64b fix datafiles (Jul 26, 2023)
1063fbd polish code (Jul 28, 2023)
286d976 polish env (Jul 28, 2023)
b229216 polish (Aug 1, 2023)
a8fc87b polish (Aug 1, 2023)
c0ad294 add model wrapper (Aug 1, 2023)
eff4155 polish wrapper (Aug 1, 2023)
00a64e5 polish (Aug 1, 2023)
e17f9d5 remove redundant files (Aug 1, 2023)
73fb2ee reformat (Aug 1, 2023)
476babf polish (Aug 11, 2023)
b277505 Merge branch 'main' into gpt3_env (Sep 2, 2023)
8ad299f polish (Sep 2, 2023)
5e13da9 merge main (Sep 2, 2023)
2f15217 debug (Sep 2, 2023)
f9c2e73 polish readme (Sep 3, 2023)
34044a1 reformat (Sep 3, 2023)
16b826b polish tabmwp (Sep 4, 2023)
239ff18 test (Sep 4, 2023)
1 change: 1 addition & 0 deletions ding/model/template/__init__.py
@@ -5,6 +5,7 @@
from .vac import VAC
from .bc import DiscreteBC, ContinuousBC
from .pg import PG
from .nlp_pretrained_model import NLPPretrainedModel
# algorithm-specific
from .ppg import PPG
from .qmix import Mixer, QMix
54 changes: 54 additions & 0 deletions ding/model/template/nlp_pretrained_model.py
@@ -0,0 +1,54 @@
import torch

from ding.utils import MODEL_REGISTRY
from torch import nn
from transformers import AutoTokenizer, AutoModelForTokenClassification


@MODEL_REGISTRY.register('nlp_pretrained_model')
class NLPPretrainedModel(nn.Module):

def __init__(
self,
model_name: str = "bert-base-uncased",
add_linear: bool = False,
embedding_size: int = 128,
freeze_encoder: bool = True
) -> None:
super().__init__()
self.tokenizer = AutoTokenizer.from_pretrained(model_name)
self.model = AutoModelForTokenClassification.from_pretrained(model_name)

# Freeze transformer encoder and only train the linear layer
if freeze_encoder:
for param in self.model.parameters():
param.requires_grad = False

if add_linear:
# Add an additional small, adjustable linear layer on top of BERT tuned through RL
self.embedding_size = embedding_size
self.linear = nn.Linear(
self.model.config.hidden_size, embedding_size
) # 768 for bert-base-uncased, distilbert-base-uncased
else:
self.linear = None

def _calc_embedding(self, x: list) -> torch.Tensor:
input = self.tokenizer(x, truncation=True, padding=True, return_tensors="pt").to(self.model.device)
output = self.model(**input, output_hidden_states=True)
# Get last layer hidden states
last_hidden_states = output.hidden_states[-1]
# Get [CLS] hidden states
sentence_embedding = last_hidden_states[:, 0, :] # len(input_list) x hidden_size
# print(f"sentence_embedding: {sentence_embedding}")

if self.linear:
sentence_embedding = self.linear(sentence_embedding) # len(input_list) x embedding_size

return sentence_embedding

def forward(self, train_samples: list, candidate_samples: list) -> dict:
ctxt_embedding = self._calc_embedding(train_samples)
cands_embedding = self._calc_embedding(candidate_samples)
scores = torch.mm(ctxt_embedding, cands_embedding.t())
return {'dist': torch.distributions.Categorical(logits=scores), 'logit': scores}
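For reference (not part of the diff): a minimal usage sketch of the model added above. The question and prompt strings are invented placeholders, and "bert-base-uncased" is assumed to be downloadable or already cached locally.

from ding.model.template.nlp_pretrained_model import NLPPretrainedModel

# Frozen BERT encoder with a small trainable projection head on top.
model = NLPPretrainedModel(model_name="bert-base-uncased", add_linear=True, embedding_size=128)

train_samples = ["What is 3 + 5?"]  # made-up question, not from the TabMWP dataset
candidate_samples = ["Q: 1 + 1? A: 2", "Q: 2 * 3? A: 6"]  # made-up candidate prompts

out = model(train_samples, candidate_samples)
assert out['logit'].shape == (1, 2)  # one score per (question, candidate prompt) pair
prompt_idx = out['dist'].sample()  # sample one candidate prompt index per question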
27 changes: 27 additions & 0 deletions ding/model/template/tests/test_nlp_pretrained_model.py
@@ -0,0 +1,27 @@
import pytest
import torch

from ding.model.template.nlp_pretrained_model import NLPPretrainedModel


@pytest.mark.unittest
class TestNLPPretrainedModel:

def check_model(self):
test_pids = [1]
cand_pids = [0, 2, 4]
problems = [
"This is problem 0", "This is the first question", "Second problem is here", "Another problem",
"This is the last problem"
]
ctxt_list = [problems[pid] for pid in test_pids]
cands_list = [problems[pid] for pid in cand_pids]

model = NLPPretrainedModel(model_name="bert-base-uncased", add_linear=True, embedding_size=256)
cands_embedding = model._calc_embedding(cands_list)
assert cands_embedding.shape == (3, 256)
ctxt_embedding = model._calc_embedding(ctxt_list)
assert ctxt_embedding.shape == (1, 256)

scores = torch.mm(ctxt_embedding, cands_embedding.t())
assert scores.shape == (1, 3)
1 change: 1 addition & 0 deletions ding/policy/__init__.py
@@ -51,3 +51,4 @@

# new-type policy
from .ppof import PPOFPolicy
from .prompt_pg import PromptPGPolicy
6 changes: 6 additions & 0 deletions ding/policy/command_mode_policy_instance.py
100755 → 100644
@@ -48,6 +48,7 @@
from .madqn import MADQNPolicy
from .bdq import BDQPolicy
from .edac import EDACPolicy
from .prompt_pg import PromptPGPolicy


class EpsCommandModePolicy(CommandModePolicy):
@@ -426,3 +427,8 @@ def _get_setting_learn(self, command_info: dict) -> dict:

def _get_setting_eval(self, command_info: dict) -> dict:
return {}


@POLICY_REGISTRY.register('prompt_pg_command')
class PromptPGCommandModePolicy(PromptPGPolicy, DummyCommandModePolicy):
pass
258 changes: 258 additions & 0 deletions ding/policy/prompt_pg.py
@@ -0,0 +1,258 @@
from typing import List, Dict, Any, Tuple, Union
from collections import namedtuple
import torch

from ding.rl_utils import get_train_sample
from ding.torch_utils import Adam, to_device
from ding.utils import POLICY_REGISTRY
from ding.utils.data import default_collate, default_decollate
from .base_policy import Policy


@POLICY_REGISTRY.register('prompt_pg')
class PromptPGPolicy(Policy):
r"""
Overview:
Policy class of Prompt Policy Gradient (PromptPG) algorithm.
"""
config = dict(
# (string) RL policy register name (refer to function "register_policy").
type='prompt_pg',
# (bool) whether to use cuda for network.
cuda=True,
# (bool) whether to use the on-policy training pipeline (behaviour policy and training policy are the same)
on_policy=True, # PG is a strictly on-policy algorithm, so this line should not be modified by users
# (bool) whether to use deterministic action for evaluation.
deterministic_eval=True,
learn=dict(
# (int) the number of samples for one update.
batch_size=64,
# (float) the step size of gradient descent.
learning_rate=0.001,
# ==============================================================
# The following configs are algorithm-specific
# ==============================================================
# (float) loss weight of the entropy regularization, the weight of policy network is set to 1
entropy_weight=0.01,
# (float) max grad norm value.
grad_norm=5,
# (bool) whether to ignore done signal for non-termination env.
ignore_done=False,
),
collect=dict(
# (int) collect n_episode episodes of data, then train the model
# n_episode=8,
# (int) trajectory unroll length
unroll_len=1,
# ==============================================================
# The following configs are algorithm-specific
# ==============================================================
# (float) discount factor for future reward, in range [0, 1]
discount_factor=0,
collector=dict(get_train_sample=True),
),
eval=dict(),
)

def default_model(self) -> Tuple[str, List[str]]:
return 'nlp_pretrained_model', ['ding.model.template.nlp_pretrained_model']

def _init_learn(self) -> None:
r"""
Overview:
Learn mode init method. Called by ``self.__init__``.
Init the optimizer, algorithm config and the main model.
"""
# Optimizer
self._optimizer = Adam(self._model.parameters(), lr=self._cfg.learn.learning_rate)

self._entropy_weight = self._cfg.learn.entropy_weight
self._grad_norm = self._cfg.learn.grad_norm
self._learn_model = self._model # for compatibility

def _forward_learn(self, data: dict) -> Dict[str, Any]:
r"""
Overview:
Forward and backward function of learn mode.
Arguments:
- data (:obj:`dict`): Dict type data, including at least ['obs', 'action', 'reward']
Returns:
- info_dict (:obj:`Dict[str, Any]`): Including current lr and loss.
"""
self._model.train()
if self._cuda:
data = to_device(data, self._device)

return_infos = []
for i in range(0, len(data), self._cfg.learn.batch_size):
batch = default_collate(data[i:i + self._cfg.learn.batch_size])
# Prepare train_sample (the question to be answered) and the candidate_samples (the prompts to be selected)
train_samples, cand_samples = batch["obs"]["train_sample"], batch["obs"]["candidate_samples"]
for ii in range(len(cand_samples)):
cand_samples[ii] = cand_samples[ii][0]
output = self._learn_model.forward(train_samples, cand_samples)
return_ = batch['return']
if self._cuda:
return_ = return_.to(self._device)

# calculate PG loss
real_act = []
for b in range(batch['action'].shape[0]):
tmp_act = []
act = batch['action'][b].item()
# The action is a combination of indexes of all selected prompts.
# For example, if [3, 6] is selected, action = 2 ** 3 + 2 ** 6 = 8 + 64 = 72.
# In this step, we calculate all the indexes.
idx = 0
while act > 0:
if act % 2 != 0:
tmp_act.append(idx)
act = act // 2
idx += 1
assert len(tmp_act) == self._cfg.shot_number
real_act.append(tmp_act)
real_act = torch.tensor(real_act, device=self._device) # shape: (B, shot_number)
# Calculate loss.
total_loss = 0
total_policy_loss, total_entropy_loss = 0, 0
for ii in range(self._cfg.shot_number):
log_prob = output['dist'].log_prob(real_act[:, ii])
policy_loss = -(log_prob * return_).mean()
entropy_loss = -self._cfg.learn.entropy_weight * output['dist'].entropy().mean()
total_loss += policy_loss + entropy_loss
total_policy_loss += policy_loss
total_entropy_loss += entropy_loss

# update
self._optimizer.zero_grad()
total_loss.backward()

grad_norm = torch.nn.utils.clip_grad_norm_(
list(self._learn_model.parameters()),
max_norm=self._grad_norm,
)
self._optimizer.step()

# only record last updates information in logger
return_info = {
'cur_lr': self._optimizer.param_groups[0]['lr'],
'total_loss': total_loss.item(),
'policy_loss': total_policy_loss.item(),
'entropy_loss': total_entropy_loss.item(),
'return_abs_max': return_.abs().max().item(),
'grad_norm': grad_norm,
}
return_infos.append(return_info)
return return_infos

def _init_collect(self) -> None:
self._unroll_len = self._cfg.collect.unroll_len
self._gamma = self._cfg.collect.discount_factor

def _forward_collect(self, data: dict) -> dict:
data_id = list(data.keys())
data = default_collate(list(data.values()))
self._model.eval()
with torch.no_grad():
# Prepare train_sample (the question to be answered) and the candidate_samples (the prompts to be selected)
for ii in range(len(data['candidate_samples'])):
data['candidate_samples'][ii] = data['candidate_samples'][ii][0]
output = self._model.forward(data['train_sample'], data['candidate_samples'])
# Generate actions.
act = []
mask = torch.zeros_like(output['logit'])
for ii in range(self._cfg.shot_number):
dist = torch.distributions.Categorical(logits=output['logit'] + mask)
actions = dist.sample()
act.append(actions)
for jj in range(actions.shape[0]):
mask[jj][actions[jj]] = -1e30
# `act` is shaped (shot_num, B)
real_act = []
for b in range(act[0].shape[0]):
tmp_act = 0 # integer accumulator: sum of 2 ** selected index for this batch element
for shot in act:
tmp_act += 2 ** shot[b].item()
real_act.append(tmp_act)
real_act = torch.tensor(real_act)
# `real_act` is shaped (B)
output['action'] = real_act
if self._cuda:
output = to_device(output, 'cpu')
output = default_decollate(output)
return {i: d for i, d in zip(data_id, output)}

def _process_transition(self, obs: Any, model_output: dict, timestep: namedtuple) -> dict:
r"""
Overview:
Generate dict type transition data from inputs.
Arguments:
- obs (:obj:`Any`): Env observation
- model_output (:obj:`dict`): Output of collect model, including at least ['action']
- timestep (:obj:`namedtuple`): Output after env step, including at least ['obs', 'reward', 'done'] \
(here 'obs' indicates obs after env step).
Returns:
- transition (:obj:`dict`): Dict type transition data.
"""
return {
'obs': obs,
'action': model_output['action'],
'reward': timestep.reward,
'done': timestep.done,
}

def _get_train_sample(self, data: list) -> Union[None, List[Any]]:
r"""
Overview:
Compute the discounted return for each transition of the trajectory, then generate the training samples
Arguments:
- data (:obj:`list`): The trajectory's buffer list
Returns:
- samples (:obj:`dict`): The training samples generated
"""
if self._cfg.learn.ignore_done:
raise NotImplementedError

R = 0.
for i in reversed(range(len(data))):
R = self._gamma * R + data[i]['reward']
data[i]['return'] = R
return get_train_sample(data, self._unroll_len)

def _init_eval(self) -> None:
pass

def _forward_eval(self, data: dict) -> dict:
data_id = list(data.keys())
data = default_collate(list(data.values()))
self._model.eval()
with torch.no_grad():
# Prepare train_sample (the question to be answered) and the candidate_samples (the prompts to be selected)
for ii in range(len(data['candidate_samples'])):
data['candidate_samples'][ii] = data['candidate_samples'][ii][0]
output = self._model.forward(data['train_sample'], data['candidate_samples'])
# Generate actions.
act = []
mask = torch.zeros_like(output['logit'])
for ii in range(self._cfg.shot_number):
actions = torch.argmax(output['logit'] + mask, dim=-1)
act.append(actions)
for jj in range(actions.shape[0]):
mask[jj][actions[jj]] = -1e30
# `act` is shaped (shot_num, B)
real_act = []
for b in range(act[0].shape[0]):
tmp_act = 0 # integer accumulator: sum of 2 ** selected index for this batch element
for shot in act:
tmp_act += 2 ** shot[b].item()
real_act.append(tmp_act)
real_act = torch.tensor(real_act)
# `real_act` is shaped (B)
output['action'] = real_act
if self._cuda:
output = to_device(output, 'cpu')
output = default_decollate(output)
return {i: d for i, d in zip(data_id, output)}

def _monitor_vars_learn(self) -> List[str]:
return super()._monitor_vars_learn() + ['policy_loss', 'entropy_loss', 'return_abs_max', 'grad_norm']
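As a side note on the action space used above: an action encodes the set of selected prompt indexes as a sum of powers of two, and _forward_learn decodes it back bit by bit. A small standalone sketch of this convention follows; the helper names are illustrative only and do not exist in the PR.

# Illustrative helpers, not part of the PR: they mirror the encoding built in
# _forward_collect / _forward_eval and the decoding loop in _forward_learn.
def encode_action(selected_indexes):
    # e.g. selecting prompts [3, 6] -> 2 ** 3 + 2 ** 6 = 72
    return sum(2 ** idx for idx in selected_indexes)

def decode_action(action):
    # e.g. 72 -> [3, 6]
    indexes, idx = [], 0
    while action > 0:
        if action % 2 != 0:
            indexes.append(idx)
        action //= 2
        idx += 1
    return indexes

assert encode_action([3, 6]) == 72
assert decode_action(72) == [3, 6]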
Empty file added dizoo/tabmwp/__init__.py
Empty file.