In [1]:
import pandas as pd
from transformers import pipeline
from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env

from rl_prompt_injection.environment import ToxicityEnvironment
from rl_prompt_injection.engines import action_engine, reward_engine, state_engine
from initial_experiment_config import DataConstants, InitialExperimentConstants, BasicConfig



# Text-to-text generation pipeline used as the target LLM.
# The model identifier comes from InitialExperimentConstants, the constants class
# this notebook actually imports (the previous `ExperimentConstants` name was
# never defined and raised NameError on a fresh kernel).
llm = pipeline(
    "text2text-generation",
    model=InitialExperimentConstants.MODEL,
)
def generate_response(s: str | list, model=llm) -> str:
    """Run ``model`` on ``s`` and return the generated text of its first result.

    Args:
        s: Prompt (or list of prompts) passed straight through to ``model``.
        model: Callable returning an indexable sequence of dict-like results;
            defaults to the module-level ``llm`` pipeline.

    Returns:
        The ``"generated_text"`` entry of the first result, or ``""`` when
        that key is absent.
    """
    first_result = model(s)[0]
    return first_result.get("generated_text", "")


# Engines that supply the environment's reward, action space, and state encoding.
reward_eng = reward_engine.ResponseToxicityRewardEngine()
# The original passed a bare `MODEL`, which is not defined anywhere in this cell.
# NOTE(review): presumably the action space should use the same model identifier
# that the generation pipeline is built with — confirm.
action_eng = action_engine.Text2TextActionSpace(
    model=InitialExperimentConstants.MODEL,
    num_actions=InitialExperimentConstants.NUM_ACTIONS,
)
state_eng = state_engine.SentenceTransformerStateEngine()

# Training texts: the last column of the CSV, with missing values dropped,
# converted to a NumPy array.
text = (
    pd.read_csv(DataConstants.TRAIN_DATA_PATH, encoding="latin1")
    .iloc[:, -1]
    .dropna()
    .to_numpy()
)

# Constants are read from InitialExperimentConstants, the class this notebook
# imports; the previous `ExperimentConstants` name was undefined.
env = ToxicityEnvironment(
    llm=generate_response,
    reward_engine=reward_eng,
    state_engine=state_eng,
    action_engine=action_eng,
    instruction_prompt=InitialExperimentConstants.INSTRUCTION_PROMPT,
    texts=text,
    log_interval=BasicConfig.LOG_INTERVAL,
    # NOTE(review): keyword spelling kept exactly as written; verify it matches
    # ToxicityEnvironment's signature.
    experiement_dir=InitialExperimentConstants.OUTPUT_DIR,
)


# Validate the environment against the Gym interface before training
# (the render check is skipped).
check_env(env, skip_render_check=True)

# Train a PPO agent with an MLP policy on the environment.
# TIMESTEPS is read from InitialExperimentConstants, the class this notebook
# imports; the previous `ExperimentConstants` name was undefined.
model = PPO("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=InitialExperimentConstants.TIMESTEPS)

# log final elements
env.log()



  from .autonotebook import tqdm as notebook_tqdm


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




---------------------------------
| rollout/           |          |
|    ep_len_mean     | 10       |
|    ep_rew_mean     | 55.9     |
| time/              |          |
|    fps             | 1        |
|    iterations      | 1        |
|    time_elapsed    | 1397     |
|    total_timesteps | 2048     |
---------------------------------


<stable_baselines3.ppo.ppo.PPO at 0x2b08536d0>

In [3]:
# add experiments
# run experiments
# what does this tell us? LLM vs Toxicity Model
# commentary: feasibility (inaccessible, but could imagine a foundation model service)
# statistical test for model robustness
# Other experiments: