# Drug Optimization RL (Discovery2 + EvE Bio) — Google Colab Workflow\n\nThis notebook reproduces the repo workflow in Colab:\n1) download Discovery2 safety/selectivity artifacts (promiscuity scores + cytotox models)\n2) load EvE Bio drug–target activity dataset (or fall back to a tiny synthetic demo)\n3) run the Gymnasium environment + random baseline\n4) train/evaluate the tabular Q-learning baseline\n5) run the MILES/MoE concepts demo\n\n## Notes\n- The EvE Bio dataset is **gated** on Hugging Face. If you haven't accepted its access terms, `load_dataset` may fail.\n- If you have a Hugging Face token, add it as a Colab Secret named `HF_TOKEN`.\n

In [None]:
# @title Install dependencies
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -q --upgrade pip
# seaborn and scikit-learn are imported by the project scripts written further
# down (drug_target_analysis.py and drug_rl_environment.py), so install them
# explicitly instead of relying on the runtime image.
%pip install -q numpy pandas matplotlib seaborn scikit-learn gymnasium==0.29.1 joblib statsmodels lightgbm datasets huggingface_hub
# RDKit is published on PyPI as `rdkit`; `rdkit-pypi` is the legacy package name.
# If the install fails, see the Troubleshooting cell.
%pip install -q rdkit

In [None]:
# @title (Optional) Mount Google Drive (to persist datasets/models/outputs)
# The mount is optional: outside Colab the `google.colab` package does not
# exist, so skip the step instead of aborting a "Run all".
try:
    from google.colab import drive
except ImportError:
    print('google.colab is not available; skipping the Google Drive mount.')
else:
    drive.mount('/content/drive')

In [None]:
# @title Setup paths and (optional) Hugging Face login
import os
from pathlib import Path

from huggingface_hub import login

# Working directories used by every later cell.
BASE_DIR = Path('/content')
DATA_DIR = BASE_DIR / 'data'
MODELS_DIR = BASE_DIR / 'models'
OUT_DIR = BASE_DIR / 'outputs'
for d in [DATA_DIR, MODELS_DIR, OUT_DIR]:
    d.mkdir(parents=True, exist_ok=True)

print('DATA_DIR:', DATA_DIR)
print('MODELS_DIR:', MODELS_DIR)
print('OUT_DIR:', OUT_DIR)


def _find_hf_token() -> str:
    """Return the Hugging Face token from the environment or Colab Secrets ('' if none).

    Colab Secrets are not exported as environment variables; they have to be
    read through `google.colab.userdata`, so that is tried as a fallback.
    """
    token = os.environ.get('HF_TOKEN', '')
    if token:
        return token
    try:
        from google.colab import userdata
    except ImportError:
        # Not running in Colab: the environment variable was the only source.
        return ''
    try:
        return userdata.get('HF_TOKEN') or ''
    except Exception as exc:
        # Raised when the secret does not exist or notebook access was not granted.
        print('Could not read the HF_TOKEN Colab Secret (continuing):', repr(exc))
        return ''


# Optional HF login (helps with gated datasets)
hf_token = _find_hf_token()
try:
    if hf_token:
        login(token=hf_token)
        print('Logged into Hugging Face with HF_TOKEN')
    else:
        print('HF_TOKEN not found in env or Colab Secrets. If the EvE dataset load fails, add a token as a Colab Secret named HF_TOKEN.')
except Exception as e:
    print('HF login error (continuing):', e)

In [None]:
# @title Write the project scripts into the Colab runtime
from pathlib import Path

# Each script is stored in a raw triple-quoted string so that escape sequences
# inside the scripts' own string literals (for example "\n" in print calls)
# reach the file verbatim instead of being interpreted by this cell.

DRUG_RL_ENVIRONMENT_SRC = r'''
"""Gymnasium environment that scores compound picks on efficacy, safety and selectivity."""
import warnings

import gymnasium as gym
from gymnasium import spaces
import numpy as np
import pandas as pd
import joblib
from sklearn.preprocessing import MinMaxScaler


class DrugOptimizationEnv(gym.Env):
    """Pick compounds for one target gene; the reward blends three scaled scores.

    The observation is always the single state ``0``. Each action is the index of
    a compound in ``self.compounds``. Picking a compound that was already picked
    in the current episode returns a reward of -10 and ends the episode.
    """

    metadata = {"render_modes": ["human"], "render_fps": 30}

    def __init__(
        self,
        drug_target_data_path: str,
        promiscuity_data_path: str,
        cytotox_model_path: str,
        target_gene: str,
        max_steps: int = 10,
        efficacy_weight: float = 0.4,
        safety_weight: float = 0.4,
        selectivity_weight: float = 0.2,
    ):
        """Load the activity CSV, promiscuity CSV and cytotoxicity model.

        Raises:
            ValueError: if the activity CSV has no rows for ``target_gene``.
        """
        super().__init__()
        self.target_gene = target_gene
        self.max_steps = max_steps
        self.current_step = 0
        self.visited_compounds = set()
        self._warned_safety_fallback = False

        # Drug-target activity rows for the requested gene only.
        self.drug_target_df = pd.read_csv(drug_target_data_path)
        self.drug_target_df = self.drug_target_df[
            self.drug_target_df["target__gene"] == target_gene
        ]
        if self.drug_target_df.empty:
            raise ValueError(
                f"No data for target gene {target_gene} in {drug_target_data_path}"
            )
        # ``n_compounds`` is a read-only property derived from this list, so it
        # must not be assigned here (the property has no setter).
        self.compounds = self.drug_target_df["compound_id"].unique().tolist()
        self.compound_to_idx = {comp: i for i, comp in enumerate(self.compounds)}
        # First activity value per compound, looked up once instead of filtering
        # the DataFrame on every step.
        self._efficacy_by_compound = (
            self.drug_target_df.drop_duplicates("compound_id")
            .set_index("compound_id")["outcome_max_activity"]
            .to_dict()
        )

        # Promiscuity scores, restricted to the compounds of this target.
        self.promiscuity_df = pd.read_csv(promiscuity_data_path)
        if "compound_id" not in self.promiscuity_df.columns and "cmpd_id" in self.promiscuity_df.columns:
            self.promiscuity_df = self.promiscuity_df.rename(columns={"cmpd_id": "compound_id"})
        self.promiscuity_df = self.promiscuity_df[
            self.promiscuity_df["compound_id"].isin(self.compounds)
        ]
        self.promiscuity_scores = (
            self.promiscuity_df.set_index("compound_id")["promiscuity_score"].to_dict()
        )

        # NOTE(security): joblib.load unpickles the file and can execute arbitrary
        # code; only load model files from a source you trust.
        self.cytotox_model = joblib.load(cytotox_model_path)

        self.action_space = spaces.Discrete(self.n_compounds)
        # Single-state observation space: the environment exposes no features.
        self.observation_space = spaces.Discrete(1)

        self.efficacy_weight = efficacy_weight
        self.safety_weight = safety_weight
        self.selectivity_weight = selectivity_weight

        self.efficacy_scaler = MinMaxScaler()
        self.safety_scaler = MinMaxScaler()
        self.selectivity_scaler = MinMaxScaler()
        self._precalculate_scaling_bounds()

    @property
    def n_compounds(self):
        """Number of distinct compounds available as actions."""
        return len(self.compounds)

    def _precalculate_scaling_bounds(self):
        """Fit the three min-max scalers on the bounds of their raw scores."""
        max_activity = self.drug_target_df["outcome_max_activity"].max()
        min_activity = self.drug_target_df["outcome_max_activity"].min()
        self.efficacy_scaler.fit(np.array([[min_activity], [max_activity]]))

        # Safety is 1 - predicted probability, so it is treated as lying in [0, 1].
        self.safety_scaler.fit(np.array([[0.0], [1.0]]))

        # Higher promiscuity is worse. When no compound of this target has a
        # promiscuity score the bounds would be NaN, so fall back to [0, 1].
        prom_scores = self.promiscuity_df["promiscuity_score"].dropna()
        if prom_scores.empty:
            min_prom, max_prom = 0.0, 1.0
        else:
            min_prom, max_prom = float(prom_scores.min()), float(prom_scores.max())
        self._max_promiscuity = max_prom
        self.selectivity_scaler.fit(np.array([[min_prom], [max_prom]]))

    def _get_obs(self):
        """Return the constant observation ``0``."""
        return 0

    def _get_info(self, compound_id=None, efficacy=None, safety=None, selectivity=None):
        """Build the info dict; per-compound fields are added when a compound is given."""
        info = {
            "current_step": self.current_step,
            "max_steps": self.max_steps,
            "n_compounds": self.n_compounds,
            "visited_compounds_count": len(self.visited_compounds),
        }
        # ``is not None`` so that falsy identifiers such as 0 are still reported.
        if compound_id is not None:
            info["compound_id"] = compound_id
            info["efficacy"] = efficacy
            info["safety"] = safety
            info["selectivity"] = selectivity
        return info

    def _predict_safety(self, promiscuity_score):
        """Return 1 - P(toxic) from the cytotoxicity model, or 0.5 if prediction fails."""
        try:
            toxic_probability = self.cytotox_model.predict_proba(
                np.array([[promiscuity_score]])
            )[:, 1][0]
            return 1.0 - toxic_probability
        except Exception as exc:
            # Best-effort fallback, but say so once instead of hiding the failure.
            if not self._warned_safety_fallback:
                warnings.warn(
                    f"Cytotoxicity model prediction failed ({exc!r}); "
                    "using the neutral safety value 0.5.",
                    RuntimeWarning,
                )
                self._warned_safety_fallback = True
            return 0.5

    def reset(self, seed=None, options=None):
        """Start a new episode and return ``(observation, info)``."""
        super().reset(seed=seed)
        self.current_step = 0
        self.visited_compounds = set()
        return self._get_obs(), self._get_info()

    def step(self, action: int):
        """Select the compound at index ``action``.

        Returns:
            ``(observation, reward, terminated, truncated, info)``. ``terminated``
            is set on a revisit and when ``max_steps`` is reached; ``truncated``
            is always False.
        """
        self.current_step += 1
        compound_id = self.compounds[action]

        terminated = False
        truncated = False

        # Revisiting a compound is penalised and ends the episode.
        if compound_id in self.visited_compounds:
            reward = -10.0
            terminated = True
            return self._get_obs(), reward, terminated, truncated, self._get_info(compound_id)

        self.visited_compounds.add(compound_id)

        # 1. Efficacy: first recorded outcome_max_activity, 0.0 when missing.
        efficacy = self._efficacy_by_compound.get(compound_id, 0.0)

        # 2. Safety: model prediction from the promiscuity score (0.0 when missing).
        promiscuity_score = self.promiscuity_scores.get(compound_id, 0.0)
        safety = self._predict_safety(promiscuity_score)

        # 3. Selectivity reported in ``info``: 1 - score / max score, floored at 0.
        if self._max_promiscuity > 0:
            selectivity = 1.0 - (promiscuity_score / self._max_promiscuity)
        else:
            selectivity = 1.0
        selectivity = max(0.0, selectivity)

        scaled_efficacy = self.efficacy_scaler.transform(np.array([[efficacy]]))[0][0]
        scaled_safety = self.safety_scaler.transform(np.array([[safety]]))[0][0]
        scaled_selectivity = self.selectivity_scaler.transform(
            np.array([[promiscuity_score]])
        )[0][0]
        scaled_selectivity = 1.0 - scaled_selectivity  # lower promiscuity -> higher reward

        reward = (
            self.efficacy_weight * scaled_efficacy
            + self.safety_weight * scaled_safety
            + self.selectivity_weight * scaled_selectivity
        )

        if self.current_step >= self.max_steps:
            terminated = True

        return self._get_obs(), float(reward), terminated, truncated, self._get_info(
            compound_id, efficacy, safety, selectivity
        )

    def render(self, mode="human"):
        """Print a one-line progress summary in "human" mode."""
        if mode == "human":
            print(f"Step: {self.current_step}/{self.max_steps}, Compounds explored: {len(self.visited_compounds)}")

    def close(self):
        """Nothing to release."""
        pass
'''

DRUG_RL_TRAINING_SRC = r'''
"""Tabular Q-learning baseline: agent, training loop, evaluation and plotting."""
import collections

import matplotlib.pyplot as plt
import numpy as np


class QLearningAgent:
    """Epsilon-greedy tabular Q-learning agent drawing from NumPy's global RNG."""

    def __init__(self, n_actions, learning_rate, discount_factor, epsilon_start, epsilon_end, epsilon_decay):
        self.n_actions = n_actions
        self.lr = learning_rate
        self.gamma = discount_factor
        self.epsilon = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay = epsilon_decay
        # Q-table: state -> array of Q-values, one per action (zeros for unseen states).
        self.q_table = collections.defaultdict(lambda: np.zeros(n_actions))

    def choose_action(self, state):
        """Return a random action with probability epsilon, else the greedy action."""
        if np.random.rand() < self.epsilon:
            return np.random.randint(self.n_actions)
        return int(np.argmax(self.q_table[state]))

    def learn(self, state, action, reward, next_state, terminated=False):
        """Apply one Q-learning update, then decay epsilon.

        ``terminated`` (default False, which keeps the previous behaviour) marks a
        terminal transition: no future value is bootstrapped from ``next_state``.
        """
        predict = self.q_table[state][action]
        future_value = 0.0 if terminated else self.gamma * np.max(self.q_table[next_state])
        target = reward + future_value
        self.q_table[state][action] += self.lr * (target - predict)

        # Epsilon decays once per update (per step, not per episode).
        self.epsilon = max(self.epsilon_end, self.epsilon * self.epsilon_decay)


def train_agent(env, agent, n_episodes, max_steps, verbose=True):
    """Run ``n_episodes`` training episodes; return ``{"rewards": [per-episode totals]}``."""
    rewards = []
    for episode in range(n_episodes):
        state, _ = env.reset()
        total_reward = 0
        for _step in range(max_steps):
            action = agent.choose_action(state)
            next_state, reward, terminated, truncated, _ = env.step(action)
            agent.learn(state, action, reward, next_state, terminated)
            state = next_state
            total_reward += reward
            if terminated or truncated:
                break
        rewards.append(total_reward)
        if verbose and (episode + 1) % 50 == 0:
            print(f"Episode {episode + 1}: Total Reward = {total_reward:.2f}, Epsilon = {agent.epsilon:.3f}")
    return {"rewards": rewards}


def evaluate_agent(env, agent, n_episodes):
    """Run greedy (epsilon = 0) episodes; return ``{"avg_reward", "std_reward"}``."""
    total_rewards = []
    original_epsilon = agent.epsilon
    agent.epsilon = 0.0
    try:
        for _episode in range(n_episodes):
            state, _ = env.reset()
            episode_reward = 0
            for _step in range(env.max_steps):
                action = agent.choose_action(state)
                # Carry the new state forward so the policy sees the current state.
                state, reward, terminated, truncated, _ = env.step(action)
                episode_reward += reward
                if terminated or truncated:
                    break
            total_rewards.append(episode_reward)
    finally:
        # Restore exploration even if an episode raised.
        agent.epsilon = original_epsilon

    return {"avg_reward": float(np.mean(total_rewards)), "std_reward": float(np.std(total_rewards))}


def plot_training_results(training_stats, save_path=None):
    """Plot per-episode rewards; save to ``save_path`` if given, otherwise show."""
    plt.figure(figsize=(10, 6))
    plt.plot(training_stats['rewards'])
    plt.title('Training Rewards per Episode')
    plt.xlabel('Episode')
    plt.ylabel('Total Reward')
    plt.grid(True)
    if save_path:
        plt.savefig(save_path)
        plt.close()  # do not leave the figure open after saving
    else:
        plt.show()
'''

MILES_CONCEPTS_SRC = r'''
"""Toy, print-only illustration of MoE routing and distributed rollouts."""
import random


class MoE:
    """Simulates a Mixture of Experts (MoE) system."""

    def __init__(self, num_experts, input_dim, output_dim):
        self.num_experts = num_experts
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.expert_loads = {i: 0 for i in range(num_experts)}
        print(f"MoE initialized with {num_experts} experts, input_dim={input_dim}, output_dim={output_dim}.")

    def route(self, input_data):
        """Pick an expert uniformly at random and record the assignment."""
        # In a real MoE, this would be a trainable gating network.
        expert_id = random.randrange(self.num_experts)
        self.expert_loads[expert_id] += 1
        return expert_id

    def process(self, input_data, expert_id):
        """Simulates an expert processing the input data."""
        # In a real MoE, this would involve the actual expert model (e.g., a neural network).
        return f"Expert {expert_id} processed: {input_data}"


class RolloutSystem:
    """Simulates a distributed rollout system for RL."""

    def __init__(self, num_workers):
        self.num_workers = num_workers
        self.worker_tasks = {i: [] for i in range(num_workers)}
        print(f"Rollout system initialized with {num_workers} workers.")

    def run_rollouts(self, policy, environments):
        """Assign each environment config to a worker round-robin; return result strings."""
        results = []
        print(f"Running {len(environments)} rollouts with {self.num_workers} workers...")
        for i, env_config in enumerate(environments):
            worker_id = i % self.num_workers
            self.worker_tasks[worker_id].append(env_config)
            # Simulate the rollout work
            results.append(f"Worker {worker_id} completed rollout for {env_config} using policy '{policy}'.")
        return results


def demonstrate_miles_concepts():
    """Run both simulations and return ``(moe, rollout_system)``."""
    print("\n--- Demonstrating MILES/MoE concepts for Drug RL ---")

    # 1. Mixture of Experts (MoE)
    moe = MoE(num_experts=4, input_dim=256, output_dim=128)
    sample_compound_features = [f"features_cmpd_{i}" for i in range(10)]

    print("\nSimulating MoE routing and processing:")
    for features in sample_compound_features:
        expert_id = moe.route(features)
        result = moe.process(features, expert_id)
        print(f"  Input '{features}' routed to expert {expert_id}. Result: {result}")

    print("  MoE Expert Load Distribution:", moe.expert_loads)

    # 2. Distributed Rollouts
    rollout_system = RolloutSystem(num_workers=8)
    dummy_env_configs = [f"env_config_{i}" for i in range(20)]

    print("\nSimulating distributed RL rollouts:")
    rollout_results = rollout_system.run_rollouts("DQN_policy", dummy_env_configs)
    for res in rollout_results:
        print(f"  {res}")

    print("\nMILES/MoE concepts demonstration complete.")
    return moe, rollout_system
'''

DRUG_TARGET_ANALYSIS_SRC = r'''
"""Quick exploratory plots for a target-filtered drug-target activity table."""
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


def analyze_drug_target_data(df: pd.DataFrame, target_gene: str):
    """Print summary counts and plot activity distributions for ``target_gene``."""
    print(f"\n--- Analyzing Drug-Target Data for {target_gene} ---")

    if df.empty:
        print("DataFrame is empty, no analysis to perform.")
        return

    print(f"Total entries for {target_gene}: {len(df)}")
    print(f"Unique compounds targeting {target_gene}: {df['compound_id'].nunique()}")

    # Distribution of outcome_max_activity
    plt.figure(figsize=(10, 6))
    sns.histplot(df['outcome_max_activity'], bins=20, kde=True)
    plt.title(f'Distribution of Maximum Activity for {target_gene}')
    plt.xlabel('Outcome Max Activity')
    plt.ylabel('Count')
    plt.grid(axis='y', alpha=0.75)
    plt.show()

    # Active vs Inactive counts
    if 'outcome_is_active' in df.columns:
        active_counts = df['outcome_is_active'].value_counts(normalize=True) * 100
        print(f"\nPercentage of active compounds for {target_gene}:\n{active_counts}")

        # value_counts orders by frequency, so pin the order to (False, True)
        # before attaching the 'Inactive' / 'Active' labels.
        ordered_pct = (
            df['outcome_is_active']
            .astype(bool)
            .value_counts(normalize=True)
            .reindex([False, True], fill_value=0.0)
            * 100
        )
        plt.figure(figsize=(7, 5))
        sns.barplot(x=['Inactive', 'Active'], y=ordered_pct.to_numpy())
        plt.title(f'Percentage of Active vs. Inactive Compounds for {target_gene}')
        plt.xlabel('Is Active')
        plt.ylabel('Percentage')
        plt.grid(axis='y', alpha=0.75)
        plt.show()

    print("Analysis complete.")
'''

# File name -> source text, in the order the files are written and reported.
PROJECT_SCRIPTS = {
    'drug_rl_environment.py': DRUG_RL_ENVIRONMENT_SRC,
    'drug_rl_training.py': DRUG_RL_TRAINING_SRC,
    'miles_concepts_drug_rl.py': MILES_CONCEPTS_SRC,
    'drug_target_analysis.py': DRUG_TARGET_ANALYSIS_SRC,
}
for script_name, script_source in PROJECT_SCRIPTS.items():
    Path(script_name).write_text(script_source, encoding='utf-8')
print('Wrote project scripts:', ', '.join(PROJECT_SCRIPTS))

## Download Discovery2 artifacts\n\nWe pull:\n- `discovery2_promiscuity_scores.csv` from `pageman/discovery2-results`\n- `cubic_logistic_model.pkl` (and other models) from `pageman/discovery2-cytotoxicity-models`\n

In [None]:
# @title Download Discovery2 data + models from Hugging Face
from huggingface_hub import hf_hub_download

# `local_dir_use_symlinks` is no longer passed: recent huggingface_hub releases
# deprecate it, and with `local_dir` set the files are placed in that folder.

# Promiscuity scores (CSV) live in a dataset repo, hence repo_type='dataset'.
promiscuity_csv = hf_hub_download(
    repo_id='pageman/discovery2-results',
    repo_type='dataset',
    filename='discovery2_promiscuity_scores.csv',
    local_dir=str(DATA_DIR),
)

# Cytotoxicity model: no repo_type is given, so the default (model repo) applies.
cubic_model_path = hf_hub_download(
    repo_id='pageman/discovery2-cytotoxicity-models',
    filename='cubic_logistic_model.pkl',
    local_dir=str(MODELS_DIR),
)

print('Promiscuity CSV:', promiscuity_csv)
print('Cubic cytotox model:', cubic_model_path)

## Load EvE Bio drug–target activity data (preferred) or fall back to a tiny demo\n\nPreferred: `load_dataset('eve-bio/drug-target-activity')` then filter to a single target (default BTK) and export a small CSV.\n\nIf the dataset is gated and load fails, this notebook creates a small synthetic dataset so the rest of the workflow still runs.\n

In [None]:
# @title Load/filter EvE Bio dataset, export a target-specific CSV expected by the env
import pandas as pd
from datasets import load_dataset

TARGET_GENE = 'BTK'  # change to any gene available in the dataset
filtered_csv_path = DATA_DIR / f'drug_target_activity_{TARGET_GENE}.csv'


def _standardize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Map likely column names from the HF dataset to the env's expected schema.

    Returns a copy whose target and compound columns are named `target__gene`
    and `compound_id`.

    Raises:
        KeyError: if no target/compound column candidate is present, or if
            `outcome_is_active` / `outcome_max_activity` is missing.
    """
    target_candidates = ['target__gene', 'target_gene', 'target_gene_symbol', 'target_gene_name', 'target']
    compound_candidates = ['compound_id', 'drug_id', 'compound', 'drug']

    # First candidate that exists wins.
    target_col = next((c for c in target_candidates if c in df.columns), None)
    compound_col = next((c for c in compound_candidates if c in df.columns), None)

    if target_col is None:
        raise KeyError(f'Could not find a target gene column. Columns: {list(df.columns)[:50]}')
    if compound_col is None:
        raise KeyError(f'Could not find a compound id column. Columns: {list(df.columns)[:50]}')

    # The env expects these exact names:
    df = df.rename(columns={target_col: 'target__gene', compound_col: 'compound_id'})

    # These are strongly suggested by the EvE dataset card and used by the env.
    required = ['outcome_is_active', 'outcome_max_activity']
    for r in required:
        if r not in df.columns:
            raise KeyError(f'Missing required column {r}. Columns: {list(df.columns)[:50]}')

    return df


try:
    ds = load_dataset('eve-bio/drug-target-activity', split='train')
    # The whole split is converted to pandas first; filtering happens afterwards
    # because the target column name is only known once the columns are mapped.
    df = ds.to_pandas()
    df = _standardize_columns(df)
    df_t = df[df['target__gene'] == TARGET_GENE].copy()
    print('Loaded EvE dataset. Full rows:', len(df), 'Filtered rows:', len(df_t))
    if len(df_t) == 0:
        raise ValueError(f'No rows found for target {TARGET_GENE}. Try a different TARGET_GENE.')
    # Keep only columns used by the env (plus target name)
    df_t = df_t[['compound_id', 'target__gene', 'outcome_is_active', 'outcome_max_activity']]
    df_t.to_csv(filtered_csv_path, index=False)
    print('Wrote filtered target CSV:', filtered_csv_path)
except Exception as e:
    # Deliberate best-effort: any failure above (gated dataset, missing columns,
    # no rows for the target) switches to a synthetic table.
    print('⚠️ Could not load the EvE Bio dataset (likely gated or missing columns).')
    print('Reason:', repr(e))
    print('Falling back to a tiny synthetic dataset so the RL workflow can still run.')
    # Synthetic dataset using Discovery2 compound IDs
    prom = pd.read_csv(promiscuity_csv)
    # The environment accepts a `cmpd_id` column as an alias for `compound_id`;
    # apply the same mapping here so the fallback does not fail on that layout.
    if 'compound_id' not in prom.columns and 'cmpd_id' in prom.columns:
        prom = prom.rename(columns={'cmpd_id': 'compound_id'})
    demo = prom[['compound_id']].sample(n=min(200, len(prom)), random_state=42).copy()
    demo['target__gene'] = TARGET_GENE
    # Deterministic pseudo-activity in 0..100 derived from the compound's category code.
    demo['outcome_max_activity'] = (demo['compound_id'].astype('category').cat.codes % 101).astype(float)
    demo['outcome_is_active'] = demo['outcome_max_activity'] >= 50.0
    demo.to_csv(filtered_csv_path, index=False)
    print('Wrote synthetic target CSV:', filtered_csv_path)

In [None]:
# @title Instantiate DrugOptimizationEnv and run 1 random episode
import numpy as np
from drug_rl_environment import DrugOptimizationEnv

RANDOM_EPISODE_SEED = 42

env = DrugOptimizationEnv(
    drug_target_data_path=str(filtered_csv_path),
    promiscuity_data_path=str(promiscuity_csv),
    cytotox_model_path=str(cubic_model_path),
    target_gene=TARGET_GENE,
    max_steps=10,
    efficacy_weight=0.4,
    safety_weight=0.4,
    selectivity_weight=0.2,
)

obs, info = env.reset(seed=RANDOM_EPISODE_SEED)
# reset(seed=...) does not seed the action space; action_space.sample() uses its
# own generator, so seed it as well to make the random episode repeatable.
env.action_space.seed(RANDOM_EPISODE_SEED)
env.render()

episode_reward = 0.0
# Use the environment's own step budget rather than a second hard-coded 10.
for t in range(env.max_steps):
    action = env.action_space.sample()
    obs, reward, terminated, truncated, info = env.step(action)
    episode_reward += reward
    if terminated or truncated:
        break

print('Random episode total reward:', float(episode_reward))
print('Compounds explored:', len(env.visited_compounds), '/', env.n_compounds)

## Train the tabular Q-learning baseline and evaluate\n\nThis is the same baseline as in the repo (`drug_rl_training.py`), but we save outputs into `OUT_DIR`.\n

In [None]:
# @title Train + evaluate Q-learning agent
import numpy as np
from drug_rl_training import QLearningAgent, train_agent, evaluate_agent, plot_training_results

# The agent takes its exploration draws from NumPy's global RNG
# (np.random.rand / np.random.randint), so seed it to make a
# Restart & Run All reproduce the same training run.
TRAINING_SEED = 0
np.random.seed(TRAINING_SEED)

agent = QLearningAgent(
    n_actions=env.n_compounds,
    learning_rate=0.1,
    discount_factor=0.95,
    epsilon_start=1.0,
    epsilon_end=0.01,
    epsilon_decay=0.995,
)

training_stats = train_agent(
    env,
    agent,
    n_episodes=200,
    max_steps=env.max_steps,
    verbose=True,
)

# Saved to disk (not shown inline) because a save_path is passed.
plot_path = OUT_DIR / 'drug_rl_training_results.png'
plot_training_results(training_stats, save_path=str(plot_path))
print('Saved training plot to:', plot_path)

eval_stats = evaluate_agent(env, agent, n_episodes=10)
print('Evaluation stats:', eval_stats)

In [None]:
# @title Compare to random policy baseline (same eval protocol)
import numpy as np


def eval_random(env, n_episodes=10, seed=123):
    """Run `n_episodes` episodes of uniformly random actions.

    Args:
        env: environment exposing `reset`, `step`, `action_space` and `max_steps`.
        n_episodes: number of episodes to run.
        seed: base seed; episode `ep` resets the env with `seed + ep`, and the
            action space is seeded once with `seed`.

    Returns:
        `(mean, std)` of the per-episode total reward, as Python floats.
    """
    # env.reset(seed=...) does not seed action_space.sample(); seed it explicitly
    # so the baseline is repeatable.
    env.action_space.seed(seed)
    rewards = []
    for ep in range(n_episodes):
        obs, info = env.reset(seed=seed + ep)
        total = 0.0
        for _ in range(env.max_steps):
            a = env.action_space.sample()
            obs, r, terminated, truncated, info = env.step(a)
            total += r
            if terminated or truncated:
                break
        rewards.append(total)
    return float(np.mean(rewards)), float(np.std(rewards))


rand_mean, rand_std = eval_random(env, n_episodes=10)
print(f'Random baseline: {rand_mean:.3f} ± {rand_std:.3f}')
print(f"Trained agent:   {eval_stats['avg_reward']:.3f} ± {eval_stats['std_reward']:.3f}")

## Run the MILES/MoE concepts demo\n\nThis does **not** require GPUs; it is a conceptual demonstration of how you’d scale RL with MoE routing and distributed rollouts.\n

In [None]:
# @title MILES concepts demo
# The module is one of the scripts written to the working directory by the
# "Write the project scripts" cell.
from miles_concepts_drug_rl import demonstrate_miles_concepts

# The demo prints its progress and returns the two simulator objects it built.
moe, rollout_system = demonstrate_miles_concepts()

## Troubleshooting\n\n### If RDKit install fails\n- Try restarting the runtime and rerunning the install cell.\n- None of the cells shown in this notebook import RDKit, so the workflow still runs without it. If you add RDKit-dependent parts, comment them out and use only the promiscuity-based cytotox model.\n\n### If EvE Bio dataset load fails\n- The dataset is gated on Hugging Face; accept its terms in the HF UI and/or provide `HF_TOKEN`.\n- As an alternative, upload a CSV export of the EvE dataset (or a filtered subset) into `DATA_DIR` and point `filtered_csv_path` to it. The environment reads the columns `compound_id`, `target__gene`, and `outcome_max_activity` from that file.\n