<a href="https://colab.research.google.com/github/paviayyala/AIML-Lab/blob/main/Agentic_Q_Learning_Discount.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Working.

**Agentic Q-Learning for Sales Discount — Colab Documentation**

This README documents the agentic Q-learning program (tools + `RuleBasedAgent`) implemented in the notebook below. Paste it into a Colab text cell (Markdown) or save it as README.md alongside the notebook.

## Overview

This project demonstrates an agentic structure for a simple tabular Q-learning agent that learns discount strategies for retail sales.
It is built from three parts:

- a small environment (`DiscountEnv`) that simulates customer segments and competitor pressure,
- tools exposed as functions (`generate_env`, `train_q_learning`, `evaluate_policy`, `save_policy`, `load_policy`),
- a `RuleBasedAgent` orchestrator that dispatches tasks such as `train`, `evaluate`, and `train_and_eval`.

The reward is expected profit = (selling_price − cost) × purchase_probability. The agent learns when discounts improve expected profit vs. when to protect margin.


## Intended Use

- Train a Q-learning agent to choose discount percentages for different customer segments and competitor pressures.
- Save and load the resulting Q-tables (policies).
- Evaluate and inspect action distributions and expected profit.
- Run and iterate in Google Colab — recommended for interactive runs and for saving artifacts to Google Drive.

In [1]:
# agentic_q_discount.py
import os
import math
import random
import json
import logging
from typing import Tuple, Dict, Any, List

# Lightweight stand-in for a LangChain-style @tool decorator (no langchain import required)
def tool(func):
    """Tag *func* as an agent tool and hand the same object back.

    Two attributes are attached to the callable:
      - ``is_tool``: always ``True``
      - ``name``: a copy of the callable's ``__name__``
    """
    setattr(func, "is_tool", True)
    setattr(func, "name", func.__name__)
    return func

# Configure the root logger: INFO level, "timestamp - level - message" records.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# -------------- Environment --------------
class DiscountEnv:
    """
    Simple simulated environment for discount decisions.

    States: (customer_segment, competitor_pressure)
      - customer_segment in {"price_sensitive", "neutral", "loyal"}
      - competitor_pressure in {"low", "high"}
    Actions: discount percentages [0, 5, 10, 15, 20], addressed by index.
    Reward: expected profit = (selling_price - cost) * purchase_prob

    Purchase probability is the segment's base rate (multiplied by 0.85 under
    high competitor pressure) plus a bump that grows linearly with the
    discount, clamped to [0, 1].
    """

    segments = ["price_sensitive", "neutral", "loyal"]
    comp_pressures = ["low", "high"]
    actions = [0, 5, 10, 15, 20]  # discount percentages

    # Sampling weights, aligned index-for-index with `segments` / `comp_pressures`.
    _SEGMENT_WEIGHTS = [0.4, 0.4, 0.2]
    _PRESSURE_WEIGHTS = [0.6, 0.4]
    # Base purchase probability per segment (before pressure and discount).
    _SEGMENT_BASE_PROB = {"price_sensitive": 0.2, "neutral": 0.5, "loyal": 0.75}
    # Multiplier applied to the base probability when competitor pressure is "high".
    _HIGH_PRESSURE_FACTOR = 0.85

    def __init__(self, base_price: float = 100.0, cost: float = 60.0, seed: int = 42):
        """Store the price/cost settings and seed the random number generator.

        Args:
            base_price: Undiscounted selling price.
            cost: Unit cost subtracted from the selling price to get the margin.
            seed: Value passed to ``random.seed``.
        """
        self.base_price = base_price
        self.cost = cost
        # NOTE: this seeds the module-level `random` generator (shared global
        # state), which is the generator `sample_state` draws from.
        random.seed(seed)

    def sample_state(self) -> Tuple[str, str]:
        """Draw a random (segment, competitor_pressure) pair.

        Simulates a random incoming customer plus market condition using the
        fixed class-level sampling weights.
        """
        seg = random.choices(self.segments, weights=self._SEGMENT_WEIGHTS)[0]
        cp = random.choices(self.comp_pressures, weights=self._PRESSURE_WEIGHTS)[0]
        return (seg, cp)

    def step(self, state: Tuple[str, str], action_index: int) -> Tuple[float, dict]:
        """
        Given a state and action index, return reward and info.

        There is no next state: this simple simulation has no state
        transitions, so callers sample the next state themselves.

        Args:
            state: (segment, competitor_pressure) pair.
            action_index: Index into ``actions``; must be in
                ``[0, n_actions)``.

        Returns:
            ``(expected_profit, info)`` where ``info`` holds ``sell_price``,
            ``purchase_prob`` and ``discount_pct``.

        Raises:
            IndexError: If ``action_index`` is outside ``[0, n_actions)``.
            KeyError: If the segment in ``state`` is unknown.
        """
        # Reject negative indices explicitly: plain list indexing would wrap
        # them around and silently apply a different discount.
        if not 0 <= action_index < len(self.actions):
            raise IndexError(
                "action_index %r out of range [0, %d)" % (action_index, len(self.actions))
            )
        discount_pct = self.actions[action_index]
        sell_price = self.base_price * (1 - discount_pct / 100.0)

        # purchase probability model (linear in the discount, then clamped)
        seg, cp = state
        # base willingness by segment
        seg_base = self._SEGMENT_BASE_PROB[seg]
        # competitor pressure reduces purchase prob when high; any other
        # value is treated as no pressure
        cp_factor = self._HIGH_PRESSURE_FACTOR if cp == "high" else 1.0
        # discount effect: more discount increases purchase probability
        # map discount 0..20 to bump 0..0.5
        discount_bump = (discount_pct / 20.0) * 0.5
        purchase_prob = seg_base * cp_factor + discount_bump
        # clamp to a valid probability
        purchase_prob = max(0.0, min(1.0, purchase_prob))

        expected_profit = (sell_price - self.cost) * purchase_prob

        # reward: expected_profit (can be negative if price < cost)
        return expected_profit, {
            "sell_price": sell_price,
            "purchase_prob": purchase_prob,
            "discount_pct": discount_pct
        }

    def state_to_index(self, state: Tuple[str, str]) -> int:
        """Map a (segment, pressure) pair to its row index in a Q-table.

        Raises:
            ValueError: If either component is not a known segment/pressure.
        """
        seg_idx = self.segments.index(state[0])
        cp_idx = self.comp_pressures.index(state[1])
        return seg_idx * len(self.comp_pressures) + cp_idx

    def index_to_state(self, idx: int) -> Tuple[str, str]:
        """Inverse of ``state_to_index``: map a row index back to its state."""
        seg_idx = idx // len(self.comp_pressures)
        cp_idx = idx % len(self.comp_pressures)
        return (self.segments[seg_idx], self.comp_pressures[cp_idx])

    @property
    def n_states(self) -> int:
        """Number of distinct (segment, pressure) states."""
        return len(self.segments) * len(self.comp_pressures)

    @property
    def n_actions(self) -> int:
        """Number of available discount actions."""
        return len(self.actions)

# -------------- Tools --------------
@tool
def generate_env(config: dict = None) -> DiscountEnv:
    """Build a DiscountEnv from an optional settings mapping.

    Recognised keys are ``base_price`` (default 100.0), ``cost``
    (default 60.0) and ``seed`` (default 42). A missing or empty
    ``config`` yields an environment with all defaults.
    """
    settings = config if config else {}
    base_price = settings.get("base_price", 100.0)
    cost = settings.get("cost", 60.0)
    seed = settings.get("seed", 42)

    env = DiscountEnv(base_price=base_price, cost=cost, seed=seed)
    logging.info("Environment generated with base_price=%s cost=%s", env.base_price, env.cost)
    return env

@tool
def train_q_learning(env: DiscountEnv,
                     episodes: int = 2000,
                     alpha: float = 0.1,
                     gamma: float = 0.95,
                     epsilon_start: float = 1.0,
                     epsilon_end: float = 0.05,
                     epsilon_decay: float = 0.999,
                     steps_per_episode: int = 10) -> Dict[str, Any]:
    """
    Train a tabular Q-learning agent on the simple environment.

    Each episode starts from a freshly sampled state and runs
    ``steps_per_episode`` epsilon-greedy steps. Because ``env.step`` returns
    no next state, the next state of every step is drawn independently with
    ``env.sample_state()``.

    Args:
        env: Environment providing ``n_states``, ``n_actions``,
            ``sample_state``, ``state_to_index`` and ``step``.
        episodes: Number of training episodes.
        alpha: Learning rate of the Q-update.
        gamma: Discount factor applied to the best next-state value.
        epsilon_start: Initial exploration probability.
        epsilon_end: Lower bound for the exploration probability.
        epsilon_decay: Multiplicative decay applied to epsilon after each episode.
        steps_per_episode: Number of environment steps per episode (default 10).

    Returns:
        Dict with ``q_table`` (list of per-state action-value lists),
        ``rewards`` (summed reward of each episode) and ``params`` (the
        hyper-parameters used for the run).

    Raises:
        ValueError: If ``steps_per_episode`` is smaller than 1.
    """
    if steps_per_episode < 1:
        raise ValueError("steps_per_episode must be >= 1, got %r" % (steps_per_episode,))

    n_s = env.n_states
    n_a = env.n_actions
    # Initialize Q-table with zeros: one row per state, one column per action
    Q = [[0.0] * n_a for _ in range(n_s)]

    epsilon = epsilon_start
    rewards_history: List[float] = []
    # Log roughly ten progress lines regardless of the episode count
    log_every = max(1, episodes // 10)

    for ep in range(episodes):
        state = env.sample_state()
        s_idx = env.state_to_index(state)
        # transitions are stateless, so an "episode" is just a batch of
        # independent samples; several are run per episode for stability
        ep_reward = 0.0
        for _ in range(steps_per_episode):
            # Epsilon-greedy action selection
            if random.random() < epsilon:
                a = random.randrange(n_a)
            else:
                # greedy: first action with the highest Q-value
                a = max(range(n_a), key=Q[s_idx].__getitem__)

            reward, _ = env.step(state, a)
            ep_reward += reward

            # The environment has no transition model: sample the next state anew
            next_state = env.sample_state()
            ns_idx = env.state_to_index(next_state)

            # Q-learning update
            best_next = max(Q[ns_idx])
            Q[s_idx][a] += alpha * (reward + gamma * best_next - Q[s_idx][a])

            # continue the episode from the sampled next state
            state = next_state
            s_idx = ns_idx

        # decay epsilon, never going below epsilon_end
        epsilon = max(epsilon_end, epsilon * epsilon_decay)
        rewards_history.append(ep_reward)

        if (ep + 1) % log_every == 0:
            logging.info("Episode %d/%d: ep_reward=%.2f epsilon=%.3f", ep+1, episodes, ep_reward, epsilon)

    return {
        "q_table": Q,
        "rewards": rewards_history,
        "params": {
            "episodes": episodes,
            "alpha": alpha,
            "gamma": gamma,
            "epsilon_start": epsilon_start,
            "epsilon_end": epsilon_end,
            "epsilon_decay": epsilon_decay,
            "steps_per_episode": steps_per_episode,
        },
    }

@tool
def evaluate_policy(env: DiscountEnv, q_table: List[List[float]], trials: int = 1000) -> dict:
    """Estimate the average reward of the greedy policy encoded in ``q_table``.

    For each trial a state is sampled from ``env``, the action with the
    highest Q-value for that state is applied, and the reward is accumulated.

    Args:
        env: Environment used to sample states and compute rewards.
        q_table: Per-state lists of action values, indexed by
            ``env.state_to_index``.
        trials: Number of sampled states to evaluate; must be at least 1.

    Returns:
        Dict with ``avg_reward`` (mean reward per trial) and
        ``action_distribution`` (discount percentage -> times chosen).

    Raises:
        ValueError: If ``trials`` is smaller than 1 (an average over zero
            trials is undefined).
    """
    if trials < 1:
        raise ValueError("trials must be >= 1, got %r" % (trials,))

    total_reward = 0.0
    action_counts = {a: 0 for a in env.actions}
    for _ in range(trials):
        state = env.sample_state()
        s_idx = env.state_to_index(state)
        # pick greedy action (first one with the highest Q-value)
        a_idx = max(range(env.n_actions), key=q_table[s_idx].__getitem__)
        reward, _ = env.step(state, a_idx)
        total_reward += reward
        action_counts[env.actions[a_idx]] += 1
    avg_reward = total_reward / trials
    logging.info("Evaluation over %d trials => avg_reward=%.4f", trials, avg_reward)
    return {"avg_reward": avg_reward, "action_distribution": action_counts}

@tool
def save_policy(q_table: List[List[float]], filename: str = "q_policy.json") -> str:
    """Write ``q_table`` as JSON under the ``policies`` directory.

    Args:
        q_table: Q-table to serialise (nested lists of floats).
        filename: File name relative to ``policies``; it may contain
            sub-directories, which are created as needed.

    Returns:
        The path the policy was written to.
    """
    path = os.path.join("policies", filename)
    # Create the parent of the final path, not just "policies", so a filename
    # such as "run1/q.json" does not fail with FileNotFoundError.
    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(q_table, f)
    logging.info("Saved policy to %s", path)
    return path

@tool
def load_policy(filename: str = "q_policy.json") -> List[List[float]]:
    """Read a JSON policy file from the ``policies`` directory.

    ``filename`` is joined onto ``policies``; the parsed JSON content is
    returned as-is.
    """
    policy_path = os.path.join("policies", filename)
    with open(policy_path, "r") as handle:
        stored = json.loads(handle.read())
    logging.info("Loaded policy from %s", policy_path)
    return stored

# -------------- Agent Orchestrator (Agentic style) --------------
class RuleBasedAgent:
    """Keyword-driven orchestrator that dispatches a task string to tools.

    The agent expects tools registered under the names ``generate_env``,
    ``train_q_learning``, ``evaluate_policy``, ``save_policy`` and
    ``load_policy``; a missing tool surfaces as ``KeyError`` when a task
    needs it.
    """

    # (keyword argument, default) pairs forwarded to the train_q_learning tool.
    _TRAIN_DEFAULTS = (
        ("episodes", 2000),
        ("alpha", 0.1),
        ("gamma", 0.95),
        ("epsilon_start", 1.0),
        ("epsilon_end", 0.05),
        ("epsilon_decay", 0.999),
    )

    def __init__(self, tools):
        """Index the given callables by tool name.

        A callable's ``name`` attribute is used when present; otherwise its
        ``__name__`` is used, so undecorated functions can be registered too.
        """
        # map name -> function
        self.tools = {(getattr(t, "name", None) or t.__name__): t for t in tools}

    def _make_env(self, options: dict):
        """Create the environment from the optional ``env_config`` entry."""
        return self.tools["generate_env"](options.get("env_config"))

    def _train(self, env, options: dict):
        """Run the training tool with hyper-parameters taken from ``options``."""
        hyper = {key: options.get(key, default) for key, default in self._TRAIN_DEFAULTS}
        return self.tools["train_q_learning"](env, **hyper)

    def _evaluate(self, env, q_table, options: dict):
        """Run the evaluation tool for ``q_table`` (``trials`` defaults to 1000)."""
        return self.tools["evaluate_policy"](env, q_table, trials=options.get("trials", 1000))

    def invoke(self, task: str, **kwargs):
        """
        Very simple dispatcher: look for keywords in ``task`` and call the
        relevant tools. Matching is case-insensitive and substring-based.

        Supported tasks:
          - contains both 'train' and 'eval' (e.g. 'train_and_eval'):
            train, save the policy, then evaluate it
          - contains 'train': train q learning (returns training result)
          - contains 'eval' (e.g. 'evaluate'): evaluate the policy given as
            ``policy=...`` or, if absent, the one loaded from ``policy_name``

        Recognised keyword arguments: ``env_config``, the training
        hyper-parameters (``episodes``, ``alpha``, ``gamma``,
        ``epsilon_start``, ``epsilon_end``, ``epsilon_decay``), ``trials``,
        ``policy_name`` and ``policy``.

        Returns:
            A dict with the results of the executed steps (``train``,
            ``save_path``, ``eval``) or ``{"error": ...}`` for an
            unrecognised task.
        """
        task_l = task.lower()
        wants_train = "train" in task_l
        # "eval" also covers "evaluate" / "evaluation", so the evaluate-only
        # branch accepts the same keyword as the combined branch.
        wants_eval = "eval" in task_l

        if wants_train and wants_eval:
            env = self._make_env(kwargs)
            train_res = self._train(env, kwargs)
            q = train_res["q_table"]
            save_path = self.tools["save_policy"](q, kwargs.get("policy_name", "q_policy.json"))
            eval_res = self._evaluate(env, q, kwargs)
            return {"train": train_res, "save_path": save_path, "eval": eval_res}

        if wants_train:
            env = self._make_env(kwargs)
            return {"train": self._train(env, kwargs)}

        if wants_eval:
            env = self._make_env(kwargs)
            if "policy" in kwargs:
                q_tbl = kwargs["policy"]
            else:
                q_tbl = self.tools["load_policy"](kwargs.get("policy_name", "q_policy.json"))
            return {"eval": self._evaluate(env, q_tbl, kwargs)}

        return {"error": "Task not recognized. Use 'train', 'evaluate' or 'train_and_eval'."}

# -------------- Run example --------------
if __name__ == "__main__":
    # Register every tool with the orchestrator.
    tools = [generate_env, train_q_learning, evaluate_policy, save_policy, load_policy]
    agent = RuleBasedAgent(tools)

    # Settings for the demo run: environment, hyper-parameters, evaluation
    # size and the file name the learned policy is stored under.
    run_settings = {
        "env_config": {"base_price": 120.0, "cost": 70.0, "seed": 123},
        "episodes": 1500,
        "alpha": 0.08,
        "gamma": 0.95,
        "epsilon_start": 1.0,
        "epsilon_end": 0.05,
        "epsilon_decay": 0.998,
        "trials": 2000,
        "policy_name": "discount_q_policy.json",
    }

    # Train, save the policy, then evaluate it in one dispatcher call.
    logging.info("Starting training+evaluation run")
    result = agent.invoke("train_and_eval", **run_settings)

    # Pull the three result sections apart for reporting.
    train_info, eval_info, save_path = result["train"], result["eval"], result["save_path"]

    # Training report
    print("\n--- Training Summary ---")
    print("Episodes:", train_info["params"]["episodes"])
    print("Last 5 episode rewards (sample):", train_info["rewards"][-5:])

    # Evaluation report
    print("\n--- Evaluation Summary ---")
    print("Average expected profit per trial: {:.4f}".format(eval_info["avg_reward"]))
    print("Action distribution (discount_pct -> counts):")
    for d, cnt in sorted(eval_info["action_distribution"].items()):
        print(f"  {d}% -> {cnt}")

    print("\nPolicy saved to:", save_path)
    print("Done.")



--- Training Summary ---
Episodes: 1500
Last 5 episode rewards (sample): [218.16, 277.53499999999997, 267.535, 265.295, 232.24000000000004]

--- Evaluation Summary ---
Average expected profit per trial: 24.7966
Action distribution (discount_pct -> counts):
  0% -> 389
  5% -> 0
  10% -> 1132
  15% -> 0
  20% -> 479

Policy saved to: policies/discount_q_policy.json
Done.
