<a href="https://colab.research.google.com/github/jyotidabass/Revise-Reinforcement-Learning/blob/main/Revise_RL_and_XAI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Learning From Feedback (Conceptual Code)**

In [None]:
def try_new_action():
    """Placeholder: announce that a different action will be explored."""
    print("Trying a new action...")


def repeat_action():
    """Placeholder: announce that the previous action will be reused."""
    print("Repeating the action...")


# Placeholder feedback signal for the most recent action.
reward = 5

# Zero or positive feedback: keep doing what worked.
# Negative feedback: the last action was punished, so try something else.
if reward >= 0:
    repeat_action()
else:
    try_new_action()

Repeating the action...


# **Tiny Code to Lock In the Idea — Markov Decision Process (MDP)**

In [None]:
# Current state of the toy MDP: the customer segment being served.
state = "active_customer"

# Fixed policy expressed as one conditional expression: active customers get
# an upgrade offer, every other state gets a discount.
action = "offer_upgrade" if state == "active_customer" else "send_discount"

print(action)

offer_upgrade


# **Simple Q-Learning Update**

In [None]:
# Placeholder hyper-parameters and transition data for one update step.
alpha = 0.1  # learning rate
gamma = 0.9  # discount factor
next_state = "active_customer"  # state reached after taking the action

# Placeholder Q-table: one row per state, one zero-initialised entry per action.
Q = {
    customer: {"offer_upgrade": 0.0, "send_discount": 0.0}
    for customer in ("active_customer", "inactive_customer")
}

# `state`, `action` and `reward` are not defined in this cell; they are read
# from the notebook's global namespace as left by previously executed cells.
current_q = Q[state][action]
best_next_q = max(Q[next_state].values())

# Temporal-difference error: (reward + discounted best next value) - current estimate.
td_error = reward + gamma * best_next_q - current_q

# Move the current estimate a fraction `alpha` of the way towards the target.
Q[state][action] = current_q + alpha * td_error

# **Conceptual Code Example--Deep Q-Networks (DQN)**

In [None]:
import torch.nn as nn
import torch
import numpy as np # Import numpy

class DQN(nn.Module):
    """Small fully connected Q-network: state features in, one value per action out.

    Args:
        state_size: Number of input features describing a state.
        action_size: Number of actions, i.e. the width of the output layer.
        hidden_size: Width of the single hidden layer. Defaults to 64, the
            value that was previously hard-coded.
    """

    def __init__(self, state_size, action_size, hidden_size=64):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(state_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, action_size)
        )

    def forward(self, state):
        """Compute the network output for one state or a batch of states.

        Args:
            state: A NumPy array, a torch tensor, or a (nested) sequence of
                numbers. Any numeric dtype is accepted and cast to float32.
                A 1-D input is treated as a single sample.

        Returns:
            A tensor with one row per input sample and ``action_size`` columns;
            a single 1-D sample yields a batch of size 1.
        """
        # torch.as_tensor handles ndarrays, lists and tensors uniformly and is
        # a no-op for tensors that are already float32. Previously only
        # ndarrays were cast, so float64/int tensors and lists raised errors.
        state = torch.as_tensor(state, dtype=torch.float32)
        # Add a batch dimension for a single sample: (features,) -> (1, features).
        if state.dim() == 1:
            state = state.unsqueeze(0)
        return self.model(state)

# **Tiny Code Idea — Actor–Critic & Modern Reinforcement Learning**

In [None]:
def actor(state):
    """Placeholder policy: log the observed state and return a fixed action."""
    print(f"Actor: Current state is {state}")
    chosen_action = "some_action"
    return chosen_action


def critic(state):
    """Placeholder value function: log the state and return a fixed estimate."""
    print(f"Critic: Evaluating state {state}")
    estimated_value = 0.5
    return estimated_value


def adjust_actor():
    """Placeholder update step: report that the actor was adjusted."""
    print("Actor adjusted based on feedback.")


# Placeholder inputs for a single actor-critic step.
expected = 0.6
state = "current_observation"

# The actor proposes an action, then the critic scores the same state.
action = actor(state)
value = critic(state)

# The actor is adjusted only when the critic's score is below the expected value.
if value >= expected:
    print("No adjustment needed.")
else:
    adjust_actor()

Actor: Current state is current_observation
Critic: Evaluating state current_observation
Actor adjusted based on feedback.


# **Explainable AI (XAI) for Reinforcement Learning--Simple Conceptual Code for Explanation**

In [None]:
import shap
import torch

# Compute SHAP values for the outputs of a freshly constructed (untrained) DQN.
# `DQN` is not defined in this cell: it must already exist in the kernel, i.e.
# the cell that defines the DQN class has to be run first.
# Everything below is placeholder data; swap in a trained model and real states.
state_size = 4 # Example state size
action_size = 2 # Example action size
model = DQN(state_size, action_size)

# A single random state to explain, shaped (1, state_size).
# NOTE(review): no random seed is set in this cell, so the random draws below
# (and presumably the model's initial weights) change on every run — the
# resulting SHAP values are not reproducible as written.
state_data = torch.randn(1, state_size)

# Background dataset for the masker: 10 random states, shaped (10, state_size).
masker_data = torch.randn(10, state_size) # 10 samples for background

# Wrap the background data in an explicit shap.maskers.Independent object and
# pass that object (rather than the raw tensor) as the explainer's masker.
# The tensor is converted to a NumPy array before being handed to SHAP.
# NOTE(review): presumably SHAP fills masked-out features with values taken
# from this background set — confirm against the SHAP masker documentation.
masker_obj = shap.maskers.Independent(masker_data.numpy())

# The model is passed to SHAP as a plain callable and is evaluated on NumPy
# input; DQN.forward converts NumPy arrays to float tensors itself.
explainer = shap.Explainer(model, masker=masker_obj)
shap_values = explainer(state_data.numpy()) # Convert state_data to NumPy array

In [None]:
# Safety shaping: when a constraint is violated, a fixed penalty is deducted
# from the reward. `reward` is not defined in this cell; it is read from and
# written back to the notebook's global namespace, so re-running this cell
# deducts the penalty again.
penalty = 10  # Placeholder
safety_constraint_violated = True  # Placeholder

if safety_constraint_violated:
    reward -= penalty

# **Simple Example of Explanation Evaluation**

In [None]:
# Feature-importance scores produced by some explanation method (placeholder
# values), keyed by feature name.
explanation = dict(demand=0.6, season=0.3, promotion=0.1)

# Report the feature with the largest score; on a tie the first one listed wins.
print(max(explanation.items(), key=lambda item: item[1])[0])

demand
