<a href="https://colab.research.google.com/github/manishadeepa/smartwriting-pen-and-pad/blob/main/reinforcement_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import random
import pickle
from IPython.display import display

class RLAgent:
    """Tabular Q-learning agent with epsilon-greedy exploration.

    The agent keeps a ``(state_space_size, action_space_size)`` table of
    action values. Both the learning rate (``alpha``) and the exploration
    rate (``epsilon``) can be decayed multiplicatively down to a floor.

    Args:
        state_space_size: Number of discrete states (rows of the Q-table).
        action_space_size: Number of discrete actions (columns of the Q-table).
        alpha: Initial learning rate.
        alpha_min: Lower bound applied by ``decay_alpha``.
        alpha_decay: Multiplicative factor applied by ``decay_alpha``.
        gamma: Discount factor used in the Q-learning target.
        epsilon: Initial probability of picking a random action.
        epsilon_min: Lower bound applied by ``decay_epsilon``.
        epsilon_decay: Multiplicative factor applied by ``decay_epsilon``.
    """

    def __init__(self, state_space_size, action_space_size, alpha=0.1,
                 alpha_min=0.01, alpha_decay=0.995, gamma=0.9, epsilon=0.1,
                 epsilon_min=0.01, epsilon_decay=0.995):
        # Optimistic initialization: every entry starts at 1.0 so untried
        # actions look attractive to the greedy branch.
        self.q_table = np.ones((state_space_size, action_space_size))
        self.alpha = alpha
        self.alpha_min = alpha_min
        self.alpha_decay = alpha_decay
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.state_space_size = state_space_size
        self.action_space_size = action_space_size

    def choose_action(self, state_idx):
        """Return an action index for ``state_idx`` using epsilon-greedy.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the highest Q-value (first one on ties).
        The result is always a plain Python ``int``.
        """
        if random.uniform(0, 1) < self.epsilon:
            return random.randint(0, self.action_space_size - 1)
        # np.argmax yields a numpy integer; cast so both branches return the
        # same type.
        return int(np.argmax(self.q_table[state_idx]))

    def update_q(self, state_idx, action_idx, reward, next_state_idx):
        """Apply one Q-learning update for the observed transition."""
        predict = self.q_table[state_idx, action_idx]
        target = reward + self.gamma * np.max(self.q_table[next_state_idx])
        self.q_table[state_idx, action_idx] += self.alpha * (target - predict)

    def decay_epsilon(self):
        """Shrink ``epsilon`` by ``epsilon_decay``, never below ``epsilon_min``."""
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def decay_alpha(self):
        """Shrink ``alpha`` by ``alpha_decay``, never below ``alpha_min``."""
        self.alpha = max(self.alpha_min, self.alpha * self.alpha_decay)

    def save(self, path='q_table.pkl'):
        """Pickle the Q-table to ``path``."""
        with open(path, 'wb') as f:
            pickle.dump(self.q_table, f)

    def load(self, path='q_table.pkl'):
        """Replace the Q-table with the one pickled at ``path``.

        Raises:
            FileNotFoundError: If ``path`` does not exist.
            ValueError: If the stored table does not have shape
                ``(state_space_size, action_space_size)``.
        """
        # NOTE: pickle can execute arbitrary code while loading; only load
        # files that this program (or a trusted source) produced.
        with open(path, 'rb') as f:
            table = np.asarray(pickle.load(f), dtype=float)
        expected = (self.state_space_size, self.action_space_size)
        if table.shape != expected:
            # Fail here rather than with an opaque IndexError on first use.
            raise ValueError(
                f"Q-table in {path!r} has shape {table.shape}, expected {expected}"
            )
        self.q_table = table

    def get_policy(self, state_to_index, actions):
        """Return the greedy policy as a ``{state: action}`` mapping.

        Args:
            state_to_index: Mapping from state to its row in the Q-table.
            actions: Sequence of actions, indexed by Q-table column.
        """
        return {
            state: actions[int(np.argmax(self.q_table[state_idx]))]
            for state, state_idx in state_to_index.items()
        }

# State space: every (accuracy, speed) pair with both components in {0, 1, 2},
# enumerated accuracy-major. Each state's list position is its Q-table row.
states = [(acc_level, speed_level)
          for acc_level in range(3)
          for speed_level in range(3)]
state_to_index = dict(zip(states, range(len(states))))
# Action space: each action's list position is its Q-table column.
actions = ["text", "visual", "audio", "simplify", "increase"]

def simulate_user_response(state, action):
    """Simulate one stochastic user reaction to ``action`` taken in ``state``.

    Args:
        state: ``(acc, speed)`` tuple.
        action: One of "text", "visual", "audio", "simplify", "increase";
            any other value is handled like "text".

    Returns:
        ``((new_acc, new_speed), reward)``.
    """
    start_acc, start_speed = state
    next_acc, next_speed = start_acc, start_speed

    def attempt_gain(threshold, win, lose):
        # One random draw; on success accuracy rises by one (capped at 2).
        # The reward is `win` only if accuracy actually went up.
        nonlocal next_acc
        if random.random() > threshold:
            next_acc = min(2, start_acc + 1)
        return win if next_acc > start_acc else lose

    # --- Accuracy change and reward -------------------------------------
    if action == "simplify":
        # Simplifying can only help at the lowest accuracy level.
        reward = attempt_gain(0.3, 1, -0.5) if start_acc == 0 else -0.5
    elif action == "increase":
        if start_acc == 2:
            # At top accuracy, raising difficulty may knock accuracy down.
            if random.random() > 0.5:
                next_acc = max(0, start_acc - 1)
            reward = -1 if next_acc < start_acc else 0
        else:
            reward = attempt_gain(0.6, 1, -0.5)
    elif action == "visual":
        # Visual content is only attempted when speed is below the maximum.
        reward = attempt_gain(0.5, 1, -0.5) if start_speed < 2 else -0.5
    elif action == "audio":
        # Audio content is only attempted when speed is above the minimum.
        reward = attempt_gain(0.5, 1, -0.5) if start_speed > 0 else -0.5
    else:  # Text
        reward = attempt_gain(0.7, 0.5, -1)

    # --- Speed change ---------------------------------------------------
    if action in ("visual", "audio"):
        if random.random() > 0.5:
            next_speed = start_speed + random.choice([-1, 0, 1])
        next_speed = min(2, max(0, next_speed))
    elif action == "increase":
        if random.random() > 0.6:
            next_speed = min(2, start_speed + 1)
    elif action == "simplify":
        if random.random() > 0.6:
            next_speed = max(0, start_speed - 1)

    return (next_acc, next_speed), reward

# Training loop: run 1000 episodes of at most 10 steps each against the
# simulated user, updating the agent's Q-table after every step.
agent = RLAgent(state_space_size=len(states), action_space_size=len(actions))
eval_rewards = []  # total reward of every finished episode, in order
successes = 0      # number of episodes that ended with state[0] == 2

for episode in range(1000):
    # Each episode starts from a uniformly random state.
    # NOTE(review): the start state may already have state[0] == 2; such an
    # episode counts as a success if state[0] is still 2 after its first step.
    state = random.choice(states)
    state_idx = state_to_index[state]
    total_reward = 0
    max_steps = 10

    for step in range(max_steps):
        action_idx = agent.choose_action(state_idx)
        action = actions[action_idx]

        new_state, reward = simulate_user_response(state, action)
        new_state_idx = state_to_index[new_state]

        agent.update_q(state_idx, action_idx, reward, new_state_idx)
        total_reward += reward

        state = new_state
        state_idx = new_state_idx

        # The episode ends early once the first state component reaches 2.
        if state[0] == 2:
            successes += 1
            break

    # Exploration and learning rates are decayed once per episode.
    agent.decay_epsilon()
    agent.decay_alpha()
    eval_rewards.append(total_reward)

    # Progress report every 100 episodes: mean reward over the last (up to)
    # 100 episodes and the cumulative success rate so far.
    if episode % 100 == 0:
        # eval_rewards is never empty here (appended just above), so the
        # `else 0` fallback is not reached.
        avg_reward = np.mean(eval_rewards[-100:]) if eval_rewards else 0
        success_rate = successes / (episode + 1)
        print(f"Episode {episode} | Avg Reward: {avg_reward:.2f} | Success Rate: {success_rate:.3f} | Epsilon: {agent.epsilon:.3f} | Alpha: {agent.alpha:.3f}")

# Persist the learned Q-table to q_table.pkl in the working directory.
agent.save("q_table.pkl")
print("Q-table saved!")

# Print the learned greedy policy (best action per state).
# Note: this loop rebinds the module-level names `state` and `action`.
policy = agent.get_policy(state_to_index, actions)
print("\nLearned Policy:")
for state, action in policy.items():
    print(f"State {state}: {action}")

Episode 0 | Avg Reward: -1.50 | Success Rate: 1.000 | Epsilon: 0.100 | Alpha: 0.100
Episode 100 | Avg Reward: 0.17 | Success Rate: 0.990 | Epsilon: 0.060 | Alpha: 0.060
Episode 200 | Avg Reward: 0.37 | Success Rate: 0.995 | Epsilon: 0.037 | Alpha: 0.037
Episode 300 | Avg Reward: 0.15 | Success Rate: 0.990 | Epsilon: 0.022 | Alpha: 0.022
Episode 400 | Avg Reward: 0.38 | Success Rate: 0.990 | Epsilon: 0.013 | Alpha: 0.013
Episode 500 | Avg Reward: 0.24 | Success Rate: 0.992 | Epsilon: 0.010 | Alpha: 0.010
Episode 600 | Avg Reward: 0.26 | Success Rate: 0.993 | Epsilon: 0.010 | Alpha: 0.010
Episode 700 | Avg Reward: 0.35 | Success Rate: 0.994 | Epsilon: 0.010 | Alpha: 0.010
Episode 800 | Avg Reward: 0.45 | Success Rate: 0.995 | Epsilon: 0.010 | Alpha: 0.010
Episode 900 | Avg Reward: 0.32 | Success Rate: 0.996 | Epsilon: 0.010 | Alpha: 0.010
Q-table saved!

Learned Policy:
State (0, 0): increase
State (0, 1): simplify
State (0, 2): simplify
State (1, 0): visual
State (1, 1): audio
State (1,

In [3]:
# Interactive agent for user input: loads the saved Q-table, suggests actions, and learns from feedback.

import numpy as np
import random
import pickle

class RLAgent:
    """Q-learning agent for interactive use, backed by a pickled Q-table.

    On construction the Q-table is read from ``q_table_path``; if that file
    does not exist, an all-zero table of shape
    ``(state_space_size, action_space_size)`` is used instead. Feedback
    passed to ``update_from_feedback`` updates the table and writes it back
    to the same ``q_table_path``.

    Args:
        state_space_size: Number of discrete states (rows of the Q-table).
        action_space_size: Number of discrete actions (columns of the Q-table).
        q_table_path: File the Q-table is loaded from and persisted to.
        alpha: Learning rate.
        gamma: Discount factor.
        epsilon: Probability of picking a random action.
    """

    def __init__(self, state_space_size, action_space_size, q_table_path='q_table.pkl', alpha=0.1, gamma=0.9, epsilon=0.1):
        # The sizes must be set before loading: the missing-file fallback in
        # _load_q_table reads them to build the zero table.
        self.state_space_size = state_space_size
        self.action_space_size = action_space_size
        self.q_table_path = q_table_path
        self.q_table = self._load_q_table(q_table_path)
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.states = [(0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (1, 2), (2, 0), (2, 1), (2, 2)]
        self.state_to_index = {s: i for i, s in enumerate(self.states)}
        self.actions = ["text", "visual", "audio", "simplify", "increase"]

    def _load_q_table(self, path):
        """Return the Q-table pickled at ``path``, or zeros if it is missing."""
        # NOTE: pickle can execute arbitrary code while loading; only load
        # files that this program (or a trusted source) produced.
        try:
            with open(path, 'rb') as f:
                return pickle.load(f)
        except FileNotFoundError:
            return np.zeros((self.state_space_size, self.action_space_size))

    def save_q_table(self, path='q_table.pkl'):
        """Pickle the Q-table to ``path``."""
        with open(path, 'wb') as f:
            pickle.dump(self.q_table, f)

    def choose_action(self, state_idx):
        """Return an epsilon-greedy action index (a Python ``int``)."""
        if random.uniform(0, 1) < self.epsilon:
            return random.randint(0, self.action_space_size - 1)
        # np.argmax yields a numpy integer; cast so both branches return the
        # same type.
        return int(np.argmax(self.q_table[state_idx]))

    def update_q(self, state_idx, action_idx, reward, next_state_idx):
        """Apply one Q-learning update for the observed transition."""
        predict = self.q_table[state_idx, action_idx]
        target = reward + self.gamma * np.max(self.q_table[next_state_idx])
        self.q_table[state_idx, action_idx] += self.alpha * (target - predict)

    def get_action(self, state):
        """Return the chosen action name for ``state``.

        Returns the string ``"Invalid state"`` when ``state`` is not one of
        ``self.states``.
        """
        if state not in self.state_to_index:
            return "Invalid state"
        state_idx = self.state_to_index[state]
        return self.actions[self.choose_action(state_idx)]

    def update_from_feedback(self, state, action, reward, next_state):
        """Learn from one observed transition and persist the Q-table.

        Args:
            state: State in which ``action`` was taken.
            action: Action name; must be in ``self.actions``.
            reward: Observed reward.
            next_state: Resulting state.

        Raises:
            KeyError: If ``state`` or ``next_state`` is not a known state.
            ValueError: If ``action`` is not a known action.
        """
        state_idx = self.state_to_index[state]
        action_idx = self.actions.index(action)
        next_state_idx = self.state_to_index[next_state]
        self.update_q(state_idx, action_idx, reward, next_state_idx)
        # Write back to the file this agent was loaded from, so the update
        # is seen the next time an agent is built with the same path.
        self.save_q_table(self.q_table_path)

    def load(self, path='q_table.pkl'):
        """Replace the Q-table with the one pickled at ``path``.

        Unlike the constructor, a missing file raises ``FileNotFoundError``.
        """
        with open(path, 'rb') as f:
            self.q_table = pickle.load(f)