In [None]:
import ast
import random
import time
from collections import Counter
from collections import defaultdict

import gymnasium as gym
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from gymnasium import spaces
from sklearn.model_selection import train_test_split
from stable_baselines3 import DQN
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.env_checker import check_env

In [None]:
#ENVIRONMENT SETUP 

class CustomerInteractionEnv(gym.Env):
    """Gymnasium environment that replays a logged customer-interaction dataset.

    Each step corresponds to one row of ``dataset`` (read positionally). The
    agent picks one of the action labels found in the ``Action`` column, or the
    extra ``"No Action"`` label, and is rewarded according to how its choice
    compares with the logged action of the current row.

    Args:
        dataset: DataFrame with ``State`` (sequence of numbers), ``Action`` and
            ``Reward`` columns.
        total_timesteps: Denominator of the linear decay applied to the
            exploration bonus.
    """

    def __init__(self, dataset, total_timesteps):
        super().__init__()
        self.dataset = dataset
        self.current_step = 0
        self.total_reward = 0  # Track cumulative reward
        self.total_timesteps = total_timesteps  # Total timesteps for training
        # Sort the labels so the index -> label mapping does not depend on the
        # row order of `dataset`. Environments built from different splits
        # (train / test) then decode a given action index to the same label,
        # provided both splits contain the same set of actions.
        self.actions = sorted(dataset['Action'].unique().tolist(), key=str) + ["No Action"]
        self.action_space = spaces.Discrete(len(self.actions))
        state_size = len(dataset.iloc[0]['State'])
        # NOTE(review): the bounds assume every state feature lies in [0, 1] — confirm.
        self.observation_space = spaces.Box(low=0, high=1, shape=(state_size,), dtype=np.float32)
        self.visited_states = defaultdict(int)  # Track visited states
        self.action_counts = defaultdict(int)  # Track action frequencies

    def preprocess_state(self, state):
        """Convert the state to a float32 NumPy array."""
        return np.array(state, dtype=np.float32)

    def calculate_similarity(self, action1, action2):
        """Return 1 when both actions are equal and 0 otherwise (placeholder metric)."""
        return 1 if action1 == action2 else 0  # Replace with actual similarity logic

    def reset(self, seed=None, options=None):
        """Rewind to the first dataset row and clear all per-episode statistics.

        Returns:
            tuple: ``(observation, info)`` where ``info`` is an empty dict.
        """
        super().reset(seed=seed)
        self.current_step = 0
        self.total_reward = 0  # Reset cumulative reward
        self.visited_states.clear()  # Reset visit counts
        self.action_counts.clear()  # Reset action frequencies
        initial_state = self.preprocess_state(self.dataset.iloc[self.current_step]['State'])
        return initial_state, {}  # Return a tuple (obs, info)

    def step(self, action):
        """Score ``action`` against the current row and advance to the next row.

        Reward components:
            * ``"No Action"``: -0.01.
            * Action equal to the logged one: the logged reward, raised to the
              power 1.5 when it lies in [10, 20].
            * Any other action: ``similarity * 0.5 - 0.2``.
            * Minus 0.001 for every previous use of the same action this episode.
            * Plus a decaying exploration bonus based on the state's visit count.

        Returns:
            tuple: ``(observation, reward, terminated, truncated, info)``. On the
            terminal step the observation is a zero vector of the observation
            shape, because no further dataset row exists.
        """
        row = self.dataset.iloc[self.current_step]
        chosen_action = self.actions[action]

        # Increment action count
        self.action_counts[chosen_action] += 1

        if chosen_action == "No Action":
            reward = -0.01
        elif chosen_action == row['Action']:
            reward = row['Reward']  # Logged reward for the matching action

            # Enhance positive rewards between 10 and 20
            if 10 <= reward <= 20:
                reward = reward ** 1.5
        else:
            similarity_score = self.calculate_similarity(chosen_action, row['Action'])
            reward = similarity_score * 0.5 - 0.2

        # Penalize repeated actions
        action_penalty = 0.001 * (self.action_counts[chosen_action] - 1)
        reward -= action_penalty

        # Exploration bonus for visiting less-explored states
        state_tuple = tuple(self.preprocess_state(row['State']))
        self.visited_states[state_tuple] += 1
        # NOTE(review): current_step restarts at 0 on every reset(), so this decay
        # only progresses within one episode, not across the whole training run —
        # confirm that this is the intended schedule.
        decay_factor = 1 - (self.current_step / self.total_timesteps)  # Linearly decay over time
        decay_factor = max(decay_factor, 0)  # Ensure the decay factor never goes below 0
        exploration_bonus = (0.8 / np.sqrt(self.visited_states[state_tuple])) * decay_factor
        exploration_bonus = min(exploration_bonus, 1.0)  # Cap the bonus at 1.0
        reward += exploration_bonus

        # Update total reward
        self.total_reward += reward

        if self.current_step % 10_000 == 0:
            print(
                f"Step: {self.current_step}, Action: {chosen_action}, Reward: {reward:.2f}, "
                f"Exploration Bonus: {exploration_bonus:.2f}, Total Reward: {self.total_reward:.2f}"
            )

        self.current_step += 1
        terminated = self.current_step >= len(self.dataset)
        truncated = False
        if not terminated:
            next_state = self.preprocess_state(self.dataset.iloc[self.current_step]['State'])
        else:
            # The Gymnasium API expects an element of observation_space from every
            # step, including the last one; returning None breaks callers that
            # treat the observation as an array. Use a zero vector as placeholder.
            next_state = np.zeros(self.observation_space.shape, dtype=np.float32)
        return next_state, float(reward), terminated, truncated, {}

    def render(self, mode='human'):
        """Print the current position within the dataset."""
        print(f"Step: {self.current_step}, Total Steps: {len(self.dataset)}")


In [None]:
#DATASET PREPARATION 

# Load the RL training dataset
rl_data = pd.read_csv("DRL_Training_Dataset_Subset.csv")

# Convert the State column (stored as text in the CSV) to a list of floats.
# ast.literal_eval is used instead of eval: it only accepts Python literals,
# so a crafted cell in the CSV cannot execute arbitrary code.
rl_data['State'] = rl_data['State'].apply(ast.literal_eval)




In [None]:
#ENVIRONMENT INITIALIZATION 

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
train_data, test_data = train_test_split(rl_data, random_state=42, test_size=0.2)

# One environment per split (the test environment is used later for evaluation).
env_train = CustomerInteractionEnv(train_data, 5_000_000)
env_test = CustomerInteractionEnv(test_data, 500_000)

# Validate both custom environments, training one first.
for env_to_validate in (env_train, env_test):
    check_env(env_to_validate, warn=True)

In [None]:
# Train a DQN model

class ProgressBarCallback(BaseCallback):
    """Print training progress as a percentage at a fixed wall-clock interval.

    Args:
        total_timesteps: Planned number of training timesteps (the 100% mark).
        verbose: Progress is printed only when this is greater than 0.
        report_interval: Minimum number of seconds between two progress
            messages. Defaults to 180 (three minutes).
    """

    def __init__(self, total_timesteps, verbose=0, report_interval=180):
        super().__init__(verbose)
        self.total_timesteps = total_timesteps
        self.report_interval = report_interval
        self.last_time_called = time.time()  # Initialize the timer

    def _on_training_start(self) -> None:
        """Restart the timer so the first interval is measured from the start of training."""
        self.last_time_called = time.time()

    def _on_step(self) -> bool:
        """Emit a progress line when the interval has elapsed; never stops training."""
        current_time = time.time()
        # Check if the reporting interval has passed
        if current_time - self.last_time_called >= self.report_interval:
            self.last_time_called = current_time
            if self.verbose > 0:
                # Only compute the percentage when it is actually reported
                current_progress = self.num_timesteps / self.total_timesteps * 100
                print(f"Training progress: {current_progress:.2f}%")
        return True


# Every tunable setting lives in one dictionary. "policy" and "total_timesteps"
# are consumed separately below; all other entries are DQN keyword arguments.
drl_hyperparameters = dict(
    policy="MlpPolicy",
    gamma=0.98,
    learning_rate=1e-4,
    buffer_size=300_000,
    learning_starts=5_000,
    batch_size=128,
    tau=0.01,
    train_freq=4,
    exploration_fraction=0.5,
    exploration_final_eps=0.02,
    target_update_interval=5_000,
    max_grad_norm=5,
    policy_kwargs={"net_arch": [512, 256]},
    verbose=1,
    total_timesteps=5_000_000,
)


# Build the DQN agent on the training environment, forwarding the settings
# above as keyword arguments (minus the two entries that are not DQN kwargs).
model = DQN(
    drl_hyperparameters["policy"],
    env_train,
    **{
        name: value
        for name, value in drl_hyperparameters.items()
        if name not in ("policy", "total_timesteps")
    },
)

# Run training, reporting progress through the callback.
progress_bar = ProgressBarCallback(drl_hyperparameters["total_timesteps"], verbose=1)
model.learn(
    total_timesteps=drl_hyperparameters["total_timesteps"],
    callback=progress_bar,
)

# Persist the trained agent.
model.save("dqn_customer_interaction")
print("Model saved.")


In [None]:
#MODEL EVALUATION

# Reload the agent from the "dqn_customer_interaction" file written by
# model.save(...) in the training cell; this rebinds `model` to the loaded copy.
model = DQN.load("dqn_customer_interaction")

def evaluate_model_performance(env, model, dataset, max_steps=1000):
    """Compare the model's deterministic policy with the actions logged in ``dataset``.

    The model is rolled out on ``env``. At step ``k`` its action is compared
    with row ``k`` of ``dataset``: the model is credited with the logged reward
    only when both actions match, otherwise it receives 0. Summary metrics are
    printed and the per-step traces are returned.

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` and an
            ``actions`` list that maps action indices to labels. Row ``k`` of
            ``dataset`` is assumed to belong to the ``k``-th observation of
            ``env`` (pass the dataset the environment was built from).
        model: Object whose ``predict(obs, deterministic=True)`` returns
            ``(action, state)``.
        dataset: DataFrame with ``Action`` and ``Reward`` columns.
        max_steps: Upper bound on the number of evaluated steps. Evaluation
            also stops at the end of ``dataset`` or when the episode ends.

    Returns:
        tuple[list[dict], list[dict]]: ``(model_path, dataset_path)``. Every
        entry has the keys ``step``, ``state`` (the observation the decision was
        made in), ``action`` and ``reward``.
    """
    obs = env.reset()[0]
    # Row k of `dataset` is read at step k, so never run past its last row.
    step_limit = min(max_steps, len(dataset))

    model_total_reward = 0
    dataset_total_reward = 0
    steps = 0

    model_path = []
    dataset_path = []

    dm_count = 0
    em_count = 0
    dataset_dm_count = 0
    dataset_em_count = 0
    model_negative_rewards = 0
    dataset_negative_rewards = 0
    no_action_wins = 0  # When avoiding an action prevents a negative reward
    no_action_losses = 0  # When avoiding an action misses a positive reward

    while steps < step_limit:
        action, _ = model.predict(obs, deterministic=True)
        # predict() may return a NumPy scalar/array; use a plain int as index.
        action = int(np.asarray(action).item())
        model_action = env.actions[action]
        logged_row = dataset.iloc[steps]
        dataset_action = logged_row['Action']
        dataset_reward = logged_row['Reward']

        # Assign reward only if action matches dataset
        model_reward = dataset_reward if model_action == dataset_action else 0
        model_total_reward += model_reward
        dataset_total_reward += dataset_reward

        # Track negative rewards
        if model_reward < 0:
            model_negative_rewards += 1
        if dataset_reward < 0:
            dataset_negative_rewards += 1

        # Check if "No Action" was beneficial or harmful
        if model_action == "No Action" and dataset_action != "No Action":
            if dataset_reward < 0:
                no_action_wins += 1  # Model avoided a bad action
            elif dataset_reward > 0:
                no_action_losses += 1  # Model skipped a good action

        # Capture the observation BEFORE stepping: it is the state this decision
        # was made in, and the observation returned by the final step may not be
        # a usable array.
        state_snapshot = np.asarray(obs).tolist()
        model_path.append({"step": steps, "state": state_snapshot, "action": model_action, "reward": model_reward})
        dataset_path.append({"step": steps, "state": state_snapshot, "action": dataset_action, "reward": dataset_reward})

        # Count contact actions
        if model_action == "DM_sent":
            dm_count += 1
        elif model_action == "EMsent":
            em_count += 1

        if dataset_action == "DM_sent":
            dataset_dm_count += 1
        elif dataset_action == "EMsent":
            dataset_em_count += 1

        obs, _, terminated, truncated, _ = env.step(action)

        steps += 1
        if terminated or truncated:
            break

    # Compute evaluation metrics
    model_total_contacts = dm_count + em_count
    dataset_total_contacts = dataset_dm_count + dataset_em_count

    # Each "DM_sent" is charged 1 CHF; other actions carry no cost here.
    model_total_cost = dm_count * 1
    dataset_total_cost = dataset_dm_count * 1

    model_reward_per_contact = model_total_reward / model_total_contacts if model_total_contacts > 0 else 0
    dataset_reward_per_contact = dataset_total_reward / dataset_total_contacts if dataset_total_contacts > 0 else 0

    model_cost_per_reward = model_total_cost / model_total_reward if model_total_reward != 0 else float('inf')
    dataset_cost_per_reward = dataset_total_cost / dataset_total_reward if dataset_total_reward != 0 else float('inf')

    # Print Results (denominators use the number of steps actually evaluated)
    print(f"Model Total Reward: {model_total_reward:.2f} (Only for matched actions)")
    print(f"Dataset Total Reward: {dataset_total_reward:.2f}")
    print(f"Total Contacts in Model: {model_total_contacts} out of {steps}")
    print(f"Total Contacts in Dataset: {dataset_total_contacts} out of {steps}")
    print(f"Difference in Contacts Between Model and Dataset: {model_total_contacts - dataset_total_contacts}")
    print(f"Model Total Cost: {model_total_cost} CHF")
    print(f"Dataset Total Cost: {dataset_total_cost} CHF")
    print(f"Model Reward per Contact: {model_reward_per_contact:.4f}")
    print(f"Dataset Reward per Contact: {dataset_reward_per_contact:.4f}")
    print(f"Model Cost per Reward Point: {model_cost_per_reward:.4f} CHF")
    print(f"Dataset Cost per Reward Point: {dataset_cost_per_reward:.4f} CHF")
    print(f"Model Negative Reward Count: {model_negative_rewards} out of {steps}")
    print(f"Dataset Negative Reward Count: {dataset_negative_rewards} out of {steps}")
    print(f"'No Action' Wins (Avoided Bad Actions): {no_action_wins} times")
    print(f"'No Action' Losses (Missed Good Actions): {no_action_losses} times")

    return model_path, dataset_path

def plot_model_performance(model_path, dataset_path):
    """Draw three model-vs-dataset charts from the evaluation traces.

    The charts are, in order: cumulative reward, cumulative cost (1 CHF per
    "DM_sent" action) and a grouped bar chart of how often each action occurs.

    Args:
        model_path: List of per-step dicts with "reward" and "action" keys
            describing the model's choices.
        dataset_path: Same structure for the logged dataset actions.
    """

    def _column(path, key):
        # Pull one field out of every step record.
        return [entry[key] for entry in path]

    def _running_dm_cost(action_labels):
        # One cost unit for each "DM_sent", accumulated over the steps.
        return np.cumsum([int(label == "DM_sent") for label in action_labels])

    def _comparison_line_plot(model_series, dataset_series, metric, y_label):
        # Shared layout of the two cumulative line charts.
        plt.figure(figsize=(10, 5))
        plt.plot(model_series, label=f"Model Cumulative {metric}", color="#2f58d4")
        plt.plot(dataset_series, label=f"Dataset Cumulative {metric}", color="#ab9f7d", linestyle="dashed")
        plt.xlabel("Steps")
        plt.ylabel(y_label)
        plt.title(f"Cumulative {metric} Comparison")
        plt.legend()
        plt.grid()
        plt.show()

    model_actions = _column(model_path, "action")
    dataset_actions = _column(dataset_path, "action")

    # Cumulative rewards
    _comparison_line_plot(
        np.cumsum(_column(model_path, "reward")),
        np.cumsum(_column(dataset_path, "reward")),
        "Reward",
        "Cumulative Reward",
    )

    # Cumulative cost
    _comparison_line_plot(
        _running_dm_cost(model_actions),
        _running_dm_cost(dataset_actions),
        "Cost",
        "Cumulative Cost (CHF)",
    )

    # Action distribution: tally each side, then align both on the sorted union of labels.
    model_tally = Counter(model_actions)
    dataset_tally = Counter(dataset_actions)
    actions = sorted(set(model_tally) | set(dataset_tally))
    model_counts = [model_tally.get(label, 0) for label in actions]
    dataset_counts = [dataset_tally.get(label, 0) for label in actions]

    positions = np.arange(len(actions))

    plt.figure(figsize=(10, 5))
    plt.bar(positions - 0.2, model_counts, width=0.4, label="Model Actions", color="#2f58d4")
    plt.bar(positions + 0.2, dataset_counts, width=0.4, label="Dataset Actions", color="#ab9f7d", alpha=0.7)
    plt.xticks(positions, actions, rotation=45)
    plt.xlabel("Actions")
    plt.ylabel("Count")
    plt.title("Action Distribution: Model vs Dataset")
    plt.legend()
    plt.grid(axis="y")
    plt.tight_layout()
    plt.show()

# Sample dataset for evaluation.
# The sample size is capped at the number of available rows: DataFrame.sample
# raises ValueError when asked for more rows than exist (without replacement).
eval_subset = rl_data.sample(min(10000, len(rl_data)), random_state=42).reset_index(drop=True)
eval_env = CustomerInteractionEnv(eval_subset, total_timesteps=10_000_000)
# NOTE(review): eval_subset / eval_env are not used by the evaluation below, which
# runs on the held-out test split. eval_subset is drawn from all of rl_data, so it
# also contains training rows — keep it out of any held-out comparison.

# Run evaluation on the test environment and the dataset it was built from
model_path, dataset_path = evaluate_model_performance(env_test, model, test_data, max_steps=1000)

# Call function after evaluation
plot_model_performance(model_path, dataset_path)

In [None]:
#MODEL APPLICATION TEST

# Choose a random starting state from test_data.
# random.randint includes both end points, so clamp the upper bound to the last
# valid row position; otherwise a short test set raises IndexError.
i = random.randint(0, min(1000, len(test_data) - 1))
state = test_data['State'].iloc[i]

# Parse the state if it is still serialized as text. ast.literal_eval only
# accepts Python literals, unlike eval, which would execute arbitrary code.
if isinstance(state, str):
    state = ast.literal_eval(state)

obs = np.array(state, dtype=np.float32)  # Convert to NumPy array

# Get the best predicted action for this state
action, _ = model.predict(obs, deterministic=True)
action = int(np.asarray(action).item())  # predict() may return a NumPy scalar/array

# Decode the index with the label list of the environment the model was trained
# on. Rebuilding the list from test_data could yield a different order and
# therefore a wrong label.
actions = list(env_train.actions)
action_name = actions[action] if 0 <= action < len(actions) else "Error"

# Print the best action
print(f"Predicted best action for state {i}: {action_name}")
