# In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import deque
import random
import pandas as pd
import matplotlib.pyplot as plt


# Neural Network for Deep Q-learning
class DQN(nn.Module):
    """Fully connected Q-network mapping a state vector to per-action Q-values.

    Two ReLU-activated hidden layers of ``hidden_size`` units are followed by
    a linear output layer with ``action_size`` units.
    """

    def __init__(self, state_size, action_size, hidden_size=128):
        super().__init__()
        # The attribute names double as state_dict keys used by checkpoints,
        # so they are kept stable.
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, action_size)

    def forward(self, x):
        """Return the raw (un-activated) Q-value estimates for ``x``."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = F.relu(layer(hidden))
        return self.fc3(hidden)


# Replay Buffer for experience replay
class ReplayBuffer:
    """Bounded FIFO store of transitions used for experience replay."""

    def __init__(self, capacity=10000):
        # A deque with maxlen drops the oldest transition once it is full.
        self.buffer = deque(maxlen=capacity)

    def add(self, state, action, reward, next_state, done):
        """Append one ``(state, action, reward, next_state, done)`` transition."""
        transition = (state, action, reward, next_state, done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct stored transitions at random.

        Returns a 5-tuple of NumPy arrays
        ``(states, actions, rewards, next_states, dones)``, each with one
        entry per sampled transition.
        """
        transitions = random.sample(self.buffer, batch_size)
        # Transpose the list of transitions into one column per field.
        states, actions, rewards, next_states, dones = zip(*transitions)
        columns = (states, actions, rewards, next_states, dones)
        return tuple(np.array(column) for column in columns)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)


# Agent implementing the DQN algorithm
class DQNAgent:
    """Epsilon-greedy DQN agent acting on a discretised (alpha, r) grid.

    A flat action index encodes one cell of the grid as
    ``action = alpha_idx * num_r_steps + r_idx``. The agent keeps a policy
    network that is trained every ``update()`` call and a target network
    that is synchronised with it every ``update_target_freq`` updates.
    """

    def __init__(
        self,
        state_size,
        action_size,
        alpha_min,
        alpha_max,
        r_min,
        r_max,
        num_alpha_steps=10,
        num_r_steps=10,
        discount_factor=0.99,
        epsilon_start=1.0,
        epsilon_end=0.01,
        epsilon_decay=0.995,
        learning_rate=0.001,
        batch_size=64,
        update_target_freq=10,
    ):
        """Build the networks, optimizer, replay memory and metric logs.

        Args:
            state_size: Length of the state vector fed to the networks.
            action_size: Number of network outputs (discrete actions).
            alpha_min, alpha_max: Inclusive bounds of the alpha grid.
            r_min, r_max: Inclusive bounds of the r grid.
            num_alpha_steps, num_r_steps: Number of grid points per axis.
            discount_factor: Gamma used in the TD target.
            epsilon_start, epsilon_end, epsilon_decay: Exploration schedule;
                epsilon is multiplied by ``epsilon_decay`` after every
                training update and floored at ``epsilon_end``.
            learning_rate: Adam learning rate.
            batch_size: Transitions sampled per training update.
            update_target_freq: Training updates between target-net syncs.
        """
        self.state_size = state_size
        self.action_size = action_size

        # Define the discrete action space
        self.alpha_range = np.linspace(alpha_min, alpha_max, num_alpha_steps)
        self.r_range = np.linspace(r_min, r_max, num_r_steps)
        self.alpha_steps = num_alpha_steps
        self.r_steps = num_r_steps

        # Hyperparameters
        self.gamma = discount_factor  # Discount factor
        self.epsilon = epsilon_start  # Exploration rate
        self.epsilon_min = epsilon_end
        self.epsilon_decay = epsilon_decay
        self.batch_size = batch_size
        self.update_target_freq = update_target_freq

        # Policy and target networks (target starts as an exact copy)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.policy_net = DQN(state_size, action_size).to(self.device)
        self.target_net = DQN(state_size, action_size).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        # Optimizer
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=learning_rate)

        # Replay buffer
        self.memory = ReplayBuffer()

        # Metrics
        self.training_rewards = []
        self.epsilon_history = []
        self.avg_q_values = []
        self.losses = []
        self.steps = 0

    def select_action(self, state):
        """Return an action index chosen epsilon-greedily for ``state``.

        On the greedy branch the mean Q-value of the state is also appended
        to ``avg_q_values`` for monitoring.
        """
        if np.random.rand() <= self.epsilon:
            # Exploration: choose a random action
            return np.random.randint(0, self.action_size)

        # Exploitation: choose best action based on Q-values
        state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        with torch.no_grad():
            q_values = self.policy_net(state_tensor)
            self.avg_q_values.append(q_values.mean().item())
            return q_values.argmax().item()

    def action_to_params(self, action):
        """Decode a flat action index into its ``(alpha, r)`` grid values.

        Raises:
            IndexError: If ``action`` does not address a cell of the grid.
                Without this check a negative index would silently wrap
                around to the end of the grid.
        """
        if not 0 <= action < self.alpha_steps * self.r_steps:
            raise IndexError(
                f"action {action} is outside the "
                f"{self.alpha_steps}x{self.r_steps} (alpha, r) grid"
            )

        alpha_idx, r_idx = divmod(action, self.r_steps)
        return self.alpha_range[alpha_idx], self.r_range[r_idx]

    def update(self):
        """Run one training step on a batch sampled from replay memory.

        Does nothing until the memory holds at least ``batch_size``
        transitions. Otherwise it minimises the Huber loss between
        Q(s, a) and the TD target, syncs the target network every
        ``update_target_freq`` updates and decays epsilon.
        """
        if len(self.memory) < self.batch_size:
            return

        # Sample a batch of experiences
        states, actions, rewards, next_states, dones = self.memory.sample(
            self.batch_size
        )

        # Convert to tensors
        states = torch.FloatTensor(states).to(self.device)
        actions = torch.LongTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)

        # Q(s_t, a) for the actions that were actually taken
        q_values = self.policy_net(states).gather(1, actions.unsqueeze(1))

        # The TD target is a constant w.r.t. the policy parameters, so it is
        # built without tracking gradients.
        with torch.no_grad():
            next_q_values = self.target_net(next_states).max(1)[0]
            # (1 - dones) removes the bootstrap term on terminal transitions
            expected_q_values = rewards + self.gamma * next_q_values * (1 - dones)

        # Compute loss
        loss = F.smooth_l1_loss(q_values, expected_q_values.unsqueeze(1))
        self.losses.append(loss.item())

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        # Clamp every gradient element to [-1, 1]; the utility skips
        # parameters that received no gradient.
        nn.utils.clip_grad_value_(self.policy_net.parameters(), 1.0)
        self.optimizer.step()

        # Update target network periodically
        self.steps += 1
        if self.steps % self.update_target_freq == 0:
            self.target_net.load_state_dict(self.policy_net.state_dict())

        # Decay epsilon
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
        self.epsilon_history.append(self.epsilon)

    def save(self, path):
        """Write networks, optimizer state, epsilon and step count to ``path``."""
        torch.save(
            {
                "policy_net": self.policy_net.state_dict(),
                "target_net": self.target_net.state_dict(),
                "optimizer": self.optimizer.state_dict(),
                "epsilon": self.epsilon,
                "steps": self.steps,
            },
            path,
        )

    def load(self, path):
        """Restore the state written by ``save`` from ``path``."""
        # map_location lets a checkpoint saved on a GPU be restored on a
        # CPU-only machine (and vice versa).
        checkpoint = torch.load(path, map_location=self.device)
        self.policy_net.load_state_dict(checkpoint["policy_net"])
        self.target_net.load_state_dict(checkpoint["target_net"])
        self.optimizer.load_state_dict(checkpoint["optimizer"])
        self.epsilon = checkpoint["epsilon"]
        self.steps = checkpoint["steps"]


# TokenomicsEnvironment simulates the blockchain environment
class TokenomicsEnvironment:
    """Simulated token market priced by ``alpha * supply ** (1 / (1 - r))``.

    Each step the caller supplies an ``(alpha, r)`` pair. The environment
    reprices the token, adjusts the day's buy/sell volume from ``dex_data``
    according to the price move, updates supply, and returns a reward that
    blends price stability, project success and investor return.

    ``dex_data`` is either loaded from a CSV or generated synthetically once
    in ``__init__``; ``reset()`` does not regenerate it.
    """

    # Columns that get_state() and step() read from each dex_data row.
    _REQUIRED_COLUMNS = ("volume", "buys", "sells", "tvl", "qf_valuation")

    def __init__(
        self,
        dex_data_path=None,  # Path to DEX trading data
        initial_supply=1000000,
        initial_alpha=100,
        initial_r=0.3,  # 30% reserve ratio
        episode_length=30,  # 30 days simulation
        investor_weight=0.5,  # Weight for investor interests vs project interests
        use_real_data=False,
    ):
        """Set up initial market state and load or generate the DEX data.

        Real data is only read when both ``use_real_data`` is true and
        ``dex_data_path`` is given; otherwise synthetic data is generated.

        Raises:
            ValueError: If the loaded CSV lacks a required column or has no
                rows.
        """
        self.initial_supply = initial_supply
        self.initial_alpha = initial_alpha
        self.initial_r = initial_r
        self.episode_length = episode_length
        self.investor_weight = investor_weight
        self.use_real_data = use_real_data
        self.current_step = 0

        # State variables
        self.supply = initial_supply
        self.price = self.calculate_price(initial_supply, initial_alpha, initial_r)
        self.volume = 0
        self.buys = 0
        self.sells = 0
        self.tvl = 0
        self.qf_valuation = 0

        # Metrics
        self.price_stability = 100  # Higher is better
        self.project_success_rate = 0
        self.investor_return_rate = 0

        # Load real DEX data if available
        if use_real_data and dex_data_path:
            self.dex_data = self._validate_dex_data(pd.read_csv(dex_data_path))
        else:
            # Generate synthetic data
            self.dex_data = self.generate_synthetic_data()

    def _validate_dex_data(self, frame):
        """Return ``frame`` unchanged after checking it is usable.

        Failing here gives a clear error at construction time instead of a
        ``KeyError`` or modulo-by-zero on the first state lookup.
        """
        missing = [col for col in self._REQUIRED_COLUMNS if col not in frame.columns]
        if missing:
            raise ValueError(
                f"DEX data is missing required column(s): {', '.join(missing)}"
            )
        if frame.empty:
            raise ValueError("DEX data contains no rows")
        return frame

    def _current_day_data(self):
        """Return the dex_data row for the current step, wrapping around."""
        return self.dex_data.iloc[self.current_step % len(self.dex_data)]

    def generate_synthetic_data(self):
        """Return a 90-row DataFrame of synthetic daily DEX activity.

        Prices follow a random walk starting at the current ``self.price``;
        volume, buy/sell split, TVL and QF valuation are derived from each
        day's price change plus noise.
        """
        data = []
        base_price = self.price
        base_volume = self.initial_supply * 0.01  # 1% daily volume

        for day in range(90):  # Generate 90 days of data
            # Add some randomness to price movements with a slight upward bias
            price_change = np.random.normal(
                0.002, 0.02
            )  # Mean 0.2% daily growth, 2% std
            price = base_price * (1 + price_change)

            # Volume also varies with some correlation to absolute price change
            volume_multiplier = (
                1 + abs(price_change) * 5
            )  # Higher volatility, higher volume
            volume = base_volume * volume_multiplier

            # Generate buy/sell ratio
            if price_change > 0:
                buy_ratio = 0.5 + np.random.random() * 0.3  # 50-80% buys on up days
            else:
                buy_ratio = 0.2 + np.random.random() * 0.3  # 20-50% buys on down days

            buys = int(volume * buy_ratio)
            sells = int(volume * (1 - buy_ratio))

            # Generate QF valuations with some correlation to price trend
            qf_trend = np.random.normal(
                price_change * 2, 0.05
            )  # More extreme than price
            # NOTE(review): this uses self.price (the starting price), while
            # tvl below uses the day's walked price -- confirm this is
            # intended.
            qf_valuation = max(0, self.initial_supply * self.price * (1 + qf_trend))

            # Generate TVL
            tvl = self.initial_supply * price * (0.4 + 0.2 * np.random.random())

            data.append(
                {
                    "day": day,
                    "price": price,
                    "volume": volume,
                    "buys": buys,
                    "sells": sells,
                    "tvl": tvl,
                    "qf_valuation": qf_valuation,
                }
            )

            base_price = price  # Update base price for next iteration

        return pd.DataFrame(data)

    def calculate_price(self, supply, alpha, r):
        """Return ``alpha * supply ** beta`` with ``beta = 1 / (1 - r)``.

        A supply of exactly 0 returns ``alpha``. ``r = 1`` makes beta
        undefined (division by zero).
        """
        # NOTE(review): beta grows quickly as r approaches 1, so prices can
        # become very large for big supplies -- confirm the intended scale.
        if supply == 0:
            return alpha

        beta = 1 / (1 - r)
        return alpha * (supply**beta)

    def reset(self):
        """Restore step counter, supply, price and metrics; return the state."""
        self.current_step = 0
        self.supply = self.initial_supply
        self.price = self.calculate_price(
            self.initial_supply, self.initial_alpha, self.initial_r
        )

        # Reset metrics
        self.price_stability = 100
        self.project_success_rate = 0
        self.investor_return_rate = 0

        # Get initial state
        return self.get_state()

    def get_state(self):
        """Return the 9-element observation vector for the current step.

        Order: scaled price, relative supply, volume share, buy ratio, TVL
        ratio, QF valuation ratio, stability, project success, investor
        return (the last three divided by 100).
        """
        day_data = self._current_day_data()
        # Small epsilon keeps the ratios finite when a denominator is zero.
        market_cap = self.supply * self.price + 1e-10

        # Normalize state values for neural network
        state = np.array(
            [
                self.price / 1000,  # Normalized price
                self.supply / self.initial_supply,  # Relative supply
                day_data["volume"] / self.initial_supply,  # Volume as % of supply
                day_data["buys"]
                / (day_data["buys"] + day_data["sells"] + 1e-10),  # Buy ratio
                day_data["tvl"] / market_cap,  # TVL ratio
                day_data["qf_valuation"] / market_cap,  # QF valuation ratio
                self.price_stability / 100,  # Normalized stability
                self.project_success_rate / 100,  # Normalized project success
                self.investor_return_rate / 100,  # Normalized investor return
            ]
        )

        return state

    def step(self, action_params):
        """Apply ``(alpha, r)`` for one day and advance the simulation.

        Returns:
            ``(next_state, reward, done, info)`` where ``done`` becomes true
            once ``episode_length`` steps have been taken and ``info`` holds
            the price, supply and the three metric values.
        """
        # Unpack action parameters
        alpha, r = action_params

        day_data = self._current_day_data()

        # Store old price for comparison
        old_price = self.price

        # Calculate new price with new parameters
        new_price = self.calculate_price(self.supply, alpha, r)

        # Relative size of the price move. A zero previous price (possible
        # when alpha is 0) would otherwise give NaN or ZeroDivisionError.
        if old_price:
            price_change_pct = abs(new_price - old_price) / old_price
        else:
            price_change_pct = 0.0 if new_price == 0 else float("inf")

        # Calculate stability - lower price volatility is better
        new_stability = 100 * (
            1 - min(1, price_change_pct * 5)
        )  # Penalize large price changes
        self.price_stability = (
            0.7 * self.price_stability + 0.3 * new_stability
        )  # Weighted average

        # Simulate token demand based on price change
        if new_price > old_price:
            # Price increase: fewer buys (demand elasticity)
            buy_adjustment = max(0.5, 1 - price_change_pct * 3)
            # More sells due to profit-taking
            sell_adjustment = min(2.0, 1 + price_change_pct * 5)
        else:
            # Price decrease: more buys (buying the dip)
            buy_adjustment = min(2.0, 1 + price_change_pct * 2)
            # More sells due to fear
            sell_adjustment = min(2.5, 1 + price_change_pct * 8)

        adjusted_buys = day_data["buys"] * buy_adjustment
        adjusted_sells = day_data["sells"] * sell_adjustment

        # Update supply based on net buys/sells
        net_tokens = adjusted_buys - adjusted_sells
        self.supply += net_tokens
        self.supply = max(
            self.supply, self.initial_supply * 0.1
        )  # Don't let supply go too low

        # Update actual price based on new supply
        self.price = self.calculate_price(self.supply, alpha, r)

        # Update project success rate based on QF valuation and price
        qf_price_ratio = day_data["qf_valuation"] / (self.supply * self.price + 1e-10)
        self.project_success_rate = 0.8 * self.project_success_rate + 0.2 * min(
            100, qf_price_ratio * 50
        )

        # Update investor return rate based on price change (skipped on the
        # first step of an episode)
        if self.current_step > 0:
            if old_price:
                price_return = (self.price / old_price - 1) * 100  # Percentage return
            else:
                # Same zero-price guard as above.
                price_return = float("inf") if self.price > 0 else 0.0
            self.investor_return_rate = 0.8 * self.investor_return_rate + 0.2 * min(
                100, max(0, 50 + price_return)
            )

        # Calculate reward
        price_stability_reward = self.price_stability / 25  # Scale from 0-4
        project_reward = self.project_success_rate / 25  # Scale from 0-4
        investor_reward = self.investor_return_rate / 25  # Scale from 0-4

        # Balance investor and project interests
        reward = (
            price_stability_reward * 0.2  # 20% for stability
            + investor_reward
            * self.investor_weight
            * 0.8  # 80% split between investor and project
            + project_reward * (1 - self.investor_weight) * 0.8
        )

        # Increment step
        self.current_step += 1
        done = self.current_step >= self.episode_length

        return (
            self.get_state(),
            reward,
            done,
            {
                "price": self.price,
                "supply": self.supply,
                "price_stability": self.price_stability,
                "project_success_rate": self.project_success_rate,
                "investor_return_rate": self.investor_return_rate,
            },
        )


# Training function
def train_dqn(agent, env, num_episodes=1000, max_steps=30):
    """Train ``agent`` on ``env`` for ``num_episodes`` episodes.

    Every episode runs until the environment reports ``done`` or
    ``max_steps`` steps have been taken. Each transition is stored in the
    agent's replay memory and followed by one ``agent.update()`` call.
    Progress is printed every 10 episodes and a checkpoint is saved every
    100 episodes.

    Returns:
        The same ``agent`` object, after training.
    """
    for episode in range(num_episodes):
        observation = env.reset()
        total_reward = 0

        for _ in range(max_steps):
            # Act, then observe the outcome of the decoded (alpha, r) pair
            chosen = agent.select_action(observation)
            params = agent.action_to_params(chosen)
            following, reward, done, _info = env.step(params)

            # Remember the transition and learn from a replayed batch
            agent.memory.add(observation, chosen, reward, following, done)
            agent.update()

            observation = following
            total_reward += reward

            if done:
                break

        agent.training_rewards.append(total_reward)

        completed = episode + 1
        if completed % 10 != 0:
            continue

        # Progress report over the most recent episodes / updates
        avg_reward = np.mean(agent.training_rewards[-10:])
        avg_loss = np.mean(agent.losses[-100:]) if agent.losses else 0
        avg_q = np.mean(agent.avg_q_values[-100:]) if agent.avg_q_values else 0
        print(
            f"Episode {episode+1}/{num_episodes}, Avg Reward: {avg_reward:.2f}, "
            f"Epsilon: {agent.epsilon:.2f}, Avg Loss: {avg_loss:.4f}, Avg Q: {avg_q:.2f}"
        )

        # Periodic checkpoint
        if completed % 100 == 0:
            agent.save(f"dqn_model_episode_{episode+1}.pt")

    return agent


# Comparison function between ABC+QF with and without RL
def _simulate_comparison_episode(env, choose_params):
    """Run one full episode on ``env`` and collect its per-step metrics.

    ``choose_params(state)`` must return the ``(alpha, r)`` pair to apply.
    Returns a dict of equally long lists keyed by ``price``, ``stability``,
    ``project_success`` and ``investor_return``.
    """
    results = {
        "price": [],
        "stability": [],
        "project_success": [],
        "investor_return": [],
    }

    state = env.reset()
    for _ in range(env.episode_length):
        state, _reward, _done, info = env.step(choose_params(state))

        results["price"].append(info["price"])
        results["stability"].append(info["price_stability"])
        results["project_success"].append(info["project_success_rate"])
        results["investor_return"].append(info["investor_return_rate"])

    return results


def compare_with_without_rl(dex_data_path=None, num_episodes=1):
    """Compare an RL-controlled episode against fixed curve parameters.

    Two environments are run for one episode each: one driven by a
    ``DQNAgent`` (loaded from ``dqn_model_final.pt`` when available) and one
    held at the environment's initial ``alpha`` and ``r``. The four tracked
    metrics are plotted side by side, saved to ``rl_comparison.png`` and
    shown.

    Args:
        dex_data_path: Optional CSV of DEX data; synthetic data is used when
            it is ``None``.
        num_episodes: Currently unused; kept for interface compatibility.

    Returns:
        ``{"with_rl": {...}, "without_rl": {...}}`` where each inner dict
        maps ``avg_price``, ``avg_stability``, ``avg_project_success`` and
        ``avg_investor_return`` to the episode mean.
    """
    # Environment setup
    env_with_rl = TokenomicsEnvironment(
        dex_data_path=dex_data_path, use_real_data=dex_data_path is not None
    )
    env_without_rl = TokenomicsEnvironment(
        dex_data_path=dex_data_path, use_real_data=dex_data_path is not None
    )
    # Both runs must see the same market data. With synthetic data each
    # environment would otherwise draw its own random series, and the
    # comparison would measure data luck rather than the policy.
    env_without_rl.dex_data = env_with_rl.dex_data.copy()

    # RL agent (already trained)
    state_size = 9  # Number of state variables
    action_size = 100  # 10 alpha steps x 10 r steps
    agent = DQNAgent(
        state_size=state_size,
        action_size=action_size,
        alpha_min=50,
        alpha_max=500,
        r_min=0.1,
        r_max=0.9,
        num_alpha_steps=10,
        num_r_steps=10,
    )

    # Load trained model. The fallback stays best-effort, but no longer
    # swallows KeyboardInterrupt/SystemExit, and a broken checkpoint is
    # reported as such instead of as a missing file.
    try:
        agent.load("dqn_model_final.pt")
    except FileNotFoundError:
        print("No trained model found. Using untrained agent.")
    except Exception as exc:
        print(f"Could not load trained model ({exc}). Using untrained agent.")

    # Simulation with RL
    results_with_rl = _simulate_comparison_episode(
        env_with_rl,
        lambda state: agent.action_to_params(agent.select_action(state)),
    )

    # Simulation without RL (fixed parameters)
    fixed_params = (env_without_rl.initial_alpha, env_without_rl.initial_r)
    results_without_rl = _simulate_comparison_episode(
        env_without_rl, lambda state: fixed_params
    )

    # Plot comparison: one panel per metric, filled row by row
    panels = (
        ("price", "Token Price", "Price"),
        ("stability", "Price Stability", "Stability Score"),
        ("project_success", "Project Success Rate", "Success Rate"),
        ("investor_return", "Investor Return Rate", "Return Rate"),
    )
    _fig, axs = plt.subplots(2, 2, figsize=(15, 10))
    for ax, (metric, title, ylabel) in zip(axs.flat, panels):
        ax.plot(results_with_rl[metric], "b-", label="With RL")
        ax.plot(results_without_rl[metric], "r--", label="Without RL")
        ax.set_title(title)
        ax.set_xlabel("Days")
        ax.set_ylabel(ylabel)
        ax.legend()

    plt.tight_layout()
    plt.savefig("rl_comparison.png")
    plt.show()

    # Return comparison metrics
    return {
        "with_rl": {
            f"avg_{metric}": np.mean(values)
            for metric, values in results_with_rl.items()
        },
        "without_rl": {
            f"avg_{metric}": np.mean(values)
            for metric, values in results_without_rl.items()
        },
    }


# Main function
if __name__ == "__main__":
    # Seed NumPy, PyTorch and the stdlib RNG (in that order) so that runs
    # are repeatable
    for seed_rng in (np.random.seed, torch.manual_seed, random.seed):
        seed_rng(42)

    # Simulated market with default settings
    env = TokenomicsEnvironment()

    # 9 state variables; 10 alpha steps x 10 r steps = 100 actions
    state_size = 9
    action_size = 100

    # Agent searching alpha in [50, 500] and r in [0.1, 0.9]
    agent = DQNAgent(
        state_size=state_size,
        action_size=action_size,
        alpha_min=50,
        alpha_max=500,
        r_min=0.1,
        r_max=0.9,
        num_alpha_steps=10,
        num_r_steps=10,
    )

    # Train, then persist the final weights for the comparison step
    print("Starting training...")
    trained_agent = train_dqn(agent, env, num_episodes=500)
    trained_agent.save("dqn_model_final.pt")

    # Evaluate the learned policy against fixed parameters
    print("Comparing performance with and without RL...")
    comparison = compare_with_without_rl()

    # Report both result sets in the same layout
    for heading, variant in (
        ("\nAverage metrics with RL:", "with_rl"),
        ("\nAverage metrics without RL:", "without_rl"),
    ):
        print(heading)
        for key, value in comparison[variant].items():
            print(f"{key}: {value:.2f}")
