## Deep Deterministic Policy Gradient (DDPG)

DDPG is an off-policy, model-free reinforcement learning algorithm designed for continuous action spaces. 

It combines ideas from DQN (experience replay and target networks for stable value learning) with the deterministic policy gradient in an actor-critic architecture, and was introduced by DeepMind (Lillicrap et al.) in 2015.

Actor Network (μ): Learns the deterministic policy
 - Maps state → continuous action

Critic Network (Q): Learns the action-value function
 - Maps (state, action) → expected return


##### Implementation of DDPG for stock portfolio optimization with PyTorch

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import deque, namedtuple
import random
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import yfinance as yf
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

##### Define the Environment

In [2]:
class PortfolioEnvironment:
    """Long-only, fully-invested portfolio rebalancing environment.

    Each call to :meth:`step` rebalances the whole portfolio to the target
    weights given by the action, charges a proportional transaction cost on
    the traded weight, advances one row in the price table and returns the
    one-step portfolio return (minus a volatility penalty) as the reward.

    State layout (length ``4 * n_assets + 2``, ``float32``):
        * current prices divided by the first asset's current price
        * mean of the simple returns over the look-back window, per asset
        * standard deviation of those returns, per asset
        * current portfolio weights, per asset
        * cash ratio (``balance / portfolio_value``)
        * last one-step portfolio return

    Args:
        data: Price table with one column per asset and one row per time
            step (anything exposing ``.columns``, ``.values`` and ``len()``,
            e.g. a pandas ``DataFrame``).
        initial_balance: Starting portfolio value.
        transaction_cost: Cost charged per unit of traded weight, as a
            fraction of the portfolio value.
    """

    STATE_LOOKBACK = 10        # rows of price history used for return statistics
    MIN_HISTORY = 5            # minimum rows before statistics are computed
    VOLATILITY_WINDOW = 30     # rows of history used for the reward penalty
    VOLATILITY_PENALTY = 0.1   # multiplier applied to the return std-dev
    _EPS = 1e-8                # guards divisions by (near-)zero

    def __init__(self, data, initial_balance=100000, transaction_cost=0.001):
        self.data = data
        self.initial_balance = initial_balance
        self.transaction_cost = transaction_cost
        self.n_assets = len(data.columns)
        self.prices = data.values
        self.n_steps = len(data)
        # Length of the vector built by _get_state(): four per-asset
        # features plus the cash ratio and the last portfolio return.
        self.state_dim = 4 * self.n_assets + 2
        self._reset_account()

    def _reset_account(self):
        """Put all bookkeeping back to the start-of-episode values."""
        self.current_step = 0
        self.balance = self.initial_balance
        self.shares_held = np.zeros(self.n_assets)
        self.portfolio_value = self.initial_balance
        self.previous_portfolio_value = self.initial_balance

    def reset(self):
        """Start a new episode and return the initial state vector."""
        self._reset_account()
        return self._get_state()

    def _portfolio_return(self):
        """Return the last one-step portfolio return (0.0 if undefined)."""
        previous = self.previous_portfolio_value
        if previous <= 0:
            # A return relative to a non-positive base is undefined; report
            # 0 instead of propagating NaN/inf into the state and reward.
            return 0.0
        return (self.portfolio_value - previous) / previous

    def _get_state(self):
        """Build the ``float32`` observation for ``self.current_step``."""
        current_prices = self.prices[self.current_step]
        lookback = min(self.STATE_LOOKBACK, self.current_step + 1)
        price_history = self.prices[self.current_step - lookback + 1:self.current_step + 1]
        if len(price_history) >= self.MIN_HISTORY:
            returns = np.diff(price_history, axis=0) / price_history[:-1]
            mean_returns = np.mean(returns, axis=0)
            volatility = np.std(returns, axis=0)
        else:
            # Too little history for meaningful statistics: use placeholders.
            mean_returns = np.zeros(self.n_assets)
            volatility = np.ones(self.n_assets)

        if self.portfolio_value > 0:
            cash_ratio = self.balance / self.portfolio_value
        else:
            cash_ratio = 0.0

        state = np.concatenate([
            # NOTE(review): prices are scaled by the FIRST asset's current
            # price, so the first entry is always 1. Confirm this is the
            # intended normalisation (vs. scaling by the initial prices).
            current_prices / current_prices[0],
            mean_returns,
            volatility,
            self._get_portfolio_weights(),
            [cash_ratio],
            [self._portfolio_return()],
        ])

        return state.astype(np.float32)

    def _get_portfolio_weights(self):
        """Return each asset's share of the portfolio value at current prices."""
        total_value = self.portfolio_value
        if total_value > 0:
            asset_values = self.shares_held * self.prices[self.current_step]
            return asset_values / total_value
        return np.zeros(self.n_assets)

    def _calculate_transaction_cost(self, old_weights, new_weights):
        """Return the cost of moving from ``old_weights`` to ``new_weights``.

        The cost is the total absolute weight change times the portfolio
        value times the ``transaction_cost`` rate.
        """
        turnover = np.sum(np.abs(new_weights - old_weights))
        return turnover * self.portfolio_value * self.transaction_cost

    def _normalize_action(self, action):
        """Turn a raw action into non-negative weights that sum to ~1.

        Entries are clipped to ``[0, 1]`` and divided by their sum. If the
        clipped action carries no usable mass (all entries <= 0, or a
        non-finite sum), equal weights are returned: normalising such an
        action would yield all-zero weights, and because ``step`` invests
        everything and zeroes the cash balance, that would silently destroy
        the whole portfolio value.
        """
        weights = np.clip(np.asarray(action, dtype=np.float64), 0, 1)
        total = np.sum(weights)
        if not np.isfinite(total) or total <= self._EPS:
            return np.full(self.n_assets, 1.0 / self.n_assets)
        return weights / (total + self._EPS)

    def _volatility_penalty(self):
        """Return the (non-positive) volatility penalty for the reward.

        The current holdings are re-valued on up to ``VOLATILITY_WINDOW``
        past price rows and the standard deviation of the resulting
        one-step returns, scaled by ``VOLATILITY_PENALTY``, is returned with
        a negative sign. Returns 0.0 when fewer than two returns exist.
        """
        if self.current_step <= 1:
            return 0.0
        first = max(1, self.current_step - self.VOLATILITY_WINDOW)
        # Value of the *current* holdings at rows first-1 .. current_step-1.
        values = np.sum(self.prices[first - 1:self.current_step] * self.shares_held, axis=1)
        previous, current = values[:-1], values[1:]
        valid = previous > 0  # skip returns with a non-positive base
        returns = (current[valid] - previous[valid]) / previous[valid]
        if returns.size > 1:
            return -np.std(returns) * self.VOLATILITY_PENALTY
        return 0.0

    def step(self, action):
        """Rebalance to ``action``, advance one time step and score it.

        Args:
            action: Array-like of ``n_assets`` raw target weights. It is
                clipped to ``[0, 1]`` and normalised to sum to 1; an action
                with no positive entry falls back to equal weights.

        Returns:
            Tuple ``(next_state, reward, done, info)`` where ``next_state``
            is the new observation (all zeros once the episode is done),
            ``reward`` is the one-step portfolio return plus the volatility
            penalty, ``done`` is True once the last price row is reached and
            ``info`` is an empty dict.
        """
        weights = self._normalize_action(action)
        current_prices = self.prices[self.current_step]

        # Pay for the rebalance first, then invest everything that is left.
        old_weights = self._get_portfolio_weights()
        transaction_cost = self._calculate_transaction_cost(old_weights, weights)
        investable_value = self.portfolio_value - transaction_cost
        self.shares_held = weights * investable_value / (current_prices + self._EPS)
        self.balance = 0  # All money invested in assets

        self.previous_portfolio_value = self.portfolio_value
        self.current_step += 1

        # Mark the new holdings to the next row's prices; past the end of the
        # data there is no next row, so re-use the prices traded at.
        if self.current_step < self.n_steps:
            valuation_prices = self.prices[self.current_step]
        else:
            valuation_prices = current_prices
        self.portfolio_value = np.sum(self.shares_held * valuation_prices)

        reward = self._portfolio_return() + self._volatility_penalty()

        done = self.current_step >= self.n_steps - 1
        if done:
            # Terminal observation is all zeros; build it from the known
            # dimension rather than computing (and discarding) a real state.
            next_state = np.zeros(self.state_dim, dtype=np.float32)
        else:
            next_state = self._get_state()

        return next_state, reward, done, {}

##### Define Neural Network Architectures

##### Implement DDPG Agent

##### Model Training

##### Testing and Evaluation