In [None]:
# Mount Google Drive at /content/drive (Colab-only API). A later cell saves
# the trained model under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install alpha_vantage

from alpha_vantage.timeseries import TimeSeries
import pandas as pd

api_key = 'QO84EVQ44QFRY1VY'

ts = TimeSeries(key=api_key, output_format='pandas')
data, meta = ts.get_daily(symbol='MSFT', outputsize='full')

# Clean and sort
data.columns = ['Open', 'High', 'Low', 'Close', 'Volume']
data = data.sort_index()
print(data.tail())



Collecting alpha_vantage
  Downloading alpha_vantage-3.0.0-py3-none-any.whl.metadata (12 kB)
Downloading alpha_vantage-3.0.0-py3-none-any.whl (35 kB)
Installing collected packages: alpha_vantage
Successfully installed alpha_vantage-3.0.0
              Open    High       Low   Close      Volume
date                                                    
2025-05-02  431.74  439.44  429.9850  435.28  30757434.0
2025-05-05  432.87  439.50  432.1100  436.17  20136053.0
2025-05-06  432.20  437.73  431.1700  433.31  15104204.0
2025-05-07  433.84  438.12  431.1103  433.35  23307241.0
2025-05-08  437.93  443.67  435.6600  438.17  23491330.0


In [None]:
# Number of daily rows downloaded, before any feature engineering.
len(data)

6419

# Data preprocessing

In [None]:
import numpy as np

# NOTE: this cell mutates `data` (adds feature columns and drops warm-up rows).
# Re-running it without first re-running the download cell drops further rows.

# First date of the held-out test period. Normalization statistics are fitted
# strictly before this date so that no test-period information leaks into the
# features. Keep in sync with the train/test split further down.
split_date = '2025-04-01'

# Step 1: Compute returns and moving averages
data['Return'] = data['Close'].pct_change()
data['MA5'] = data['Close'].rolling(window=5).mean()
data['MA10'] = data['Close'].rolling(window=10).mean()
data.dropna(inplace=True)  # removes the leading rows where MA10 is undefined

# Step 2: Z-score the features using training-period statistics only
features = ['Return', 'MA5', 'MA10', 'Volume']
fit_rows = data.loc[data.index < split_date, features]
data[features] = (data[features] - fit_rows.mean()) / fit_rows.std()

# Step 3: Create state windows
# State k flattens rows [k, k + window_size) and is used to act on row
# k + window_size, so a state never contains its own decision day.
window_size = 10
feature_matrix = data[features].values  # extracted once instead of per window
states = np.array([
    feature_matrix[i - window_size:i].flatten()
    for i in range(window_size, len(data))
])  # shape: (samples, window_size * num_features)

print(f"State shape: {states.shape}")  # Should be (samples, 10 × 4 = 40)
print(len(data))

State shape: (6400, 40)
6410


In [None]:
# Chronological hold-out: rows dated before 2025-04-01 are used for training,
# rows on or after that date for testing.
train_data = data.loc[data.index < '2025-04-01']
test_data = data.loc[data.index >= '2025-04-01']


In [None]:
# Date on which each state is acted upon. State k is built from the
# `window_size` rows that precede row k + window_size, so the window itself
# ends one row *before* this date.
state_dates = pd.to_datetime(data.index[window_size:])

# The masks below are only meaningful with exactly one date per state.
assert len(state_dates) == len(states), "states and state_dates are misaligned"

# Boolean masks
train_mask = state_dates < '2025-04-01'
test_mask = state_dates >= '2025-04-01'

# Split states
train_states = states[train_mask]
test_states = states[test_mask]

print(train_states.shape)
print(test_states.shape)


(6373, 40)
(26, 40)


# QNetwork

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class QNetwork(nn.Module):
    """Two-hidden-layer MLP that maps a flattened state to one Q-value per action.

    Args:
        input_dim: length of the flattened state vector.
        hidden_dim: width of both hidden layers.
        output_dim: number of discrete actions.
    """

    def __init__(self, input_dim=40, hidden_dim=128, output_dim=3):
        super().__init__()
        # Attribute names double as state_dict keys, so they must stay stable
        # for saved checkpoints to load.
        self.fc1 = nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=hidden_dim)
        self.out = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        """Return the raw Q-values for the batch of states `x`."""
        hidden = self.fc1(x)
        hidden = F.relu(hidden)
        hidden = F.relu(self.fc2(hidden))
        # No softmax — raw Q-values
        return self.out(hidden)


# ReplayBuffer

In [None]:
import random
from collections import deque
import numpy as np

class ReplayBuffer:
    """Bounded FIFO store of (state, action, reward, next_state, done) transitions.

    Once `capacity` transitions are held, adding another evicts the oldest.
    """

    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)

    def add(self, state, action, reward, next_state, done):
        """Append one transition to the buffer."""
        transition = (state, action, reward, next_state, done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Draw `batch_size` distinct transitions uniformly at random.

        Returns:
            Five NumPy arrays (states, actions, rewards, next_states, dones);
            rewards are float32 and dones are uint8.
        """
        drawn = random.sample(self.buffer, batch_size)
        # Transpose the list of transitions into one column per field.
        state_col, action_col, reward_col, next_col, done_col = zip(*drawn)
        batch_states = np.array(state_col)
        batch_actions = np.array(action_col)
        batch_rewards = np.array(reward_col, dtype=np.float32)
        batch_next_states = np.array(next_col)
        batch_dones = np.array(done_col, dtype=np.uint8)
        return batch_states, batch_actions, batch_rewards, batch_next_states, batch_dones


# TradingEnv setup

In [None]:
class TradingEnv:
    """Single-asset trading simulator that trades one share at a time.

    Args:
        states: indexable sequence of observations, one per time step.
        prices: indexable sequence of prices (usually Close), one per time step.
        initial_balance: cash available at the start of every episode.
    """

    def __init__(self, states, prices, initial_balance=10000):
        self.states = states
        self.prices = prices  # usually Close prices
        self.initial_balance = initial_balance
        self.reset()

    def reset(self):
        """Restore the start-of-episode portfolio and return the first state."""
        self.done = False
        self.current_step = 0
        self.shares_held = 0
        self.balance = self.total_asset = self.initial_balance
        return self.states[0]

    def step(self, action):
        """Apply `action` at the current price and advance one time step.

        Actions: 0 = Hold, 1 = Buy, 2 = Sell. A buy that cannot be afforded
        and a sell with no shares held both leave the portfolio unchanged.

        Returns:
            (next_state, reward, done, balance, shares_held, price), where
            `price` is the price the action was executed at and `reward` is
            the change in total asset value marked at the next step's price.
        """
        trade_price = self.prices[self.current_step]

        wants_buy = action == 1
        wants_sell = action == 2
        if wants_buy and self.balance >= trade_price:
            self.shares_held += 1
            self.balance -= trade_price
        elif wants_sell and self.shares_held > 0:
            self.shares_held -= 1
            self.balance += trade_price

        # Advance the clock; the episode ends on reaching the last state.
        self.current_step += 1
        last_index = len(self.states) - 1
        if self.current_step >= last_index:
            self.done = True

        upcoming_state = self.states[self.current_step]
        mark_price = self.prices[self.current_step]

        # Reward is the step-to-step change in mark-to-market portfolio value.
        new_total_asset = self.balance + self.shares_held * mark_price
        reward = new_total_asset - self.total_asset
        self.total_asset = new_total_asset

        return upcoming_state, reward, self.done, self.balance, self.shares_held, trade_price


In [None]:
# Smoke-test the environment with a few random actions.
# `train_states` is the (N, 40) array of training states. Prices must be
# aligned with those states: state k is traded at the Close of row
# k + window_size of `data`, so the first `window_size` closes are skipped
# before the training mask is applied. (`train_data['Close']` starts
# `window_size` rows earlier and would pair every state with a stale price.)
aligned_train_prices = data['Close'].values[window_size:][train_mask]
env = TradingEnv(states=train_states, prices=aligned_train_prices)

state = env.reset()
print("Initial total asset:", env.total_asset)

for step in range(10):  # test first 10 steps
    action = np.random.choice([0, 1, 2])  # randomly pick action
    next_state, reward, done, balance, shares_hold, price = env.step(action)

    print(f"Step {step + 1}")
    print("  Action:", ["Hold", "Buy", "Sell"][action])
    print("  Reward:", reward)
    print("  Total Asset:", env.total_asset)
    if done:
        print("Episode ended early.")
        break


Initial total asset: 10000
Step 1
  Action: Buy
  Reward: -2.1900000000005093
  Total Asset: 9997.81
Step 2
  Action: Sell
  Reward: 0.0
  Total Asset: 9997.81
Step 3
  Action: Hold
  Reward: 0.0
  Total Asset: 9997.81
Step 4
  Action: Hold
  Reward: 0.0
  Total Asset: 9997.81
Step 5
  Action: Sell
  Reward: 0.0
  Total Asset: 9997.81
Step 6
  Action: Hold
  Reward: 0.0
  Total Asset: 9997.81
Step 7
  Action: Buy
  Reward: -0.18999999999869033
  Total Asset: 9997.62
Step 8
  Action: Buy
  Reward: 4.139999999997599
  Total Asset: 10001.759999999998
Step 9
  Action: Hold
  Reward: -1.139999999999418
  Total Asset: 10000.619999999999
Step 10
  Action: Buy
  Reward: -2.790000000000873
  Total Asset: 9997.829999999998


In [None]:
import torch.optim as optim

# Run on the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Online (policy) network, plus a target network that starts as an exact copy
# and is kept in eval mode because it is never trained directly.
policy_net = QNetwork()
policy_net.to(device)
target_net = QNetwork()
target_net.to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

# Mean-squared error between predicted and bootstrapped Q-values.
loss_fn = nn.MSELoss()

# Experience replay holding the most recent 10000 transitions.
replay_buffer = ReplayBuffer(capacity=10000)

# Only the policy network's parameters are optimized.
optimizer = optim.Adam(policy_net.parameters(), lr=1e-3)


In [None]:
# Hyperparameters read by the DQN training loop.
# Number of full passes over the training states.
num_episodes = 500
# Transitions drawn from the replay buffer for each gradient step.
batch_size = 64
# Discount factor applied to the bootstrapped next-state value.
gamma = 0.99
# Exploration rate used in the first episode.
epsilon_start = 1.0
# Lower bound on the exploration rate.
epsilon_end = 0.05
# Per-episode multiplicative decay of epsilon. With 500 episodes,
# 0.995 ** 499 is about 0.08, so the epsilon_end floor is never reached.
epsilon_decay = 0.995
target_update_freq = 5  # Episodes after which to sync target network


In [None]:
# Helper to select action (ε-greedy)
def select_action(state, epsilon):
    """Pick an action for `state` with an ε-greedy rule.

    With probability `epsilon` a uniformly random action is returned;
    otherwise the action with the largest Q-value under the module-level
    `policy_net` (evaluated on `device`) is returned.

    Returns:
        int: 0 (hold), 1 (buy) or 2 (sell).
    """
    explore = random.random() < epsilon
    if explore:
        return random.choice([0, 1, 2])  # hold, buy, sell

    # Greedy branch: add a batch dimension and query the policy network.
    batch_of_one = torch.FloatTensor(state).unsqueeze(0).to(device)
    with torch.no_grad():
        greedy = policy_net(batch_of_one).argmax()
    return greedy.item()

# Close prices aligned with the training states: state k is traded at the
# Close of row k + window_size of `data`, hence the offset before masking.
train_prices = data['Close'].values[window_size:][train_mask]

# One environment is enough: reset() restores all per-episode state.
env = TradingEnv(train_states, train_prices)

# Training loop
for episode in range(num_episodes):
    state = env.reset()
    done = False
    total_reward = 0
    # Exponentially decayed exploration rate, floored at epsilon_end.
    epsilon = max(epsilon_end, epsilon_start * (epsilon_decay ** episode))

    while not done:
        action = select_action(state, epsilon)
        next_state, reward, done, balance, shares_hold, price = env.step(action)

        replay_buffer.add(state, action, reward, next_state, done)
        state = next_state
        total_reward += reward

        if len(replay_buffer) < batch_size:
            continue  # not enough experience to form a batch yet

        # Sample batch
        states_b, actions_b, rewards_b, next_states_b, dones_b = replay_buffer.sample(batch_size)

        states_b = torch.FloatTensor(states_b).to(device)
        actions_b = torch.LongTensor(actions_b).unsqueeze(1).to(device)
        rewards_b = torch.FloatTensor(rewards_b).to(device)
        next_states_b = torch.FloatTensor(next_states_b).to(device)
        dones_b = torch.BoolTensor(dones_b).to(device)

        # Q(s, a). squeeze(1) removes only the action axis, so the result
        # stays 1-D (matching `targets`) even for a batch of one.
        q_values = policy_net(states_b).gather(1, actions_b).squeeze(1)

        # target = r + γ max_a Q_target(s', a); no bootstrap on terminal steps
        with torch.no_grad():
            next_q_values = target_net(next_states_b).max(1)[0]
            targets = rewards_b + gamma * next_q_values * (~dones_b)

        # Compute loss and update
        loss = loss_fn(q_values, targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Sync target network
    if (episode + 1) % target_update_freq == 0:
        target_net.load_state_dict(policy_net.state_dict())

    print(f"Episode {episode + 1}/{num_episodes} — Total Reward: {total_reward:.2f} — Epsilon: {epsilon:.3f}")


Episode 1/500 — Total Reward: 49106.07 — Epsilon: 1.000
Episode 2/500 — Total Reward: 7854.45 — Epsilon: 0.995
Episode 3/500 — Total Reward: 7704.83 — Epsilon: 0.990
Episode 4/500 — Total Reward: 26839.34 — Epsilon: 0.985
Episode 5/500 — Total Reward: 17367.45 — Epsilon: 0.980
Episode 6/500 — Total Reward: 18059.47 — Epsilon: 0.975
Episode 7/500 — Total Reward: 4408.85 — Epsilon: 0.970
Episode 8/500 — Total Reward: 7658.21 — Epsilon: 0.966
Episode 9/500 — Total Reward: 19565.26 — Epsilon: 0.961
Episode 10/500 — Total Reward: 7512.84 — Epsilon: 0.956
Episode 11/500 — Total Reward: 20970.99 — Epsilon: 0.951
Episode 12/500 — Total Reward: 22098.58 — Epsilon: 0.946
Episode 13/500 — Total Reward: 34464.13 — Epsilon: 0.942
Episode 14/500 — Total Reward: 12567.65 — Epsilon: 0.937
Episode 15/500 — Total Reward: 4478.64 — Epsilon: 0.932
Episode 16/500 — Total Reward: 4260.07 — Epsilon: 0.928
Episode 17/500 — Total Reward: 4977.09

In [None]:
# Define a path in your Drive
# (the path lives under /content/drive, so Drive must already be mounted)
model_path = "/content/drive/MyDrive/trading_agent.pth"

# Save model state dict
# Only the policy network's parameters are written; the target network,
# optimizer and replay buffer are not part of the checkpoint.
torch.save(policy_net.state_dict(), model_path)
print(f"Model saved to {model_path}")


Model saved to /content/drive/MyDrive/trading_agent.pth


In [None]:
# Same device selection as in the training setup cell; repeated here,
# presumably so the inference cells can run in a fresh session.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# 1. Rebuild the network with the same architecture that was trained
policy_net = QNetwork(input_dim=40, hidden_dim=128, output_dim=3).to(device)

# 2. Load the saved weights
# weights_only=True restricts unpickling to tensors and plain containers, so a
# tampered checkpoint cannot execute arbitrary code while being loaded.
model_path = "/content/drive/MyDrive/trading_agent.pth"
policy_net.load_state_dict(
    torch.load(model_path, map_location=device, weights_only=True)
)

# 3. Set to evaluation mode
policy_net.eval()

print("✅ Model loaded and ready for inference.")

✅ Model loaded and ready for inference.


In [None]:
import torch.nn.functional as F


In [None]:
def test_agent(policy_net, test_states, test_prices):
    """Run the greedy policy once over the test period and report the outcome.

    Uses the module-level `TradingEnv` and `device`. One line is printed per
    step, followed by the final portfolio value and the total profit/loss.

    Args:
        policy_net: trained Q-network used to rank the three actions.
        test_states: sequence of state vectors, one per time step.
        test_prices: prices aligned one-to-one with `test_states`.

    Returns:
        list: total asset value after every step, marked at the next price.
    """
    env = TradingEnv(test_states, test_prices)
    state = env.reset()
    done = False
    total_reward = 0
    portfolio_values = []

    while not done:
        # Always choose the best action (no ε-greedy during testing)
        state_tensor = torch.FloatTensor(state).unsqueeze(0).to(device)
        with torch.no_grad():
            q_values = policy_net(state_tensor)

        # Softmax over raw Q-values is only a rough indicator: Q-values are in
        # reward units, so it saturates near 1.0 (as the logged runs show).
        action_probs = F.softmax(q_values, dim=1).cpu().numpy().flatten()
        confidence = action_probs.max()
        action = torch.argmax(q_values).item()

        next_state, reward, done, balance, shares_held, price = env.step(action)
        total_reward += reward
        state = next_state
        portfolio_values.append(env.total_asset)
        print(f"Action: {action} |Confidence: {confidence}| Balance: ${balance} | Price: ${price} | Shares Held: {shares_held} ")

    # Report the environment's own mark-to-market total. `price` is the
    # execution price of the last action, not the latest price, so
    # recomputing the value from it would misstate the final holdings.
    final_value = float(env.total_asset)
    print(f"✅ Final Total Asset Value: ${final_value:.2f}")

    # Profit/loss relative to the starting cash actually used by the env.
    print(f"📈 Total Test Reward: ${final_value - env.initial_balance}")

    print(portfolio_values)
    return portfolio_values

In [None]:
# Close prices for the test states: skip the first 10 rows (states are built
# from row 10 onward), then keep the entries selected by the test mask.
test_prices = data['Close'].values[10:][test_mask]
# Evaluate the loaded policy greedily on the held-out period.
portfolio_history = test_agent(policy_net, test_states, test_prices)


Action: 0 |Confidence: 1.0| Balance: $10000 | Price: $382.19 | Shares Held: 0 
Action: 1 |Confidence: 1.0| Balance: $9617.86 | Price: $382.14 | Shares Held: 1 
Action: 0 |Confidence: 1.0| Balance: $9617.86 | Price: $373.11 | Shares Held: 1 
Action: 0 |Confidence: 1.0| Balance: $9617.86 | Price: $359.84 | Shares Held: 1 
Action: 1 |Confidence: 1.0| Balance: $9260.0 | Price: $357.86 | Shares Held: 2 
Action: 0 |Confidence: 0.9999544620513916| Balance: $9260.0 | Price: $354.56 | Shares Held: 2 
Action: 1 |Confidence: 1.0| Balance: $8869.51 | Price: $390.49 | Shares Held: 3 
Action: 2 |Confidence: 1.0| Balance: $9250.86 | Price: $381.35 | Shares Held: 2 
Action: 1 |Confidence: 1.0| Balance: $8862.41 | Price: $388.45 | Shares Held: 3 
Action: 0 |Confidence: 0.999832272529602| Balance: $8862.41 | Price: $387.81 | Shares Held: 3 
Action: 2 |Confidence: 1.0| Balance: $9248.14 | Price: $385.73 | Shares Held: 2 
Action: 0 |Confidence: 1.0| Balance: $9248.14 | Price: $371.61 | Shares Held: 2 
Act

In [None]:
def random_agent(test_states, test_prices, seed=42):
    """Baseline: trade uniformly random actions over the test period.

    Uses the module-level `TradingEnv`. Prints the final asset value and the
    accumulated reward.

    Args:
        test_states: sequence of state vectors, one per time step.
        test_prices: prices aligned one-to-one with `test_states`.
        seed: seed for the action generator, making the baseline repeatable.

    Returns:
        list: total asset value after every step.
    """
    # A private generator yields the same action sequence as seeding the
    # `random` module, without resetting the global stream that ε-greedy
    # exploration and replay sampling draw from.
    rng = random.Random(seed)
    env = TradingEnv(test_states, test_prices)
    env.reset()
    done = False
    total_reward = 0
    portfolio_values = []

    while not done:
        action = rng.choice([0, 1, 2])  # hold, buy, sell
        _, reward, done, _, _, _ = env.step(action)
        total_reward += reward
        portfolio_values.append(env.total_asset)

    print(f"🌀 Final Total Asset (Random): ${env.total_asset:.2f}")
    print(f"🎲 Total Reward (Random): {total_reward:.2f}")

    return portfolio_values


In [None]:
# Baseline run: random actions over the same test period (default seed=42).
# The call is the cell's last expression, so the returned list is echoed.
random_agent(test_states,test_prices)

🌀 Final Total Asset (Random): $10081.60
🎲 Total Reward (Random): 81.60


[np.float64(10000.0),
 np.float64(10000.0),
 np.float64(10000.0),
 np.float64(10000.0),
 np.float64(9996.699999999999),
 np.float64(10032.63),
 np.float64(10023.49),
 np.float64(10030.59),
 np.float64(10030.59),
 np.float64(10030.59),
 np.float64(10030.59),
 np.float64(10030.59),
 np.float64(10030.59),
 np.float64(10030.59),
 np.float64(10030.59),
 np.float64(10043.5),
 np.float64(10048.050000000001),
 np.float64(10047.36),
 np.float64(10050.240000000002),
 np.float64(10051.460000000001),
 np.float64(10081.6),
 np.float64(10081.6),
 np.float64(10081.6),
 np.float64(10081.6),
 np.float64(10081.6)]