# Final Production Model Training (RunPod)

Train the final production model using:
- **Baseline configuration** (no sentiment - best from ablation study)
- **Optuna-tuned hyperparameters** (Trial 21, which scored Sharpe 2.28)

**Expected:** Sharpe ~1.6-2.0, Return ~45-55%, Drawdown <15%

In [None]:
# Cell 1: Setup
import os
import subprocess

# Where the project checkout lives on the pod, and where it is cloned from.
WORK_DIR = "/workspace/enhanced-rl-portfolio"
REPO_URL = "https://github.com/nimeshk03/enhanced-rl-portfolio.git"

# First run: clone into /workspace. Later runs: refresh the existing checkout.
# Either way the process ends up with WORK_DIR as its current directory.
if not os.path.exists(WORK_DIR):
    print("Cloning...")
    os.chdir("/workspace")
    subprocess.run(["git", "clone", REPO_URL], check=True)
    os.chdir(WORK_DIR)
else:
    print("Pulling latest...")
    os.chdir(WORK_DIR)
    subprocess.run(["git", "pull"], check=True)

print("Working directory: {}".format(os.getcwd()))

In [None]:
# Cell 2: Install dependencies
!pip install -q stable-baselines3[extra] gymnasium pandas numpy

In [None]:
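# Cell 3: Train Final Model (GPU run with the SB3 progress bar). The next cell is a CPU variant that saves to the same model file, so run only one of the two.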
import os
import sys
import json
import numpy as np
import pandas as pd
from datetime import datetime

os.chdir("/workspace/enhanced-rl-portfolio")
sys.path.insert(0, "/workspace/enhanced-rl-portfolio")

from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from src.env.enhanced_portfolio_env import EnhancedPortfolioEnv
from src.data.enhanced_processor import EnhancedDataProcessor

# ============ CONFIGURATION ============
# Name of the run (used for the experiments/ sub-folder) and the number of
# timesteps handed to model.learn().
EXPERIMENT_NAME = "final_production_model"
TOTAL_TIMESTEPS = 1_500_000

# Optuna-tuned hyperparameters (Trial 21, Sharpe 2.28).
# Unpacked as keyword arguments into the PPO constructor.
PPO_CONFIG = dict(
    learning_rate=0.000812,
    n_steps=2048,
    batch_size=64,
    n_epochs=10,
    gamma=0.992,
    gae_lambda=0.95,
    clip_range=0.2,
    ent_coef=0.0024,
    vf_coef=0.428,
    max_grad_norm=0.769,
)

# Passed to PPO as policy_kwargs.
POLICY_KWARGS = dict(net_arch=[256, 256])

# Column names given to the environment as its tech_indicator_list.
TECH_INDICATORS = [
    "macd",
    "boll_ub",
    "boll_lb",
    "rsi_30",
    "cci_30",
    "dx_30",
    "close_30_sma",
    "close_60_sma",
    "vix",
    "turbulence",
]

# ============ LOAD DATA ============
print("Loading data...")

# The sentiment CSV path is still supplied to the processor even though the
# environment in this notebook is built with include_sentiment=False.
processor = EnhancedDataProcessor(
    sentiment_path="data/historical_sentiment_complete.csv",
    price_path="data/processed_data.csv",
)

# Split by date: training data up to 2024-06-30, test data from 2024-07-01.
train_df, test_df = processor.get_train_test_split(
    test_start="2024-07-01",
    train_end="2024-06-30",
)

def prepare_env_data(df):
    """Return a copy of ``df`` indexed by a 0-based trading-day number.

    Rows are ordered by ``date`` then ``tic``. Every distinct ``date`` value
    is numbered by its position among the sorted unique dates, and that number
    becomes the ``day`` index of the result. The input frame is not modified.

    Args:
        df: DataFrame with at least ``date`` and ``tic`` columns.

    Returns:
        A new DataFrame with the same columns as ``df``, indexed by ``day``.
    """
    ordered = df.copy().sort_values(['date', 'tic']).reset_index(drop=True)

    # Position of each distinct date in ascending order is its day number.
    unique_dates = sorted(ordered['date'].unique())
    day_lookup = dict(zip(unique_dates, range(len(unique_dates))))

    with_day = ordered.assign(day=ordered['date'].map(day_lookup))
    return with_day.set_index('day')

# Re-index both splits by day number, training split first.
train_data, test_data = (
    prepare_env_data(split) for split in (train_df, test_df)
)

# Row count and number of distinct day indices per split.
print("Train: {} records, {} days".format(len(train_data), train_data.index.nunique()))
print("Test: {} records, {} days".format(len(test_data), test_data.index.nunique()))

# ============ CREATE ENVIRONMENT ============
def make_env(data, mode="train"):
    """Build an EnhancedPortfolioEnv over ``data`` with sentiment disabled.

    Args:
        data: Frame passed to the environment as ``df`` (the caller in this
            cell passes the output of ``prepare_env_data``).
        mode: Forwarded unchanged as the environment's ``mode`` argument.

    Returns:
        A new ``EnhancedPortfolioEnv`` instance.
    """
    env_settings = dict(
        df=data,
        stock_dim=10,
        hmax=100,
        initial_amount=100000,
        buy_cost_pct=0.001,
        sell_cost_pct=0.001,
        reward_scaling=1e-4,
        tech_indicator_list=TECH_INDICATORS,
        # Baseline configuration: no sentiment features in the observation.
        sentiment_feature_list=[],
        include_sentiment=False,
        normalize_obs=True,
        mode=mode,
    )
    return EnhancedPortfolioEnv(**env_settings)

# PPO is given a vectorised environment; here it wraps a single training env.
train_env = DummyVecEnv([lambda: make_env(train_data, "train")])

# ============ TRAIN MODEL ============
# The saved model for this run goes under experiments/<EXPERIMENT_NAME>.
exp_dir = "experiments/" + EXPERIMENT_NAME
os.makedirs(exp_dir, exist_ok=True)

print(f"\nTraining {TOTAL_TIMESTEPS:,} timesteps on CUDA...")
print("Estimated time: ~30-40 minutes\n")

model = PPO(
    policy="MlpPolicy",
    env=train_env,
    device="cuda",
    verbose=1,
    policy_kwargs=POLICY_KWARGS,
    **PPO_CONFIG,
)

# Time only the learn() call.
start_time = datetime.now()
model.learn(total_timesteps=TOTAL_TIMESTEPS, progress_bar=True)
training_time = datetime.now() - start_time

model_path = exp_dir + "/ppo_final_production.zip"
model.save(model_path)

print("\n" + "=" * 60)
print("TRAINING COMPLETE")
print("=" * 60)
print("Time:", training_time)
print("Model:", model_path)

In [None]:
# Cell 4: Train Final Model (CPU run with periodic progress printing)
import os
import sys
import json
import numpy as np
import pandas as pd
from datetime import datetime

os.chdir("/workspace/enhanced-rl-portfolio")
sys.path.insert(0, "/workspace/enhanced-rl-portfolio")

from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.callbacks import BaseCallback
from src.env.enhanced_portfolio_env import EnhancedPortfolioEnv
from src.data.enhanced_processor import EnhancedDataProcessor

# ============ CONFIGURATION ============
# Name of the run (used for the experiments/ sub-folder) and the number of
# timesteps handed to model.learn().
EXPERIMENT_NAME = "final_production_model"
TOTAL_TIMESTEPS = 1_500_000

# Optuna-tuned hyperparameters (Trial 21, Sharpe 2.28).
# Unpacked as keyword arguments into the PPO constructor.
PPO_CONFIG = dict(
    learning_rate=0.000812,
    n_steps=2048,
    batch_size=64,
    n_epochs=10,
    gamma=0.992,
    gae_lambda=0.95,
    clip_range=0.2,
    ent_coef=0.0024,
    vf_coef=0.428,
    max_grad_norm=0.769,
)

# Passed to PPO as policy_kwargs.
POLICY_KWARGS = dict(net_arch=[256, 256])

# Column names given to the environment as its tech_indicator_list.
TECH_INDICATORS = [
    "macd",
    "boll_ub",
    "boll_lb",
    "rsi_30",
    "cci_30",
    "dx_30",
    "close_30_sma",
    "close_60_sma",
    "vix",
    "turbulence",
]

# ============ PROGRESS CALLBACK ============
class ProgressCallback(BaseCallback):
    """Print a one-line progress report about every ``print_freq`` timesteps.

    A report is emitted the first time ``num_timesteps`` reaches or passes the
    next multiple of ``print_freq``. Comparing against a threshold (rather
    than testing ``num_timesteps % print_freq == 0``) keeps the reports coming
    when the counter advances by more than one per callback call and steps
    over an exact multiple.

    Args:
        total_timesteps: Planned number of training timesteps; used as the
            denominator of the printed percentage.
        print_freq: Number of timesteps between reports. A value <= 0
            disables reporting.
    """

    def __init__(self, total_timesteps, print_freq=50000):
        super().__init__()
        self.total_timesteps = total_timesteps
        self.print_freq = print_freq
        # Timestep count at which the next report is due (None = never).
        self._next_report = self._threshold_after(0)

    def _threshold_after(self, timesteps):
        """Return the smallest multiple of ``print_freq`` above ``timesteps``.

        Returns None when reporting is disabled (``print_freq <= 0``).
        """
        if self.print_freq <= 0:
            return None
        return (timesteps // self.print_freq + 1) * self.print_freq

    def _on_training_start(self):
        """Re-arm the threshold from the timestep count at the start of learn().

        This keeps a reused callback instance, or a resumed training run, in
        step with the current counter instead of a stale threshold.
        """
        self._next_report = self._threshold_after(self.num_timesteps)

    def _on_step(self):
        """Print progress when the next threshold is reached.

        Returns:
            Always True, so this callback never stops training.
        """
        if self._next_report is not None and self.num_timesteps >= self._next_report:
            pct = 100 * self.num_timesteps / self.total_timesteps
            print(f"Progress: {self.num_timesteps:,}/{self.total_timesteps:,} ({pct:.1f}%)")
            # Skip any thresholds the counter already moved past.
            self._next_report = self._threshold_after(self.num_timesteps)
        return True

# ============ LOAD DATA ============
print("Loading data...")

# The sentiment CSV path is still supplied to the processor even though the
# environment in this notebook is built with include_sentiment=False.
processor = EnhancedDataProcessor(
    sentiment_path="data/historical_sentiment_complete.csv",
    price_path="data/processed_data.csv",
)

# Split by date: training data up to 2024-06-30, test data from 2024-07-01.
train_df, test_df = processor.get_train_test_split(
    test_start="2024-07-01",
    train_end="2024-06-30",
)

def prepare_env_data(df):
    """Return a copy of ``df`` indexed by a 0-based trading-day number.

    Rows are ordered by ``date`` then ``tic``. Every distinct ``date`` value
    is numbered by its position among the sorted unique dates, and that number
    becomes the ``day`` index of the result. The input frame is not modified.

    Args:
        df: DataFrame with at least ``date`` and ``tic`` columns.

    Returns:
        A new DataFrame with the same columns as ``df``, indexed by ``day``.
    """
    ordered = df.copy().sort_values(['date', 'tic']).reset_index(drop=True)

    # Position of each distinct date in ascending order is its day number.
    unique_dates = sorted(ordered['date'].unique())
    day_lookup = dict(zip(unique_dates, range(len(unique_dates))))

    with_day = ordered.assign(day=ordered['date'].map(day_lookup))
    return with_day.set_index('day')

# Re-index both splits by day number, training split first.
train_data, test_data = (
    prepare_env_data(split) for split in (train_df, test_df)
)

# Row count and number of distinct day indices per split.
print("Train: {} records, {} days".format(len(train_data), train_data.index.nunique()))
print("Test: {} records, {} days".format(len(test_data), test_data.index.nunique()))

# ============ CREATE ENVIRONMENT ============
def make_env(data, mode="train"):
    """Build an EnhancedPortfolioEnv over ``data`` with sentiment disabled.

    Args:
        data: Frame passed to the environment as ``df`` (the caller in this
            cell passes the output of ``prepare_env_data``).
        mode: Forwarded unchanged as the environment's ``mode`` argument.

    Returns:
        A new ``EnhancedPortfolioEnv`` instance.
    """
    env_settings = dict(
        df=data,
        stock_dim=10,
        hmax=100,
        initial_amount=100000,
        buy_cost_pct=0.001,
        sell_cost_pct=0.001,
        reward_scaling=1e-4,
        tech_indicator_list=TECH_INDICATORS,
        # Baseline configuration: no sentiment features in the observation.
        sentiment_feature_list=[],
        include_sentiment=False,
        normalize_obs=True,
        mode=mode,
    )
    return EnhancedPortfolioEnv(**env_settings)

# PPO is given a vectorised environment; here it wraps a single training env.
train_env = DummyVecEnv([lambda: make_env(train_data, "train")])

# ============ TRAIN MODEL ============
# The saved model for this run goes under experiments/<EXPERIMENT_NAME>.
exp_dir = "experiments/" + EXPERIMENT_NAME
os.makedirs(exp_dir, exist_ok=True)

print(f"\nTraining {TOTAL_TIMESTEPS:,} timesteps...")
print("Estimated time: ~35-40 minutes\n")

# SB3's own logging is silenced (verbose=0); progress comes from the callback.
model = PPO(
    policy="MlpPolicy",
    env=train_env,
    device="cpu",
    verbose=0,
    policy_kwargs=POLICY_KWARGS,
    **PPO_CONFIG,
)

callback = ProgressCallback(TOTAL_TIMESTEPS, print_freq=100000)

# Time only the learn() call.
start_time = datetime.now()
model.learn(total_timesteps=TOTAL_TIMESTEPS, callback=callback)
training_time = datetime.now() - start_time

model_path = exp_dir + "/ppo_final_production.zip"
model.save(model_path)

print("\n" + "=" * 60)
print("TRAINING COMPLETE")
print("=" * 60)
print("Time:", training_time)
print("Model:", model_path)

In [None]:
# Cell 5: Evaluate Model
import os
import sys
import json
import numpy as np
import pandas as pd
from datetime import datetime

os.chdir("/workspace/enhanced-rl-portfolio")
sys.path.insert(0, "/workspace/enhanced-rl-portfolio")

from stable_baselines3 import PPO
from src.env.enhanced_portfolio_env import EnhancedPortfolioEnv
from src.data.enhanced_processor import EnhancedDataProcessor

# Locations of the trained model, repeated here so this cell can run without
# the training cell's variables.
EXPERIMENT_NAME = "final_production_model"
exp_dir = "/".join(("experiments", EXPERIMENT_NAME))
model_path = exp_dir + "/ppo_final_production.zip"

# Column names given to the environment as its tech_indicator_list.
TECH_INDICATORS = [
    "macd",
    "boll_ub",
    "boll_lb",
    "rsi_30",
    "cci_30",
    "dx_30",
    "close_30_sma",
    "close_60_sma",
    "vix",
    "turbulence",
]

# Load data
processor = EnhancedDataProcessor(
    sentiment_path="data/historical_sentiment_complete.csv",
    price_path="data/processed_data.csv",
)

# Only the test split (from 2024-07-01) is needed here; the training split is discarded.
_, test_df = processor.get_train_test_split(
    test_start="2024-07-01",
    train_end="2024-06-30",
)

def prepare_env_data(df):
    """Return a copy of ``df`` indexed by a 0-based trading-day number.

    Rows are ordered by ``date`` then ``tic``. Every distinct ``date`` value
    is numbered by its position among the sorted unique dates, and that number
    becomes the ``day`` index of the result. The input frame is not modified.

    Args:
        df: DataFrame with at least ``date`` and ``tic`` columns.

    Returns:
        A new DataFrame with the same columns as ``df``, indexed by ``day``.
    """
    ordered = df.copy().sort_values(['date', 'tic']).reset_index(drop=True)

    # Position of each distinct date in ascending order is its day number.
    unique_dates = sorted(ordered['date'].unique())
    day_lookup = dict(zip(unique_dates, range(len(unique_dates))))

    with_day = ordered.assign(day=ordered['date'].map(day_lookup))
    return with_day.set_index('day')

# Re-index the test split by day number before handing it to the environment.
test_data = prepare_env_data(test_df)

# Create test environment
# Same settings as the training cells' make_env, but with mode="test" and
# used directly (not wrapped in a vectorised env).
test_env = EnhancedPortfolioEnv(
    mode="test",
    df=test_data,
    tech_indicator_list=TECH_INDICATORS,
    # Baseline configuration: no sentiment features in the observation.
    include_sentiment=False,
    sentiment_feature_list=[],
    normalize_obs=True,
    stock_dim=10,
    hmax=100,
    initial_amount=100000,
    buy_cost_pct=0.001,
    sell_cost_pct=0.001,
    reward_scaling=1e-4,
)

# Load and evaluate
model = PPO.load(model_path)
print("Evaluating...")

# Run one episode with deterministic actions until the environment reports
# either termination or truncation.
obs, _ = test_env.reset()
while True:
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, terminated, truncated, info = test_env.step(action)
    done = terminated or truncated
    if done:
        break

stats = test_env.get_portfolio_stats()

print("\n" + "=" * 60)
print("EVALUATION RESULTS")
print("=" * 60)
print("Sharpe Ratio:  {:.3f}".format(stats['sharpe_ratio']))
print("Total Return:  {:.2f}%".format(stats['total_return'] * 100))
print("Max Drawdown:  {:.2f}%".format(stats['max_drawdown'] * 100))
print("Total Trades:  {}".format(stats['total_trades']))
print("Final Value:   ${:,.2f}".format(stats['final_value']))
print("=" * 60)

# Save results
# Metrics are cast to built-in float/int before being written as JSON. The
# "config" values are literals repeated from the training cell, not read back
# from the loaded model.
results = dict(
    experiment_name=EXPERIMENT_NAME,
    timestamp=datetime.now().isoformat(),
    config=dict(
        include_sentiment=False,
        timesteps=1_500_000,
        learning_rate=0.000812,
        batch_size=64,
        net_arch=[256, 256],
    ),
    metrics={
        key: cast(stats[key])
        for key, cast in (
            ("sharpe_ratio", float),
            ("total_return", float),
            ("max_drawdown", float),
            ("total_trades", int),
            ("final_value", float),
        )
    },
)

with open(exp_dir + "/results.json", "w") as f:
    json.dump(results, f, indent=2)
print("\nResults saved to " + exp_dir + "/results.json")

In [None]:
# Cell 6: Package for Download
import os
import shutil

# All relative paths below are resolved from the project checkout.
os.chdir("/workspace/enhanced-rl-portfolio")

EXPERIMENT_NAME = "final_production_model"
exp_dir = "experiments/" + EXPERIMENT_NAME

# Copy to models folder
os.makedirs("models", exist_ok=True)
shutil.copy(exp_dir + "/ppo_final_production.zip", "models/ppo_final_production.zip")

# Create download zip
# make_archive appends ".zip" to the base name and archives the whole experiment folder.
output_zip = "/workspace/" + EXPERIMENT_NAME
shutil.make_archive(output_zip, 'zip', exp_dir)

print("Model copied to: models/ppo_final_production.zip")
print("Download: {0}.zip ({1:.1f} MB)".format(
    output_zip,
    os.path.getsize(output_zip + ".zip") / 1024 / 1024,
))
print("\nUpdate config.py:")
print('MODEL_PATH = "./models/ppo_final_production.zip"')