# Model Evaluation Comparison

Evaluate the existing models on the test set to find the best one for production.
Tests `ppo_enhanced_tuned.zip` (from Optuna) with the correct settings.

In [None]:
# Cell 1: Setup
import os
import subprocess

REPO_URL = "https://github.com/nimeshk03/enhanced-rl-portfolio.git"
WORK_DIR = "/workspace/enhanced-rl-portfolio"

# Make sure a checkout of the repository exists and is the current working
# directory: clone it on first use, otherwise refresh it with `git pull`.
# `check=True` makes a failing git command raise instead of passing silently.
if not os.path.exists(WORK_DIR):
    os.chdir("/workspace")
    subprocess.run(["git", "clone", REPO_URL], check=True)
    os.chdir(WORK_DIR)
else:
    os.chdir(WORK_DIR)
    subprocess.run(["git", "pull"], check=True)

print(f"Working directory: {os.getcwd()}")

In [None]:
# Cell 2: Load and check data, choose the train/test split, and prepare test data
import os
import sys
import numpy as np
import pandas as pd

# Run from the repository root and put it first on the import path so the
# `src.*` packages imported below resolve.
repo_root = "/workspace/enhanced-rl-portfolio"
os.chdir(repo_root)
sys.path[:0] = [repo_root]

from stable_baselines3 import PPO
from src.env.enhanced_portfolio_env import EnhancedPortfolioEnv
from src.data.enhanced_processor import EnhancedDataProcessor, ProcessorConfig

# First check data date range
price_df = pd.read_csv('data/processed_data.csv')
print(f"Price data date range: {price_df['date'].min()} to {price_df['date'].max()}")
print(f"Price records: {len(price_df)}")

# Determine appropriate split based on actual data:
#   - if the data reaches July 2024, use the fixed 2024-06-30 / 2024-07-01 boundary;
#   - otherwise hold out the last 20% of the distinct dates as the test period.
max_date = pd.to_datetime(price_df['date'].max())
if max_date >= pd.to_datetime('2024-07-01'):
    train_end = '2024-06-30'
    test_start = '2024-07-01'
else:
    # Use last 20% of data as test
    dates = sorted(price_df['date'].unique())
    split_idx = int(len(dates) * 0.8)
    train_end = dates[split_idx - 1]
    test_start = dates[split_idx]
    # Real newline escape (was "\\n", which printed a literal backslash-n).
    print("\nData doesn't reach 2024-07, using 80/20 split:")

print(f"Train end: {train_end}")
print(f"Test start: {test_start}")

# Load with Optuna's processor settings
config = ProcessorConfig(normalize_features=True, normalization_window=60)
processor = EnhancedDataProcessor(
    price_path='data/processed_data.csv',
    sentiment_path='data/historical_sentiment_complete.csv',
    config=config,
)
train_df, test_df = processor.get_train_test_split(
    train_end=train_end,
    test_start=test_start,
)
# Feature metadata reported by the processor; the keys read in this notebook
# are 'tech_indicators', 'sentiment_features' and 'n_tickers'.
feature_info = processor.get_feature_info()

print(f"\nTech indicators: {feature_info['tech_indicators']}")
print(f"Sentiment features: {feature_info['sentiment_features']}")

# Prepare data
def prepare_env_data(df):
    """Return a copy of ``df`` ordered by date and ticker, indexed by day number.

    Rows are sorted by ``['date', 'tic']``. Each distinct date is assigned a
    consecutive integer starting at 0 (in sorted date order), and that integer
    becomes the index, named ``'day'``. All rows sharing a date share an index
    value. The input frame is not modified.
    """
    # sort_values returns a new frame, so the caller's data stays untouched.
    ordered = df.sort_values(['date', 'tic']).reset_index(drop=True)
    day_of = {
        stamp: position
        for position, stamp in enumerate(sorted(ordered['date'].unique()))
    }
    ordered['day'] = ordered['date'].map(day_of)
    return ordered.set_index('day')

# Re-index the test split by day number and report its size.
test_data = prepare_env_data(test_df)
# Real newline escape (was "\\n", which printed a literal backslash-n).
print(f"\nTest data: {len(test_data)} records, {test_data.index.nunique()} days")

In [None]:
# Cell 3: Evaluate ppo_enhanced_tuned.zip with OPTUNA SETTINGS
import os
import sys
import numpy as np
import pandas as pd

# Run from the repository root and put it first on the import path so the
# `src.*` packages imported below resolve.
repo_root = "/workspace/enhanced-rl-portfolio"
os.chdir(repo_root)
sys.path[:0] = [repo_root]

from stable_baselines3 import PPO
from src.env.enhanced_portfolio_env import EnhancedPortfolioEnv
from src.data.enhanced_processor import EnhancedDataProcessor, ProcessorConfig

# Load data with Optuna's processor settings
config = ProcessorConfig(normalize_features=True, normalization_window=60)
processor = EnhancedDataProcessor(
    price_path='data/processed_data.csv',
    sentiment_path='data/historical_sentiment_complete.csv',
    config=config,
)

# Reuse the split boundaries chosen by the data-range check cell when it has
# already run in this session, so its 80/20 fallback is not silently replaced
# by dates the data may not reach. When this cell runs on its own, fall back
# to the fixed 2024-06-30 / 2024-07-01 boundary.
split_train_end = globals().get('train_end', '2024-06-30')
split_test_start = globals().get('test_start', '2024-07-01')
_, test_df = processor.get_train_test_split(
    train_end=split_train_end,
    test_start=split_test_start,
)
# Feature metadata reported by the processor; the keys read in this notebook
# are 'tech_indicators', 'sentiment_features' and 'n_tickers'.
feature_info = processor.get_feature_info()

print(f"Tech indicators: {feature_info['tech_indicators']}")
print(f"Sentiment features: {feature_info['sentiment_features']}")

# Prepare data
def prepare_env_data(df):
    """Return a copy of ``df`` ordered by date and ticker, indexed by day number.

    Rows are sorted by ``['date', 'tic']``. Each distinct date is assigned a
    consecutive integer starting at 0 (in sorted date order), and that integer
    becomes the index, named ``'day'``. All rows sharing a date share an index
    value. The input frame is not modified.
    """
    # sort_values returns a new frame, so the caller's data stays untouched.
    ordered = df.sort_values(['date', 'tic']).reset_index(drop=True)
    day_of = {
        stamp: position
        for position, stamp in enumerate(sorted(ordered['date'].unique()))
    }
    ordered['day'] = ordered['date'].map(day_of)
    return ordered.set_index('day')

# Re-index the test split by day number, then report how many rows and how
# many distinct days it contains.
test_data = prepare_env_data(test_df)
print(
    "\nTest data: {} records, {} days".format(
        len(test_data), test_data.index.nunique()
    )
)

In [None]:
# Cell 4: Evaluate ppo_enhanced_tuned.zip (Optuna best model)
# IMPORTANT: Uses sentiment=True, normalize_obs=False (Optuna's settings)

# Build the evaluation environment over the prepared test data, using the
# feature lists and ticker count reported by the processor.
test_env = EnhancedPortfolioEnv(
    df=test_data,
    stock_dim=feature_info['n_tickers'],
    hmax=100,
    initial_amount=100000,
    buy_cost_pct=0.001,
    sell_cost_pct=0.001,
    reward_scaling=1e-4,
    tech_indicator_list=feature_info['tech_indicators'],
    sentiment_feature_list=feature_info['sentiment_features'],
    include_sentiment=True,   # OPTUNA SETTING
    normalize_obs=False,      # OPTUNA SETTING
    print_verbosity=0,
)

# Load Optuna-tuned model
model = PPO.load('models/ppo_enhanced_tuned.zip')

# Evaluate: run one deterministic episode through the environment.
obs, _ = test_env.reset()
done = False
while not done:
    # The observation is passed as a batch of one, so the first (only)
    # action of the returned batch is the one to apply.
    action, _ = model.predict(obs.reshape(1, -1), deterministic=True)
    obs, _, terminated, truncated, _ = test_env.step(action[0])
    # Stop on either end-of-episode flag (assumes the Gymnasium 5-tuple step
    # contract); previously only the first flag was checked, so a truncated
    # episode would never leave the loop.
    done = terminated or truncated

stats = test_env.get_portfolio_stats()

print('='*60)
print('PPO_ENHANCED_TUNED (Optuna model) - Test Set')
print('Settings: sentiment=True, normalize_obs=False')
print('='*60)
print(f'Sharpe Ratio:  {stats["sharpe_ratio"]:.3f}')
print(f'Total Return:  {stats["total_return"]*100:.2f}%')
print(f'Max Drawdown:  {stats["max_drawdown"]*100:.2f}%')
print(f'Total Trades:  {stats["total_trades"]}')
print(f'Final Value:   ${stats["final_value"]:,.2f}')
print('='*60)

In [None]:
# Cell 5: Compare with ppo_final_production.zip (our recent model)
# Uses sentiment=False, normalize_obs=True (WRONG settings for Optuna HPs)

# Technical indicators passed to the environment for this model; sentiment
# features are switched off entirely below.
TECH_INDICATORS = [
    'macd', 'boll_ub', 'boll_lb', 'rsi_30', 'cci_30', 'dx_30',
    'close_30_sma', 'close_60_sma', 'vix', 'turbulence'
]

# NOTE(review): stock_dim is hard-coded to 10 here, whereas the Optuna
# evaluation reads feature_info['n_tickers'] — confirm the dataset has 10 tickers.
test_env2 = EnhancedPortfolioEnv(
    df=test_data,
    stock_dim=10,
    hmax=100,
    initial_amount=100000,
    buy_cost_pct=0.001,
    sell_cost_pct=0.001,
    reward_scaling=1e-4,
    tech_indicator_list=TECH_INDICATORS,
    sentiment_feature_list=[],
    include_sentiment=False,  # Different from Optuna
    normalize_obs=True,       # Different from Optuna
    print_verbosity=0,
)

model2 = PPO.load('experiments/final_production_model/ppo_final_production.zip')

# Evaluate: run one deterministic episode through the environment.
obs, _ = test_env2.reset()
done = False
while not done:
    action, _ = model2.predict(obs, deterministic=True)
    obs, _, terminated, truncated, _ = test_env2.step(action)
    # Stop on either end-of-episode flag (assumes the Gymnasium 5-tuple step
    # contract); previously only the first flag was checked, so a truncated
    # episode would never leave the loop.
    done = terminated or truncated

stats2 = test_env2.get_portfolio_stats()

print('='*60)
print('PPO_FINAL_PRODUCTION (1.5M steps) - Test Set')
print('Settings: sentiment=False, normalize_obs=True')
print('='*60)
print(f'Sharpe Ratio:  {stats2["sharpe_ratio"]:.3f}')
print(f'Total Return:  {stats2["total_return"]*100:.2f}%')
print(f'Max Drawdown:  {stats2["max_drawdown"]*100:.2f}%')
print(f'Total Trades:  {stats2["total_trades"]}')
print(f'Final Value:   ${stats2["final_value"]:,.2f}')
print('='*60)

In [None]:
# Cell 6: Summary comparison
# Prints a side-by-side table of the two evaluation results (`stats` and
# `stats2`), then a recommendation driven only by which model has the higher
# test-set Sharpe ratio.
rule = '=' * 70
print('\n' + rule)
print('MODEL COMPARISON SUMMARY')
print(rule)
print(f'{"Model":<35} {"Sharpe":<10} {"Return":<12} {"Drawdown":<12}')
print('-' * 70)
for label, result in (
    ('ppo_enhanced_tuned (Optuna)', stats),
    ('ppo_final_production (1.5M)', stats2),
):
    # Attach the percent sign before padding so it stays next to the number
    # and the columns line up with the header row.
    total_return = f'{result["total_return"] * 100:.2f}%'
    max_drawdown = f'{result["max_drawdown"] * 100:.2f}%'
    print(f'{label:<35} {result["sharpe_ratio"]:<10.3f} {total_return:<12} {max_drawdown:<12}')
print(rule)

# Recommendation
if stats['sharpe_ratio'] > stats2['sharpe_ratio']:
    print('\nRECOMMENDATION: Use ppo_enhanced_tuned.zip for production')
    print('  - Trained with sentiment (matches autonomous system architecture)')
    print('  - Better Sharpe ratio on test set')
else:
    # Also taken when the two Sharpe ratios are equal.
    print('\nRECOMMENDATION: Consider retraining with Optuna settings')
    print('  - sentiment=True, normalize_obs=False')
    print('  - Use longer training (500k-1M timesteps)')