# Re-Imaging Price Trends - Model Training & Evaluation

**Prerequisites**: `1_image_generation.ipynb` completed

**Execution**: CNN model training → Portfolio performance evaluation → Result visualization

In [None]:
# Environment setup
!pip install -r requirements.txt

from google.colab import drive
drive.mount('/content/drive')

import os
os.chdir('/content/drive/MyDrive/ReImaging_Price_Trends')
print(f"Current directory: {os.getcwd()}")

# GPU check
import torch
print(f"GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}")

# GPU memory monitoring and management functions
def check_gpu_memory():
    """Print the current CUDA memory usage and return it.

    Returns:
        tuple: ``(allocated, total_memory)`` in bytes for the current CUDA
        device, or ``(0, 0)`` when CUDA is not available.
    """
    # Guard clause: without CUDA there is nothing to measure.
    if not torch.cuda.is_available():
        print("GPU unavailable - CPU mode")
        return 0, 0

    dev = torch.cuda.current_device()
    total_memory = torch.cuda.get_device_properties(dev).total_memory
    allocated = torch.cuda.memory_allocated(dev)
    cached = torch.cuda.memory_reserved(dev)

    # Sizes are converted from bytes to GiB (1024**3); percentages are
    # relative to the device's total memory.
    report_lines = (
        f"GPU memory status:",
        f"   Total: {total_memory/1024**3:.1f}GB",
        f"   Used: {allocated/1024**3:.1f}GB ({allocated/total_memory*100:.1f}%)",
        f"   Cached: {cached/1024**3:.1f}GB ({cached/total_memory*100:.1f}%)",
        f"   Available: {(total_memory-allocated)/1024**3:.1f}GB",
    )
    for report_line in report_lines:
        print(report_line)

    return allocated, total_memory

def cleanup_memory():
    """Run Python garbage collection, then release cached CUDA memory.

    The garbage collector runs first so that objects kept alive only by
    reference cycles are freed before the CUDA caching allocator is asked
    to give back its unused blocks. In the opposite order, memory released
    by the collector would stay in the allocator cache until the next call.

    Returns:
        None. A confirmation message is printed when the cleanup is done.
    """
    import gc

    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
    print("Memory cleanup completed")

# Check initial GPU memory status
# Prints the baseline memory report once, then runs a cleanup pass so the
# first training cell starts after a garbage collection / cache release.
print("Initial GPU memory status:")
check_gpu_memory()
cleanup_memory()

In [None]:
# Check original format images (img_data_reconstructed)
import os

# Use absolute path for Colab environment
base_path = '/content/drive/MyDrive/ReImaging_Price_Trends' if 'google.colab' in str(get_ipython()) else '.'
original_data_dir = os.path.join(base_path, 'img_data_reconstructed')

print(f"Original data path: {original_data_dir}")

# Check original format (.dat + .feather) directories
# Maps a human-readable description to the sub-directory expected under
# original_data_dir.
required_dirs = {
    '5d (weekly)': 'weekly_5d',
    '20d (monthly)': 'monthly_20d',
    '60d (quarterly)': 'quarterly_60d'
}

all_ready = True
for desc, dir_name in required_dirs.items():
    dir_path = os.path.join(original_data_dir, dir_name)

    # A missing directory means the data set was never generated.
    if not os.path.isdir(dir_path):
        print(f"✗ {desc}: {dir_path}")
        all_ready = False
        continue

    # An existing but empty directory is not usable either, so require at
    # least one .dat file. Sorting keeps the listed examples deterministic.
    dat_files = sorted(f for f in os.listdir(dir_path) if f.endswith('.dat'))
    if not dat_files:
        print(f"✗ {desc}: no .dat files in {dir_path}")
        all_ready = False
        continue

    # Report the real number of files; only the first few are listed.
    sample_files = dat_files[:3]
    print(f"✓ {desc}: {len(dat_files)} year files ({len(sample_files)} shown as examples)")
    for f in sample_files:
        print(f"   - {f}")

if all_ready:
    print("\nOriginal format data ready!")
    print("   Same .dat + .feather format as paper authors")
    print("   Will be executed with --use_original_format flag")
else:
    print("\nWarning: Some original data missing")
    print("Generate with create_original_format.py or use HDF5 format")

## CNN Model Training (Paper-style Ensemble Support)

**Format Used**: `.dat` + `.feather` (same as paper authors)  
**Data Location**: `img_data_reconstructed/`  
**Ensemble**: The 5-model averaging described in the paper is supported  

### Training Method Selection:
1. **Single Model**: One training run; fast, intended for quick testing
2. **Ensemble Model**: Averages 5 independently trained models, as in the paper (slower, but more stable)

In [None]:
# CNN5d model training selection
# Trains the CNN5d model (5-day images, 5-day prediction horizon) either as a
# single run via train.py or as a 5-run ensemble via ensemble_train.py.

print("CNN5d training method selection:")
print("   Single model: Fast (1 model)")
print("   Ensemble: Paper method (5-model average)")

# Training method setting (changeable)
# The other training cells assign use_ensemble again, and the evaluation cell
# uses its own use_ensemble_eval switch, so this value is not carried over.
use_ensemble = False  # Change to True for ensemble training

# NOTE(review): check_gpu_memory() reads this notebook process's allocator
# statistics, while training runs in a separate `python` process started with
# `!`; the readings presumably do not include the training job itself - confirm.
print("GPU memory status (before training):")
check_gpu_memory()

if use_ensemble:
    print("\nCNN5d ensemble training (5 models)...")
    print("Warning: Takes 5x longer!")
    !python ensemble_train.py --model CNN5d --image_days 5 --pred_days 5 --ensemble_runs 5 --use_original_format
else:
    print("\nCNN5d single model training...")
    !python train.py --model CNN5d --image_days 5 --pred_days 5 --use_original_format

print("\nGPU memory status (after training):")
check_gpu_memory()
cleanup_memory()
# The exit status of the shell command is not inspected, so this message is
# printed even if the training script failed.
print("CNN5d training completed")

In [None]:
# CNN20d model training selection
# Trains the CNN20d model (20-day images, 20-day prediction horizon) either as
# a single run via train.py or as a 5-run ensemble via ensemble_train.py.

print("CNN20d training method selection:")
print("   Single model: Fast (1 model)")
print("   Ensemble: Paper method (5-model average)")

# Training method setting (changeable)
# Assigned independently of the use_ensemble values in the other training cells.
use_ensemble = False  # Change to True for ensemble training

print("GPU memory status (before training):")
check_gpu_memory()

if use_ensemble:
    print("\nCNN20d ensemble training (5 models)...")
    print("Warning: Takes 5x longer!")
    !python ensemble_train.py --model CNN20d --image_days 20 --pred_days 20 --ensemble_runs 5 --use_original_format
else:
    print("\nCNN20d single model training...")
    !python train.py --model CNN20d --image_days 20 --pred_days 20 --use_original_format

print("\nGPU memory status (after training):")
check_gpu_memory()
cleanup_memory()
# The exit status of the shell command is not inspected, so this message is
# printed even if the training script failed.
print("CNN20d training completed")

In [None]:
# CNN60d model training selection (memory intensive)
# Trains the CNN60d model (60-day images, 60-day prediction horizon) either as
# a single run via train.py or as a 5-run ensemble via ensemble_train.py.
# Unlike the CNN5d/CNN20d cells, both commands pass an explicit --batch_size 64.

print("CNN60d training method selection (Warning: Memory intensive):")
print("   Single model: Fast (1 model)")  
print("   Ensemble: Paper method (5-model average)")

# Training method setting (changeable)
# Assigned independently of the use_ensemble values in the other training cells.
use_ensemble = False  # Change to True for ensemble training

print("GPU memory status (before training):")
# check_gpu_memory() returns (allocated, total) in bytes, or (0, 0) without CUDA.
allocated, total = check_gpu_memory()

# Memory shortage warning
# With (0, 0) from CPU mode the comparison is 0 > 0, so no cleanup is triggered.
if allocated > total * 0.7:  # Warning if using more than 70%
    print("Warning: GPU memory shortage risk! Performing additional cleanup...")
    cleanup_memory()

if use_ensemble:
    print("\nCNN60d ensemble training (5 models)...")
    print("Warning: Takes 5x longer!")
    !python ensemble_train.py --model CNN60d --image_days 60 --pred_days 60 --ensemble_runs 5 --use_original_format --batch_size 64
else:
    print("\nCNN60d single model training...")
    !python train.py --model CNN60d --image_days 60 --pred_days 60 --use_original_format --batch_size 64

print("\nGPU memory status (after training):")
check_gpu_memory()
cleanup_memory()
# The exit status of the shell command is not inspected, so this message is
# printed even if the training script failed.
print("CNN60d training completed")

# Evaluate all models (original format)
# Runs test.py once per model in single-model mode (no --ensemble flag) and
# calls cleanup_memory() after each run.
# NOTE(review): the ensemble-aware evaluation cell in this notebook runs the
# same three single-model commands when use_ensemble_eval is False, so this
# section looks redundant - confirm whether both are meant to be executed.
print("Portfolio performance evaluation started (original .dat + .feather format)...\n")

print("1. CNN5d (Weekly Strategy)")
!python test.py --model CNN5d --image_days 5 --pred_days 5 --use_original_format
cleanup_memory()

print("\n2. CNN20d (Monthly Strategy)")
!python test.py --model CNN20d --image_days 20 --pred_days 20 --use_original_format
cleanup_memory()

print("\n3. CNN60d (Quarterly Strategy)")
!python test.py --model CNN60d --image_days 60 --pred_days 60 --use_original_format
cleanup_memory()

print("\nAll evaluations completed!")
check_gpu_memory()

In [None]:
# Portfolio performance evaluation (with ensemble support)
# Runs test.py for CNN5d, CNN20d and CNN60d in turn; when use_ensemble_eval is
# True the --ensemble flag is added to every command.

print("Portfolio performance evaluation started...\n")
print("Evaluation methods:")
print("   Single model: Regular prediction")
print("   Ensemble: 5-model average prediction (more stable)")

# Evaluation method setting (match with training method)
# This switch is set by hand and is separate from the use_ensemble flags in the
# training cells; one value applies to all three models below.
use_ensemble_eval = False  # Change to True if trained with ensemble

print("1. CNN5d (Weekly Strategy)")
if use_ensemble_eval:
    !python test.py --model CNN5d --image_days 5 --pred_days 5 --ensemble --use_original_format
else:
    !python test.py --model CNN5d --image_days 5 --pred_days 5 --use_original_format
cleanup_memory()

print("\n2. CNN20d (Monthly Strategy)")
if use_ensemble_eval:
    !python test.py --model CNN20d --image_days 20 --pred_days 20 --ensemble --use_original_format
else:
    !python test.py --model CNN20d --image_days 20 --pred_days 20 --use_original_format
cleanup_memory()

print("\n3. CNN60d (Quarterly Strategy)")
if use_ensemble_eval:
    !python test.py --model CNN60d --image_days 60 --pred_days 60 --ensemble --use_original_format
else:
    !python test.py --model CNN60d --image_days 60 --pred_days 60 --use_original_format
cleanup_memory()

print("\nAll evaluations completed!")
check_gpu_memory()

## Result Visualization

In [None]:
# Final results summary & GPU memory analysis
# Prints a fixed reference summary, lists the saved checkpoints in models/,
# and ends with a live GPU memory report.
print("Re-Imaging Price Trends - Execution completed!")
print("=" * 50)

# Static reference text: GPU memory analysis results and Colab compatibility
for summary_line in (
    "\nGPU memory requirements (batch size 128):",
    "   • CNN5d:  ~0.02GB (lightest)",
    "   • CNN20d: ~0.09GB (medium)",
    "   • CNN60d: ~0.34GB (heaviest)",
    "\nNote: Actual usage may be 2-3x higher with data loading + optimizer state",
    "\nColab GPU compatibility:",
    "   • T4 (16GB): All models trainable",
    "   • V100/A100: Large batch processing possible",
    "   • CNN60d: batch size 64 recommended",
):
    print(summary_line)

# File size summary: every *.tar file directly inside models/, in sorted order
if os.path.exists('models'):
    print("\nTrained models:")
    for file in sorted(os.listdir('models')):
        if not file.endswith('.tar'):
            continue
        size_mb = os.path.getsize(f'models/{file}') / (1024**2)
        print(f"   ✓ {file} ({size_mb:.1f}MB)")

# Static reference text: paper performance comparison and data format
for summary_line in (
    "\nExpected paper performance (comparison reference):",
    "   • Weekly (I5R5): H-L Sharpe = 7.15",
    "   • Monthly (I20R20): H-L Sharpe = 2.16",
    "   • Quarterly (I60R60): H-L Sharpe = 0.37",
    "\nFormat used:",
    "   • Original format (.dat + .feather): Same as paper authors",
    "   • Memory efficient and Colab friendly",
    "\nCompare the portfolio evaluation results above with paper benchmarks!",
    "\nFinal GPU memory status:",
):
    print(summary_line)
check_gpu_memory()

In [None]:
# Final results summary & ensemble information
# Prints a fixed reference summary, lists the saved checkpoints in models/
# split into single-model and ensemble files, and ends with a live GPU
# memory report.

print("Re-Imaging Price Trends - Execution completed!")
print("=" * 60)

# Static reference text: GPU memory analysis, Colab compatibility, and the
# ensemble vs single model explanation
for summary_line in (
    "\nGPU memory requirements (batch size 128):",
    "   • CNN5d:  ~0.02GB (lightest)",
    "   • CNN20d: ~0.09GB (medium)",
    "   • CNN60d: ~0.34GB (heaviest)",
    "\nNote: Actual usage may be 2-3x higher with data loading + optimizer state",
    "\nColab GPU compatibility:",
    "   • T4 (16GB): All models trainable",
    "   • V100/A100: Large batch processing possible",
    "   • CNN60d: batch size 64 recommended",
    "\nModel training methods:",
    "   • Single model: Fast, for testing",
    "   • Ensemble (5 models): Paper method, more stable performance",
    "     - Train same model 5 times independently",
    "     - Average 5 results during prediction",
    "     - Reduces stochastic variability",
):
    print(summary_line)

# File size summary
if os.path.exists('models'):
    print("\nTrained models:")
    model_files = [f for f in os.listdir('models') if f.endswith('.tar')]

    # Checkpoints whose file name contains '_run' are listed as ensemble
    # members; all other checkpoints are listed as single models.
    single_models = [f for f in model_files if '_run' not in f]
    ensemble_models = [f for f in model_files if '_run' in f]

    for heading, group in (
        ("   Single models:", single_models),
        ("   Ensemble models:", ensemble_models),
    ):
        if not group:
            continue
        print(heading)
        for file in sorted(group):
            size_mb = os.path.getsize(f'models/{file}') / (1024**2)
            print(f"     ✓ {file} ({size_mb:.1f}MB)")

# Static reference text: paper performance comparison and data format
for summary_line in (
    "\nExpected paper performance (comparison reference):",
    "   • Weekly (I5R5): H-L Sharpe = 7.15",
    "   • Monthly (I20R20): H-L Sharpe = 2.16",
    "   • Quarterly (I60R60): H-L Sharpe = 0.37",
    "\nFormat used:",
    "   • Original format (.dat + .feather): Same as paper authors",
    "   • Memory efficient and Colab friendly",
    "\nCheck portfolio evaluation results and compare with paper!",
    "\nFinal GPU memory status:",
):
    print(summary_line)
check_gpu_memory()