# 🎯 Crypto Prediction with On-Demand Processing

**Process Data Only When Needed - Perfect for 16GB M4 MacBook**

This notebook implements on-demand processing: data is loaded, transformed, and used for training only at the moment each step is explicitly requested.

## Key Features:
- **On-Demand Processing**: Process data only when requested
- **Memory Efficient**: No data is held in memory until it is requested
- **Flexible**: Process as much data as you need on demand, within configurable row and sample limits
- **16GB Compatible**: Designed for M4 MacBook memory constraints


In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import warnings
from IPython.display import display
import psutil
import gc
import os
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Keep notebook output readable: ignore UserWarnings attributed to sklearn
# modules and every FutureWarning, regardless of where it originates.
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")
warnings.filterwarnings("ignore", category=FutureWarning)

def get_memory_usage():
    """Return the resident set size (RSS) of the current process.

    Returns:
        The RSS reported by psutil for this process id, converted from
        bytes by dividing by 1024 twice (i.e. expressed in MB).
    """
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    return rss_bytes / 1024 / 1024

def check_memory_limit(max_memory_mb=3000):
    """Report whether the process is still within its memory budget.

    Args:
        max_memory_mb: Budget in MB that the current usage is compared to.

    Returns:
        False (after printing a warning) when get_memory_usage() is
        strictly greater than ``max_memory_mb``; True otherwise.
    """
    usage_mb = get_memory_usage()
    over_budget = usage_mb > max_memory_mb
    if over_budget:
        print(f"⚠️  Memory usage high: {usage_mb:.1f} MB (limit: {max_memory_mb} MB)")
    return not over_budget

def force_garbage_collection():
    """Trigger a garbage-collection pass, then measure memory.

    Returns:
        The value of get_memory_usage() taken right after ``gc.collect()``.
    """
    gc.collect()
    memory_after_mb = get_memory_usage()
    return memory_after_mb

# Confirm the setup cell ran and record the memory baseline before any data
# is loaded.
print("✅ Libraries imported successfully!")
print(f"💾 Initial memory usage: {get_memory_usage():.1f} MB")


In [None]:
# Import utilities
import sys
# NOTE(review): absolute, machine-specific path — the `utils` import below
# only resolves where this directory exists. Confirm, or make it configurable.
sys.path.append('/Users/parteekmalik/github/pytorch')

# NOTE: get_memory_usage, check_memory_limit and force_garbage_collection are
# also defined in the first code cell of this notebook; importing them here
# rebinds those names to the `utils` versions for all later cells.
from utils import (
    get_memory_usage, check_memory_limit, force_garbage_collection,
    download_crypto_data, create_minimal_features, create_sliding_windows,
    create_lightweight_lstm_model, scale_data, train_model_memory_efficient, evaluate_model
)

print("✅ Utilities imported successfully!")
print(f"💾 Initial memory usage: {get_memory_usage():.1f} MB")


In [None]:
# Central settings for the on-demand pipeline; every later cell reads its
# limits and data selection from this dictionary.
ONDEMAND_CONFIG = {
    # Memory guard (MB) used by check_memory_limit()
    'MAX_MEMORY_MB': 3000,
    # Window length and lag depth for feature/sequence creation
    'SEQUENCE_LENGTH': 5,
    'LAG_PERIOD': 3,
    # Columns the model predicts
    'TARGET_COLS': ['Open', 'High', 'Low', 'Close', 'Volume'],
    # Which market data to fetch
    'SYMBOL': 'BTCUSDT',
    'INTERVAL': '5m',
    'YEAR': '2021',
    'MONTHS': ['01', '02', '03', '04', '05', '06'],
    # Upper bound on samples handled in one processing call
    'MAX_SAMPLES': 10000,
}

# One print call, one line per argument: same output as separate prints.
print(
    "✅ On-demand configuration loaded!",
    f"💾 Memory limit: {ONDEMAND_CONFIG['MAX_MEMORY_MB']:,} MB",
    f"🔄 Sequence length: {ONDEMAND_CONFIG['SEQUENCE_LENGTH']}",
    f"📈 Lag period: {ONDEMAND_CONFIG['LAG_PERIOD']}",
    f"📊 Max samples: {ONDEMAND_CONFIG['MAX_SAMPLES']:,}",
    sep="\n",
)


In [None]:
# On-Demand Data Processor
class OnDemandProcessor:
    """Ultra memory-efficient on-demand data processor.

    Each stage (load, process, fit scalers, train) is a separate method that
    does its work only when called. The expected call order is
    ``load_data_on_demand`` -> ``process_data_on_demand`` ->
    ``fit_scalers_on_demand`` -> ``train_model_on_demand``.
    """

    def __init__(self, config):
        """Store the configuration; nothing is loaded or fitted yet.

        Args:
            config: Mapping providing the keys this class reads:
                'MAX_MEMORY_MB', 'MAX_SAMPLES', 'LAG_PERIOD',
                'SEQUENCE_LENGTH' and 'TARGET_COLS'.
        """
        self.config = config
        self.scaler_X = None      # feature scaler, set by fit_scalers_on_demand()
        self.scaler_y = None      # target scaler, set by fit_scalers_on_demand()
        self.feature_cols = None  # set by process_data_on_demand()
        self.is_fitted = False    # True once both scalers have been fitted
        self.raw_data = None      # concatenated frame set by load_data_on_demand()

    def load_data_on_demand(self, symbol, interval, year, months, max_rows=30000):
        """Download monthly data until a row or memory limit is hit.

        Args:
            symbol: Passed through to the download helpers.
            interval: Passed through to the download helpers.
            year: Passed through to the download helpers.
            months: Iterable of month identifiers, downloaded in order. A
                single string is treated as one month rather than being
                iterated character by character.
            max_rows: Maximum total number of rows to keep.

        Returns:
            True if at least one month was loaded (the result is stored in
            ``self.raw_data``), False otherwise.
        """
        print(f"📥 Loading data on-demand (max {max_rows:,} rows)...")

        # Iterating a bare string would yield single characters as "months".
        if isinstance(months, str):
            months = [months]

        all_data = []
        total_rows = 0

        for month in months:
            if total_rows >= max_rows:
                print(f"⚠️  Reached row limit at {total_rows:,} rows")
                break

            # Try Binance Vision first (more reliable), then API.
            # NOTE(review): neither download helper is among the names the
            # visible notebook cells import from `utils` — confirm they are
            # in scope before running.
            df = download_binance_vision_data(symbol, interval, year, month)
            if df is None:
                print(f"   Trying API fallback for {year}-{month}...")
                df = download_binance_klines_data(symbol, interval, year, month)

            if df is not None:
                # Keep only as many rows as the remaining budget allows.
                remaining_rows = max_rows - total_rows
                if len(df) > remaining_rows:
                    df = df.head(remaining_rows)
                    print(f"   Truncated to {len(df)} rows to stay within memory limit")

                all_data.append(df)
                total_rows += len(df)

                current_memory = get_memory_usage()
                print(f"   Total rows: {total_rows:,}, Memory: {current_memory:.1f} MB")

                # Stop early (keeping what was downloaded) once over budget.
                if not check_memory_limit(self.config['MAX_MEMORY_MB']):
                    print(f"⚠️  Memory limit reached, stopping download")
                    break
            else:
                print(f"   Failed to download {year}-{month}")

        if all_data:
            self.raw_data = pd.concat(all_data, ignore_index=True)
            print(f"✅ Data loaded on-demand!")
            print(f"   Final shape: {self.raw_data.shape}")
            print(f"   Memory usage: {get_memory_usage():.1f} MB")
            return True
        else:
            print("❌ No data loaded")
            return False

    def process_data_on_demand(self, max_samples=None):
        """Build features and sliding windows from the loaded data.

        Args:
            max_samples: Maximum number of samples to return. A falsy value
                (None or 0) falls back to ``config['MAX_SAMPLES']``.

        Returns:
            ``(X, y, feature_cols)`` on success, where X and y are the first
            two values returned by ``create_sliding_windows`` (truncated to
            ``max_samples``). ``(None, None, None)`` if no data is loaded,
            there are fewer clean rows than ``SEQUENCE_LENGTH``, or no
            windows were produced.
        """
        if self.raw_data is None:
            print("❌ No data loaded. Call load_data_on_demand() first.")
            return None, None, None

        max_samples = max_samples or self.config['MAX_SAMPLES']
        print(f"🔄 Processing data on-demand (max {max_samples:,} samples)...")

        # Create features; rows with NaN values are dropped.
        features = create_minimal_features(self.raw_data, self.config['LAG_PERIOD'])
        features_clean = features.dropna()

        if len(features_clean) < self.config['SEQUENCE_LENGTH']:
            print("❌ Not enough data for processing")
            return None, None, None

        # Feature columns are everything except timestamps and targets. The
        # exclusion set is built once instead of once per column.
        excluded_cols = {'Open time', 'Close time', *self.config['TARGET_COLS']}
        self.feature_cols = [col for col in features_clean.columns
                             if col not in excluded_cols]

        # Create sliding windows
        X, y, _ = create_sliding_windows(
            features_clean,
            self.config['SEQUENCE_LENGTH'],
            self.config['TARGET_COLS']
        )

        if len(X) == 0:
            print("❌ No valid samples created")
            return None, None, None

        # Keep only the first max_samples windows.
        if len(X) > max_samples:
            X = X[:max_samples]
            y = y[:max_samples]
            print(f"   Limited to {max_samples:,} samples for memory efficiency")

        print(f"✅ On-demand processing completed!")
        print(f"   Samples: {len(X):,}")
        print(f"   Features: {len(self.feature_cols)}")
        print(f"   Memory: {get_memory_usage():.1f} MB")

        return X, y, self.feature_cols

    def fit_scalers_on_demand(self, X_sample, y_sample):
        """Fit fresh MinMaxScalers for features and targets.

        Args:
            X_sample: Array whose last axis is the feature axis; all leading
                axes are flattened before fitting.
            y_sample: Array passed to ``MinMaxScaler.fit`` unchanged
                (assumes it is already 2-D — TODO confirm).
        """
        from sklearn.preprocessing import MinMaxScaler

        print("🔢 Fitting scalers on-demand...")

        self.scaler_X = MinMaxScaler()
        self.scaler_y = MinMaxScaler()

        # Collapse every axis except the last so each row is one timestep.
        X_reshaped = X_sample.reshape(-1, X_sample.shape[-1])
        self.scaler_X.fit(X_reshaped)
        self.scaler_y.fit(y_sample)

        self.is_fitted = True
        print(f"✅ Scalers fitted on {len(X_sample)} samples")

    def train_model_on_demand(self, X, y, epochs=20, batch_size=32):
        """Split, scale and train a lightweight LSTM model.

        The first 80% of the samples (by position) are used for training and
        the rest for validation. Both splits are scaled with the scalers
        fitted by ``fit_scalers_on_demand``.

        Args:
            X: Input windows, as returned by ``process_data_on_demand``.
            y: Targets aligned with ``X``.
            epochs: Passed to ``train_model_memory_efficient``.
            batch_size: Passed to ``train_model_memory_efficient``.

        Returns:
            ``(model, history, X_test_scaled, y_test_scaled)`` on success, or
            ``(None, None, None, None)`` if the scalers have not been fitted.
        """
        if not self.is_fitted:
            print("❌ Scalers not fitted. Call fit_scalers_on_demand() first.")
            # Four values so callers that unpack the result do not raise
            # TypeError on the failure path.
            return None, None, None, None

        print(f"🚀 Training model on-demand...")

        # Split data
        split_idx = int(len(X) * 0.8)
        X_train, X_test = X[:split_idx], X[split_idx:]
        y_train, y_test = y[:split_idx], y[split_idx:]

        # Scale BOTH splits with the instance scalers. Letting scale_data fit
        # new, discarded scalers for the training split would put train and
        # test data on different scales, and self.scaler_y (used afterwards
        # to evaluate) would not match what the model was trained on.
        # Assumes scale_data returns (X_scaled, y_scaled) when
        # fit_scalers=False, as the test-split call already relied on.
        X_train_scaled, y_train_scaled = scale_data(
            X_train, y_train, self.scaler_X, self.scaler_y, fit_scalers=False
        )
        X_test_scaled, y_test_scaled = scale_data(
            X_test, y_test, self.scaler_X, self.scaler_y, fit_scalers=False
        )

        # Create model
        input_shape = (X_train_scaled.shape[1], X_train_scaled.shape[2])
        output_dim = y_train_scaled.shape[1]

        model = create_lightweight_lstm_model(input_shape, output_dim)

        # Train model; the test split is passed as validation data.
        history = train_model_memory_efficient(
            model, X_train_scaled, y_train_scaled, X_test_scaled, y_test_scaled,
            epochs=epochs, batch_size=batch_size, verbose=1
        )

        print(f"✅ Model training completed!")
        print(f"   Final loss: {history.history['loss'][-1]:.4f}")
        print(f"   Memory usage: {get_memory_usage():.1f} MB")

        return model, history, X_test_scaled, y_test_scaled

print("✅ On-demand processor loaded!")


In [None]:
# On-Demand Data Loading
print("📥 ON-DEMAND DATA LOADING")
print("=" * 50)

# Create on-demand processor
processor = OnDemandProcessor(ONDEMAND_CONFIG)

# load_data_on_demand(symbol, interval, year, months, ...) iterates over
# `months` itself, so pass the year and the list of month strings separately.
# Passing combined "YEAR MONTH" strings here would make the loop walk over
# the individual characters of the second string.
success = processor.load_data_on_demand(
    ONDEMAND_CONFIG['SYMBOL'],
    ONDEMAND_CONFIG['INTERVAL'],
    ONDEMAND_CONFIG['YEAR'],
    ONDEMAND_CONFIG['MONTHS'],
    max_rows=30000  # Ultra-safe limit
)

if success:
    print(f"\n✅ Data loaded successfully!")
    print(f"📊 Data shape: {processor.raw_data.shape}")
    print(f"📅 Date range: {processor.raw_data['Open time'].min()} to {processor.raw_data['Open time'].max()}")
    print(f"💾 Memory usage: {get_memory_usage():.1f} MB")
    
    # Show sample data
    print(f"\n📋 Sample Data:")
    display(processor.raw_data.head(3))
    
    # Check memory
    if not check_memory_limit(ONDEMAND_CONFIG['MAX_MEMORY_MB']):
        print("⚠️  Memory usage high after data loading!")
        print("💡 Consider reducing data size")
    
else:
    print("❌ Data loading failed")
    # Later cells test `processor is not None` to decide whether to run.
    processor = None


In [None]:
# On-Demand Data Processing
# Guard first: this cell only does work when the loading cell left a usable
# processor behind.
if processor is None:
    print("❌ Processor not available")
else:
    print("🔄 ON-DEMAND DATA PROCESSING")
    print("=" * 50)

    # Build at most 5k samples from the loaded data
    X, y, feature_cols = processor.process_data_on_demand(max_samples=5000)

    if X is None:
        print("❌ Data processing failed")
    else:
        # Summary of what was produced
        print(f"\n✅ Data processing completed!")
        print(f"📊 Processed samples: {len(X):,}")
        print(f"📈 Features: {len(feature_cols)}")
        print(f"💾 Memory usage: {get_memory_usage():.1f} MB")

        # Shapes and the first few feature names
        print(f"\n📋 Sample Processed Data:")
        print(f"   X shape: {X.shape}")
        print(f"   y shape: {y.shape}")
        print(f"   Feature columns: {feature_cols[:10]}...")

        # Warn if processing pushed memory past the configured limit
        if not check_memory_limit(ONDEMAND_CONFIG['MAX_MEMORY_MB']):
            print("⚠️  Memory usage high after processing!")
            print("💡 Consider reducing sample size")


In [None]:
# On-Demand Model Training
# Runs only when the processing cell produced samples.
if 'X' in locals() and X is not None:
    print("🚀 ON-DEMAND MODEL TRAINING")
    print("=" * 50)
    
    # Fit scalers on sample data (at most the first 1000 samples)
    sample_size = min(1000, len(X))
    processor.fit_scalers_on_demand(X[:sample_size], y[:sample_size])
    
    # Train model only when needed. If training reports failure by returning
    # None, fall back to four Nones so the unpacking below cannot raise and
    # the "training failed" branch is actually reachable.
    model, history, X_test_scaled, y_test_scaled = processor.train_model_on_demand(
        X, y, epochs=15, batch_size=32  # Reduced for demo
    ) or (None, None, None, None)
    
    if model is not None:
        print(f"\n✅ Model training completed!")
        print(f"📊 Model architecture:")
        print(f"   Input shape: {X.shape[1:]}")
        print(f"   Output shape: {y.shape[1]}")
        print(f"   Parameters: {model.count_params():,}")
        print(f"💾 Memory usage: {get_memory_usage():.1f} MB")
        
        # Show training history
        print(f"\n📈 Training History:")
        print(f"   Final loss: {history.history['loss'][-1]:.4f}")
        print(f"   Final val_loss: {history.history['val_loss'][-1]:.4f}")
        
    else:
        print("❌ Model training failed")
        
else:
    print("❌ No processed data available for training")


In [None]:
# On-Demand Model Evaluation
# Guard first: nothing to evaluate unless the training cell produced a model.
if 'model' not in locals() or model is None:
    print("❌ Model not available for evaluation")
else:
    print("📊 ON-DEMAND MODEL EVALUATION")
    print("=" * 50)

    # Score the model on the scaled hold-out split
    results = evaluate_model(model, X_test_scaled, y_test_scaled, processor.scaler_y)

    # Print up to three actual/predicted pairs with their absolute error
    print(f"\n📋 Sample Predictions (First 3):")
    for i in range(min(3, len(results['y_pred']))):
        print(f"   Sample {i+1}:")
        actual = results['y_test'][i]
        print(f"     Actual:   {actual}")
        predicted = results['y_pred'][i]
        print(f"     Predicted: {predicted}")
        print(f"     Error:    {np.abs(actual - predicted)}")

    # Final memory report against the configured limit
    print(f"\n💾 Memory Usage Summary:")
    print(f"   Current memory: {get_memory_usage():.1f} MB")

    if get_memory_usage() >= ONDEMAND_CONFIG['MAX_MEMORY_MB']:
        print(f"⚠️  Memory usage high")
        print(f"   Consider reducing sample size or data")
    else:
        print(f"✅ Memory usage within limits!")
        print(f"   On-demand approach successful for 16GB M4 MacBook")

    print(f"\n🎉 On-demand approach completed successfully!")
    print(f"   Perfect for flexible processing on 16GB M4 MacBook")
