In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.ensemble import RandomForestRegressor
import joblib
import os

In [2]:
class FinancialPredictor:
    def __init__(self):
        self.scaler = MinMaxScaler()
        self.lstm_model = None
        self.rf_model = None
        self.sequence_length = 60  # Number of time steps to look back
        
    def prepare_data(self, data, target_col='Close'):
        """Prepare data for modeling"""
        # Make a copy to avoid modifying original data
        data = data.copy()
        
        # Check for NaN values before processing
        if data[target_col].isnull().any():
            print(f"Found {data[target_col].isnull().sum()} NaN values in {target_col}")
        
        # Handle missing values
        data[target_col] = data[target_col].fillna(method='ffill')  # Forward fill
        data[target_col] = data[target_col].fillna(method='bfill')  # Backward fill for any remaining NaNs
        
        # Verify no NaN values remain
        if data[target_col].isnull().any():
            raise ValueError("Unable to handle all NaN values in the data")
        
        # Scale the data
        scaled_data = self.scaler.fit_transform(data[[target_col]])
        
        # Create sequences
        X, y = [], []
        for i in range(self.sequence_length, len(scaled_data)):
            X.append(scaled_data[i-self.sequence_length:i])
            y.append(scaled_data[i])
        
        X, y = np.array(X), np.array(y)
        
        # Split for LSTM (3D shape)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=False)
        
        # Reshape for Random Forest (2D shape)
        X_rf = X.reshape(X.shape[0], -1)
        X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split(X_rf, y, test_size=0.3, shuffle=False)
        
        return (X_train, X_test, y_train, y_test), (X_train_rf, X_test_rf, y_train_rf, y_test_rf)
    
    def build_lstm_model(self, input_shape):
        """Build LSTM model"""
        model = Sequential([
            LSTM(50, return_sequences=True, input_shape=input_shape),
            Dropout(0.2),
            LSTM(50, return_sequences=False),
            Dropout(0.2),
            Dense(25),
            Dense(1)
        ])
        model.compile(optimizer='adam', loss='mse')
        return model

    def train_models(self, data, target_col='Close'):
        """Train both LSTM and Random Forest models"""
        # Prepare data
        lstm_data, rf_data = self.prepare_data(data, target_col)
        (X_train, X_test, y_train, y_test) = lstm_data
        (X_train_rf, X_test_rf, y_train_rf, y_test_rf) = rf_data
        
        # Train LSTM
        self.lstm_model = self.build_lstm_model((X_train.shape[1], 1))
        self.lstm_model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.1, verbose=1)
        
        # Train Random Forest
        self.rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
        self.rf_model.fit(X_train_rf, y_train_rf.ravel())
        
        # Evaluate models
        lstm_predictions = self.lstm_model.predict(X_test)
        rf_predictions = self.rf_model.predict(X_test_rf)
        
        # Convert predictions back to original scale
        lstm_predictions = self.scaler.inverse_transform(lstm_predictions)
        rf_predictions = self.scaler.inverse_transform(rf_predictions.reshape(-1, 1))
        y_test_orig = self.scaler.inverse_transform(y_test)
        
        # Calculate metrics
        metrics = {
            'LSTM': {
                'MSE': mean_squared_error(y_test_orig, lstm_predictions),
                'MAE': mean_absolute_error(y_test_orig, lstm_predictions),
                'R2': r2_score(y_test_orig, lstm_predictions)
            },
            'RF': {
                'MSE': mean_squared_error(y_test_orig, rf_predictions),
                'MAE': mean_absolute_error(y_test_orig, rf_predictions),
                'R2': r2_score(y_test_orig, rf_predictions)
            }
        }
        
        return metrics

    def save_models(self, symbol):
        """Save trained models"""
        if not os.path.exists('models'):
            os.makedirs('models')
            
        # Save LSTM model
        self.lstm_model.save(f'models/lstm_{symbol}.h5')
        
        # Save Random Forest model
        joblib.dump(self.rf_model, f'models/rf_{symbol}.joblib')
        
        # Save scaler
        joblib.dump(self.scaler, f'models/scaler_{symbol}.joblib')

    def load_models(self, symbol):
        """Load trained models"""
        self.lstm_model = load_model(f'models/lstm_{symbol}.h5')
        self.rf_model = joblib.load(f'models/rf_{symbol}.joblib')
        self.scaler = joblib.load(f'models/scaler_{symbol}.joblib')

    def predict(self, data, target_col='Close'):
        """Make predictions using both models"""
        data = data.copy()
        data[target_col] = data[target_col].fillna(method='ffill')
        data[target_col] = data[target_col].fillna(method='bfill')
     
        # Prepare data
        scaled_data = self.scaler.transform(data[[target_col]])
        
        # Prepare sequence for LSTM
        sequence = scaled_data[-self.sequence_length:]
        sequence = sequence.reshape(1, self.sequence_length, 1)
        
        # Prepare data for Random Forest
        rf_input = sequence.reshape(1, -1)
        
        # Make predictions
        lstm_pred = self.lstm_model.predict(sequence)
        rf_pred = self.rf_model.predict(rf_input)
        
        # Convert predictions back to original scale
        lstm_pred = self.scaler.inverse_transform(lstm_pred)
        rf_pred = self.scaler.inverse_transform(rf_pred.reshape(-1, 1))
        
        return {
            'LSTM': lstm_pred[0][0],
            'RF': rf_pred[0][0]
        }

# Example usage
if __name__ == "__main__":
    # Load processed data
    symbol = "BTC"  # Example symbol
    data = pd.read_csv(f'data/processed/crypto/{symbol}_processed.csv')
    
    # Initialize predictor
    predictor = FinancialPredictor()
    
    # Train models
    metrics = predictor.train_models(data)
    print("\nModel Metrics:")
    print(metrics)
    
    # Save models
    predictor.save_models(symbol)
    
    # Example of making a prediction
    last_60_days = data.tail(60)
    predictions = predictor.predict(last_60_days)
    print(f"\nPredictions for next day:")
    print(f"LSTM: ${predictions['LSTM']:.2f}")
    print(f"RF: ${predictions['RF']:.2f}")

Found 1 NaN values in Close


  data[target_col] = data[target_col].fillna(method='ffill')  # Forward fill
  data[target_col] = data[target_col].fillna(method='bfill')  # Backward fill for any remaining NaNs


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50

Model Metrics:
{'LSTM': {'MSE': 19347714.063588604, 'MAE': 2970.4302283022585, 'R2': 0.9815194308490012}, 'RF': {'MSE': 388512327.75292975, 'MAE': 10819.388262687111, 'R2': 0.6289004005612326}}

Predictions for next day:
LSTM: $106406.53
RF: $64540.78


  data[target_col] = data[target_col].fillna(method='ffill')
  data[target_col] = data[target_col].fillna(method='bfill')
