# Imports

In [141]:
import torch
import numpy as np
import pandas as pd
from pathlib import Path
import glob
from datetime import datetime, timedelta
import warnings
import gc
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler  # For data_scaler
from torch.utils.data import DataLoader        # For DataLoader
import torch.nn as nn                          # For nn.MSELoss()
import torch.optim as optim                    # For optim.Adam()
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
import traceback  # Add at the top with other imports

# TEST ON DATA

def analyze_data_timestamps(data_dir):
    """
    Analyze timestamp ranges in all CSV files in the directory.
    
    Args:
        data_dir (str): Path to directory containing CSV files
    """
    try:
        csv_files = glob.glob(str(Path(data_dir) / "*.csv"))
        if not csv_files:
            print(f"No CSV files found in {data_dir}")
            return
        
        print(f"\nFound {len(csv_files)} files. Analyzing timestamps...")
        
        for file in csv_files:
            try:
                # Read file
                df = pd.read_csv(file)
                df['ts_event'] = pd.to_datetime(df['ts_event'], utc=True)
                
                # Get timestamp range
                min_ts = df['ts_event'].min()
                max_ts = df['ts_event'].max()
                record_count = len(df)
                
                # Print results
                print(f"\nFile: {Path(file).name}")
                print(f"Records: {record_count:,}")
                print(f"Time range: {min_ts} to {max_ts}")
                
                # Print sample of data points
                print("\nSample timestamps (first 5):")
                for ts in df['ts_event'].head().dt.strftime('%Y-%m-%dT%H:%M:%S.%fZ'):
                    print(ts)
                    
                # Optional: analyze gaps
                time_diffs = df['ts_event'].diff()
                max_gap = time_diffs.max()
                if max_gap.total_seconds() > 60:  # Show gaps larger than 1 minute
                    gap_idx = time_diffs.idxmax()
                    gap_start = df['ts_event'].iloc[gap_idx-1]
                    gap_end = df['ts_event'].iloc[gap_idx]
                    print(f"\nLargest gap: {max_gap}")
                    print(f"Gap period: {gap_start} to {gap_end}")
                
            except Exception as e:
                print(f"Error processing {Path(file).name}: {str(e)}")
                traceback.print_exc()
                continue
                
    except Exception as e:
        print(f"Error analyzing directory: {str(e)}")
        traceback.print_exc()

# Use the function like this:
data_dir = r"C:\Users\cinco\Desktop\DATA FOR SCRIPTS\data bento data\test"
analyze_data_timestamps(data_dir)

# Config setup

In [142]:

class Config:
    # Training on May 3rd data
    TRAIN_START_DATE = "2018-05-03T08:00:46.000000Z"
    TRAIN_END_DATE = "2018-05-03T20:00:00.000000Z"  # Use first part for training
    
    # Testing on remaining May 3rd data
    TEST_START_DATE = "2018-05-03T20:00:00.000000Z"
    TEST_END_DATE = "2018-05-03T23:59:58.000000Z"
    

    DATA_DIR = r"C:\Users\cinco\Desktop\DATA FOR SCRIPTS\data bento data\test"
    
    # Model parameters
    BATCH_SIZE = 32
    HIDDEN_SIZE = 64
    NUM_LAYERS = 2
    LEARNING_RATE = 0.001
    EPOCHS = 50
    PATIENCE = 10
    
    # Training parameters
    TRAIN_VAL_SPLIT = 0.8
    NUM_WORKERS = 0
    RANDOM_SEED = 42
    
    # Dynamic parameters (to be set during runtime)
    sequence_length = None
    prediction_length = None
    


    @classmethod
    def validate_dates(cls):
        """Validate the configuration dates"""
        try:
            # Add small buffer to avoid exact matches
            train_start = pd.to_datetime(cls.TRAIN_START_DATE, utc=True)
            train_end = pd.to_datetime(cls.TRAIN_END_DATE, utc=True) + pd.Timedelta(microseconds=1)
            test_start = pd.to_datetime(cls.TEST_START_DATE, utc=True)
            test_end = pd.to_datetime(cls.TEST_END_DATE, utc=True) + pd.Timedelta(microseconds=1)
            
            print(f"\nValidating date ranges:")
            print(f"Train period: {train_start} to {train_end}")
            print(f"Test period: {test_start} to {test_end}")
            
            assert train_start < train_end, "Training start date must be before training end date"
            assert test_start <= test_end, "Test start date must be before or equal to test end date"
            assert train_end >= test_start, "Training end date should be at or after test start date"
            
            return True
        except Exception as e:
            print(f"Date validation error: {str(e)}")
            return False
    
    @classmethod
    def analyze_time_series(cls, data_dir=None):
        """Analyze time series data to determine appropriate sequence and prediction lengths."""
        try:
            import pandas as pd
            import glob
            from pathlib import Path
            import numpy as np
            
            # If no data_dir provided, use the class's DATA_DIR
            if data_dir is None:
                data_dir = cls.DATA_DIR
                
            all_diffs = []
            csv_files = glob.glob(str(Path(data_dir) / "*.csv"))
            
            # Process all CSV files to analyze time differences
            for file in csv_files:
                df = pd.read_csv(file)
                df['ts_event'] = pd.to_datetime(df['ts_event'])
                df = df.sort_values('ts_event')
                time_diffs = df['ts_event'].diff().dt.total_seconds()
                all_diffs.extend(time_diffs.dropna().tolist())
            
            if not all_diffs:
                raise ValueError("No valid time differences found in the data")
            
            # Calculate statistics
            median_diff = np.median(all_diffs)
            mean_diff = np.mean(all_diffs)
            std_diff = np.std(all_diffs)
            
            # Set sequence and prediction lengths based on data characteristics
            # Sequence length: typical number of observations in 30 minutes
            typical_observations_per_30min = int((30 * 60) / median_diff)
            cls.sequence_length = min(max(typical_observations_per_30min, 10), 100)
            
            # Prediction length: typical number of observations in 5 minutes
            typical_observations_per_5min = int((5 * 60) / median_diff)
            cls.prediction_length = min(max(typical_observations_per_5min, 5), 30)
            
            print(f"\nTime Series Analysis Results:")
            print(f"Median time between observations: {median_diff:.2f} seconds")
            print(f"Mean time between observations: {mean_diff:.2f} seconds")
            print(f"Standard deviation: {std_diff:.2f} seconds")
            print(f"Selected sequence length: {cls.sequence_length} observations")
            print(f"Selected prediction length: {cls.prediction_length} observations")
            
            return True
        except Exception as e:
            print(f"Error analyzing time series: {str(e)}")
            # Set default values if analysis fails
            cls.sequence_length = 100
            cls.prediction_length = 30
            return False

    @classmethod
    def initialize(cls):
        """Initialize configuration and validate settings"""
        if not cls.validate_dates():
            raise ValueError("Date validation failed")
        if not cls.analyze_time_series():
            print("Warning: Using default sequence and prediction lengths")
        return True
    


# Model Defintion

In [143]:
class PricePredictionLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(PricePredictionLSTM, self).__init__()
        
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=0.2
        )
        
        # Add batch normalization
        self.batch_norm = nn.BatchNorm1d(hidden_size)
        
        # Add dropout for regularization
        self.dropout = nn.Dropout(0.2)
        
        # Output layer
        self.fc = nn.Linear(hidden_size, 1)
    
    def forward(self, x):
        # Initialize hidden state with zeros
        batch_size = x.size(0)
        h0 = torch.zeros(self.num_layers, batch_size, self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, batch_size, self.hidden_size).to(x.device)
        
        # Forward propagate LSTM
        if len(x.shape) == 2:
            x = x.unsqueeze(1)  # Add time dimension if not present
            
        # LSTM forward pass
        out, _ = self.lstm(x, (h0, c0))
        
        # Apply batch normalization
        out = self.batch_norm(out[:, -1, :])
        
        # Apply dropout
        out = self.dropout(out)
        
        # Decode the hidden state
        out = self.fc(out)
        
        return out

# dataset classes

In [144]:
import pandas as pd
import torch
import torch.nn as nn
from datetime import datetime
import glob
from pathlib import Path
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler

class TimeSeriesDataset(Dataset):
    def __init__(self, data, sequence_length=60):
        """
        Args:
            data: Normalized DataFrame with timestamp index and features
            sequence_length: Number of time steps to look back
        """
        self.sequence_length = sequence_length
        
        # Convert DataFrame to numpy array
        if isinstance(data, pd.DataFrame):
            self.data = data.values
        else:
            self.data = data
            
        # Create sequences
        self.sequences = self._create_sequences()
    
    def _create_sequences(self):
        sequences = []
        for i in range(len(self.data) - self.sequence_length):
            # Get sequence of prices and features
            sequence = self.data[i:(i + self.sequence_length)]
            target = self.data[i + self.sequence_length, 0]  # Price is first column
            sequences.append((sequence, target))
        return sequences
    
    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self, idx):
        sequence, target = self.sequences[idx]
        return torch.FloatTensor(sequence), torch.FloatTensor([target])







# training function

In [145]:
def train_model(model, train_loader, val_loader, criterion, optimizer, 
                num_epochs, device, patience=10):
    """Train the model with early stopping."""
    best_val_loss = float('inf')
    no_improve = 0
    
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        
        for sequences, targets in train_loader:
            sequences, targets = sequences.to(device), targets.to(device)
            
            optimizer.zero_grad()
            outputs = model(sequences)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item()
        
        # Validation phase
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for sequences, targets in val_loader:
                sequences, targets = sequences.to(device), targets.to(device)
                outputs = model(sequences)
                val_loss += criterion(outputs, targets).item()
        
        # Print progress
        print(f'Epoch {epoch+1}/{num_epochs}')
        print(f'Training Loss: {train_loss/len(train_loader):.4f}')
        print(f'Validation Loss: {val_loss/len(val_loader):.4f}')
        
        # Early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), 'best_model.pth')
            no_improve = 0
        else:
            no_improve += 1
            if no_improve >= patience:
                print(f"Early stopping triggered at epoch {epoch+1}")
                break

# Data Processing

In [146]:
def process_csv_file(file_path):
    """Process a single CSV file with proper timestamp and price handling"""
    df = pd.read_csv(file_path)
    # Convert timestamp string to datetime
    df['ts_event'] = pd.to_datetime(df['ts_event'], format='%Y-%m-%dT%H:%M:%S.%fZ', utc=True)
    # Convert to local timezone if needed
    # df['ts_event'] = df['ts_event'].dt.tz_convert('Your_Timezone')
    
    # Ensure price is float
    df['price'] = pd.to_numeric(df['price'], errors='coerce')
    
    # Sort by timestamp
    df = df.sort_values('ts_event')
    
    return df[['ts_event', 'price']]
def process_csv_files(config):
    """Process all CSV files in the data directory."""
    all_data = []
    csv_files = glob.glob(str(Path(config.DATA_DIR) / "*.csv"))
    
    if not csv_files:
        raise ValueError(f"No CSV files found in {config.DATA_DIR}")
    
    for file in csv_files:
        # Read CSV with proper timestamp parsing
        df = pd.read_csv(file)
        df['ts_event'] = pd.to_datetime(df['ts_event'])
        df = df.set_index('ts_event')
        df = df.sort_index()  # Sort by timestamp
        all_data.append(df)
    
    # Combine all data and sort by timestamp
    combined_df = pd.concat(all_data)
    combined_df = combined_df.sort_index()
    
    # Convert config dates to pandas datetime
    train_start = pd.to_datetime(config.TRAIN_START_DATE)
    train_end = pd.to_datetime(config.TRAIN_END_DATE)
    test_start = pd.to_datetime(config.TEST_START_DATE)
    test_end = pd.to_datetime(config.TEST_END_DATE)
    
    # Split into train and test
    train_data = combined_df[(combined_df.index >= train_start) & 
                            (combined_df.index <= train_end)].copy()
    
    test_data = combined_df[(combined_df.index >= test_start) & 
                           (combined_df.index <= test_end)].copy()
    
    return train_data, test_data
def prepare_data(train_data, test_data, sequence_length=60):
    """Prepare data for LSTM model."""
    # Add technical indicators
    def add_features(df):
        df['returns'] = df['price'].pct_change()
        df['log_returns'] = np.log1p(df['returns'])
        
        # Add rolling statistics
        for window in [5, 10, 20]:
            df[f'ma_{window}'] = df['price'].rolling(window=window).mean()
            df[f'std_{window}'] = df['price'].rolling(window=window).std()
            
        # Add time features
        df['hour'] = df.index.hour
        df['minute'] = df.index.minute
        df['day_of_week'] = df.index.dayofweek
        
        return df.fillna(method='ffill').fillna(method='bfill')
    
    train_data = add_features(train_data)
    test_data = add_features(test_data)
    
    # Scale the data
    scaler = MinMaxScaler()
    train_scaled = scaler.fit_transform(train_data)
    test_scaled = scaler.transform(test_data)
    
    # Create datasets
    train_dataset = TimeSeriesDataset(train_scaled, sequence_length)
    test_dataset = TimeSeriesDataset(test_scaled, sequence_length)
    
    return train_dataset, test_dataset, scaler

def create_data_loaders(train_dataset, test_dataset, batch_size=32, num_workers=0):
    """Create DataLoaders for training and testing."""
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
        drop_last=True
    )
    
    test_loader = DataLoader(
        test_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        drop_last=False
    )
    
    return train_loader, test_loader

# visuals and pre processing

In [147]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler



def add_technical_features(df):
    """Add technical indicators as features."""
    # Convert only numeric columns to float32
    numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns
    df[numeric_columns] = df[numeric_columns].astype('float32')
    
    df['returns'] = df['price'].pct_change()
    
    windows = [5, 15, 30, 60]
    for window in windows:
        df[f'sma_{window}'] = df['price'].rolling(window=window).mean()
        df[f'std_{window}'] = df['price'].rolling(window=window).std()
    
    # Time-based features should be handled separately
    df['hour'] = df.index.hour
    df['minute'] = df.index.minute
    df['day_of_week'] = df.index.dayofweek
    
    # Ensure numeric columns are float32
    numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns
    df[numeric_columns] = df[numeric_columns].astype('float32')
    
    df = df.ffill().bfill()    
    return df


# Config and Setup

In [148]:
def initialize_training_environment(config):
    """Initialize training environment and device"""
    torch.manual_seed(config.RANDOM_SEED)
    np.random.seed(config.RANDOM_SEED)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    torch.backends.cudnn.benchmark = True
    print(f"Using device: {device}")
    return device

In [149]:
def generate_predictions(model, scaled_data, data_scaler, config, device):
    """Generate model predictions"""
    model.eval()
    predictions_list = []
    
    with torch.no_grad():
        # Ensure scaled_data is a numpy array of float type
        scaled_data = np.array(scaled_data, dtype=np.float32)
        
        # Get the last sequence from training data
        last_sequence = scaled_data[-config.sequence_length:].reshape(1, config.sequence_length, -1)
        last_sequence = torch.FloatTensor(last_sequence).to(device)
        current_sequence = last_sequence
        
        # Generate predictions
        for _ in range(config.prediction_length):
            pred = model(current_sequence)
            predictions_list.append(pred.cpu().numpy()[0])
            
            # Update sequence for next prediction
            new_sequence = current_sequence.clone()
            new_sequence = new_sequence[:, 1:, :]
            new_row = torch.zeros((1, 1, scaled_data.shape[1])).to(device)
            new_row[:, :, 0] = pred
            current_sequence = torch.cat([new_sequence, new_row], dim=1)
    
    # Convert predictions back to original scale
    predictions = np.array(predictions_list)
    predictions_reshaped = np.zeros((len(predictions), scaled_data.shape[1]))
    predictions_reshaped[:, 0] = predictions.flatten()
    predicted_prices = data_scaler.inverse_transform(predictions_reshaped)[:, 0]
    
    return predicted_prices


In [150]:
def process_timestamps(df):
    """
    Process timestamps from ISO format and ensure proper time series continuity
    
    Args:
        df (pd.DataFrame): DataFrame with ts_event column in ISO format
    
    Returns:
        pd.DataFrame: Processed DataFrame with proper timestamps
    """
    # Make a copy to avoid modifying the original
    df = df.copy()
    
    # Convert ISO timestamps to pandas datetime if not already datetime
    if not pd.api.types.is_datetime64_any_dtype(df['ts_event']):
        df['ts_event'] = pd.to_datetime(df['ts_event'], format='%Y-%m-%dT%H:%M:%S.%fZ', utc=True)
    
    # Sort by timestamp
    df = df.sort_values('ts_event')
    
    # Calculate time differences between consecutive timestamps
    df['time_diff'] = df['ts_event'].diff()
    
    # Add timestamp-based features
    df['hour'] = df['ts_event'].dt.hour
    df['minute'] = df['ts_event'].dt.minute
    df['second'] = df['ts_event'].dt.second
    df['microsecond'] = df['ts_event'].dt.microsecond
    
    return df

def prepare_training_data(train_df, config):
    """Prepare data for training with consistent feature handling"""
    try:
        # Store original data before modifications
        original_data = {
            'timestamp': train_df.index,
            'price': train_df['price'].copy()
        }
        
        print("Adding technical features...")
        # Add technical features while keeping track of feature names
        feature_columns = ['price']  # Start with price as first feature
        
        # Calculate returns
        train_df['returns'] = train_df['price'].pct_change()
        feature_columns.append('returns')
        
        # Add rolling statistics
        windows = [5, 15, 30, 60]
        for window in windows:
            train_df[f'sma_{window}'] = train_df['price'].rolling(window=window).mean()
            train_df[f'std_{window}'] = train_df['price'].rolling(window=window).std()
            feature_columns.extend([f'sma_{window}', f'std_{window}'])
        
        # Fill NaN values
        train_df = train_df.fillna(method='ffill').fillna(method='bfill')
        
        print("Scaling data...")
        # Scale only the numeric features
        scaler = MinMaxScaler()
        scaled_data = scaler.fit_transform(train_df[feature_columns])
        
        # Store feature names with the scaler for later use
        scaler.feature_names_ = feature_columns
        
        return scaled_data, scaler, pd.DataFrame(original_data)
        
    except Exception as e:
        print(f"Error in prepare_training_data: {str(e)}")
        return None, None, None


def prepare_data_for_visualization(model, train_df, test_data, config, scaler):
    """Prepare data for visualization with consistent feature handling"""
    try:
        # Get test data with a small buffer
        test_start = pd.to_datetime(config.TEST_START_DATE, utc=True) - pd.Timedelta(microseconds=1)
        test_end = pd.to_datetime(config.TEST_END_DATE, utc=True)
        
        # Filter test data
        test_data = train_df[
            (train_df.index > test_start) & 
            (train_df.index <= test_end)
        ].copy()
        
        # Validate input data
        if test_data.empty:
            print(f"Test period: {test_start} to {test_end}")
            print(f"Available data range: {train_df.index.min()} to {train_df.index.max()}")
            raise ValueError("Test data is empty. Check your date ranges.")
            
        # Print data ranges for debugging
        print(f"Test data range: {test_data.index.min()} to {test_data.index.max()}")
        print(f"Test data shape: {test_data.shape}")
        
        # Create DataFrame for visualization
        actual_data = pd.DataFrame({
            'ts_event': test_data.index,
            'price': test_data['price'],
            'type': 'actual'
        })
        
        # Generate predictions
        train_data_numpy = train_df.select_dtypes(include=[np.number]).values
        predicted_prices = generate_predictions(model, train_data_numpy, scaler, config, next(iter(model.parameters())).device)
        
        # Create predictions DataFrame
        predictions_df = pd.DataFrame({
            'ts_event': pd.date_range(
                start=test_data.index[0],
                periods=len(predicted_prices),
                freq='1min'
            ),
            'price': predicted_prices,
            'type': 'predicted'
        })
        
        return actual_data, predictions_df
            
    except Exception as e:
        print(f"Error in prepare_data_for_visualization: {str(e)}")
        traceback.print_exc()
        return None, None


def plot_predictions_with_actual(actual_data, predictions_df, config):
    """Plot actual vs predicted prices with proper timestamp handling"""
    plt.figure(figsize=(15, 8))
    
    # Make sure timestamps are datetime
    actual_data['ts_event'] = pd.to_datetime(actual_data['ts_event'])
    predictions_df['ts_event'] = pd.to_datetime(predictions_df['ts_event'])
    
    # Plot actual prices
    actual_mask = actual_data['type'] == 'actual'
    if any(actual_mask):
        plt.plot(actual_data[actual_mask]['ts_event'],
                actual_data[actual_mask]['price'],
                label='Actual Price',
                color='green',
                linewidth=2)
        
        # Print actual price stats
        print("\nActual Price Statistics:")
        print(actual_data[actual_mask]['price'].describe())
    
    # Plot predicted prices
    predicted_mask = predictions_df['type'] == 'predicted'
    if any(predicted_mask):
        plt.plot(predictions_df[predicted_mask]['ts_event'],
                predictions_df[predicted_mask]['price'],
                label='Predicted Price',
                color='blue',
                linewidth=2,
                linestyle='--')
        
        # Print predicted price stats
        print("\nPredicted Price Statistics:")
        print(predictions_df[predicted_mask]['price'].describe())
    
    # Add vertical line at prediction start
    pred_start = pd.to_datetime(config.TEST_START_DATE, format='%Y-%m-%dT%H:%M:%S.%fZ', utc=True)
    plt.axvline(x=pred_start, color='red', linestyle=':', label='Prediction Start')
    
    plt.title('Price Prediction Analysis', fontsize=16, pad=20)
    plt.xlabel('Time', fontsize=12)
    plt.ylabel('Price ($)', fontsize=12)
    plt.legend(fontsize=10)
    plt.grid(True, alpha=0.3)
    
    # Format axes
    plt.gcf().autofmt_xdate()
    plt.gca().yaxis.set_major_formatter(plt.FormatStrFormatter('%.6f'))
    
    plt.tight_layout()
    plt.show()

In [151]:
def load_and_process_data(config):
    """Load and process all data files with proper timestamp handling"""
    all_data = []
    csv_files = glob.glob(str(Path(config.DATA_DIR) / "*.csv"))
    
    if not csv_files:
        raise ValueError(f"No CSV files found in directory: {config.DATA_DIR}")
    
    print(f"Found {len(csv_files)} CSV files")
    print("Processing data files...")
    
    for file in tqdm(csv_files):
        try:
            # Read CSV
            df = pd.read_csv(file)
            
            # Convert timestamps properly without trying to localize
            df['ts_event'] = pd.to_datetime(df['ts_event'], utc=True)
            
            # Convert config dates to datetime with UTC timezone
            train_start = pd.to_datetime(config.TRAIN_START_DATE, utc=True)
            train_end = pd.to_datetime(config.TRAIN_END_DATE, utc=True)
            
            # Add a small buffer to avoid exact timestamp matching issues
            train_start = train_start - pd.Timedelta(seconds=1)
            train_end = train_end + pd.Timedelta(seconds=1)
            
            # Filter data
            mask = (df['ts_event'] >= train_start) & (df['ts_event'] <= train_end)
            df = df[mask]
            
            if not df.empty:
                all_data.append(df)
                print(f"File {Path(file).name}: Found {len(df)} records in time range")
            
        except Exception as e:
            print(f"Error processing file {file}: {str(e)}")
            traceback.print_exc()
            continue
    
    if not all_data:
        raise ValueError(
            f"No data found between {config.TRAIN_START_DATE} and {config.TRAIN_END_DATE}\n"
            f"Please check your date ranges and data directory: {config.DATA_DIR}"
        )
    
    # Combine all data
    combined_df = pd.concat(all_data)
    combined_df = combined_df.sort_values('ts_event')
    
    # Print data range information
    print(f"\nData range: {combined_df['ts_event'].min()} to {combined_df['ts_event'].max()}")
    print(f"Total records: {len(combined_df)}")
    
    # Create regular time intervals if needed
    print("\nResampling to regular intervals...")
    combined_df.set_index('ts_event', inplace=True)
    # Resample to 1-minute intervals
    combined_df = combined_df.resample('1min').last().ffill()
    
    return combined_df

In [152]:
def create_data_loaders(scaled_data, config):
    """Create train and validation data loaders"""
    train_size = int(config.TRAIN_VAL_SPLIT * len(scaled_data))
    
    train_dataset = TimeSeriesDataset(scaled_data[:train_size], sequence_length=config.sequence_length)
    val_dataset = TimeSeriesDataset(scaled_data[train_size:], sequence_length=config.sequence_length)
    
    train_loader = DataLoader(
        train_dataset,
        batch_size=config.BATCH_SIZE,
        shuffle=True,
        num_workers=config.NUM_WORKERS,
        drop_last=True
    )
    
    val_loader = DataLoader(
        val_dataset,
        batch_size=config.BATCH_SIZE,
        shuffle=False,
        num_workers=config.NUM_WORKERS,
        drop_last=True
    )
    
    return train_loader, val_loader

In [153]:
def create_comparison_dataframes(predicted_prices, actual_data, config):
    """Create DataFrames for comparison"""
    last_timestamp = pd.to_datetime(config.TEST_START_DATE)
    
    # Create timestamp index for predictions
    pred_index = pd.date_range(
        start=last_timestamp,
        periods=config.prediction_length + 1,
        freq='1min'
    )[1:]
    
    predictions_df = pd.DataFrame({
        'timestamp': pred_index,
        'price': predicted_prices,
        'type': 'predicted'
    })
    
    # Get actual prices for comparison
    actual_prices = actual_data[
        (actual_data['timestamp'] <= pd.to_datetime(config.TEST_END_DATE)) &
        (actual_data['timestamp'] >= pd.to_datetime(config.TEST_START_DATE))
    ].copy()
    
    actual_prices = actual_prices.rename(columns={'actual_price': 'price'})
    actual_prices['type'] = 'actual'
    
    # Combine actual and predicted
    comparison_df = pd.concat([actual_prices, predictions_df], axis=0)
    comparison_df = comparison_df.sort_values('timestamp').reset_index(drop=True)
    
    return predictions_df, comparison_df

def create_prediction_timestamps(config, n_predictions):
    """Create properly spaced timestamps for predictions"""
    # Convert test start date to datetime
    test_start = pd.to_datetime(config.TEST_START_DATE, format='%Y-%m-%dT%H:%M:%S.%fZ', utc=True)
    
    # Create timestamp range for predictions
    pred_timestamps = pd.date_range(
        start=test_start,
        periods=n_predictions + 1,  # +1 to include start
        freq='1min',  # adjust frequency as needed
        tz='UTC'
    )
    
    return pred_timestamps

In [154]:
def plot_predictions(comparison_df, config):
    """Plot actual vs predicted prices with proper timestamp handling"""
    plt.figure(figsize=(15, 8))
    
    # Convert timestamps to datetime if they aren't already
    comparison_df['timestamp'] = pd.to_datetime(comparison_df['timestamp'])
    
    # Plot actual prices
    actual_mask = comparison_df['type'] == 'actual'
    if any(actual_mask):
        plt.plot(comparison_df[actual_mask]['timestamp'],
                comparison_df[actual_mask]['price'],
                label='Actual Price',
                color='green',
                linewidth=2)
    
    # Plot predicted prices
    predicted_mask = comparison_df['type'] == 'predicted'
    if any(predicted_mask):
        plt.plot(comparison_df[predicted_mask]['timestamp'],
                comparison_df[predicted_mask]['price'],
                label='Predicted Price',
                color='blue',
                linewidth=2,
                linestyle='--')
    
    # Add vertical line at prediction start
    pred_start = pd.to_datetime(config.TEST_START_DATE, format='%Y-%m-%dT%H:%M:%S.%fZ', utc=True)
    plt.axvline(x=pred_start, color='red', linestyle=':', label='Prediction Start')
    
    plt.title('Price Prediction Analysis', fontsize=16, pad=20)
    plt.xlabel('Time', fontsize=12)
    plt.ylabel('Price ($)', fontsize=12)
    plt.legend(fontsize=10)
    plt.grid(True, alpha=0.3)
    
    # Format x-axis
    plt.gcf().autofmt_xdate()  # Rotate and align x-axis labels
    
    # Use tight layout
    plt.tight_layout()
    
    plt.show()

In [155]:
def calculate_metrics(comparison_df):
    """Calculate and display prediction metrics"""
    actual_mask = comparison_df['type'] == 'actual'
    predicted_mask = comparison_df['type'] == 'predicted'
    overlap_timestamps = set(comparison_df[actual_mask]['timestamp']) & set(comparison_df[predicted_mask]['timestamp'])
    
    if overlap_timestamps:
        overlap_df = comparison_df[comparison_df['timestamp'].isin(overlap_timestamps)].copy()
        actual_values = overlap_df[actual_mask]['price'].values
        predicted_values = overlap_df[predicted_mask]['price'].values
        
        metrics = {
            'MSE': np.mean((actual_values - predicted_values) ** 2),
            'RMSE': np.sqrt(np.mean((actual_values - predicted_values) ** 2)),
            'MAE': np.mean(np.abs(actual_values - predicted_values)),
            'MAPE': np.mean(np.abs((actual_values - predicted_values) / actual_values)) * 100,
            'Correlation': np.corrcoef(actual_values, predicted_values)[0, 1] if len(actual_values) > 1 else np.nan
        }
        
        metrics_df = pd.DataFrame([metrics]).T.round(4)
        metrics_df.columns = ['Value']
        
        print("\nPrediction Metrics:")
        display(metrics_df)
        
        return metrics_df
    else:
        print("\nNo overlap period found between actual and predicted prices.")
        return None

In [None]:
def main(config):
    """Main training and prediction pipeline"""
    try:
        # Initialize configuration and environment
        if not config.initialize():
            raise ValueError("Configuration initialization failed")
        device = initialize_training_environment(config)
        
        print("\nInitialization complete:")
        print(f"Sequence length: {config.sequence_length}")
        print(f"Prediction length: {config.prediction_length}")
        print(f"Using device: {device}")
        
        # Load and process data
        print("\nLoading and processing data...")
        train_df = load_and_process_data(config)
        if train_df is None or train_df.empty:
            raise ValueError("No training data loaded")
        
        # Print data range information
        print(f"\nLoaded data range: {train_df.index.min()} to {train_df.index.max()}")
        print(f"Number of training samples: {len(train_df)}")
        
        print("\nPreparing training data...")
        scaled_data, data_scaler, actual_data = prepare_training_data(train_df, config)
        if scaled_data is None:
            raise ValueError("Failed to scale data")
        
        print("\nCreating data loaders...")
        train_loader, val_loader = create_data_loaders(scaled_data, config)
        
        # Initialize model
        print("\nInitializing model...")
        input_size = scaled_data.shape[1]  # Number of features
        model = PricePredictionLSTM(
            input_size=input_size,
            hidden_size=config.HIDDEN_SIZE,
            num_layers=config.NUM_LAYERS
        ).to(device)
        
        # Setup training
        criterion = nn.MSELoss()
        optimizer = optim.Adam(model.parameters(), lr=config.LEARNING_RATE)
        
        # Train model
        print(f"\nStarting training for {config.EPOCHS} epochs...")
        train_model(model, train_loader, val_loader, criterion, optimizer, 
                   config.EPOCHS, device, config.PATIENCE)
        
        # Prepare test data with buffer
        test_start = pd.to_datetime(config.TEST_START_DATE, utc=True) - pd.Timedelta(microseconds=1)
        test_end = pd.to_datetime(config.TEST_END_DATE, utc=True) + pd.Timedelta(microseconds=1)
        
        test_data = train_df[
            (train_df.index > test_start) & 
            (train_df.index <= test_end)
        ].copy()
        
        if test_data.empty:
            print(f"\nTest period: {test_start} to {test_end}")
            print(f"Available data range: {train_df.index.min()} to {train_df.index.max()}")
            raise ValueError(f"No test data found in specified range")
        
        print(f"\nTest data range: {test_data.index.min()} to {test_data.index.max()}")
        print(f"Number of test samples: {len(test_data)}")
        
        # Generate predictions
        print("\nGenerating predictions...")
        actual_data_test, predictions_df = prepare_data_for_visualization(
            model, train_df, test_data, config, data_scaler
        )
        
        if predictions_df is None or predictions_df.empty:
            raise ValueError("Failed to generate predictions")
        
        # Visualize results
        print("\nVisualizing results...")
        plot_predictions_with_actual(actual_data_test, predictions_df, config)
        
        # Calculate metrics
        print("\nCalculating metrics...")
        comparison_df = prepare_comparison_dataframe(actual_data_test, predictions_df)
        metrics_df = calculate_metrics(comparison_df)
        
        # Print summary statistics
        print("\nSummary Statistics:")
        print_summary_statistics(comparison_df)
        
        print("\nPipeline completed successfully!")
        return model, comparison_df, metrics_df
        
    except Exception as e:
        print(f"\nError in main pipeline: {str(e)}")
        traceback.print_exc()
        return None, None, None
    
    finally:
        # Cleanup
        try:
            torch.cuda.empty_cache()
        except:
            pass

def run_pipeline(config_class=Config):
    """Wrapper function to run the main pipeline with proper setup"""
    try:
        # Create Config instance if needed
        config = config_class() if isinstance(config_class, type) else config_class
            
        # Run main pipeline
        model, comparison_df, metrics_df = main(config)
        
        if model is None:
            print("\nPipeline failed!")
            return None, None, None
            
        # Save results if successful
        save_results(model, comparison_df, metrics_df)
        
        return model, comparison_df, metrics_df
        
    except Exception as e:
        print(f"\nFatal error in pipeline: {str(e)}")
        traceback.print_exc()
        return None, None, None

def prepare_comparison_dataframe(actual_data_test, predictions_df):
    """Prepare comparison DataFrame with consistent column names"""
    if 'ts_event' not in actual_data_test.columns:
        actual_data_test['ts_event'] = actual_data_test.index
    if 'ts_event' not in predictions_df.columns:
        predictions_df['ts_event'] = predictions_df.index
        
    comparison_df = pd.concat([actual_data_test, predictions_df])
    return comparison_df.sort_values('ts_event').reset_index(drop=True)

def print_summary_statistics(comparison_df):
    """Print summary statistics for actual and predicted prices"""
    for data_type in ['actual', 'predicted']:
        data_mask = comparison_df['type'] == data_type
        if any(data_mask):
            print(f"\n{data_type.title()} Price Statistics:")
            print(comparison_df[data_mask]['price'].describe())

def save_results(model, comparison_df, metrics_df):
    """Save model, comparison results, and metrics"""
    try:
        torch.save(model.state_dict(), 'final_model.pth')
        
        if comparison_df is not None:
            comparison_df_save = comparison_df.copy()
            if 'ts_event' in comparison_df_save.columns:
                comparison_df_save['ts_event'] = comparison_df_save['ts_event'].dt.strftime('%Y-%m-%dT%H:%M:%S.%fZ')
            comparison_df_save.to_csv('comparison_results.csv', index=False)
        
        if metrics_df is not None:
            metrics_df.to_csv('metrics_results.csv')
        
        print("\nResults saved successfully!")
    except Exception as e:
        print(f"\nWarning: Failed to save results: {str(e)}")


In [157]:
model, comparison_df, metrics_df = run_pipeline(Config)


Validating date ranges:
Train period: 2018-05-03 08:00:46+00:00 to 2018-05-03 20:00:00.000001+00:00
Test period: 2018-05-03 20:00:00+00:00 to 2018-05-03 23:59:58.000001+00:00

Time Series Analysis Results:
Median time between observations: 0.00 seconds
Mean time between observations: 1.69 seconds
Standard deviation: 27.72 seconds
Selected sequence length: 100 observations
Selected prediction length: 30 observations
Using device: cuda

Initialization complete:
Sequence length: 100
Prediction length: 30
Using device: cuda

Loading and processing data...
Found 2 CSV files
Processing data files...


100%|██████████| 2/2 [00:00<00:00, 14.54it/s]
  train_df = train_df.fillna(method='ffill').fillna(method='bfill')


File xnas.itch_NVDA_20180503_to_20180504.csv: Found 41983 records in time range

Data range: 2018-05-03 08:00:46.643023992+00:00 to 2018-05-03 20:00:00.225802840+00:00
Total records: 41983

Resampling to regular intervals...

Loaded data range: 2018-05-03 08:00:00+00:00 to 2018-05-03 20:00:00+00:00
Number of training samples: 721

Preparing training data...
Adding technical features...
Scaling data...

Creating data loaders...

Initializing model...

Starting training for 50 epochs...
Epoch 1/50
Training Loss: 0.3244
Validation Loss: 0.9766
Epoch 2/50
Training Loss: 0.2202
Validation Loss: 0.8382
Epoch 3/50
Training Loss: 0.1557
Validation Loss: 0.6840
Epoch 4/50
Training Loss: 0.1355
Validation Loss: 0.5724
Epoch 5/50
Training Loss: 0.0980
Validation Loss: 0.3591
Epoch 6/50
Training Loss: 0.0849
Validation Loss: 0.1177
Epoch 7/50
Training Loss: 0.0735
Validation Loss: 0.0381
Epoch 8/50
Training Loss: 0.0629
Validation Loss: 0.0513
Epoch 9/50
Training Loss: 0.0573
Validation Loss: 0.01

Traceback (most recent call last):
  File "C:\Users\cinco\AppData\Local\Temp\ipykernel_30416\3484768983.py", line 106, in prepare_data_for_visualization
    predicted_prices = generate_predictions(model, train_data_numpy, scaler, config, next(iter(model.parameters())).device)
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\cinco\AppData\Local\Temp\ipykernel_30416\3343492228.py", line 17, in generate_predictions
    pred = model(current_sequence)
           ^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\cinco\AppData\Local\Programs\Python\Python311\Lib\site-packages\torch\nn\modules\module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\cinco\AppData\Local\Programs\Python\Python311\Lib\site-packages\torch\nn\modules\module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^