# Imports

In [58]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler
import os
from datetime import datetime, timedelta


# Config

In [59]:
import pandas as pd
import glob
from pathlib import Path
import numpy as np

class Config:
    """Central configuration for the price-forecasting experiment.

    Holds the train/test date ranges, the data directory, model and
    training hyper-parameters, and two "dynamic" values
    (``sequence_length`` and ``prediction_length``) that are derived from
    the data by :meth:`analyze_time_series`.  Everything is stored on the
    class itself; no instance is ever created.
    """

    # Date ranges
    TRAIN_START_DATE = "2018-05-02T08:44:39.292059872Z"
    TRAIN_END_DATE = "2018-05-02T23:59:55.940964414Z"
    TEST_START_DATE = "2018-05-02T23:59:55.940964414Z"
    TEST_END_DATE = "2018-05-03T23:59:58.000000Z"

    # Data directory
    DATA_DIR = r"C:\Users\cinco\Desktop\DATA FOR SCRIPTS\data bento data\test"

    # Model parameters
    BATCH_SIZE = 32
    HIDDEN_SIZE = 64
    NUM_LAYERS = 2
    LEARNING_RATE = 0.001
    EPOCHS = 2
    PATIENCE = 10

    # Training parameters
    NUM_WORKERS = 0
    RANDOM_SEED = 42

    # Dynamic parameters (filled in by analyze_time_series)
    sequence_length = None
    prediction_length = None

    # (lower, upper) clamps, in observations, for the derived lengths.
    _SEQUENCE_BOUNDS = (10, 100)
    _PREDICTION_BOUNDS = (5, 30)

    @classmethod
    def validate_dates(cls):
        """Check that the configured date ranges are parseable and ordered.

        Each end date is extended by one microsecond before comparison.

        Returns:
            True when all checks pass, False otherwise (the reason is
            printed).
        """
        try:
            train_start = pd.to_datetime(cls.TRAIN_START_DATE, utc=True)
            train_end = pd.to_datetime(cls.TRAIN_END_DATE, utc=True) + pd.Timedelta(microseconds=1)
            test_start = pd.to_datetime(cls.TEST_START_DATE, utc=True)
            test_end = pd.to_datetime(cls.TEST_END_DATE, utc=True) + pd.Timedelta(microseconds=1)

            print(f"\nValidating date ranges:")
            print(f"Train period: {train_start} to {train_end}")
            print(f"Test period: {test_start} to {test_end}")

            # Explicit raises instead of ``assert`` so the checks still run
            # under ``python -O``.  ``not (a < b)`` (rather than ``a >= b``)
            # keeps NaT values failing the check.
            if not train_start < train_end:
                raise ValueError("Training start date must be before training end date")
            if not test_start <= test_end:
                raise ValueError("Test start date must be before or equal to test end date")
            if not train_end >= test_start:
                raise ValueError("Training end date should be at or after test start date")
        except (ValueError, TypeError, OverflowError) as e:
            print(f"Date validation error: {str(e)}")
            return False
        return True

    @staticmethod
    def _observations_per_window(window_seconds, median_diff, lower, upper):
        """Return how many observations fit in a window, clamped to [lower, upper].

        A non-positive (or NaN) median spacing means observations arrive
        in bursts sharing a timestamp, so the count is unbounded and the
        upper clamp is returned directly instead of dividing by zero.
        """
        if not median_diff > 0:
            return upper
        # Clamp before int() so an infinite ratio can never reach int().
        return int(min(max(window_seconds / median_diff, lower), upper))

    @classmethod
    def analyze_time_series(cls):
        """Derive ``sequence_length`` / ``prediction_length`` from the data.

        Reads the ``ts_event`` column of every ``*.csv`` file in
        ``DATA_DIR``, computes the time difference between consecutive
        (sorted) timestamps within each file, and sizes the sequence
        length to roughly 30 minutes and the prediction length to roughly
        5 minutes of observations at the median spacing, clamped to
        10..100 and 5..30 respectively.

        Returns:
            True on success.  On any failure the error is printed, the
            lengths fall back to 100 and 30, and False is returned.
        """
        try:
            all_diffs = []
            # Sorted so the analysis does not depend on directory order.
            csv_files = sorted(Path(cls.DATA_DIR).glob("*.csv"))

            for file in csv_files:
                # Only the timestamp column is needed here.
                df = pd.read_csv(file, usecols=['ts_event'])
                stamps = pd.to_datetime(df['ts_event']).sort_values()
                time_diffs = stamps.diff().dt.total_seconds()
                all_diffs.extend(time_diffs.dropna().tolist())

            if not all_diffs:
                raise ValueError("No valid time differences found in the data")

            median_diff = float(np.median(all_diffs))
            mean_diff = np.mean(all_diffs)
            std_diff = np.std(all_diffs)

            cls.sequence_length = cls._observations_per_window(
                30 * 60, median_diff, *cls._SEQUENCE_BOUNDS
            )
            cls.prediction_length = cls._observations_per_window(
                5 * 60, median_diff, *cls._PREDICTION_BOUNDS
            )

            print(f"\nTime Series Analysis Results:")
            print(f"Median time between observations: {median_diff:.2f} seconds")
            print(f"Mean time between observations: {mean_diff:.2f} seconds")
            print(f"Standard deviation: {std_diff:.2f} seconds")
            print(f"Selected sequence length: {cls.sequence_length} observations")
            print(f"Selected prediction length: {cls.prediction_length} observations")

            return True
        except Exception as e:
            # Deliberate best-effort: any I/O or parsing problem falls back
            # to the default lengths instead of aborting the run.
            print(f"Error analyzing time series: {str(e)}")
            cls.sequence_length = 100
            cls.prediction_length = 30
            return False

    @classmethod
    def initialize(cls):
        """Validate the dates and derive the dynamic lengths.

        Raises:
            ValueError: if the date validation fails.

        Returns:
            True.  A failed time-series analysis only prints a warning,
            because default lengths have been set in that case.
        """
        if not cls.validate_dates():
            raise ValueError("Date validation failed")
        if not cls.analyze_time_series():
            print("Warning: Using default sequence and prediction lengths")
        return True


# Data Loading Function


In [60]:
def load_csv_data(directory, columns=('ts_event', 'price')):
    """Load every ``.csv`` file in *directory* into one DataFrame.

    Files are read in sorted filename order so the row order of the
    result is deterministic (``os.listdir`` order is arbitrary).

    Args:
        directory: Path of the folder to scan (not recursive).
        columns: Names of the columns to keep from each file.

    Returns:
        A DataFrame with the selected columns of all files concatenated
        and a fresh 0..n-1 index.

    Raises:
        ValueError: if the directory contains no ``.csv`` files.
        KeyError: if a file lacks one of the requested columns.
    """
    # A tuple passed to ``df[...]`` would be treated as a single key.
    wanted = list(columns)
    csv_names = sorted(
        name for name in os.listdir(directory) if name.endswith('.csv')
    )
    if not csv_names:
        raise ValueError(f"No CSV files found in {directory!r}")

    frames = [
        pd.read_csv(os.path.join(directory, name))[wanted]
        for name in csv_names
    ]
    return pd.concat(frames, ignore_index=True)

# Data Preprocessing Function


In [61]:
def preprocess_data(train_df, test_df, sequence_length=60):
    """Scale the ``price`` column and cut both splits into sliding windows.

    The MinMaxScaler is fitted on the training prices only; the test
    prices are transformed with that same fitted scaler.

    Args:
        train_df: DataFrame with a ``price`` column used for fitting.
        test_df: DataFrame with a ``price`` column to transform.
        sequence_length: Window length passed to ``create_sequences``.

    Returns:
        ``(X_train, y_train, X_test, y_test, scaler)``.
    """
    def _price_column(frame):
        # The scaler is given the prices as a single-column 2-D array.
        return frame['price'].values.reshape(-1, 1)

    scaler = MinMaxScaler()
    scaled_train = scaler.fit_transform(_price_column(train_df))
    scaled_test = scaler.transform(_price_column(test_df))

    train_windows, train_targets = create_sequences(scaled_train, sequence_length)
    test_windows, test_targets = create_sequences(scaled_test, sequence_length)

    return train_windows, train_targets, test_windows, test_targets, scaler

def create_sequences(data, sequence_length):
    """Build sliding-window inputs and next-step targets from *data*.

    Window ``i`` is ``data[i:i + sequence_length]`` and its target is
    ``data[i + sequence_length]``.

    Args:
        data: Array-like indexed along its first axis (time).
        sequence_length: Number of consecutive rows per window (>= 1).

    Returns:
        ``(X, y)`` where ``X`` has shape
        ``(n_windows, sequence_length, *data.shape[1:])`` and ``y`` has
        shape ``(n_windows, *data.shape[1:])``.  When *data* has too few
        rows for a single window, both are empty but keep those trailing
        dimensions.

    Raises:
        ValueError: if ``sequence_length`` is smaller than 1.
    """
    if sequence_length < 1:
        raise ValueError(
            f"sequence_length must be a positive integer, got {sequence_length!r}"
        )

    values = np.asarray(data)
    n_windows = max(len(values) - sequence_length, 0)

    # Row i of ``offsets`` holds the indices i, i+1, ..., i+sequence_length-1,
    # so one fancy-indexing call gathers every window at once.
    offsets = np.arange(n_windows)[:, None] + np.arange(sequence_length)[None, :]
    X = values[offsets]
    # Copy so the targets do not alias the caller's input array.
    y = values[sequence_length:sequence_length + n_windows].copy()
    return X, y


# Custom Dataset


In [62]:
class PriceDataset(Dataset):
    """Dataset pairing each input window with its target value.

    Both arrays are converted to float tensors once, at construction.
    """

    def __init__(self, X, y):
        self.X, self.y = (torch.FloatTensor(array) for array in (X, y))

    def __len__(self):
        # Number of samples is the size of the first axis of the inputs.
        return self.X.size(0)

    def __getitem__(self, idx):
        sample = self.X[idx]
        target = self.y[idx]
        return sample, target

# LSTM Model


In [63]:
class LSTMModel(nn.Module):
    """Stacked LSTM followed by a linear layer producing one value per sample.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of each LSTM layer's hidden state.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_size=1, hidden_size=50, num_layers=1):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Run the LSTM from a zero initial state and project its last step."""
        # Hidden and cell states share one shape: (layers, batch, hidden).
        state_shape = (self.num_layers, x.size(0), self.hidden_size)
        initial_hidden = torch.zeros(state_shape, device=x.device)
        initial_cell = torch.zeros(state_shape, device=x.device)

        sequence_out, _ = self.lstm(x, (initial_hidden, initial_cell))
        # batch_first=True, so axis 1 is time; keep only the final step.
        final_step = sequence_out[:, -1, :]
        return self.fc(final_step)

# Training Function


In [64]:
def train_model(model, train_loader, criterion, optimizer, epochs=None):
    """Train *model* for a number of epochs and report the loss per epoch.

    Args:
        model: The ``nn.Module`` to optimise (switched to train mode).
        train_loader: Iterable yielding ``(X_batch, y_batch)`` pairs; it
            is iterated once per epoch.
        criterion: Loss function called as ``criterion(prediction, target)``.
        optimizer: Optimizer already bound to the model's parameters.
        epochs: Number of passes over ``train_loader``.  Defaults to
            ``Config.EPOCHS`` when omitted.

    Returns:
        A list with the mean batch loss of each epoch.

    Raises:
        ValueError: if ``train_loader`` yields no batches.
    """
    if epochs is None:
        epochs = Config.EPOCHS

    model.train()
    history = []
    for epoch in range(epochs):
        running_loss = 0.0
        batch_count = 0
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            y_pred = model(X_batch)
            loss = criterion(y_pred, y_batch)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            batch_count += 1

        if batch_count == 0:
            # Otherwise the report below would reference a loss that was
            # never computed.
            raise ValueError(
                "train_loader yielded no batches; cannot train on an empty dataset"
            )

        # Mean over the epoch's batches: a single (last) batch loss is a
        # noisy stand-in for how the epoch went.
        epoch_loss = running_loss / batch_count
        history.append(epoch_loss)
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {epoch_loss:.4f}')

    return history


# Prediction Function


In [65]:
def predict(model, test_data, scaler, batch_size=1024):
    """Run *model* over *test_data* and return predictions in original units.

    Args:
        model: Trained ``nn.Module``; it is switched to eval mode.
        test_data: Array-like of input windows, one sample per entry
            along the first axis.
        scaler: Object whose ``inverse_transform`` maps a 2-D array of
            model outputs back to the original scale.
        batch_size: Number of windows evaluated per forward pass.

    Returns:
        A list of floats, one per input window (the first output column
        after inverse scaling).  Empty input yields an empty list.
    """
    model.eval()
    if len(test_data) == 0:
        return []

    inputs = torch.as_tensor(test_data, dtype=torch.float32)

    # Evaluate on whatever device the model lives on; a parameter-free
    # model is treated as living on the CPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    outputs = []
    with torch.no_grad():
        for start in range(0, len(inputs), batch_size):
            batch = inputs[start:start + batch_size].to(device)
            # .cpu() first: .numpy() is only available for CPU tensors.
            outputs.append(model(batch).cpu().numpy())

    # One inverse_transform call for all rows instead of one per sample.
    restored = scaler.inverse_transform(np.concatenate(outputs, axis=0))
    return restored[:, 0].tolist()


# Visualization

In [None]:
def plot_predicted_vs_actual(df):
    """Plot actual against predicted prices over time and report the RMSE.

    Args:
        df: DataFrame with ``ts_event``, ``actual_price`` and
            ``predicted_price`` columns.

    Returns:
        The root-mean-square error between the two price columns (it is
        also printed).
    """
    # Imported here because nothing at file level brings ``plt`` into
    # scope; without this the first plt call raises NameError.
    import matplotlib.pyplot as plt

    plt.figure(figsize=(12, 6))
    plt.plot(df['ts_event'], df['actual_price'], label='Actual Price')
    plt.plot(df['ts_event'], df['predicted_price'], label='Predicted Price')
    plt.xlabel('Time')
    plt.ylabel('Price')
    plt.title('Predicted vs Actual Prices')
    plt.legend()
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    # Calculate and print RMSE
    squared_error = (df['actual_price'] - df['predicted_price']) ** 2
    rmse = np.sqrt(squared_error.mean())
    print(f"Root Mean Square Error: {rmse}")
    return rmse

# Main execution


In [None]:
if __name__ == "__main__":
    # Initialize configuration (validates dates, derives sequence length)
    Config.initialize()

    # Seed the RNGs so batch shuffling and weight initialisation are
    # repeatable from run to run.
    np.random.seed(Config.RANDOM_SEED)
    torch.manual_seed(Config.RANDOM_SEED)

    # Load data
    df = load_csv_data(Config.DATA_DIR)
    # utc=True matches how the Config boundary dates are parsed below.
    df['ts_event'] = pd.to_datetime(df['ts_event'], utc=True)
    # Windows are built from consecutive rows, so the rows must be in
    # chronological order regardless of how the files were concatenated.
    df = df.sort_values('ts_event', kind='stable').reset_index(drop=True)

    # Split data into train and test sets
    train_mask = (df['ts_event'] >= pd.to_datetime(Config.TRAIN_START_DATE, utc=True)) & \
                 (df['ts_event'] <= pd.to_datetime(Config.TRAIN_END_DATE, utc=True))
    test_mask = (df['ts_event'] >= pd.to_datetime(Config.TEST_START_DATE, utc=True)) & \
                (df['ts_event'] <= pd.to_datetime(Config.TEST_END_DATE, utc=True))
    # The configured ranges share a boundary timestamp; keep such rows in
    # the training set only so no row appears in both splits.
    test_mask &= ~train_mask

    train_df = df[train_mask]
    test_df = df[test_mask]

    # Each split needs more rows than one window to yield any sample.
    seq_len = Config.sequence_length
    for split_name, split_df in (("train", train_df), ("test", test_df)):
        if len(split_df) <= seq_len:
            raise ValueError(
                f"The {split_name} split has {len(split_df)} rows, but more than "
                f"{seq_len} are required to build a single sequence"
            )

    # Preprocess data
    X_train, y_train, X_test, y_test, scaler = preprocess_data(train_df, test_df, sequence_length=seq_len)

    # Create dataset and dataloader for training data
    train_dataset = PriceDataset(X_train, y_train)
    train_loader = DataLoader(train_dataset, batch_size=Config.BATCH_SIZE, shuffle=True, num_workers=Config.NUM_WORKERS)

    # Initialize model, loss function, and optimizer
    model = LSTMModel(hidden_size=Config.HIDDEN_SIZE, num_layers=Config.NUM_LAYERS)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=Config.LEARNING_RATE)

    # Train the model
    train_model(model, train_loader, criterion, optimizer)

    # Make predictions on test data
    test_predictions = predict(model, X_test, scaler)

    # Create a DataFrame with test predictions.  The first seq_len test rows
    # only serve as input context, so predictions start at row seq_len.
    test_dates = test_df['ts_event'].iloc[seq_len:].reset_index(drop=True)
    predictions_df = pd.DataFrame({
        'ts_event': test_dates,
        'predicted_price': test_predictions,
        'actual_price': scaler.inverse_transform(y_test.reshape(-1, 1)).flatten()
    })

    # Plot predicted vs actual prices
    plot_predicted_vs_actual(predictions_df)

    print(predictions_df)



Validating date ranges:
Train period: 2018-05-02 08:44:39.292059872+00:00 to 2018-05-02 23:59:55.940965414+00:00
Test period: 2018-05-02 23:59:55.940964414+00:00 to 2018-05-03 23:59:58.000001+00:00

Time Series Analysis Results:
Median time between observations: 0.00 seconds
Mean time between observations: 1.69 seconds
Standard deviation: 27.72 seconds
Selected sequence length: 100 observations
Selected prediction length: 30 observations


TypeError: preprocess_data() missing 1 required positional argument: 'test_df'