In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [2]:
def create_time_lagged_features(df, target_col, hours_ahead=6):
    """
    Create time-shifted features for prediction
    """
    df = df.sort_values('date')
    
    # Create lagged features 
    for col in df.select_dtypes(include=[np.number]).columns:
        if col not in ['hour', 'month', 'day', 'day_of_week', 'longitude', 'latitude', 'departures', 'STL1_GDS0_DBLY']:
            df[f'{col}_lag1'] = df[col].shift(1)
            df[f'{col}_lag3'] = df[col].shift(3)
            df[f'{col}_lag6'] = df[col].shift(6)
            df[f'{col}_lag12'] = df[col].shift(12)
    
    # Create target variable
    df['target'] = df[target_col].shift(-hours_ahead)
    return df

In [3]:
def preprocess_data(df, hours_ahead=6):
    """
    Preprocess the weather data for model training with time-shifted features
    """
    # Convert date to useful features
    df['date'] = pd.to_datetime(df['date'])
    df['hour'] = df['date'].dt.hour
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    df['day_of_week'] = df['date'].dt.dayofweek
    
    # Select relevant features
    base_features = [
        'longitude', 'latitude', 'hour', 'month', 'day', 'day_of_week',
        'STL1_GDS0_DBLY', '2T_GDS0_SFC', 'low_availability', 'high_availability',
        'complete', '2D_GDS0_SFC', 'STL2_GDS0_DBLY', 'STL3_GDS0_DBLY', 
        'SKT_GDS0_SFC', 'STL4_GDS0_DBLY', 'population', 'departures'
    ]
    
    # Handle missing values
    df['low_availability'] = df['low_availability'].fillna(0)
    df['high_availability'] = df['high_availability'].fillna(0)
    df['departures'] = df['departures'].fillna(0)
    
    # Create time-lagged features
    df = create_time_lagged_features(df[base_features + ['date']], 'STL1_GDS0_DBLY', hours_ahead)
    df = df.bfill().ffill()    
    df = df.drop(columns=['date', 'STL1_GDS0_DBLY'])
    
    return df

In [4]:
def train_model(X_train, y_train):
    """
    Train an MLP
    """
    model = MLPRegressor(
        hidden_layer_sizes=(512, 256, 128, 64, 32, 16),
        activation='relu',
        solver='adam',
        max_iter=2500,
        early_stopping=True,
        validation_fraction=0.1,
        verbose=True,
        learning_rate='adaptive',
        learning_rate_init=0.0008,
        alpha=0.0001,
        random_state=42)
    
    model.fit(X_train, y_train)    
    return model

In [5]:
def evaluate_model(model, X, y, set_name=""):
    """
    Evaluate model performance with multiple metrics
    """
    predictions = model.predict(X)
    mse = mean_squared_error(y, predictions)
    mae = mean_absolute_error(y, predictions)
    r2 = model.score(X, y)
    
    print(f'{set_name} Results:')
    print(f'R² Score: {r2:.4f}')
    print(f'MSE: {mse:.4f}')
    print(f'MAE: {mae:.4f}')
    print(f'RMSE: {np.sqrt(mse):.4f}\n')
    
    return predictions

In [6]:
csv_file = 'dataset.csv'
full_data = pd.read_csv(csv_file)
full_data_clean = preprocess_data(full_data, hours_ahead=6)

In [None]:
if __name__ == '__main__':
    csv_file = 'dataset.csv'
    full_data = pd.read_csv(csv_file)
    
    # 1. Clean and preprocess data with time-lagged features
    full_data_clean = preprocess_data(full_data, hours_ahead=6)
    
    # 2. Split features and target
    y = full_data_clean['target']
    X = full_data_clean.drop(columns=['target'])
    
    # 3. Scale features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # 4. Split data chronologically
    train_size = int(len(X_scaled) * 0.85)
    X_train, X_test = X_scaled[:train_size], X_scaled[train_size:]
    y_train, y_test = y[:train_size], y[train_size:]
    
    # 5. Train model
    model = train_model(X_train, y_train)
    
    # 6. Evaluate model
    print("Model Evaluation:")
    train_preds = evaluate_model(model, X_train, y_train, "Training")
    test_preds = evaluate_model(model, X_test, y_test, "Test")

Iteration 1, loss = 703.63226986
Validation score: 0.061245
Iteration 2, loss = 33.19755484
Validation score: 0.026089
Iteration 3, loss = 32.46035061
Validation score: 0.122938
Iteration 4, loss = 31.77101378
Validation score: 0.056749
Iteration 5, loss = 31.11200401
Validation score: 0.140273
Iteration 6, loss = 31.00765594
Validation score: 0.153815
Iteration 7, loss = 30.74636679
Validation score: 0.164528
Iteration 8, loss = 30.11507687
Validation score: 0.135411
Iteration 9, loss = 30.28702238
Validation score: 0.113956
Iteration 10, loss = 30.21292979
Validation score: -0.006238
Iteration 11, loss = 30.18665557
Validation score: 0.160387
Iteration 12, loss = 29.89521725
Validation score: 0.146118
Iteration 13, loss = 29.70905002
Validation score: 0.140401
Iteration 14, loss = 29.62040336
Validation score: 0.195558
Iteration 15, loss = 29.57857424
Validation score: 0.166823
Iteration 16, loss = 29.42650067
Validation score: 0.186096
Iteration 17, loss = 29.31437639
Validation sco