In [14]:
import pandas as pd
import os
import plotly.express as px

# Define file path
file_path = os.path.join('dataset', 'energydata_complete.csv')

# Load dataset
df = pd.read_csv(file_path)

# Read and display first dataframe head
df.head()

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,2016-01-11 17:40:00,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,...,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


In [15]:
# Select only numeric columns for correlation analysis
numeric_df = df.select_dtypes(include=['float64', 'int64'])

# Calculate correlation matrix
corr_matrix = numeric_df.corr()

# Display correlation with 'Appliances' sorted descending
print("Correlation with Appliances:")
print(corr_matrix['Appliances'].sort_values(ascending=False))

# Visualize correlation matrix
fig = px.imshow(corr_matrix, text_auto=False, aspect="auto", title="Correlation Matrix")
fig.show()

Correlation with Appliances:
Appliances     1.000000
lights         0.197278
T2             0.120073
T6             0.117638
T_out          0.099155
Windspeed      0.087122
RH_1           0.086031
T3             0.085060
T1             0.055447
T4             0.040281
T8             0.039572
RH_3           0.036292
T7             0.025801
T5             0.019760
RH_4           0.016965
Tdewpoint      0.015353
T9             0.010010
RH_5           0.006955
Visibility     0.000230
rv1           -0.011145
rv2           -0.011145
Press_mm_hg   -0.034885
RH_9          -0.051462
RH_7          -0.055642
RH_2          -0.060465
RH_6          -0.083178
RH_8          -0.094039
RH_out        -0.152282
Name: Appliances, dtype: float64


In [16]:
# Convert date column to datetime
df['date'] = pd.to_datetime(df['date'])

# Sort by date to ensure proper lag calculation
df = df.sort_values('date').reset_index(drop=True)

# Extract time features
df['month'] = df['date'].dt.month
df['day_of_week'] = df['date'].dt.dayofweek
df['hour'] = df['date'].dt.hour

# === LAG FEATURES ===
# Data is recorded every 10 minutes, so:
# 1 hour = 6 rows, 6 hours = 36 rows, 24 hours = 144 rows
df['Appliances_lag_1h'] = df['Appliances'].shift(6)
df['Appliances_lag_6h'] = df['Appliances'].shift(36)
df['Appliances_lag_24h'] = df['Appliances'].shift(144)

# === ROLLING AVERAGES ===
df['Appliances_roll_3h'] = df['Appliances'].rolling(window=18).mean()
df['Appliances_roll_6h'] = df['Appliances'].rolling(window=36).mean()
df['Appliances_roll_12h'] = df['Appliances'].rolling(window=72).mean()

# Drop rows with NaN (from lag/rolling)
df = df.dropna().reset_index(drop=True)

# Drop date and lights
df.drop(columns=['date', 'lights'], inplace=True)

print(f"Dataset size after lag features: {len(df)} rows")
print(f"New features: Appliances_lag_1h, Appliances_lag_6h, Appliances_lag_24h")
print(f"Rolling features: Appliances_roll_3h, Appliances_roll_6h, Appliances_roll_12h")
df.head()

Dataset size after lag features: 19591 rows
New features: Appliances_lag_1h, Appliances_lag_6h, Appliances_lag_24h
Rolling features: Appliances_roll_3h, Appliances_roll_6h, Appliances_roll_12h


Unnamed: 0,Appliances,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,...,rv2,month,day_of_week,hour,Appliances_lag_1h,Appliances_lag_6h,Appliances_lag_24h,Appliances_roll_3h,Appliances_roll_6h,Appliances_roll_12h
0,60,20.066667,42.833333,19.0,42.418182,19.79,44.7,19.26,42.56,17.6,...,36.465481,1,1,17,30.0,50.0,60.0,71.111111,101.388889,92.361111
1,60,20.0,42.6725,19.0,42.433333,19.79,44.663333,19.2,42.56,17.6,...,22.628542,1,1,17,40.0,50.0,60.0,71.111111,101.666667,92.5
2,210,20.0,42.53,18.99,42.471818,19.79,44.59,19.2,42.626667,17.6,...,8.756338,1,1,17,50.0,50.0,50.0,79.444444,106.111111,94.861111
3,380,20.033333,43.496667,18.902222,42.58,19.823333,44.59,19.2,42.76,17.6,...,37.334913,1,1,17,40.0,60.0,50.0,80.0,115.0,99.583333
4,370,20.033333,42.963333,18.89,42.56,19.89,44.59,19.36,43.566667,17.533333,...,5.249382,1,1,17,40.0,60.0,60.0,89.444444,123.611111,104.166667


In [17]:
import time
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

# Define Target (y) and Features (X)
y = df['Appliances']
X = df.drop(columns=['Appliances'])

print(f"Target: Appliances")
print(f"Features ({len(X.columns)}): {list(X.columns)}")

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features for Neural Network
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Training samples: {len(X_train)}, Test samples: {len(X_test)}")

Target: Appliances
Features (35): ['T1', 'RH_1', 'T2', 'RH_2', 'T3', 'RH_3', 'T4', 'RH_4', 'T5', 'RH_5', 'T6', 'RH_6', 'T7', 'RH_7', 'T8', 'RH_8', 'T9', 'RH_9', 'T_out', 'Press_mm_hg', 'RH_out', 'Windspeed', 'Visibility', 'Tdewpoint', 'rv1', 'rv2', 'month', 'day_of_week', 'hour', 'Appliances_lag_1h', 'Appliances_lag_6h', 'Appliances_lag_24h', 'Appliances_roll_3h', 'Appliances_roll_6h', 'Appliances_roll_12h']
Training samples: 15672, Test samples: 3919


In [27]:
# Define all models including Neural Network
models = {
    'Linear Regression': (LinearRegression(), False),
    'Random Forest': (RandomForestRegressor(n_estimators=200, max_depth=15, random_state=42, n_jobs=-1), False),
    'XGBoost': (XGBRegressor(n_estimators=200, max_depth=7, learning_rate=0.1, random_state=42, n_jobs=-1), False),
    'MLP Neural Network': (MLPRegressor(hidden_layer_sizes=(128, 64, 32), max_iter=500, 
                                         early_stopping=True, random_state=42), True)
}

results = []

for name, (model, use_scaled) in models.items():
    print(f"Training {name}...")
    
    X_tr = X_train_scaled if use_scaled else X_train
    X_te = X_test_scaled if use_scaled else X_test
    
    # Training
    start_train = time.time()
    model.fit(X_tr, y_train)
    train_time = time.time() - start_train
    
    # Prediction
    start_pred = time.time()
    y_pred = model.predict(X_te)
    pred_time = time.time() - start_pred
    
    # Test metrics
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    
    # Train metrics for overfitting check
    y_pred_train = model.predict(X_tr)
    r2_train = r2_score(y_train, y_pred_train)
    
    results.append({
        'Model': name,
        'MAE': round(mae, 2),
        'RMSE': round(rmse, 2),
        'R²': round(r2, 4),
        'Train R²': round(r2_train, 4),
        'R² Gap': round(r2_train - r2, 4),
        'Train Time (s)': round(train_time, 3),
        'Pred Time (s)': round(pred_time, 5)
    })

results_df = pd.DataFrame(results)
print("\n=== Model Comparison (with Lag Features) ===")
results_df.sort_values('R²', ascending=False)

Training Linear Regression...
Training Random Forest...
Training XGBoost...
Training MLP Neural Network...

=== Model Comparison (with Lag Features) ===


Unnamed: 0,Model,MAE,RMSE,R²,Train R²,R² Gap,Train Time (s),Pred Time (s)
2,XGBoost,31.42,65.52,0.6104,0.9187,0.3083,0.631,0.00636
1,Random Forest,32.56,68.53,0.5737,0.9026,0.3288,6.524,0.05184
3,MLP Neural Network,37.42,74.0,0.503,0.659,0.156,7.483,0.00352
0,Linear Regression,43.67,82.46,0.3828,0.3925,0.0097,0.012,0.00245


In [25]:
# Train vs Test Performance Comparison (Overfitting Check)
train_test_comparison = []

for name, (model, use_scaled) in models.items():
    X_tr = X_train_scaled if use_scaled else X_train
    X_te = X_test_scaled if use_scaled else X_test
    
    # Predictions
    y_pred_train = model.predict(X_tr)
    y_pred_test = model.predict(X_te)
    
    # Metrics
    r2_train = r2_score(y_train, y_pred_train)
    r2_test = r2_score(y_test, y_pred_test)
    mae_train = mean_absolute_error(y_train, y_pred_train)
    mae_test = mean_absolute_error(y_test, y_pred_test)
    rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
    rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))
    
    # MAPE calculation (avoid division by zero)
    mape_train = np.mean(np.abs((y_train - y_pred_train) / np.maximum(y_train, 1))) * 100
    mape_test = np.mean(np.abs((y_test - y_pred_test) / np.maximum(y_test, 1))) * 100
    
    train_test_comparison.append({
        'Model': name,
        'Train R²': round(r2_train, 4),
        'Test R²': round(r2_test, 4),
        'R² Gap': round(r2_train - r2_test, 4),
        'Train MAE': round(mae_train, 2),
        'Test MAE': round(mae_test, 2),
        'Train MAPE (%)': round(mape_train, 2),
        'Test MAPE (%)': round(mape_test, 2),
        'Train RMSE': round(rmse_train, 2),
        'Test RMSE': round(rmse_test, 2)
    })

comparison_df = pd.DataFrame(train_test_comparison)
print("=== Train vs Test Performance ===")
print("(R² Gap indicates overfitting: higher gap = more overfitting)")
comparison_df.sort_values('Test R²', ascending=False)

=== Train vs Test Performance ===
(R² Gap indicates overfitting: higher gap = more overfitting)


Unnamed: 0,Model,Train R²,Test R²,R² Gap,Train MAE,Test MAE,Train MAPE (%),Test MAPE (%),Train RMSE,Test RMSE
2,XGBoost,0.9187,0.6104,0.3083,16.48,31.42,19.79,30.9,29.04,65.52
1,Random Forest,0.9026,0.5737,0.3288,17.29,32.56,18.76,31.41,31.78,68.53
3,MLP Neural Network,0.659,0.503,0.156,31.67,37.42,33.53,37.65,59.45,74.0
0,Linear Regression,0.3925,0.3828,0.0097,42.16,43.67,44.73,45.87,79.36,82.46


In [20]:
# Feature Importance from XGBoost
xgb_model = models['XGBoost'][0]
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': xgb_model.feature_importances_
}).sort_values('Importance', ascending=False)

print("=== Top 15 Feature Importances (XGBoost) ===")
print(feature_importance.head(15))

fig = px.bar(feature_importance.head(15), x='Importance', y='Feature', orientation='h',
             title='Top 15 Feature Importances')
fig.update_layout(yaxis={'categoryorder': 'total ascending'})
fig.show()

=== Top 15 Feature Importances (XGBoost) ===
                Feature  Importance
32   Appliances_roll_3h    0.187662
28                 hour    0.102380
29    Appliances_lag_1h    0.037112
33   Appliances_roll_6h    0.036591
34  Appliances_roll_12h    0.030341
12                   T7    0.027590
14                   T8    0.026843
4                    T3    0.026785
18                T_out    0.026771
8                    T5    0.026314
10                   T6    0.026196
6                    T4    0.024977
27          day_of_week    0.024722
31   Appliances_lag_24h    0.023781
20               RH_out    0.022986


In [None]:
# TensorFlow/Keras Deep Learning Models
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, callbacks

# Set random seed for reproducibility
tf.random.set_seed(42)

# Prepare data for TensorFlow (already scaled)
X_train_tf = X_train_scaled
X_test_tf = X_test_scaled
y_train_tf = y_train.values
y_test_tf = y_test.values

# Define multiple TensorFlow models
def create_deep_nn():
    """Deep Feedforward Neural Network"""
    model = keras.Sequential([
        layers.Dense(256, activation='relu', input_shape=(X_train_tf.shape[1],)),
        layers.Dropout(0.3),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.2),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.2),
        layers.Dense(32, activation='relu'),
        layers.Dense(1)
    ])
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model

def create_wide_deep_nn():
    """Wide & Deep Neural Network"""
    model = keras.Sequential([
        layers.Dense(512, activation='relu', input_shape=(X_train_tf.shape[1],)),
        layers.BatchNormalization(),
        layers.Dropout(0.3),
        layers.Dense(256, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.3),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.2),
        layers.Dense(64, activation='relu'),
        layers.Dense(1)
    ])
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001), 
                  loss='mse', metrics=['mae'])
    return model

def create_residual_nn():
    """Neural Network with Residual Connections"""
    inputs = layers.Input(shape=(X_train_tf.shape[1],))
    
    # First block
    x = layers.Dense(128, activation='relu')(inputs)
    x = layers.Dropout(0.2)(x)
    
    # Residual block 1
    residual = x
    x = layers.Dense(128, activation='relu')(x)
    x = layers.Dropout(0.2)(x)
    x = layers.Dense(128, activation='relu')(x)
    x = layers.Add()([x, residual])
    
    # Dense layers
    x = layers.Dense(64, activation='relu')(x)
    x = layers.Dropout(0.2)(x)
    x = layers.Dense(32, activation='relu')(x)
    outputs = layers.Dense(1)(x)
    
    model = keras.Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model

# Early stopping callback
early_stop = callbacks.EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True)

# Train and evaluate TensorFlow models
tf_results = []
tf_models = {
    'TF Deep NN': create_deep_nn,
    'TF Wide & Deep NN': create_wide_deep_nn,
    'TF Residual NN': create_residual_nn
}

for name, model_fn in tf_models.items():
    print(f"\n{'='*50}")
    print(f"Training {name}...")
    print(f"{'='*50}")
    
    model = model_fn()
    
    # Training
    start_train = time.time()
    history = model.fit(
        X_train_tf, y_train_tf,
        validation_split=0.2,
        epochs=100,
        batch_size=64,
        callbacks=[early_stop],
        verbose=0
    )
    train_time = time.time() - start_train
    
    # Prediction
    start_pred = time.time()
    y_pred = model.predict(X_test_tf, verbose=0).flatten()
    pred_time = time.time() - start_pred
    
    # Metrics
    mae = mean_absolute_error(y_test_tf, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test_tf, y_pred))
    r2 = r2_score(y_test_tf, y_pred)
    
    # Train metrics
    y_pred_train = model.predict(X_train_tf, verbose=0).flatten()
    r2_train = r2_score(y_train_tf, y_pred_train)
    
    tf_results.append({
        'Model': name,
        'MAE': round(mae, 2),
        'RMSE': round(rmse, 2),
        'R²': round(r2, 4),
        'Train R²': round(r2_train, 4),
        'R² Gap': round(r2_train - r2, 4),
        'Train Time (s)': round(train_time, 3),
        'Pred Time (s)': round(pred_time, 5),
        'Epochs': len(history.history['loss'])
    })
    
    print(f"✓ {name} - Test R²: {r2:.4f}, MAE: {mae:.2f}, RMSE: {rmse:.2f}")

# Combine all results
tf_results_df = pd.DataFrame(tf_results)
combined_results = pd.concat([results_df, tf_results_df], ignore_index=True)

print("\n" + "="*80)
print("=== COMPLETE MODEL COMPARISON (All Models) ===")
print("="*80)
combined_results.sort_values('R²', ascending=False)


Training TF Deep NN...



Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.



✓ TF Deep NN - Test R²: 0.4973, MAE: 33.84, RMSE: 74.42

Training TF Wide & Deep NN...



Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.



✓ TF Wide & Deep NN - Test R²: 0.2687, MAE: 40.66, RMSE: 89.76

Training TF Residual NN...
✓ TF Residual NN - Test R²: 0.3053, MAE: 42.02, RMSE: 87.49

=== COMPLETE MODEL COMPARISON (All Models) ===
                Model    MAE   RMSE      R²  Train Time (s)  Pred Time (s)  \
2             XGBoost  31.42  65.52  0.6104           0.729        0.00968   
1       Random Forest  32.56  68.53  0.5737           6.611        0.06259   
3  MLP Neural Network  37.42  74.00  0.5030           7.463        0.00269   
4          TF Deep NN  33.84  74.42  0.4973          42.661        0.21983   
0   Linear Regression  43.67  82.46  0.3828           0.018        0.00216   
6      TF Residual NN  42.02  87.49  0.3053           8.580        0.26363   
5   TF Wide & Deep NN  40.66  89.76  0.2687          12.473        0.27602   

   Train R²  R² Gap  Epochs  
2       NaN     NaN     NaN  
1       NaN     NaN     NaN  
3       NaN     NaN     NaN  
4    0.7209  0.2235   100.0  
0       NaN     NaN     Na

In [None]:
# PyTorch Deep Learning Models
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# Set random seed for reproducibility
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed(42)

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Prepare data for PyTorch
X_train_pt = torch.FloatTensor(X_train_scaled).to(device)
X_test_pt = torch.FloatTensor(X_test_scaled).to(device)
y_train_pt = torch.FloatTensor(y_train.values).reshape(-1, 1).to(device)
y_test_pt = torch.FloatTensor(y_test.values).reshape(-1, 1).to(device)

input_dim = X_train_pt.shape[1]

# Define PyTorch model architectures
class DeepNN(nn.Module):
    """Deep Feedforward Neural Network"""
    def __init__(self, input_dim):
        super(DeepNN, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )
    
    def forward(self, x):
        return self.network(x)

class WideDeepNN(nn.Module):
    """Wide & Deep Neural Network with Batch Normalization"""
    def __init__(self, input_dim):
        super(WideDeepNN, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )
    
    def forward(self, x):
        return self.network(x)

class ResidualBlock(nn.Module):
    """Residual Block for skip connections"""
    def __init__(self, dim):
        super(ResidualBlock, self).__init__()
        self.fc1 = nn.Linear(dim, dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
        self.fc2 = nn.Linear(dim, dim)
    
    def forward(self, x):
        residual = x
        out = self.relu(self.fc1(x))
        out = self.dropout(out)
        out = self.fc2(out)
        out += residual
        return self.relu(out)

class ResidualNN(nn.Module):
    """Neural Network with Residual Connections"""
    def __init__(self, input_dim):
        super(ResidualNN, self).__init__()
        self.input_layer = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.2)
        )
        self.res_block1 = ResidualBlock(128)
        self.res_block2 = ResidualBlock(128)
        self.output_layers = nn.Sequential(
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )
    
    def forward(self, x):
        x = self.input_layer(x)
        x = self.res_block1(x)
        x = self.res_block2(x)
        x = self.output_layers(x)
        return x

# Training function
def train_pytorch_model(model, X_train, y_train, X_val, y_val, epochs=100, batch_size=64, lr=0.001):
    """Train PyTorch model with early stopping"""
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    
    # Create data loaders
    train_dataset = TensorDataset(X_train, y_train)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    
    best_val_loss = float('inf')
    patience = 15
    patience_counter = 0
    best_model_state = None
    
    for epoch in range(epochs):
        # Training
        model.train()
        train_loss = 0.0
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        
        # Validation
        model.eval()
        with torch.no_grad():
            val_outputs = model(X_val)
            val_loss = criterion(val_outputs, y_val).item()
        
        # Early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
            best_model_state = model.state_dict().copy()
        else:
            patience_counter += 1
            if patience_counter >= patience:
                break
    
    # Restore best model
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
    
    return model, epoch + 1

# Split train data for validation
val_size = int(0.2 * len(X_train_pt))
X_train_train = X_train_pt[:-val_size]
y_train_train = y_train_pt[:-val_size]
X_train_val = X_train_pt[-val_size:]
y_train_val = y_train_pt[-val_size:]

# Train and evaluate PyTorch models
pytorch_results = []
pytorch_models_dict = {
    'PyTorch Deep NN': DeepNN,
    'PyTorch Wide & Deep NN': WideDeepNN,
    'PyTorch Residual NN': ResidualNN
}

for name, model_class in pytorch_models_dict.items():
    print(f"\n{'='*50}")
    print(f"Training {name}...")
    print(f"{'='*50}")
    
    model = model_class(input_dim).to(device)
    
    # Training
    start_train = time.time()
    model, epochs_trained = train_pytorch_model(
        model, X_train_train, y_train_train, X_train_val, y_train_val,
        epochs=100, batch_size=64, lr=0.001
    )
    train_time = time.time() - start_train
    
    # Prediction
    model.eval()
    with torch.no_grad():
        start_pred = time.time()
        y_pred = model(X_test_pt).cpu().numpy().flatten()
        pred_time = time.time() - start_pred
        
        # Train predictions for overfitting check
        y_pred_train = model(X_train_pt).cpu().numpy().flatten()
    
    # Metrics
    mae = mean_absolute_error(y_test.values, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test.values, y_pred))
    r2 = r2_score(y_test.values, y_pred)
    
    # Train metrics
    r2_train = r2_score(y_train.values, y_pred_train)
    
    pytorch_results.append({
        'Model': name,
        'MAE': round(mae, 2),
        'RMSE': round(rmse, 2),
        'R²': round(r2, 4),
        'Train R²': round(r2_train, 4),
        'R² Gap': round(r2_train - r2, 4),
        'Train Time (s)': round(train_time, 3),
        'Pred Time (s)': round(pred_time, 5),
        'Epochs': epochs_trained
    })
    
    print(f"✓ {name} - Test R²: {r2:.4f}, MAE: {mae:.2f}, RMSE: {rmse:.2f}")

# Append PyTorch results to combined results
pytorch_results_df = pd.DataFrame(pytorch_results)
combined_results = pd.concat([results_df, tf_results_df, pytorch_results_df], ignore_index=True)

print("\n" + "="*80)
print("=== FINAL MODEL COMPARISON (All Models Including PyTorch) ===")
print("="*80)
combined_results.sort_values('R²', ascending=False)

Using device: cpu

Training PyTorch Deep NN...
✓ PyTorch Deep NN - Test R²: 0.5006, MAE: 34.70, RMSE: 74.18

Training PyTorch Wide & Deep NN...
✓ PyTorch Wide & Deep NN - Test R²: 0.5066, MAE: 34.17, RMSE: 73.73

Training PyTorch Residual NN...
✓ PyTorch Residual NN - Test R²: 0.4610, MAE: 34.06, RMSE: 77.07

=== FINAL MODEL COMPARISON (All Models Including PyTorch) ===


Unnamed: 0,Model,MAE,RMSE,R²,Train R²,R² Gap,Train Time (s),Pred Time (s),Epochs
2,XGBoost,31.42,65.52,0.6104,0.9187,0.3083,0.631,0.00636,
1,Random Forest,32.56,68.53,0.5737,0.9026,0.3288,6.524,0.05184,
8,PyTorch Wide & Deep NN,34.17,73.73,0.5066,0.6573,0.1507,44.242,0.00896,53.0
3,MLP Neural Network,37.42,74.0,0.503,0.659,0.156,7.483,0.00352,
7,PyTorch Deep NN,34.7,74.18,0.5006,0.6892,0.1886,50.838,0.00318,97.0
4,TF Deep NN,33.84,74.42,0.4973,0.7209,0.2235,42.661,0.21983,100.0
9,PyTorch Residual NN,34.06,77.07,0.461,0.7168,0.2559,62.755,0.00321,91.0
0,Linear Regression,43.67,82.46,0.3828,0.3925,0.0097,0.012,0.00245,
6,TF Residual NN,42.02,87.49,0.3053,0.3012,-0.0042,8.58,0.26363,15.0
5,TF Wide & Deep NN,40.66,89.76,0.2687,0.2766,0.0079,12.473,0.27602,15.0
