In [13]:
import copy

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

# Keep notebook output readable by ignoring every warning category.
# NOTE(review): a blanket "ignore" also hides deprecation warnings; narrow it
# if a specific warning needs to stay visible.
import warnings
warnings.simplefilter('ignore')

# Thin shim so CatBoost can be used as the final step of an sklearn Pipeline
class CatBoostRegressorWrapper(CatBoostRegressor, BaseEstimator, RegressorMixin):
    """CatBoostRegressor that also inherits the sklearn estimator base classes.

    Nothing is overridden here: every method comes from the parent classes.
    Per the original author's note, BaseEstimator is included so that
    ``__sklearn_tags__`` is available on the estimator.

    NOTE(review): scikit-learn's developer guide recommends listing mixins to
    the left of BaseEstimator; the base order is left exactly as written here,
    confirm it resolves tags as intended on the installed sklearn version.
    """

# Load high correlation features data
data = pd.read_csv('preprocessed_real_estate_high_corr.csv')

# Load original data to get date_of_transfer, restricted to the study window
original_data = pd.read_csv('real_estate.csv')
original_data['date_of_transfer'] = pd.to_datetime(original_data['date_of_transfer'])
in_window = (
    (original_data['date_of_transfer'] >= '2025-04-01')
    & (original_data['date_of_transfer'] <= '2025-12-31')
)
original_data = original_data[in_window]

# Dates are attached to the preprocessed rows purely by position, so the two
# tables must describe the same rows in the same order. Make a row-count
# mismatch visible instead of silently truncating.
n_feature_rows = len(data)
n_date_rows = len(original_data)
if n_date_rows < n_feature_rows:
    raise ValueError(
        f"real_estate.csv has only {n_date_rows} rows inside the date window, "
        f"but the preprocessed table has {n_feature_rows}; cannot attach dates by position."
    )
if n_date_rows > n_feature_rows:
    print(
        f"WARNING: {n_date_rows - n_feature_rows} trailing date rows are dropped "
        f"to match the preprocessed table; verify that row order is identical."
    )

# Add date_of_transfer to preprocessed data (matching positions, not labels)
data['date_of_transfer'] = original_data['date_of_transfer'].values[:n_feature_rows]

print(f"Data shape: {data.shape}")
print(f"Columns: {data.columns.tolist()}")
print(f"Date range: {data['date_of_transfer'].min()} to {data['date_of_transfer'].max()}")

Data shape: (354089, 6)
Columns: ['price', 'postcode_mean_price', 'street_mean_price', 'town_city_mean_price', 'district_mean_price', 'date_of_transfer']
Date range: 2025-04-01 00:00:00 to 2025-11-28 00:00:00


In [14]:
# Put rows in chronological order; the later train/test cut is positional
data = data.sort_values(by='date_of_transfer').reset_index(drop=True)

# Column the models predict
TARGET = 'price'

# Every column other than the target and the timestamp is a model input
_non_feature_cols = (TARGET, 'date_of_transfer')
feature_cols = [col for col in data.columns if col not in _non_feature_cols]

print(
    f"Target: {TARGET}",
    f"Features: {feature_cols}",
    f"Total features: {len(feature_cols)}",
    "Data sorted by date_of_transfer",
    sep="\n",
)

Target: price
Features: ['postcode_mean_price', 'street_mean_price', 'town_city_mean_price', 'district_mean_price']
Total features: 4
Data sorted by date_of_transfer


In [15]:
# ============================================================
# PREPARE DATA FOR MODELING
# ============================================================

# Feature matrix and target vector
X, y = data[feature_cols], data[TARGET]

# Positional 80/20 cut: leading rows train, trailing rows test (no shuffling)
train_size = int(0.8 * len(data))
X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]

print(f"\nDataset size: {len(data)}")
print(f"Training samples: {len(X_train)}, Test samples: {len(X_test)}")
print(f"Feature matrix shape: {X.shape}")
print(f"Target shape: {y.shape}")


Dataset size: 354089
Training samples: 283271, Test samples: 70818
Feature matrix shape: (354089, 4)
Target shape: (354089,)


In [16]:
# NOTE(review): this cell recomputes the same X / y / 80-20 split as the
# previous cell; it is kept so the notebook's variables and output are unchanged.
X = data[feature_cols]
y = data[TARGET]  # single output target

# Positional split (no shuffling, rows are already in date order)
train_size = int(0.8 * len(data))
train_rows = slice(None, train_size)
test_rows = slice(train_size, None)
X_train, y_train = X.iloc[train_rows], y.iloc[train_rows]
X_test, y_test = X.iloc[test_rows], y.iloc[test_rows]

print(f"\nDataset size after feature engineering: {len(data)}")
print(f"Training samples: {len(X_train)}, Test samples: {len(X_test)}")




Dataset size after feature engineering: 354089
Training samples: 283271, Test samples: 70818


In [17]:
# Define models (single output regressors). Every estimator is wrapped in the
# same two-step pipeline: StandardScaler followed by the regressor.
models = {
    name: Pipeline([('scaler', StandardScaler()), ('model', regressor)])
    for name, regressor in (
        ('GradientBoosting', GradientBoostingRegressor(random_state=42)),
        ('LightGBM', LGBMRegressor(random_state=42, verbose=-1)),
        ('CatBoost', CatBoostRegressorWrapper(random_state=42, verbose=0)),
        ('AdaBoost', AdaBoostRegressor(random_state=42)),
        ('XGBoost', XGBRegressor(random_state=42)),
    )
}



In [18]:
def train_models(models, X_train, y_train):
    """Fit an independent copy of every pipeline and return the fitted copies.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator/pipeline exposing ``fit(X, y)``.
        The objects stored in this dict are not modified.
    X_train, y_train
        Training inputs and targets, passed unchanged to ``fit``.

    Returns
    -------
    dict
        Same keys in the same order as ``models``; each value is a fitted
        deep copy of the corresponding entry.
    """
    trained_models = {}
    for name, pipeline in models.items():
        print(f"Training {name}...")
        # Fit a copy rather than the shared object: fitting in place would make
        # every later call (e.g. one per forecast horizon) overwrite the models
        # returned by earlier calls, because they would all be the same objects.
        fitted = copy.deepcopy(pipeline)
        fitted.fit(X_train, y_train)
        trained_models[name] = fitted
    return trained_models



In [19]:
def evaluate_models(models, X_test, y_test):
    """Print and collect the test-set mean squared error of each fitted model.

    Parameters
    ----------
    models : dict
        Maps a display name to a fitted object exposing ``predict(X)``.
    X_test, y_test
        Held-out inputs and the true targets they are scored against.

    Returns
    -------
    dict
        Display name -> MSE, in the iteration order of ``models``.
    """
    scores = {}
    for label in models:
        error = mean_squared_error(y_test, models[label].predict(X_test))
        scores[label] = error
        print(f"{label} MSE: {error:.4f}")
    return scores



In [None]:
# NOTE(review): the import below looks like an accidental IDE auto-import;
# nothing in the visible cells uses `model`. It is kept so the namespace is
# unchanged; confirm it is unused and delete it.
from xml.parsers.expat import model

# Fit every pipeline on the training portion of the data
trained_models = train_models(models=models, X_train=X_train, y_train=y_train)

# Score the fitted pipelines on the held-out rows (prints one MSE line per model)
eval_results = evaluate_models(models=trained_models, X_test=X_test, y_test=y_test)

# ============================================================
# MULTI-HORIZON FORECASTING
# ============================================================

# Forecast offsets, labelled "days ahead" by the author.
# NOTE(review): prepare_horizon_data applies each value as y.shift(-horizon),
# i.e. as a row offset in the date-sorted table, not as a number of calendar days.
horizons = [1, 3, 5, 7, 14, 30]

def prepare_horizon_data(X, y, horizon):
    """Build an 80/20 positional split whose target is ``y`` shifted ``horizon`` rows ahead.

    Row ``i`` of the features is paired with the target found ``horizon`` rows
    later. The shift is positional (``Series.shift`` without a frequency), so
    ``horizon`` counts rows of ``y``, not calendar days. Rows whose shifted
    target is missing (the last ``horizon`` rows, plus any NaN already in
    ``y``) are dropped before splitting.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows, sharing their index with ``y``.
    y : pandas.Series
        Target values in the order the forecast should respect.
    horizon : int
        Number of rows to look ahead.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``: the first 80% of the remaining
        rows and the last 20%, in their original order.
    """
    y_horizon = y.shift(-horizon)
    valid_idx = y_horizon.notna()
    # Boolean indexing already returns new objects, so no explicit copy is needed
    X_horizon = X[valid_idx]
    y_horizon = y_horizon[valid_idx]
    train_size = int(0.8 * len(X_horizon))
    return (
        X_horizon.iloc[:train_size],
        X_horizon.iloc[train_size:],
        y_horizon.iloc[:train_size],
        y_horizon.iloc[train_size:],
    )



Training GradientBoosting...
Training LightGBM...
Training CatBoost...
Training AdaBoost...
Training XGBoost...
GradientBoosting MSE: 98168874195.7867
LightGBM MSE: 385257831765.5060
CatBoost MSE: 388055293922.5231
AdaBoost MSE: 154434799153.1350
XGBoost MSE: 383339598818.7050


In [21]:
# Per-horizon results, all keyed by the horizon value:
#   all_horizon_data    -> {model name: test-set predictions}
#   y_test_all_horizons -> true test targets
#   model_performances  -> {model name: MSE}
all_horizon_data, y_test_all_horizons, model_performances = {}, {}, {}

for horizon in horizons:
    print(f"\n=== Horizon: {horizon} days ===")
    X_train_h, X_test_h, y_train_h, y_test_h = prepare_horizon_data(X, y, horizon)

    # Fit every model on this horizon's training rows
    trained_h = train_models(models, X_train_h, y_train_h)

    # Keep the raw predictions for the plotting helpers
    horizon_preds = {
        name: pipeline.predict(X_test_h) for name, pipeline in trained_h.items()
    }
    all_horizon_data[horizon] = horizon_preds
    y_test_all_horizons[horizon] = y_test_h

    # Prints one MSE line per model and returns them as a dict
    perf = evaluate_models(trained_h, X_test_h, y_test_h)
    model_performances[horizon] = perf




=== Horizon: 1 days ===
Training GradientBoosting...
Training LightGBM...
Training CatBoost...
Training AdaBoost...
Training XGBoost...
GradientBoosting MSE: 603614470762.5846
LightGBM MSE: 714016211464.9281
CatBoost MSE: 1278477933787.1687
AdaBoost MSE: 7806559792114.2334
XGBoost MSE: 1565314490115.0593

=== Horizon: 3 days ===
Training GradientBoosting...
Training LightGBM...
Training CatBoost...
Training AdaBoost...
Training XGBoost...
GradientBoosting MSE: 636922751034.5118
LightGBM MSE: 698037567090.0652
CatBoost MSE: 803182815846.1000
AdaBoost MSE: 14425543726441.1055
XGBoost MSE: 9551341097289.6094

=== Horizon: 5 days ===
Training GradientBoosting...
Training LightGBM...
Training CatBoost...
Training AdaBoost...
Training XGBoost...
GradientBoosting MSE: 586986025374.2119
LightGBM MSE: 1297970341866.1284
CatBoost MSE: 1037315083843.0856
AdaBoost MSE: 596843601051.4246
XGBoost MSE: 751605859342.4871

=== Horizon: 7 days ===
Training GradientBoosting...
Training LightGBM...
Train

KeyboardInterrupt: 

In [None]:
# ============================================================
# VISUALIZATION FUNCTIONS (Adapted for single target)
# ============================================================

def plot_combined_horizons_zoomed(all_horizon_data, y_test_data, zoom_factor=0.15):
    """Draw one figure per horizon comparing actual values with each model's predictions.

    Parameters
    ----------
    all_horizon_data : dict
        horizon -> {model name: sequence of predictions}.
    y_test_data : dict
        horizon -> sequence of true values, in the same order as the predictions.
    zoom_factor : float
        Fraction of the overall value range added as padding above and below
        the plotted data on the y-axis.
    """
    model_colors = {
        'GradientBoosting': '#1f77b4',
        'LightGBM': '#ff7f0e',
        'CatBoost': '#2ca02c',
        'AdaBoost': '#d62728',
        'XGBoost': '#9467bd'
    }

    for horizon in sorted(all_horizon_data.keys()):
        # Work on plain positional arrays. A pandas Series passed to ax.plot is
        # drawn against its index labels, so test targets that keep their
        # original row labels would land far to the right of the predictions
        # (which are drawn at 0..n-1) and outside the x-limits set below.
        actual = np.asarray(y_test_data[horizon])
        predictions = {
            model_name: np.asarray(preds)
            for model_name, preds in sorted(all_horizon_data[horizon].items())
        }

        fig, ax = plt.subplots(figsize=(16, 8))

        # Plot actual values
        ax.plot(actual, label='Actual', linewidth=2.5, color='#333333', alpha=0.7, zorder=5, linestyle='-')

        # Plot model predictions
        for model_name, preds in predictions.items():
            color = model_colors.get(model_name, '#999999')
            ax.plot(preds, label=model_name, alpha=0.85, linewidth=2, color=color, linestyle='--')

        # Zooming: y-limits hug the data, padded by zoom_factor of the value range
        all_series = [actual] + list(predictions.values())
        y_min = min(np.min(s) for s in all_series)
        y_max = max(np.max(s) for s in all_series)
        y_padding = zoom_factor * (y_max - y_min)
        ax.set_ylim(y_min - y_padding, y_max + y_padding)

        ax.set_xlim(-50, len(actual) + 50)

        ax.set_title(f'Horizon {horizon} days ahead - Model Comparison (Zoomed)', fontsize=16, fontweight='bold', pad=20)
        ax.set_xlabel('Time Steps', fontsize=13)
        ax.set_ylabel('Mean Real Estate Price', fontsize=13)
        ax.legend(loc='best', fontsize=12, framealpha=0.95, shadow=True, fancybox=True)
        ax.grid(True, alpha=0.4, linestyle='--', linewidth=0.8)
        ax.set_facecolor('#f8f9fa')
        ax.minorticks_on()
        ax.grid(which='minor', alpha=0.2, linestyle=':', linewidth=0.5)
        ax.yaxis.set_major_formatter(plt.FormatStrFormatter('%.2f'))

        plt.tight_layout()
        plt.show()
        # Release the figure so looping over many horizons does not accumulate them
        plt.close(fig)

def plot_residuals(all_horizon_data, y_test_data):
    """Draw one figure per horizon showing each model's residuals (prediction - actual).

    Parameters
    ----------
    all_horizon_data : dict
        horizon -> {model name: sequence of predictions}.
    y_test_data : dict
        horizon -> sequence of true values, in the same order as the predictions.
    """
    model_colors = {
        'GradientBoosting': '#1f77b4',
        'LightGBM': '#ff7f0e',
        'CatBoost': '#2ca02c',
        'AdaBoost': '#d62728',
        'XGBoost': '#9467bd'
    }

    for horizon in sorted(all_horizon_data.keys()):
        horizon_data = all_horizon_data[horizon]
        # Positional array: subtracting a pandas Series would produce a Series
        # that matplotlib draws against its leftover row labels instead of the
        # 0..n-1 "Time Steps" the axis label promises.
        actual = np.asarray(y_test_data[horizon])

        fig, ax = plt.subplots(figsize=(16, 6))

        for model_name, preds in sorted(horizon_data.items()):
            residuals = np.asarray(preds) - actual
            color = model_colors.get(model_name, '#999999')
            ax.plot(residuals, label=model_name, alpha=0.8, linewidth=1.5, color=color)

        # Reference line for a perfect prediction
        ax.axhline(y=0, color='black', linestyle='-', linewidth=2, alpha=0.3, label='Zero Error')

        ax.set_title(f'Horizon {horizon} - Prediction Residuals (Prediction - Actual)', fontsize=16, fontweight='bold', pad=20)
        ax.set_xlabel('Time Steps', fontsize=13)
        ax.set_ylabel('Residual (Prediction Error)', fontsize=13)
        ax.legend(loc='best', fontsize=11, framealpha=0.95)
        ax.grid(True, alpha=0.4, linestyle='--')
        ax.set_facecolor('#f8f9fa')

        plt.tight_layout()
        plt.show()
        # Release the figure so looping over many horizons does not accumulate them
        plt.close(fig)

def plot_focused_window(all_horizon_data, y_test_data, start_idx=0, window_size=500):
    """Draw one large figure per horizon restricted to a slice of the test period.

    Parameters
    ----------
    all_horizon_data : dict
        horizon -> {model name: sequence of predictions}.
    y_test_data : dict
        horizon -> sequence of true values.
    start_idx : int
        First position of the window.
    window_size : int
        Maximum number of positions shown; the window is clipped to the
        length of the true values.
    """
    palette = {
        'GradientBoosting': '#1f77b4',
        'LightGBM': '#ff7f0e',
        'CatBoost': '#2ca02c',
        'AdaBoost': '#d62728',
        'XGBoost': '#9467bd'
    }

    for horizon in sorted(all_horizon_data.keys()):
        truth = y_test_data[horizon]

        # Clip the window so it never runs past the available data
        stop_idx = min(start_idx + window_size, len(truth))
        window = slice(start_idx, stop_idx)
        steps = range(start_idx, stop_idx)

        fig, ax = plt.subplots(figsize=(32, 20))

        ax.plot(steps, truth[window], label='Actual', linewidth=2, color='#333333', alpha=0.5, linestyle='--')

        for model_name, preds in sorted(all_horizon_data[horizon].items()):
            ax.plot(
                steps,
                preds[window],
                label=model_name,
                alpha=0.9,
                linewidth=2.5,
                color=palette.get(model_name, '#999999'),
            )

        ax.set_title(f'Horizon {horizon} - Time Steps {start_idx} to {stop_idx} (Extreme Zoom)', fontsize=16, fontweight='bold', pad=20)
        ax.set_xlabel('Time Steps', fontsize=13)
        ax.set_ylabel('Mean Real Estate Price', fontsize=13)
        ax.legend(loc='best', fontsize=12, framealpha=0.95)
        ax.grid(True, alpha=0.5, linestyle='--')
        ax.minorticks_on()
        ax.grid(which='minor', alpha=0.25, linestyle=':')
        ax.set_facecolor('#f8f9fa')

        plt.tight_layout()
        plt.show()

# Example usage: one zoomed comparison figure for every horizon present in
# all_horizon_data (only the horizons whose training finished are included)
plot_combined_horizons_zoomed(all_horizon_data, y_test_all_horizons, zoom_factor=0.005)

# To see residuals (prediction - actual), uncomment:
# plot_residuals(all_horizon_data, y_test_all_horizons)

# To see a specific time window with extreme detail, uncomment:
# plot_focused_window(all_horizon_data, y_test_all_horizons, start_idx=500, window_size=3000)

: 