In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb

# Build a reproducible synthetic dataset.
# The legacy global NumPy RNG is seeded once; the three draws below must stay
# in this exact order, because each one advances the shared RNG state.
np.random.seed(0)
n_samples = 1000

observed_data = np.random.normal(10, 2, n_samples)      # observed values: mean 10, std 2
ensemble_mean_mod = np.random.normal(10, 3, n_samples)  # ensemble-mean predictions: mean 10, std 3
ensemble_std_mod = np.random.uniform(0, 2, n_samples)   # ensemble-spread predictions: uniform on [0, 2)

# Pair each column label with its array and assemble the frame in one step.
column_labels = (
    'Observed Data',
    'Ensemble Mean Prediction',
    'Ensemble Std Deviation Prediction',
)
column_values = (observed_data, ensemble_mean_mod, ensemble_std_mod)
data = pd.DataFrame(dict(zip(column_labels, column_values)))

# Preview the first five rows.
print(data.head())


   Observed Data  Ensemble Mean Prediction  Ensemble Std Deviation Prediction
0      13.528105                 11.667888                           0.801021
1      10.800314                 12.677422                           0.821831
2      11.957476                  8.733056                           0.206506
3      14.481786                 10.314142                           1.038198
4      13.735116                 10.684160                           0.193606


In [6]:
# Separate predictors from the target: every column of `data` except
# 'Observed Data' is used as a feature, and 'Observed Data' is the target.
X = data.drop(columns='Observed Data')
y = data['Observed Data']

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# XGBoost regressor with squared-error loss.
# `random_state` is the documented scikit-learn-API parameter for the seed;
# the previous `seed=` keyword only worked as an undocumented passthrough.
model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# Fit on the training split only, then predict the held-out rows.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Report the mean squared error on the held-out rows.
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)


Mean Squared Error: 5.792258874788212
