# load the data

In [1]:
import pandas as pd
import numpy as np

# Location of the LUCAS 2015 feature table. A raw string is required here:
# in a normal string literal the backslashes before 'C' and 'L' are invalid
# escape sequences, which Python warns about and plans to reject.
# NOTE(review): this is an absolute, machine-specific path; adjust it (or make
# it relative to the notebook) before running on another machine.
DATA_PATH = r'D:\Conformalized_Quantile_Regression\LUCAS_2015_features_V3.csv'

df = pd.read_csv(DATA_PATH)

In [2]:
# Parse 'OC' as numbers; any entry that cannot be parsed becomes NaN
oc_numeric = pd.to_numeric(df['OC'], errors='coerce')
df['OC'] = oc_numeric

# Remove, in place, every row whose 'OC' is NaN after the conversion
df.dropna(subset=['OC'], inplace=True)

In [3]:
# Display the frame after the 'OC' cleaning step (pandas truncates the rendering)
df

Unnamed: 0,SR_B3_1,SR_B3_2,SR_B3_3,SR_B3_4,SR_B4_1,SR_B4_2,SR_B4_3,SR_B4_4,SR_B5_1,SR_B5_2,...,average_7,average_8,average_9,average_10,average_11,average_12,average_13,average_14,OC,point_id
0,0.065401,0.065401,0.063259,0.063259,0.058280,0.058280,0.053137,0.053137,0.341738,0.341738,...,532.638051,498.618622,502.788853,704.755629,402.898849,469.73883,617.714286,396.468312,24.6,26581768
1,0.037099,0.036429,0.035900,0.035307,0.034250,0.033989,0.032527,0.032337,0.179388,0.175070,...,512.001688,489.344548,512.317155,678.640244,404.306692,481.17483,593.808163,402.009979,21.9,26581792
2,0.072997,0.075304,0.071491,0.072426,0.063204,0.065369,0.060863,0.061394,0.347341,0.352894,...,519.103506,509.807511,457.939797,668.159475,448.914535,505.68683,625.240816,395.747479,18.4,26581954
3,0.026582,0.026362,0.026356,0.026522,0.022784,0.022857,0.022424,0.022880,0.189531,0.186850,...,520.863506,495.344548,490.902061,686.594090,401.957672,480.77883,613.922449,398.876646,48.0,26601784
4,0.051178,0.051178,0.047617,0.047617,0.031081,0.031081,0.028169,0.028169,0.358772,0.358772,...,497.983506,490.677881,449.362438,634.390244,435.981202,489.49483,594.461224,388.101646,25.2,26601978
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21853,0.073742,0.073323,0.073456,0.073102,0.110663,0.109788,0.111120,0.110324,0.185107,0.184097,...,644.881688,600.148252,477.336023,746.617167,403.169437,416.83883,690.440816,344.026646,8.4,64881666
21854,0.085582,0.085886,0.086188,0.086588,0.071660,0.072601,0.071150,0.072110,0.402217,0.400485,...,636.685324,590.337140,476.524702,736.836398,395.087084,410.73883,680.738776,341.705812,10.8,64901668
21855,0.084536,0.087933,0.081859,0.085314,0.071451,0.077255,0.066329,0.072025,0.391538,0.383660,...,636.685324,590.337140,476.524702,736.836398,395.087084,410.73883,680.738776,341.705812,6.7,64901672
21856,0.087019,0.087398,0.086868,0.087840,0.099747,0.102079,0.099499,0.102728,0.286553,0.283027,...,648.761688,602.381585,490.090740,754.286398,404.941986,417.09883,695.187755,351.993312,5.7,64961676


In [4]:
# The second-to-last column is the raw target; the frame display above shows
# 'OC' followed by 'point_id' as the two trailing columns.
oc_raw = df.iloc[:, -2]

# Features: every column except the trailing two
X = df.iloc[:, :-2]

# Target on the log scale: log(OC + 1)
y = np.log(oc_raw + 1)

In [5]:
import sys

def compute_coverage(y_test,y_lower,y_upper,significance,name=""):
    """ Compute average coverage and length, and print results

    All three inputs are converted to flat NumPy arrays first, so pandas
    objects are compared position by position (never by index label) and
    column vectors of shape (n, 1) do not broadcast into an (n, n) matrix.

    Parameters
    ----------

    y_test : array-like, true labels (n)
    y_lower : array-like, estimated lower bound for the labels (n)
    y_upper : array-like, estimated upper bound for the labels (n)
    significance : float, desired significance level
    name : string, optional output string (e.g. the method name)

    Returns
    -------

    coverage : float, percentage of labels with y_lower <= y_test <= y_upper
    avg_length : float, absolute value of the mean of (y_lower - y_upper)

    Raises
    ------

    ValueError : if y_test is empty (the coverage would be 0/0)

    """
    # Strip pandas indexes and extra dimensions before any element-wise work.
    y_test = np.asarray(y_test).ravel()
    y_lower = np.asarray(y_lower).ravel()
    y_upper = np.asarray(y_upper).ravel()

    if y_test.size == 0:
        raise ValueError("y_test must contain at least one label")

    in_the_range = np.sum((y_test >= y_lower) & (y_test <= y_upper))
    coverage = in_the_range / len(y_test) * 100
    print("%s: Percentage in the range (expecting %.2f): %f" % (name, 100 - significance*100, coverage),
          flush=True)

    avg_length = abs(np.mean(y_lower - y_upper))
    print("%s: Average length: %f" % (name, avg_length), flush=True)

    return coverage, avg_length

In [None]:
import numpy as np

def calculate_quantile_coverage_probability(data, true_quantile, confidence_interval):
    """
    Check whether one sample quantile of `data` lies between two others.

    Parameters:
    - data: NumPy array or list of data points.
    - true_quantile: Quantile level as a fraction (e.g., 0.95 for the 95% quantile).
    - confidence_interval: Two quantile levels [lower, upper], also as fractions.

    Returns:
    - coverage_probability: Truthy when the sample quantile at `true_quantile` is
      >= the sample quantile at confidence_interval[0] and <= the sample quantile
      at confidence_interval[1]; falsy otherwise.
    """

    def percentile_at(level):
        # Levels are fractions; np.percentile expects a 0-100 scale.
        return np.percentile(data, 100 * level)

    estimate = percentile_at(true_quantile)

    # Lower-side check first; the upper percentile is only evaluated when it passes.
    not_below_lower = estimate >= percentile_at(confidence_interval[0])
    if not not_below_lower:
        return not_below_lower

    return estimate <= percentile_at(confidence_interval[1])

# # Example usage:
# data = np.random.normal(loc=0, scale=1, size=1000)
# true_quantile = 0.95
# confidence_interval = [0.05, 0.95]
# coverage_probability = calculate_quantile_coverage_probability(data, true_quantile, confidence_interval)
# print(f"Coverage Probability: {int(coverage_probability)}")


## Bootstrapping RF

In [6]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.utils import resample


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Number of bootstrap samples
n_bootstraps = 5

# Each bootstrap sample holds len(y_train) - n_resample rows (drawn with replacement)
n_resample = 500

# Seeded generator for the bootstrap draws. Passing the same RandomState
# instance to every resample() call advances it each time, so the samples
# differ from one another but the whole cell is reproducible on re-run.
bootstrap_rng = np.random.RandomState(42)

# Initialize lists to store predictions for each bootstrap sample
bootstrap_predictions = []

# Define a grid of hyperparameters to search
param_grid = {
    'n_estimators': [500, 1000],
    'max_depth': [5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create a Random Forest regressor
rf = RandomForestRegressor(random_state=42)

# Perform Grid Search to find the best hyperparameters
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=8)
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_

# Random Forest configured with the best hyperparameters; it is refit on every bootstrap sample
best_rf = RandomForestRegressor(**best_params, random_state=42)


# Refit on each bootstrap sample and predict the (fixed) test set
for _ in range(n_bootstraps):
    X_boot, y_boot = resample(X_train, y_train, n_samples=len(y_train) - n_resample, random_state=bootstrap_rng)
    best_rf.fit(X_boot, y_boot)
    y_pred = best_rf.predict(X_test)
    bootstrap_predictions.append(y_pred)

# Calculate the mean and standard deviation of predictions across bootstrap models
mean_predictions = np.mean(bootstrap_predictions, axis=0)
std_predictions = np.std(bootstrap_predictions, axis=0)

# Significance level used when reporting coverage (nominal coverage = 1 - alpha)
alpha = 0.1  # 90% nominal coverage
z_score = 1.645  # two-sided z-score matching alpha = 0.1 (not used by the band below)

# Band = per-point range of the bootstrap predictions. It only reflects the
# spread between the refitted models, not the noise in the observations.
upper_band = np.max(bootstrap_predictions, axis=0)
lower_band = np.min(bootstrap_predictions, axis=0)

# Calculate Mean Squared Error (MSE) on the test set (targets are on the log scale)
mse = mean_squared_error(y_test, mean_predictions)

print(f"Best Hyperparameters: {best_params}")
print(f"Mean Squared Error on Test Set: {(mse)}")


Best Hyperparameters: {'max_depth': 20, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 1000}
Mean Squared Error on Test Set: 0.4663052147773841


In [7]:
# Empirical coverage (in %) and average width of the bootstrap min/max band on the test split
coverage_RF, length_RF = compute_coverage(y_test, lower_band, upper_band, alpha, name = "Boot-RF")

Boot-RF: Percentage in the range (expecting 90.00): 15.576395
Boot-RF: Average length: 0.256380


In [10]:
# Size of the last bootstrap sample drawn in the bootstrap loop
len(y_boot)

17476

In [None]:
# Inputs for the K-fold cell: the full feature table and log-target as NumPy arrays.
# NOTE: this rebinds `y_train`, which the bootstrap cell assigned to the training split.
x_train, y_train = np.asarray(X), np.asarray(y)

In [None]:
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np
import statsmodels.api as sm
from statsmodels.regression.quantile_regression import QuantReg

# Create your dataset (x_train, y_train, x_test, y_test)

# Define the parameter grid for hyperparameter tuning
param_grid = {
    "min_samples_leaf": [1, 2, 5],
    "n_estimators": [100, 500, 1000],
    "max_features": [None, "sqrt", "log2"]
}

# Nested cross-validation: 5-fold outer loop, 5-fold inner loop
outer_cv = KFold(n_splits = 5, shuffle=True, random_state=422)
inner_cv = KFold(n_splits = 5, shuffle=True, random_state=422)

# Initialize lists to store results
mean_mse_scores = []
std_mse_scores = []

for train_index, test_index in outer_cv.split(x_train):
    x_train_outer, x_test_outer = x_train[train_index], x_train[test_index]
    y_train_outer, y_test_outer = y_train[train_index], y_train[test_index]

    # Out-of-fold targets and RF estimates gathered over the inner folds
    Observed = []
    Estimated = []

    for inner_train_index, inner_test_index in inner_cv.split(x_train_outer):
        x_train_inner, x_val_inner = x_train_outer[inner_train_index], x_train_outer[inner_test_index]
        y_train_inner, y_val_inner = y_train_outer[inner_train_index], y_train_outer[inner_test_index]

        # Tune the random forest on the inner training data only
        grid_search = GridSearchCV(
            estimator=RandomForestRegressor(random_state=422),
            param_grid=param_grid,
            scoring='neg_mean_squared_error',
            cv=inner_cv,
            n_jobs=-1
        )
        grid_search.fit(x_train_inner, y_train_inner)

        # Get the best hyperparameters from the inner loop
        best_params = grid_search.best_params_

        # GridSearchCV refits the winning configuration on all of x_train_inner
        # (refit=True is the default), so fitting a second, identical forest is
        # unnecessary.
        rf_model = grid_search.best_estimator_

        # Make predictions on the inner validation data
        predictions = rf_model.predict(x_val_inner)

        Observed.append(y_val_inner)
        Estimated.append(predictions)

    # Join the folds end to end. Folds may differ in length by one sample;
    # concatenating (rather than zero-padding to a rectangle) avoids adding
    # artificial (0, 0) pairs to the regression data.
    stacked_arrays_Observed = np.concatenate(Observed).reshape(-1, 1)
    stacked_arrays_Estimated = np.concatenate(Estimated).reshape(-1, 1)

    # Quantile regression of the observed targets on the out-of-fold RF estimates
    quantile_model = sm.QuantReg(stacked_arrays_Observed, stacked_arrays_Estimated)
    quantile_results = quantile_model.fit(q=0.5)  # Fit the model for the median (you can choose other quantiles)

    # The regressor of the quantile model is an RF estimate, so the outer test
    # fold must be represented by RF predictions as well, never by its true
    # targets. Tune and fit a forest on the whole outer-training fold for that.
    outer_search = GridSearchCV(
        estimator=RandomForestRegressor(random_state=422),
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=inner_cv,
        n_jobs=-1
    )
    outer_search.fit(x_train_outer, y_train_outer)
    test_estimates = outer_search.best_estimator_.predict(x_test_outer).reshape(-1, 1)

    # Median predictions for the outer test fold
    quantile_predictions = quantile_results.predict(test_estimates)

    # Nominal coverage level(s) to evaluate
    quantiles = [0.9]

    # Lower and upper bounds collected per coverage level (reset for every outer fold)
    lower_quantile_predictions = []
    upper_quantile_predictions = []

    # Calculate prediction intervals for each coverage level
    for quantile in quantiles:
        print(quantile)
        # NOTE(review): the bounds come from get_prediction() on the median fit;
        # confirm that statsmodels' obs_ci_* columns are meaningful for QuantReg
        # results, or fit separate lower/upper quantile models instead.
        prediction_interval = quantile_results.get_prediction(test_estimates).summary_frame(alpha=1-quantile)

        # Extract lower and upper bounds of the prediction interval
        lower_bound = prediction_interval['obs_ci_lower'].values
        upper_bound = prediction_interval['obs_ci_upper'].values

        coverage_RF, length_RF = compute_coverage(y_test_outer, lower_bound, upper_bound, 1-quantile, name = "PP-RF")

        # Append lower and upper bounds to the respective lists
        lower_quantile_predictions.append(lower_bound)
        upper_quantile_predictions.append(upper_bound)

    # lower_quantile_predictions / upper_quantile_predictions now hold the bounds
    # of the current outer fold, one entry per coverage level.


In [None]:
# NOTE(review): `picp` is not assigned in any visible cell; on a fresh kernel this
# cell presumably raises NameError — TODO define it or remove the cell.
picp

In [None]:
# compute and print average coverage and average length
# Uses the notebook's own compute_coverage (defined in an earlier cell); no
# `helper` module is imported in the visible part of this notebook.
# NOTE(review): y_test_rf, y_lower_rf and y_upper_rf are not assigned in any
# visible cell — TODO confirm they are defined before this cell is run.
coverage_RF, length_RF = compute_coverage(y_test_rf,
                                          y_lower_rf,
                                          y_upper_rf,
                                          alpha,
                                          "Random Forests")

In [None]:
# from sklearn.model_selection import KFold, GridSearchCV
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.metrics import mean_squared_error
# import numpy as np
# import statsmodels.api as sm

# def fit_inner_rf_model(x_train_inner, y_train_inner):
#     # Initialize the inner GridSearchCV for hyperparameter tuning
#     param_grid = {
#         "min_samples_leaf": [1, 2, 5],
#         "n_estimators": [100, 500, 1000],
#         "max_features": [None, "sqrt", "log2"]
#     }

#     grid_search = GridSearchCV(
#         estimator=RandomForestRegressor(random_state=422),
#         param_grid=param_grid,
#         scoring='neg_mean_squared_error',
#         cv=inner_cv,
#         n_jobs=-1
#     )

#     # Fit the GridSearchCV to the inner training data
#     grid_search.fit(x_train_inner, y_train_inner)

#     # Get the best hyperparameters from the inner loop
#     best_params = grid_search.best_params_

#     # Create a RandomForestRegressor with the best hyperparameters
#     rf_model = RandomForestRegressor(
#         random_state=422,
#         min_samples_leaf=best_params['min_samples_leaf'],
#         n_estimators=best_params['n_estimators'],
#         max_features=best_params['max_features']
#     )

#     # Fit the model to the inner training data
#     rf_model.fit(x_train_inner, y_train_inner)

#     return rf_model

# def calculate_inner_residuals(rf_model, x_val_inner):
#     # Make predictions on the inner validation data
#     predictions = rf_model.predict(x_val_inner)

#     # Calculate residuals for the inner fold
#     residuals = y_val_inner - predictions

#     return residuals

# def fit_outer_quantile_regression(x_train_outer, y_train_outer, inner_residual_matrix):
#     quantile = 0.5  # Specify the quantile you want to estimate (e.g., 0.5 for the median)

#     # Fit a quantile regression model using x_train_outer and inner_residual_matrix
#     quantile_model = sm.QuantReg(y_train_outer, np.column_stack((x_train_outer, inner_residual_matrix)))
#     quantile_results = quantile_model.fit(q=quantile)  # Fit the model for the median (you can choose other quantiles)

#     return quantile_results

# def calculate_prediction_intervals(quantile_results, new_data_x, new_data_outer_residuals):
#     quantile = 0.5  # Specify the quantile you want to estimate (e.g., 0.5 for the median)

#     # Make predictions on new data using the fitted quantile regression model
#     new_data_predictions = quantile_results.predict(np.column_stack((new_data_x, new_data_outer_residuals)))

#     # Calculate prediction intervals for the specified quantile using quantile_results.get_prediction
#     prediction_interval = quantile_results.get_prediction(np.column_stack((new_data_x, new_data_outer_residuals))).summary_frame(alpha=1-quantile)

#     # Extract lower and upper bounds of the prediction interval
#     lower_bound = prediction_interval['obs_ci_lower']
#     upper_bound = prediction_interval['obs_ci_upper']

#     return new_data_predictions, lower_bound, upper_bound

# def main():
#     # Create your dataset (x_train, y_train, x_test, y_test)

#     # Perform nested cross-validation (e.g., 5-fold outer, 3-fold inner)
#     outer_cv = KFold(n_splits=5, shuffle=True, random_state=422)
#     inner_cv = KFold(n_splits=3, shuffle=True, random_state=422)

#     # Initialize lists to store results
#     mean_mse_scores = []
#     std_mse_scores = []

#     # Initialize the outer residual matrix to store residuals from test predictions
#     outer_residual_matrix = []

#     for train_index, test_index in outer_cv.split(x_train):
#         x_train_outer, x_test_outer = x_train[train_index], x_train[test_index]
#         y_train_outer, y_test_outer = y_train[train_index], y_train[test_index]

#         # Initialize the inner loop results
#         inner_residual_matrix = []

#         for inner_train_index, inner_test_index in inner_cv.split(x_train_outer):
#             x_train_inner, x_val_inner = x_train_outer[inner_train_index], x_train_outer[inner_test_index]
#             y_train_inner, y_val_inner = y_train_outer[inner_train_index], y_train_outer[inner_test_index]

#             # Fit the inner random forest model
#             rf_model = fit_inner_rf_model(x_train_inner, y_train_inner)

#             # Calculate residuals for the inner fold and store them in the inner_residual_matrix
#             inner_residuals = calculate_inner_residuals(rf_model, x_val_inner)
#             inner_residual_matrix.append(inner_residuals)

#         # Combine residuals from the inner loop into a single matrix
#         inner_residual_matrix = np.vstack(inner_residual_matrix)

#         # Fit the outer quantile regression model
#         quantile_results = fit_outer_quantile_regression(x_train_outer, y_train_outer, inner_residual)


## Data splitting

We begin by splitting the data into a proper training set and a calibration set. Recall that the main idea is to fit a regression model on the proper training samples, then use the residuals on the held-out calibration set to quantify the uncertainty in future predictions.

In [None]:
# # divide the data into proper training set and calibration set
# idx = np.random.permutation(n_train)
# split_point = int(np.floor(n_train * 0.9))
# # idx_train, idx_cal = idx[:n_half], idx[n_half:2*n_half]

# # Split the indices into training and calibration sets
# idx_train, idx_cal = idx[:split_point], idx[split_point:]

# # zero mean and unit variance scaling 
# scalerX = StandardScaler()
# scalerX = scalerX.fit(x_train[idx_train])

# # scale
# x_train = scalerX.transform(x_train)
# x_test = scalerX.transform(x_test)

# # scale the labels by dividing each by the mean absolute response
# mean_y_train = np.mean(np.abs(y_train[idx_train]))
# # y_train = np.squeeze(y_train)/mean_y_train

# #using log transformation to see whether the results are improved
# y_train = np.log(np.squeeze(y_train))
# y_test = np.log(np.squeeze(y_test))

In [None]:
# Back-transform the target to the original OC scale. The target was built as
# log(OC + 1), so the inverse is exp(y) - 1 (plain exp(y) would return OC + 1).
print(np.exp(y_train) - 1)