This notebook contains all the required experiments.

Project: Conformal prediction for digital soil mapping

Author: Nafiseh Kakhani, University of Tuebingen

Date: 16/10/2023

### import packages

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
# from sklearn.metrics import mean_pinball_loss, mean_squared_error
import random
import numpy as np
np.warnings.filterwarnings('ignore')
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
import os
import sys
import random

### load data

In [2]:
# Load the LUCAS 2015 feature table.
# The Windows path is written as a raw string so that the backslashes are taken
# literally ('\C' and '\L' are invalid escape sequences in a normal string).
# NOTE(review): absolute local path; consider a configurable data directory.
df = pd.read_csv(r'D:\Conformal_prediction_all_exprs\Conformal_Prediction_DSM\LUCAS_2015_features_V3.csv')

# Convert the target column to numeric; values that cannot be parsed become NaN
df['OC'] = pd.to_numeric(df['OC'], errors='coerce')

# Drop the rows whose target could not be parsed
df = df.dropna(subset=['OC'])

### inputs and output of the model

In [3]:
# Only the first 100 rows are used here.
# Features: every column except the last two; target: the second-to-last column.
X = df.iloc[:100, :-2]
y = df.iloc[:100, -2]

In [4]:
# One seed per repetition of the experiment
seeds = [123, 450, 45, 609]

# Desired miscoverage error of the prediction intervals
alpha = 0.1

# Desired lower and upper quantile levels
quantiles = [0.05, 0.95]

# Fraction of all samples held out as the test set
test_ratio = 0.2

# Fraction of the remaining (non-test) samples used for proper training;
# the rest of them forms the validation/calibration set
train_ratio = 0.8

In [12]:

def split_train_test(X, y, test_ratio, random_state=None):
    """Split the data into train and test parts and return them as NumPy arrays.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. Its index labels must exist in the module-level ``df``,
        because the identifiers of the test samples are looked up there.
    y : pandas.Series
        Target values aligned with ``X``.
    test_ratio : float
        Passed to ``train_test_split`` as ``test_size``.
    random_state : int or None, optional
        Passed to ``train_test_split``. With the default ``None`` the split is
        driven by NumPy's global random state, exactly as before this
        parameter existed.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test, point_id, n_train)`` where the
        first four items are NumPy arrays, ``point_id`` holds the ``point_id``
        values of the test rows (taken from ``df``) and ``n_train`` is the
        number of training rows.
    """
    # Divide the dataset into test and train based on the test_ratio parameter
    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=test_ratio, random_state=random_state
    )

    # Keep the identifiers of the test samples for later mapping.
    # This reads the module-level DataFrame `df`, not an argument.
    keep_inds = x_test.index.tolist()
    point_id = df.loc[keep_inds, 'point_id']

    # Convert everything to plain arrays
    x_train = np.asarray(x_train)
    y_train = np.asarray(y_train)
    x_test = np.asarray(x_test)
    y_test = np.asarray(y_test)

    # Number of training rows, needed later for the train/calibration split
    n_train = x_train.shape[0]

    return x_train, x_test, y_train, y_test, point_id, n_train

# # Example usage:
# x_train, x_test, y_train, y_test, point_id, n_train = split_train_test(X, y, test_ratio)


In [13]:
# Initial train/test split, run once here; the seed loop further down repeats it per seed.
x_train, x_test, y_train, y_test, point_id, n_train = split_train_test(X, y, test_ratio)


In [14]:

def preprocess_training_and_scaling(x_train, x_test, y_train, y_test, n_train):
    """Create the train/calibration split, standardize features, log-transform targets.

    The fraction of training rows assigned to the proper training set is read
    from the module-level ``train_ratio``; the remaining rows form the
    calibration set. The permutation uses NumPy's global random state.

    Parameters
    ----------
    x_train, x_test : numpy.ndarray
        Feature arrays (rows are samples).
    y_train, y_test : numpy.ndarray
        Targets. They are passed through ``np.log``.
        NOTE(review): zero or negative targets would give -inf / NaN here;
        confirm the data never contains them.
    n_train : int
        Number of rows in ``x_train``.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test, idx_train, idx_cal)``. ``x_train``
        still contains all training rows (proper training and calibration);
        ``idx_train`` and ``idx_cal`` are the row indices of the two parts.
    """
    # Randomly divide the training rows into a proper training set and a calibration set
    idx = np.random.permutation(n_train)
    split_point = int(np.floor(n_train * train_ratio))
    idx_train, idx_cal = idx[:split_point], idx[split_point:]

    # Zero-mean, unit-variance scaling; the scaler is fitted on the proper
    # training rows only and then applied to every row
    scalerX = StandardScaler().fit(x_train[idx_train])
    x_train = scalerX.transform(x_train)
    x_test = scalerX.transform(x_test)

    # Display basic information about the three parts of the data
    print("Dimensions: proper train set (n=%d, p=%d) ; calibration set (n=%d, p=%d) ; test set (n=%d, p=%d)" %
          (len(idx_train), x_train.shape[1], len(idx_cal), x_train.shape[1], x_test.shape[0], x_test.shape[1]))

    # Work in log space to see whether the results are improved
    y_train = np.log(np.squeeze(y_train))
    y_test = np.log(np.squeeze(y_test))

    return x_train, x_test, y_train, y_test, idx_train, idx_cal


In [15]:
# Example usage: the split arrays are overwritten with their scaled / log-transformed versions.
# Re-running this cell on its own would scale and log-transform the data a second time.
x_train, x_test, y_train, y_test, idx_train, idx_cal = preprocess_training_and_scaling(x_train, x_test, y_train, y_test, n_train)

Dimensions: train set (n=80, p=72) ; validation/calibration set (n=20, p=72)


In [16]:
from cqr import helper
from nonconformist.nc import RegressorNc
from nonconformist.nc import QuantileRegErrFunc

# ---------------------------------------------------------
# Settings for the quantile random forests
# (see the QuantileForestRegressorAdapter class in helper.py)
# ---------------------------------------------------------

# Number of trees in the forest
n_estimators = 1000

# Minimum number of samples required at a leaf node
# (noted in the original settings as skgarden's default)
min_samples_leaf = 1

# Number of features considered when looking for the best split
# (noted in the original settings as skgarden's default)
max_features = x_train.shape[1]

# Target quantile levels expressed in percent
quantiles_forest = [level * 100 for level in quantiles]

# Whether the quantile levels are tuned by cross-validation
cv_qforest = True

# When tuning the two QRF quantile levels one may ask for a band with a
# smaller average coverage, to avoid an overly conservative prediction band.
# The requested coverage is coverage_factor*(quantiles[1] - quantiles[0]).
coverage_factor = 0.85

# Ratio of held-out data used in cross-validation
cv_test_ratio = 0.05

# Seed for splitting the data in cross-validation;
# also used as the seed of the quantile random forests
cv_random_state = 1

# Lowest and highest quantile levels explored when tuning by cross-validation:
# the smallest value is quantiles[0] - range_vals and
# the largest value is quantiles[1] + range_vals
cv_range_vals = 30

# Length of the grid swept when tuning the QRF quantile parameters
cv_num_vals = 10

# Number of bootstrap samples
n_bootstraps = 5

In [17]:
def create_and_save_dataframe(y_lower, y_upper, y_test, point_id, output_file):
    """Back-transform interval bounds from log space and write them to a CSV file.

    Parameters
    ----------
    y_lower, y_upper : array-like
        Lower and upper interval bounds in log space.
    y_test : array-like
        Observed targets in log space.
    point_id : array-like
        Identifier of each test sample, stored in the ``Point_ID`` column.
    output_file : str
        Path of the CSV file to write (the index is not written).
    """
    # Undo the log transform once and reuse the results
    lower = np.exp(y_lower)
    upper = np.exp(y_upper)

    columns = {
        'lower_oc': lower,
        'upper_oc': upper,
        # Midpoint of the back-transformed interval
        'predicted_oc': (upper + lower) / 2,
        # Interval width divided by the mean of (upper + lower) over all samples
        'standard_uncertainty': (upper - lower) / np.mean(upper + lower),
        'test_oc': np.exp(y_test),
        'Point_ID': point_id,
    }

    # Save the table as a CSV file
    pd.DataFrame(columns).to_csv(output_file, index=False)

# Example usage:
# create_and_save_dataframe(y_lower, y_upper, y_test, point_id, 'output.csv')


In [18]:
def train_gradient_boosting_models(x_train, y_train, x_test, point_id):
    """Fit quantile gradient-boosting models and predict an interval for x_test.

    Three models are fitted, for the 0.05, 0.5 and 0.95 quantiles; only the
    0.05 and 0.95 models are used for the returned bounds.
    NOTE(review): the 0.5 model is fitted but never used. It is kept because
    removing it would change how much of the global random state is consumed.

    Reference:
    https://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_quantile.html

    Parameters
    ----------
    x_train, y_train : array-like
        Training features and targets.
    x_test : array-like
        Features to predict on.
    point_id : object
        Returned unchanged.

    Returns
    -------
    tuple
        ``(y_lower, y_upper, point_id)``.
    """
    # Hyper-parameters shared by every quantile model
    shared_kwargs = {
        "learning_rate": 0.05,
        "n_estimators": 200,
        "max_depth": 2,
        "min_samples_leaf": 9,
        "min_samples_split": 9,
    }

    # One fitted model per quantile level, keyed as "q <level>"
    fitted = {}
    for level in (0.05, 0.5, 0.95):
        regressor = GradientBoostingRegressor(loss="quantile", alpha=level, **shared_kwargs)
        fitted["q %1.2f" % level] = regressor.fit(x_train, y_train)

    lower_bound = fitted["q 0.05"].predict(x_test)
    upper_bound = fitted["q 0.95"].predict(x_test)
    return lower_bound, upper_bound, point_id

# Example usage:
# y_lower, y_upper, point_id = train_gradient_boosting_models(x_train, y_train, x_test, point_id)


In [19]:
# Parameters handed to helper.QuantileForestRegressorAdapter.
# "random_state" is set once, to cv_random_state: the earlier version first set
# it to 0 and then overwrote it with cv_random_state, so 0 was never in effect.
params_qforest = {
    # Forest settings
    "random_state": cv_random_state,
    "min_samples_leaf": min_samples_leaf,
    "n_estimators": n_estimators,
    "max_features": x_train.shape[1],
    # Cross-validation settings for tuning the quantile levels
    "CV": cv_qforest,
    "coverage_factor": coverage_factor,
    "test_ratio": cv_test_ratio,
    "range_vals": cv_range_vals,
    "num_vals": cv_num_vals,
}

In [21]:
# Repeat the whole comparison once per seed. In every repetition each method
# writes one CSV file (suffix = repetition index) via create_and_save_dataframe.
for i, seed in enumerate(seeds):
  
  # Set random seeds for reproducibility
  random.seed(seed)
  np.random.seed(seed)

  print(f"Iteration {i}")
  
  # Fresh train/test split for this seed
  x_train, x_test, y_train, y_test, point_id, n_train = split_train_test(X, y, test_ratio)
  # Now split the training data into train/calibration sets, scale the features
  # and log-transform the targets:
  x_train, x_test, y_train, y_test, idx_train, idx_cal = preprocess_training_and_scaling(x_train, x_test, y_train, y_test, n_train)

  #########################################################
  # fit a simple ridge regression model (sanity check)
  #########################################################
  # Fitted on all rows of x_train (proper training and calibration rows together)
  model = linear_model.RidgeCV()
  model = model.fit(x_train, np.squeeze(y_train))
  predicted_data = model.predict(x_test).astype(np.float32)

  # calculate the normalized squared error (targets are in log space at this point)
  print("Ridge relative error: %f" % (np.sum((np.squeeze(y_test)-predicted_data)**2)/np.sum(np.squeeze(y_test)**2)))
  sys.stdout.flush()

  #########################################################
  # Random forests + Bootstrapping
  #########################################################

  # Initialize lists to store predictions for each bootstrap sample
  bootstrap_predictions = []

  # Each bootstrap sample holds 63.2% as many rows as the training set, drawn with replacement.
  # NOTE(review): a standard bootstrap draws as many rows as the training set, which leaves
  # about 63.2% unique rows; drawing only 0.632 * n rows with replacement leaves fewer
  # unique rows than that. Confirm this is intended.
  sample_size = int(0.632 * x_train.shape[0])  # 63.2% of training samples
    
  for _ in range(n_bootstraps):
        # Create a new bootstrap sample by randomly selecting samples with replacement
        bootstrap_indices = np.random.choice(x_train.shape[0], size=sample_size, replace=True)
        X_bootstrap = x_train[bootstrap_indices]
        y_bootstrap = y_train[bootstrap_indices]
        
        # Create and train a new model on the bootstrap sample.
        # random_state is fixed, so the models differ through the resampled data.
        bootstrap_model = RandomForestRegressor(n_estimators=n_estimators, min_samples_leaf=min_samples_leaf, random_state = 42)
        bootstrap_model.fit(X_bootstrap, y_bootstrap)
        
        # Make predictions on the test data using the bootstrap model
        y_bootstrap_pred = bootstrap_model.predict(x_test)
        
        # Append the predictions to the list
        bootstrap_predictions.append(y_bootstrap_pred)
    

  # Interval bounds: per-test-sample maximum / minimum over the bootstrap models
  y_upper = np.max(bootstrap_predictions, axis=0)
  y_lower = np.min(bootstrap_predictions, axis=0)

  output_file = f'Boot_RF_{i}.csv'
  create_and_save_dataframe(y_lower, y_upper, y_test, point_id, output_file)

  #########################################################
  # Quantile random forests 
  #########################################################

  # Quantile levels are passed in percent (np.dot(100, quantiles) scales the levels by 100)
  model_full = helper.QuantileForestRegressorAdapter(model = None,
                                                    fit_params=None,
                                                    quantiles=np.dot(100,quantiles),
                                                    params = params_qforest)
  
  # Fitted on all rows of x_train (proper training and calibration rows together)
  model_full.fit(x_train, y_train)
  tmp = model_full.predict(x_test)

  # First column is used as the lower bound, second column as the upper bound
  y_lower = tmp[:,0]
  y_upper = tmp[:,1]

  output_file = f'quantile_forest_{i}.csv'
  create_and_save_dataframe(y_lower, y_upper, y_test, point_id, output_file)

  #########################################################
  # Quantile random forests + Conformal prediction 
  #########################################################

  model = helper.QuantileForestRegressorAdapter(model = None,
                                              fit_params=None,
                                              quantiles=quantiles_forest,
                                              params = params_qforest)

  # NOTE(review): helper.run_icp is defined outside this notebook; presumably it fits on the
  # idx_train rows and calibrates on the idx_cal rows at miscoverage level alpha. Confirm in helper.py.
  nc = RegressorNc(model, QuantileRegErrFunc())
  y_lower, y_upper = helper.run_icp(nc, x_train, y_train, x_test, idx_train, idx_cal, alpha)

  output_file = f'cqr_quantile_forest_{i}.csv'
  create_and_save_dataframe(y_lower, y_upper, y_test, point_id, output_file)
  
  #########################################################
  # Prediction Intervals for Gradient Boosting Regression 
  #########################################################

  # point_id is handed in and returned unchanged by the helper
  y_lower, y_upper, point_id = train_gradient_boosting_models(x_train, y_train, x_test, point_id)
  output_file = f'quantile_GBoost_{i}.csv'
  create_and_save_dataframe(y_lower, y_upper, y_test, point_id, output_file)

Iteration 0
Dimensions: train set (n=80, p=72) ; validation/calibration set (n=20, p=72)
Ridge relative error: 0.081019
Iteration 1
Dimensions: train set (n=80, p=72) ; validation/calibration set (n=20, p=72)
Ridge relative error: 0.055656
Iteration 2
Dimensions: train set (n=80, p=72) ; validation/calibration set (n=20, p=72)
Ridge relative error: 0.073776
Iteration 3
Dimensions: train set (n=80, p=72) ; validation/calibration set (n=20, p=72)
Ridge relative error: 0.041950
