Create a class for the inductive cross-conformal predictor for the case of regression.

In [1]:
import numpy as np
import copy
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt


class cross_confrom_regress():
    """Cross-conformal predictor for single-target regression.

    The wrapped model is fitted once on the full training set (used for
    point predictions) and re-fitted on each K-fold split to obtain
    out-of-fold absolute residuals, which serve as non-conformity scores.
    A symmetric prediction interval ``y_hat +/- c`` is then derived from
    those scores for a requested error rate.

    Parameters
    ----------
    X_train : array-like
        Training features; must support indexing with an integer index
        array (e.g. a NumPy array).
    y_train : array-like
        Training targets, one value per training sample.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place on the full training set during construction.
    """

    def __init__(self, X_train, y_train, model):
        self.X_train = X_train
        self.y_train = y_train
        self.model = model
        # Assumes ``fit`` returns the fitted estimator (the scikit-learn
        # convention); this is the model used for test-set predictions.
        self.fitted_model = model.fit(X_train, y_train)

    @staticmethod
    def _check_error_rate(e):
        """Raise ``ValueError`` unless ``e`` lies in the closed range [0, 1]."""
        if not 0 <= e <= 1:
            raise ValueError("error rate e must be between 0 and 1")

    @staticmethod
    def _as_test_targets(X_test, y_test):
        """Validate the test data and return ``y_test`` as a flat array."""
        if X_test is None or y_test is None:
            raise ValueError(
                "X_test and y_test are required to compute the test error rate"
            )
        y_test = np.ravel(y_test)
        if y_test.size == 0:
            raise ValueError("y_test must contain at least one sample")
        return y_test

    def _sorted_nonconformity_scores(self):
        """Return all out-of-fold absolute residuals, sorted ascending."""
        X_train = self.X_train
        y_train = np.ravel(self.y_train)

        kf = KFold(shuffle=True, random_state=211)
        fold_scores = []
        for rest_indx, fold_indx in kf.split(X_train):
            # Fit a fresh copy so state never leaks between folds or into
            # the model that was fitted on the full training set.
            current_model = copy.deepcopy(self.model)
            current_model.fit(X_train[rest_indx], y_train[rest_indx])
            # The held-out fold acts as the calibration set.
            y_hat = np.ravel(current_model.predict(X_train[fold_indx]))
            fold_scores.append(np.absolute(y_train[fold_indx] - y_hat))

        return np.sort(np.concatenate(fold_scores))

    @staticmethod
    def _interval_half_width(sorted_scores, e):
        """Return the half-width ``c`` of the interval for error rate ``e``.

        ``c`` is the k-th smallest score with ``k = ceil((1 - e) * (n + 1))``.
        When ``k`` exceeds the number of scores no finite interval can be
        justified at that error rate, so ``inf`` is returned.
        """
        n = sorted_scores.size
        # The small tolerance keeps floating-point round-off from pushing an
        # exactly-integer rank up to the next one.
        k = int(np.ceil((1 - e) * (n + 1) - 1e-9))
        if k > n:
            return np.inf
        # Ranks are 1-based, array indices 0-based; e == 1 maps to the
        # smallest score.
        return sorted_scores[max(k, 1) - 1]

    @staticmethod
    def _test_error_rate(y_test, y_hat, c):
        """Return the fraction of targets outside ``[y_hat - c, y_hat + c]``."""
        outside = (y_test < y_hat - c) | (y_test > y_hat + c)
        return np.mean(outside)

    def get_prediction_interval(self, e, get_test_error=False, X_test=None, y_test=None):
        """Return the prediction-interval half-width for error rate ``e``.

        The interval for a point prediction ``y_hat`` is
        ``[y_hat - c, y_hat + c]`` where ``c`` is the returned half-width.
        Its validity can be checked on held-out data by setting
        ``get_test_error=True``.

        Parameters
        ----------
        e : float
            Target error rate in [0, 1].
        get_test_error : bool, optional
            When true, also compute the empirical error rate on the test set.
        X_test, y_test : array-like, optional
            Test features and targets; required when ``get_test_error`` is
            true.

        Returns
        -------
        prediction_interval : float
            Half-width ``c`` of the interval (``inf`` when there are too few
            training samples for the requested error rate).
        error_rate : float or None
            Fraction of test targets outside their interval, or ``None``
            when ``get_test_error`` is false.

        Raises
        ------
        ValueError
            If ``e`` is outside [0, 1], or the test error is requested
            without usable test data.
        """
        # Validate up front so bad arguments fail before any model fitting.
        self._check_error_rate(e)
        if get_test_error:
            y_test = self._as_test_targets(X_test, y_test)

        scores = self._sorted_nonconformity_scores()
        prediction_interval = self._interval_half_width(scores, e)

        if not get_test_error:
            return prediction_interval, None

        y_hat = np.ravel(self.fitted_model.predict(X_test))
        error_rate = self._test_error_rate(y_test, y_hat, prediction_interval)
        return prediction_interval, error_rate

    def get_calibration_curve(self, X_test, y_test, get_inter=False):
        """Plot the empirical test error rate against the target error rate.

        Target error rates run from 0.01 to 0.99 in steps of 0.01. A valid
        predictor yields a curve on or below the diagonal.

        Parameters
        ----------
        X_test, y_test : array-like
            Test features and targets used to measure the error rate.
        get_inter : bool, optional
            When true, return the interval half-width for every target
            error rate.

        Returns
        -------
        numpy.ndarray or None
            Half-widths aligned with the target error rates when
            ``get_inter`` is true, otherwise ``None``.

        Raises
        ------
        ValueError
            If the test data is missing or empty.
        """
        y_test = self._as_test_targets(X_test, y_test)
        error_rates = np.arange(0.01, 1, 0.01)

        # Scores and test predictions do not depend on the error rate, so
        # compute them once instead of refitting every fold per rate.
        scores = self._sorted_nonconformity_scores()
        y_hat = np.ravel(self.fitted_model.predict(X_test))

        all_inter = np.array(
            [self._interval_half_width(scores, e) for e in error_rates]
        )
        test_error_rates = np.array(
            [self._test_error_rate(y_test, y_hat, c) for c in all_inter]
        )

        plt.plot(error_rates, test_error_rates)  # plot the calibration curve
        plt.xlabel('error rate')
        plt.ylabel('test error rate')
        plt.title('Calibration curve')
        plt.show()

        return all_inter if get_inter else None