# *Loading and Splitting Datasets*

In [134]:
from sklearn.datasets import load_iris
import numpy as np

# Load the iris dataset and display the (samples, features) shape of its data matrix.
iris = load_iris()
iris["data"].shape

(150, 4)

In [135]:
# Columns 0-33 of each row are used as the features and column 34 as the label.
# Parse the file a single time and slice the result, rather than reading and
# parsing the same file from disk twice.
ion_data = np.genfromtxt("ionosphere.txt", delimiter=",", usecols=np.arange(35))
ion_X = ion_data[:, :34]
ion_y = ion_data[:, 34]



In [136]:
from sklearn.model_selection import train_test_split

# Split the ionosphere data into train and test parts; the fixed seed keeps
# the split (and therefore the error rates below) reproducible.
(ion_X_train, ion_X_test,
 ion_y_train, ion_y_test) = train_test_split(ion_X, ion_y, random_state=13)

In [137]:
# Same seeded split for the iris data and its target labels.
(iris_X_train, iris_X_test,
 iris_y_train, iris_y_test) = train_test_split(iris.data, iris.target, random_state=13)

In [138]:
def NearestNeighbor2(X_train, X_test, y_train, y_test):
    """Print the 1-nearest-neighbour test error using a mean-difference dissimilarity.

    The dissimilarity between two samples is the absolute value of the mean of
    their signed feature differences. Positive and negative differences can
    cancel each other out, so two very different samples may be rated as close;
    this is kept as-is because it is what distinguishes this variant.

    Parameters:
        X_train: indexable collection of training samples (numpy rows).
        X_test: indexable collection of test samples (numpy rows).
        y_train: labels of the training samples.
        y_test: true labels of the test samples.

    Returns:
        None. The error count and the error rate are printed.

    Raises:
        ZeroDivisionError: if X_test is empty.
    """
    err_no = 0  # error counter
    for i in range(len(X_test)):  # check each test sample ...
        # Start from infinity so that ANY finite dissimilarity is accepted, and
        # reset the neighbour index so a previous sample's result cannot leak in.
        difference = float("inf")
        nn_index = 0
        for j in range(len(X_train)):  # ... against all training samples
            sum_average = abs(sum(X_train[j] - X_test[i]) / len(X_train[j]))
            if difference > sum_average:
                difference = sum_average
                nn_index = j
        # a mismatch between the neighbour's label and the true label is an error
        if y_train[nn_index] != y_test[i]:
            err_no += 1
    # report the number of errors and their percentage of the test set
    print("The number of errors of the training set on the test set is: " + str(err_no))
    print("test error rate: " + str((err_no / len(X_test)) * 100) + "%")

In [139]:
# Mean-difference nearest neighbour on the ionosphere split.
NearestNeighbor2(
    X_train=ion_X_train,
    X_test=ion_X_test,
    y_train=ion_y_train,
    y_test=ion_y_test,
)

The number of errors of the training set on the test set is: 23
test error rate: 26.136363636363637%


In [140]:
from math import sqrt

def NearestNeighbor1(X_train, X_test, y_train, y_test):
    """Print the 1-nearest-neighbour test error using the Euclidean distance.

    Each test sample is given the label of the training sample with the
    smallest Euclidean distance to it (the first such sample on ties), and
    that label is compared with the true label.

    Parameters:
        X_train: indexable collection of training samples (numpy rows).
        X_test: indexable collection of test samples (numpy rows).
        y_train: labels of the training samples.
        y_test: true labels of the test samples.

    Returns:
        None. The error count and the error rate are printed.

    Raises:
        ZeroDivisionError: if X_test is empty.
    """
    err_no = 0  # error counter
    for i in range(len(X_test)):  # check each test sample ...
        # Start from infinity so that ANY finite distance is accepted, and
        # reset the neighbour index so a previous sample's result cannot leak in.
        difference = float("inf")
        nn_index = 0
        for j in range(len(X_train)):  # ... against all training samples
            # Euclidean norm of the feature-wise difference; no abs() is
            # needed because every component is squared.
            offset = X_train[j] - X_test[i]
            euc_norm = sqrt(sum(offset ** 2))
            if difference > euc_norm:
                difference = euc_norm
                nn_index = j
        # a mismatch between the neighbour's label and the true label is an error
        if y_train[nn_index] != y_test[i]:
            err_no += 1
    # report the number of errors and their percentage of the test set
    print("The number of errors of the training set on the test set is: " + str(err_no))
    print("test error rate: " + str((err_no / len(X_test)) * 100) + "%")

In [141]:
# Euclidean nearest neighbour on the ionosphere split.
NearestNeighbor1(
    X_train=ion_X_train,
    X_test=ion_X_test,
    y_train=ion_y_train,
    y_test=ion_y_test,
)

The number of errors of the training set on the test set is: 13
test error rate: 14.772727272727273%


In [142]:
# Euclidean nearest neighbour on the iris split.
NearestNeighbor1(
    X_train=iris_X_train,
    X_test=iris_X_test,
    y_train=iris_y_train,
    y_test=iris_y_test,
)

The number of errors of the training set on the test set is: 2
test error rate: 5.263157894736842%


# Conformity Measure


## First, we break each formula down into its components and implement each component as a function

### We take the conformity score, $\alpha$, to be:
### $\alpha = \dfrac{\text{distance to the nearest sample of a different class}}{\text{distance to the nearest sample of the same class}}$

In [251]:


def Nearest_Same(X_train, X_test, y_train, y_test):
    """Return the Euclidean distance from one sample to its nearest same-class neighbour.

    Despite their names, X_test is a SINGLE sample and y_test is the single
    label assumed for it.

    Parameters:
        X_train: indexable collection of training samples (numpy rows).
        X_test: one sample (numpy vector).
        y_train: labels of the training samples.
        y_test: the label whose class is searched.

    Returns:
        The smallest distance to a training sample labelled y_test, or
        float("inf") when no training sample carries that label.
    """
    # Infinity (not an arbitrary large number) so that real distances of any
    # size are accepted and "no candidate" stays distinguishable.
    distance = float("inf")
    for i in range(len(X_train)):
        # only training samples of the same class are candidates
        if y_train[i] == y_test:
            offset = X_train[i] - X_test
            euc_norm = sqrt(sum(offset ** 2))
            if distance > euc_norm:
                distance = euc_norm
    return distance

def Nearest_Different(X_train, X_test, y_train, y_test):
    """Return the Euclidean distance from one sample to its nearest other-class neighbour.

    Despite their names, X_test is a SINGLE sample and y_test is the single
    label assumed for it.

    Parameters:
        X_train: indexable collection of training samples (numpy rows).
        X_test: one sample (numpy vector).
        y_train: labels of the training samples.
        y_test: the label whose class is excluded from the search.

    Returns:
        The smallest distance to a training sample whose label differs from
        y_test (0.0 if such a sample coincides with X_test), or float("inf")
        when every training sample carries the label y_test.
    """
    # Infinity (not an arbitrary large number) so that real distances of any
    # size are accepted and "no candidate" stays distinguishable.
    distance = float("inf")
    for i in range(len(X_train)):
        # only training samples of a different class are candidates
        if y_train[i] != y_test:
            offset = X_train[i] - X_test
            # A zero distance is a legitimate minimum and must be recorded,
            # not used as a reason to abandon the scan.
            euc_norm = sqrt(sum(offset ** 2))
            if distance > euc_norm:
                distance = euc_norm
    return distance

def conformity_score(distance_different, distance_same):
    """Return the conformity score distance_different / distance_same.

    A larger score means the sample sits closer to its own class than to any
    other class.

    Parameters:
        distance_different: distance to the nearest sample of another class.
        distance_same: distance to the nearest sample of the same class.

    Returns:
        The ratio of the two distances. When distance_same is 0 the ratio is
        undefined, so float("inf") is returned for a positive
        distance_different instead of raising ZeroDivisionError.
    """
    if distance_same == 0:
        # NOTE(review): 0/0 (the sample coincides with samples of both classes)
        # is scored as 0.0, i.e. least conforming - confirm this convention.
        return float("inf") if distance_different > 0 else 0.0
    return distance_different / distance_same

from bisect import bisect

def find_rank(scores, sample):
    """Return the rank of a conformity score among a collection of scores.

    The rank is the number of entries in `scores` that are less than or equal
    to `sample`, plus one for the sample itself. This is the numerator of the
    conformal p-value.

    Parameters:
        scores: iterable of conformity scores; it is NOT modified and does not
            need to be sorted.
        sample: the conformity score to rank.

    Returns:
        An int between 1 and len(scores) + 1.
    """
    # Counting directly gives the same value as bisecting a sorted copy, but
    # avoids re-sorting (and reordering) the caller's list on every call.
    return sum(1 for score in scores if score <= sample) + 1


def Conform_Score_Train(X_train, y_train):
    """Compute a leave-one-out conformity score for every training sample.

    Each sample is scored against the training set with that sample (and its
    label) removed. The scores are later ranked to obtain p-values.

    Parameters:
        X_train: numpy array of training samples, one per row.
        y_train: numpy array of the matching labels.

    Returns:
        A list with one conformity score per training sample, in the order of
        the rows of X_train.
    """
    def score_without(idx):
        # training samples and labels with the idx-th entry left out
        rest_X = np.delete(X_train, idx, axis=0)
        rest_y = np.delete(y_train, idx, axis=0)
        held_out, held_label = X_train[idx], y_train[idx]

        # the score needs the distances to the nearest same / different class sample
        nearest_same = Nearest_Same(rest_X, held_out, rest_y, held_label)
        nearest_other = Nearest_Different(rest_X, held_out, rest_y, held_label)
        return conformity_score(nearest_other, nearest_same)

    return [score_without(idx) for idx in range(len(X_train))]

In [252]:
def p_value(rank, number_samples):
    """Return the p-value: the rank divided by the number of samples."""
    fraction = rank / number_samples
    return fraction

In [253]:
def Conformal_Predictor(X_train, X_test, y_train, y_test):
    """Report the average false p-value of a nearest-neighbour conformal predictor.

    For every test sample and every label that is NOT its true label, the
    sample is assumed to carry that label, its conformity score is ranked
    among the training conformity scores, and the resulting p-value is
    accumulated. The total is averaged over all (test sample, false label)
    pairs.

    Parameters:
        X_train: numpy array of training samples, one per row.
        X_test: numpy array of test samples, one per row.
        y_train: numpy array of training labels.
        y_test: true labels of the test samples.

    Returns:
        A message string ending with the average false p-value.

    Raises:
        ZeroDivisionError: if X_test is empty or y_train holds a single class.
    """
    # denominator of the p-value formula: the training samples plus the test sample
    n_plus_one = len(X_train) + 1

    # running sum of all the false p-values
    total_false = 0.0

    # the distinct labels present in the training set
    classes = np.unique(y_train)

    # conformity scores of the training set, used for ranking
    conform_scores = Conform_Score_Train(X_train, y_train)

    for s in range(len(X_test)):
        for label in classes:
            # only labels the test sample does NOT have yield false p-values
            if y_test[s] == label:
                continue
            # Assume the test sample has this label. Both distances must be
            # computed against the label VALUE (not its position in `classes`),
            # otherwise "different class" is judged against the wrong label.
            test_nearest_same = Nearest_Same(X_train, X_test[s], y_train, label)
            test_nearest_different = Nearest_Different(X_train, X_test[s], y_train, label)
            # the nearest-same distance is the denominator of the score, so
            # zero values are skipped
            # NOTE(review): a skipped pair contributes 0 to the total although
            # it is still counted in the average below - confirm this is intended.
            if test_nearest_same != 0:
                test_conform_score = conformity_score(test_nearest_different, test_nearest_same)
                rank = find_rank(conform_scores, test_conform_score)
                total_false += p_value(rank, n_plus_one)

    # every test sample has (number of classes - 1) false labels
    average_false = total_false / (len(X_test) * (len(classes) - 1))

    return "The average false p-values for this test sample-set is: " + str(average_false)

In [254]:
# Average false p-value of the conformal predictor on the ionosphere split.
Conformal_Predictor(
    X_train=ion_X_train,
    X_test=ion_X_test,
    y_train=ion_y_train,
    y_test=ion_y_test,
)

'The average false p-values for this test sample-set is: 0.06628787878787883'