# classifier

## Evaluate a single classifier

In [5]:
# exercise 1.5.1
import numpy as np
import pandas as pd

# Load the Iris csv data using the Pandas library
filename = './data/iris.csv'
df = pd.read_csv(filename)

# Pandas returns a dataframe, (df) which could be used for handling the data.
# We will however convert the dataframe to numpy arrays for this course as 
# is also described in the table in the exercise
raw_data = df.get_values() 

# Notice that raw_data both contains the information we want to store in an array
# X (the sepal and petal dimensions) and the information that we wish to store 
# in y (the class labels, that is the iris species).

# We start by making the data matrix X by indexing into data.
# We know that the attributes are stored in the four columns from inspecting 
# the file.
cols = range(0, 4) 
X = raw_data[:, cols]

# We can extract the attribute names that came from the header of the csv
attributeNames = np.asarray(df.columns[cols])

# Before we can store the class index, we need to convert the strings that
# specify the class of a given object to a numerical value. We start by 
# extracting the strings for each sample from the raw data loaded from the csv:
classLabels = raw_data[:,-1] # -1 takes the last column
# Then determine which classes are in the data by finding the set of 
# unique class labels 
classNames = np.unique(classLabels)
# We can assign each type of Iris class with a number by making a
# Python dictionary as so:
classDict = dict(zip(classNames,range(len(classNames))))
# The function zip simply "zips" togetter the classNames with an integer,
# like a zipper on a jacket. 
# For instance, you could zip a list ['A', 'B', 'C'] with ['D', 'E', 'F'] to
# get the pairs ('A','D'), ('B', 'E'), and ('C', 'F'). 
# A Python dictionary is a data object that stores pairs of a key with a value. 
# This means that when you call a dictionary with a given key, you 
# get the stored corresponding value. Try highlighting classDict and press F9.
# You'll see that the first (key, value)-pair is ('Iris-setosa', 0). 
# If you look up in the dictionary classDict with the value 'Iris-setosa', 
# you will get the value 0. Try it with classDict['Iris-setosa']

# With the dictionary, we can look up each data objects class label (the string)
# in the dictionary, and determine which numerical value that object is 
# assigned. This is the class index vector y:
y = np.array([classDict[cl] for cl in classLabels])
# In the above, we have used the concept of "list comprehension", which
# is a compact way of performing some operations on a list or array.
# You could read the line  "For each class label (cl) in the array of 
# class labels (classLabels), use the class label (cl) as the key and look up
# in the class dictionary (classDict). Store the result for each class label
# as an element in a list (because of the brackets []). Finally, convert the 
# list to a numpy array". 
# Try running this to get a feel for the operation: 
# list = [0,1,2]
# new_list = [element+10 for element in list]

# We can determine the number of data objects and number of attributes using 
# the shape of X
N, M = X.shape

# Finally, the last variable that we need to have the dataset in the 
# "standard representation" for the course, is the number of classes, C:
C = len(classNames)



In [10]:
from matplotlib.pyplot import figure, plot, xlabel, ylabel, show
import numpy as np
from scipy.io import loadmat
from sklearn.neighbors import KNeighborsClassifier
from sklearn import model_selection

# requires data from exercise 1.5.1
#from ex1_5_1 import *

# This script crates predictions from three KNN classifiers using cross-validation

# Maximum number of neighbors
L=[1, 20, 80]

CV = model_selection.LeaveOneOut()
i=0

# store predictions.
yhat = []
y_true = []
for train_index, test_index in CV.split(X, y):
    print('Crossvalidation fold: {0}/{1}'.format(i+1,N))    
    
    # extract training and test set for current CV fold
    X_train = X[train_index,:]
    y_train = y[train_index]
    X_test = X[test_index,:]
    y_test = y[test_index]

    # Fit classifier and classify the test points (consider 1 to 40 neighbors)
    dy = []
    for l in L:
        knclassifier = KNeighborsClassifier(n_neighbors=l)
        knclassifier.fit(X_train, y_train)
        y_est = knclassifier.predict(X_test)

        dy.append( y_est )
        # errors[i,l-1] = np.sum(y_est[0]!=y_test[0])
    dy = np.stack(dy, axis=1)
    yhat.append(dy)
    y_true.append(y_test)
    i+=1

yhat = np.concatenate(yhat)
y_true = np.concatenate(y_true)
yhat[:,0] # predictions made by first classifier.
# Compute accuracy here.


Crossvalidation fold: 1/150
Crossvalidation fold: 2/150
Crossvalidation fold: 3/150
Crossvalidation fold: 4/150
Crossvalidation fold: 5/150
Crossvalidation fold: 6/150
Crossvalidation fold: 7/150
Crossvalidation fold: 8/150
Crossvalidation fold: 9/150
Crossvalidation fold: 10/150
Crossvalidation fold: 11/150
Crossvalidation fold: 12/150
Crossvalidation fold: 13/150
Crossvalidation fold: 14/150
Crossvalidation fold: 15/150
Crossvalidation fold: 16/150
Crossvalidation fold: 17/150
Crossvalidation fold: 18/150
Crossvalidation fold: 19/150
Crossvalidation fold: 20/150
Crossvalidation fold: 21/150
Crossvalidation fold: 22/150
Crossvalidation fold: 23/150
Crossvalidation fold: 24/150
Crossvalidation fold: 25/150
Crossvalidation fold: 26/150
Crossvalidation fold: 27/150
Crossvalidation fold: 28/150
Crossvalidation fold: 29/150
Crossvalidation fold: 30/150
Crossvalidation fold: 31/150
Crossvalidation fold: 32/150
Crossvalidation fold: 33/150
Crossvalidation fold: 34/150
Crossvalidation fold: 3

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [9]:
from toolbox_02450 import jeffrey_interval
#from ex7_1_1 import *

# Compute the Jeffreys interval
alpha = 0.05
[thetahatA, CIA] = jeffrey_interval(y_true, yhat[:,0], alpha=alpha)

print("Theta point estimate", thetahatA, " CI: ", CIA)

Theta point estimate 0.956953642384106  CI:  (0.9194225123023887, 0.9831344032786383)


In [6]:
help(jeffrey_interval)

Help on function jeffrey_interval in module toolbox_02450.statistics:

jeffrey_interval(y, yhat, alpha=0.05)



## Compare two classifiers

In [11]:
from toolbox_02450 import mcnemar
#from ex7_1_1 import *

# Compute the Jeffreys interval
alpha = 0.05
[thetahat, CI, p] = mcnemar(y_true, yhat[:,0], yhat[:,1], alpha=alpha)

print("theta = theta_A-theta_B point estimate", thetahat, " CI: ", CI, "p-value", p)


Result of McNemars test using alpha= 0.05
Comparison matrix n
[[143.   1.]
 [  4.   2.]]
Approximate 1-alpha confidence interval of theta: [thetaL,thetaU] =  (-0.040463902136215535, 0.00047217032034696516)
p-value for two-sided test A and B have same accuracy (exact binomial test): p= 0.375
theta = theta_A-theta_B point estimate -0.02  CI:  (-0.040463902136215535, 0.00047217032034696516) p-value 0.375



# Regression model

## Compare two regression models

In [1]:
# exercise 5.1.5
import numpy as np
from scipy.io import loadmat

# Load Matlab data file and extract variables of interest
mat_data = loadmat('./data/wine.mat')
X = mat_data['X']
y = mat_data['y'].astype(int).squeeze()
C = mat_data['C'][0,0]
M = mat_data['M'][0,0]
N = mat_data['N'][0,0]

attributeNames = [i[0][0] for i in mat_data['attributeNames']]
classNames = [j[0] for i in mat_data['classNames'] for j in i]


# Remove outliers
outlier_mask = (X[:,1]>20) | (X[:,7]>10) | (X[:,10]>200)
valid_mask = np.logical_not(outlier_mask)
X = X[valid_mask,:]
y = y[valid_mask]
# Remove attribute 12 (Quality score)
X = X[:,0:11]
attributeNames = attributeNames[0:11]
# Update N and M
N, M = X.shape

print('Ran Exercise 5.1.5')

Ran Exercise 5.1.5


In [2]:
from matplotlib.pyplot import figure, plot, xlabel, ylabel, show
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn import model_selection
import sklearn.tree
import scipy.stats
import numpy as np, scipy.stats as st

# requires data from exercise 5.1.5
#from ex5_1_5 import *

X,y = X[:,:10], X[:,10:]
# This script crates predictions from three KNN classifiers using cross-validation

test_proportion = 0.2

X_train, X_test, y_train, y_test = model_selection.train_test_split(X,y,test_size=test_proportion)

mA = sklearn.linear_model.LinearRegression().fit(X_train,y_train)
mB = sklearn.tree.DecisionTreeRegressor().fit(X_train, y_train)

yhatA = mA.predict(X_test)
yhatB = mB.predict(X_test)[:,np.newaxis]  #  justsklearnthings

# perform statistical comparison of the models
# compute z with squared error.
zA = np.abs(y_test - yhatA ) ** 2

# compute confidence interval of model A
alpha = 0.05
CIA = st.t.interval(1-alpha, df=len(zA)-1, loc=np.mean(zA), scale=st.sem(zA))  # Confidence interval


zB = np.abs(y_test - yhatB ) ** 2
alpha = 0.05
CIB = st.t.interval(1-alpha, df=len(zB)-1, loc=np.mean(zB), scale=st.sem(zB))  # Confidence interval

# Compute confidence interval of z = zA-zB and p-value of Null hypothesis
z = zA - zB
CI = st.t.interval(1-alpha, len(z)-1, loc=np.mean(z), scale=st.sem(z))  # Confidence interval
p = st.t.cdf( -np.abs( np.mean(z) )/st.sem(z), df=len(z)-1)  # p-value

print("CIA",CIA)
print("CIB",CIB)
print("CI",CI)
print("P",p)

CIA (array([0.22305893]), array([0.27726055]))
CIB (array([0.23306005]), array([0.35120085]))
CI (array([-0.10221926]), array([0.01827784]))
P [0.08598555]
