In [27]:
'''
Created on Jan 22, 2019

Some part of the code is based on this tutorial:
Tutorial To Implement k-Nearest Neighbors in Python From Scratch
(https://machinelearningmastery.com/tutorial-to-implement-k-nearest-neighbors-in-python-from-scratch/)

@author: Diego Sanz Villafruela
'''
import csv
import random
import math
import operator
import numpy as np
from sklearn import preprocessing
from sklearn import neighbors

In [28]:
"""
Converting from one scale to another scale.
"""
def convertRange(x,a,b,c,d):
    return c + ((d-c)/(b-a) ) * (x - a)

print(convertRange(3,2,4,5,10))
print(convertRange(7.5,5,10,2,4))

7.5
3.0


# C-index


In [29]:
"""
Performance measure that indicates how well the model captures the relative ordering/ranking
of the data points.
C-index is measured from 0 to 1, with 0.5 meaning the model wasn't able to capture any information
from the data.
"""
def getCIndex(labels, predictions):
    if labels is None or predictions is None:
        raise Exception("Illegal argument exception")
    if len(labels) != len(predictions):
        raise Exception("The number of labels is not the same to the number of predictions")
        
    size = len(labels)
    h_num = 0
    n = 0
    for i in range(size):
        li = labels[i]
        pi = predictions[i]
        for j in range(i+1,size):
            lj = labels[j]
            pj = predictions[j]
            if ( li != lj):
                n += 1
                if (pi < pj and li < lj) or (pi > pj and li > lj):
                    h_num += 1
                elif pi == pj:
                    h_num += 0.5 # -1 0.5 1
    return h_num/n

labels = [-1, 1, 1, -1, 1]
predictions =  [0.60, 0.80, 0.75, 0.75, 0.70]

print ( "CIndex ", getCIndex(labels,predictions))

CIndex  0.75


# Getting data set from File

In [30]:
"""
Gets samples from CSV file.
"""                
def getSamples(filename):
    samples = []
    with open(filename) as f:
        linesIter = iter(f.readlines())

        next(linesIter) # c_total,Cd,Pb,Mod1,Mod2,Mod3
        for line in linesIter:
            line = line.rstrip("\n\r")
            sample = []
            for value in line.split(","):
                sample.append(float(value)) 
            samples.append(sample)
    return samples

"""
The data is normalized using z-score = (x - Mean) / Standard deviation
"""
def getNormalizedSamples(filename):
    samples = getSamples(r"Water_data.csv")
    samples = np.array(samples)
    
    mod1 = preprocessing.scale(samples[:,3:4])
    mod2 = preprocessing.scale(samples[:,4:5])
    mod3 = preprocessing.scale(samples[:,5:6])

    samples[:,3:4] = mod1
    samples[:,4:5] = mod2
    samples[:,5:6] = mod3
    
    return samples


samples = getNormalizedSamples(r"Water_data.csv")

print (samples)



[[  0.00000000e+00   0.00000000e+00   0.00000000e+00  -9.71720942e-01
   -6.69818294e-01  -3.35076304e-01]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00  -9.63093411e-01
   -6.69990860e-01  -1.60829666e-01]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00  -9.62826686e-01
   -6.69732011e-01   9.48750650e-02]
 ..., 
 [  5.00000000e+03   5.00000000e+03   0.00000000e+00  -8.41559099e-01
   -2.89482626e-01   5.67577607e-01]
 [  5.00000000e+03   5.00000000e+03   0.00000000e+00  -8.41333408e-01
   -2.94659608e-01   8.41407787e-01]
 [  5.00000000e+03   5.00000000e+03   0.00000000e+00  -8.34398556e-01
   -3.14159577e-01   7.77796794e-01]]


# Standardization

When independent variables in training data are measured in different units, it is important to standardize variables before calculating distance. For example, if one variable is based on height in cms, and the other is based on weight in kgs, then height will influence more on the distance calculation.


In [31]:
"""
Calculates the Ecludean distance between 2 samples. 
It is mainly used when data is continuous.
Note:
Discrete data can only take particular values whereas Continuous data are not restricted
to defined separate values, but can occupy any value over a continuous range.
"""
def euclideanDistance(instance1, instance2, origin, end):
    distance = 0
    for propertyIndex in range(origin, end):
        distance += pow((instance1[propertyIndex] - instance2[propertyIndex]), 2)
    return math.sqrt(distance)

"""
Gets the k closest neighbors from the trainingSet. 
"""
def getNeighbors(trainingSet, testInstance, k):
    distances = []
    origin = 3
    end = len(testInstance)
    for x in range(len(trainingSet)):
        dist = euclideanDistance(testInstance, trainingSet[x], origin, end)
        distances.append((trainingSet[x], dist))
    distances.sort(key=operator.itemgetter(1))
    
    return list(map(lambda neighbor : neighbor[0], distances[:k]))

def getResponse(neighbors):
    length = len(neighbors)
    if length == 0:
        raise Exception("The number of elements is 0")
        
    c_total = 0
    Cd = 0
    Pb = 0
    for neighbor in neighbors:
        c_total += neighbor[0]
        Cd += neighbor[1]
        Pb += neighbor[2]
    
    return {"c_total": c_total/length, "Cd" : Cd/length, "Pb" : Pb/length}
    
"""
Calculates the accuracy of a kNearestNeighbors classifier.
"""
def kNearestNeighbors(trainingData, testData, k = 1):
    if k > len(trainingData):
        raise Exception("Elements are less than k: {}".format(k))
        
    predictions = {"c_total" : [], "Cd" : [], "Pb" : []}
    for testInstance in testData:
        neighbors = getNeighbors(trainingData, testInstance, k)
        
        response = getResponse(neighbors)
        
        predictions["c_total"].append(response["c_total"])
        predictions["Cd"].append(response["Cd"])
        predictions["Pb"].append(response["Pb"])
        
    return predictions
    
def scikit_kNearestNeighbors(trainingData, testData, k = 1):
    inputValues = trainingData[:, 3:]
    outputValues = trainingData[:, 0:3]

    knn = neighbors.KNeighborsRegressor(k, weights='uniform')
    predictions = knn.fit(inputValues, outputValues).predict(testData[:, 3:])
    
    return {"c_total" : list(predictions[:,0]), "Cd" : list(predictions[:,1]), "Pb" : list(predictions[:,2])}

samples = getNormalizedSamples(r"Water_data.csv")

    
trainingData = samples[2:]
testData = samples[:2]

print (kNearestNeighbors (trainingData, testData, 5))
print("-----")
print(scikit_kNearestNeighbors(trainingData, testData, 5))

{'c_total': [25.199999999999999, 39.600000000000001], 'Cd': [9.5199999999999996, 2.2399999999999998], 'Pb': [15.680000000000001, 37.359999999999999]}
-----
{'c_total': [25.199999999999999, 39.600000000000001], 'Cd': [9.5199999999999996, 2.2399999999999998], 'Pb': [15.680000000000001, 37.359999999999999]}


In [32]:
"""
Evaluation of the predictionPerformance using crossValidation.

samples: independent data set
"""
def crossValidation(samples, foldSize, predictionMethod, neighborNumber = 1):
    k = len(samples) // foldSize

    sumAvgCIndexes = 0
    
    predictions = {"c_total" : [], "Cd" : [], "Pb" : []}
    labels = {"c_total" : [], "Cd" : [], "Pb" : []}
    
    for fold in range(0,k):
        beginningFold = fold * foldSize
        endFold = beginningFold + foldSize
    
        trainingData = np.concatenate((samples[:beginningFold],samples[endFold:]),axis=0)
        testData = samples[beginningFold:endFold]
    
        # predictions
        c_total, Cd, Pb = predictionMethod(trainingData,testData,neighborNumber).values()

        predictions["c_total"].extend(c_total)
        predictions["Cd"].extend(Cd)
        predictions["Pb"].extend(Pb)
        
        # labels
        labels["c_total"].extend(testData[:,0])
        labels["Cd"].extend(testData[:,1])
        labels["Pb"].extend(testData[:,2])
        
    # c-indexes
    c_totalIndex = getCIndex(labels["c_total"],  predictions["c_total"])
    CdIndex = getCIndex(labels["Cd"],  predictions["Cd"])
    PbIndex = getCIndex(labels["Pb"], predictions["Pb"])
    
    return np.array([c_totalIndex, CdIndex, PbIndex])

In [33]:
samples = getNormalizedSamples(r"Water_data.csv")
foldSize = 3
neighborNumber = 1
crossValidation(samples, foldSize, kNearestNeighbors, neighborNumber)

array([ 0.81500489,  0.74102846,  0.73626805])

In [34]:
def bestCVNeighbors(samples,folds,neighbors):
    cIndexes = {}
    for fold in folds:
        for neighbor in neighbors:
            cIndexes[fold,neighbor] = crossValidation(samples, fold, kNearestNeighbors, neighbor)
            print ("Fold: {}  NeighborNumber: {} = {}  mean = {}".format(fold,neighbor,cIndexes[fold,neighbor],np.array(cIndexes[fold,neighbor]).mean()))
            
        print ("")
        
bestCVNeighbors (samples,[1,3],[1,2,3,4,5])          

Fold: 1  NeighborNumber: 1 = [ 0.89866406  0.90058067  0.86636849]  mean = 0.8885377374989184
Fold: 1  NeighborNumber: 2 = [ 0.90561529  0.90262084  0.87238439]  mean = 0.8935401746439958
Fold: 1  NeighborNumber: 3 = [ 0.90360595  0.87769408  0.84947165]  mean = 0.87692389234671
Fold: 1  NeighborNumber: 4 = [ 0.89195721  0.85159029  0.84507742]  mean = 0.8628749731279953
Fold: 1  NeighborNumber: 5 = [ 0.88041707  0.82540804  0.83035154]  mean = 0.8453922156993051

Fold: 3  NeighborNumber: 1 = [ 0.81500489  0.74102846  0.73626805]  mean = 0.7641004643768753
Fold: 3  NeighborNumber: 2 = [ 0.81573803  0.74550115  0.75188324]  mean = 0.771040805151734
Fold: 3  NeighborNumber: 3 = [ 0.82005539  0.73535258  0.75274639]  mean = 0.7693847891056445
Fold: 3  NeighborNumber: 4 = [ 0.82268926  0.72214375  0.75944235]  mean = 0.7680917880176734
Fold: 3  NeighborNumber: 5 = [ 0.81516781  0.72065286  0.75593743]  mean = 0.7639193659863729



In [35]:
"""
Which crossValidation gives the best Accuracy.
"""
folds_cIndexesMeans = {}
for fold in range(1,len(samples)-1):
    cIndexesMean = crossValidation(samples, fold, kNearestNeighbors).mean()
    folds_cIndexesMeans[fold] = cIndexesMean

maxFold = max(folds_cIndexesMeans, key=folds_cIndexesMeans.get)   
minFold = min(folds_cIndexesMeans, key=folds_cIndexesMeans.get)  

print ("MaxFold: " + str(maxFold) + " C-index: " + str(folds_cIndexesMeans[maxFold]))
print ("MinFold: " + str(minFold) + " C-index: " + str(folds_cIndexesMeans[minFold]))

MaxFold: 1 C-index: 0.888537737499
MinFold: 128 C-index: 0.35303614108


In [36]:
"""
Which k-neighbor gives the best Accuracy.
"""
def cvBestNeighbor(samples,foldSize):
    neighbor_cIndexesMeans = {}
    for neighbor in range(1,len(samples)-foldSize):
        cIndexesMean = crossValidation(samples, foldSize, kNearestNeighbors, neighbor).mean()
        neighbor_cIndexesMeans[neighbor] = cIndexesMean

    maxNeighbor = max(neighbor_cIndexesMeans, key=neighbor_cIndexesMeans.get)   
    minNeighbor = min(neighbor_cIndexesMeans, key=neighbor_cIndexesMeans.get)  

    print ("NeighborNumber: " + str(maxFold) + " C-index: " + str(neighbor_cIndexesMeans[maxNeighbor]))
    print ("NeighborNumber: " + str(minFold) + " C-index: " + str(neighbor_cIndexesMeans[minNeighbor]))

print("Leave-One-Out Cross Validation")
cvBestNeighbor(samples,1)
print("Leave-Three-Out Cross Validation")
cvBestNeighbor(samples,3)

Leave-One-Out Cross Validation 
NeighborNumber: 1 C-index: 0.893540174644
NeighborNumber: 128 C-index: 0.154174917471
Leave-Three-Out Cross Validation
NeighborNumber: 1 C-index: 0.771040805152
NeighborNumber: 128 C-index: 0.113521972249


#  which evaluation approach generalize better? Why? 

Leave-One-Out Cross Validation
<pre>
Fold: 1  NeighborNumber: 1 = [ 0.89866406  0.90058067  0.86636849]  mean = 0.8885377374989184
Fold: 1  NeighborNumber: 2 = [ 0.90561529  0.90262084  0.87238439]  mean = 0.8935401746439958
Fold: 1  NeighborNumber: 3 = [ 0.90360595  0.87769408  0.84947165]  mean = 0.87692389234671
Fold: 1  NeighborNumber: 4 = [ 0.89195721  0.85159029  0.84507742]  mean = 0.8628749731279953
Fold: 1  NeighborNumber: 5 = [ 0.88041707  0.82540804  0.83035154]  mean = 0.8453922156993051
</pre>
Leave-Three-Out Cross Validation
<pre>
Fold: 3  NeighborNumber: 1 = [ 0.81500489  0.74102846  0.73626805]  mean = 0.7641004643768753
Fold: 3  NeighborNumber: 2 = [ 0.81573803  0.74550115  0.75188324]  mean = 0.771040805151734
Fold: 3  NeighborNumber: 3 = [ 0.82005539  0.73535258  0.75274639]  mean = 0.7693847891056445
Fold: 3  NeighborNumber: 4 = [ 0.82268926  0.72214375  0.75944235]  mean = 0.7680917880176734
Fold: 3  NeighborNumber: 5 = [ 0.81516781  0.72065286  0.75593743]  mean = 0.7639193659863729
</pre>
As it can seem, the Leave-One-Out Cross Validation got better c-index than Leave-Three-Out Cross Validation.

The reason why the Leave-One-Out got better C-index is because there are replicas in our data.
Every 3 samples correspond to the same mixture, so we are biased using the Leave-one out CV.
The correct way of doing it is using Leave-Three-Out.

For this reason, Leave-Three-Out generalizes better than Leave-One-Out in this case of study.