# Leren: Programming assignment 4

**Student 1:**  <span style="color:red">Wim Berkelmans</span> (<span style="color:red">10793674</span>)<br>
**Student 2:** <span style="color:red">Philip Bouman</span> (<span style="color:red">10668667</span>)<br>

-----------------------------------

In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import numpy.linalg as linalg
import scipy.ndimage
import operator
import math
import sklearn
from sklearn import linear_model, datasets
from sklearn.neural_network import MLPClassifier
from sklearn import neighbors

# Load data from .csv 
def readData():
    training_set = np.loadtxt('digits123-1.csv', skiprows = 1, delimiter=';');
    test_set = np.loadtxt('digits123-2.csv', skiprows = 1, delimiter=';');
    return training_set, test_set

## 1. K-nearest neighbour
####  a) Implementation

In [30]:
# Calculate Euclidian distance between two values
def euclideanDistance(data1, data2):
    dist = 0
    for i in range(len(data1)-1):
        dist += pow((data1[i] - data2[i]), 2)
        
    return math.sqrt(dist)
    
# Get most similar neighbors
def getNeighbors(training_set, test_set, k):
    distances = []
    for i in range(len(training_set)):
        dist = euclideanDistance(training_set[i], test_set)
        distances.append((training_set[i], dist))
        
    distances.sort(key=operator.itemgetter(1))
    
    weights = []
    neighbors = []
    for i in range(k):
        neighbors.append(distances[i][0])
        weights.append(distances[i][-1])
    
    return neighbors, weights

# Get votes
def getResponse(neighbors):
    classVotes = {}
    for i in range(len(neighbors)):
        response = neighbors[i][-1]
        if response in classVotes:
            classVotes[response] += 1
        else:
            classVotes[response] = 1
        sortedVotes = sorted(classVotes.iteritems(), key=operator.itemgetter(1), reverse=True)

    return sortedVotes[0][0]
            
# Count number of correct predictions
def getAccuracy(test_set, target):
    correct = 0
    for i in range(len(test_set)):
        if test_set[i][-1] == target[i]:
            correct += 1
            
    return (correct/float(len(test_set))) * 100.0
    
# Main method
def k_nearest_neighbor(k):
    training_set, test_set = readData()
    predictions = []
    
    for i in range(len(test_set)):
        neighbors, weights = getNeighbors(training_set, test_set[i], k)
        result = getResponse(neighbors)
        predictions.append(result)
    
    accuracy = getAccuracy(test_set, predictions)
    print "K = %(k)d  Accuracy: %(acc)f"  % {"k": k, "acc": accuracy}
    
# run with different k's
k_nearest_neighbor(1)
# k_nearest_neighbor(3)
# k_nearest_neighbor(5)
# k_nearest_neighbor(9)
# k_nearest_neighbor(15)

K = 1  Accuracy: 98.744770 <br>
K = 3  Accuracy: 97.907950 <br>
K = 5  Accuracy: 97.907950 <br>
K = 9  Accuracy: 97.907950 <br>
K = 15  Accuracy: 97.489540 <br>

#### b) Weighted nearest neighbour

In [31]:
def getWeightedResponse(neighbors, weights):
    classVotes = {}
    for i in range(len(neighbors)):
        response = neighbors[i][-1]
        if response in classVotes:
            classVotes[response] += 1.0 / weights[i]
        else:
            classVotes[response] = 1
        sortedVotes = sorted(classVotes.iteritems(), key=operator.itemgetter(1), reverse=True)

    return sortedVotes[0][0]

# Main method
def k_nearest_neighbor_weighted(k):
    training_set, test_set = readData()
    predictions = []
    
    for i in range(len(test_set)):
        neighbors, weights = getNeighbors(training_set, test_set[i], k)
        result = getWeightedResponse(neighbors, weights)
        predictions.append(result)
        
    accuracy = getAccuracy(test_set, predictions)
    print "K = %(k)d  Accuracy: %(acc)f"  % {"k": k, "acc": accuracy}
    
# run with different k's
k_nearest_neighbor_weighted(1)
# k_nearest_neighbor_weighted(3)
# k_nearest_neighbor_weighted(5)
# k_nearest_neighbor_weighted(9)
# k_nearest_neighbor_weighted(15)

K = 1  Accuracy: 98.744770 <br>
K = 3  Accuracy: 97.907950 <br>
K = 5  Accuracy: 98.326360 <br>
K = 9  Accuracy: 97.907950 <br>
K = 15  Accuracy: 97.907950 <br>

#### c) Predictive value

In [36]:
# Get most similar neighbors
def getNeighbors(training_set, test_set, k, accuracies):
    distances = []
    for i in range(len(training_set)):
        dist = euclideanDistanceAlt(training_set[i], test_set, accuracies)
        distances.append((training_set[i], dist))
        
    distances.sort(key=operator.itemgetter(1))
    
    weights = []
    neighbors = []
    for i in range(k):
        neighbors.append(distances[i][0])
        weights.append(distances[i][-1])
    
    return neighbors, weights

# alternated version for predictive method
def getNeighborsAlt(training_set, test_set, k):
    distances = []
    for i in range(len(training_set)):
        dist = pow((training_set[i] - test_set), 2)
        distances.append((training_set[i], dist))
        
    distances.sort(key=operator.itemgetter(1))
    
    weights = []
    neighbors = []
    for i in range(k):
        neighbors.append(distances[i][0])
        weights.append(distances[i][-1])
    
    return neighbors, weights

# alternated version for predictive method
def getResponseAlt(neighbors):
    classVotes = {}
    for i in range(len(neighbors)):
        response = neighbors[i]
        if response in classVotes:
            classVotes[response] += 1
        else:
            classVotes[response] = 1
        sortedVotes = sorted(classVotes.iteritems(), key=operator.itemgetter(1), reverse=True)

    return sortedVotes[0][0]

# Calculate Euclidian distance between two values
def euclideanDistanceAlt(data1, data2, accuracies):
    dist = 0
    for i in range(len(data1)-1):
        dist += accuracies[i] * pow((data1[i] - data2[i]), 2)
    
    return math.sqrt(dist)

def k_nearest_neighbor_predict(k):
    training_set, test_set = readData()
    accuracies = []
    # iterate over all sets of 1 parameter except target value
    # to obtain weights
    for i in range(len(training_set[0])):
        train = training_set[:,i]
        test = test_set[:,i]
        
        length = len(test)
        predictions = []
        for i in range(length):
            neighbors, weights = getNeighborsAlt(train, test[i], k)
            result = getResponseAlt(neighbors)
            predictions.append(result)
    
        accuracy = getAccuracy(test_set, predictions)
        accuracies.append(accuracy)
    
    predictions = []
    
    # apply weights to calculate predictive kNN
    for i in range(len(test_set)):
        neighbors, weights = getNeighbors(training_set, test_set[i], k, accuracies)
        result = getResponse(neighbors)
        predictions.append(result)
    
    accuracy = getAccuracy(test_set, predictions)
    print "K = %(k)d  Accuracy: %(acc)f"  % {"k": k, "acc": accuracy}
    
# run with different k's
k_nearest_neighbor_predict(1)
# k_nearest_neighbor_predict(3)
# k_nearest_neighbor_predict(5)
# k_nearest_neighbor_predict(9)
# k_nearest_neighbor_predict(15)

K = 1  Accuracy: 94.560669


K = 1  Accuracy: 94.560669 <br>
K = 3  Accuracy: 95.397490 <br>
K = 5  Accuracy: 96.234310 <br>
K = 9  Accuracy: 93.723849 <br>
K = 15  Accuracy: 94.142259 <br>

## 2. Compare classifiers
#### a) Implementation

In [5]:
# load data from file
def readDigits(selection):
    if selection == 'train':
        data = np.loadtxt('digits123-1.csv', delimiter=';')
    elif selection == 'test':
        data = np.loadtxt('digits123-2.csv', delimiter=';')
        
    return data

def initTrainTest():
    DigitsTrain = readDigits('train')
    TrainY = DigitsTrain[:,-1] # Y target values, last column of Digits
    Train = np.delete(DigitsTrain, -1, 1) # remove target values
    DigitsTest = readDigits('test')
    TestY = DigitsTest[:,-1] # Y target values, last column of Digits
    Test = np.delete(DigitsTest, -1, 1) # remove target values
    return Train, TrainY, Test, TestY

**Logistic Regression**



In [6]:
def logReg():
    Train, TrainY, Test, TestY = initTrainTest()
    logreg = linear_model.LogisticRegression(C=0.001, solver='newton-cg', 
                                    max_iter=100, multi_class='multinomial')
    logreg.fit(Train, TrainY)

    return logreg.score(Test,TestY)

logReg()

0.95416666666666672

**Neural Network**



In [37]:
def neuralNetwork():
    Train, TrainY, Test, TestY = initTrainTest()
    logreg = MLPClassifier(solver='adam', activation='logistic', 
                           alpha=0.001, hidden_layer_sizes=(50,))
    logreg.fit(Train, TrainY)

    return logreg.score(Test,TestY)

neuralNetwork()

0.94999999999999996

**Nearest Neighbour**



In [8]:
def nearestN(k):
    Train, TrainY, Test, TestY = initTrainTest()
    logreg = neighbors.KNeighborsClassifier(k, weights='distance')
    logreg.fit(Train, TrainY)

    return logreg.score(Test,TestY)

print "NNB k=1 ", nearestN(1)
print "NNB k=3 ", nearestN(3)
print "NNB k=5 ", nearestN(5)
print "NNB k=9 ", nearestN(9)
print "NNB k=15 ", nearestN(15)

NNB k=1  0.9875
NNB k=3  0.983333333333
NNB k=5  0.983333333333
NNB k=9  0.979166666667
NNB k=15  0.975
