# Leren: Programming assignment 5

**Student 1:**  <span style="color:red">Wim Berkelmans</span> (<span style="color:red">10793674</span>)<br>
**Student 2:** <span style="color:red">Philip Bouman</span> (<span style="color:red">10668667</span>)<br>

-----------------------------------

## 1. Gaussian Naive Bayes
####  a) Implementation

In [11]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import numpy.linalg as linalg
import scipy.ndimage
import scipy.stats as sp
import math
from sklearn import linear_model, datasets

def readData(train,test):
    training = np.loadtxt(train, delimiter=';')
    test = np.loadtxt(test, delimiter=';')
    return training, test

# Calculate mean for all features in dataset
# return list of all means
def calcMean(X):
    m = len(X)
    n = len(X[0])
    totalSum = []
    for i in range(n):
        sum = 0
        for j in range(m):
            sum += float(X[j][i])

        totalSum.append(float(sum / len(X)))

    return totalSum

# Calculate variance for all feautures in dataset
# based on list of means and dataset
def calcVar(X, mean):
    m = len(X)
    n = len(X[0])
    totalSum = np.ones((m,n))
    for i in range(n):
        for j in range(m):
            totalSum[j][i] = float((X[j][i] - mean[i])**2)
    
    # calc mean of variance per feature
    var = calcMean(totalSum)
    
    return var
         
# returns the size of each class based on target values
def classSize(Y):
    size = []
    uniques = list(set(Y))
    for i in range(len(uniques)):
        counter = 0
        for j in range(len(Y)):
            if Y[j] == uniques[i]:
                counter +=1    
        size.append(counter)
    return size

# calculate normal distribution
def gaussian(mean, var, v):
    if var != 0:
        gaus = 1.0 / math.sqrt(2 * math.pi * var) * math.exp(-(v - mean)**2/(2*var))
    else:
        if v == mean:
            gaus = 1
        else:
            gaus = 0
            
    return gaus

# split dataset for each class
def splitClasses(X, Y):
    size = classSize(Y)
    classes = []
    class1 = X[0: size[0]]
    classes.append(class1)
    base = size[0]
    for i in range(len(size)-1):
        classN = X[base:base + size[i+1]]
        base = base + size[i+1]
        classes.append(classN)  
        
    return classes
    
# retrieve mean for each class
def classMean(classes):
    means = []
    for i in range(len(classes)):
        mean = calcMean(classes[i])
        means.append(mean)
    return means

# retrieve variance for each class
def classVars(classes, means):
    variances = []
    for i in range(len(classes)):
        var = calcVar(classes[0], means[0])
        variances.append(var)
    return variances

# claculate probabilities for each class
def calcPDF(means, variances, testSet):
    m = len(testSet)
    n = len(testSet[0])-1
    probabilities = np.ones((m,len(means)))
    for h in range(len(means)):
        for i in range(m):
            for j in range(n):
                prob = gaussian(means[h][j], variances[h][j] ,testSet[i][j])
                if prob != 0:
                    probabilities[i][h] *= prob

    return probabilities

# give best predictions for test set
def predict(probabilities, Y):
    uniques = list(set(Y))
    predictions = []
    finalProbs = []
    for i in range(len(probabilities)):
        best = probabilities[i].argmax(axis=0)
        predictions.append(uniques[best])
        
        probs = probabilities[i].max()
        finalProbs.append(probs)
        
    return predictions, finalProbs
    
# calculate the percentage of correct predictions
def getAccuracy(test_set, target):
    correct = 0
    miss = []
    for i in range(len(test_set)):
        if test_set[i][-1] == target[i]:
            correct += 1
        else:
            miss.append(i)
            
    accuracy = (correct/float(len(test_set))) * 100.0
    return accuracy, miss

# give back the n misclassifications with highset probability
def highestMiss(prob, miss, n):
    misclassified = {}
    for i in range(len(miss)):
        misclassified[prob[miss[i]]] = miss[i]
    keys = sorted(misclassified.items(), key=lambda x: x[0])
    return keys[-n:]
    
# main function
def naiveBayes(X, Y, testSet):
    # split classes and calculate mean and variance for each class
    classes = splitClasses(X, Y)
    means = classMean(classes)
    variances = classVars(classes, means)

    # calculate pdf, predictions and accuracy
    probs = calcPDF(means, variances, testSet)
    pred, finalProbs = predict(probs, Y)
    acc, miss = getAccuracy(testSet, pred)
    
    # examples with highest misclassification
    misclassified = highestMiss(finalProbs, miss, 10)
    
    print "Accuracy Gaussian Naive Bayes: ", acc/100
    print "Misclassified:"
    print misclassified
    print
        
# preprocess data
dataSets = readData('digits123-1.csv','digits123-2.csv')
X = dataSets[0]
Y = X[:,-1]
X = np.delete(X, -1, 1)
testSet = dataSets[1]
    
naiveBayes(X, Y, testSet)

Accuracy Gaussian Naive Bayes:  0.608333333333
Misclassified:
[(4.5076557255914265e-65, 156), (4.8806794970637234e-65, 149), (6.3970153534652496e-65, 207), (2.4932007457562496e-63, 152), (1.6574780653238209e-62, 157), (9.8485919284844255e-62, 222), (1.6381044703392142e-57, 105), (9.2914534931736198e-57, 132), (2.5853255454796082e-55, 126), (3.1439978732073109e-54, 101)]



### Misclassifications
Accuracy:  60.83 % <br>
Ten misclassifications with highest probability (for digits123-2.csv): <br>
101, 126, 132, 105, 222, 157, 152, 207, 149, 156

## 2. Compare classifiers

### Logistic regression

In [12]:
def logRegTrain(TrainX, TrainY, reg, maxI):
    logreg = linear_model.LogisticRegression(C=reg, solver='newton-cg', 
                                    max_iter=maxI, multi_class='multinomial')
    logreg.fit(TrainX, TrainY)
    return logreg

def logRegTest(logreg,TestX,TestY):
    return logreg.score(TestX,TestY)


# def logReg(Train, TrainY, Test, TestY):
#     logreg = linear_model.LogisticRegression(C=0.001, solver='newton-cg', 
#                                     max_iter=100, multi_class='multinomial')
#     logreg.fit(Train, TrainY)

#     return logreg.score(Test,TestY)


### Compare Logistic regression with Naive Bayes

In [13]:
# load data from file
def readDigits(file):
    data = np.loadtxt(file, delimiter=';')  
    return data

def init():
    Digits = readDigits('digits123-1.csv')
    # divide input into training set (60%), crossvalidation set (20%) and test set (20%)
    m = len(Digits)
    CV = Digits[::5]
    Test = Digits[1::5]
    Train1 = Digits[2::5]
    Train2 = Digits[3::5]
    Train3 = Digits[4::5]
    Train = np.concatenate((Train1,Train2,Train3))
    return Train, CV, Test

Train, CV, Test = init()
print 'Number of training examples',len(Train)
print 'Number of examples for cross validation',len(CV)
print 'Number of examples in the test set',len(Test)
print 

TrainY = Train[:,-1] # Y target values, last column
TrainX = np.delete(Train, -1, 1) # remove target values
CVY = CV[:,-1] # Y target values, last column
CVX = np.delete(CV, -1, 1) # remove target values
TestY = Test[:,-1] # Y target values, last column
TestX = np.delete(Test, -1, 1) # remove target values

# run Naive Bayes
naiveBayes(TrainX, TrainY, Test)

print "For Logistic Regression we use sklearn"
# train logistic regression
reg = 0.00001
maxI = 20
logreg = logRegTrain(TrainX, TrainY, reg, maxI)
# validate the parameters
print "CV Logistic Regression accuracy with regularization = ", reg, "max iterations = ",maxI
print logRegTest(logreg, CVX, CVY)
# train logistic regression
reg = 0.0001
maxI = 20
logreg = logRegTrain(TrainX, TrainY, reg, maxI)
# validate the parameters
print "CV Logistic Regression accuracy with regularization = ", reg, "max iterations = ",maxI
print logRegTest(logreg, CVX, CVY)
# train logistic regression
reg = 0.001
maxI = 20
logreg = logRegTrain(TrainX, TrainY, reg, maxI)
# validate the parameters
print "CV Logistic Regression accuracy with regularization = ", reg, "max iterations = ",maxI
print logRegTest(logreg, CVX, CVY)
# train logistic regression
reg = 0.01
maxI = 20
logreg = logRegTrain(TrainX, TrainY, reg, maxI)
# validate the parameters
print "CV Logistic Regression accuracy with regularization = ", reg, "max iterations = ",maxI
print logRegTest(logreg, CVX, CVY)
# train logistic regression
reg = 0.1
maxI = 20
logreg = logRegTrain(TrainX, TrainY, reg, maxI)
# validate the parameters
print "CV Logistic Regression accuracy with regularization = ", reg, "max iterations = ",maxI
print logRegTest(logreg, CVX, CVY)
# train logistic regression
reg = 0.001
maxI = 200
logreg = logRegTrain(TrainX, TrainY, reg, maxI)
# validate the parameters
print "CV Logistic Regression accuracy with regularization = ", reg, "max iterations = ",maxI
print logRegTest(logreg, CVX, CVY)
# test the model
reg = 0.001
maxI = 20
logreg = logRegTrain(TrainX, TrainY, reg, maxI)
print "Test Logistic Regression accuracy with regularization = ", reg, "max iterations = ",maxI
print logRegTest(logreg, TestX, TestY)
# print logReg(TrainX, TrainY, TestX, TestY)



Number of training examples 180
Number of examples for cross validation 60
Number of examples in the test set 60

Accuracy Gaussian Naive Bayes:  0.716666666667
Misclassified:
[(1.0988338013682296e-61, 14), (9.6119803783495996e-61, 2), (2.111899565393851e-60, 30), (1.6852462784038524e-58, 48), (5.8373026747012277e-58, 4), (2.640664411379417e-57, 59), (7.9179126073727142e-57, 37), (3.3493783776435987e-56, 58), (3.2619089793241219e-55, 40), (4.0905469471446541e-55, 56)]

For Logistic Regression we use sklearn
CV Logistic Regression accuracy with regularization =  1e-05 max iterations =  20
0.916666666667
CV Logistic Regression accuracy with regularization =  0.0001 max iterations =  20
0.983333333333
CV Logistic Regression accuracy with regularization =  0.001 max iterations =  20
1.0
CV Logistic Regression accuracy with regularization =  0.01 max iterations =  20
1.0
CV Logistic Regression accuracy with regularization =  0.1 max iterations =  20
1.0
CV Logistic Regression accuracy with 

### Analysis
Logistic regression is much more accurate than Naive Bayes.