# DISCRIMINANT ANALYSIS

In this coding assignment you are to implement a Minimum Risk Bayes Decision Theoretic classifier and use it to classify the test examples in the provided datasets.  
Assume the following:
1. All conditional density functions are multivariate Gaussian
2. Each class has its own covariance matrix
3. Equally likely prior probabilities
4. 0-1 loss function


## Training Phase

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Training set: per the assignment, 135 observations, 4 features, 3 classes.
# The last column of each row holds the class label.
train_frame = pd.read_csv("iris_corrupted_training_dataset.csv")
print(train_frame.head())
train_data = train_frame.values  # plain ndarray, one row per observation

# Validation set: 15 samples laid out the same way as the training set.
val_frame = pd.read_csv("iris_validation_dataset.csv")
print(val_frame.head())
val_data = val_frame.values

# Leave `df` bound to the validation array, as it was at the end of this cell.
df = val_data

   sepal_length   sepal_width   petal_length   petal_width   class
0        5.7147        2.6743         3.2696       1.65440       2
1        5.1734        3.7374         5.9442       3.00050       3
2        7.3776        3.1505         3.3543       0.64839       2
3        6.4908        2.3983         3.3917       1.54950       2
4        6.8182        3.4016         4.7495       0.57970       3
   sepal_length   sepal_width   petal_length   petal_width   class
0           4.4           2.9            1.4           0.2       1
1           6.7           3.0            5.2           2.3       3
2           4.9           3.1            1.5           0.2       1
3           5.1           2.5            3.0           1.1       2
4           6.1           3.0            4.6           1.4       2


In [2]:
# Compute the per-class components of the discriminant functions.
#
# Column 4 of train_data is the class label (1, 2 or 3); columns 0-3 are the
# features. A boolean row mask selects each class directly, so the classes are
# not required to contain the same number of samples.
labels = train_data[:, 4]
train_data1 = train_data[labels == 1, 0:4]  # feature rows of class 1, shape (n1, 4)
train_data2 = train_data[labels == 2, 0:4]  # feature rows of class 2, shape (n2, 4)
train_data3 = train_data[labels == 3, 0:4]  # feature rows of class 3, shape (n3, 4)

# Mean vector of each class: averaging over the rows (axis=0) leaves one
# value per feature, i.e. a length-4 vector.
u1 = np.mean(train_data1, axis=0)
u2 = np.mean(train_data2, axis=0)
u3 = np.mean(train_data3, axis=0)

# Covariance matrix of each class (4x4). rowvar=False tells np.cov that the
# columns are the variables and the rows are the observations, which is
# equivalent to passing the transposed data.
cov1 = np.cov(train_data1, rowvar=False)
cov2 = np.cov(train_data2, rowvar=False)
cov3 = np.cov(train_data3, rowvar=False)

# Log-determinant of each covariance matrix (scalars). slogdet returns
# (sign, log|det|) and computes the logarithm directly, so it does not
# underflow/overflow the way log(det(...)) can.
log_detcov1 = np.linalg.slogdet(cov1)[1]
log_detcov2 = np.linalg.slogdet(cov2)[1]
log_detcov3 = np.linalg.slogdet(cov3)[1]

# Inverse of each covariance matrix (4x4).
icov1 = np.linalg.inv(cov1)
icov2 = np.linalg.inv(cov2)
icov3 = np.linalg.inv(cov3)

# Equally likely prior probabilities for the three classes.
log_prior = np.log(1/3)

In [3]:
# Show the three class mean vectors first, then the three covariance matrices.
for stat in (u1, u2, u3, cov1, cov2, cov3):
    print(stat)

[4.80081778 3.48799556 1.26920989 0.34787733]
[6.06588222 2.82287978 4.26241333 1.10785197]
[6.42966    2.95656956 5.55874667 1.92476547]
[[ 0.73847372 -0.09788292  0.162097    0.09430334]
 [-0.09788292  1.04517177  0.08250472  0.06122466]
 [ 0.162097    0.08250472  0.75386746  0.07747734]
 [ 0.09430334  0.06122466  0.07747734  0.51347455]]
[[ 1.02666705  0.16051089  0.28736137 -0.10850815]
 [ 0.16051089  0.80414317  0.20221368 -0.07318826]
 [ 0.28736137  0.20221368  0.74048204 -0.04380217]
 [-0.10850815 -0.07318826 -0.04380217  0.69674064]]
[[1.36272732 0.26608677 0.44568822 0.30336696]
 [0.26608677 1.03934606 0.12853287 0.18437967]
 [0.44568822 0.12853287 0.69605886 0.23021863]
 [0.30336696 0.18437967 0.23021863 0.85756954]]


## Validation phase

In [4]:
# Evaluate model accuracy with the validation dataset.
# Each row of val_data holds four feature columns followed by the class label.
#
# For each sample, compute the discriminant function of every class
#     g_i(x) = -0.5 * (x - u_i)^T * icov_i * (x - u_i) - 0.5 * log|cov_i| + log P(class i)
# with equal priors (1/3). The predicted label belongs to the largest of
# g1, g2, g3. Count the number of correctly predicted labels.

correct_class = 0  # number of correctly predicted labels

for sample in val_data:

    x = sample[0:4]  # sample's feature vector (length 4)
    y = sample[4]    # sample's true label

    # Deviation of the sample from each class mean. These are 1-D vectors,
    # so d @ icov @ d is the scalar quadratic form (no transpose needed).
    d1 = x - u1
    d2 = x - u2
    d3 = x - u3

    g1 = -0.5 * (d1 @ icov1 @ d1) - 0.5 * log_detcov1 + log_prior
    g2 = -0.5 * (d2 @ icov2 @ d2) - 0.5 * log_detcov2 + log_prior
    g3 = -0.5 * (d3 @ icov3 @ d3) - 0.5 * log_detcov3 + log_prior

    # argmax always yields a label, even when two discriminants tie (the
    # first maximum wins); a chain of strict comparisons would leave yhat
    # unset or stale in that case. The +1 maps index 0..2 to label 1..3.
    yhat = int(np.argmax([g1, g2, g3])) + 1

    if yhat == y:
        correct_class += 1

# Divide by the actual number of validation samples rather than a fixed 15.
n_val = len(val_data)
accuracy = correct_class / n_val if n_val else float('nan')
print('Classification accuracy = ', '{0:.4f}'.format(accuracy))

Classification accuracy =  0.9333
