In [26]:
import numpy as np
import math

def clean_data(line):
    l = line.replace('(', '').replace(')', '').replace(' ', '').replace('M','1').replace('W','0').strip().split(',')
    return [float(i) for i in l]

def fetch_data(filename):
    with open(filename, 'r') as f:
        input_data = f.readlines()
        clean_input = list(map(clean_data, input_data))
        f.close()
    return clean_input


def readFile(dataset_path):
    input_data = fetch_data(dataset_path)
    input_np = np.array(input_data)
    return input_np

training_data = './dataset/1a-training.txt'
test_data = './dataset/1a-test.txt'
large_120_data = './dataset/1c-data.txt'

train_np = readFile(training_data)
print(train_np)
test_np = readFile(test_data)
print(test_np)
large_np = readFile(large_120_data)


[[ 1.65301904 72.87114665 24.          0.        ]
 [ 1.64713849 72.61278531 34.          0.        ]
 [ 1.64720558 73.53968351 33.          1.        ]
 [ 1.73230089 76.06787034 30.          1.        ]
 [ 1.67507027 81.05582112 30.          1.        ]
 [ 1.57809707 64.92608468 30.          0.        ]
 [ 1.65876294 69.38092449 30.          1.        ]
 [ 1.6763296  77.06229599 31.          1.        ]
 [ 1.71872241 62.11292332 37.          0.        ]
 [ 1.52022182 66.15144402 27.          0.        ]
 [ 1.55526893 66.07638614 31.          0.        ]
 [ 1.69693332 77.45386245 34.          1.        ]
 [ 1.68879808 76.48964073 37.          1.        ]
 [ 1.52135529 63.95294495 35.          0.        ]]
[[ 1.62065759 59.37655744 32.        ]
 [ 1.77939838 72.07177567 36.        ]
 [ 1.70045766 66.26750811 31.        ]
 [ 1.65910862 61.7516219  29.        ]]


# K Nearest Neighbor

In [27]:
# Defined fucntions for different similarity measures
def minkowski_dist(p1, p2):
    dim, sum = len(p1), 0
    for index in range(dim - 1):
        sum += math.pow(abs(p1[index] - p2[index]), 3)
    return math.pow(sum,1/3)

def manhattan_dist(p1, p2):
    dim, sum = len(p1), 0
    for index in range(dim - 1):
        sum += abs(p1[index] - p2[index])
    return sum

def cartesian_dist(p1, p2):
    dim, sum = len(p1), 0
    for index in range(dim - 1):
        sum += math.pow(p1[index] - p2[index], 2)
    return math.sqrt(sum)

In [28]:
# Implemented KNN algorithm in this cell
def knn(train_set, new_sample, K, dist):
    dists, train_size = {}, len(train_set)
    # Calcuating distance between the new sample and the data in training set
    for i in range(train_size):
        if dist == 'Cartesian':
            d = cartesian_dist(train_set[i], new_sample)
        elif dist == 'Manhattan':
            d = manhattan_dist(train_set[i], new_sample)
        elif dist == 'Minkowski':
            d = minkowski_dist(train_set[i], new_sample)

        dists[i] = d
    
    k_neighbors = sorted(dists, key=dists.get)[:K]
    
    M, W = 0, 0
    for index in k_neighbors:
        if train_set[index][-1] == 1.:
            M += 1
        else:
            W += 1
            
    if M > W:
        return 'M'
    else:
        return 'W'

# 1)A)

In [29]:
# Predicting output class for different k values and simitality measures
for k in [1,3,7]:
    for dist in ['Cartesian','Manhattan','Minkowski']:
        print('------------------------------------------------------------------')
        print('For k = ' + str(k) + ' with similarity measurements as ' + dist)
        print('------------------------------------------------------------------')
        for ns in test_np:
            print(ns,end = '\t = \t')
            print(knn(train_np,ns,k,dist))
    print()
    print('***********************************************************************')
    print()

------------------------------------------------------------------
For k = 1 with similarity measurements as Cartesian
------------------------------------------------------------------
[ 1.62065759 59.37655744 32.        ]	 = 	W
[ 1.77939838 72.07177567 36.        ]	 = 	W
[ 1.70045766 66.26750811 31.        ]	 = 	W
[ 1.65910862 61.7516219  29.        ]	 = 	W
------------------------------------------------------------------
For k = 1 with similarity measurements as Manhattan
------------------------------------------------------------------
[ 1.62065759 59.37655744 32.        ]	 = 	W
[ 1.77939838 72.07177567 36.        ]	 = 	W
[ 1.70045766 66.26750811 31.        ]	 = 	W
[ 1.65910862 61.7516219  29.        ]	 = 	W
------------------------------------------------------------------
For k = 1 with similarity measurements as Minkowski
------------------------------------------------------------------
[ 1.62065759 59.37655744 32.        ]	 = 	W
[ 1.77939838 72.07177567 36.        ]	 = 	W
[ 

# 1)B)

In [30]:
# Enter Height, Weight, Age and K value to predict the output label using KNN
ns = input('Enter Height, Weight and Age as "," seperated parameters\n')
k = int(input('Enter K value '))
ns = [float(i) for i in ns.split(',')]
print('Output = ' + knn(train_np,ns,k,'Cartesian'))

Enter Height, Weight and Age as "," seperated parameters
1.7359417237856, 77.004988515324, 20
Enter K value 3
Output = M


## 1)C)

In [10]:
# Here we are iterating through different k values with Cartesian (Euclidean) distance metric
for k in [1,3,5,7,9,11]:
    miss_classified = 0
    # We are iterating through large_np dataset
    for i in range(len(large_np)):
        # Storing the new sample in a variable
        ns = large_np[i]
        # Deleteing the stored sample from the training data
        large_np = np.delete(large_np, i, axis = 0)
        # Predicitng the output class using KNN algorithm
        op = (knn(large_np,np.delete(ns,-1),k,'Cartesian'))
        # Checking here if the output label is classified correclty or not
        if (ns[-1] == 0.0 and op == 'M') or (ns[-1] == 1.0 and op == 'W'):
            miss_classified+=1
        # Adding back the deleted sample from the training dataset
        large_np = np.insert(large_np, i,  ns, axis = 0)
    # Printing the error by checking the miss classified labels w.r.t total training dataset size - 1
    print(" % Error for k = " + str(k) + ' is ' + str(round(miss_classified/(len(large_np)-1)*100,2)) + ' %')

 % Error for k = 1 is 45.38 %
 % Error for k = 3 is 38.66 %
 % Error for k = 5 is 38.66 %
 % Error for k = 7 is 39.5 %
 % Error for k = 9 is 36.97 %
 % Error for k = 11 is 41.18 %


In [11]:
# From above we can see that k = 9 has lowest error, so we can say that for k = 9 we get the best performance.

# 1)D)

In [12]:
# Removed age from the training dataset
large_np_wo_age = np.delete(large_np, 2, 1)

In [13]:
# Here we are iterating through different k values with Cartesian (Euclidean) distance metric
for k in [1,3,5,7,9,11]:
    miss_classified = 0
    # We are iterating through large_np without age dataset
    for i in range(len(large_np_wo_age)):
        # Storing the new sample in a variable
        ns = large_np_wo_age[i]
        # Deleteing the stored sample from the training data
        large_np_wo_age = np.delete(large_np_wo_age, i, axis = 0)
        # Predicitng the output class using KNN algorithm
        op = (knn(large_np_wo_age,np.delete(ns,-1),k,'Cartesian'))
        # Checking here if the output label is classified correclty or not
        if (ns[-1] == 0.0 and op == 'M') or (ns[-1] == 1.0 and op == 'W'):
            miss_classified+=1
        # Adding back the deleted sample to the training dataset
        large_np_wo_age = np.insert(large_np_wo_age, i,  ns, axis = 0)
    # Printing the error by checking the miss classified labels w.r.t total training dataset size - 1
    print(" % Error for k = " + str(k) + ' is ' + str(round(miss_classified/(len(large_np)-1)*100,2)) + ' %')

 % Error for k = 1 is 37.82 %
 % Error for k = 3 is 29.41 %
 % Error for k = 5 is 35.29 %
 % Error for k = 7 is 36.97 %
 % Error for k = 9 is 40.34 %
 % Error for k = 11 is 42.86 %


In [None]:
# Here we see that k = 3 has lowest error. So we can say that k = 3 has performed better when age feature excluded.

In [None]:
# From above results we see that for k value 1,3,5 and 7 the error is lower and for 9 and 10 the errors are slightly 
# higher, so from this we can see that without the age data the model was able to predict the gender better when 
# compared to the data when age was included.

# Gaussian Naive Bayes Classification

In [14]:
# Creating a dictionary to seperate the M and W samples from the dataset and storing it in that dictionarty
class_label_dict = {1.0:np.empty((0, 4), int),0.0:np.empty((0, 4), int)}
for i in train_np:
    class_label_dict[i[-1]] = np.append(class_label_dict[i[-1]], np.array([i]), axis=0)

In [15]:
class_label_dict

{1.0: array([[ 1.64720558, 73.53968351, 33.        ,  1.        ],
        [ 1.73230089, 76.06787034, 30.        ,  1.        ],
        [ 1.67507027, 81.05582112, 30.        ,  1.        ],
        [ 1.65876294, 69.38092449, 30.        ,  1.        ],
        [ 1.6763296 , 77.06229599, 31.        ,  1.        ],
        [ 1.69693332, 77.45386245, 34.        ,  1.        ],
        [ 1.68879808, 76.48964073, 37.        ,  1.        ]]),
 0.0: array([[ 1.65301904, 72.87114665, 24.        ,  0.        ],
        [ 1.64713849, 72.61278531, 34.        ,  0.        ],
        [ 1.57809707, 64.92608468, 30.        ,  0.        ],
        [ 1.71872241, 62.11292332, 37.        ,  0.        ],
        [ 1.52022182, 66.15144402, 27.        ,  0.        ],
        [ 1.55526893, 66.07638614, 31.        ,  0.        ],
        [ 1.52135529, 63.95294495, 35.        ,  0.        ]])}

In [16]:
# Calculating Mean of output class 'M'
# M_mh is Mean of output class 'M' for variable Height
# M_mw is Mean of output class 'M' for variable Weight
# M_ma is Mean of output class 'M' for variable Age
M_mh,M_mw,M_ma, M_o =  np.mean(class_label_dict[1.0], axis = 0)

In [17]:
# Calculating Mean of output class 'W'
# W_mh is Mean of output class 'W' for variable Height
# W_mw is Mean of output class 'W' for variable Weight
# W_ma is Mean of output class 'W' for variable Age
W_mh,W_mw,W_ma, W_o = np.mean(class_label_dict[0.0], axis = 0)

In [18]:
# Calculating variance of output class 'M'
# M_vh is Variance of output class 'M' for variable Height
# M_vw is Variance of output class 'M' for variable Weight
# M_va is Variance of output class 'M' for variable Age
M_vh,M_vw,M_va, M_o = np.var(class_label_dict[1.0], axis = 0)

In [19]:
# Calculating variance of output class 'W'
# W_vh is Variance of output class 'W' for variable Height
# W_vw is Variance of output class 'W' for variable Weight
# W_va is Variance of output class 'W' for variable Age
W_vh,W_vw,W_va, W_o = np.var(class_label_dict[0.0], axis = 0)

In [20]:
M_m = [M_mh,M_mw,M_ma]
W_m = [W_mh,W_mw,W_ma]
M_v = [M_vh,M_vw,M_va]
W_v = [W_vh,W_vw,W_va]

In [21]:
# This function uses Gaussian Naive Bayes Formula and returns the probability
def gnb(ns, mean, var):
    pi = np.pi
    expr1 = 1/(np.sqrt(2 * np.pi * var))
    expr2 = ((ns - mean) ** 2)/2 * var
    expo = np.exp(-(expr2))
    prob = expr1 * expo
    return prob

# 2)A)

In [22]:
# Here we are iterating through test_np dataset
for ns in test_np:
    m_op, w_op = 1.0, 1.0
    print(ns,end = '\t = \t')
    # Here are are calling the gnb function to get the probability for each input data feature
    # ns[0] is height of new sample, M_m[0] is mean height, M_v[0] is vaiance of height
    # ns[1] is weight of new sample, M_m[0] is mean weight, M_v[0] is vaiance of weight
    # ns[2] is age of new sample, M_m[0] is mean age, M_v[0] is vaiance of age
    # And then we multiply each of these probabilities
    for i in range(len(ns)):
        m_op *= (gnb(ns[i],M_m[i],M_v[i]))
        w_op *= (gnb(ns[i],W_m[i],W_v[i]))
    # We check which output label has highest probability and classify the sample as that output label.
    if m_op > w_op:
        print ('M')
    else:
        print('W')

[ 1.62065759 59.37655744 32.        ]	 = 	W
[ 1.77939838 72.07177567 36.        ]	 = 	M
[ 1.70045766 66.26750811 31.        ]	 = 	W
[ 1.65910862 61.7516219  29.        ]	 = 	W


# 2)B)

In [23]:
# Enter Height, Weight, Age and K value to predict the output label using GNB
ns = input('Enter Height, Weight and Age as "," seperated parameters\n')
ns = [float(i.strip()) for i in ns.split(',')]
m_op, w_op = 1.0, 1.0
for i in range(len(ns)):
    m_op *= (gnb(ns[i],M_m[i],M_v[i]))
    w_op *= (gnb(ns[i],W_m[i],W_v[i]))
if m_op > w_op:
    print ('Output = M')
else:
    print('Output = W')

Enter Height, Weight and Age as "," seperated parameters
1.7359417237856, 77.004988515324, 20
Output = M


# 2)C)

In [24]:
miss_classified = 0
# Here we are iterating through large_np dataset
for i in range(len(large_np)):
    # Storing the new sample in a variable
    ns = large_np[i]
    # Deleteing the stored sample from the training data
    large_np = np.delete(large_np, i, axis = 0)
    # Predicitng the output class using GNB algorithm
    for j in range(len(ns)-1):
        m_op *= (gnb(ns[j],M_m[j],M_v[j]))
        w_op *= (gnb(ns[j],W_m[j],W_v[j]))
    op = 'W'
    # if probability of M is greater than W then output class label is set to M
    if m_op > w_op:
        op = ('M')
    # Checking here if the output label is classified correclty or not
    if (ns[-1] == 0.0 and op == 'M') or (ns[-1] == 1.0 and op == 'W'):
        miss_classified+=1
    # Adding back the deleted sample to the training dataset
    large_np = np.insert(large_np, i,  ns, axis = 0)
# Printing the error by checking the miss classified labels w.r.t total training dataset size - 1
print("Error is " + str(round(miss_classified/(len(large_np)-1)*100,2)) + ' %')

Error is 46.22 %


# 2)D)

In [25]:
miss_classified = 0
# Here we are iterating through large_np_wo_age (dataset with no age data) dataset
for i in range(len(large_np_wo_age)):
    # Storing the new sample in a variable
    ns = large_np_wo_age[i]
    # Deleteing the stored sample from the training data
    large_np_wo_age = np.delete(large_np_wo_age, i, axis = 0)
    # Predicitng the output class using GNB algorithm
    for j in range(len(ns)-1):
            m_op *= (gnb(ns[j],M_m[j],M_v[j]))
            w_op *= (gnb(ns[j],W_m[j],W_v[j]))
    op = 'W'
    # if probability of M is greater than W then output class label is set to M
    if m_op > w_op:
        op = ('M')
    # Checking here if the output label is classified correclty or not
    if (ns[-1] == 0.0 and op == 'M') or (ns[-1] == 1.0 and op == 'W'):
        miss_classified+=1
    # Adding back the deleted sample to the training dataset
    large_np_wo_age = np.insert(large_np_wo_age, i,  ns, axis = 0)
# Printing the error by checking the miss classified labels w.r.t total training dataset size - 1
print("Error is " + str(round(miss_classified/(len(large_np)-1)*100,2)) + ' %')

Error is 45.38 %


In [None]:
# Like we have seen in KNN, in the GNB also we see that when age data is excluded from the training dataset, the 
# error rate is a bit lower.

In [None]:
# If we compare GNB and KNN, comparing the error rates we can say that KNN performed better than GNB in both 
# the cases, including the age data and without including the age data.