In [None]:
from scipy.io import arff
import pandas as pd
import numpy as np
import math
import statistics
import matplotlib.pyplot as plt
from IPython.core.debugger import set_trace
from google.colab import files

# Load file for google colab, comment out for local test
#uploaded = files.upload()

# scale function to rescale the continuous feature into value between 0 and 1
def maximum_absolute_scaling(df):
    """Return a copy of ``df`` whose positional columns 2..17 are max-abs scaled.

    Each column in the positional slice ``2:18`` is divided by the largest
    absolute value found in that column, so the scaled values lie in
    ``[-1, 1]``.  Columns outside the slice, and the input frame itself, are
    left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose positional columns 2 through 17 are numeric.

    Returns
    -------
    pandas.DataFrame
        The scaled copy.
    """
    df_scaled = df.copy()
    features = df_scaled.iloc[:, 2:18]
    max_abs = features.abs().max()
    # A column that is zero everywhere has a maximum of 0; dividing by it
    # would turn the whole column into NaN.  Dividing by 1 keeps it at zero.
    max_abs = max_abs.where(max_abs != 0, 1)
    df_scaled.iloc[:, 2:18] = features / max_abs
    return df_scaled

def load_and_clean_data(path='messidor_features.arff'):
    """Load the ARFF data set at ``path`` and return it as a numeric DataFrame.

    The 20 columns are renamed to descriptive names (the last one being
    ``'Class'``), rows containing missing values are dropped, the byte-string
    class label is decoded, and every column is converted with
    ``pd.to_numeric``.

    Parameters
    ----------
    path : str, optional
        Location of the ARFF file.  Defaults to ``'messidor_features.arff'``
        in the current working directory (on Google Colab, upload the file
        first, e.g. with ``files.upload()``).

    Returns
    -------
    pandas.DataFrame
        The cleaned, fully numeric data set.
    """
    # loadarff returns (records, metadata); only the records are needed.
    records, _meta = arff.loadarff(path)
    dataset2 = pd.DataFrame(records)
    dataset2.columns = ['Quality', 'Pre-screening', 'MA-0.5', 'MA-0.6', 'MA-0.7', 'MA-0.8', 'MA-0.9', 'MA-1.0', 'Exudates-1', 'Exudates-2', 'Exudates-3', 'Exudates-4', 'Exudates-5', 'Exudates-6','Exudates-7','Exudates-8','dist-macula-optic', 'diameter-optic', 'AM/FM-classi', 'Class']
    # Clean data - remove all rows with missing values
    dataset2 = dataset2.replace("?", np.nan).dropna()
    # The class column holds byte strings; decode them before the numeric cast.
    dataset2['Class'] = dataset2['Class'].str.decode('utf-8')
    dataset2 = dataset2.apply(pd.to_numeric)
    return dataset2

def scaled_data(dataset2):
    """Scale the data set and split it into a feature matrix and a label vector.

    The frame is first passed through ``maximum_absolute_scaling``; the last
    column is then taken as the label and every other column as a feature.
    A short summary (instances, features, classes) is printed.

    Returns
    -------
    tuple
        ``(x, y, N)``: feature array, label array and the number of rows.
    """
    scaled_frame = maximum_absolute_scaling(dataset2)
    features = scaled_frame.iloc[:, :-1].to_numpy()
    labels = scaled_frame.iloc[:, -1].to_numpy()
    n_rows, n_cols = features.shape
    # Labels are treated as 0-based class ids, so the class count is max + 1.
    n_classes = np.max(labels) + 1
    print(f'instances (N) \t {n_rows} \nfeatures (D) \t {n_cols} \nclasses (C) \t {n_classes}')
    return features, labels, n_rows

def unscaled_data(dataset2):
    """Split the data set, without any rescaling, into features and labels.

    The last column is taken as the label and every other column as a
    feature.  A short summary (instances, features, classes) is printed.

    Returns
    -------
    tuple
        ``(x, y, N)``: feature array, label array and the number of rows.
    """
    features = dataset2.iloc[:, :-1].to_numpy()
    labels = dataset2.iloc[:, -1].to_numpy()
    n_rows, n_cols = features.shape
    # Labels are treated as 0-based class ids, so the class count is max + 1.
    n_classes = np.max(labels) + 1
    print(f'instances (N) \t {n_rows} \nfeatures (D) \t {n_cols} \nclasses (C) \t {n_classes}')
    return features, labels, n_rows


# KNN

In [None]:
#define the metric we will use to measure similarity 
def euclidean(x1, x2):
    """Euclidean (L2) distance between ``x1`` and ``x2`` along the last axis."""
    return np.sqrt(np.sum((x1 - x2)**2, axis=-1))


def manhattan(x1, x2):
    """Manhattan (L1) distance between ``x1`` and ``x2`` along the last axis."""
    return np.sum(np.abs(x1 - x2), axis=-1)


class KNN:
    """K-nearest-neighbours classifier for integer class labels.

    Parameters
    ----------
    K : int
        Number of neighbours that vote on each prediction.
    dist_fn : callable
        ``dist_fn(a, b)`` must broadcast its arguments and reduce the last
        axis; ``euclidean`` and ``manhattan`` both qualify.
    """

    def __init__(self, K=1, dist_fn=euclidean):
        self.dist_fn = dist_fn
        self.K = K

    def fit(self, x, y):
        """Store the training data; KNN is a lazy learner, so nothing is trained.

        ``y`` must hold non-negative integer labels; the number of classes is
        taken to be ``max(y) + 1``.  Returns ``self`` so calls can be chained.
        """
        self.x = x
        self.y = y
        self.C = np.max(y) + 1
        return self

    def predict(self, x_test):
        """Predict class probabilities for every row of ``x_test``.

        Returns
        -------
        y_prob : ndarray of shape (num_test, C)
            Fraction of the K nearest training samples belonging to each class.
        knns : ndarray of shape (num_test, K)
            Indices of the K nearest training samples for each test row.

        Raises
        ------
        ValueError
            If ``K`` is smaller than 1 or larger than the number of stored
            training samples.
        """
        num_train = self.x.shape[0]
        # Without this check an oversized K only surfaces as a cryptic
        # broadcasting error when the neighbour indices are stored below.
        if not 1 <= self.K <= num_train:
            raise ValueError(
                f'K must be between 1 and the number of training samples '
                f'({num_train}), got {self.K}')

        num_test = x_test.shape[0]
        # Broadcasting (1, num_train, D) against (num_test, 1, D) and reducing
        # the last axis gives a [num_test, num_train] distance matrix.
        distances = self.dist_fn(self.x[None, :, :], x_test[:, None, :])

        # i-th row of knns stores the indices of the K closest training samples
        knns = np.zeros((num_test, self.K), dtype=int)
        # i-th row of y_prob holds the probability distribution over C classes
        y_prob = np.zeros((num_test, self.C))
        for i in range(num_test):
            knns[i, :] = np.argsort(distances[i])[:self.K]
            # count how often each class occurs among the K nearest neighbours
            y_prob[i, :] = np.bincount(self.y[knns[i, :]], minlength=self.C)
        y_prob /= self.K
        return y_prob, knns

def model_knn_for_specific_k(x, y, N, k):
    """Print and return the mean hold-out accuracy of a K=``k`` KNN model.

    The data is reshuffled 100 times; each time the first 80% of the
    permutation is used for training and the rest for testing.

    Parameters
    ----------
    x, y : ndarray
        Feature matrix and integer label vector.
    N : int
        Number of rows in ``x``.
    k : int
        Number of neighbours.

    Returns
    -------
    float
        Accuracy averaged over the 100 random splits.
    """
    model = KNN(K=k)
    # 80/20 split derived from N instead of a fixed row index; for N = 1151
    # this is 920 training rows.
    n_train = int(0.8 * N)
    accuracy_list = []
    for _ in range(100):
        inds = np.random.permutation(N)
        x_train, y_train = x[inds[:n_train]], y[inds[:n_train]]
        x_test, y_test = x[inds[n_train:]], y[inds[n_train:]]
        y_prob, knns = model.fit(x_train, y_train).predict(x_test)
        # hard prediction = class with the highest neighbour vote share
        y_pred = np.argmax(y_prob, axis=-1)
        accuracy_list.append(np.sum(y_pred == y_test) / y_test.shape[0])
    # shapes from the last split only, printed as a sanity check
    print('knns shape:', knns.shape)
    print('y_prob shape:', y_prob.shape)

    accuracy = sum(accuracy_list) / len(accuracy_list)

    print(f'accuracy is {accuracy*100:.1f}.')
    return accuracy



# KNN with different distance functions - plot

In [None]:
def make_prediction_cost(N, k, x, y):
    """Compare Euclidean and Manhattan KNN accuracy for a given ``k``.

    Both models are evaluated on the same 50 random 80/20 train/test splits.

    Parameters
    ----------
    N : int
        Number of rows in ``x``.
    k : int
        Number of neighbours.
    x, y : ndarray
        Feature matrix and integer label vector.

    Returns
    -------
    tuple of float
        ``(accuracy_euclidean, accuracy_manhattan)``, each averaged over the
        50 splits.
    """
    models = (KNN(K=k, dist_fn=euclidean), KNN(K=k, dist_fn=manhattan))
    # 80/20 split derived from N instead of a fixed row index; for N = 1151
    # this is 920 training rows.
    n_train = int(0.8 * N)
    accuracy_lists = ([], [])
    for _ in range(50):
        inds = np.random.permutation(N)
        x_train, y_train = x[inds[:n_train]], y[inds[:n_train]]
        x_test, y_test = x[inds[n_train:]], y[inds[n_train:]]
        for model, scores in zip(models, accuracy_lists):
            y_prob, _ = model.fit(x_train, y_train).predict(x_test)
            # hard prediction = class with the maximum probability
            y_pred = np.argmax(y_prob, axis=-1)
            scores.append(np.sum(y_pred == y_test) / y_test.shape[0])
    accuracy_euclidean = sum(accuracy_lists[0]) / len(accuracy_lists[0])
    accuracy_manhattan = sum(accuracy_lists[1]) / len(accuracy_lists[1])
    return accuracy_euclidean, accuracy_manhattan

def compare_diff_distance_func(x, y, N):
    """Plot KNN accuracy against k (1..20) for Euclidean and Manhattan distance.

    For every k, ``make_prediction_cost`` supplies the pair of average
    accuracies; both curves are drawn on one figure and shown.
    """
    neighbour_counts = list(range(1, 21))
    # One (euclidean, manhattan) accuracy pair per value of k, in order.
    score_pairs = [make_prediction_cost(N, k, x, y) for k in neighbour_counts]
    euclidean_scores = [pair[0] for pair in score_pairs]
    manhattan_scores = [pair[1] for pair in score_pairs]

    plt.plot(neighbour_counts, euclidean_scores, label="euclidean")
    plt.plot(neighbour_counts, manhattan_scores, label="manhattan")
    plt.title('Plot of accuracy for model#2 with different distance function')
    plt.xlabel('k (number of neighbours)')
    plt.ylabel('Accuracy')
    plt.xticks(range(0, 21, 2))
    plt.legend()
    plt.show()
    

# L-fold cross_validation for KNN

In [None]:
# The KNN-specific logic lives in the private ``_knn_*`` helpers below.  The
# decision-tree section of this notebook defines functions that reuse the
# public names ``make_prediction``, ``cross_validation`` and
# ``get_average_accuracy_fold``; once that cell has run, those names refer to
# the tree versions.  ``L_fold_with_plot_knn`` therefore calls only the
# private helpers, so it always evaluates KNN regardless of execution order.

def _knn_holdout_mse(N, k, x, y, repeats=30, train_fraction=0.8):
    """Mean squared label error of KNN(K=k) over random hold-out splits."""
    model = KNN(K=k)
    # For N = 1151 and the default fraction this is 920 training rows.
    n_train = int(train_fraction * N)
    square_error = []
    for _ in range(repeats):
        inds = np.random.permutation(N)
        x_train, y_train = x[inds[:n_train]], y[inds[:n_train]]
        x_test, y_test = x[inds[n_train:]], y[inds[n_train:]]
        y_prob, _ = model.fit(x_train, y_train).predict(x_test)
        # hard prediction = class with the maximum probability
        y_pred = np.argmax(y_prob, axis=-1)
        square_error.append(np.square(np.subtract(y_pred, y_test)).mean())
    return sum(square_error) / len(square_error)


def _knn_cross_validation_mse(k, x, y, folds=10):
    """Mean squared label error of KNN(K=k) under ``folds``-fold cross-validation."""
    model = KNN(K=k)
    # array_split (unlike split) also accepts a row count that is not a
    # multiple of ``folds``; x and y are partitioned identically.
    x_folds = np.array_split(x, folds)
    y_folds = np.array_split(y, folds)
    square_error = []
    for i in range(folds):
        val_x, val_y = x_folds[i], y_folds[i]
        train_x = np.concatenate(x_folds[:i] + x_folds[i + 1:])
        train_y = np.concatenate(y_folds[:i] + y_folds[i + 1:])
        y_prob, _ = model.fit(train_x, train_y).predict(val_x)
        y_pred = np.argmax(y_prob, axis=-1)
        square_error.append(np.square(np.subtract(y_pred, val_y)).mean())
    return sum(square_error) / len(square_error)


def _knn_average_fold_errors(N, k, x, y, repeats=30, train_fraction=0.8):
    """Average cross-validation and hold-out errors over random reshuffles.

    Returns ``(mean validation error, mean test error)``.
    """
    n_train = int(train_fraction * N)
    validation_errors = []
    test_errors = []
    for _ in range(repeats):
        inds = np.random.permutation(N)
        x_train, y_train = x[inds[:n_train]], y[inds[:n_train]]
        # cross-validate inside the training part only ...
        validation_errors.append(_knn_cross_validation_mse(k, x_train, y_train))
        # ... while the hold-out estimate draws its own splits of the full data
        test_errors.append(_knn_holdout_mse(N, k, x, y))
    return (sum(validation_errors) / len(validation_errors),
            sum(test_errors) / len(test_errors))


def make_prediction(N, k, x, y):
    """Return the mean squared error of KNN(K=k) over 30 random 80/20 splits."""
    return _knn_holdout_mse(N, k, x, y)


def cross_validation(k, x, y):
    """Return the mean squared error of KNN(K=k) under 10-fold cross-validation."""
    return _knn_cross_validation_mse(k, x, y)


def get_average_accuracy_fold(N, i, x, y):
    """Return ``(validation error, test error)`` for KNN(K=i), averaged over 30 reshuffles.

    Despite the name, both values are mean squared errors, not accuracies.
    """
    return _knn_average_fold_errors(N, i, x, y)


def L_fold_with_plot_knn(x, y, N):
    """Plot validation and test error of KNN for k = 1..29.

    The validation curve is drawn with error bars.
    NOTE(review): the bar height is the standard deviation of the validation
    means across all k, i.e. the same value at every point, not a per-k
    spread over folds - confirm this is what the plot is meant to show.
    """
    accuracy_for_test = []
    accuracy_for_validation = []
    k = []

    for i in range(1, 30):
        k.append(i)
        acc1, acc2 = _knn_average_fold_errors(N, i, x, y)
        accuracy_for_validation.append(acc1)
        accuracy_for_test.append(acc2)
    std = statistics.stdev(accuracy_for_validation)
    plt.plot(k, accuracy_for_test, label="test")
    plt.errorbar(k, accuracy_for_validation, std, label="validation")
    plt.title('Plot of the mean and standard deviation in 10 fold cross-validation')
    plt.xlabel('k (number of neighbours)')
    plt.ylabel('mean squared error')
    plt.xticks(range(0, 30, 5))
    plt.legend()
    plt.show()



# KNN: 2 Most Important Features and Decision Boundary

In [None]:
def two_most_important_feature(dataset2):
    """Find the feature pair with the best KNN accuracy and plot its decision boundary.

    Every pair of columns at positions 1..18 is evaluated with a K=12 KNN
    model on 100 random 80/20 splits.  The best pair and its mean accuracy are
    printed; a KNN fitted on one more random split is then evaluated on a
    200 x 200 grid spanning those two features to draw the decision regions
    together with the training points.

    NOTE(review): column 0 is never considered as a candidate - confirm that
    skipping it is intentional.

    Parameters
    ----------
    dataset2 : pandas.DataFrame
        Numeric data set with the integer class label in the last column.
    """
    n_repeats = 100
    y = dataset2.iloc[:, -1].to_numpy()
    N = y.shape[0]
    C = np.max(y) + 1
    # 80/20 split derived from N instead of a fixed row index; for N = 1151
    # this is 920 training rows.
    n_train = int(0.8 * N)

    max_accuracy = 0
    two_features = [None, None]
    max_j = 0
    max_k = 0
    model = KNN(K=12)
    for j in range(1, 18):
        for k in range(j + 1, 19):
            x = np.column_stack((dataset2.iloc[:, j].to_numpy(),
                                 dataset2.iloc[:, k].to_numpy()))
            accuracy_list = []
            for _ in range(n_repeats):
                inds = np.random.permutation(N)
                x_train, y_train = x[inds[:n_train]], y[inds[:n_train]]
                x_test, y_test = x[inds[n_train:]], y[inds[n_train:]]
                y_prob, _ = model.fit(x_train, y_train).predict(x_test)
                # hard prediction = class with the maximum probability
                y_pred = np.argmax(y_prob, axis=-1)
                accuracy_list.append(np.sum(y_pred == y_test) / y_test.shape[0])
            # Plain mean over the repeats, computed once after the loop so
            # every split carries the same weight.
            accuracy = sum(accuracy_list) / len(accuracy_list)
            if accuracy > max_accuracy:
                max_accuracy = accuracy
                two_features[0] = dataset2.columns[j]
                two_features[1] = dataset2.columns[k]
                max_j = j
                max_k = k
    print(f'accuracy is {max_accuracy*100:.1f}.')
    print("The features that used are " + two_features[0] + " and " + two_features[1])

    # Decision boundary for the winning pair, from one more random split.
    x = np.column_stack((dataset2.iloc[:, max_j].to_numpy(),
                         dataset2.iloc[:, max_k].to_numpy()))
    inds = np.random.permutation(N)
    x_train, y_train = x[inds[:n_train]], y[inds[:n_train]]

    # 200 x 200 grid covering the observed range of both features
    x0v = np.linspace(np.min(x[:, 0]), np.max(x[:, 0]), 200)
    x1v = np.linspace(np.min(x[:, 1]), np.max(x[:, 1]), 200)
    x0, x1 = np.meshgrid(x0v, x1v)
    x_all = np.vstack((x0.ravel(), x1.ravel())).T

    model = KNN(K=12)
    # One-hot rows with C+1 columns are handed to scatter() as colours; with
    # two classes that makes three components per point (presumably read as
    # RGB - TODO confirm for other class counts).
    y_train_prob = np.zeros((y_train.shape[0], C + 1))
    y_train_prob[np.arange(y_train.shape[0]), y_train] = 1

    y_prob_all, _ = model.fit(x_train, y_train).predict(x_all)

    y_pred_all = np.zeros((y_prob_all.shape[0], C + 1))
    y_pred_all[np.arange(x_all.shape[0]), np.argmax(y_prob_all, axis=-1)] = 1

    plt.scatter(x_train[:, 0], x_train[:, 1], c=y_train_prob, marker='o', alpha=1)
    plt.scatter(x_all[:, 0], x_all[:, 1], c=y_pred_all, marker='.', alpha=0.01)
    plt.ylabel(dataset2.columns[max_k])
    plt.xlabel(dataset2.columns[max_j])
    plt.show()


# Decision Tree with different cost functions

In [None]:
# Silence NumPy's divide-by-zero and invalid-value floating point warnings for
# the NumPy operations that follow.
np.seterr(divide='ignore', invalid='ignore')
class Node:
    """One node of the decision tree.

    Parameters
    ----------
    data_indices : ndarray of int
        Indices of the training rows that fall into this node's region.
    parent : Node or None
        Parent node.  When given, depth, class count, data and labels are
        taken from it and the node's class probabilities are computed.  When
        ``None`` (the root) those attributes are not set here; the caller has
        to assign them.
    """

    def __init__(self, data_indices, parent):
        self.data_indices = data_indices    # data indices inside the region defined by this node
        self.left = None                    # left child of the node
        self.right = None                   # right child of the node
        self.split_feature = None           # the feature used for the split at this node
        self.split_value = None             # the threshold of that feature at this node
        if parent:
            self.depth = parent.depth + 1   # one level below the parent
            self.num_classes = parent.num_classes   # shared with the parent
            self.data = parent.data         # same data object as the parent
            self.labels = parent.labels     # same label object as the parent
            # frequency of each label among the rows of this region
            class_prob = np.bincount(self.labels[data_indices], minlength=self.num_classes)
            # class probabilities; those of the leaf nodes are used for
            # predictions once the tree is built
            self.class_prob = class_prob / np.sum(class_prob)


def greedy_test(node, cost_fn):
    # initialize the best parameter values
    best_cost = np.inf
    best_feature, best_value = None, None
    num_instances, num_features = node.data.shape
            
    #sort the features to get the test value candidates by taking
    #the average of consecutive sorted feature values
    data_sorted = np.sort(node.data[node.data_indices], axis=0)
    test_candidates = (data_sorted[1:] + data_sorted[:-1]) / 2.
    for f in range(num_features):
        data_f = node.data[node.data_indices, f]
        for test in test_candidates[:, f]:

            #Split the indices using the test value of f-th feature
            left_indices = node.data_indices[data_f <= test]
            right_indices = node.data_indices[data_f > test]

            #We can't have a split where a child has zero element
            #if this is true over all the test features and their test values
            #then the function returns the best coset as infinity
            if len(left_indices) == 0 or len(right_indices) == 0:
                continue
            #compute the left and right cost based on the current split
            left_cost = cost_fn(node.labels[left_indices])
            right_cost = cost_fn(node.labels[right_indices])
            num_left, num_right = left_indices.shape[0], right_indices.shape[0]

            # get combined cost using the weighted sum of left and right cost
            cost = (num_left * left_cost + num_right * right_cost) / num_instances

            if cost < best_cost:
                best_cost = cost
                best_feature = f
                best_value = test
    return best_cost, best_feature, best_value

def cost_misclassification(labels):
    """Misclassification cost: one minus the relative frequency of the most common label.

    ``labels`` must contain non-negative integers.
    """
    frequencies = np.bincount(labels)
    return 1 - np.max(frequencies / frequencies.sum())

def cost_entropy(labels):
    """Entropy (natural logarithm) of the label distribution.

    ``labels`` must contain non-negative integers.
    """
    proportions = np.bincount(labels) / len(labels)
    # Classes that do not occur are dropped so log(0) is never evaluated.
    present = proportions[proportions > 0]
    return -(present * np.log(present)).sum()

def cost_gini_index(labels):
    """Gini index: one minus the sum of squared class proportions.

    ``labels`` must contain non-negative integers.
    """
    proportions = np.bincount(labels) / len(labels)
    return 1 - np.sum(proportions * proportions)


class DecisionTree:
    """Container for a decision tree's hyper-parameters and root node.

    ``fit`` and ``predict`` are placeholders here that do nothing and return
    ``None``; working implementations are assigned to the class further down
    in this notebook.
    """

    def __init__(self, num_classes=None, max_depth=3, cost_fn=cost_misclassification, min_leaf_instances=1):
        # Tree state: no root until the tree has been fitted.
        self.root = None
        # Settings read while growing the tree.
        self.num_classes = num_classes                # total number of classes (None = not set yet)
        self.cost_fn = cost_fn                        # cost function scoring candidate splits
        self.max_depth = max_depth                    # depth at which growth stops
        self.min_leaf_instances = min_leaf_instances  # node size at or below which growth stops

    def fit(self, data, labels):
        """Placeholder; does nothing."""
        return None

    def predict(self, data_test):
        """Placeholder; does nothing."""
        return None

def fit(self, data, labels):
    """Build the decision tree from ``data`` and integer ``labels``.

    Parameters
    ----------
    data : ndarray of shape (num_instances, num_features)
    labels : ndarray of non-negative int, one per row of ``data``

    Returns
    -------
    self, so that ``tree.fit(...).predict(...)`` can be chained.
    """
    self.data = data
    self.labels = labels
    if self.num_classes is None:
        # labels are 0-based class ids, so the class count is max + 1
        self.num_classes = np.max(labels) + 1
    # The root has no parent, so Node leaves its shared attributes unset;
    # fill them in by hand.
    root = Node(np.arange(data.shape[0]), None)
    root.data = data
    root.labels = labels
    root.num_classes = self.num_classes
    root.depth = 0
    # Node only computes class_prob for nodes that have a parent.  The root
    # needs it as well: when no split is made the root itself is the leaf
    # that predict() reads its probabilities from.
    root_counts = np.bincount(labels, minlength=self.num_classes)
    root.class_prob = root_counts / np.sum(root_counts)
    self.root = root
    # recursively build the rest of the tree
    self._fit_tree(self.root)
    return self

def _fit_tree(self, node):
    """Recursively grow the subtree rooted at ``node``.

    ``node`` stays a leaf when it is already at the maximum depth, holds no
    more than ``min_leaf_instances`` rows, or no valid split exists.
    Otherwise the best split is recorded on the node and both children are
    created and grown in turn.
    """
    depth_exhausted = node.depth == self.max_depth
    too_small = len(node.data_indices) <= self.min_leaf_instances
    if depth_exhausted or too_small:
        return

    # Greedy choice: the split with the lowest cost.
    cost, split_feature, split_value = greedy_test(node, self.cost_fn)
    # An infinite cost means no split produced two non-empty children.
    if np.isinf(cost):
        return

    node.split_feature, node.split_value = split_feature, split_value

    # Boolean mask over this node's rows: True means the row goes left.
    goes_left = node.data[node.data_indices, split_feature] <= split_value
    left_child = Node(node.data_indices[goes_left], node)
    right_child = Node(node.data_indices[np.logical_not(goes_left)], node)
    node.left, node.right = left_child, right_child

    for child in (left_child, right_child):
        self._fit_tree(child)

# Attach the module-level functions defined above to DecisionTree as methods:
# `fit` replaces the class's placeholder fit(), `_fit_tree` adds the recursive
# helper that fit() calls.
DecisionTree.fit = fit
DecisionTree._fit_tree = _fit_tree

def predict(self, data_test):
    """Return the class probabilities of the leaf each test row lands in.

    Returns
    -------
    ndarray of shape (num_test, num_classes)
    """
    n_samples = data_test.shape[0]
    class_probs = np.zeros((n_samples, self.num_classes))
    for row, sample in enumerate(data_test):
        current = self.root
        # Walk down until a node without a left child (a leaf) is reached.
        while current.left:
            goes_left = sample[current.split_feature] <= current.split_value
            current = current.left if goes_left else current.right
        # The leaf's class probabilities are the prediction for this row.
        class_probs[row, :] = current.class_prob
    return class_probs
# Replace DecisionTree's placeholder predict() with the implementation above.
DecisionTree.predict = predict

def make_prediction_depth(N, depth, x, y):
    """Compare the three split cost functions for a given tree depth.

    Three trees (misclassification, entropy, Gini index) are fitted and
    scored on the same 10 random 80/20 train/test splits.

    Parameters
    ----------
    N : int
        Number of rows in ``x``.
    depth : int
        Maximum tree depth.
    x, y : ndarray
        Feature matrix and integer label vector.

    Returns
    -------
    tuple of float
        Mean accuracy for misclassification, entropy and Gini index, in that
        order.
    """
    trees = (
        DecisionTree(max_depth=depth),                           # misclassification (the default cost)
        DecisionTree(max_depth=depth, cost_fn=cost_entropy),
        DecisionTree(max_depth=depth, cost_fn=cost_gini_index),
    )
    # 80/20 split derived from N instead of a fixed row index; for N = 1151
    # this is 920 training rows.
    n_train = int(0.8 * N)
    accuracy_lists = [[] for _ in trees]
    for _ in range(10):
        inds = np.random.permutation(N)
        x_train, y_train = x[inds[:n_train]], y[inds[:n_train]]
        x_test, y_test = x[inds[n_train:]], y[inds[n_train:]]
        for tree, scores in zip(trees, accuracy_lists):
            probs_test = tree.fit(x_train, y_train).predict(x_test)
            y_pred = np.argmax(probs_test, 1)
            scores.append(np.sum(y_pred == y_test) / y_test.shape[0])

    accuracy1, accuracy2, accuracy3 = (sum(scores) / len(scores) for scores in accuracy_lists)
    return accuracy1, accuracy2, accuracy3

def compare_diff_cost_func(x, y, N):
    """Plot decision-tree accuracy against depth (1..10) for each cost function.

    ``make_prediction_depth`` supplies, for every depth, the mean accuracy of
    the misclassification, entropy and Gini-index trees; the three curves are
    drawn on one figure and shown.
    """
    depths = list(range(1, 11))
    # One (misclassification, entropy, gini) triple per depth, in order.
    triples = [make_prediction_depth(N, depth, x, y) for depth in depths]
    misclassification_scores = [triple[0] for triple in triples]
    entropy_scores = [triple[1] for triple in triples]
    gini_scores = [triple[2] for triple in triples]

    plt.plot(depths, misclassification_scores, label="misclassification")
    plt.plot(depths, entropy_scores, label="entropy")
    plt.plot(depths, gini_scores, label="gini_index")
    plt.title('Plot of accuracy for model#2 with different cost function')
    plt.xlabel('Tree Depth')
    plt.ylabel('Accuracy')
    plt.xticks(range(0, 11, 2))
    plt.legend()
    plt.show()
    
def model_tree_for_specific_depth(x, y, N, depth):
    """Print the mean accuracy of each cost function's tree at the given depth."""
    scores = make_prediction_depth(N, depth, x, y)
    accuracy_misclassification, accuracy_entropy, accuracy_gini_index = scores
    print('the average accuracy of decision tree model with misclassification cost for tree depth ',depth,f' is {accuracy_misclassification*100:.1f}.')
    print('the average accuracy of decision tree model with entropy cost for tree depth ',depth,f' is {accuracy_entropy*100:.1f}.')
    print('the average accuracy of decision tree model with gini cost for tree depth ',depth,f' is {accuracy_gini_index*100:.1f}.')



# L-fold Cross_validation for Decision Tree

In [None]:
def make_prediction(N, depth, x, y):
    """Mean squared label error of a Gini-index tree over 10 random 80/20 splits.

    Parameters
    ----------
    N : int
        Number of rows in ``x``.
    depth : int
        Maximum tree depth.
    x, y : ndarray
        Feature matrix and integer label vector.

    Returns
    -------
    float
        Mean over the splits of ``mean((y_pred - y_test) ** 2)``.
    """
    # Only the Gini-index tree is evaluated here.
    tree = DecisionTree(max_depth=depth, cost_fn=cost_gini_index)
    # 80/20 split derived from N instead of a fixed row index; for N = 1151
    # this is 920 training rows.
    n_train = int(0.8 * N)

    square_error = []
    for _ in range(10):
        inds = np.random.permutation(N)
        x_train, y_train = x[inds[:n_train]], y[inds[:n_train]]
        x_test, y_test = x[inds[n_train:]], y[inds[n_train:]]

        probs_test = tree.fit(x_train, y_train).predict(x_test)
        # hard prediction = class with the maximum probability
        y_pred = np.argmax(probs_test, 1)
        square_error.append(np.square(np.subtract(y_pred, y_test)).mean())
    mean_square_error = sum(square_error) / len(square_error)
    return mean_square_error


def cross_validation(depth, x, y):
    """Mean squared label error of a Gini-index tree under 10-fold cross-validation.

    The rows are cut into 10 consecutive folds (no shuffling here); each fold
    serves once as the validation set while the other nine are used for
    training.

    Parameters
    ----------
    depth : int
        Maximum tree depth.
    x, y : ndarray
        Feature matrix and integer label vector with the same number of rows.

    Returns
    -------
    float
        Mean over the folds of ``mean((y_pred - val_y) ** 2)``.
    """
    n_folds = 10
    # Only the Gini-index tree is evaluated here.
    tree = DecisionTree(max_depth=depth, cost_fn=cost_gini_index)
    # array_split (unlike split) also accepts a row count that is not a
    # multiple of the fold count; x and y are partitioned identically.
    x_folds = np.array_split(x, n_folds)
    y_folds = np.array_split(y, n_folds)

    square_error = []
    for i in range(n_folds):
        val_x, val_y = x_folds[i], y_folds[i]
        train_x = np.concatenate(x_folds[:i] + x_folds[i + 1:])
        train_y = np.concatenate(y_folds[:i] + y_folds[i + 1:])

        probs_test = tree.fit(train_x, train_y).predict(val_x)
        # hard prediction = class with the maximum probability
        y_pred = np.argmax(probs_test, axis=-1)
        square_error.append(np.square(np.subtract(y_pred, val_y)).mean())
    mean_square_error = sum(square_error) / len(square_error)
    return mean_square_error


def maximum_absolute_scaling(df):
    """Return a copy of ``df`` whose positional columns 2..17 are max-abs scaled.

    Each column in the positional slice ``2:18`` is divided by the largest
    absolute value found in that column, so the scaled values lie in
    ``[-1, 1]``.  Columns outside the slice, and the input frame itself, are
    left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose positional columns 2 through 17 are numeric.

    Returns
    -------
    pandas.DataFrame
        The scaled copy.
    """
    df_scaled = df.copy()
    features = df_scaled.iloc[:, 2:18]
    max_abs = features.abs().max()
    # A column that is zero everywhere has a maximum of 0; dividing by it
    # would turn the whole column into NaN.  Dividing by 1 keeps it at zero.
    max_abs = max_abs.where(max_abs != 0, 1)
    df_scaled.iloc[:, 2:18] = features / max_abs
    return df_scaled


def get_average_accuracy_fold(N, i, x, y):
    """Average validation and test error for setting ``i`` over 5 reshuffles.

    On each repetition the data is permuted, ``cross_validation`` is run on
    the first 80% of the permutation and ``make_prediction`` (which draws its
    own random splits) on the full data.  Despite the name, both returned
    values are the mean squared errors reported by those functions, not
    accuracies.

    Parameters
    ----------
    N : int
        Number of rows in ``x``.
    i : int
        Model setting forwarded to ``cross_validation`` / ``make_prediction``.
    x, y : ndarray
        Feature matrix and integer label vector.

    Returns
    -------
    tuple of float
        ``(mean validation error, mean test error)``.
    """
    # 80/20 split derived from N instead of a fixed row index; for N = 1151
    # this is 920 training rows.
    n_train = int(0.8 * N)
    validation_errors = []
    test_errors = []
    for _ in range(5):
        inds = np.random.permutation(N)
        x_train, y_train = x[inds[:n_train]], y[inds[:n_train]]
        validation_errors.append(cross_validation(i, x_train, y_train))
        test_errors.append(make_prediction(N, i, x, y))
    return (sum(validation_errors) / len(validation_errors),
            sum(test_errors) / len(test_errors))

def L_fold_with_plot_tree(x, y, N):
    """Plot validation and test error of the decision tree for depths 1..14.

    The validation curve is drawn with error bars whose height is the
    standard deviation of the validation errors across all depths (one value
    shared by every point).
    """
    tree_depth = list(range(1, 15))
    # One (validation error, test error) pair per depth, in order.
    error_pairs = [get_average_accuracy_fold(N, depth, x, y) for depth in tree_depth]
    validation_errors = [pair[0] for pair in error_pairs]
    test_errors = [pair[1] for pair in error_pairs]
    spread = statistics.stdev(validation_errors)

    plt.plot(tree_depth, test_errors, label="test")
    plt.errorbar(tree_depth, validation_errors, spread, label="validation")
    plt.title('Plot of the mean and standard deviation in 10 fold cross-validation')
    plt.xlabel('Tree depth')
    plt.ylabel('mean squared error')
    plt.xticks(range(0, 15, 2))
    plt.legend()
    plt.show()


# Accuracy after removing features Exudates-2 and MA-1.0

In [None]:
def remove_two_features_accuracy(dataset2):
    """Print and return the KNN accuracy with columns 7 and 9 left out.

    The feature matrix is built from columns 0-6, 8 and 10 up to (but not
    including) the last column; the last column is the label.  A K=13 KNN
    model is then scored on 500 random 80/20 train/test splits.

    Parameters
    ----------
    dataset2 : pandas.DataFrame
        Numeric data set with the integer class label in the last column.

    Returns
    -------
    float
        Accuracy averaged over the 500 splits.
    """
    n_repeats = 500
    y = dataset2.iloc[:, -1].to_numpy()
    x = np.concatenate(
        (dataset2.iloc[:, 0:7].to_numpy(),
         dataset2.iloc[:, 8:9].to_numpy(),
         dataset2.iloc[:, 10:-1].to_numpy()),
        axis=1)

    (N, D), C = x.shape, np.max(y) + 1
    print(f'instances (N) \t {N} \nfeatures (D) \t {D} \nclasses (C) \t {C}')

    print("The features that skipped is ", dataset2.columns[7], " and ", dataset2.columns[9])
    model = KNN(K=13)
    # 80/20 split derived from N instead of a fixed row index; for N = 1151
    # this is 920 training rows.
    n_train = int(0.8 * N)
    accuracy_list = []
    for _ in range(n_repeats):
        inds = np.random.permutation(N)
        x_train, y_train = x[inds[:n_train]], y[inds[:n_train]]
        x_test, y_test = x[inds[n_train:]], y[inds[n_train:]]
        y_prob, _ = model.fit(x_train, y_train).predict(x_test)

        # hard prediction = class with the maximum probability
        y_pred = np.argmax(y_prob, axis=-1)
        accuracy_list.append(np.sum(y_pred == y_test) / y_test.shape[0])

    # Plain mean over the repeats, computed once after the loop so every
    # split carries the same weight.
    accuracies = sum(accuracy_list) / len(accuracy_list)
    print(f'accuracy after skipping feature Exudate2 and MA-1.0 is {accuracies*100:.1f}.')
    return accuracies




# Main

In [None]:
def main():
    """Load the data set, split it into features and labels, and run experiments.

    Only the loading and the unscaled split are active.  Each experiment is a
    commented-out call below; uncomment the ones to run.
    """
    # Load and pre-clean the raw data.
    messidor = load_and_clean_data()

    # Feature matrix x and label vector y without rescaling.  To rescale the
    # continuous features first, use the scaled variant instead:
    #   x, y, N = scaled_data(messidor)
    x, y, N = unscaled_data(messidor)

    # --- KNN experiments ---------------------------------------------------
    # accuracy of the KNN model for a specific number of neighbours
    #model_knn_for_specific_k(x, y, N, 13)
    # accuracy plot for the different distance functions
    #compare_diff_distance_func(x, y, N)
    # 10-fold cross-validation of the KNN model
    #L_fold_with_plot_knn(x, y, N)
    # the two most important features for KNN and their decision boundary
    #two_most_important_feature(messidor)
    # accuracy when features MA-1.0 and Exudates-2 are skipped
    #remove_two_features_accuracy(messidor)

    # --- Decision-tree experiments -----------------------------------------
    # accuracy of the decision tree for a specific tree depth
    #model_tree_for_specific_depth(x, y, N, 2)
    # 10-fold cross-validation of the decision tree model
    #L_fold_with_plot_tree(x, y, N)
    # accuracy plot for the different cost functions
    #compare_diff_cost_func(x, y, N)


# Run the experiments when this code is executed as the main module.
if __name__ == "__main__":
    main()

instances (N) 	 1151 
features (D) 	 19 
classes (C) 	 2
instances (N) 	 1151 
features (D) 	 19 
classes (C) 	 2
instances (N) 	 1151 
features (D) 	 17 
classes (C) 	 2
The features that skipped is  MA-1.0  and  Exudates-2
accuracy after skipping feature Exudate2 and MA-1.0 is 67.4.
