# 1. Classification: 
Classification is the task of identifying which category a new observation belongs to, on the basis of a training dataset. There are five datasets. For each dataset, we provide the training data, the training labels, and the test data. Please use the training data and training labels to build your classifier and predict the test labels. A class label is represented by an integer. For example, in the 1st dataset, there are 4 classes where 1 represents the 1st class, 2 represents the 2nd class, etc. Note that some of the datasets contain missing values (a missing entry is filled with 1.00000000000000e+99); please fill in the missing values before performing your classification algorithm.

TrainData 1 contains 3312 features with 150 samples. Testdata1 contains 3312 features with 53 samples. There are 5 classes in this dataset.

TrainData 2 contains 9182 features with 100 samples. Testdata2 contains 9182 features with 74 samples. There are 11 classes in this dataset.

TrainData 3 contains 13  features with 6300 samples. Testdata3 contains 13 features with 2693 samples. There are 9 classes in this dataset.

TrainData 4 contains 112 features with 2547 samples. Testdata4 contains 112 features with 1092 samples. There are 9 classes in this dataset.

TrainData 5 contains 11 features with 1119 samples. Testdata5 contains 11 features with 480 samples. There are 6 classes in this dataset.

In [1]:
# Importing Libraries
import numpy as np
import pandas as pd
import random
from random import seed
from random import randrange
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from scipy.stats import mode



In [2]:
# Importing the datasets
# Missing entries are encoded in the raw files as 1.00000000000000e+99; pandas maps them to NaN on load.
MISSING_VALUE_MARKER = '1.00000000000000e+99'
DATASET_IDS = range(1, 6)

# Feature files are whitespace-delimited. The separator is a raw string because
# '\s' in a normal string literal is an invalid escape sequence (a SyntaxWarning
# on recent Python versions).
train_data = [
    pd.read_csv(f'input/TrainData{i}.txt', sep=r'\s+', header=None, na_values=MISSING_VALUE_MARKER)
    for i in DATASET_IDS
]
traindata1, traindata2, traindata3, traindata4, traindata5 = train_data

test_data = [
    pd.read_csv(f'input/TestData{i}.txt', sep=r'\s+', header=None, na_values=MISSING_VALUE_MARKER)
    for i in DATASET_IDS
]
testdata1, testdata2, testdata3, testdata4, testdata5 = test_data

# Label files are read as tab-separated, with no header row.
train_label = [
    pd.read_csv(f'input/TrainLabel{i}.txt', sep='\t', header=None)
    for i in DATASET_IDS
]
trainlabel1, trainlabel2, trainlabel3, trainlabel4, trainlabel5 = train_label

In [3]:
# Count the missing (NaN) entries in every dataset; the 1.00000000000000e+99 markers were read in as NaN
def count_na(data, type):
    """Print the total number of missing cells in each frame of ``data``.

    Parameters
    ----------
    data : sequence of pandas.DataFrame
        Datasets to inspect, reported in order and numbered from 1.
    type : str
        Label used in the printed messages (e.g. "train" or "test").
    """
    print("Missing values in the " + type + " data:")
    for position, frame in enumerate(data, start=1):
        # isnull().sum() counts per column; the second sum() totals the whole frame
        total_missing = frame.isnull().sum().sum()
        print(f"{type} dataset {position}: {total_missing}")

# Report the number of missing entries in each train and test dataset
count_na(train_data, "train")
count_na(test_data, "test")



Missing values in the train data:
train dataset 1: 9936
train dataset 2: 0
train dataset 3: 1886
train dataset 4: 0
train dataset 5: 0
Missing values in the test data:
test dataset 1: 7021
test dataset 2: 0
test dataset 3: 0
test dataset 4: 0
test dataset 5: 0


In [4]:
# Now we need to replace the missing values in the dataset using the mean of each column
def replace_na(data):
    """Fill missing values in every frame of ``data`` with that frame's column means.

    Each DataFrame is modified in place, so existing references to the same
    frames observe the filled values. Nothing is returned.

    Parameters
    ----------
    data : sequence of pandas.DataFrame
        Datasets whose NaN entries should be replaced.
    """
    for frame in data:
        # frame.mean() is indexed by column label, so fillna applies each
        # column's own mean. One vectorised call replaces a Python-level loop
        # over every column, which matters for frames with thousands of features.
        frame.fillna(frame.mean(), inplace=True)
         
replace_na(train_data)

# Replace missing test values with the *training* column means rather than the
# test set's own means. This keeps the test features consistent with what the
# classifier was trained on, and still works if a column is entirely missing
# in the test set (its own mean would be NaN). Mean-imputing the training data
# above does not change its column means, so they can be computed here.
for train_frame, test_frame in zip(train_data, test_data):
    test_frame.fillna(train_frame.mean(), inplace=True)

# Now checking if any remaining missing vals left
count_na(train_data, "train")
count_na(test_data, "test")


Missing values in the train data:
train dataset 1: 0
train dataset 2: 0
train dataset 3: 0
train dataset 4: 0
train dataset 5: 0
Missing values in the test data:
test dataset 1: 0
test dataset 2: 0
test dataset 3: 0
test dataset 4: 0
test dataset 5: 0


In [5]:
# Splitting the train data into training and testing
# X_train = []
# X_test = []
# y_train = []
# y_test = []

# def split_data(data, label):
#     X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.2, random_state = 42)
#     return X_train, X_test, y_train, y_test

# for i in range(len(train_data)):
#     X_train_i, X_test_i, y_train_i, y_test_i = split_data(train_data[i], train_label[i])
#     X_train.append(X_train_i)
#     X_test.append(X_test_i)
#     y_train.append(y_train_i)
#     y_test.append(y_test_i)
#     print("Train dataset " + str(i+1) + ": " + str(len(X_train_i)))
#     print("Test dataset " + str(i+1) + ": " + str(len(X_test_i)))



In [6]:
# Now we are creating our decision tree classifier https://www.youtube.com/watch?v=sgQAhG5Q7iY
# First the node class - describes a single node of the tree
class Node:
    """One node of the decision tree.

    A decision node carries ``feature_index``, ``threshold``, the ``left`` and
    ``right`` children and the ``info_gain`` of its split; a leaf node carries
    only ``value``. All fields default to ``None``.
    """

    def __init__(self, feature_index=None, threshold=None, left=None, right=None, info_gain=None, value=None):
        # the test applied at a decision node: which feature, compared against which threshold
        self.feature_index, self.threshold = feature_index, threshold
        # child subtrees
        self.left, self.right = left, right
        # information gain achieved by this node's split
        self.info_gain = info_gain
        # value stored at a leaf node
        self.value = value

In [7]:
class DecisionTreeClassifier():
    ''' Decision tree classifier using greedy, threshold-based binary splits.

    Every observed feature value is tried as a "<= threshold" split and the
    one with the highest Gini-based information gain is chosen. Growth stops
    when a node has fewer than ``min_samples_split`` samples, when the node is
    already ``max_depth`` levels below the root, or when no split yields a
    positive gain.
    '''

    def __init__(self, min_samples_split=2, max_depth=2):
        ''' constructor

        min_samples_split: minimum number of samples a node needs to be split.
        max_depth: maximum depth of the tree (0 produces a single leaf).
        '''

        # initialize the root of the tree
        self.root = None

        # stopping conditions
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth

    def build_tree(self, dataset, curr_depth=0):
        ''' recursive function to build the tree

        dataset: 2-D array whose last column holds the labels.
        curr_depth: depth of the node being built (the root is 0).
        Returns the Node at the root of the constructed (sub)tree.
        '''

        X, Y = dataset[:, :-1], dataset[:, -1]
        num_samples, num_features = np.shape(X)

        # split until stopping conditions are met; a node at depth == max_depth
        # must be a leaf, otherwise the tree would be one level too deep
        if num_samples >= self.min_samples_split and curr_depth < self.max_depth:
            # find the best split
            best_split = self.get_best_split(dataset, num_samples, num_features)
            # get_best_split returns {} when no valid split exists (e.g. all
            # rows share the same feature values); treat that as zero gain
            if best_split.get("info_gain", 0) > 0:
                # recur left
                left_subtree = self.build_tree(best_split["dataset_left"], curr_depth + 1)
                # recur right
                right_subtree = self.build_tree(best_split["dataset_right"], curr_depth + 1)
                # return decision node
                return Node(best_split["feature_index"], best_split["threshold"],
                            left_subtree, right_subtree, best_split["info_gain"])

        # compute leaf node
        leaf_value = self.calculate_leaf_value(Y)
        # return leaf node
        return Node(value=leaf_value)

    def get_best_split(self, dataset, num_samples, num_features):
        ''' function to find the best split

        Returns a dict with keys "feature_index", "threshold", "dataset_left",
        "dataset_right" and "info_gain", or an empty dict when no threshold
        separates the data into two non-empty parts.
        '''

        # dictionary to store the best split
        best_split = {}
        max_info_gain = -float("inf")
        y = dataset[:, -1]

        # loop over all the features
        for feature_index in range(num_features):
            feature_values = dataset[:, feature_index]
            # np.unique returns sorted values; the last one can never leave a
            # non-empty right child, so it is skipped
            possible_thresholds = np.unique(feature_values)[:-1]
            # loop over the remaining feature values present in the data
            for threshold in possible_thresholds:
                left_mask = feature_values <= threshold
                right_mask = feature_values > threshold
                # check if childs are not null
                if not left_mask.any() or not right_mask.any():
                    continue
                # compute information gain from the labels only; the row
                # subsets are materialised once, for the winning split
                curr_info_gain = self.information_gain(y, y[left_mask], y[right_mask], "gini")
                # update the best split if needed (strict ">" keeps the first best)
                if curr_info_gain > max_info_gain:
                    best_split = {
                        "feature_index": feature_index,
                        "threshold": threshold,
                        "info_gain": curr_info_gain,
                    }
                    max_info_gain = curr_info_gain

        if best_split:
            dataset_left, dataset_right = self.split(
                dataset, best_split["feature_index"], best_split["threshold"])
            best_split["dataset_left"] = dataset_left
            best_split["dataset_right"] = dataset_right

        # return best split
        return best_split

    def split(self, dataset, feature_index, threshold):
        ''' function to split the data

        Rows with feature value <= threshold go left, rows with value >
        threshold go right.
        '''

        dataset = np.asarray(dataset)
        feature_values = dataset[:, feature_index]
        # boolean-mask indexing instead of a Python loop over rows
        dataset_left = dataset[feature_values <= threshold]
        dataset_right = dataset[feature_values > threshold]
        return dataset_left, dataset_right

    def information_gain(self, parent, l_child, r_child, mode="entropy"):
        ''' function to compute information gain

        mode: "gini" to use the Gini index, anything else to use entropy.
        '''

        weight_l = len(l_child) / len(parent)
        weight_r = len(r_child) / len(parent)
        if mode == "gini":
            gain = self.gini_index(parent) - (weight_l*self.gini_index(l_child) + weight_r*self.gini_index(r_child))
        else:
            gain = self.entropy(parent) - (weight_l*self.entropy(l_child) + weight_r*self.entropy(r_child))
        return gain

    def entropy(self, y):
        ''' function to compute entropy (in bits) of the labels y '''

        _, counts = np.unique(y, return_counts=True)
        p_cls = counts / len(y)
        return -np.sum(p_cls * np.log2(p_cls))

    def gini_index(self, y):
        ''' function to compute gini index of the labels y '''

        _, counts = np.unique(y, return_counts=True)
        p_cls = counts / len(y)
        return 1 - np.sum(p_cls**2)

    def calculate_leaf_value(self, Y):
        ''' function to compute leaf node: the most frequent label in Y

        Ties are resolved in favour of the label that appears first in Y.
        '''

        Y = np.asarray(Y)
        _, first_positions, counts = np.unique(Y, return_index=True, return_counts=True)
        # among the labels sharing the highest count, take the earliest occurrence
        winner_position = first_positions[counts == counts.max()].min()
        return Y[winner_position]

    def print_tree(self, tree=None, indent=" "):
        ''' function to print the tree '''

        if tree is None:
            tree = self.root

        if tree.value is not None:
            print(tree.value)

        else:
            print("X_"+str(tree.feature_index), "<=", tree.threshold, "?", tree.info_gain)
            print("%sleft:" % (indent), end="")
            self.print_tree(tree.left, indent + indent)
            print("%sright:" % (indent), end="")
            self.print_tree(tree.right, indent + indent)

    def fit(self, X, Y):
        ''' function to train the tree

        X: 2-D array-like (or DataFrame) of features, one row per sample.
        Y: labels, either 1-D or a single column, one per sample.
        '''

        X = np.asarray(X)
        # accept 1-D labels as well as an (n, 1) column
        Y = np.asarray(Y).reshape(-1, 1)
        dataset = np.concatenate((X, Y), axis=1)
        self.root = self.build_tree(dataset)

    def predict(self, X):
        ''' function to predict new dataset

        X: 2-D array-like (or DataFrame) of features. Returns a list with one
        predicted label per row.
        '''

        # np.asarray makes row iteration work for DataFrames too (iterating a
        # DataFrame directly yields column labels, not rows)
        predictions = [self.make_prediction(x, self.root) for x in np.asarray(X)]
        return predictions

    def make_prediction(self, x, tree):
        ''' function to predict a single data point '''

        # leaf reached
        if tree.value is not None:
            return tree.value
        feature_val = x[tree.feature_index]
        if feature_val <= tree.threshold:
            return self.make_prediction(x, tree.left)
        else:
            return self.make_prediction(x, tree.right)


In [32]:
# Hold out 30% of training dataset 4 to compare tree depths
X_train, X_test, y_train, y_test = train_test_split(
    traindata4, trainlabel4, test_size=0.3, random_state=21
)

# Print the hold-out accuracy for each candidate max depth (min_samples_split fixed at 3)
max_depths = list(range(1, 16))
for depth in max_depths:
    classifier = DecisionTreeClassifier(min_samples_split=3, max_depth=depth)
    classifier.fit(X_train, y_train)
    # predict takes the raw NumPy rows of the hold-out frame
    y_pred = classifier.predict(X_test.values)
    print("Accuracy for max_depth = ", depth, ": ", accuracy_score(y_test, y_pred))

0.59375
