In [24]:
import ast
import itertools

import numpy as np
import pandas as pd

# Load the CSV, keep the label plus the two numeric features, and normalise the
# two label spellings that carry a leading space.
data = (
    pd.read_csv("gender.csv")
    .loc[:, ['Gender', 'Height(cm)', 'Weight(kg)']]
    .replace([' male', ' female'], ['male', 'female'])
)
data.head()

Unnamed: 0,Gender,Height(cm),Weight(kg)
0,male,175,70
1,male,182,85
2,female,160,62
3,male,178,79
4,female,165,58


In [32]:
from sklearn.model_selection import train_test_split

# X deliberately keeps the 'Gender' column: train_tree() receives the target as
# a column *name* and reads it from the same frame as the features.
X, y = data, data['Gender']
# Fixed seed so the 70/30 split, the learned tree and the reported accuracy are
# identical on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 42)

In [26]:
class Node:
    """Plain attribute container for a single tree node.

    Every constructor argument is optional and is stored unchanged on the
    instance under the same name. No other code visible in this notebook
    reads these attributes, so their meaning is inferred from the names only:
    presumably ``feature_index``/``threshold``/``info_gain`` describe a split,
    ``left``/``right`` are child nodes and ``value`` is a leaf prediction
    (TODO confirm against the intended use).
    """

    def __init__(self, feature_index=None, threshold=None, left=None, right=None,
                 info_gain=None, value=None):
        # Split description.
        self.feature_index, self.threshold = feature_index, threshold
        # Children.
        self.left, self.right = left, right
        # Split quality and leaf payload.
        self.info_gain, self.value = info_gain, value

In [27]:
def entropy(y):
    """Return the base-2 entropy of the value distribution of ``y``.

    ``y`` is a pandas Series. Relative frequencies are the value counts
    divided by the full length of the Series. A small constant (1e-9) is added
    inside the logarithm so that ``log2`` never receives an exact zero.
    """
    probabilities = y.value_counts() / y.shape[0]
    weighted_surprise = -probabilities * np.log2(probabilities + 1e-9)
    return np.sum(weighted_surprise)
class DecisionTree:
    """Split-search helpers used to grow a decision tree on pandas data.

    The class carries no fitted state: its methods score candidate splits,
    pick the best one, partition a frame and compute a leaf prediction.

    Numeric splits use the rule ``feature <= threshold`` both when scoring
    (``max_information_gain_split``) and when partitioning (``make_split``).
    This matches the ``"<="`` label written by ``train_tree`` and the ``<=``
    comparison performed by ``classify_data`` in this notebook, so a value
    equal to the threshold takes the same branch in training and prediction.

    A column is treated as categorical when its dtype is ``object`` and as
    numeric otherwise.
    """

    def __init__(self, min_samples_split=2, max_depth=2):
        # These attributes are stored but not read by any method of this class.
        self.root = None
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth

    def gini_impurity(self, y):
        """Return the Gini impurity ``1 - sum(p_i ** 2)`` of Series ``y``."""
        p = y.value_counts() / y.shape[0]  # relative class frequencies
        return 1 - np.sum(p ** 2)

    def variance(self, y):
        """Return the sample variance of ``y``; 0 for a single observation."""
        if len(y) == 1:
            # Series.var() would be NaN for one element.
            return 0
        return y.var()

    def information_gain(self, y, mask, func=None):
        """Score the split of ``y`` described by the boolean Series ``mask``.

        Parameters
        ----------
        y : pandas.Series
            Target values.
        mask : pandas.Series of bool
            True for rows sent to the first branch; aligned with ``y``.
        func : callable, optional
            Impurity function for object-dtype targets. ``None`` (default)
            resolves to the module-level ``entropy`` at call time.

        Returns
        -------
        Parent impurity minus the size-weighted impurity of both branches.
        Variance is used instead of ``func`` when ``y`` is not object dtype.
        Returns 0 when either branch would be empty.
        """
        a = int(mask.sum())
        b = mask.shape[0] - a

        if a * b == 0:
            return 0

        if y.dtypes != 'O':
            impurity = self.variance
        else:
            impurity = entropy if func is None else func

        total = a + b
        # ``~mask`` is the explicit boolean complement of the first branch.
        return impurity(y) - a / total * impurity(y[mask]) - b / total * impurity(y[~mask])

    def categorical_options(self, a):
        """Return every non-empty proper subset of the unique values of ``a``.

        Each subset is a list; subsets are ordered by size, then in the order
        produced by ``itertools.combinations``. The empty set and the full set
        are excluded because they leave one branch empty.
        """
        values = a.unique()
        return [
            list(subset)
            for size in range(1, len(values))
            for subset in itertools.combinations(values, size)
        ]

    def max_information_gain_split(self, x, y, func=None):
        """Find the best split of target ``y`` on the single feature ``x``.

        For a numeric feature the candidates are its sorted unique non-null
        values except the largest, each tested as ``x <= value``. For an
        object-dtype feature the candidates are the subsets returned by
        ``categorical_options``, each tested as ``x.isin(subset)``.

        Returns
        -------
        tuple
            ``(best_gain, best_split, is_numeric, True)``, or
            ``(None, None, None, False)`` when there is no candidate.
            On ties the first candidate wins.
        """
        numeric_variable = bool(x.dtypes != 'O')
        if numeric_variable:
            # Dropping the largest value avoids a candidate whose "<=" branch
            # would contain every row.
            options = x.dropna().sort_values().unique()[:-1]
        else:
            options = self.categorical_options(x)

        best_ig = None
        best_split = None
        for val in options:
            mask = x <= val if numeric_variable else x.isin(val)
            val_ig = self.information_gain(y, mask, func)
            if best_ig is None or val_ig > best_ig:
                best_ig, best_split = val_ig, val

        if best_ig is None:
            return (None, None, None, False)
        return (best_ig, best_split, numeric_variable, True)

    def get_best_split(self, y, data):
        """Pick the feature and split with the highest information gain.

        Parameters
        ----------
        y : column label of the target inside ``data``.
        data : pandas.DataFrame holding the target and all feature columns.

        Returns
        -------
        tuple
            ``(split_variable, split_value, split_ig, split_numeric)``; all
            four are ``None`` when no feature offers a candidate split.
            On ties the first feature (in column order) wins.
        """
        target = data[y]
        features = data.drop(y, axis=1)

        split_variable = split_value = split_ig = split_numeric = None
        for variable in features.columns:
            ig, value, is_numeric, found = self.max_information_gain_split(
                features[variable], target
            )
            if not found:
                continue
            if split_ig is None or ig > split_ig:
                split_variable, split_value = variable, value
                split_ig, split_numeric = ig, is_numeric
        return (split_variable, split_value, split_ig, split_numeric)

    def make_split(self, variable, value, data, is_numeric):
        """Partition ``data`` on ``variable``.

        Numeric: rows with ``variable <= value`` versus the rest.
        Categorical: rows whose ``variable`` is in ``value`` versus the rest.
        Returns ``(matching_rows, remaining_rows)``.
        """
        if is_numeric:
            selected = data[variable] <= value
        else:
            selected = data[variable].isin(value)
        return (data[selected], data[~selected])

    def make_prediction(self, data, target_factor):
        """Return the leaf value for target Series ``data``.

        The most frequent value when ``target_factor`` is truthy, otherwise
        the mean.
        """
        if target_factor:
            return data.value_counts().idxmax()
        return data.mean()
    

In [33]:
def train_tree(data,y, target_factor, max_depth = None,min_samples_split = None, min_information_gain = 1e-20, counter=0, max_categories = 20):
    """Recursively grow a decision tree and return it as nested dicts.

    Parameters
    ----------
    data : pandas.DataFrame containing the feature columns and the target.
    y : label of the target column inside ``data``.
    target_factor : passed to ``DecisionTree.make_prediction``; truthy means
        leaves hold the most frequent target value, otherwise the mean.
    max_depth : stop splitting once ``counter`` reaches this depth
        (``None`` disables the check).
    min_samples_split : only split when ``data`` has more rows than this
        (``None`` disables the check).
    min_information_gain : minimum gain a split must reach to be applied.
    counter : current depth; 0 on the initial call.
    max_categories : upper bound on unique values per object column, checked
        only on the initial call.

    Returns
    -------
    Either a leaf prediction, or ``{question: [yes_subtree, no_subtree]}``
    where ``question`` is ``"<feature> <op>  <value>"`` with ``op`` being
    ``"<="`` for numeric splits and ``"in"`` otherwise. When both branches
    come back equal, that single result is returned instead of a node.

    Raises
    ------
    ValueError
        If an object column has more than ``max_categories`` unique values.
    """
    builder = DecisionTree()

    def leaf():
        return builder.make_prediction(data[y], target_factor)

    # The cardinality check only runs at the root.
    if counter == 0:
        column_types = data.dtypes
        for column in column_types[column_types == "object"].index:
            var_length = len(data[column].value_counts())
            if var_length > max_categories:
                raise ValueError('The variable ' + column + ' has '+ str(var_length) + ' unique values, which is more than the accepted ones: ' +  str(max_categories))

    depth_ok = max_depth is None or counter < max_depth
    size_ok = min_samples_split is None or data.shape[0] > min_samples_split
    if not (depth_ok and size_ok):
        return leaf()

    var, val, ig, var_type = builder.get_best_split(y, data)
    # Stop when no split exists or its gain does not reach the threshold.
    if ig is None or not ig >= min_information_gain:
        return leaf()

    left, right = builder.make_split(var, val, data, var_type)
    operator_label = "<=" if var_type else "in"
    question = "{} {}  {}".format(var, operator_label, val)

    next_depth = counter + 1
    yes_answer = train_tree(left,y, target_factor, max_depth,min_samples_split,min_information_gain, next_depth)
    no_answer = train_tree(right,y, target_factor, max_depth,min_samples_split,min_information_gain, next_depth)

    # A split whose two branches agree carries no information; collapse it.
    if yes_answer == no_answer:
        return yes_answer
    return {question: [yes_answer, no_answer]}


# Stopping criteria handed to train_tree below.
max_depth = 5
min_samples_split = 20
min_information_gain  = 1e-5

# 'Gender' is the target column inside X_train; True requests categorical
# (most-frequent-value) leaf predictions.
decisions = train_tree(
    X_train,
    'Gender',
    True,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_information_gain=min_information_gain,
)


In [34]:
# Display the learned tree: nested dicts of {question: [yes_branch, no_branch]}.
decisions

{'Height(cm) <=  174': ['female', 'male']}

In [35]:
def _parse_question(question):
    """Split a tree question into ``(feature, operator, raw_value)``.

    Questions have the form ``"<feature> <op>  <value>"`` (one space before
    the operator, two after) with ``op`` being ``"<="`` or ``"in"``. The
    earliest such separator is used, so feature names may contain spaces.
    Text that lacks this exact separator is split on whitespace instead.
    """
    hits = []
    for op in ("<=", "in"):
        position = question.find(" {}  ".format(op))
        if position != -1:
            hits.append((position, op))
    if hits:
        position, op = min(hits)
        # Separator length: one leading space + operator + two trailing spaces.
        return question[:position], op, question[position + len(op) + 3:]
    feature, op, raw_value = question.split(None, 2)
    return feature, op, raw_value


def classify_data(observation, tree):
    """Predict the label of one observation by walking a decision tree.

    Parameters
    ----------
    observation : mapping or pandas.Series indexable by feature name.
    tree : nested ``{question: [yes_branch, no_branch]}`` dicts, or a bare
        leaf value (a tree that never split).

    Returns
    -------
    The leaf value reached. For ``"<="`` questions the yes-branch is taken
    when ``observation[feature] <= float(value)``; for ``"in"`` questions
    when the observed value is a member of the listed categories.
    """
    node = tree
    while isinstance(node, dict):
        question = next(iter(node))
        feature, operator, raw_value = _parse_question(question)
        observed = observation[feature]

        if operator == '<=':
            take_yes = observed <= float(raw_value)
        else:
            # The categories are stored as the text of a Python list; parse it
            # back so membership is tested against whole category values.
            try:
                categories = ast.literal_eval(raw_value)
            except (ValueError, SyntaxError):
                categories = raw_value
            if not isinstance(categories, (list, tuple, set, frozenset)):
                # Unparseable value: fall back to matching against the raw text.
                categories = raw_value
            take_yes = observed in categories

        node = node[question][0 if take_yes else 1]
    return node


In [44]:
%%capture
# One prediction per held-out row, in row order.
y_pred = [
    classify_data(X_test.iloc[row], decisions)
    for row in range(X_test.shape[0])
]

def accuracy(y_true, y_pred):
    """Return the fraction of positions where ``y_pred`` equals ``y_true``.

    Both arguments may be any array-like (list, tuple, numpy array, pandas
    Series); they are compared position by position.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.

    Returns NaN for empty input.
    """
    # Convert first: comparing two plain lists with ``==`` yields a single
    # bool for the whole list rather than one result per element.
    truth = np.asarray(y_true)
    predicted = np.asarray(y_pred)
    if truth.shape != predicted.shape:
        raise ValueError(
            "y_true and y_pred must have the same shape, got {} and {}".format(
                truth.shape, predicted.shape
            )
        )
    if truth.size == 0:
        return float('nan')
    return np.sum(truth == predicted) / len(truth)

In [45]:
# Share of held-out rows whose predicted label matches the true label.
accuracy(y_test, y_pred)

1.0