Decision Trees Implementation:


In [1]:
import numpy as np
import pandas as pd

class DecisionTree:
    def __init__(self, impurity='gini', max_depth=10, min_samples_split=2, min_samples_leaf=1):
        self.impurity = impurity
        self.max_depth= max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        
        criterion_funcs = {'gini': self.calculate_gini_impurity, 
                           'entropy': self.calculate_entropy,
                           'misclassification_rate': self.calculate_misclassification_rate}
        self.homogeneity_measure = criterion_funcs.get(impurity, self.calculate_gini_impurity)

    def fit(self, X, y):
        if isinstance(X, pd.DataFrame):
            X = X.values
        if isinstance(y, pd.Series):
            y = y.values
        self.tree = self.split(X, y, 0)   
        
    def calculate_gini_impurity(self, y):
        _, counts = np.unique(y, return_counts=True)
        proportions = counts / len(y)
        return 1 - np.sum(proportions**2)
    
    def calculate_entropy(self, y):
        _, counts = np.unique(y, return_counts=True)
        proportions = counts / len(y)
        return -np.sum(proportions * np.log2(proportions))
    
    def calculate_misclassification_rate(self, y):
        _, counts = np.unique(y, return_counts=True)
        proportions = counts / len(y)
        return 1 - np.max(proportions)
    
    def split(self, X, y, d):
        n_samples, n_features = X.shape

        if n_samples < self.min_samples_split or d> self.max_depth:
            return {'leaf_value': np.round(np.mean(y))}
        
        b_feature, b_threshold, b_impurity = None, None, np.inf
        b_l_X, b_l_y, b_r_X, b_r_y = None, None, None, None
        
        for selected_feat in range(n_features):
            ths = np.unique(X[:, selected_feat])
            for t in ths:
                r_partition_ind = X[:, selected_feat] > t
                l_partition_ind = X[:, selected_feat] <= t

                if  len(y[r_partition_ind]) > 0 and len(y[l_partition_ind]) > 0:
                    l_y, r_y = y[l_partition_ind], y[r_partition_ind]
                    n_left, n_right = len(l_y), len(r_y)
                    total_samples = n_left + n_right
                    impurity = (n_left / total_samples) * self.homogeneity_measure(l_y) \
         + (n_right / total_samples) * self.homogeneity_measure(r_y)

                    if impurity < b_impurity:
                        b_feature, b_threshold, b_impurity = selected_feat, t, impurity
                        b_l_X, b_l_y = X[l_partition_ind], y[l_partition_ind]
                        b_r_X, b_r_y = X[r_partition_ind], y[r_partition_ind]

        if b_feature is not None:
            node = {}
            node['feature'] = b_feature
            node['threshold'] = b_threshold
            node['left'] = self.split(b_l_X, b_l_y, d+ 1)
            node['right'] = self.split(b_r_X, b_r_y, d+ 1)
            return node

        return {'leaf_value': np.round(np.mean(y))}
        
    def predict_sample(self, x, node):
        while 'leaf_value' not in node:
            if x[node['feature']] <= node['threshold']:
                node = node['left']
            else:
                node = node['right']
        return node['leaf_value']

    def predict(self, X):
        X = pd.DataFrame(X)
        X.reset_index(drop=True, inplace=True)
        y_pred = np.zeros(X.shape[0])
        
        for i in range(X.shape[0]):
            y_pred[i] = self.predict_sample(X.loc[i], self.tree)  
        return y_pred


Data Preprocessing for Titanic dataset

In [2]:
# Imports needed for the script
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
%matplotlib inline

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from IPython.display import Image as PImage
from subprocess import check_call
from PIL import Image, ImageDraw, ImageFont

# Loading the data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Store our test passenger IDs for easy access
PassengerId = test['PassengerId']

# Showing overview of the train dataset
train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [3]:
original_train = train.copy() 

full_data = [train, test]

train['Has_Cabin'] = train["Cabin"].apply(lambda x: 0 if type(x) == float else 1)
test['Has_Cabin'] = test["Cabin"].apply(lambda x: 0 if type(x) == float else 1)

# Create new feature FamilySize as a combination of SibSp and Parch
for dataset in full_data:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
# Create new feature IsAlone from FamilySize
for dataset in full_data:
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1
# Remove all NULLS in the Embarked column
for dataset in full_data:
    dataset['Embarked'] = dataset['Embarked'].fillna('S')
# Remove all NULLS in the Fare column
for dataset in full_data:
    dataset['Fare'] = dataset['Fare'].fillna(train['Fare'].median())

# Remove all NULLS in the Age column
for dataset in full_data:
    age_avg = dataset['Age'].mean()
    age_std = dataset['Age'].std()
    age_null_count = dataset['Age'].isnull().sum()
    age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=age_null_count)
    # Next line has been improved to avoid warning
    dataset.loc[np.isnan(dataset['Age']), 'Age'] = age_null_random_list
    dataset['Age'] = dataset['Age'].astype(int)

# Define function to extract titles from passenger names
def get_title(name):
    title_search = re.search(' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""

for dataset in full_data:
    dataset['Title'] = dataset['Name'].apply(get_title)
# Group all non-common titles into one single grouping "Rare"
for dataset in full_data:
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')

for dataset in full_data:
    # Mapping Sex
    dataset['Sex'] = dataset['Sex'].map( {'female': 0, 'male': 1} ).astype(int)
    
    # Mapping titles
    title_mapping = {"Mr": 1, "Master": 2, "Mrs": 3, "Miss": 4, "Rare": 5}
    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Title'] = dataset['Title'].fillna(0)

    # Mapping Embarked
    dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)
    
    # Mapping Fare
    dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] 						        = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare']   = 2
    dataset.loc[ dataset['Fare'] > 31, 'Fare'] 							        = 3
    dataset['Fare'] = dataset['Fare'].astype(int)
    
    # Mapping Age
    dataset.loc[ dataset['Age'] <= 16, 'Age'] 					       = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    dataset.loc[ dataset['Age'] > 64, 'Age'] 

In [4]:
drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp','Sex']
train = train.drop(drop_elements, axis = 1)

test  = test.drop(drop_elements, axis = 1)

Model 1 : Testing the Descision Tree Class

In [5]:
from sklearn.model_selection import train_test_split
y = train['Survived']
X = train.drop('Survived', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

decision_tree = DecisionTree(max_depth=5)
decision_tree.fit(X_train, y_train)

y_pred = decision_tree.predict(X_test)

In [6]:
Acc_for_DT = accuracy_score(y_test,y_pred)

In [7]:
print("Accuracy for Decision Tree: {:.2f}%".format(Acc_for_DT * 100))


Accuracy for Decision Tree: 79.33%


Implementing the Random Forests

In [13]:
import numpy as np
class RandomForest:
    def __init__(self, classifier, num_trees, min_features):
        self.classifier = classifier
        self.num_trees = num_trees
        self.min_features = min_features
        self.trees = []
        
    def fit(self, X, y):
        X = X.values  
        y = y.values
        
        for i in range(self.num_trees):
            # select random number of samples 

            sample_count = np.random.randint(1, X.shape[0]+1)
            sample_ids = np.random.choice(X.shape[0], sample_count, replace=True)
            X_b = X[sample_ids]
            y_b = y[sample_ids]
            
            # select a random subset of features
            feat_count = np.random.randint(self.min_features, X.shape[1]+1)
            f_ids = np.random.choice(X.shape[1], feat_count, replace=False)
            X_b = X_b[:, f_ids]
            
            tree = self.classifier(max_depth=7, 
                                    min_samples_split=4, min_samples_leaf=2)
            tree.fit(X_b, y_b)
            
            self.trees.append((tree, f_ids))
            
    def predict(self, X):
        X = X.values  
        
        pred_arr = []
        for tree, f_ids in self.trees:
            sub_X = X[:, f_ids]
            pred_arr.append(tree.predict(sub_X))
        
        pred_arr = np.array(pred_arr).astype(int)
        return np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=0, arr=pred_arr)


Model 2: Random Forest

In [14]:
rf = RandomForest(classifier=DecisionTree, num_trees=50, min_features=4)

rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)

Acc_for_RF = accuracy_score(y_test, y_pred_rf)
print("Accuracy for Random Forest: {:.2f}%".format(Acc_for_RF * 100))


Accuracy for Random Forest: 81.56%


Ada Boost

In [10]:

class AdaBoost:
    def __init__(self, weak_learner, num_learners, learning_rate):
        self.weak_learner = weak_learner
        self.num_learners = num_learners
        self.learning_rate = learning_rate
    
    def fit(self, X, y):
        sample_count = X.shape[0]
        wts = np.ones(sample_count) / sample_count
        
        self.m_arr = []
        self.al_arr = []
        
        for _ in range(self.num_learners):
            ml = self.weak_learner()
            ml.fit(X, y)
            y_pred = ml.predict(X)
            error = np.mean(np.abs(y_pred - y) / 2 * wts) / np.mean(wts)
            if error > 0.5:
                break
            alpha = self.learning_rate * np.log((1 - error) / error)
            self.m_arr.append(ml)
            self.al_arr.append(alpha)
            
            wts *= np.exp(-alpha * y * y_pred)
            wts /= np.sum(wts)

            
    def predict(self, X):
        n_samples = X.shape[0]
        y_pred = np.zeros(n_samples)
        
        for i in range(len(self.m_arr)):
            ml = self.m_arr[i]
            al = self.al_arr[i]
            y_pred += al * ml.predict(X)
        
        return np.sign(y_pred)


Model 3: Ada Boost

In [11]:
ab = AdaBoost(weak_learner=DecisionTree, num_learners=100, learning_rate=0.1)


ab.fit(X_train, y_train)
y_pred_ab = ab.predict(X_test)

Acc_for_AdB = accuracy_score(y_test, y_pred_ab)
print("Accuracy for Ada Boost: {:.2f}%".format(Acc_for_AdB * 100))

Accuracy for Ada Boost: 80.45%


Comparing the Accuracy for 3 models

In [15]:
print("Accuracy for Decision Tree: {:.2f}%".format(Acc_for_DT * 100))
print("Accuracy for Random Forest: {:.2f}%".format(Acc_for_RF * 100))
print("Accuracy for Ada Boost: {:.2f}%".format(Acc_for_AdB * 100))

Accuracy for Decision Tree: 79.33%
Accuracy for Random Forest: 81.56%
Accuracy for Ada Boost: 80.45%
