In [2]:
import timeit
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

class Node:
    '''
    A single node of the decision tree.

    Internal nodes carry the split description (feature, threshold, gain) and
    the two child nodes; leaf nodes only carry the predicted value.
    '''
    def __init__(self, feature=None, threshold=None, data_left=None, data_right=None, gain=None, value=None):
        # split description: column index and the value it is compared against
        self.feature, self.threshold = feature, threshold
        # children: left holds rows with feature <= threshold, right the rest
        self.data_left, self.data_right = data_left, data_right
        # information gain achieved by the split stored in this node
        self.gain = gain
        # class label returned by a leaf node (None for internal nodes)
        self.value = value


class DecisionTree:
    '''
    Binary decision-tree classifier that splits on information gain (entropy).

    Default vals for min_samples_split and max_depth chosen approximately as
    min_samples_split=n^(1/3)
    max_depth=features^(1/2)

    Arguments:
    min_samples_split: int, a node with fewer rows than this becomes a leaf
    max_depth: int, nodes deeper than this become leaves (the root has depth 0)
    randomized_features: bool, if truthy every split only looks at a random
        subset of int(sqrt(n_features)) distinct features
    '''
    def __init__(self, min_samples_split=36, max_depth=4,randomized_features=False):
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.root = None  # set by fit()
        self.randomized_features=randomized_features

    @staticmethod
    def _entropy(s):
        '''
        Helper function to compute entropy based on list of s integer values

        Arguments:
        s: list of class labels; they are cast to int64 and must be
           non-negative because they are counted with np.bincount

        Returns:
        entropy: float (0 for an empty input)
        '''
        labels = np.array(s, dtype=np.int64) # runtime error w/o np.int64
        if labels.size == 0:
            return 0.0
        counts = np.bincount(labels)
        # Drop absent classes before taking the log. Using
        # np.log2(..., where=...) without `out` leaves those entries
        # uninitialised, and 0 * garbage can turn the entropy into NaN.
        probs = counts[counts > 0] / labels.size
        return -np.sum(probs * np.log2(probs))

    def _information_gain(self, parent, left_child, right_child):
        '''
        Helper function, computes information_gain

        Arguments:
        parent: list
        left_child: list
        right_child: list

        Returns:
        information_gain: float, parent entropy minus the size-weighted
        entropy of the two children
        '''
        left_frac = len(left_child) / len(parent)
        right_frac = len(right_child) / len(parent)

        children_entropy = left_frac * self._entropy(left_child) + right_frac * self._entropy(right_child)
        return self._entropy(parent) - children_entropy

    def _best_split(self, X, y):
        '''
        Helper function, finds the (feature, threshold) pair with the highest
        information gain among the candidate features

        Arguments:
        X: np.array, data
        y: np.array, targets

        Returns:
        best_split: dict with keys 'feature_index', 'threshold', 'df_left',
        'df_right' and 'gain'; empty dict if no threshold separates the rows.
        'df_left'/'df_right' hold the rows of X with the target appended as
        the last column.
        '''
        best_split = {}
        best_info_gain = -1
        n_rows, n_cols = X.shape

        if self.randomized_features:
            # Draw distinct features: sampling with replacement could pick the
            # same column twice and shrink the subset that is really examined.
            tweak_size = int(np.sqrt(n_cols))
            chosen_cols = np.random.choice(a=range(n_cols), size=tweak_size, replace=False)
        else:
            chosen_cols = range(n_cols)

        # The joined data/target matrix does not depend on the candidate
        # split, so build it once instead of once per threshold.
        df = np.concatenate((X, y.reshape(-1, 1)), axis=1)
        targets = df[:, -1]

        for feature in chosen_cols:
            X_curr = X[:, feature] #choose splitting feature
            for threshold in np.unique(X_curr): #check every unique val in feature
                # Boolean masks replace the row-by-row Python filtering.
                # Both masks are computed explicitly so rows that compare
                # False both ways (NaN) stay out of both partitions.
                left_mask = X_curr <= threshold
                right_mask = X_curr > threshold

                # If all data is on one side, there is nothing to test
                if not (left_mask.any() and right_mask.any()):
                    continue

                gain = self._information_gain(targets, targets[left_mask], targets[right_mask])

                if gain > best_info_gain: #save if new split is better than previous best
                    best_split = { # save info for node assignment
                        'feature_index': feature,
                        'threshold': threshold,
                        'df_left': df[left_mask],
                        'df_right': df[right_mask],
                        'gain': gain
                    }
                    best_info_gain = gain

        return best_split

    def _build(self, X, y, depth=0):
        '''
        Helper recursive function, builds DT

        Arguments:
        X: np.array, data
        y: np.array, targets
        depth: int, for stopping criteria

        Returns:
        Node: class instance
        '''
        n_rows = X.shape[0]

        # Only search for a split when the node is allowed to be split; the
        # search is by far the most expensive step and its result is unused
        # for nodes that the size/depth limits turn into leaves anyway.
        if n_rows >= self.min_samples_split and depth <= self.max_depth:
            best = self._best_split(X, y)
            # Split only if a separating threshold exists and it adds information
            if best and best['gain'] > 0:
                left = self._build(
                    X=best['df_left'][:, :-1],
                    y=best['df_left'][:, -1],
                    depth=depth + 1
                )
                right = self._build(
                    X=best['df_right'][:, :-1],
                    y=best['df_right'][:, -1],
                    depth=depth + 1
                )
                return Node(
                    feature=best['feature_index'],
                    threshold=best['threshold'],
                    data_left=left,
                    data_right=right,
                    gain=best['gain']
                )
        # Leaf node - value is the most common target value, used for classification
        return Node(
            value=Counter(y).most_common(1)[0][0]
        )

    def fit(self, X, y):
        '''
        Builds DT

        Arguments:
        X: array-like of shape (n_samples, n_features), data
        y: array-like with n_samples entries, targets

        Raises:
        ValueError: if X is not 2-D, X and y disagree in length, or there
        are no samples
        '''
        X = np.asarray(X)
        y = np.asarray(y).ravel()
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if X.shape[0] != y.shape[0]:
            raise ValueError("X and y must contain the same number of samples")
        if y.shape[0] == 0:
            raise ValueError("cannot fit a decision tree on zero samples")
        # Call _build starting from the root
        self.root = self._build(X, y)

    def _predict(self, x, tree):
        '''
        Helper function, traverses the tree to predict for a single instance

        Arguments:
        x: np.array, single observation
        tree: the trained tree (node to start the traversal from)

        Returns:
        Class prediction: value stored in the leaf that is reached
        '''
        node = tree
        # Internal nodes have value None; walk down until a leaf is reached.
        while node.value is None:
            # Anything that is not <= threshold goes right. This includes NaN,
            # for which neither comparison holds; previously that case fell
            # through both branches and yielded None.
            if x[node.feature] <= node.threshold:
                node = node.data_left
            else:
                node = node.data_right
        return node.value

    def predict(self, X):
        '''
        Function calling predict for all instances

        Arguments:
        X: np.array, data

        Returns:
        Class prediction: list with one prediction per row of X

        Raises:
        RuntimeError: if the tree has not been fitted yet
        '''
        if self.root is None:
            raise RuntimeError("DecisionTree.fit must be called before predict")
        return [self._predict(x, self.root) for x in X] #calls _predict for every instance

class RandomForest:
    '''
    Random forest classifier: an ensemble of DecisionTree instances, each
    trained on a bootstrap sample of the data, combined by majority vote.

    Arguments:
    num_trees: int, number of trees to build
    min_samples_split: int, passed on to every DecisionTree
    max_depth: int, passed on to every DecisionTree
    randomized_features: bool, passed on to every DecisionTree
    '''
    # fit() gives up after this many failed attempts in a row instead of
    # retrying forever on an error that will never go away.
    _MAX_CONSECUTIVE_FAILURES = 5

    def __init__(self, num_trees=10, min_samples_split=5, max_depth=5,randomized_features=False):
        self.num_trees = num_trees
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.randomized_features=randomized_features
        # Will store individually trained decision trees
        self.decision_trees = []

    @staticmethod
    def _sample(X, y):
        '''
        Samples with replacement (bootstrapping)

        Arguments:
        X: np.array, features
        y: np.array, target

        Returns:
        Sample: tuple (sample of features, sample of target), both with as
        many rows as X and drawn with the same row indices
        '''
        n_rows, n_cols = X.shape
        # Sample with replacement
        samples = np.random.choice(a=n_rows, size=n_rows, replace=True)
        return X[samples], y[samples]

    def fit(self, X, y):
        '''
        Trains the RF classifier by building num_trees decision trees

        Arguments:
        X: array-like, features
        y: array-like, target

        Raises:
        RuntimeError: if building a tree fails _MAX_CONSECUTIVE_FAILURES
        times in a row; the last error is attached as the cause
        '''
        # Reset
        self.decision_trees = []

        # Positional fancy indexing in _sample needs real arrays
        X = np.asarray(X)
        y = np.asarray(y)

        # Build each tree of the forest
        num_built = 0
        consecutive_failures = 0
        while num_built < self.num_trees:
            try:
                print("beginning to build tree nr", num_built)
                clf = DecisionTree(
                    min_samples_split=self.min_samples_split,
                    max_depth=self.max_depth,randomized_features=self.randomized_features
                )
                # Obtain data sample
                _X, _y = self._sample(X, y)
                # Train
                clf.fit(_X, _y)
            except Exception as e:
                print(e)
                consecutive_failures += 1
                # A different bootstrap sample may succeed, so retry, but a
                # persistent error must not turn into an endless loop.
                if consecutive_failures >= self._MAX_CONSECUTIVE_FAILURES:
                    raise RuntimeError(
                        "building tree nr %d failed %d times in a row"
                        % (num_built, consecutive_failures)
                    ) from e
                continue
            # Save the classifier
            self.decision_trees.append(clf)
            print("built tree nr",num_built)
            num_built += 1
            consecutive_failures = 0

    def predict(self, X):
        '''
        Predicts class labels for new data instances.

        Arguments:
        X: np.array, instances to predict

        Returns:
        Predictions: list, majority-vote prediction for each instance

        Raises:
        RuntimeError: if the forest contains no trained trees
        '''
        if not self.decision_trees:
            raise RuntimeError("RandomForest.fit must be called before predict")
        # Make predictions with every tree in the forest: shape (n_trees, n_samples)
        votes = np.array([tree.predict(X) for tree in self.decision_trees])
        # Use majority voting for the final prediction; each row of votes.T
        # holds all trees' answers for one instance
        return [Counter(per_instance).most_common(1)[0][0] for per_instance in votes.T]


In [3]:
######## DATA LOADING AND CLEANING #############
# Load the raw reservations table
data = pd.read_csv('Hotel Reservations.csv')

# One hot encode the categorical columns
labels_to_encode = ['type_of_meal_plan', 'room_type_reserved',
                    'market_segment_type']
data_encode = pd.get_dummies(data, columns=labels_to_encode)

# Turn the target into integer category codes
data_encode['booking_status'] = data_encode['booking_status'].astype('category').cat.codes

# Keep only reservations that cover at least one night
nights_booked = data_encode['no_of_weekend_nights'] + data_encode["no_of_week_nights"]
data_encode = data_encode[nights_booked > 0]

# Feature matrix = everything except the identifier and the target
X = np.array(data_encode.drop(columns=['Booking_ID', 'booking_status']))
y = np.array(data_encode['booking_status'])

# Optional subsampling for quick experiments (disabled):
#sample_size=5000
#X=X[:sample_size,:]
#y=y[:sample_size]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)


# Feature engineered variant: same table plus four derived columns
data_feature_engineered = data_encode.assign(
    total_nights=data_encode["no_of_weekend_nights"] + data_encode["no_of_week_nights"],
    total_guests=data_encode["no_of_adults"] + data_encode["no_of_children"],
    total_bookings=data_encode["no_of_previous_cancellations"] + data_encode["no_of_previous_bookings_not_canceled"],
    total_cost=lambda frame: frame["total_nights"] * frame["avg_price_per_room"],
)

X_feature_engineered = np.array(data_feature_engineered.drop(columns=['Booking_ID', 'booking_status']))
y_feature_engineered = np.array(data_feature_engineered['booking_status'])

#X_feature_engineered=X_feature_engineered[:sample_size,:]
#y_feature_engineered=y_feature_engineered[:sample_size]

X_train_fe, X_test_fe, y_train_fe, y_test_fe = train_test_split(
    X_feature_engineered, y_feature_engineered, test_size=0.2, random_state=2
)

In [None]:
############ PCA ############
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, scale

# Standardise with statistics taken from the training split only
scaler = StandardScaler().fit(X_train)
X_train_s, X_test_s = scaler.transform(X_train), scaler.transform(X_test)

# PCA(0.95), fitted on the standardised training split and applied to both splits
pca = PCA(0.95).fit(X_train_s)
X_pca_train, X_pca_test = pca.transform(X_train_s), pca.transform(X_test_s)

In [None]:
############ TRAINING MODEL ON OUR IMPLEMENTATION ######
# Single decision tree with its default hyperparameters
model_dt = DecisionTree()
model_dt.fit(X_train, y_train)

# Predict on both splits, then report accuracy for each
preds_train, preds_test = model_dt.predict(X_train), model_dt.predict(X_test)

for label, truth, guess in (
    ("DT train accuracy:", y_train, preds_train),
    ("DT test accuracy:", y_test, preds_test),
):
    print(label, accuracy_score(truth, guess))

In [None]:
############ BASE RANDOM FOREST ######
# Timed section covers training, prediction and the accuracy printout
start = timeit.default_timer()
print("Training base random forest")
rf_settings = dict(num_trees=16, min_samples_split=4, max_depth=32)
model_rf = RandomForest(**rf_settings)
model_rf.fit(X_train, y_train)

preds_train, preds_test = model_rf.predict(X_train), model_rf.predict(X_test)
for label, truth, guess in (
    ("Base random forrest train accuracy:", y_train, preds_train),
    ("Base random forrest test accuracy:", y_test, preds_test),
):
    print(label, accuracy_score(truth, guess))
stop = timeit.default_timer()
print('Time: ', stop - start)

In [None]:
############ BASE RANDOM FOREST WITH PCA######
print("Training base random forest")
# Timed section covers training and prediction on the PCA-projected data
start = timeit.default_timer()
rf_settings = dict(num_trees=16, min_samples_split=4, max_depth=32)
model_rf = RandomForest(**rf_settings)
model_rf.fit(X_pca_train, y_train)

preds_train, preds_test = model_rf.predict(X_pca_train), model_rf.predict(X_pca_test)

stop = timeit.default_timer()
for label, truth, guess in (
    ("Base random forrest train accuracy:", y_train, preds_train),
    ("Base random forrest test accuracy:", y_test, preds_test),
):
    print(label, accuracy_score(truth, guess))
print('Time: ', stop - start)

In [4]:
############ FEATURE ENGINEERED RANDOM FOREST ######
# Timed section covers training and prediction on the feature engineered data
start = timeit.default_timer()
print("Training base random forrest for feature engineered data set")
rf_settings = dict(num_trees=16, min_samples_split=4, max_depth=32)
model_fe = RandomForest(**rf_settings)
model_fe.fit(X_train_fe, y_train_fe)

preds_train_fe, preds_test_fe = model_fe.predict(X_train_fe), model_fe.predict(X_test_fe)

stop = timeit.default_timer()

for label, truth, guess in (
    ("FE data train accuracy:", y_train_fe, preds_train_fe),
    ("FE data test accuracy:", y_test_fe, preds_test_fe),
):
    print(label, accuracy_score(truth, guess))
print('Time: ', stop - start)

Training base random forrest for feature engineered data set
beginning to build tree nr 0
built tree nr 0
beginning to build tree nr 1
built tree nr 1
beginning to build tree nr 2
built tree nr 2
beginning to build tree nr 3
built tree nr 3
beginning to build tree nr 4
built tree nr 4
beginning to build tree nr 5
built tree nr 5
beginning to build tree nr 6
built tree nr 6
beginning to build tree nr 7
built tree nr 7
beginning to build tree nr 8
built tree nr 8
beginning to build tree nr 9
built tree nr 9
beginning to build tree nr 10
built tree nr 10
beginning to build tree nr 11
built tree nr 11
beginning to build tree nr 12
built tree nr 12
beginning to build tree nr 13


In [None]:
############ FEATURE ENGINEERED RANDOM FOREST WITH RANDOMIZATION ######
print("Training randomized forrest for feature engineered data set")
# Same forest settings as the other cells, plus random feature subsets per split
start = timeit.default_timer()
rf_settings = dict(num_trees=16, min_samples_split=4, max_depth=32, randomized_features=True)
model_fe_rand = RandomForest(**rf_settings)
model_fe_rand.fit(X_train_fe, y_train_fe)

preds_train_rand, preds_test_rand = model_fe_rand.predict(X_train_fe), model_fe_rand.predict(X_test_fe)

stop = timeit.default_timer()
for label, truth, guess in (
    ("FE data train accuracy:", y_train_fe, preds_train_rand),
    ("FE data test accuracy:", y_test_fe, preds_test_rand),
):
    print(label, accuracy_score(truth, guess))
print('Time: ', stop - start)

In [None]:
#PCA random forest, timetest on the first 500 rows
from sklearn.preprocessing import scale 
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import timeit

# NOTE: this overwrites X, y and the train/test split defined by the data loading cell
X=X[0:500,]
y=y[0:500]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

#Do PCA
# Fit the scaler on THIS training split. Reusing the scaler from the earlier
# PCA cell would (a) fail with a NameError if that cell was not run and
# (b) standardise with statistics of a different train split, which was drawn
# from the full data set and may contain rows that are test rows here.
scaler=StandardScaler()
scaler.fit(X_train)
X_train_s=scaler.transform(X_train)
X_test_s=scaler.transform(X_test)
pca=PCA(0.95)
pca.fit(X_train_s)
X_pca_train = pca.transform(X_train_s)
X_pca_test = pca.transform(X_test_s)


#Random forest with PCA
# Timed section covers training, prediction on both splits and the printout
start = timeit.default_timer()
max_features=None  # not used by this cell
rf = RandomForest(num_trees=16,max_depth=32,min_samples_split=4)
rf.fit(X_pca_train, y_train)
rf_train_accuracy = accuracy_score(y_train, rf.predict(X_pca_train))
rf_test_accuracy = accuracy_score(y_test, rf.predict(X_pca_test))
print("train", rf_train_accuracy, "test",rf_test_accuracy)
stop = timeit.default_timer()
print('Time with PCA: ', stop - start) 

#Random forest without
# Same settings and timed section, on the untransformed features
start = timeit.default_timer()
rf = RandomForest(num_trees=16,max_depth=32,min_samples_split=4)
rf.fit(X_train, y_train)
rf_train_accuracy = accuracy_score(y_train, rf.predict(X_train))
rf_test_accuracy = accuracy_score(y_test, rf.predict(X_test))
print("train", rf_train_accuracy, "test",rf_test_accuracy)
stop = timeit.default_timer()
print('Time without PCA: ', stop - start)