In [109]:
##initialize the Node
from collections import Counter

import numpy as np

class Node:
    """One node of a decision tree.

    A decision (internal) node carries ``feature``, ``threshold``, ``gain``
    and its two children; a leaf node carries only ``value``.

    feature: int, index of the feature column used for the split
    threshold: split threshold for that feature
    data_left: np.array, rows (features + target) routed to the left child
    data_right: np.array, rows (features + target) routed to the right child
    gain: float, information gain of the split
    value: predicted class, set on leaf nodes only
    left: Node, child for rows with feature value <= threshold
    right: Node, child for rows with feature value > threshold
    """

    def __init__(self,
                 feature=None,
                 threshold=None,
                 data_left=None,
                 data_right=None,
                 gain=None,
                 value=None,
                 left=None,
                 right=None):
        # No trailing comma here: ``feature,`` would silently store a 1-tuple.
        self.feature = feature
        self.threshold = threshold
        self.data_left = data_left
        self.data_right = data_right
        self.gain = gain
        self.value = value
        # Child links; they stay None on leaf nodes.
        self.left = left
        self.right = right
        

# Decision tree classification
class DecisionTree:
    """Binary classification tree grown greedily by information gain.

    min_sample_split: int, minimum number of rows a node needs to be split
    max_depth: int, maximum depth of the tree (the root is at depth 0, so
        leaves are never deeper than ``max_depth``)

    Features and labels must be numeric: during the split search they are
    stacked into a single array, so labels take the dtype of that array
    (e.g. float when the features are float).
    """

    def __init__(self, min_sample_split=2, max_depth=5):
        self.min_sample_split = min_sample_split
        self.max_depth = max_depth
        self.root = None

    @staticmethod
    def _entropy(s):
        """
        :param s: list or np.array of class labels
        :return: float, Shannon entropy (base 2) of the label distribution;
            0.0 for an empty input
        """
        labels = np.asarray(s)
        if labels.size == 0:
            return 0.0

        # np.unique works for any label dtype (np.bincount only accepts
        # non-negative integers and rejects the float labels produced by
        # best_split).
        _, counts = np.unique(labels, return_counts=True)
        probabilities = counts / labels.size
        return float(-np.sum(probabilities * np.log2(probabilities)))

    def information_gain(self, parent, left_child, right_child):
        """
        parent: labels before the split
        left_child: labels routed to the left
        right_child: labels routed to the right
        return: float, entropy(parent) minus the size-weighted child entropies
        """
        n_parent = len(parent)
        if n_parent == 0:
            return 0.0

        weight_left = len(left_child) / n_parent
        weight_right = len(right_child) / n_parent

        children_entropy = (weight_left * self._entropy(left_child)
                            + weight_right * self._entropy(right_child))
        return self._entropy(parent) - children_entropy

    def best_split(self, X, y):
        """
        X: np.array, features (2-D)
        y: np.array or list, target
        returns dict with keys "feature_index", "threshold", "df_left",
            "df_right" and "gain"; an empty dict when no threshold separates
            the rows into two non-empty groups
        """
        X = np.asarray(X)
        y = np.asarray(y)

        best_split = {}
        best_info_gain = -1
        n_rows, n_cols = X.shape

        # Features with the target appended as the last column. This does not
        # depend on the candidate split, so it is built once.
        df = np.concatenate((X, y.reshape(-1, 1)), axis=1)
        labels = df[:, -1]

        for index in range(n_cols):
            X_curr = X[:, index]

            for threshold in np.unique(X_curr):
                df_left = df[X_curr <= threshold]
                df_right = df[X_curr > threshold]

                # A split is only useful when both sides receive rows.
                if len(df_left) > 0 and len(df_right) > 0:
                    gain = self.information_gain(
                        labels, df_left[:, -1], df_right[:, -1])

                    if gain > best_info_gain:
                        best_split = {
                            "feature_index": index,
                            "threshold": threshold,
                            "df_left": df_left,
                            "df_right": df_right,
                            "gain": gain
                        }
                        best_info_gain = gain

        return best_split

    def build(self, X, y, depth=0):
        """
        X: np.array, features
        y: np.array or list, target
        depth: int, current depth of tree
        return: Node, root of the (sub)tree built from X and y
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n_rows, n_cols = X.shape

        # Try to split only while the node is large enough and splitting keeps
        # the children within max_depth.
        if n_rows >= self.min_sample_split and depth < self.max_depth:

            best = self.best_split(X, y)

            # best is empty when no valid split exists (e.g. constant features).
            if best and best["gain"] > 0:

                left = self.build(
                    X=best["df_left"][:, :-1],
                    y=best["df_left"][:, -1],
                    depth=depth + 1
                )

                right = self.build(
                    X=best["df_right"][:, :-1],
                    y=best["df_right"][:, -1],
                    depth=depth + 1
                )

                node = Node(
                    feature=best["feature_index"],
                    threshold=best["threshold"],
                    data_left=best["df_left"],
                    data_right=best["df_right"],
                    gain=best["gain"]
                )
                # Attach the subtrees so predict_single can walk the tree.
                node.left = left
                node.right = right
                return node

        # Leaf node: predict the most common class among the rows that got here.
        return Node(
            value=Counter(y.tolist()).most_common(1)[0][0]
        )

    def fit(self, X, y):
        """
        X: np.array, features (2-D)
        y: np.array or list, target
        return: None
        raises ValueError: when X is not 2-D, the data is empty, or X and y
            have different lengths
        """
        X = np.asarray(X)
        y = np.asarray(y)

        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_rows, n_cols)")
        if X.shape[0] == 0:
            raise ValueError("cannot fit a tree on an empty data set")
        if X.shape[0] != y.shape[0]:
            raise ValueError("X and y must have the same number of rows")

        self.root = self.build(X, y)

    def predict_single(self, x, tree):
        """
        x: single observation
        tree: Node, root of the (sub)tree used for the prediction
        return: predicted class
        """
        # Only leaf nodes carry a value. Identity comparison keeps a falsy
        # label such as 0 a valid prediction.
        if tree.value is not None:
            return tree.value

        feature_value = x[tree.feature]

        # Explore left on <= threshold, right otherwise.
        if feature_value <= tree.threshold:
            return self.predict_single(x=x, tree=tree.left)

        return self.predict_single(x=x, tree=tree.right)

    def predict(self, x):
        """
        x: np.array, features (one row per observation)
        return: list, predicted class for every row
        raises ValueError: when called before fit
        """
        if self.root is None:
            raise ValueError("the tree is not fitted yet; call fit first")

        return [self.predict_single(row, self.root) for row in x]
    
    

In [120]:
class RegressionTree:
    """Regression tree grown by minimising the mean squared error (MSE).

    Every instance is one node of the tree. ``build`` looks for the split of
    this node's data with the lowest MSE and, when one improves on the node's
    own MSE, creates the ``left``/``right`` child nodes and builds them
    recursively.

    y: list or np.array, numeric target
    X: pandas DataFrame, features (one row per element of y)
    min_sample_split: int, minimum number of rows a node needs to be split
    max_depth: int, maximum depth of the tree
    depth: int, depth of this node (0 for the root)
    features: list of column names of X to split on; all columns when None
    """

    def __init__(self, y, X, min_sample_split=2, max_depth=5, depth=0,
                 features=None):
        self.y = y
        self.X = X
        self.min_sample_split = min_sample_split
        self.max_depth = max_depth
        self.depth = depth
        self.features = list(X.columns) if features is None else list(features)
        self.n = len(y)

        # Node statistics. The mean of y is this node's prediction.
        target = np.asarray(y, dtype=float)
        if self.n > 0:
            self.ymean = float(np.mean(target))
            self.mse_base = self.get_mse(target, self.ymean)
        else:
            # Empty node: nothing to predict, and NaN never beats any split.
            self.ymean = float("nan")
            self.mse_base = float("nan")

        # Split description and children; they stay None on leaf nodes.
        self.best_feature = None
        self.best_value = None
        self.left = None
        self.right = None

    @staticmethod
    def get_mse(ytrue, yhat):
        """
        ytrue: list or np.array, observed values
        yhat: list, np.array or scalar, predicted values
        return: float, mean squared error
        raises ZeroDivisionError: when ytrue is empty
        """
        n = len(ytrue)

        residual = np.asarray(ytrue, dtype=float) - np.asarray(yhat, dtype=float)

        return float(np.sum(residual ** 2)) / n

    @staticmethod
    def ma(x, window):
        """
        x: np.array, values
        window: int, size of the moving window
        return: np.array, moving average of x; empty when x is shorter than
            the window
        """
        x = np.asarray(x, dtype=float)
        # np.convolve in "valid" mode swaps its arguments when the kernel is
        # longer than x and would return values that are not averages of x.
        if len(x) < window:
            return np.array([], dtype=float)
        return np.convolve(x, np.ones(window), "valid") / window

    def _frame(self):
        """Return a copy of X with the target added as column "Y"."""
        df = self.X.copy()
        # Positional assignment, so y does not need to share the index of X.
        df["Y"] = np.asarray(self.y, dtype=float)
        return df

    def split(self):
        """
        return: tuple (best_feature, best_value) of the split with the lowest
            MSE, or (None, None) when no split improves on this node's MSE
        """
        # Rows with missing values are ignored while searching for a split.
        df = self._frame().dropna()
        target = df["Y"].to_numpy()

        mse_base = self.mse_base
        best_feature = None
        best_value = None

        for feature in self.features:
            column = df[feature].to_numpy()

            # Candidate thresholds: midpoints between consecutive sorted
            # unique values of the feature.
            x_mean = self.ma(np.unique(column), 2)

            for value in x_mean:
                # Same <= / > rule as build, so the selected split reproduces
                # exactly the partition that was scored here.
                goes_left = column <= value
                left_y = target[goes_left]
                right_y = target[~goes_left]

                if left_y.size == 0 or right_y.size == 0:
                    continue

                rss_left = np.sum((left_y - np.mean(left_y)) ** 2)
                rss_right = np.sum((right_y - np.mean(right_y)) ** 2)
                mse_split = (rss_left + rss_right) / target.size

                if mse_split < mse_base:
                    mse_base = mse_split
                    best_feature = feature
                    best_value = value

        return (best_feature, best_value)

    def build(self):
        """
        Grow the tree below this node.
        return: None
        """
        if self.depth >= self.max_depth or self.n < self.min_sample_split:
            return

        best_feature, best_value = self.split()
        if best_feature is None:
            return

        df = self._frame()
        left_df = df[df[best_feature] <= best_value]
        right_df = df[df[best_feature] > best_value]

        # Safety net: never create a child without rows.
        if left_df.empty or right_df.empty:
            return

        self.best_feature = best_feature
        self.best_value = best_value

        self.left = RegressionTree(
            left_df["Y"].values.tolist(),
            left_df[self.features],
            min_sample_split=self.min_sample_split,
            max_depth=self.max_depth,
            depth=self.depth + 1,
            features=self.features,
        )
        self.left.build()

        self.right = RegressionTree(
            right_df["Y"].values.tolist(),
            right_df[self.features],
            min_sample_split=self.min_sample_split,
            max_depth=self.max_depth,
            depth=self.depth + 1,
            features=self.features,
        )
        self.right.build()

    def predict(self, X):
        """
        X: pandas DataFrame containing the feature columns
        return: np.array, predicted value for every row (the target mean of
            the leaf the row falls into)
        """
        return np.array([self._predict_row(row) for _, row in X.iterrows()])

    def _predict_row(self, row):
        """Walk from this node down to a leaf and return that leaf's mean."""
        node = self
        while node.left is not None and node.right is not None:
            if row[node.best_feature] <= node.best_value:
                node = node.left
            else:
                node = node.right
        return node.ymean
        

In [105]:
class RandomForest:
    """Random forest classifier: a majority vote over bootstrapped decision trees.

    num_trees: int, number of trees in the forest
    min_sample_split: int, minimum number of rows a tree node needs to be split
    max_depth: int, maximum depth of every tree
    random_state: int or None, seed for the bootstrap sampling; None keeps
        using NumPy's global random state
    """

    def __init__(self, num_trees=25, min_sample_split=2, max_depth=5,
                 random_state=None):
        self.num_trees = num_trees
        self.min_sample_split = min_sample_split
        self.max_depth = max_depth
        self.random_state = random_state
        # Fitted DecisionTree instances, one per tree of the forest.
        self.decision = []

    @staticmethod
    def _sample(X, y, rng=None):
        """
        X: np.array, features
        y: np.array, target
        rng: np.random.Generator or None; None draws from NumPy's global
            random state
        return tuple (sample of features, sample of target), a bootstrap
            sample with as many rows as X, drawn with replacement
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n_rows = X.shape[0]

        if rng is None:
            sample = np.random.choice(a=n_rows, size=n_rows, replace=True)
        else:
            sample = rng.choice(n_rows, size=n_rows, replace=True)

        # The same row indices are used for X and y to keep them aligned.
        return X[sample], y[sample]

    def fit(self, X, y):
        """
        X: np.array, features
        y: np.array, targets
        return: None
        """
        rng = None
        if self.random_state is not None:
            rng = np.random.default_rng(self.random_state)

        # Errors are deliberately not swallowed: retrying after a
        # deterministic failure would loop forever.
        trees = []
        for _ in range(self.num_trees):
            clf = DecisionTree(
                min_sample_split=self.min_sample_split,
                max_depth=self.max_depth
            )

            _X, _y = self._sample(X, y, rng)
            clf.fit(_X, _y)
            trees.append(clf)

        # Replace any previously fitted forest only once the new one is complete.
        self.decision = trees

    def predict(self, X):
        """
        X: np.array, features (one row per observation)
        return: list, majority-vote class for every row of X
        raises ValueError: when called before fit
        """
        if not self.decision:
            raise ValueError("the forest is not fitted yet; call fit first")

        # Shape (num_trees, n_rows): one row of predictions per tree.
        votes = np.array([tree.predict(X) for tree in self.decision])

        # Transpose so each entry holds all tree votes for one observation.
        return [
            Counter(sample_votes.tolist()).most_common(1)[0][0]
            for sample_votes in votes.T
        ]
        
            
        
        
        

In [None]:
import numpy as np

# Decision stump used as weak classifier
class DecisionStump():
    """One-level decision tree that thresholds a single feature.

    polarity: 1 labels rows below the threshold as -1 and the rest as +1;
        -1 is the exact negation of that rule
    feature_idx: int, column of X the stump looks at
    threshold: value the feature is compared against
    alpha: float, weight of this stump in the boosted ensemble
    """

    def __init__(self):
        self.polarity = 1
        self.feature_idx = None
        self.threshold = None
        self.alpha = None

    def predict(self, X):
        """
        X: np.array, features (2-D)
        return: np.array of +1.0 / -1.0, one prediction per row of X
        raises ValueError: when feature_idx or threshold has not been set
        """
        if self.feature_idx is None or self.threshold is None:
            raise ValueError("feature_idx and threshold must be set before predict")

        n_samples = X.shape[0]
        X_column = X[:, self.feature_idx]
        predictions = np.ones(n_samples)
        if self.polarity == 1:
            predictions[X_column < self.threshold] = -1
        else:
            # >= (not >) makes this the exact negation of the polarity-1 rule,
            # including rows equal to the threshold. Adaboost.fit relies on
            # that when it scores the flipped stump as 1 - error.
            predictions[X_column >= self.threshold] = -1

        return predictions
    
class Adaboost():
    """AdaBoost binary classifier built from decision stumps.

    n_clf: int, number of stumps (boosting rounds)

    Labels must be encoded as -1 / +1.
    """

    def __init__(self, n_clf=5):
        self.n_clf = n_clf
        # Fitted stumps; empty until fit is called.
        self.clfs = []

    def fit(self, X, y):
        """
        X: np.array, features (2-D)
        y: np.array or list, labels in {-1, +1}
        return: None
        raises ValueError: when the data is empty, X and y differ in length,
            or y contains labels other than -1 / +1
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples, n_features = X.shape

        if n_samples == 0:
            raise ValueError("cannot fit on an empty data set")
        if y.shape != (n_samples,):
            raise ValueError("y must be 1-D with one label per row of X")
        # The weight update below multiplies y by the prediction, which is
        # only meaningful for -1 / +1 labels.
        if not np.isin(y, (-1, 1)).all():
            raise ValueError("labels must be encoded as -1 / +1")

        # Small constant that keeps alpha finite for a perfect stump.
        EPS = 1e-10

        # Initialize weights to 1/N
        w = np.full(n_samples, (1 / n_samples))

        self.clfs = []
        # Iterate through classifiers
        for _ in range(self.n_clf):
            clf = DecisionStump()

            min_error = float('inf')
            # greedy search to find best threshold and feature
            for feature_i in range(n_features):
                X_column = X[:, feature_i]
                thresholds = np.unique(X_column)

                for threshold in thresholds:
                    # predict with polarity 1
                    p = 1
                    predictions = np.ones(n_samples)
                    predictions[X_column < threshold] = -1

                    # Error = sum of weights of misclassified samples
                    error = np.sum(w[y != predictions])

                    # A stump that is wrong more than half of the time is
                    # better used with its predictions flipped.
                    if error > 0.5:
                        error = 1 - error
                        p = -1

                    # store the best configuration
                    if error < min_error:
                        clf.polarity = p
                        clf.threshold = threshold
                        clf.feature_idx = feature_i
                        min_error = error

            # calculate alpha
            clf.alpha = 0.5 * np.log((1.0 - min_error + EPS) / (min_error + EPS))

            # calculate predictions and update weights: misclassified samples
            # gain weight, correctly classified ones lose weight
            predictions = clf.predict(X)
            w *= np.exp(-clf.alpha * y * predictions)

            # Normalize to one
            w /= np.sum(w)

            # Save classifier
            self.clfs.append(clf)

    def predict(self, X):
        """
        X: np.array, features (2-D)
        return: np.array, sign of the alpha-weighted sum of the stump
            predictions for every row of X
        raises ValueError: when called before fit
        """
        if not self.clfs:
            raise ValueError("the model is not fitted yet; call fit first")

        clf_preds = [clf.alpha * clf.predict(X) for clf in self.clfs]
        y_pred = np.sum(clf_preds, axis=0)
        y_pred = np.sign(y_pred)

        return y_pred
                        
                        
                        

In [None]:
class AdaBoost:
    """Unfinished AdaBoost draft (work in progress).

    The intended algorithm: start from uniform sample weights 1/N, then for
    each of the ``n_estimators`` rounds fit a depth-1 tree on the weighted
    samples, compute its weighted error and re-weight the samples.

    That needs a weak learner that accepts per-sample weights.
    ``DecisionTree.fit`` takes no ``sample_weight`` argument and returns None,
    so ``fit`` below fails explicitly instead of crashing half-way.

    n_estimators: int, number of boosting rounds
    """

    def __init__(self, n_estimators=50):
        self.n_estimators = n_estimators
        # One slot per boosting round, to be filled by fit.
        self.model = [None] * n_estimators

    def fit(self, X, y):
        """
        X: np.array, features
        y: np.array, target
        raises NotImplementedError: always; the boosting loop is not written yet
        """
        # TODO: implement the boosting rounds once a weak learner supporting
        # sample weights is available.
        raise NotImplementedError(
            "AdaBoost.fit is an unfinished draft: DecisionTree.fit does not "
            "support sample_weight"
        )
        
        

In [None]:
# Reference: https://towardsdatascience.com/master-machine-learning-random-forest-from-scratch-with-python-3efdd51b6d7a
    
    
    