In [69]:
##initialize the Node
from collections import Counter

import numpy as np

class Node:
    """One node of a decision tree.

    A decision node is created with ``feature``, ``threshold``, ``data_left``,
    ``data_right`` and ``gain``; a leaf is created with ``value`` only. Every
    argument is stored unchanged on the attribute of the same name, and every
    attribute defaults to ``None``.

    :param feature: index of the feature this node splits on
    :param threshold: split value compared against the feature
    :param data_left: object kept for the left branch
    :param data_right: object kept for the right branch
    :param gain: information gain achieved by the split
    :param value: prediction held by a leaf
    """

    def __init__(self,
                 feature=None,
                 threshold=None,
                 data_left=None,
                 data_right=None,
                 gain=None,
                 value=None):
        # No trailing comma: ``feature,`` would wrap the index in a 1-tuple.
        self.feature = feature
        self.threshold = threshold
        self.data_left = data_left
        self.data_right = data_right
        self.gain = gain
        self.value = value
        
        
class DecisionTree:
    """Binary decision-tree classifier that splits on information gain.

    :param min_sample_split: fewest rows a node needs before a split is tried
    :param max_depth: a split is tried only while ``depth <= max_depth``
                      (the root is depth 0)
    """

    def __init__(self, min_sample_split=2, max_depth=5):
        self.min_sample_split = min_sample_split
        self.max_depth = max_depth
        self.root = None

    @staticmethod
    def _entropy(s):
        """
        Shannon entropy (base 2) of a collection of class labels.

        :param s: list or np.array of labels
        :return: float, entropy value (0.0 for an empty collection)
        """
        labels = np.asarray(s)
        if labels.size == 0:
            return 0.0

        # np.unique counts any label dtype; np.bincount would reject the float
        # labels that come back out of the concatenated feature/target matrix.
        _, counts = np.unique(labels, return_counts=True)
        shares = counts / labels.size

        # "+ 0.0" turns the -0.0 produced by a pure node into a plain 0.0.
        return float(-np.sum(shares * np.log2(shares))) + 0.0

    def information_gain(self, parent, left_child, right_child):
        """
        Entropy of the parent minus the size-weighted entropy of its children.

        :param parent: labels before the split
        :param left_child: labels sent to the left branch
        :param right_child: labels sent to the right branch
        :return: float, information gain
        """
        weight_left = len(left_child) / len(parent)
        weight_right = len(right_child) / len(parent)

        children_entropy = (weight_left * self._entropy(left_child)
                            + weight_right * self._entropy(right_child))
        return self._entropy(parent) - children_entropy

    def best_split(self, X, y):
        """
        Try every unique value of every feature as a threshold and keep the
        split with the highest information gain.

        X: np.array, features
        y: np.array or list
        returns dict with keys "feature_index", "threshold", "df_left",
                "df_right" and "gain"; an empty dict when no threshold leaves
                rows on both sides
        """
        best_split = {}
        best_info_gain = -1
        X = np.asarray(X)
        y = np.asarray(y)
        n_rows, n_cols = X.shape

        # Append the target as the last column (reshape(-1, 1) makes it a
        # column). Built once: it does not depend on the feature or threshold.
        df = np.concatenate((X, y.reshape(-1, 1)), axis=1)

        for index in range(n_cols):
            column = X[:, index]

            for threshold in np.unique(column):
                left_mask = column <= threshold
                right_mask = column > threshold

                # A threshold that leaves one side empty is not a split.
                if not left_mask.any() or not right_mask.any():
                    continue

                gain = self.information_gain(y, y[left_mask], y[right_mask])

                if gain > best_info_gain:
                    best_split = {
                        "feature_index": index,
                        "threshold": threshold,
                        "df_left": df[left_mask],
                        "df_right": df[right_mask],
                        "gain": gain,
                    }
                    best_info_gain = gain

        return best_split

    def build(self, X, y, depth=0):
        """
        Recursively grow the tree.

        X: np.array, features
        y: np.array, list or target
        depth: int, current depth of tree
        return: Node
        """
        n_rows, n_cols = X.shape

        # Try to split only while the node is large and shallow enough.
        if n_rows >= self.min_sample_split and depth <= self.max_depth:

            best = self.best_split(X, y)

            # best is {} when no valid split exists, hence .get().
            if best.get("gain", 0) > 0:

                left = self.build(
                    X=best["df_left"][:, :-1],
                    y=best["df_left"][:, -1],
                    depth=depth + 1,
                )

                right = self.build(
                    X=best["df_right"][:, :-1],
                    y=best["df_right"][:, -1],
                    depth=depth + 1,
                )

                # Keep the child subtrees on the node so prediction can walk
                # the tree.
                return Node(
                    feature=best["feature_index"],
                    threshold=best["threshold"],
                    data_left=left,
                    data_right=right,
                    gain=best["gain"],
                )

        # Leaf: predict the most frequent label among the rows that got here.
        return Node(
            value=Counter(y).most_common(1)[0][0]
        )

    def fit(self, X, y):
        """
        Build the tree from training data and store it in ``self.root``.

        X: np.array, features (2-D)
        y: np.array, list, target
        return: None
        raises ValueError: X is not 2-D, is empty, or its length differs from y
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_rows, n_cols)")
        if len(X) == 0 or len(X) != len(y):
            raise ValueError("X and y must be non-empty and of equal length")

        self.root = self.build(X, y)

    def predict_single(self, x, tree):
        """
        Walk the tree for one observation.

        x: single observation
        tree: built tree
        return: float, predicted class
        """
        if tree.value is not None:
            return tree.value

        feature_value = x[tree.feature]

        # explore left
        if feature_value <= tree.threshold:
            return self.predict_single(x=x, tree=tree.data_left)

        # Everything else goes right, so a prediction is always returned.
        return self.predict_single(x=x, tree=tree.data_right)

    def predict(self, x):
        """
        X: np.array, features
        return list: predicted class for each row
        """
        return [self.predict_single(row, self.root) for row in np.asarray(x)]
    
    

In [None]:
class RandomForest:
    """Bagged ensemble of ``DecisionTree`` classifiers with majority voting.

    :param num_trees: number of trees to train
    :param min_sample_split: passed to every ``DecisionTree``
    :param max_depth: passed to every ``DecisionTree``
    """

    # Give up after this many failed tree fits in a row instead of retrying
    # forever on an error that will never go away.
    _MAX_CONSECUTIVE_FAILURES = 10

    def __init__(self, num_trees=25, min_sample_split=2, max_depth=5):
        self.num_trees = num_trees
        self.min_sample_split = min_sample_split
        self.max_depth = max_depth
        self.decision = []

    @staticmethod
    def _sample(X, y):
        """
        Draw a bootstrap sample: as many rows as X has, with replacement.

        X: np.array, features
        y: np.array, target
        return tuple (sample of features, sample of target)
        """
        n_rows, n_cols = X.shape

        sample = np.random.choice(a=n_rows, size=n_rows, replace=True)
        return X[sample], y[sample]

    @staticmethod
    def _majority_vote(per_tree):
        """
        per_tree: one sequence of predictions per tree, all of equal length
        return: list holding the most common prediction for each observation
        """
        # zip(*...) regroups the per-tree predictions by observation.
        return [Counter(votes).most_common(1)[0][0] for votes in zip(*per_tree)]

    def fit(self, X, y):
        """
        Train ``num_trees`` trees, each on its own bootstrap sample.

        X:np.array, features
        y:np.array, targets
        return: None
        raises RuntimeError: tree fitting failed too many times in a row
        """
        X = np.asarray(X)
        y = np.asarray(y)

        # reset
        self.decision = []

        consecutive_failures = 0
        while len(self.decision) < self.num_trees:
            clf = DecisionTree(
                min_sample_split=self.min_sample_split,
                max_depth=self.max_depth,
            )
            _X, _y = self._sample(X, y)

            try:
                clf.fit(_X, _y)
            except Exception as e:
                # One bad bootstrap sample is retried; a persistent failure is
                # reported with its cause.
                consecutive_failures += 1
                if consecutive_failures >= self._MAX_CONSECUTIVE_FAILURES:
                    raise RuntimeError(
                        "giving up after %d consecutive failed tree fits"
                        % consecutive_failures
                    ) from e
                continue

            consecutive_failures = 0
            self.decision.append(clf)

    def predict(self, X):
        """
        param X: np.array
        return: np.array, majority-vote prediction for each row of X
        raises RuntimeError: called before any tree has been fitted
        """
        if not self.decision:
            raise RuntimeError("predict() called before fit()")

        per_tree = [tree.predict(X) for tree in self.decision]
        return np.array(self._majority_vote(per_tree))
                
        
        

In [74]:
# Scratch check of np.random.choice: five draws from range(10), repeats allowed.
np.random.choice(10, 5, replace=True)


array([9, 4, 8, 4, 1])

In [36]:
import pandas as pd
import numpy as np

# Read the cleaned Ames housing CSV, expand it with pd.get_dummies, and keep
# the result as a plain NumPy array under the name ``df``.
df = np.array(
    pd.get_dummies(
        pd.read_csv("../communal/Ames_Housing_Price_Data_cleaned_3.csv")
    )
)

In [64]:
# Pair column 1 of ``df`` with itself as a two-column array. reshape(-1, 1)
# lets NumPy infer the row count instead of hard-coding 2624.
curr = np.concatenate((df[:, 1].reshape(-1, 1), df[:, 1].reshape(-1, 1)), axis=1)

In [76]:
# Scratch check of np.swapaxes on a 1-D array.
# NOTE(review): the array only has axis 0, so axis2=1 looks out of range —
# confirm this cell still runs on the installed NumPy.
np.swapaxes(np.array([1,2,3]), axis1= 0, axis2 = 1) 

array([1, 2, 3])

In [66]:
# Show ``curr`` as a list of its rows.
list(curr)

[array([856., 856.]),
 array([1049., 1049.]),
 array([1049., 1049.]),
 array([1001., 1001.]),
 array([1039., 1039.]),
 array([1665., 1665.]),
 array([1922., 1922.]),
 array([936., 936.]),
 array([1246., 1246.]),
 array([889., 889.]),
 array([1072., 1072.]),
 array([1342., 1342.]),
 array([1274., 1274.]),
 array([861., 861.]),
 array([1394., 1394.]),
 array([1536., 1536.]),
 array([1680., 1680.]),
 array([1274., 1274.]),
 array([864., 864.]),
 array([1610., 1610.]),
 array([1091., 1091.]),
 array([1486., 1486.]),
 array([789., 789.]),
 array([1092., 1092.]),
 array([2640., 2640.]),
 array([1092., 1092.]),
 array([1458., 1458.]),
 array([1466., 1466.]),
 array([1573., 1573.]),
 array([2090., 2090.]),
 array([2046., 2046.]),
 array([886., 886.]),
 array([1258., 1258.]),
 array([1324., 1324.]),
 array([1848., 1848.]),
 array([1269., 1269.]),
 array([1346., 1346.]),
 array([1749., 1749.]),
 array([492., 492.]),
 array([492., 492.]),
 array([2030., 2030.]),
 array([988., 988.]),
 array([988.

In [11]:

# Scratch check of the class-share computation: np.bincount counts each
# non-negative integer label, and dividing by len(s) turns counts into shares.
s = [1, 2, 3, 4] * 2

percentage = np.bincount(np.asarray(s)) / len(s)
percentage

array([0.  , 0.25, 0.25, 0.25, 0.25])

In [None]:
# NOTE(review): unfinished draft cell — the body stops at a bare ``self.`` so
# it does not parse. It reuses the name ``DecisionTree`` of the class defined
# earlier in this file.
class DecisionTree: 
    def __init__(self, list_of_nodes):
        self.

In [None]:
# NOTE(review): unfinished draft cell — ``load_csv`` only names the undefined
# ``datab`` and ``get_split`` has no body, so the class does not parse.
# Neither of those two methods takes ``self``.
class random_forest:
    
    def __init__(self, df):
        self.df = df
        
    def load_csv(filename):
        datab
        
    def get_split(df):
        

In [None]:
# Select the best split point for a dataset
def get_split(dataset, n_features):
	"""Pick the lowest-Gini split among ``n_features`` randomly drawn columns.

	Column indices are drawn with ``randrange`` (the last column holds the
	class label and is never drawn) until ``n_features`` distinct ones are
	collected. Every value found in those columns is tried as a split point
	with ``test_split`` and scored with ``gini_index``.

	Returns a dict with keys ``'index'``, ``'value'`` and ``'groups'``. When no
	candidate scores below the initial 999, the placeholders 999, 999 and None
	are returned.
	"""
	labels = list({row[-1] for row in dataset})
	# (column, split value, gini score, groups) of the best candidate so far
	best = (999, 999, 999, None)
	chosen = []
	while len(chosen) < n_features:
		candidate = randrange(len(dataset[0]) - 1)
		if candidate not in chosen:
			chosen.append(candidate)
	for column in chosen:
		for row in dataset:
			groups = test_split(column, row[column], dataset)
			score = gini_index(groups, labels)
			if score < best[2]:
				best = (column, row[column], score, groups)
	return {'index': best[0], 'value': best[1], 'groups': best[3]}

In [None]:
# Random Forest Algorithm on Sonar Dataset
from random import seed
from random import randrange
from csv import reader
from math import sqrt
 
# Load a CSV file
def load_csv(filename):
	"""Read a CSV file into a list of rows, skipping empty rows.

	:param filename: path of the CSV file
	:return: list of rows, each a list of the field strings as parsed by
		``csv.reader``
	"""
	# newline='' is what the csv module asks for: the reader then handles line
	# endings itself, so a newline inside a quoted field is kept unchanged.
	with open(filename, 'r', newline='') as file:
		return [row for row in reader(file) if row]
 
# Convert string column to float
def str_column_to_float(dataset, column):
	"""Replace, in place, the string at ``column`` of every row with its float value.

	Surrounding whitespace is stripped before conversion. Returns None.
	"""
	for record in dataset:
		text = record[column]
		record[column] = float(text.strip())
 
# Convert string column to integer
def str_column_to_int(dataset, column):
	"""Replace, in place, each value at ``column`` with an integer code.

	Codes are handed out in order of first appearance (0, 1, 2, ...), so the
	mapping is the same on every run; iterating a ``set`` of strings would not
	guarantee that.

	:param dataset: list of rows (mutated in place)
	:param column: index of the column to encode
	:return: dict mapping each original value to its integer code
	"""
	lookup = dict()
	for row in dataset:
		# setdefault only stores the next free code the first time a value is seen.
		row[column] = lookup.setdefault(row[column], len(lookup))
	return lookup
 
# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
	"""Deal the rows of ``dataset`` into ``n_folds`` random folds of equal size.

	Each fold gets ``int(len(dataset) / n_folds)`` rows drawn without
	replacement using ``randrange``; leftover rows are not assigned to any
	fold. ``dataset`` itself is not modified. Returns the list of folds.
	"""
	pool = list(dataset)
	fold_size = int(len(dataset) / n_folds)
	folds = []
	for _ in range(n_folds):
		# Each pop removes the drawn row from the pool, so no row repeats.
		folds.append([pool.pop(randrange(len(pool))) for _ in range(fold_size)])
	return folds
 
# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
	"""Return the percentage (0-100) of positions where ``predicted`` equals ``actual``."""
	hits = sum(1 for i, truth in enumerate(actual) if truth == predicted[i])
	return hits / float(len(actual)) * 100.0
 
# Evaluate an algorithm using a cross validation split
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
	"""Score ``algorithm`` with k-fold cross-validation.

	For every fold, the other folds are concatenated into the training set and
	the fold's rows are copied with their last element set to None to form the
	test set. ``algorithm(train_set, test_set, *args)`` must return one
	prediction per test row. Returns the list of per-fold accuracy percentages.
	"""
	folds = cross_validation_split(dataset, n_folds)
	scores = []
	for held_out in folds:
		remaining = list(folds)
		remaining.remove(held_out)
		train_set = sum(remaining, [])
		test_set = []
		for row in held_out:
			# Hide the label from the algorithm on a copy of the row.
			masked = list(row)
			masked[-1] = None
			test_set.append(masked)
		predicted = algorithm(train_set, test_set, *args)
		actual = [row[-1] for row in held_out]
		scores.append(accuracy_metric(actual, predicted))
	return scores
 
# Split a dataset based on an attribute and an attribute value
def test_split(index, value, dataset):
	left, right = list(), list()
	for row in dataset:
		if row[index] < value:
			left.append(row)
		else:
			right.append(row)
	return left, right
 
# Calculate the Gini index for a split dataset
def gini_index(groups, classes):
	"""Return the size-weighted Gini impurity of ``groups``.

	Each group is a list of rows whose last element is the class label. Empty
	groups contribute nothing.
	"""
	total = float(sum(len(group) for group in groups))
	gini = 0.0
	for group in groups:
		size = float(len(group))
		if size:
			purity = 0.0
			for class_val in classes:
				share = [row[-1] for row in group].count(class_val) / size
				purity += share * share
			# Weight the group's impurity by its share of all rows.
			gini += (1.0 - purity) * (size / total)
	return gini
 
# Select the best split point for a dataset
def get_split(dataset, n_features):
	"""Pick the lowest-Gini split among ``n_features`` randomly drawn columns.

	Column indices are drawn with ``randrange`` (the last column holds the
	class label and is never drawn) until ``n_features`` distinct ones are
	collected. Every value found in those columns is tried as a split point
	with ``test_split`` and scored with ``gini_index``.

	Returns a dict with keys ``'index'``, ``'value'`` and ``'groups'``. When no
	candidate scores below the initial 999, the placeholders 999, 999 and None
	are returned.
	"""
	labels = list({row[-1] for row in dataset})
	# (column, split value, gini score, groups) of the best candidate so far
	best = (999, 999, 999, None)
	chosen = []
	while len(chosen) < n_features:
		candidate = randrange(len(dataset[0]) - 1)
		if candidate not in chosen:
			chosen.append(candidate)
	for column in chosen:
		for row in dataset:
			groups = test_split(column, row[column], dataset)
			score = gini_index(groups, labels)
			if score < best[2]:
				best = (column, row[column], score, groups)
	return {'index': best[0], 'value': best[1], 'groups': best[3]}
 
# Create a terminal node value
def to_terminal(group):
	"""Return the most frequent class label (last element) among the rows of ``group``."""
	labels = [row[-1] for row in group]
	candidates = set(labels)
	return max(candidates, key=labels.count)
 
# Create child splits for a node or make terminal
def split(node, max_depth, min_size, n_features, depth):
	"""Grow ``node`` in place into a subtree, or turn its sides into leaves.

	``node['groups']`` is consumed and removed. A side becomes a leaf (a class
	label from ``to_terminal``) when either group is empty, when ``depth`` has
	reached ``max_depth``, or when the side holds at most ``min_size`` rows.
	Otherwise the side becomes a new split from ``get_split`` and is grown
	recursively at ``depth + 1``. Returns None.
	"""
	left, right = node['groups']
	del node['groups']
	# One side is empty: nothing was really split, so both sides share one leaf.
	if not (left and right):
		node['left'] = node['right'] = to_terminal(left + right)
		return
	# Depth limit reached: close both sides as leaves.
	if depth >= max_depth:
		node['left'], node['right'] = to_terminal(left), to_terminal(right)
		return
	# Left is fully grown before right, as the random draws are order-sensitive.
	for side, rows in (('left', left), ('right', right)):
		if len(rows) <= min_size:
			node[side] = to_terminal(rows)
			continue
		node[side] = get_split(rows, n_features)
		split(node[side], max_depth, min_size, n_features, depth + 1)
 
# Build a decision tree
def build_tree(train, max_depth, min_size, n_features):
	"""Build a decision tree from ``train`` and return its root node (a dict).

	The root split comes from ``get_split``; ``split`` then grows it in place
	starting at depth 1.
	"""
	tree = get_split(train, n_features)
	split(tree, max_depth, min_size, n_features, depth=1)
	return tree
 
# Make a prediction with a decision tree
def predict(node, row):
	"""Return the class label the tree rooted at ``node`` assigns to ``row``.

	At each node the row goes left when ``row[node['index']] < node['value']``
	and right otherwise; descent stops at the first child that is not a dict.
	"""
	while True:
		side = 'left' if row[node['index']] < node['value'] else 'right'
		child = node[side]
		if not isinstance(child, dict):
			return child
		node = child
 
# Create a random subsample from the dataset with replacement
def subsample(dataset, ratio):
	"""Return ``round(len(dataset) * ratio)`` rows drawn from ``dataset`` with replacement."""
	n_sample = round(len(dataset) * ratio)
	return [dataset[randrange(len(dataset))] for _ in range(n_sample)]
 
# Make a prediction with a list of bagged trees
def bagging_predict(trees, row):
	"""Return the label predicted for ``row`` by the most trees in ``trees``."""
	votes = []
	for tree in trees:
		votes.append(predict(tree, row))
	return max(set(votes), key=votes.count)
 
# Random Forest Algorithm
def random_forest(train, test, max_depth, min_size, sample_size, n_trees, n_features):
	"""Train ``n_trees`` trees on subsamples of ``train`` and predict ``test``.

	Each tree is built by ``build_tree`` on its own ``subsample`` of ``train``
	(``sample_size`` is the sampling ratio). Returns one ``bagging_predict``
	label per row of ``test``.
	"""
	forest = []
	for _ in range(n_trees):
		bootstrap = subsample(train, sample_size)
		forest.append(build_tree(bootstrap, max_depth, min_size, n_features))
	return [bagging_predict(forest, row) for row in test]
 
# Test the random forest algorithm
# Fix the random seed so fold assignment and subsampling repeat between runs
seed(2)
# load and prepare data
filename = 'sonar.all-data.csv'
dataset = load_csv(filename)
# convert every column except the last from string to float
for i in range(0, len(dataset[0])-1):
	str_column_to_float(dataset, i)
# convert class column to integers
str_column_to_int(dataset, len(dataset[0])-1)
# evaluate algorithm
n_folds = 5
max_depth = 10
min_size = 1
sample_size = 1.0
# number of columns considered per split: square root of the feature count
n_features = int(sqrt(len(dataset[0])-1))
# report per-fold and mean accuracy for forests of 1, 5 and 10 trees
for n_trees in [1, 5, 10]:
	scores = evaluate_algorithm(dataset, random_forest, n_folds, max_depth, min_size, sample_size, n_trees, n_features)
	print('Trees: %d' % n_trees)
	print('Scores: %s' % scores)
	print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))