# Random forst algorithmen based on regression trees - implementation

In [52]:
import numpy as np
import random
import pandas as pd

## Regression trees - implementation

To dos:  
Splitting:  
- option for minimum sample size for split
- option for minimum information gain for split
- spit by minimum SSR of all possible splits
- candidates for the root will be decided via min SSR

Evaluation:  
- average of values in note level

### Helper functions

This function evaluates the effectiveness of a splitting candidate by calculating a specified metric for two subsets of data based on the given method.

In [53]:
def evaluate(subset1, subset2, method):
    if method == "min. MSR":
        predictor1 = subset1.mean()
        predictor2 = subset2.mean()
        
        SSR1 = ((subset1 - predictor1) ** 2).sum() 
        SSR2 = ((subset2 - predictor2) ** 2).sum()
        MSR = (SSR1 + SSR2)/ (len(subset1) + len(subset2))
        return MSR
    else:
        print("Error! ", method, " is not a method!")

This function identifies the best splitting candidate for the first column of a dataset by evaluating the effectiveness of splits on the second column, based on a specified evaluation method and optimization criterion (e.g., minimizing MSR).

In [54]:
def set_optimize_best_evaluation(method):
    if (method == "min. MSR"):
        optimize = "min"
        best_evaluation = float("inf")

    elif (method == "???"):
        optimize = "max"
        best_evaluation = float("-inf")
        
    else:
        print("ERROR! ", method, " is not a implemented method!")
        method = "min. MSR"
        optimize = "min"
        best_evaluation = float("inf")
    
    return optimize, best_evaluation

In [55]:
def RT_univariate_split(independent_variable, dependent_variable, method = "min. MSR"):
    

    candidates = [x for x in independent_variable.unique() if x != independent_variable.min() and x != independent_variable.max()] #only allow for actual splits so exclude min and max values as notes
    
    optimize, best_evaluation = set_optimize_best_evaluation(method)

    best_candidate = None
    
    for candidate in candidates:
        print("DEBUG: candidate set to \n", candidate, "\n")

        subset1 = dependent_variable[independent_variable < candidate]
        subset2 = dependent_variable[independent_variable >= candidate]
        #print("DEBUG: subset1 set to \n", subset1, "\n")
        #print("DEBUG: subset2 set to \n", subset2, "\n")

        candidate_evaluation = evaluate(subset1, subset2, method)
        print("DEBUG: candidate_evaluation set to \n", candidate_evaluation, "\n")

        if optimize == "min":
            if candidate_evaluation < best_evaluation:
                best_evaluation = candidate_evaluation
                best_candidate = candidate
                print("DEBUG: best_candidate set to \n", best_candidate, "\n")
        else:
            if candidate_evaluation > best_evaluation:
                best_evaluation = candidate_evaluation
                best_candidate = candidate
                print("DEBUG: best_candidate set to \n", best_candidate, "\n")

    return best_candidate, best_evaluation

In [56]:
def RT_split(independent_variables, dependent_variable, method = "min. MSR", random_feature_count = None):
    
    optimize, best_evaluation = set_optimize_best_evaluation(method)
    best_column = None
    best_candidate = None

    split_results = {}

    feature_columns = independent_variables.columns
    print("DEBUG: feature_columns set to \n", feature_columns, "\n")
    if random_feature_count:
        if random_feature_count >= len(feature_columns):
            print("Warning! Not enough features! Deactivate random feature selection.")
            random_feature_count = None
        elif not isinstance(random_feature_count, int) or random_feature_count < 1:
            print("DEBUG: isinstance(random_feature_count, int) is set to \n", isinstance(random_feature_count, int), "\n")
            print("DEBUG: random_feature_count is set to \n", random_feature_count, "\n")
            print("Warning! random_feature_count must be int and >= 1! Deactivate random feature selection.")
            random_feature_count = None

    if random_feature_count:
        feature_columns = random.sample(list(feature_columns), random_feature_count)
        print("DEBUG: feature_columns set to \n", feature_columns, "\n")

    for col in feature_columns:
        
        candidate, evaluation = RT_univariate_split(independent_variables[col], dependent_variable, method)
        split_results[col] = {"candidate": candidate, "evaluation": evaluation}
        print("DEBUG: split_results[", col, "] set to \n", split_results[col], "\n")

        if optimize == "min":
            for col, result in split_results.items():
                if result["evaluation"] < best_evaluation:
                    best_evaluation = result["evaluation"]
                    best_column = col
                    best_candidate = result["candidate"]
        else:
            for col, result in split_results.items():
                if result["evaluation"] > best_evaluation:
                    best_evaluation = result["evaluation"]
                    best_column = col
                    best_candidate = result["candidate"]

    return best_column, best_candidate, best_evaluation

In [57]:
def subset_data(data, dependent_column):
    dependent_variable = data[dependent_column]
    independent_variables = data.drop(columns=[dependent_column])
    return independent_variables, dependent_variable

In [None]:
def regression_tree(data, dependent_column, min_evaluation_gain, method = "min. MSR", max_splits = 100, random_feature_count = None):
    
    #initialize
    optimize, best_evaluation = set_optimize_best_evaluation(method)
    num_splits = 0
    evaluation_gain = min_evaluation_gain
    results = {}

    independent_variables, dependent_variable = subset_data(data, dependent_column)

    while num_splits < max_splits and evaluation_gain >= min_evaluation_gain:
        #figure out which split to do
        column, candidate, evaluation = RT_split(independent_variables, dependent_variable, random_feature_count)
        
        if optimize == "min":
            evaluation_gain = best_evaluation/evaluation - 1
        else:
            evaluation_gain = evaluation/best_evaluation - 1

        results[num_splits] = {
            "split_column": column,
            "split_value": candidate,
            "evaluation_score": evaluation,
            "evaluation_gain": evaluation_gain
        }
        
        

        #prepare for next iteration
        best_evaluation = evaluation
        num_splits += 1

### Tree developement

In [59]:
data = pd.DataFrame({
    "Bedrooms": [2, 3, 4, 3, 5, 4, 2, 3],
    "Square Footage": [1500, 1800, 2200, 2000, 2500, 2300, 1600, 1900],
    "Age": [20, 15, 10, 25, 5, 8, 30, 12],
    "Price": [250, 300, 300, 320, 500, 450, 350, 350]
})

data2 = pd.DataFrame({
    "Bedrooms": [2, 3, 4, 3, 5, 4, 2, 3],
    "Price": [250, 300, 400, 320, 500, 450, 200, 350]
})


independent_variables, dependent_variable = subset_data(data2, "Price")
#print(independent_variables)

#print("Result of evaluate function:", evaluate(subset1, subset2, method="min. MSR"))
#print(RT_univariate_split(subset1, subset2))
print(RT_split(independent_variables, dependent_variable, random_feature_count = 2))


DEBUG: feature_columns set to 
 Index(['Bedrooms'], dtype='object') 

DEBUG: candidate set to 
 3 

DEBUG: candidate_evaluation set to 
 3947.916666666667 

DEBUG: best_candidate set to 
 3 

DEBUG: candidate set to 
 4 

DEBUG: candidate_evaluation set to 
 2390.0 

DEBUG: best_candidate set to 
 4 

DEBUG: split_results[ Bedrooms ] set to 
 {'candidate': 4, 'evaluation': 2390.0} 

('Bedrooms', 4, 2390.0)
