# Random forest algorithm based on regression trees - implementation

In [None]:
import numpy as np
import random
import pandas as pd

## Regression trees - implementation

To dos:  
Splitting:  
- option for minimum sample size for split
- option for minimum information gain for split
- split by minimum SSR of all possible splits
- candidates for the root will be decided via min SSR

Evaluation:  
- average of values at node level

### Helper functions

This function evaluates the effectiveness of a splitting candidate by calculating a specified metric for two subsets of data based on the given method.

In [None]:
def evaluate(subset1, subset2, method):
    """Score a candidate split from the two groups of target values it creates.

    For ``"min. MSR"`` each group's squared deviations from its own mean are
    summed, and the two sums together are divided by the combined number of
    observations. Lower scores mean the groups are more homogeneous.

    Args:
        subset1: Target values on one side of the split.
        subset2: Target values on the other side of the split.
        method: Name of the evaluation metric; only ``"min. MSR"`` is handled.

    Returns:
        The pooled mean of squared residuals, or ``None`` (after printing an
        error message) when ``method`` is not recognised.
    """
    if method != "min. MSR":
        print("Error! ", method, " is not a method!")
        return None

    # One sum of squared residuals per side, each around that side's own mean.
    squared_residual_sums = [
        ((group - group.mean()) ** 2).sum() for group in (subset1, subset2)
    ]
    observation_count = len(subset1) + len(subset2)
    return (squared_residual_sums[0] + squared_residual_sums[1]) / observation_count

This function identifies the best splitting candidate for the first column of a dataset by evaluating the effectiveness of splits on the second column, based on a specified evaluation method and optimization criterion (e.g., minimizing MSR).

In [None]:
def set_optimize_best_evaluation(method):
    """Return the optimisation direction and the starting "best" score.

    Args:
        method: Name of the evaluation metric.

    Returns:
        A ``(optimize, best_evaluation)`` tuple. ``optimize`` is ``"min"`` or
        ``"max"``; ``best_evaluation`` is the worst possible score for that
        direction (``+inf`` for minimising, ``-inf`` for maximising), so any
        real score compares as an improvement. Unknown methods print an error
        and use the ``"min. MSR"`` settings.
    """
    # Placeholder for a future metric that should be maximised.
    if method == "???":
        return "max", float("-inf")

    if method != "min. MSR":
        print("ERROR! ", method, " is not a implemented method!")

    # "min. MSR" and the fallback for unknown methods share these settings.
    return "min", float("inf")

In [None]:
def RT_univariate_split(independent_variable, dependent_variable, method="min. MSR"):
    """Find the best threshold for splitting the target on a single feature.

    Every distinct feature value ``t`` above the feature's minimum is tried as
    a threshold: rows with ``feature < t`` form one group and rows with
    ``feature >= t`` the other. The minimum is skipped because it would leave
    the first group empty. The maximum is a valid threshold (it separates the
    rows holding the largest value from the rest) and is therefore tried, so
    features with only two distinct values can be split as well.

    Args:
        independent_variable: Feature values as a Series, or a DataFrame whose
            first column is used.
        dependent_variable: Target values as a Series, or a DataFrame whose
            first column is used. Must share the feature's index because rows
            are selected with a boolean mask built from the feature.
        method: Evaluation metric name, forwarded to ``evaluate`` and
            ``set_optimize_best_evaluation``.

    Returns:
        ``(best_candidate, best_evaluation, min_split_size)`` where
        ``min_split_size`` is the size of the smaller group produced by
        ``best_candidate``. When no usable threshold exists the result is
        ``(None, <initial best score>, 0)``.
    """
    # Reduce both inputs to Series exactly once. The previous version called
    # ``.iloc[:, 0]`` again on the already reduced subsets, which raises.
    if isinstance(independent_variable, pd.DataFrame):
        independent_variable = independent_variable.iloc[:, 0]
    if isinstance(dependent_variable, pd.DataFrame):
        dependent_variable = dependent_variable.iloc[:, 0]

    optimize, best_evaluation = set_optimize_best_evaluation(method)
    best_candidate = None
    min_split_size = 0

    # Hoisted out of the comprehension: the minimum does not change per value.
    smallest = independent_variable.min()
    candidates = [
        value for value in independent_variable.dropna().unique() if value > smallest
    ]

    for candidate in candidates:
        subset1 = dependent_variable[independent_variable < candidate]
        subset2 = dependent_variable[independent_variable >= candidate]

        split_size = min(len(subset1), len(subset2))
        if split_size == 0:
            # A split that leaves one side empty separates nothing.
            continue

        candidate_evaluation = evaluate(subset1, subset2, method)

        if optimize == "min":
            improved = candidate_evaluation < best_evaluation
        else:
            improved = candidate_evaluation > best_evaluation

        if improved:
            best_evaluation = candidate_evaluation
            best_candidate = candidate
            # Track the size that belongs to the winning threshold, not
            # whichever threshold happened to be tried last.
            min_split_size = split_size

    return best_candidate, best_evaluation, min_split_size

In [None]:
def RT_split(independent_variables, dependent_variable, method="min. MSR", random_feature_count=None):
    """Find the best (feature, threshold) split over several features.

    Each considered feature is passed to ``RT_univariate_split``; the feature
    whose best threshold has the best score wins. On ties the feature seen
    first is kept.

    Args:
        independent_variables: DataFrame with one column per feature.
        dependent_variable: Target values, forwarded unchanged to
            ``RT_univariate_split``.
        method: Evaluation metric name.
        random_feature_count: If an ``int`` with
            ``1 <= random_feature_count < number of features``, only that many
            randomly sampled features are considered. Any other value
            disables the random selection and all features are used.

    Returns:
        ``(best_column, best_candidate, best_evaluation, min_split_size)``.
        ``min_split_size`` belongs to the winning split. When no feature
        offers a usable split the result is
        ``(None, None, <initial best score>, 0)``.
    """
    optimize, best_evaluation = set_optimize_best_evaluation(method)
    best_column = None
    best_candidate = None
    best_min_split_size = 0

    feature_columns = list(independent_variables.columns)

    # Check the type before comparing, so a non-numeric value simply disables
    # the random selection instead of raising on the comparison.
    use_random_subset = (
        isinstance(random_feature_count, int)
        and 1 <= random_feature_count < len(feature_columns)
    )
    if use_random_subset:
        feature_columns = random.sample(feature_columns, random_feature_count)

    for col in feature_columns:
        candidate, evaluation, min_split_size = RT_univariate_split(
            independent_variables[col], dependent_variable, method
        )
        if candidate is None:
            # This feature has no usable threshold.
            continue

        if optimize == "min":
            improved = evaluation < best_evaluation
        else:
            improved = evaluation > best_evaluation

        if improved:
            best_evaluation = evaluation
            best_column = col
            best_candidate = candidate
            best_min_split_size = min_split_size

    return best_column, best_candidate, best_evaluation, best_min_split_size

In [None]:
def subset_data(data, dependent_column):
    """Separate a data set into its feature columns and its target column.

    Args:
        data: DataFrame holding features and target.
        dependent_column: Label of the target column.

    Returns:
        ``(independent_variables, dependent_variable)``: the DataFrame without
        the target column, and a one-column DataFrame holding only the target.
    """
    target_labels = [dependent_column]
    # Selecting with a list keeps the target as a DataFrame rather than a Series.
    target_frame = data[target_labels]
    feature_frame = data.drop(labels=target_labels, axis="columns")
    return feature_frame, target_frame

In [None]:
def RT_recursion(independent_variables, dependent_variable, #data inputs
                    method="min. MSR", #evaluation specification
                    random_feature_count=None, bootstrap_count=None, #sampling methods
                    min_evaluation_gain=0, sample_size_limit=0, #split stopping condition
                    max_depth=20, max_splits = 100, #universal stopping conditions
                    depth = 0, num_splits = 0 #recursion passes
):
    """Recursively grow a regression tree and return it as nested lists.

    Node format:
        * internal node: ``[(column, threshold), (left_subtree, right_subtree)]``
          where rows with ``column < threshold`` go left, the others right;
        * leaf: ``[(None, prediction)]`` where ``prediction`` is the mean of
          the target values in that node (``None`` if the node is empty).

    Args:
        independent_variables: DataFrame with one column per feature.
        dependent_variable: Target values (Series or one-column DataFrame)
            sharing the index of ``independent_variables``.
        method: Evaluation metric name, forwarded to ``RT_split``.
        random_feature_count: Forwarded to ``RT_split``.
        bootstrap_count: Accepted and passed down, but not used yet.
        min_evaluation_gain: Accepted and passed down, but not used yet.
        sample_size_limit: A split is rejected when its smaller child would
            hold fewer rows than this. NOTE(review): assumes the limit refers
            to the smaller child rather than the node being split - confirm.
        max_depth: The node becomes a leaf once ``depth`` exceeds this.
        max_splits: The node becomes a leaf once ``num_splits`` exceeds this.
        depth: Current depth; callers normally leave it at 0.
        num_splits: Splits made so far. It is passed down each branch
            separately, so it counts splits along the current path, not
            across the whole tree.

    Returns:
        The (sub)tree rooted at this node in the format described above.
    """
    target_values = np.asarray(dependent_variable, dtype=float)
    prediction = float(target_values.mean()) if target_values.size else None
    leaf = [(None, prediction)]

    # Universal stopping conditions. The previous version returned
    # ``column``/``candidate`` here before they had been assigned.
    if depth > max_depth or num_splits > max_splits:
        return leaf

    # Determine the best split.
    column, candidate, _evaluation, _min_split_size = RT_split(
        independent_variables, dependent_variable, method, random_feature_count
    )
    if column is None or candidate is None:
        # No feature offers a usable threshold (e.g. all rows are identical).
        return leaf

    goes_left = independent_variables[column] < candidate
    goes_right = independent_variables[column] >= candidate

    indep_subset_left = independent_variables[goes_left]
    indep_subset_right = independent_variables[goes_right]
    dep_subset_left = dependent_variable[goes_left]
    dep_subset_right = dependent_variable[goes_right]

    smaller_child = min(len(dep_subset_left), len(dep_subset_right))
    if smaller_child == 0 or smaller_child < sample_size_limit:
        # An empty child would recurse forever; a too-small child is rejected.
        return leaf

    left_results = RT_recursion(
        indep_subset_left, dep_subset_left,
        method=method,
        random_feature_count=random_feature_count, bootstrap_count=bootstrap_count,
        min_evaluation_gain=min_evaluation_gain, sample_size_limit=sample_size_limit,
        max_depth=max_depth, max_splits=max_splits,
        depth=depth + 1, num_splits=num_splits + 1,
    )
    right_results = RT_recursion(
        indep_subset_right, dep_subset_right,
        method=method,
        random_feature_count=random_feature_count, bootstrap_count=bootstrap_count,
        min_evaluation_gain=min_evaluation_gain, sample_size_limit=sample_size_limit,
        max_depth=max_depth, max_splits=max_splits,
        depth=depth + 1, num_splits=num_splits + 1,
    )

    # Build the list directly: ``list.append`` returns None, so returning its
    # result (as before) always handed None back to the caller.
    return [(column, candidate), (left_results, right_results)]

### Tree development

In [None]:
# Toy housing data set: three feature columns and "Price" as the target.
data = pd.DataFrame({
    "Bedrooms": [2, 3, 4, 5, 3, 4, 2, 3],
    "Square Footage": [1500, 1800, 2200, 2000, 2500, 2300, 1600, 1900],
    "Age": [20, 15, 25, 10, 5, 8, 30, 12],
    "Price": [250, 300, 300, 320, 500, 450, 350, 350]
})

# Smaller single-feature data set; not referenced by the statements below.
data2 = pd.DataFrame({
    "Bedrooms": [2, 3, 4, 3, 5, 4, 2, 3],
    "Price": [250, 300, 400, 320, 500, 450, 200, 350]
})

# Separate the features from the target column "Price".
independent_variables, dependent_variable = subset_data(data, "Price")
#print(independent_variables)

# Manual checks of the individual helpers, kept disabled.
#print("Result of evaluate function:", evaluate(subset1, subset2, method="min. MSR"))
#print(RT_univariate_split(independent_variables, dependent_variable))
#print(RT_split(independent_variables, dependent_variable, random_feature_count = 2))
# Grow a tree on the toy data and print the value returned by RT_recursion.
print(RT_recursion(independent_variables, dependent_variable, min_evaluation_gain=0.1))


NameError: name 'pd' is not defined