In [14]:
import numpy as np
from numpy.random import default_rng
from numpy import sum, corrcoef, zeros
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor

# Prepare Synthetic Data

We start by preparing some synthetic data $Xtr, ytr$. 

Each target in $ytr$ is the sum of the corresponding row of $Xtr$, plus uniform random noise in $[-0.5, 0.5)$:

In [15]:
# Problem size: number of observations (rows) and number of features (columns).
obs, vars = 1_000, 10

# A single seeded generator feeds every draw, so the draw order
# (train features, train noise, test features, test noise) fixes the data.
rng = default_rng(seed=0)


def _draw_split(generator, n_rows, n_cols):
    """Draw uniform [0, 1) features and a target = row sum + uniform noise in [-0.5, 0.5)."""
    features = generator.random((n_rows, n_cols))
    noise = generator.random(n_rows) - 0.5
    return features, features.sum(axis=1) + noise


Xtr, ytr = _draw_split(rng, obs, vars)
Xte, yte = _draw_split(rng, obs, vars)

# Decision Tree Regressor

We continue with the training of a simple Decision Tree

In [None]:
# Fit one depth-limited regression tree on the full training set.
dtree = DecisionTreeRegressor(max_depth=10, random_state=0).fit(Xtr, ytr)

# Predictions for both splits.
pred_tr = dtree.predict(Xtr)
pred_te = dtree.predict(Xte)

# "Accuracy" here is the Pearson correlation between targets and predictions,
# i.e. the off-diagonal entry of the 2x2 correlation matrix.
acc_tr = corrcoef(ytr, pred_tr)[0, 1]
acc_te = corrcoef(yte, pred_te)[0, 1]
print(acc_tr)
print(acc_te)

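We see a high accuracy (measured here as the Pearson correlation between targets and predictions) on the train set, but a lower one on the test set, which indicates that the single tree over-fits.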

# Random Forest

We continue with Random Forests. We set the number of trees, and create some random sub-sampling of the columns. We store the indices of the columns in $inds\_cols$.

## Random Features

In [17]:
nof_trees = 100
# inds_cols must be kept: the column subset drawn for a tree is needed when
# training that tree AND later when predicting with it.
nof_cols = int(0.7*vars)
inds_cols = zeros((nof_trees, nof_cols), dtype=int)
for j in range(nof_trees):
    # A per-tree seed keeps every tree's column subset reproducible.
    rng = default_rng(seed=j+1)
    # Draw nof_cols DISTINCT column indices. Sampling with replacement would
    # hand a tree duplicated features and fewer than nof_cols unique ones.
    inds_cols[j, :] = rng.choice(vars, size=nof_cols, replace=False)

Then, we check if the columns have been sampled uniformly:

In [None]:
# Flatten the (nof_trees, nof_cols) index table into one vector of sampled column ids.
inds_cols_vec = inds_cols.reshape(-1)
# Count how many times each column index was drawn across all trees.
counter = Counter(inds_cols_vec)
# Hand matplotlib plain sequences rather than dict views.
keys = list(counter.keys())
vals = list(counter.values())
plt.bar(keys, vals)
# Label the chart so it can be read on its own.
plt.xlabel('Column index')
plt.ylabel('Times sampled')
plt.title('Column sampling frequency across trees')
plt.show()

We continue with the training of the Random Forest. For each random sampling of the columns that we did previously, we train a decision tree. Furthermore, for each tree, we randomly sub-sample the rows of the dataset (with replacement). We store all trained trees in the list $all\_trees$.

In [19]:
# Each tree is trained on a row sample holding 70% as many rows as the training set.
nof_rows = int(0.7*obs)
all_trees = []
for tree_idx in range(nof_trees):
    # Row indices are drawn with replacement from a per-tree seeded generator.
    # NOTE(review): the seed (tree index + 1) is the same one used when the
    # column subsets were drawn, so both draws start from the same random stream.
    row_rng = default_rng(seed=tree_idx + 1)
    row_ids = row_rng.integers(low=0, high=obs, size=nof_rows)
    # Restrict to the sampled rows first, then to this tree's column subset.
    X_sub = Xtr[row_ids, :][:, inds_cols[tree_idx, :]]
    y_sub = ytr[row_ids]
    tree = DecisionTreeRegressor(max_depth=10, random_state=tree_idx + 1)
    all_trees.append(tree.fit(X_sub, y_sub))

Finally, we predict for the train and test sets:

In [20]:
# Running sums of the per-tree predictions for each split.
pred_tr = zeros(obs)
pred_te = zeros(obs)
# Pair every tree with the column subset it was trained on.
for j_dtree, j_cols in zip(all_trees, inds_cols):
    pred_tr = pred_tr + j_dtree.predict(Xtr[:, j_cols])
    pred_te = pred_te + j_dtree.predict(Xte[:, j_cols])
# The ensemble prediction is the mean over all trees.
pred_tr = pred_tr / nof_trees
pred_te = pred_te / nof_trees

In [None]:
# Pearson correlation between targets and ensemble predictions, train then test.
acc_tr = corrcoef(ytr, pred_tr)[0, 1]
acc_te = corrcoef(yte, pred_te)[0, 1]
for split_score in (acc_tr, acc_te):
    print(split_score)

We see that the accuracy has been vastly improved, and it is similar for the train and test sets, indicating generalization capabilities without over-fitting.

# Random Forest Regressor using sklearn

In [22]:
from sklearn.ensemble import RandomForestRegressor

# Fit scikit-learn's forest with the same number of trees (nof_trees) and a
# depth limit of 10; fit() returns the estimator itself.
rf = RandomForestRegressor(
    n_estimators=nof_trees,
    max_depth=10,
    random_state=0,
).fit(Xtr, ytr)

# Predictions on the train split and on the test split.
pred_tr_rf, pred_te_rf = (rf.predict(split) for split in (Xtr, Xte))

In [23]:
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error

def pearson_correlation(y_true, y_pred):
    """Return the Pearson correlation coefficient between targets and predictions.

    Parameters
    ----------
    y_true, y_pred : array-like
        Two 1-D sequences of equal length.

    Returns
    -------
    float
        The off-diagonal entry of the 2x2 correlation matrix of the inputs.
    """
    correlation_matrix = np.corrcoef(y_true, y_pred)
    return correlation_matrix[0, 1]

def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE) as a fraction.

    Computes mean(|(y_true - y_pred) / y_true|); multiply by 100 for percent.

    Parameters
    ----------
    y_true, y_pred : array-like
        Targets and predictions of matching shape. Plain Python sequences
        are accepted as well as NumPy arrays.

    Returns
    -------
    float
        The mean relative error. Entries where y_true is 0 follow NumPy
        division semantics (inf or nan); no clamping is applied.
    """
    # Coerce to arrays so list inputs work: `list - list` raises TypeError.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true))

def max_absolute_percentage_error(y_true, y_pred):
    """Return the maximum absolute percentage error (MAXAPE) as a fraction.

    Computes max(|(y_true - y_pred) / y_true|), i.e. the worst relative error.

    Parameters
    ----------
    y_true, y_pred : array-like
        Targets and predictions of matching shape. Plain Python sequences
        are accepted as well as NumPy arrays.

    Returns
    -------
    float
        The largest relative error. Entries where y_true is 0 follow NumPy
        division semantics (inf or nan); no clamping is applied.
    """
    # Coerce to arrays so list inputs work: `list - list` raises TypeError.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    return np.max(np.abs((y_true - y_pred) / y_true))

def mean_absolute_mean_percentage_error(y_true, y_pred):
    """Return the mean absolute error divided by the mean target (MAMPE).

    Computes mean(|y_true - y_pred|) / mean(y_true), as a fraction.

    Parameters
    ----------
    y_true, y_pred : array-like
        Targets and predictions of matching shape. Plain Python sequences
        are accepted as well as NumPy arrays.

    Returns
    -------
    float
        The mean absolute error relative to the mean of y_true. The
        denominator is the signed mean, so the result takes its sign.
    """
    # Coerce to arrays so list inputs work: `list - list` raises TypeError.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    return np.mean(np.abs(y_true - y_pred)) / np.mean(y_true)

def compute_slope_intercept(y_true, y_pred):
    """Fit the line y_pred ~ slope * y_true + intercept by least squares.

    Parameters
    ----------
    y_true, y_pred : array-like
        Two 1-D sequences of equal length.

    Returns
    -------
    tuple
        (slope, intercept), in that order.
    """
    # Design matrix: first column is y_true, second column is all ones, so
    # the solution vector is ordered [slope, intercept].
    design = np.vstack((y_true, np.ones(len(y_true)))).T
    # lstsq returns (solution, residuals, rank, singular values); only the
    # solution is needed.
    coefficients, *_ = np.linalg.lstsq(design, y_pred, rcond=None)
    slope, intercept = coefficients
    return slope, intercept

In [None]:
def compute_metrics(y_true, y_pred):
    """Compute the regression metrics for one set of targets and predictions.

    Parameters
    ----------
    y_true, y_pred : array-like
        Targets and predictions of equal length.

    Returns
    -------
    dict
        Maps each metric name ("Pearson Correlation", "RMSE", "MAE", "MAPE",
        "MAXAPE", "MAMPE", "Slope", "Intercept") to its value.
    """
    # Solve the least-squares fit once and reuse both coefficients, instead
    # of solving it separately for the slope and for the intercept.
    slope, intercept = compute_slope_intercept(y_true, y_pred)
    return {
        "Pearson Correlation": pearson_correlation(y_true, y_pred),
        # RMSE is the square root of the mean squared error.
        "RMSE": np.sqrt(mean_squared_error(y_true, y_pred)),
        "MAE": mean_absolute_error(y_true, y_pred),
        "MAPE": mean_absolute_percentage_error(y_true, y_pred),
        "MAXAPE": max_absolute_percentage_error(y_true, y_pred),
        "MAMPE": mean_absolute_mean_percentage_error(y_true, y_pred),
        "Slope": slope,
        "Intercept": intercept,
    }

# Score the scikit-learn forest's predictions on both splits.
metrics_train = compute_metrics(ytr, pred_tr_rf)
metrics_test = compute_metrics(yte, pred_te_rf)

import pandas as pd
# Build the table with one column per split, then transpose it so that each
# split is a row and each metric is a column.
metrics_by_split = {"Train": metrics_train, "Test": metrics_test}
metrics_df = pd.DataFrame(metrics_by_split).T
print(metrics_df)

In [None]:
import matplotlib.pyplot as plt

# Actual-vs-predicted scatter for the train set, with the fitted trend line
# (solid red) and the y = x reference line (dashed black).
plt.figure(figsize=(6, 5))
plt.scatter(ytr, pred_tr_rf, alpha=0.5)

# Regression summary for the train split, taken from the metrics dictionary.
slope_train = metrics_train["Slope"]
intercept_train = metrics_train["Intercept"]
pearson_train = metrics_train["Pearson Correlation"]

# Both lines run from x = 0 to the largest train target.
x_max_train = ytr.max()
trend_label_train = f'Trend line (slope={slope_train:.2f}, intercept={intercept_train:.2f}, Pearson={pearson_train:.2f})'
plt.plot(
    [0, x_max_train],
    [intercept_train, slope_train * x_max_train + intercept_train],
    'r-', lw=2, label=trend_label_train,
)
# Reference line on which predictions equal the actual values.
plt.plot([0, x_max_train], [0, x_max_train], 'k--', lw=2)

plt.legend()
plt.xlabel('Actual Train Values')
plt.ylabel('Predicted Train Values')

In [None]:
# Actual-vs-predicted scatter for the test set, with the fitted trend line
# (solid red) and the y = x reference line (dashed black).
plt.figure(figsize=(6, 5))
plt.scatter(yte, pred_te_rf, alpha=0.5)

# Regression summary for the test split, taken from the metrics dictionary.
slope_test = metrics_test["Slope"]
intercept_test = metrics_test["Intercept"]
pearson_test = metrics_test["Pearson Correlation"]

# Both lines run from x = 0 to the largest test target.
x_max_test = yte.max()
trend_label_test = f'Trend line (slope={slope_test:.2f}, intercept={intercept_test:.2f}, Pearson={pearson_test:.2f})'
plt.plot(
    [0, x_max_test],
    [intercept_test, slope_test * x_max_test + intercept_test],
    'r-', lw=2, label=trend_label_test,
)
# Reference line on which predictions equal the actual values.
plt.plot([0, x_max_test], [0, x_max_test], 'k--', lw=2)

plt.legend()
plt.xlabel('Actual Test Values')
plt.ylabel('Predicted Test Values')