In [1]:
from sklearn import ensemble
from sklearn import metrics
import pandas as pd
import numpy as np
from IPython.display import display


## Creating samples

In [2]:
np.random.seed(0)
num_of_features = 10
n_samples = 100


#Generating covariance matrix all features and labels are drawn from multivariate normal distribution
#covariance between each pair is a random variable between -0.5 and 0.5 and the variance of each variable is 1
cov_mat = np.array([[np.random.uniform(0, 1) if i != j else 1 for j in range(num_of_features + 1)] 
                    for i in range(num_of_features + 1)])
cov_mat = (cov_mat + cov_mat.T) /2

#creating the samples mean for every feature is 0
samples = np.random.multivariate_normal([0 for i in range(num_of_features + 1)], cov_mat, n_samples)

#last column in the samples matrix represents the labels
labels = samples[:, -1]
features = samples[:, :-1]


  samples = np.random.multivariate_normal([0 for i in range(num_of_features + 1)], cov_mat, n_samples)


## Training the regressor

In [3]:
#model initialization
mod = ensemble.RandomForestRegressor(n_estimators=100, max_depth=None, 
                                        min_samples_split=2, min_samples_leaf=1, 
                                        min_weight_fraction_leaf=0.0, max_features='sqrt', max_leaf_nodes=None, 
                                        bootstrap=True, oob_score=True, n_jobs=None, random_state=0, verbose=0)

#training. We will use 70% of the data
mod.fit(features[:int(n_samples * 0.7)], labels[:int(n_samples * 0.7)])
##oob
print("oob score %f" %mod.oob_score_)

#correlation between feature importance and the correlation with the label
import_df = pd.DataFrame()
import_df["importance"] = mod.feature_importances_
import_df["cov"] = cov_mat[-1, :-1]
display(import_df.sort_values(by = "cov"))

display(import_df.corr())

oob score 0.426511


Unnamed: 0,importance,cov
4,0.061234,0.306232
6,0.072032,0.34407
7,0.057381,0.34549
5,0.113249,0.410291
9,0.070864,0.478722
0,0.04168,0.530629
1,0.086738,0.57001
2,0.150905,0.574928
8,0.126311,0.576189
3,0.219607,0.822004


Unnamed: 0,importance,cov
importance,1.0,0.791457
cov,0.791457,1.0


## Testing

In [4]:
metrics.mean_squared_error(labels[int(n_samples * 0.7):], mod.predict(features[int(n_samples * 0.7):]))


0.4273566652560767