In [1]:
from sklearn import ensemble
from sklearn import metrics
import pandas as pd
import numpy as np
from IPython.display import display


## Creating samples

In [2]:
np.random.seed(0)
num_of_features = 15
n_samples = 1000

threshold = .5

#Generating covariance matrix all features and labels are drawn from multivariate normal distribution
#covariance between each pair is a random variable between -0.5 and 0.5 and the variance of each variable is 1
cov_mat = np.array([[np.random.uniform(0, 1) if i != j else 1 for j in range(num_of_features + 1)] 
                    for i in range(num_of_features + 1)])
cov_mat = (cov_mat + cov_mat.T) /2

#creating the samples mean for every feature is 0
samples = np.random.multivariate_normal([0 for i in range(num_of_features + 1)], cov_mat, n_samples)

#last column in the samples matrix represents the labels
labels = samples[:, -1]
##if label > threshold the label is 1
labels = labels > threshold
features = samples[:, :-1]

print("proportion of positives is %f" %(labels.sum() / labels.shape[0]))


proportion of positives is 0.315000


  samples = np.random.multivariate_normal([0 for i in range(num_of_features + 1)], cov_mat, n_samples)


## Training the classifier

In [3]:
#model initialization
mod = ensemble.RandomForestClassifier(n_estimators=100, max_depth=None, 
                                        min_samples_split=2, min_samples_leaf=1, 
                                        min_weight_fraction_leaf=0.0, max_features='sqrt', max_leaf_nodes=None, 
                                        bootstrap=True, oob_score=True, n_jobs=None, random_state=0, verbose=0)

#training. We will use 70% of the data
mod.fit(features[:int(n_samples * 0.7)], labels[:int(n_samples * 0.7)])
##oob
print("oob score %f" %mod.oob_score_)

#correlation between feature importance and the correlation with the label
import_df = pd.DataFrame()
import_df["importance"] = mod.feature_importances_
import_df["cov"] = cov_mat[-1, :-1]
display(import_df)

display(import_df.corr())

oob score 0.747143


Unnamed: 0,importance,cov
0,0.051198,0.511914
1,0.08567,0.551075
2,0.049257,0.441137
3,0.062813,0.595898
4,0.098846,0.73506
5,0.099252,0.591619
6,0.052155,0.231033
7,0.095432,0.605366
8,0.054748,0.297814
9,0.052637,0.603159


Unnamed: 0,importance,cov
importance,1.0,0.646044
cov,0.646044,1.0


## Testing

In [4]:
metrics.accuracy_score(labels[int(n_samples * 0.7):], mod.predict(features[int(n_samples * 0.7):]))


0.7466666666666667