In [1]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import math
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_classification, make_regression, load_digits, load_boston, load_iris
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.ensemble import RandomForestClassifier

In [2]:
RANDOM_STATE = 17

To built Random Forest:
- In the `fit` method in the loop (`i` from 0 to `n_estimators-1`), fix the seed equal to (`random_state + i`). The idea is that at each iteration there's a new value of random seed to add more "randomness", but at hte same time results are reproducible
- After fixing the seed, select `max_features` features **without replacement**, save the list of selected feature ids in `self.feat_ids_by_tree`
- Also make a bootstrap sample (i.e. **sampling with replacement**) of training instances. For that, resort to `np.random.choice` and its argument `replace`
- Train a decision tree with specified (in a constructor) arguments `max_depth`, `max_features` and `random_state` (do not specify `class_weight`) on a corresponding subset of training data. 
- The `fit` method returns the current instance of the class `RandomForestClassifierCustom`, that is `self`
- In the `predict_proba` method, we need to loop through all the trees. For each prediction, obviously, we need to take only those features which we used for training the corresponding tree. The method returns predicted probabilities (`predict_proba`), averaged for all trees

In [3]:
class MyRandomForestClassifier(BaseEstimator):
    def __init__(self, n_estimators=10, max_depth=10, max_features=10, random_state=RANDOM_STATE):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.max_features = max_features
        self.random_state = random_state
        
        self.trees = []
        self.feat_ids_by_tree = []
        
    def fit(self, X, y):
        for i in range(self.n_estimators):
            np.random.seed(self.random_state + i)
            col_indices = np.random.choice(range(X.columns.shape[0]), size = self.max_features, replace = True)
            self.feat_ids_by_tree.append(col_indices)
            row_indices = np.random.randint(0,len(X), size=len(X))
            sample_X = X.iloc[row_indices,col_indices]
            sample_y = y[row_indices] 
            dt = DecisionTreeClassifier(max_depth=self.max_depth,max_features=self.max_features, random_state=self.random_state)
            dt.fit(sample_X,sample_y)
            self.trees.append(dt)
        return self
    
    def predict_proba(self, X):
        prediction = None
        for ti, tree in enumerate(self.trees):
            indices = self.feat_ids_by_tree[ti]
            sample_X = X.iloc[:,indices]
            if prediction is None:
                prediction = tree.predict_proba(sample_X)
            else:
                prediction += tree.predict_proba(sample_X)
        return prediction/len(self.trees)

Use credit scoring dataset

In [4]:
data = pd.read_csv("../credit_scoring_sample.csv", sep = ";")

In [5]:
independent_columns_names = data.columns.values
independent_columns_names = [x for x in data if x != 'SeriousDlqin2yrs']
independent_columns_names
for col in data.columns:
        data[col]= data[col].fillna(data[col].median())
X = data[independent_columns_names]
y = data['SeriousDlqin2yrs']

Calculate the average ROC AUC for cross-validation for our written RF

In [9]:
rf = MyRandomForestClassifier(max_depth=7, max_features=6)
rf.fit(X,y)
print(np.mean(cross_val_score(rf, X, y.values, scoring='roc_auc')))

0.8305086051215925


Calculate the average ROC AUC for cross-validation for sklearn RF

In [7]:
rf = RandomForestClassifier(max_depth=7, max_features=6)
rf.fit(X,y)
print(np.mean(cross_val_score(rf, X, y.values, scoring='roc_auc')))

0.8308121932893137


### NB

*It works with pd Dataframes*